arter97 / Untitled / May 19th, 2022 / Diff, 52.99 KB
  1. commit f769193bec18d1384ff8b2094088d46972da1673
  2. Author: Juhyung Park <qkrwngud825@gmail.com>
  3. Date:   Fri May 20 07:00:12 2022 +0900
  4.  
  5.     Samsung
  6.    
  7.     Signed-off-by: Juhyung Park <qkrwngud825@gmail.com>
  8.  
  9. diff --git a/mm/page_alloc.c b/mm/page_alloc.c
  10. index 46d253155a255..f96d8044b5a3d 100644
  11. --- a/mm/page_alloc.c
  12. +++ b/mm/page_alloc.c
  13. @@ -64,12 +64,12 @@
  14.  #include <linux/page_owner.h>
  15.  #include <linux/kthread.h>
  16.  #include <linux/memcontrol.h>
  17. -#include <linux/show_mem_notifier.h>
  18.  #include <linux/ftrace.h>
  19.  #include <linux/lockdep.h>
  20.  #include <linux/nmi.h>
  21.  #include <linux/psi.h>
  22.  #include <linux/khugepaged.h>
  23. +#include <linux/sched/cputime.h>
  24.  
  25.  #include <asm/sections.h>
  26.  #include <asm/tlbflush.h>
  27. @@ -81,6 +81,36 @@
  28.  static DEFINE_MUTEX(pcp_batch_high_lock);
  29.  #define MIN_PERCPU_PAGELIST_FRACTION   (8)
  30.  
  31. +/* If RANK_BIT position in physical address is zero, it is main rank */
  32. +#define is_main_rank(page) !rankid(page)
  33. +
  34. +static inline void rank_list_add(struct page *page, struct list_head *list)
  35. +{
  36. +   if (is_main_rank(page))
  37. +       list_add(&(page)->lru, list);
  38. +   else
  39. +       list_add_tail(&(page)->lru, list);
  40. +}
  41. +
  42. +static inline void rank_free_area_add(struct page *page, struct free_area *area,
  43. +                     int migratetype)
  44. +{
  45. +   if (is_main_rank(page))
  46. +       add_to_free_area(page, area, migratetype);
  47. +   else
  48. +       add_to_free_area_tail(page, area, migratetype);
  49. +}
  50. +
  51. +static inline void rank_free_area_move(struct page *page,
  52. +                      struct free_area *area,
  53. +                      int migratetype)
  54. +{
  55. +   if (is_main_rank(page))
  56. +       move_to_free_area(page, area, migratetype);
  57. +   else
  58. +       move_to_free_area_tail(page, area, migratetype);
  59. +}
  60. +
  61.  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  62.  DEFINE_PER_CPU(int, numa_node);
  63.  EXPORT_PER_CPU_SYMBOL(numa_node);
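
Reviewer note: the helpers added above bias buddy freelist placement by DRAM rank. A page whose physical address has RANK_BIT clear counts as "main rank" and is placed at the head of a free list (allocated first); pages from the other rank go to the tail and are handed out last. rankid() and RANK_BIT are defined elsewhere in this tree and are not part of this diff. A minimal userspace sketch of the placement policy, with an assumed bit position chosen purely for illustration:

/*
 * Standalone model of the rank-aware freelist placement above.
 * rankid()/RANK_BIT live elsewhere in this tree; here a single bit of
 * a fake physical address stands in for them, only to show the
 * head-vs-tail decision.
 */
#include <stdbool.h>
#include <stdio.h>

#define RANK_BIT 32ULL                    /* assumed position, illustration only */

static bool is_main_rank(unsigned long long phys)
{
    return !(phys & (1ULL << RANK_BIT));  /* bit clear => main rank */
}

int main(void)
{
    unsigned long long pages[] = { 0x080000000ULL, 0x180000000ULL };

    for (int i = 0; i < 2; i++)
        printf("page @%#llx -> %s of free list\n", pages[i],
               is_main_rank(pages[i]) ? "head (allocated first)"
                                      : "tail (allocated last)");
    return 0;
}
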
  64. @@ -187,24 +217,6 @@ static int __init early_init_on_free(char *buf)
  65.  }
  66.  early_param("init_on_free", early_init_on_free);
  67.  
  68. -/*
  69. - * A cached value of the page's pageblock's migratetype, used when the page is
  70. - * put on a pcplist. Used to avoid the pageblock migratetype lookup when
  71. - * freeing from pcplists in most cases, at the cost of possibly becoming stale.
  72. - * Also the migratetype set in the page does not necessarily match the pcplist
  73. - * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
  74. - * other index - this ensures that it will be put on the correct CMA freelist.
  75. - */
  76. -static inline int get_pcppage_migratetype(struct page *page)
  77. -{
  78. -   return page->index;
  79. -}
  80. -
  81. -static inline void set_pcppage_migratetype(struct page *page, int migratetype)
  82. -{
  83. -   page->index = migratetype;
  84. -}
  85. -
  86.  #ifdef CONFIG_PM_SLEEP
  87.  /*
  88.   * The following functions are used by the suspend/hibernate code to temporarily
  89. @@ -919,7 +931,7 @@ static inline void __free_one_page(struct page *page,
  90.     unsigned int max_order;
  91.     struct capture_control *capc = task_capc(zone);
  92.  
  93. -   max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
  94. +   max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
  95.  
  96.     VM_BUG_ON(!zone_is_initialized(zone));
  97.     VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
  98. @@ -932,7 +944,7 @@ static inline void __free_one_page(struct page *page,
  99.     VM_BUG_ON_PAGE(bad_range(zone, page), page);
  100.  
  101.  continue_merging:
  102. -   while (order < max_order - 1) {
  103. +   while (order < max_order) {
  104.         if (compaction_capture(capc, page, order, migratetype)) {
  105.             __mod_zone_freepage_state(zone, -(1 << order),
  106.                                 migratetype);
  107. @@ -958,7 +970,7 @@ continue_merging:
  108.         pfn = combined_pfn;
  109.         order++;
  110.     }
  111. -   if (max_order < MAX_ORDER) {
  112. +   if (order < MAX_ORDER - 1) {
  113.         /* If we are here, it means order is >= pageblock_order.
  114.          * We want to prevent merge between freepages on isolate
  115.          * pageblock and normal pageblock. Without this, pageblock
  116. @@ -979,7 +991,7 @@ continue_merging:
  117.                         is_migrate_isolate(buddy_mt)))
  118.                 goto done_merging;
  119.         }
  120. -       max_order++;
  121. +       max_order = order + 1;
  122.         goto continue_merging;
  123.     }
  124.  
  125. @@ -1003,17 +1015,17 @@ done_merging:
  126.         higher_buddy = higher_page + (buddy_pfn - combined_pfn);
  127.         if (pfn_valid_within(buddy_pfn) &&
  128.             page_is_buddy(higher_page, higher_buddy, order + 1)) {
  129. -           add_to_free_area_tail(page, &zone->free_area[order],
  130. -                         migratetype);
  131. +           rank_free_area_add(page, &zone->free_area[order],
  132. +                      migratetype);
  133.             return;
  134.         }
  135.     }
  136.  
  137.     if (is_shuffle_order(order))
  138.         add_to_free_area_random(page, &zone->free_area[order],
  139. -               migratetype);
  140. +                   migratetype);
  141.     else
  142. -       add_to_free_area(page, &zone->free_area[order], migratetype);
  143. +       rank_free_area_add(page, &zone->free_area[order], migratetype);
  144.  
  145.  }
  146.  
  147. @@ -1430,15 +1442,35 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
  148.     }
  149.  }
  150.  
  151. +#ifdef CONFIG_HUGEPAGE_POOL
  152. +static void  __free_pages_ok(struct page *page, unsigned int order)
  153. +{
  154. +   ___free_pages_ok(page, order, false);
  155. +}
  156. +
  157. +void ___free_pages_ok(struct page *page, unsigned int order,
  158. +             bool skip_hugepage_pool)
  159. +#else
  160.  static void __free_pages_ok(struct page *page, unsigned int order)
  161. +#endif
  162.  {
  163.     unsigned long flags;
  164.     int migratetype;
  165.     unsigned long pfn = page_to_pfn(page);
  166.  
  167. +#ifdef CONFIG_HUGEPAGE_POOL
  168. +   if (!skip_hugepage_pool && !free_pages_prepare(page, order, true))
  169. +       return;
  170. +#else
  171.     if (!free_pages_prepare(page, order, true))
  172.         return;
  173. +#endif
  174.  
  175. +#ifdef CONFIG_HUGEPAGE_POOL
  176. +   if (!skip_hugepage_pool && order == HUGEPAGE_ORDER &&
  177. +       insert_hugepage_pool(page, order))
  178. +       return;
  179. +#endif
  180.     migratetype = get_pfnblock_migratetype(page, pfn);
  181.     local_irq_save(flags);
  182.     __count_vm_events(PGFREE, 1 << order);
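
Reviewer note: with CONFIG_HUGEPAGE_POOL, the free path gains a hook: unless the caller explicitly skips the pool, a block of exactly HUGEPAGE_ORDER is offered to insert_hugepage_pool() before reaching the buddy lists, and the free returns early if the pool keeps it. insert_hugepage_pool() and HUGEPAGE_ORDER come from elsewhere in this tree; the stand-alone model below mirrors only the control flow, with invented pool bookkeeping:

/*
 * Shape of the CONFIG_HUGEPAGE_POOL hook above: frees of exactly
 * HUGEPAGE_ORDER are offered to a pool first and fall through to the
 * normal free path only if the pool declines.
 */
#include <stdbool.h>
#include <stdio.h>

#define HUGEPAGE_ORDER 9          /* 2 MiB with 4 KiB pages, typical */
#define POOL_CAPACITY  4          /* invented for this model */

static int pool_count;

static bool insert_hugepage_pool(unsigned int order)
{
    if (order != HUGEPAGE_ORDER || pool_count >= POOL_CAPACITY)
        return false;             /* pool declines, free normally */
    pool_count++;
    return true;
}

static void free_pages_model(unsigned int order, bool skip_pool)
{
    if (!skip_pool && insert_hugepage_pool(order)) {
        printf("order-%u block parked in hugepage pool (%d held)\n",
               order, pool_count);
        return;
    }
    printf("order-%u block returned to the buddy allocator\n", order);
}

int main(void)
{
    free_pages_model(9, false);   /* captured by the pool */
    free_pages_model(3, false);   /* too small, freed normally */
    free_pages_model(9, true);    /* pool explicitly skipped */
    return 0;
}
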
  183. @@ -2049,7 +2081,7 @@ static inline void expand(struct zone *zone, struct page *page,
  184.         if (set_page_guard(zone, &page[size], high, migratetype))
  185.             continue;
  186.  
  187. -       add_to_free_area(&page[size], area, migratetype);
  188. +       rank_free_area_add(&page[size], area, migratetype);
  189.         set_page_order(&page[size], high);
  190.     }
  191.  }
  192. @@ -2166,8 +2198,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
  193.     set_page_owner(page, order, gfp_flags);
  194.  }
  195.  
  196. +#ifdef CONFIG_HUGEPAGE_POOL
  197. +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
  198. +                           unsigned int alloc_flags)
  199. +#else
  200.  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
  201.                             unsigned int alloc_flags)
  202. +#endif
  203.  {
  204.     post_alloc_hook(page, order, gfp_flags);
  205.  
  206. @@ -2190,14 +2227,18 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
  207.  }
  208.  
  209.  /*
  210. - * Go through the free lists for the given migratetype and remove
  211. - * the smallest available page from the freelists
  212. + * Search the free lists from requested order to MAX_ORDER to find
  213. + * the main rank page and returns the order if exists.
  214. + * If main rank page doesn't exist, returns the smallest order of
  215. + * available backup rank page.
  216. + *
  217. + * MAX_ORDER is returned if there's no available pages.
  218.   */
  219.  static __always_inline
  220. -struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  221. -                       int migratetype)
  222. +unsigned int __get_min_rank_aware_order(struct zone *zone,
  223. +                   unsigned int order, int migratetype)
  224.  {
  225. -   unsigned int current_order;
  226. +   unsigned int current_order, backup_order = MAX_ORDER;
  227.     struct free_area *area;
  228.     struct page *page;
  229.  
  230. @@ -2205,15 +2246,36 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  231.     for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  232.         area = &(zone->free_area[current_order]);
  233.         page = get_page_from_free_area(area, migratetype);
  234. -       if (!page)
  235. -           continue;
  236. -       del_page_from_free_area(page, area);
  237. -       expand(zone, page, order, current_order, area, migratetype);
  238. -       set_pcppage_migratetype(page, migratetype);
  239. -       return page;
  240. +       if (page) {
  241. +           if (is_main_rank(page))
  242. +               return current_order;
  243. +           if (backup_order == MAX_ORDER)
  244. +               backup_order = current_order;
  245. +       }
  246.     }
  247.  
  248. -   return NULL;
  249. +   return backup_order;
  250. +}
  251. +
  252. +static __always_inline
  253. +struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  254. +                       int migratetype)
  255. +{
  256. +   unsigned int current_order;
  257. +   struct free_area *area;
  258. +   struct page *page;
  259. +
  260. +   current_order = __get_min_rank_aware_order(zone, order, migratetype);
  261. +   if (current_order == MAX_ORDER)
  262. +       return NULL;
  263. +
  264. +   area = &(zone->free_area[current_order]);
  265. +   page = get_page_from_free_area(area, migratetype);
  266. +   del_page_from_free_area(page, area);
  267. +   expand(zone, page, order, current_order, area, migratetype);
  268. +   set_pcppage_migratetype(page, migratetype);
  269. +
  270. +   return page;
  271.  }
  272.  
  273.  
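
Reviewer note: __rmqueue_smallest() is split so the order selection happens first. __get_min_rank_aware_order() walks orders from the request upward, returns the first order whose head page sits in the main rank, and remembers the smallest order that had any page at all as a fallback; MAX_ORDER signals that nothing was found. A userspace model of that search over a toy free-area array:

/*
 * Userspace model of __get_min_rank_aware_order(): prefer the lowest
 * order whose head page is main-rank, otherwise fall back to the
 * smallest order that has any page. MAX_ORDER means "nothing".
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

struct fake_area { bool has_page; bool head_is_main_rank; };

static unsigned int min_rank_aware_order(const struct fake_area *area,
                                         unsigned int order)
{
    unsigned int current_order, backup_order = MAX_ORDER;

    for (current_order = order; current_order < MAX_ORDER; current_order++) {
        if (!area[current_order].has_page)
            continue;
        if (area[current_order].head_is_main_rank)
            return current_order;
        if (backup_order == MAX_ORDER)
            backup_order = current_order;   /* remember smallest backup */
    }
    return backup_order;
}

int main(void)
{
    struct fake_area area[MAX_ORDER] = { 0 };

    area[2] = (struct fake_area){ true, false };  /* backup rank only  */
    area[4] = (struct fake_area){ true, true };   /* main rank present */

    printf("chosen order: %u\n", min_rank_aware_order(area, 0)); /* prints 4 */
    return 0;
}
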
  274. @@ -2282,7 +2344,7 @@ static int move_freepages(struct zone *zone,
  275.         VM_BUG_ON_PAGE(page_zone(page) != zone, page);
  276.  
  277.         order = page_order(page);
  278. -       move_to_free_area(page, &zone->free_area[order], migratetype);
  279. +       rank_free_area_move(page, &zone->free_area[order], migratetype);
  280.         page += 1 << order;
  281.         pages_moved += 1 << order;
  282.     }
  283. @@ -2359,38 +2421,11 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
  284.     return false;
  285.  }
  286.  
  287. -static bool boost_eligible(struct zone *z)
  288. -{
  289. -   unsigned long high_wmark, threshold;
  290. -   unsigned long reclaim_eligible, free_pages;
  291. -
  292. -   high_wmark = z->_watermark[WMARK_HIGH];
  293. -   reclaim_eligible = zone_page_state_snapshot(z, NR_ZONE_INACTIVE_FILE) +
  294. -           zone_page_state_snapshot(z, NR_ZONE_ACTIVE_FILE);
  295. -   free_pages = zone_page_state(z, NR_FREE_PAGES) -
  296. -           zone_page_state(z, NR_FREE_CMA_PAGES);
  297. -   threshold = high_wmark + (2 * mult_frac(high_wmark,
  298. -                   watermark_boost_factor, 10000));
  299. -
  300. -   /*
  301. -    * Don't boost watermark If we are already low on memory where the
  302. -    * boosting can simply put the watermarks at higher levels for a
  303. -    * longer duration of time and thus the other users relied on the
  304. -    * watermarks are forced to choose unintended decissions. If memory
  305. -    * is so low, kswapd in normal mode should help.
  306. -    */
  307. -
  308. -   if (reclaim_eligible < threshold && free_pages < threshold)
  309. -       return false;
  310. -
  311. -   return true;
  312. -}
  313. -
  314.  static inline bool boost_watermark(struct zone *zone)
  315.  {
  316.     unsigned long max_boost;
  317.  
  318. -   if (!watermark_boost_factor || !boost_eligible(zone))
  319. +   if (!watermark_boost_factor)
  320.         return false;
  321.     /*
  322.      * Don't bother in zones that are unlikely to produce results.
  323. @@ -2506,7 +2541,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
  324.  
  325.  single_page:
  326.     area = &zone->free_area[current_order];
  327. -   move_to_free_area(page, area, start_type);
  328. +   rank_free_area_move(page, area, start_type);
  329.  }
  330.  
  331.  /*
  332. @@ -2837,7 +2872,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  333.          * for IO devices that can merge IO requests if the physical
  334.          * pages are ordered properly.
  335.          */
  336. -       list_add_tail(&page->lru, list);
  337. +       rank_list_add(page, list);
  338.         alloced++;
  339.         if (is_migrate_cma(get_pcppage_migratetype(page)))
  340.             __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
  341. @@ -3143,7 +3178,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
  342.     }
  343.  
  344.     pcp = &this_cpu_ptr(zone->pageset)->pcp;
  345. -   list_add(&page->lru, &pcp->lists[migratetype]);
  346. +   rank_list_add(page, &pcp->lists[migratetype]);
  347.     pcp->count++;
  348.     if (pcp->count >= pcp->high) {
  349.         unsigned long batch = READ_ONCE(pcp->batch);
  350. @@ -3329,7 +3364,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
  351.             if (unlikely(list == NULL) ||
  352.                     unlikely(list_empty(list)))
  353.                 return NULL;
  354. -
  355.         }
  356.  
  357.         page = list_first_entry(list, struct page, lru);
  358. @@ -3402,7 +3436,6 @@ struct page *rmqueue(struct zone *preferred_zone,
  359.  
  360.         if (!page)
  361.             page = __rmqueue(zone, order, migratetype, alloc_flags);
  362. -
  363.     } while (page && check_new_pages(page, order));
  364.  
  365.     spin_unlock(&zone->lock);
  366. @@ -3498,6 +3531,29 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  367.  
  368.  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  369.  
  370. +static inline long __zone_watermark_unusable_free(struct zone *z,
  371. +               unsigned int order, unsigned int alloc_flags)
  372. +{
  373. +   const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
  374. +   long unusable_free = (1 << order) - 1;
  375. +
  376. +   /*
  377. +    * If the caller does not have rights to ALLOC_HARDER then subtract
  378. +    * the high-atomic reserves. This will over-estimate the size of the
  379. +    * atomic reserve but it avoids a search.
  380. +    */
  381. +   if (likely(!alloc_harder))
  382. +       unusable_free += z->nr_reserved_highatomic;
  383. +
  384. +#ifdef CONFIG_CMA
  385. +   /* If allocation can't use CMA areas don't use free CMA pages */
  386. +   if (!(alloc_flags & ALLOC_CMA))
  387. +       unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
  388. +#endif
  389. +
  390. +   return unusable_free;
  391. +}
  392. +
  393.  noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  394.  {
  395.     return __should_fail_alloc_page(gfp_mask, order);
  396. @@ -3519,19 +3575,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  397.     const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
  398.  
  399.     /* free_pages may go negative - that's OK */
  400. -   free_pages -= (1 << order) - 1;
  401. +   free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
  402.  
  403.     if (alloc_flags & ALLOC_HIGH)
  404.         min -= min / 2;
  405.  
  406. -   /*
  407. -    * If the caller does not have rights to ALLOC_HARDER then subtract
  408. -    * the high-atomic reserves. This will over-estimate the size of the
  409. -    * atomic reserve but it avoids a search.
  410. -    */
  411. -   if (likely(!alloc_harder)) {
  412. -       free_pages -= z->nr_reserved_highatomic;
  413. -   } else {
  414. +   if (unlikely(alloc_harder)) {
  415.         /*
  416.          * OOM victims can try even harder than normal ALLOC_HARDER
  417.          * users on the grounds that it's definitely going to be in
  418. @@ -3544,13 +3593,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  419.             min -= min / 4;
  420.     }
  421.  
  422. -
  423. -#ifdef CONFIG_CMA
  424. -   /* If allocation can't use CMA areas don't use free CMA pages */
  425. -   if (!(alloc_flags & ALLOC_CMA))
  426. -       free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
  427. -#endif
  428. -
  429.     /*
  430.      * Check watermarks for an order-0 allocation request. If these
  431.      * are not met, then a high-order request also cannot go ahead
  432. @@ -3572,14 +3614,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  433.             continue;
  434.  
  435.         for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
  436. -#ifdef CONFIG_CMA
  437. -           /*
  438. -            * Note that this check is needed only
  439. -            * when MIGRATE_CMA < MIGRATE_PCPTYPES.
  440. -            */
  441. -           if (mt == MIGRATE_CMA)
  442. -               continue;
  443. -#endif
  444.             if (!free_area_empty(area, mt))
  445.                 return true;
  446.         }
  447. @@ -3608,24 +3642,22 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
  448.                 unsigned long mark, int classzone_idx,
  449.                 unsigned int alloc_flags, gfp_t gfp_mask)
  450.  {
  451. -   long free_pages = zone_page_state(z, NR_FREE_PAGES);
  452. -   long cma_pages = 0;
  453. +   long free_pages;
  454.  
  455. -#ifdef CONFIG_CMA
  456. -   /* If allocation can't use CMA areas don't use free CMA pages */
  457. -   if (!(alloc_flags & ALLOC_CMA))
  458. -       cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
  459. -#endif
  460. +   free_pages = zone_page_state(z, NR_FREE_PAGES);
  461.  
  462.     /*
  463.      * Fast check for order-0 only. If this fails then the reserves
  464. -    * need to be calculated. There is a corner case where the check
  465. -    * passes but only the high-order atomic reserve are free. If
  466. -    * the caller is !atomic then it'll uselessly search the free
  467. -    * list. That corner case is then slower but it is harmless.
  468. +    * need to be calculated.
  469.      */
  470. -   if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
  471. -       return true;
  472. +   if (!order) {
  473. +       long fast_free;
  474. +
  475. +       fast_free = free_pages;
  476. +       fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
  477. +       if (fast_free > mark + z->lowmem_reserve[classzone_idx])
  478. +           return true;
  479. +   }
  480.  
  481.     if (__zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  482.                     free_pages))
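
Reviewer note: the watermark hunks consolidate the "free pages this caller may not actually use" arithmetic into a single __zone_watermark_unusable_free() helper: the partial high-order block, the high-atomic reserve (when the caller lacks ALLOC_HARDER/ALLOC_OOM) and free CMA pages (when ALLOC_CMA is not set) are subtracted once, before the comparison, both in the order-0 fast path and in __zone_watermark_ok(). A plain-C model of the resulting arithmetic, with made-up numbers:

/*
 * Arithmetic behind the consolidated unusable-free calculation above.
 * Reserves the caller may not dip into are removed from the free count
 * before it is compared against the watermark plus lowmem reserve.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
    long nr_free;
    long nr_free_cma;
    long nr_reserved_highatomic;
    long lowmem_reserve;
};

static long unusable_free(const struct fake_zone *z, unsigned int order,
                          bool alloc_harder, bool alloc_cma)
{
    long unusable = (1L << order) - 1;      /* tail of partial high-order blocks */

    if (!alloc_harder)
        unusable += z->nr_reserved_highatomic;
    if (!alloc_cma)
        unusable += z->nr_free_cma;
    return unusable;
}

static bool watermark_ok(const struct fake_zone *z, unsigned int order,
                         long mark, bool alloc_harder, bool alloc_cma)
{
    long free = z->nr_free - unusable_free(z, order, alloc_harder, alloc_cma);

    return free > mark + z->lowmem_reserve;
}

int main(void)
{
    struct fake_zone z = { .nr_free = 5000, .nr_free_cma = 3000,
                           .nr_reserved_highatomic = 512, .lowmem_reserve = 0 };

    printf("movable (CMA usable):    %d\n", watermark_ok(&z, 0, 1500, false, true));
    printf("unmovable (CMA unusable): %d\n", watermark_ok(&z, 0, 1500, false, false));
    return 0;
}
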
  483. @@ -3783,20 +3815,6 @@ retry:
  484.         }
  485.  
  486.         mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
  487. -       /*
  488. -        * Allow high, atomic, harder order-0 allocation requests
  489. -        * to skip the ->watermark_boost for min watermark check.
  490. -        * In doing so, check for:
  491. -        *  1) ALLOC_WMARK_MIN - Allow to wake up kswapd in the
  492. -        *           slow path.
  493. -        *  2) ALLOC_HIGH - Allow high priority requests.
  494. -        *  3) ALLOC_HARDER - Allow (__GFP_ATOMIC && !__GFP_NOMEMALLOC),
  495. -        *          of the others.
  496. -        */
  497. -       if (unlikely(!order && !(alloc_flags & ALLOC_WMARK_MASK) &&
  498. -            (alloc_flags & (ALLOC_HARDER | ALLOC_HIGH)))) {
  499. -           mark = zone->_watermark[WMARK_MIN];
  500. -       }
  501.         if (!zone_watermark_fast(zone, order, mark,
  502.                        ac_classzone_idx(ac), alloc_flags,
  503.                        gfp_mask)) {
  504. @@ -3893,7 +3911,6 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
  505.         filter &= ~SHOW_MEM_FILTER_NODES;
  506.  
  507.     show_mem(filter, nodemask);
  508. -   show_mem_call_notifiers();
  509.  }
  510.  
  511.  void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
  512. @@ -4308,7 +4325,8 @@ retry:
  513.      */
  514.     if (!page && !drained) {
  515.         unreserve_highatomic_pageblock(ac, false);
  516. -       drain_all_pages(NULL);
  517. +       if (!need_memory_boosting(NULL))
  518. +           drain_all_pages(NULL);
  519.         drained = true;
  520.         goto retry;
  521.     }
  522. @@ -4367,8 +4385,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
  523.         alloc_flags |= ALLOC_KSWAPD;
  524.  
  525.  #ifdef CONFIG_CMA
  526. -   if ((gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) &&
  527. -               (gfp_mask & __GFP_CMA))
  528. +   if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
  529.         alloc_flags |= ALLOC_CMA;
  530.  #endif
  531.     return alloc_flags;
  532. @@ -4558,13 +4575,19 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  533.     const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
  534.     struct page *page = NULL;
  535.     unsigned int alloc_flags;
  536. -   unsigned long did_some_progress;
  537. +   unsigned long did_some_progress = 0;
  538.     enum compact_priority compact_priority;
  539.     enum compact_result compact_result;
  540.     int compaction_retries;
  541.     int no_progress_loops;
  542.     unsigned int cpuset_mems_cookie;
  543.     int reserve_flags;
  544. +   unsigned long pages_reclaimed = 0;
  545. +   int retry_loop_count = 0;
  546. +   unsigned long jiffies_s = jiffies;
  547. +   u64 utime, stime_s, stime_e, stime_d;
  548. +
  549. +   task_cputime(current, &utime, &stime_s);
  550.  
  551.     /*
  552.      * We also sanity check to catch abuse of atomic reserves being used by
  553. @@ -4679,6 +4702,7 @@ retry_cpuset:
  554.     }
  555.  
  556.  retry:
  557. +   retry_loop_count++;
  558.     /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
  559.     if (alloc_flags & ALLOC_KSWAPD)
  560.         wake_all_kswapds(order, gfp_mask, ac);
  561. @@ -4711,13 +4735,10 @@ retry:
  562.     if (current->flags & PF_MEMALLOC)
  563.         goto nopage;
  564.  
  565. -   if (fatal_signal_pending(current) && !(gfp_mask & __GFP_NOFAIL) &&
  566. -           (gfp_mask & __GFP_FS))
  567. -       goto nopage;
  568. -
  569.     /* Try direct reclaim and then allocating */
  570.     page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
  571.                             &did_some_progress);
  572. +   pages_reclaimed += did_some_progress;
  573.     if (page)
  574.         goto got_pg;
  575.  
  576. @@ -4825,6 +4846,29 @@ fail:
  577.     warn_alloc(gfp_mask, ac->nodemask,
  578.             "page allocation failure: order:%u", order);
  579.  got_pg:
  580. +   task_cputime(current, &utime, &stime_e);
  581. +   stime_d = stime_e - stime_s;
  582. +   if (stime_d / NSEC_PER_MSEC > 256) {
  583. +       pg_data_t *pgdat;
  584. +
  585. +       unsigned long a_anon = 0;
  586. +       unsigned long in_anon = 0;
  587. +       unsigned long a_file = 0;
  588. +       unsigned long in_file = 0;
  589. +       for_each_online_pgdat(pgdat) {
  590. +           a_anon += node_page_state(pgdat, NR_ACTIVE_ANON);
  591. +           in_anon += node_page_state(pgdat, NR_INACTIVE_ANON);
  592. +           a_file += node_page_state(pgdat, NR_ACTIVE_FILE);
  593. +           in_file += node_page_state(pgdat, NR_INACTIVE_FILE);
  594. +       }
  595. +       pr_info("alloc stall: timeJS(ms):%u|%llu rec:%lu|%lu ret:%d o:%d gfp:%#x(%pGg) AaiFai:%lukB|%lukB|%lukB|%lukB\n",
  596. +           jiffies_to_msecs(jiffies - jiffies_s),
  597. +           stime_d / NSEC_PER_MSEC,
  598. +           did_some_progress, pages_reclaimed, retry_loop_count,
  599. +           order, gfp_mask, &gfp_mask,
  600. +           a_anon << (PAGE_SHIFT-10), in_anon << (PAGE_SHIFT-10),
  601. +           a_file << (PAGE_SHIFT-10), in_file << (PAGE_SHIFT-10));
  602. +   }
  603.     return page;
  604.  }
  605.  
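
Reviewer note: the slowpath now tracks reclaim progress and retries per call and, once a single __alloc_pages_slowpath() invocation has burned more than 256 ms of system CPU time, emits one "alloc stall" line with elapsed wall/system time, reclaim counters, retry count, gfp flags and per-node anon/file LRU sizes. The gating check is plain nanosecond-to-millisecond arithmetic; a trivial stand-alone model with invented cputime samples:

/* Model of the 256 ms system-time threshold that gates the alloc stall
 * report above; the cputime values here are invented. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
    uint64_t stime_start_ns = 1200 * NSEC_PER_MSEC;  /* pretend task_cputime() samples */
    uint64_t stime_end_ns   = 1600 * NSEC_PER_MSEC;
    uint64_t stime_d = stime_end_ns - stime_start_ns;

    if (stime_d / NSEC_PER_MSEC > 256)
        printf("alloc stall: %llu ms of system time spent in the slowpath\n",
               (unsigned long long)(stime_d / NSEC_PER_MSEC));
    return 0;
}
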
  606. @@ -4854,8 +4898,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
  607.     if (should_fail_alloc_page(gfp_mask, order))
  608.         return false;
  609.  
  610. -   if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE &&
  611. -           (gfp_mask & __GFP_CMA))
  612. +   if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
  613.         *alloc_flags |= ALLOC_CMA;
  614.  
  615.     return true;
  616. @@ -5299,6 +5342,9 @@ long si_mem_available(void)
  617.     reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
  618.             global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
  619.     available += reclaimable - min(reclaimable / 2, wmark_low);
  620. +#ifdef CONFIG_ION_RBIN_HEAP
  621. +   available += atomic_read(&rbin_cached_pages);
  622. +#endif
  623.  
  624.     if (available < 0)
  625.         available = 0;
  626. @@ -5309,6 +5355,9 @@ EXPORT_SYMBOL_GPL(si_mem_available);
  627.  void si_meminfo(struct sysinfo *val)
  628.  {
  629.     val->totalram = totalram_pages();
  630. +#ifdef CONFIG_ION_RBIN_HEAP
  631. +   val->totalram += totalrbin_pages;
  632. +#endif
  633.     val->sharedram = global_node_page_state(NR_SHMEM);
  634.     val->freeram = global_zone_page_state(NR_FREE_PAGES);
  635.     val->bufferram = nr_blockdev_pages();
  636. @@ -7056,8 +7105,6 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
  637.     pg_data_t *pgdat = NODE_DATA(nid);
  638.     unsigned long start_pfn = 0;
  639.     unsigned long end_pfn = 0;
  640. -   u64 i;
  641. -   phys_addr_t start, end;
  642.  
  643.     /* pg_data_t should be reset to zero when it's allocated */
  644.     WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
  645. @@ -7071,10 +7118,6 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
  646.         (u64)start_pfn << PAGE_SHIFT,
  647.         end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
  648.  #else
  649. -   for_each_mem_range(i, &memblock.memory, NULL, nid, MEMBLOCK_NONE,
  650. -              &start, &end, NULL)
  651. -       subsection_map_init((unsigned long)start >> PAGE_SHIFT,
  652. -                   (unsigned long)(end - start) >> PAGE_SHIFT);
  653.     start_pfn = node_start_pfn;
  654.  #endif
  655.     calculate_node_totalpages(pgdat, start_pfn, end_pfn,
  656. @@ -7692,14 +7735,15 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
  657.         free_reserved_page(page);
  658.     }
  659.  
  660. -   if (pages && s)
  661. -       pr_info("Freeing %s memory: %ldK\n",
  662. -           s, pages << (PAGE_SHIFT - 10));
  663. +   if (pages && s) {
  664. +       pr_info("Freeing %s memory: %ldK\n", s, pages << (PAGE_SHIFT - 10));
  665. +       if (!strcmp(s, "initrd") || !strcmp(s, "unused kernel")) {
  666. +           long size;
  667.  
  668. -#ifdef CONFIG_HAVE_MEMBLOCK
  669. -       memblock_dbg("memblock_free: [%#016llx-%#016llx] %pS\n",
  670. -           __pa(start), __pa(end), (void *)_RET_IP_);
  671. -#endif
  672. +           size = -1 * (long)(pages << PAGE_SHIFT);
  673. +           memblock_memsize_mod_kernel_size(size);
  674. +       }
  675. +   }
  676.  
  677.     return pages;
  678.  }
  679. @@ -7978,11 +8022,11 @@ static void __setup_per_zone_wmarks(void)
  680.                 mult_frac(zone_managed_pages(zone),
  681.                       watermark_scale_factor, 10000));
  682.  
  683. -       zone->watermark_boost = 0;
  684.         zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) +
  685.                     low + tmp;
  686.         zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) +
  687.                     low + tmp * 2;
  688. +       zone->watermark_boost = 0;
  689.  
  690.         spin_unlock_irqrestore(&zone->lock, flags);
  691.     }
  692. @@ -8097,22 +8141,6 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
  693.     return 0;
  694.  }
  695.  
  696. -#ifdef CONFIG_MULTIPLE_KSWAPD
  697. -int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
  698. -   void __user *buffer, size_t *length, loff_t *ppos)
  699. -{
  700. -   int rc;
  701. -
  702. -   rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
  703. -   if (rc)
  704. -       return rc;
  705. -
  706. -   if (write)
  707. -       update_kswapd_threads();
  708. -
  709. -   return 0;
  710. -}
  711. -#endif
  712.  int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
  713.     void __user *buffer, size_t *length, loff_t *ppos)
  714.  {
  715. @@ -8522,7 +8550,8 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
  716.  
  717.  /* [start, end) must belong to a single zone. */
  718.  static int __alloc_contig_migrate_range(struct compact_control *cc,
  719. -                   unsigned long start, unsigned long end)
  720. +                   unsigned long start, unsigned long end,
  721. +                   bool drain)
  722.  {
  723.     /* This function is based on compact_zone() from compaction.c. */
  724.     unsigned long nr_reclaimed;
  725. @@ -8530,7 +8559,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  726.     unsigned int tries = 0;
  727.     int ret = 0;
  728.  
  729. -   migrate_prep();
  730. +   if (drain)
  731. +       migrate_prep();
  732.  
  733.     while (pfn < end || !list_empty(&cc->migratepages)) {
  734.         if (fatal_signal_pending(current)) {
  735. @@ -8586,8 +8616,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  736.   * pages which PFN is in [start, end) are allocated for the caller and
  737.   * need to be freed with free_contig_range().
  738.   */
  739. -int alloc_contig_range(unsigned long start, unsigned long end,
  740. -              unsigned migratetype, gfp_t gfp_mask)
  741. +int __alloc_contig_range(unsigned long start, unsigned long end,
  742. +            unsigned migratetype, gfp_t gfp_mask, bool drain)
  743.  {
  744.     unsigned long outer_start, outer_end;
  745.     unsigned int order;
  746. @@ -8646,7 +8676,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
  747.      * allocated.  So, if we fall through be sure to clear ret so that
  748.      * -EBUSY is not accidentally used or returned to caller.
  749.      */
  750. -   ret = __alloc_contig_migrate_range(&cc, start, end);
  751. +   ret = __alloc_contig_migrate_range(&cc, start, end, drain);
  752.     if (ret && ret != -EBUSY)
  753.         goto done;
  754.     ret =0;
  755. @@ -8668,37 +8698,40 @@ int alloc_contig_range(unsigned long start, unsigned long end,
  756.      * isolated thus they won't get removed from buddy.
  757.      */
  758.  
  759. -   lru_add_drain_all();
  760. -
  761.     order = 0;
  762.     outer_start = start;
  763. -   while (!PageBuddy(pfn_to_page(outer_start))) {
  764. -       if (++order >= MAX_ORDER) {
  765. -           outer_start = start;
  766. -           break;
  767. -       }
  768. -       outer_start &= ~0UL << order;
  769. -   }
  770.  
  771. -   if (outer_start != start) {
  772. -       order = page_order(pfn_to_page(outer_start));
  773. +   if (drain) {
  774. +       lru_add_drain_all();
  775. +       drain_all_pages(cc.zone);
  776.  
  777. -       /*
  778. -        * outer_start page could be small order buddy page and
  779. -        * it doesn't include start page. Adjust outer_start
  780. -        * in this case to report failed page properly
  781. -        * on tracepoint in test_pages_isolated()
  782. -        */
  783. -       if (outer_start + (1UL << order) <= start)
  784. -           outer_start = start;
  785. -   }
  786. +       while (!PageBuddy(pfn_to_page(outer_start))) {
  787. +           if (++order >= MAX_ORDER) {
  788. +               outer_start = start;
  789. +               break;
  790. +           }
  791. +           outer_start &= ~0UL << order;
  792. +       }
  793.  
  794. -   /* Make sure the range is really isolated. */
  795. -   if (test_pages_isolated(outer_start, end, false)) {
  796. -       pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
  797. -           __func__, outer_start, end);
  798. -       ret = -EBUSY;
  799. -       goto done;
  800. +       if (outer_start != start) {
  801. +           order = page_order(pfn_to_page(outer_start));
  802. +
  803. +           /*
  804. +            * outer_start page could be small order buddy page and
  805. +            * it doesn't include start page. Adjust outer_start
  806. +            * in this case to report failed page properly
  807. +            * on tracepoint in test_pages_isolated()
  808. +            */
  809. +           if (outer_start + (1UL << order) <= start)
  810. +               outer_start = start;
  811. +       }
  812. +       /* Make sure the range is really isolated. */
  813. +       if (test_pages_isolated(outer_start, end, false)) {
  814. +           pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
  815. +               __func__, outer_start, end);
  816. +           ret = -EBUSY;
  817. +           goto done;
  818. +       }
  819.     }
  820.  
  821.     /* Grab isolated pages from freelists. */
  822. @@ -8722,6 +8755,17 @@ done:
  823.  #endif
  824.     return ret;
  825.  }
  826. +int alloc_contig_range(unsigned long start, unsigned long end,
  827. +              unsigned migratetype, gfp_t gfp_mask)
  828. +{
  829. +   return __alloc_contig_range(start, end, migratetype, gfp_mask, true);
  830. +}
  831. +
  832. +int alloc_contig_range_fast(unsigned long start, unsigned long end,
  833. +               unsigned migratetype)
  834. +{
  835. +   return __alloc_contig_range(start, end, migratetype, GFP_KERNEL, false);
  836. +}
  837.  #endif /* CONFIG_CONTIG_ALLOC */
  838.  
  839.  void free_contig_range(unsigned long pfn, unsigned int nr_pages)
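
Reviewer note: alloc_contig_range() is refactored into __alloc_contig_range() with a drain flag. The original entry point keeps the full behavior (migrate_prep(), lru_add_drain_all(), drain_all_pages() and the isolation re-check), while the new alloc_contig_range_fast() skips that work, trading a higher chance of -EBUSY for lower latency. The wrapper pattern in isolation (names below are illustrative, not kernel API):

/*
 * Flag-parameter refactor as used above: one worker, two entry points,
 * where the fast variant simply omits the expensive preparation step.
 */
#include <stdbool.h>
#include <stdio.h>

static int do_range_work(unsigned long start, unsigned long end, bool drain)
{
    if (drain)
        printf("draining per-CPU caches for [%lx, %lx)\n", start, end);
    printf("migrating and claiming [%lx, %lx)\n", start, end);
    return 0;
}

static int range_work(unsigned long start, unsigned long end)
{
    return do_range_work(start, end, true);   /* original, safe path */
}

static int range_work_fast(unsigned long start, unsigned long end)
{
    return do_range_work(start, end, false);  /* caller accepts more -EBUSY risk */
}

int main(void)
{
    range_work(0x1000, 0x2000);
    range_work_fast(0x3000, 0x4000);
    return 0;
}
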
  840. diff --git a/mm/vmscan.c b/mm/vmscan.c
  841. index 2d57e7eddfeb8..753985b1051aa 100644
  842. --- a/mm/vmscan.c
  843. +++ b/mm/vmscan.c
  844. @@ -89,9 +89,12 @@ struct scan_control {
  845.     unsigned int may_swap:1;
  846.  
  847.     /*
  848. -    * Cgroups are not reclaimed below their configured memory.low,
  849. -    * unless we threaten to OOM. If any cgroups are skipped due to
  850. -    * memory.low and nothing was reclaimed, go back for memory.low.
  851. +    * Cgroup memory below memory.low is protected as long as we
  852. +    * don't threaten to OOM. If any cgroup is reclaimed at
  853. +    * reduced force or passed over entirely due to its memory.low
  854. +    * setting (memcg_low_skipped), and nothing is reclaimed as a
  855. +    * result, then go back for one more cycle that reclaims the protected
  856. +    * memory (memcg_low_reclaim) to avert OOM.
  857.      */
  858.     unsigned int memcg_low_reclaim:1;
  859.     unsigned int memcg_low_skipped:1;
  860. @@ -131,21 +134,8 @@ struct scan_control {
  861.  
  862.     /* for recording the reclaimed slab by now */
  863.     struct reclaim_state reclaim_state;
  864. -   /*
  865. -    * Reclaim pages from a vma. If the page is shared by other tasks
  866. -    * it is zapped from a vma without reclaim so it ends up remaining
  867. -    * on memory until last task zap it.
  868. -    */
  869. -   struct vm_area_struct *target_vma;
  870.  };
  871.  
  872. -/*
  873. - * Number of active kswapd threads
  874. - */
  875. -#define DEF_KSWAPD_THREADS_PER_NODE 1
  876. -int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
  877. -int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
  878. -
  879.  #ifdef ARCH_HAS_PREFETCH
  880.  #define prefetch_prev_lru_page(_page, _base, _field)           \
  881.     do {                                \
  882. @@ -485,10 +475,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  883.     long batch_size = shrinker->batch ? shrinker->batch
  884.                       : SHRINK_BATCH;
  885.     long scanned = 0, next_deferred;
  886. -   long min_cache_size = batch_size;
  887. -
  888. -   if (current_is_kswapd())
  889. -       min_cache_size = 0;
  890.  
  891.     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
  892.         nid = 0;
  893. @@ -568,7 +554,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  894.      * scanning at high prio and therefore should try to reclaim as much as
  895.      * possible.
  896.      */
  897. -   while (total_scan > min_cache_size ||
  898. +   while (total_scan >= batch_size ||
  899.            total_scan >= freeable) {
  900.         unsigned long ret;
  901.         unsigned long nr_to_scan = min(batch_size, total_scan);
  902. @@ -614,6 +600,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  903.     unsigned long ret, freed = 0;
  904.     int i;
  905.  
  906. +   /* allow shrink_slab_memcg for only kswapd */
  907. +   if (!current_is_kswapd())
  908. +       return 0;
  909. +
  910.     if (!mem_cgroup_online(memcg))
  911.         return 0;
  912.  
  913. @@ -642,8 +632,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  914.  
  915.         /* Call non-slab shrinkers even though kmem is disabled */
  916.         if (!memcg_kmem_enabled() &&
  917. -           !(shrinker->flags & SHRINKER_NONSLAB))
  918. +           !(shrinker->flags & SHRINKER_NONSLAB)) {
  919. +           clear_bit(i, map->map);
  920.             continue;
  921. +       }
  922.  
  923.         ret = do_shrink_slab(&sc, shrinker, priority);
  924.         if (ret == SHRINK_EMPTY) {
  925. @@ -1165,8 +1157,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  926.             goto keep;
  927.  
  928.         VM_BUG_ON_PAGE(PageActive(page), page);
  929. -       if (pgdat)
  930. -           VM_BUG_ON_PAGE(page_pgdat(page) != pgdat, page);
  931.  
  932.         nr_pages = compound_nr(page);
  933.  
  934. @@ -1179,6 +1169,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  935.         if (!sc->may_unmap && page_mapped(page))
  936.             goto keep_locked;
  937.  
  938. +#ifdef CONFIG_HUGEPAGE_POOL
  939. +       if (PageTransHuge(page))
  940. +           goto keep_locked;
  941. +#endif
  942. +
  943.         may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
  944.             (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  945.  
  946. @@ -1253,8 +1248,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  947.             /* Case 1 above */
  948.             if (current_is_kswapd() &&
  949.                 PageReclaim(page) &&
  950. -               (pgdat &&
  951. -               test_bit(PGDAT_WRITEBACK, &pgdat->flags))) {
  952. +               test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
  953.                 stat->nr_immediate++;
  954.                 goto activate_locked;
  955.  
  956. @@ -1326,6 +1320,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  957.                 if (!add_to_swap(page)) {
  958.                     if (!PageTransHuge(page))
  959.                         goto activate_locked_split;
  960. +#ifdef CONFIG_HUGEPAGE_POOL_DEBUG
  961. +                   BUG();
  962. +#endif
  963.                     /* Fallback to swap normal pages */
  964.                     if (split_huge_page_to_list(page,
  965.                                     page_list))
  966. @@ -1366,11 +1363,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  967.          */
  968.         if (page_mapped(page)) {
  969.             enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
  970. +           bool was_swapbacked = PageSwapBacked(page);
  971.  
  972.             if (unlikely(PageTransHuge(page)))
  973.                 flags |= TTU_SPLIT_HUGE_PMD;
  974. -           if (!try_to_unmap(page, flags, sc->target_vma)) {
  975. +           if (!try_to_unmap(page, flags)) {
  976.                 stat->nr_unmap_fail += nr_pages;
  977. +               if (!was_swapbacked && PageSwapBacked(page))
  978. +                   stat->nr_lazyfree_fail += nr_pages;
  979.                 goto activate_locked;
  980.             }
  981.         }
  982. @@ -1388,8 +1388,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  983.              */
  984.             if (page_is_file_cache(page) &&
  985.                 (!current_is_kswapd() || !PageReclaim(page) ||
  986. -                (pgdat &&
  987. -               !test_bit(PGDAT_DIRTY, &pgdat->flags)))) {
  988. +                !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
  989.                 /*
  990.                  * Immediately reclaim when written back.
  991.                  * Similar in principal to deactivate_page()
  992. @@ -1513,14 +1512,13 @@ free_it:
  993.         else
  994.             list_add(&page->lru, &free_pages);
  995.         /*
  996. -        * If pagelist are from multiple nodes, we should decrease
  997. +        * If pagelist are from multiple zones, we should decrease
  998.          * NR_ISOLATED_ANON + x on freed pages in here.
  999.          */
  1000.         if (!pgdat)
  1001.             dec_node_page_state(page, NR_ISOLATED_ANON +
  1002. -                   page_is_file_cache(page));
  1003. +                       page_is_file_cache(page));
  1004.         continue;
  1005. -
  1006.  activate_locked_split:
  1007.         /*
  1008.          * The tail pages that are failed to add into swap cache
  1009. @@ -1570,7 +1568,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  1010.         .may_unmap = 1,
  1011.     };
  1012.     struct reclaim_stat dummy_stat;
  1013. -   unsigned long ret;
  1014. +   unsigned long nr_reclaimed;
  1015.     struct page *page, *next;
  1016.     LIST_HEAD(clean_pages);
  1017.  
  1018. @@ -1582,16 +1580,25 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  1019.         }
  1020.     }
  1021.  
  1022. -   ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
  1023. +   nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
  1024.             TTU_IGNORE_ACCESS, &dummy_stat, true);
  1025.     list_splice(&clean_pages, page_list);
  1026. -   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
  1027. -   return ret;
  1028. +   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
  1029. +   /*
  1030. +    * Since lazyfree pages are isolated from file LRU from the beginning,
  1031. +    * they will rotate back to anonymous LRU in the end if it failed to
  1032. +    * discard so isolated count will be mismatched.
  1033. +    * Compensate the isolated count for both LRU lists.
  1034. +    */
  1035. +   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
  1036. +               dummy_stat.nr_lazyfree_fail);
  1037. +   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
  1038. +               -dummy_stat.nr_lazyfree_fail);
  1039. +
  1040. +   return nr_reclaimed;
  1041.  }
  1042.  
  1043. -#ifdef CONFIG_PROCESS_RECLAIM
  1044. -unsigned long reclaim_pages_from_list(struct list_head *page_list,
  1045. -                   struct vm_area_struct *vma)
  1046. +unsigned long reclaim_pages_from_list(struct list_head *page_list)
  1047.  {
  1048.     struct scan_control sc = {
  1049.         .gfp_mask = GFP_KERNEL,
  1050. @@ -1599,30 +1606,34 @@ unsigned long reclaim_pages_from_list(struct list_head *page_list,
  1051.         .may_writepage = 1,
  1052.         .may_unmap = 1,
  1053.         .may_swap = 1,
  1054. -       .target_vma = vma,
  1055.     };
  1056. -
  1057. +   struct reclaim_stat dummy_stat;
  1058.     unsigned long nr_reclaimed;
  1059. -   struct reclaim_stat stat;
  1060. -   struct page *page;
  1061. +   struct page *page, *next;
  1062. +   LIST_HEAD(unevictable_pages);
  1063.  
  1064. -   list_for_each_entry(page, page_list, lru)
  1065. +   list_for_each_entry_safe(page, next, page_list, lru) {
  1066. +       if (PageUnevictable(page)) {
  1067. +           list_move(&page->lru, &unevictable_pages);
  1068. +           continue;
  1069. +       }
  1070.         ClearPageActive(page);
  1071. +   }
  1072.  
  1073.     nr_reclaimed = shrink_page_list(page_list, NULL, &sc,
  1074. -           TTU_IGNORE_ACCESS, &stat, true);
  1075. +                   TTU_IGNORE_ACCESS, &dummy_stat, true);
  1076.  
  1077. +   list_splice(&unevictable_pages, page_list);
  1078.     while (!list_empty(page_list)) {
  1079.         page = lru_to_page(page_list);
  1080.         list_del(&page->lru);
  1081.         dec_node_page_state(page, NR_ISOLATED_ANON +
  1082. -               page_is_file_cache(page));
  1083. +                   page_is_file_cache(page));
  1084.         putback_lru_page(page);
  1085.     }
  1086.  
  1087.     return nr_reclaimed;
  1088.  }
  1089. -#endif
  1090.  
  1091.  /*
  1092.   * Attempt to remove the specified page from its LRU.  Only take this page
  1093. @@ -1770,7 +1781,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  1094.         nr_pages = compound_nr(page);
  1095.         total_scan += nr_pages;
  1096.  
  1097. +#ifdef CONFIG_HUGEPAGE_POOL
  1098. +       if (page_zonenum(page) > sc->reclaim_idx
  1099. +           || PageTransHuge(page)) {
  1100. +#else
  1101.         if (page_zonenum(page) > sc->reclaim_idx) {
  1102. +#endif
  1103.             list_move(&page->lru, &pages_skipped);
  1104.             nr_skipped[page_zonenum(page)] += nr_pages;
  1105.             continue;
  1106. @@ -2026,13 +2042,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  1107.         if (stalled)
  1108.             return 0;
  1109.  
  1110. -       /* We are about to die and free our memory. Return now. */
  1111. -       if (fatal_signal_pending(current))
  1112. -           return SWAP_CLUSTER_MAX;
  1113. -
  1114.         /* wait a bit for the reclaimer. */
  1115.         msleep(100);
  1116.         stalled = true;
  1117. +
  1118. +       /* We are about to die and free our memory. Return now. */
  1119. +       if (fatal_signal_pending(current))
  1120. +           return SWAP_CLUSTER_MAX;
  1121.     }
  1122.  
  1123.     lru_add_drain();
  1124. @@ -2355,6 +2371,214 @@ enum scan_balance {
  1125.     SCAN_FILE,
  1126.  };
  1127.  
  1128. +/* mem_boost throttles only kswapd's behavior */
  1129. +enum mem_boost {
  1130. +   NO_BOOST,
  1131. +   BOOST_MID = 1,
  1132. +   BOOST_HIGH = 2,
  1133. +   BOOST_KILL = 3,
  1134. +};
  1135. +static int mem_boost_mode = NO_BOOST;
  1136. +static unsigned long last_mode_change;
  1137. +static bool am_app_launch = false;
  1138. +
  1139. +#define MEM_BOOST_MAX_TIME (5 * HZ) /* 5 sec */
  1140. +
  1141. +#if CONFIG_KSWAPD_CPU
  1142. +static int set_kswapd_cpu_affinity_as_config(void);
  1143. +static int set_kswapd_cpu_affinity_as_boost(void);
  1144. +#endif
  1145. +
  1146. +#ifdef CONFIG_SYSFS
  1147. +static ssize_t mem_boost_mode_show(struct kobject *kobj,
  1148. +                   struct kobj_attribute *attr, char *buf)
  1149. +{
  1150. +   if (mem_boost_mode != NO_BOOST &&
  1151. +       time_after(jiffies, last_mode_change + MEM_BOOST_MAX_TIME)) {
  1152. +       mem_boost_mode = NO_BOOST;
  1153. +#ifdef CONFIG_KSWAPD_CPU
  1154. +       set_kswapd_cpu_affinity_as_config();
  1155. +#endif
  1156. +   }
  1157. +   return sprintf(buf, "%d\n", mem_boost_mode);
  1158. +}
  1159. +
  1160. +static ssize_t mem_boost_mode_store(struct kobject *kobj,
  1161. +                    struct kobj_attribute *attr,
  1162. +                    const char *buf, size_t count)
  1163. +{
  1164. +   int mode;
  1165. +   int err;
  1166. +
  1167. +   err = kstrtoint(buf, 10, &mode);
  1168. +   if (err || mode > BOOST_KILL || mode < NO_BOOST)
  1169. +       return -EINVAL;
  1170. +
  1171. +   mem_boost_mode = mode;
  1172. +   last_mode_change = jiffies;
  1173. +#ifdef CONFIG_ION_RBIN_HEAP
  1174. +   if (mem_boost_mode >= BOOST_HIGH)
  1175. +       wake_ion_rbin_heap_prereclaim();
  1176. +#endif
  1177. +#if CONFIG_KSWAPD_CPU
  1178. +   if (mem_boost_mode >= BOOST_HIGH)
  1179. +       set_kswapd_cpu_affinity_as_boost();
  1180. +   else if (mem_boost_mode == NO_BOOST)
  1181. +       set_kswapd_cpu_affinity_as_config();
  1182. +#endif
  1183. +   return count;
  1184. +}
  1185. +
  1186. +static inline bool mem_boost_pgdat_wmark(struct pglist_data *pgdat)
  1187. +{
  1188. +   int z;
  1189. +   struct zone *zone;
  1190. +   unsigned long mark;
  1191. +
  1192. +   for (z = 0; z < MAX_NR_ZONES; z++) {
  1193. +       zone = &pgdat->node_zones[z];
  1194. +       if (!managed_zone(zone))
  1195. +           continue;
  1196. +       mark = low_wmark_pages(zone); //TODO: low, high, or (low + high)/2
  1197. +       if (zone_watermark_ok_safe(zone, 0, mark, 0))
  1198. +           return true;
  1199. +   }
  1200. +   return false;
  1201. +}
  1202. +
  1203. +#define MB_TO_PAGES(x) ((x) << (20 - PAGE_SHIFT))
  1204. +#define GB_TO_PAGES(x) ((x) << (30 - PAGE_SHIFT))
  1205. +static unsigned long low_threshold;
  1206. +
  1207. +static inline bool is_too_low_file(struct pglist_data *pgdat)
  1208. +{
  1209. +       unsigned long pgdatfile;
  1210. +       if (!low_threshold) {
  1211. +               if (totalram_pages() > GB_TO_PAGES(2))
  1212. +                       low_threshold = MB_TO_PAGES(600);
  1213. +               else if (totalram_pages() > GB_TO_PAGES(1))
  1214. +                       low_threshold = MB_TO_PAGES(300);
  1215. +               else
  1216. +                       low_threshold = MB_TO_PAGES(200);
  1217. +       }
  1218. +
  1219. +       pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
  1220. +                  node_page_state(pgdat, NR_INACTIVE_FILE);
  1221. +       return pgdatfile < low_threshold;
  1222. +}
  1223. +
  1224. +inline bool need_memory_boosting(struct pglist_data *pgdat)
  1225. +{
  1226. +   bool ret;
  1227. +
  1228. +   if (mem_boost_mode != NO_BOOST &&
  1229. +       (time_after(jiffies, last_mode_change + MEM_BOOST_MAX_TIME) ||
  1230. +       is_too_low_file(pgdat))) {
  1231. +       mem_boost_mode = NO_BOOST;
  1232. +#if CONFIG_KSWAPD_CPU
  1233. +       set_kswapd_cpu_affinity_as_config();
  1234. +#endif
  1235. +   }
  1236. +
  1237. +   switch (mem_boost_mode) {
  1238. +       case BOOST_KILL:
  1239. +       case BOOST_HIGH:
  1240. +           ret = true;
  1241. +           break;
  1242. +       case BOOST_MID:
  1243. +#ifndef CONFIG_NEED_MULTIPLE_NODES
  1244. +           if (!pgdat)
  1245. +               pgdat = &contig_page_data;
  1246. +#endif
  1247. +           ret = mem_boost_pgdat_wmark(pgdat) ? false : true;
  1248. +           break;
  1249. +       case NO_BOOST:
  1250. +       default:
  1251. +           ret = false;
  1252. +           break;
  1253. +   }
  1254. +   return ret;
  1255. +}
  1256. +
  1257. +ATOMIC_NOTIFIER_HEAD(am_app_launch_notifier);
  1258. +
  1259. +int am_app_launch_notifier_register(struct notifier_block *nb)
  1260. +{
  1261. +   return atomic_notifier_chain_register(&am_app_launch_notifier, nb);
  1262. +}
  1263. +
  1264. +int am_app_launch_notifier_unregister(struct notifier_block *nb)
  1265. +{
  1266. +   return  atomic_notifier_chain_unregister(&am_app_launch_notifier, nb);
  1267. +}
  1268. +
  1269. +static ssize_t am_app_launch_show(struct kobject *kobj,
  1270. +                 struct kobj_attribute *attr, char *buf)
  1271. +{
  1272. +   int ret;
  1273. +
  1274. +   ret = am_app_launch ? 1 : 0;
  1275. +   return sprintf(buf, "%d\n", ret);
  1276. +}
  1277. +
  1278. +static int notify_app_launch_started(void)
  1279. +{
  1280. +   trace_printk("%s\n", "am_app_launch started");
  1281. +   atomic_notifier_call_chain(&am_app_launch_notifier, 1, NULL);
  1282. +   return 0;
  1283. +}
  1284. +
  1285. +static int notify_app_launch_finished(void)
  1286. +{
  1287. +   trace_printk("%s\n", "am_app_launch finished");
  1288. +   atomic_notifier_call_chain(&am_app_launch_notifier, 0, NULL);
  1289. +   return 0;
  1290. +}
  1291. +
  1292. +static ssize_t am_app_launch_store(struct kobject *kobj,
  1293. +                  struct kobj_attribute *attr,
  1294. +                  const char *buf, size_t count)
  1295. +{
  1296. +   int mode;
  1297. +   int err;
  1298. +   bool am_app_launch_new;
  1299. +
  1300. +   err = kstrtoint(buf, 10, &mode);
  1301. +   if (err || (mode != 0 && mode != 1))
  1302. +       return -EINVAL;
  1303. +
  1304. +   am_app_launch_new = mode ? true : false;
  1305. +   trace_printk("am_app_launch %d -> %d\n", am_app_launch,
  1306. +            am_app_launch_new);
  1307. +   if (am_app_launch != am_app_launch_new) {
  1308. +       if (am_app_launch_new)
  1309. +           notify_app_launch_started();
  1310. +       else
  1311. +           notify_app_launch_finished();
  1312. +   }
  1313. +   am_app_launch = am_app_launch_new;
  1314. +
  1315. +   return count;
  1316. +}
  1317. +
  1318. +#define MEM_BOOST_ATTR(_name) \
  1319. +   static struct kobj_attribute _name##_attr = \
  1320. +       __ATTR(_name, 0644, _name##_show, _name##_store)
  1321. +MEM_BOOST_ATTR(mem_boost_mode);
  1322. +MEM_BOOST_ATTR(am_app_launch);
  1323. +
  1324. +static struct attribute *vmscan_attrs[] = {
  1325. +   &mem_boost_mode_attr.attr,
  1326. +   &am_app_launch_attr.attr,
  1327. +   NULL,
  1328. +};
  1329. +
  1330. +static struct attribute_group vmscan_attr_group = {
  1331. +   .attrs = vmscan_attrs,
  1332. +   .name = "vmscan",
  1333. +};
  1334. +#endif
  1335. +
  1336.  /*
  1337.   * Determine how aggressively the anon and file LRU lists should be
  1338.   * scanned.  The relative value of each set of LRU lists is determined
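
Reviewer note: this hunk adds a sysfs-driven mem_boost knob (attribute group "vmscan"; the kobject it hangs off is registered outside this excerpt) plus an am_app_launch notifier. While boosted, kswapd is steered toward file-backed reclaim (see the SCAN_FILE shortcut added to get_scan_count() below) and the direct-reclaim path skips drain_all_pages(); any boost silently expires after MEM_BOOST_MAX_TIME, or earlier when file cache falls below the per-RAM-size threshold. A userspace model of just the expiry logic, ignoring the BOOST_MID watermark test and the low-file cutoff:

/*
 * Model of the mem_boost expiry above: a boost set more than
 * MEM_BOOST_MAX_TIME ago is dropped on the next query. Jiffies are
 * replaced with plain seconds for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

enum mem_boost { NO_BOOST, BOOST_MID, BOOST_HIGH, BOOST_KILL };

#define MEM_BOOST_MAX_TIME 5 /* seconds */

static enum mem_boost mode = NO_BOOST;
static long last_change;

static bool need_memory_boosting(long now)
{
    if (mode != NO_BOOST && now > last_change + MEM_BOOST_MAX_TIME)
        mode = NO_BOOST;                /* boost silently expires */
    return mode == BOOST_HIGH || mode == BOOST_KILL;
}

int main(void)
{
    mode = BOOST_HIGH;
    last_change = 0;

    printf("t=1s: boosting=%d\n", need_memory_boosting(1));  /* 1 */
    printf("t=9s: boosting=%d\n", need_memory_boosting(9));  /* 0, expired */
    return 0;
}
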
  1339. @@ -2449,6 +2673,11 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
  1340.         }
  1341.     }
  1342.  
  1343. +   if (current_is_kswapd() && need_memory_boosting(pgdat)) {
  1344. +       scan_balance = SCAN_FILE;
  1345. +       goto out;
  1346. +   }
  1347. +
  1348.     /*
  1349.      * If there is enough inactive page cache, i.e. if the size of the
  1350.      * inactive list is greater than that of the active list *and* the
  1351. @@ -2522,14 +2751,14 @@ out:
  1352.     for_each_evictable_lru(lru) {
  1353.         int file = is_file_lru(lru);
  1354.         unsigned long lruvec_size;
  1355. +       unsigned long low, min;
  1356.         unsigned long scan;
  1357. -       unsigned long protection;
  1358.  
  1359.         lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
  1360. -       protection = mem_cgroup_protection(memcg,
  1361. -                          sc->memcg_low_reclaim);
  1362. +       mem_cgroup_protection(sc->target_mem_cgroup, memcg,
  1363. +                     &min, &low);
  1364.  
  1365. -       if (protection) {
  1366. +       if (min || low) {
  1367.             /*
  1368.              * Scale a cgroup's reclaim pressure by proportioning
  1369.              * its current usage to its memory.low or memory.min
  1370. @@ -2560,6 +2789,15 @@ out:
  1371.              * hard protection.
  1372.              */
  1373.             unsigned long cgroup_size = mem_cgroup_size(memcg);
  1374. +           unsigned long protection;
  1375. +
  1376. +           /* memory.low scaling, make sure we retry before OOM */
  1377. +           if (!sc->memcg_low_reclaim && low > min) {
  1378. +               protection = low;
  1379. +               sc->memcg_low_skipped = 1;
  1380. +           } else {
  1381. +               protection = min;
  1382. +           }
  1383.  
  1384.             /* Avoid TOCTOU with earlier protection check */
  1385.             cgroup_size = max(cgroup_size, protection);
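
For a concrete feel of the min/low selection added above, consider hypothetical limits of memory.min = 100 MiB and memory.low = 300 MiB: on a normal reclaim pass protection becomes the larger memory.low value and memcg_low_skipped is set, so a later pass with memcg_low_reclaim can fall back to memory.min before declaring OOM. A standalone model of just that branch, with made-up values:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long min = 100, low = 300;   /* MiB, hypothetical memcg limits */
	bool memcg_low_reclaim = false;       /* first, non-low reclaim pass */
	bool memcg_low_skipped = false;
	unsigned long protection;

	if (!memcg_low_reclaim && low > min) {
		protection = low;             /* honour memory.low first... */
		memcg_low_skipped = true;     /* ...but remember to retry below it before OOM */
	} else {
		protection = min;             /* low-reclaim retry: only memory.min protects */
	}

	printf("protection = %lu MiB, low_skipped = %d\n", protection, memcg_low_skipped);
	return 0;
}
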
  1386. @@ -2621,6 +2859,65 @@ out:
  1387.     }
  1388.  }
  1389.  
  1390. +#ifdef CONFIG_MEMCG_HEIMDALL
  1391. +void forced_shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
  1392. +                 int type, unsigned long nr_requested)
  1393. +{
  1394. +   struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
  1395. +   unsigned long nr[NR_LRU_LISTS] = {0,};
  1396. +   unsigned long nr_to_scan;
  1397. +   enum lru_list lru;
  1398. +   unsigned long nr_reclaimed = 0;
  1399. +   struct blk_plug plug;
  1400. +   unsigned long anon = 0, file = 0;
  1401. +   struct scan_control sc = {
  1402. +       .nr_to_reclaim = SWAP_CLUSTER_MAX,
  1403. +       .gfp_mask = GFP_KERNEL,
  1404. +       .reclaim_idx = MAX_NR_ZONES - 1,
  1405. +       .target_mem_cgroup = memcg,
  1406. +       .priority = DEF_PRIORITY,
  1407. +       .may_writepage = !laptop_mode,
  1408. +       .may_unmap = 1,
  1409. +       .may_swap = 1,
  1410. +   };
  1411. +
  1412. +   if (type == MEMCG_HEIMDALL_SHRINK_ANON) {
  1413. +       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
  1414. +           lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
  1415. +       nr[LRU_ACTIVE_ANON] = nr[LRU_INACTIVE_ANON] = anon;
  1416. +       nr[LRU_ACTIVE_FILE] = nr[LRU_INACTIVE_FILE] = 0;
  1417. +   } else if (type == MEMCG_HEIMDALL_SHRINK_FILE) {
  1418. +       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
  1419. +           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
  1420. +       nr[LRU_ACTIVE_ANON] = nr[LRU_INACTIVE_ANON] = 0;
  1421. +       nr[LRU_ACTIVE_FILE] = nr[LRU_INACTIVE_FILE] = file;
  1422. +   }
  1423. +
  1424. +   trace_printk("%s heimdall start %d %lu %lu %lu\n", __func__, type, nr_requested, anon, file);
  1425. +   blk_start_plug(&plug);
  1426. +   while (nr[LRU_INACTIVE_ANON] > 0 || nr[LRU_INACTIVE_FILE] > 0) {
  1427. +       for_each_evictable_lru(lru) {
  1428. +           if (nr[lru]) {
  1429. +               nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
  1430. +               nr[lru] -= nr_to_scan;
  1431. +
  1432. +               nr_reclaimed += shrink_list(lru, nr_to_scan,
  1433. +                               lruvec, &sc);
  1434. +           }
  1435. +       }
  1436. +
  1437. +       if (nr_reclaimed >= nr_requested)
  1438. +           break;
  1439. +
  1440. +       cond_resched();
  1441. +   }
  1442. +   blk_finish_plug(&plug);
  1443. +   sc.nr_reclaimed += nr_reclaimed;
  1444. +   trace_printk("%s end %d %lu %lu %lu\n", __func__, type, nr_reclaimed,
  1445. +       nr[LRU_INACTIVE_ANON], nr[LRU_INACTIVE_FILE]);
  1446. +}
  1447. +#endif
  1448. +
  1449.  /*
  1450.   * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
  1451.   */
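
The reclaim loop in forced_shrink_node_memcg() above walks the selected LRU lists in SWAP_CLUSTER_MAX-sized batches and stops once nr_requested pages have been reclaimed. A simplified userspace model of that batching, under the (unrealistic) assumption that every scanned page is reclaimed:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long nr[2] = { 1000, 1000 }; /* e.g. active/inactive file pages */
	unsigned long nr_requested = 200;     /* pages the caller asked for */
	unsigned long nr_reclaimed = 0;

	while (nr[0] > 0 || nr[1] > 0) {
		for (int i = 0; i < 2; i++) {
			if (!nr[i])
				continue;
			unsigned long batch = min_ul(nr[i], SWAP_CLUSTER_MAX);
			nr[i] -= batch;
			nr_reclaimed += batch;
		}
		if (nr_reclaimed >= nr_requested)
			break;
	}
	printf("reclaimed %lu pages (requested %lu)\n", nr_reclaimed, nr_requested);
	return 0;
}
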
  1452. @@ -2731,6 +3028,9 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
  1453.     blk_finish_plug(&plug);
  1454.     sc->nr_reclaimed += nr_reclaimed;
  1455.  
  1456. +   if (need_memory_boosting(NULL))
  1457. +       return;
  1458. +
  1459.     /*
  1460.      * Even if we did not try to evict anon pages at all, we want to
  1461.      * rebalance the anon lru active/inactive ratio.
  1462. @@ -3347,7 +3647,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  1463.         .priority = DEF_PRIORITY,
  1464.         .may_writepage = !laptop_mode,
  1465.         .may_unmap = 1,
  1466. +#ifdef CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY
  1467. +       .may_swap = 0,
  1468. +#else
  1469.         .may_swap = 1,
  1470. +#endif
  1471.     };
  1472.  
  1473.     /*
  1474. @@ -3954,6 +4258,65 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  1475.     finish_wait(&pgdat->kswapd_wait, &wait);
  1476.  }
  1477.  
  1478. +#if CONFIG_KSWAPD_CPU
  1479. +static struct cpumask kswapd_cpumask;
  1480. +
  1481. +#define KSWAPD_CPU_BIG 0xF0
  1482. +static struct cpumask kswapd_cpumask_boost;
  1483. +
  1484. +static void init_kswapd_cpumask(void)
  1485. +{
  1486. +   int i;
  1487. +
  1488. +   cpumask_clear(&kswapd_cpumask);
  1489. +   for (i = 0; i < nr_cpu_ids; i++) {
  1490. +       if (CONFIG_KSWAPD_CPU & (1 << i))
  1491. +           cpumask_set_cpu(i, &kswapd_cpumask);
  1492. +   }
  1493. +
  1494. +   cpumask_clear(&kswapd_cpumask_boost);
  1495. +   for (i = 0; i < nr_cpu_ids; i++) {
  1496. +       if (KSWAPD_CPU_BIG & (1 << i))
  1497. +           cpumask_set_cpu(i, &kswapd_cpumask_boost);
  1498. +   }
  1499. +}
  1500. +
  1501. +/* mirrors the logic of kswapd_cpu_online(unsigned int cpu) */
  1502. +static int set_kswapd_cpu_affinity_as_config(void)
  1503. +{
  1504. +   int nid;
  1505. +
  1506. +   for_each_node_state(nid, N_MEMORY) {
  1507. +       pg_data_t *pgdat = NODE_DATA(nid);
  1508. +       const struct cpumask *mask;
  1509. +
  1510. +       mask = &kswapd_cpumask;
  1511. +
  1512. +       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  1513. +           /* One of our CPUs online: restore mask */
  1514. +           set_cpus_allowed_ptr(pgdat->kswapd, mask);
  1515. +   }
  1516. +   return 0;
  1517. +}
  1518. +
  1519. +static int set_kswapd_cpu_affinity_as_boost(void)
  1520. +{
  1521. +   int nid;
  1522. +
  1523. +   for_each_node_state(nid, N_MEMORY) {
  1524. +       pg_data_t *pgdat = NODE_DATA(nid);
  1525. +       const struct cpumask *mask;
  1526. +
  1527. +       mask = &kswapd_cpumask_boost;
  1528. +
  1529. +       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  1530. +           /* One of our CPUs online: restore mask */
  1531. +           set_cpus_allowed_ptr(pgdat->kswapd, mask);
  1532. +   }
  1533. +   return 0;
  1534. +}
  1535. +#endif
  1536. +
  1537.  /*
  1538.   * The background pageout daemon, started as a kernel thread
  1539.   * from the init process.
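
CONFIG_KSWAPD_CPU and KSWAPD_CPU_BIG above are plain CPU bitmasks: bit i selects CPU i for kswapd's affinity. A quick standalone illustration, using a hypothetical CONFIG_KSWAPD_CPU of 0x0F (only KSWAPD_CPU_BIG = 0xF0 is visible in the patch):

#include <stdio.h>

#define CONFIG_KSWAPD_CPU 0x0F  /* assumed little-core mask */
#define KSWAPD_CPU_BIG    0xF0  /* boost mask from the patch */

static void print_mask(const char *name, unsigned int mask)
{
	printf("%s:", name);
	for (int cpu = 0; cpu < 8; cpu++)
		if (mask & (1U << cpu))
			printf(" %d", cpu);
	printf("\n");
}

int main(void)
{
	print_mask("default kswapd CPUs", CONFIG_KSWAPD_CPU);  /* -> 0 1 2 3 */
	print_mask("boost kswapd CPUs", KSWAPD_CPU_BIG);       /* -> 4 5 6 7 */
	return 0;
}
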
  1540. @@ -3973,7 +4336,11 @@ static int kswapd(void *p)
  1541.     unsigned int classzone_idx = MAX_NR_ZONES - 1;
  1542.     pg_data_t *pgdat = (pg_data_t*)p;
  1543.     struct task_struct *tsk = current;
  1544. +#if CONFIG_KSWAPD_CPU
  1545. +   const struct cpumask *cpumask = &kswapd_cpumask;
  1546. +#else
  1547.     const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
  1548. +#endif
  1549.  
  1550.     if (!cpumask_empty(cpumask))
  1551.         set_cpus_allowed_ptr(tsk, cpumask);
  1552. @@ -4133,116 +4500,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  1553.  }
  1554.  #endif /* CONFIG_HIBERNATION */
  1555.  
  1556. -#ifdef CONFIG_MULTIPLE_KSWAPD
  1557. -static void update_kswapd_threads_node(int nid)
  1558. -{
  1559. -   pg_data_t *pgdat;
  1560. -   int drop, increase;
  1561. -   int last_idx, start_idx, hid;
  1562. -   int nr_threads = kswapd_threads_current;
  1563. -
  1564. -   pgdat = NODE_DATA(nid);
  1565. -   last_idx = nr_threads - 1;
  1566. -   if (kswapd_threads < nr_threads) {
  1567. -       drop = nr_threads - kswapd_threads;
  1568. -       for (hid = last_idx; hid > (last_idx - drop); hid--) {
  1569. -           if (pgdat->mkswapd[hid]) {
  1570. -               kthread_stop(pgdat->mkswapd[hid]);
  1571. -               pgdat->mkswapd[hid] = NULL;
  1572. -           }
  1573. -       }
  1574. -   } else {
  1575. -       increase = kswapd_threads - nr_threads;
  1576. -       start_idx = last_idx + 1;
  1577. -       for (hid = start_idx; hid < (start_idx + increase); hid++) {
  1578. -           pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat,
  1579. -                       "kswapd%d:%d", nid, hid);
  1580. -           if (IS_ERR(pgdat->mkswapd[hid])) {
  1581. -               pr_err("Failed to start kswapd%d on node %d\n",
  1582. -                   hid, nid);
  1583. -               pgdat->mkswapd[hid] = NULL;
  1584. -               /*
  1585. -                * We are out of resources. Do not start any
  1586. -                * more threads.
  1587. -                */
  1588. -               break;
  1589. -           }
  1590. -       }
  1591. -   }
  1592. -}
  1593. -
  1594. -void update_kswapd_threads(void)
  1595. -{
  1596. -   int nid;
  1597. -
  1598. -   if (kswapd_threads_current == kswapd_threads)
  1599. -       return;
  1600. -
  1601. -   /*
  1602. -    * Hold the memory hotplug lock to avoid racing with memory
  1603. -    * hotplug initiated updates
  1604. -    */
  1605. -   mem_hotplug_begin();
  1606. -   for_each_node_state(nid, N_MEMORY)
  1607. -       update_kswapd_threads_node(nid);
  1608. -
  1609. -   pr_info("kswapd_thread count changed, old:%d new:%d\n",
  1610. -       kswapd_threads_current, kswapd_threads);
  1611. -   kswapd_threads_current = kswapd_threads;
  1612. -   mem_hotplug_done();
  1613. -}
  1614. -
  1615. -static int multi_kswapd_run(int nid)
  1616. -{
  1617. -   pg_data_t *pgdat = NODE_DATA(nid);
  1618. -   int hid, nr_threads = kswapd_threads;
  1619. -   int ret = 0;
  1620. -
  1621. -   pgdat->mkswapd[0] = pgdat->kswapd;
  1622. -   for (hid = 1; hid < nr_threads; ++hid) {
  1623. -       pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
  1624. -                               nid, hid);
  1625. -       if (IS_ERR(pgdat->mkswapd[hid])) {
  1626. -           /* failure at boot is fatal */
  1627. -           WARN_ON(system_state < SYSTEM_RUNNING);
  1628. -           pr_err("Failed to start kswapd%d on node %d\n",
  1629. -               hid, nid);
  1630. -           ret = PTR_ERR(pgdat->mkswapd[hid]);
  1631. -           pgdat->mkswapd[hid] = NULL;
  1632. -       }
  1633. -   }
  1634. -   kswapd_threads_current = nr_threads;
  1635. -
  1636. -   return ret;
  1637. -}
  1638. -
  1639. -static void multi_kswapd_stop(int nid)
  1640. -{
  1641. -   int hid = 0;
  1642. -   int nr_threads = kswapd_threads_current;
  1643. -   struct task_struct *kswapd;
  1644. -
  1645. -   NODE_DATA(nid)->mkswapd[hid] = NULL;
  1646. -   for (hid = 1; hid < nr_threads; hid++) {
  1647. -       kswapd = NODE_DATA(nid)->mkswapd[hid];
  1648. -       if (kswapd) {
  1649. -           kthread_stop(kswapd);
  1650. -           NODE_DATA(nid)->mkswapd[hid] = NULL;
  1651. -       }
  1652. -   }
  1653. -}
  1654. -
  1655. -static void multi_kswapd_cpu_online(pg_data_t *pgdat,
  1656. -                   const struct cpumask *mask)
  1657. -{
  1658. -   int hid;
  1659. -   int nr_threads = kswapd_threads_current;
  1660. -
  1661. -   for (hid = 1; hid < nr_threads; hid++)
  1662. -       set_cpus_allowed_ptr(pgdat->mkswapd[hid], mask);
  1663. -}
  1664. -#endif
  1665. -
  1666.  /* It's optimal to keep kswapds on the same CPUs as their memory, but
  1667.     not required for correctness.  So if the last cpu in a node goes
  1668.     away, we get changed to run anywhere: as the first one comes back,
  1669. @@ -4255,13 +4512,15 @@ static int kswapd_cpu_online(unsigned int cpu)
  1670.         pg_data_t *pgdat = NODE_DATA(nid);
  1671.         const struct cpumask *mask;
  1672.  
  1673. +#if CONFIG_KSWAPD_CPU
  1674. +       mask = &kswapd_cpumask;
  1675. +#else
  1676.         mask = cpumask_of_node(pgdat->node_id);
  1677. +#endif
  1678.  
  1679. -       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
  1680. +       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  1681.             /* One of our CPUs online: restore mask */
  1682.             set_cpus_allowed_ptr(pgdat->kswapd, mask);
  1683. -           multi_kswapd_cpu_online(pgdat, mask);
  1684. -       }
  1685.     }
  1686.     return 0;
  1687.  }
  1688. @@ -4278,17 +4537,14 @@ int kswapd_run(int nid)
  1689.     if (pgdat->kswapd)
  1690.         return 0;
  1691.  
  1692. -   pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d:0", nid);
  1693. +   pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
  1694.     if (IS_ERR(pgdat->kswapd)) {
  1695.         /* failure at boot is fatal */
  1696.         BUG_ON(system_state < SYSTEM_RUNNING);
  1697.         pr_err("Failed to start kswapd on node %d\n", nid);
  1698.         ret = PTR_ERR(pgdat->kswapd);
  1699.         pgdat->kswapd = NULL;
  1700. -       return ret;
  1701.     }
  1702. -   ret = multi_kswapd_run(nid);
  1703. -
  1704.     return ret;
  1705.  }
  1706.  
  1707. @@ -4304,14 +4560,15 @@ void kswapd_stop(int nid)
  1708.         kthread_stop(kswapd);
  1709.         NODE_DATA(nid)->kswapd = NULL;
  1710.     }
  1711. -
  1712. -   multi_kswapd_stop(nid);
  1713.  }
  1714.  
  1715.  static int __init kswapd_init(void)
  1716.  {
  1717.     int nid, ret;
  1718.  
  1719. +#if CONFIG_KSWAPD_CPU
  1720. +   init_kswapd_cpumask();
  1721. +#endif
  1722.     swap_setup();
  1723.     for_each_node_state(nid, N_MEMORY)
  1724.         kswapd_run(nid);
  1725. @@ -4319,6 +4576,10 @@ static int __init kswapd_init(void)
  1726.                     "mm/vmscan:online", kswapd_cpu_online,
  1727.                     NULL);
  1728.     WARN_ON(ret < 0);
  1729. +#ifdef CONFIG_SYSFS
  1730. +   if (sysfs_create_group(mm_kobj, &vmscan_attr_group))
  1731. +       pr_err("vmscan: register sysfs failed\n");
  1732. +#endif
  1733.     return 0;
  1734.  }
  1735.  
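
Because the attribute group is registered on mm_kobj with .name = "vmscan", the new knobs should surface as /sys/kernel/mm/vmscan/mem_boost_mode and /sys/kernel/mm/vmscan/am_app_launch (path inferred from the code, not verified on a device). A small userspace sketch that toggles the launch hint:

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	/* Mark an app launch as started, then finished. */
	write_knob("/sys/kernel/mm/vmscan/am_app_launch", "1");
	write_knob("/sys/kernel/mm/vmscan/am_app_launch", "0");
	return 0;
}
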