slub.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3.  * SLUB: A slab allocator that limits cache line use instead of queuing
  4.  * objects in per cpu and per node lists.
  5.  *
  6.  * The allocator synchronizes using per slab locks or atomic operations
  7.  * and only uses a centralized lock to manage a pool of partial slabs.
  8.  *
  9.  * (C) 2007 SGI, Christoph Lameter
  10.  * (C) 2011 Linux Foundation, Christoph Lameter
  11.  */
  12.  
  13. #include <linux/mm.h>
  14. #include <linux/swap.h> /* struct reclaim_state */
  15. #include <linux/module.h>
  16. #include <linux/bit_spinlock.h>
  17. #include <linux/interrupt.h>
  18. #include <linux/bitops.h>
  19. #include <linux/slab.h>
  20. #include "slab.h"
  21. #include <linux/proc_fs.h>
  22. #include <linux/seq_file.h>
  23. #include <linux/kasan.h>
  24. #include <linux/cpu.h>
  25. #include <linux/cpuset.h>
  26. #include <linux/mempolicy.h>
  27. #include <linux/ctype.h>
  28. #include <linux/debugobjects.h>
  29. #include <linux/kallsyms.h>
  30. #include <linux/memory.h>
  31. #include <linux/math64.h>
  32. #include <linux/fault-inject.h>
  33. #include <linux/stacktrace.h>
  34. #include <linux/prefetch.h>
  35. #include <linux/memcontrol.h>
  36. #include <linux/random.h>
  37.  
  38. #include <trace/events/kmem.h>
  39.  
  40. #include "internal.h"
  41.  
  42. /*
  43.  * Lock order:
  44.  *   1. slab_mutex (Global Mutex)
  45.  *   2. node->list_lock
  46.  *   3. slab_lock(page) (Only on some arches and for debugging)
  47.  *
  48.  *   slab_mutex
  49.  *
  50.  *   The role of the slab_mutex is to protect the list of all the slabs
  51.  *   and to synchronize major metadata changes to slab cache structures.
  52.  *
  53.  *   The slab_lock is only used for debugging and on arches that do not
  54.  *   have the ability to do a cmpxchg_double. It only protects:
  55.  *  A. page->freelist   -> List of free objects in a page
  56.  *  B. page->inuse      -> Number of objects in use
  57.  *  C. page->objects    -> Number of objects in page
  58.  *  D. page->frozen     -> frozen state
  59.  *
  60.  *   If a slab is frozen then it is exempt from list management. It is not
  61.  *   on any list except per cpu partial list. The processor that froze the
  62.  *   slab is the one who can perform list operations on the page. Other
  63.  *   processors may put objects onto the freelist but the processor that
  64.  *   froze the slab is the only one that can retrieve the objects from the
  65.  *   page's freelist.
  66.  *
  67.  *   The list_lock protects the partial and full list on each node and
  68.  *   the partial slab counter. If taken then no new slabs may be added or
  69.  *   removed from the lists, nor may the number of partial slabs be modified.
  70.  *   (Note that the total number of slabs is an atomic value that may be
  71.  *   modified without taking the list lock).
  72.  *
  73.  *   The list_lock is a centralized lock and thus we avoid taking it as
  74.  *   much as possible. As long as SLUB does not have to handle partial
  75.  *   slabs, operations can continue without any centralized lock. F.e.
  76.  *   allocating a long series of objects that fill up slabs does not require
  77.  *   the list lock.
  78.  *   Interrupts are disabled during allocation and deallocation in order to
  79.  *   make the slab allocator safe to use in the context of an irq. In addition
  80.  *   interrupts are disabled to ensure that the processor does not change
  81.  *   while handling per_cpu slabs, due to kernel preemption.
  82.  *
  83.  * SLUB assigns one slab for allocation to each processor.
  84.  * Allocations only occur from these slabs called cpu slabs.
  85.  *
  86.  * Slabs with free elements are kept on a partial list and during regular
  87.  * operations no list for full slabs is used. If an object in a full slab is
  88.  * freed then the slab will show up again on the partial lists.
  89.  * We do, however, track full slabs for debugging purposes, because otherwise we
  90.  * cannot scan all objects.
  91.  *
  92.  * Slabs are freed when they become empty. Teardown and setup is
  93.  * minimal so we rely on the page allocators per cpu caches for
  94.  * fast frees and allocs.
  95.  *
  96.  * Overloading of page flags that are otherwise used for LRU management.
  97.  *
  98.  * PageActive       The slab is frozen and exempt from list processing.
  99.  *          This means that the slab is dedicated to a purpose
  100.  *          such as satisfying allocations for a specific
  101.  *          processor. Objects may be freed in the slab while
  102.  *          it is frozen but slab_free will then skip the usual
  103.  *          list operations. It is up to the processor holding
  104.  *          the slab to integrate the slab into the slab lists
  105.  *          when the slab is no longer needed.
  106.  *
  107.  *          One use of this flag is to mark slabs that are
  108.  *          used for allocations. Then such a slab becomes a cpu
  109.  *          slab. The cpu slab may be equipped with an additional
  110.  *          freelist that allows lockless access to
  111.  *          free objects in addition to the regular freelist
  112.  *          that requires the slab lock.
  113.  *
  114.  * PageError        Slab requires special handling due to debug
  115.  *          options set. This moves slab handling out of
  116.  *          the fast path and disables lockless freelists.
  117.  */
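/*
 * Editor's illustration (not part of upstream slub.c): a rough sketch of how
 * the three locks above nest when all of them are needed, assuming a debug
 * configuration in which slab_lock() is actually taken:
 *
 *	mutex_lock(&slab_mutex);                   1. global cache metadata
 *	spin_lock_irqsave(&n->list_lock, flags);   2. per node partial/full lists
 *	slab_lock(page);                           3. per slab (debug / fallback)
 *	...
 *	slab_unlock(page);
 *	spin_unlock_irqrestore(&n->list_lock, flags);
 *	mutex_unlock(&slab_mutex);
 */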
  118.  
  119. static inline int kmem_cache_debug(struct kmem_cache *s)
  120. {
  121. #ifdef CONFIG_SLUB_DEBUG
  122.     return unlikely(s->flags & SLAB_DEBUG_FLAGS);
  123. #else
  124.     return 0;
  125. #endif
  126. }
  127.  
  128. void *fixup_red_left(struct kmem_cache *s, void *p)
  129. {
  130.     if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
  131.         p += s->red_left_pad;
  132.  
  133.     return p;
  134. }
  135.  
  136. static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
  137. {
  138. #ifdef CONFIG_SLUB_CPU_PARTIAL
  139.     return !kmem_cache_debug(s);
  140. #else
  141.     return false;
  142. #endif
  143. }
  144.  
  145. /*
  146.  * Issues still to be resolved:
  147.  *
  148.  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  149.  *
  150.  * - Variable sizing of the per node arrays
  151.  */
  152.  
  153. /* Enable to test recovery from slab corruption on boot */
  154. #undef SLUB_RESILIENCY_TEST
  155.  
  156. /* Enable to log cmpxchg failures */
  157. #undef SLUB_DEBUG_CMPXCHG
  158.  
  159. /*
  160.  * Minimum number of partial slabs. These will be left on the partial
  161.  * lists even if they are empty. kmem_cache_shrink may reclaim them.
  162.  */
  163. #define MIN_PARTIAL 5
  164.  
  165. /*
  166.  * Maximum number of desirable partial slabs.
  167.  * The existence of more partial slabs makes kmem_cache_shrink
  168.  * sort the partial list by the number of objects in use.
  169.  */
  170. #define MAX_PARTIAL 10
  171.  
  172. #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
  173.                 SLAB_POISON | SLAB_STORE_USER)
  174.  
  175. /*
  176.  * These debug flags cannot use CMPXCHG because there might be consistency
  177.  * issues when checking or reading debug information
  178.  */
  179. #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
  180.                 SLAB_TRACE)
  181.  
  182.  
  183. /*
  184.  * Debugging flags that require metadata to be stored in the slab.  These get
  185.  * disabled when slub_debug=O is used and a cache's min order increases with
  186.  * metadata.
  187.  */
  188. #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
  189.  
  190. #define OO_SHIFT    16
  191. #define OO_MASK     ((1 << OO_SHIFT) - 1)
  192. #define MAX_OBJS_PER_PAGE   32767 /* since page.objects is u15 */
  193.  
  194. /* Internal SLUB flags */
  195. /* Poison object */
  196. #define __OBJECT_POISON     ((slab_flags_t __force)0x80000000U)
  197. /* Use cmpxchg_double */
  198. #define __CMPXCHG_DOUBLE    ((slab_flags_t __force)0x40000000U)
  199.  
  200. /*
  201.  * Tracking user of a slab.
  202.  */
  203. #define TRACK_ADDRS_COUNT 16
  204. struct track {
  205.     unsigned long addr; /* Called from address */
  206. #ifdef CONFIG_STACKTRACE
  207.     unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
  208. #endif
  209.     int cpu;        /* Was running on cpu */
  210.     int pid;        /* Pid context */
  211.     unsigned long when; /* When did the operation occur */
  212. };
  213.  
  214. enum track_item { TRACK_ALLOC, TRACK_FREE };
  215.  
  216. #ifdef CONFIG_SYSFS
  217. static int sysfs_slab_add(struct kmem_cache *);
  218. static int sysfs_slab_alias(struct kmem_cache *, const char *);
  219. static void memcg_propagate_slab_attrs(struct kmem_cache *s);
  220. static void sysfs_slab_remove(struct kmem_cache *s);
  221. #else
  222. static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
  223. static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
  224.                             { return 0; }
  225. static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
  226. static inline void sysfs_slab_remove(struct kmem_cache *s) { }
  227. #endif
  228.  
  229. static inline void stat(const struct kmem_cache *s, enum stat_item si)
  230. {
  231. #ifdef CONFIG_SLUB_STATS
  232.     /*
  233.      * The rmw is racy on a preemptible kernel but this is acceptable, so
  234.      * avoid this_cpu_add()'s irq-disable overhead.
  235.      */
  236.     raw_cpu_inc(s->cpu_slab->stat[si]);
  237. #endif
  238. }
  239.  
  240. /********************************************************************
  241.  *          Core slab cache functions
  242.  *******************************************************************/
  243.  
  244. /*
  245.  * Returns freelist pointer (ptr). With hardening, this is obfuscated
  246.  * with an XOR of the address where the pointer is held and a per-cache
  247.  * random number.
  248.  */
  249. static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
  250.                  unsigned long ptr_addr)
  251. {
  252. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  253.     /*
  254.      * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
  255.      * Normally, this doesn't cause any issues, as both set_freepointer()
  256.      * and get_freepointer() are called with a pointer with the same tag.
  257.      * However, there are some issues with CONFIG_SLUB_DEBUG code. For
  258.      * example, when __free_slab() iterates over objects in a cache, it
  259.      * passes untagged pointers to check_object(). check_object() in turn
  260.      * calls get_freepointer() with an untagged pointer, which causes the
  261.      * freepointer to be restored incorrectly.
  262.      */
  263.     return (void *)((unsigned long)ptr ^ s->random ^
  264.             swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
  265. #else
  266.     return ptr;
  267. #endif
  268. }
  269.  
  270. /* Returns the freelist pointer recorded at location ptr_addr. */
  271. static inline void *freelist_dereference(const struct kmem_cache *s,
  272.                      void *ptr_addr)
  273. {
  274.     return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
  275.                 (unsigned long)ptr_addr);
  276. }
  277.  
  278. static inline void *get_freepointer(struct kmem_cache *s, void *object)
  279. {
  280.     return freelist_dereference(s, object + s->offset);
  281. }
  282.  
  283. static void prefetch_freepointer(const struct kmem_cache *s, void *object)
  284. {
  285.     prefetch(object + s->offset);
  286. }
  287.  
  288. static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
  289. {
  290.     unsigned long freepointer_addr;
  291.     void *p;
  292.  
  293.     if (!debug_pagealloc_enabled_static())
  294.         return get_freepointer(s, object);
  295.  
  296.     freepointer_addr = (unsigned long)object + s->offset;
  297.     probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
  298.     return freelist_ptr(s, p, freepointer_addr);
  299. }
  300.  
  301. static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
  302. {
  303.     unsigned long freeptr_addr = (unsigned long)object + s->offset;
  304.  
  305. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  306.     BUG_ON(object == fp); /* naive detection of double free or corruption */
  307. #endif
  308.  
  309.     *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
  310. }
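/*
 * Editor's note (illustrative, not in upstream): with
 * CONFIG_SLAB_FREELIST_HARDENED the set_freepointer()/get_freepointer()
 * round trip is a plain XOR cancellation. For a free pointer fp stored in
 * the slot at ptr_addr:
 *
 *	stored = fp ^ s->random ^ swab(ptr_addr);          set_freepointer()
 *	fp     = stored ^ s->random ^ swab(ptr_addr);      get_freepointer()
 *
 * Overwriting the stored value without knowing s->random (and the slot
 * address) therefore cannot redirect the freelist to a chosen address.
 */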
  311.  
  312. /* Loop over all objects in a slab */
  313. #define for_each_object(__p, __s, __addr, __objects) \
  314.     for (__p = fixup_red_left(__s, __addr); \
  315.         __p < (__addr) + (__objects) * (__s)->size; \
  316.         __p += (__s)->size)
  317.  
  318. /* Determine object index from a given position */
  319. static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
  320. {
  321.     return (kasan_reset_tag(p) - addr) / s->size;
  322. }
  323.  
  324. static inline unsigned int order_objects(unsigned int order, unsigned int size)
  325. {
  326.     return ((unsigned int)PAGE_SIZE << order) / size;
  327. }
  328.  
  329. static inline struct kmem_cache_order_objects oo_make(unsigned int order,
  330.         unsigned int size)
  331. {
  332.     struct kmem_cache_order_objects x = {
  333.         (order << OO_SHIFT) + order_objects(order, size)
  334.     };
  335.  
  336.     return x;
  337. }
  338.  
  339. static inline unsigned int oo_order(struct kmem_cache_order_objects x)
  340. {
  341.     return x.x >> OO_SHIFT;
  342. }
  343.  
  344. static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
  345. {
  346.     return x.x & OO_MASK;
  347. }
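/*
 * Editor's worked example (hypothetical numbers, not in upstream): with
 * PAGE_SIZE == 4096, an order-1 slab holding 128-byte objects packs as
 *
 *	order_objects(1, 128) == (4096 << 1) / 128    == 64
 *	oo_make(1, 128).x     == (1 << OO_SHIFT) + 64 == 0x10040
 *	oo_order(oo)          == 0x10040 >> OO_SHIFT  == 1
 *	oo_objects(oo)        == 0x10040 & OO_MASK    == 64
 *
 * i.e. the page order lives above OO_SHIFT and the object count sits in the
 * low bits of a single word.
 */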
  348.  
  349. /*
  350.  * Per slab locking using the pagelock
  351.  */
  352. static __always_inline void slab_lock(struct page *page)
  353. {
  354.     VM_BUG_ON_PAGE(PageTail(page), page);
  355.     bit_spin_lock(PG_locked, &page->flags);
  356. }
  357.  
  358. static __always_inline void slab_unlock(struct page *page)
  359. {
  360.     VM_BUG_ON_PAGE(PageTail(page), page);
  361.     __bit_spin_unlock(PG_locked, &page->flags);
  362. }
  363.  
  364. /* Interrupts must be disabled (for the fallback code to work right) */
  365. static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
  366.         void *freelist_old, unsigned long counters_old,
  367.         void *freelist_new, unsigned long counters_new,
  368.         const char *n)
  369. {
  370.     VM_BUG_ON(!irqs_disabled());
  371. #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
  372.     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
  373.     if (s->flags & __CMPXCHG_DOUBLE) {
  374.         if (cmpxchg_double(&page->freelist, &page->counters,
  375.                    freelist_old, counters_old,
  376.                    freelist_new, counters_new))
  377.             return true;
  378.     } else
  379. #endif
  380.     {
  381.         slab_lock(page);
  382.         if (page->freelist == freelist_old &&
  383.                     page->counters == counters_old) {
  384.             page->freelist = freelist_new;
  385.             page->counters = counters_new;
  386.             slab_unlock(page);
  387.             return true;
  388.         }
  389.         slab_unlock(page);
  390.     }
  391.  
  392.     cpu_relax();
  393.     stat(s, CMPXCHG_DOUBLE_FAIL);
  394.  
  395. #ifdef SLUB_DEBUG_CMPXCHG
  396.     pr_info("%s %s: cmpxchg double redo ", n, s->name);
  397. #endif
  398.  
  399.     return false;
  400. }
  401.  
  402. static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
  403.         void *freelist_old, unsigned long counters_old,
  404.         void *freelist_new, unsigned long counters_new,
  405.         const char *n)
  406. {
  407. #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
  408.     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
  409.     if (s->flags & __CMPXCHG_DOUBLE) {
  410.         if (cmpxchg_double(&page->freelist, &page->counters,
  411.                    freelist_old, counters_old,
  412.                    freelist_new, counters_new))
  413.             return true;
  414.     } else
  415. #endif
  416.     {
  417.         unsigned long flags;
  418.  
  419.         local_irq_save(flags);
  420.         slab_lock(page);
  421.         if (page->freelist == freelist_old &&
  422.                     page->counters == counters_old) {
  423.             page->freelist = freelist_new;
  424.             page->counters = counters_new;
  425.             slab_unlock(page);
  426.             local_irq_restore(flags);
  427.             return true;
  428.         }
  429.         slab_unlock(page);
  430.         local_irq_restore(flags);
  431.     }
  432.  
  433.     cpu_relax();
  434.     stat(s, CMPXCHG_DOUBLE_FAIL);
  435.  
  436. #ifdef SLUB_DEBUG_CMPXCHG
  437.     pr_info("%s %s: cmpxchg double redo ", n, s->name);
  438. #endif
  439.  
  440.     return false;
  441. }
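/*
 * Editor's sketch of the typical call pattern (simplified, not a verbatim
 * excerpt from this file): callers snapshot freelist and counters, derive
 * the new values, and retry until the pair is swapped atomically (or under
 * slab_lock() in the fallback path):
 *
 *	do {
 *		old_freelist = page->freelist;
 *		old_counters = page->counters;
 *		... derive new_freelist / new_counters from the snapshot ...
 *	} while (!cmpxchg_double_slab(s, page, old_freelist, old_counters,
 *				      new_freelist, new_counters, "example"));
 */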
  442.  
  443. #ifdef CONFIG_SLUB_DEBUG
  444. /*
  445.  * Determine a map of objects in use on a page.
  446.  *
  447.  * The node's list_lock must be held to guarantee that the page does
  448.  * not vanish from under us.
  449.  */
  450. static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
  451. {
  452.     void *p;
  453.     void *addr = page_address(page);
  454.  
  455.     for (p = page->freelist; p; p = get_freepointer(s, p))
  456.         set_bit(slab_index(p, s, addr), map);
  457. }
  458.  
  459. static inline unsigned int size_from_object(struct kmem_cache *s)
  460. {
  461.     if (s->flags & SLAB_RED_ZONE)
  462.         return s->size - s->red_left_pad;
  463.  
  464.     return s->size;
  465. }
  466.  
  467. static inline void *restore_red_left(struct kmem_cache *s, void *p)
  468. {
  469.     if (s->flags & SLAB_RED_ZONE)
  470.         p -= s->red_left_pad;
  471.  
  472.     return p;
  473. }
  474.  
  475. /*
  476.  * Debug settings:
  477.  */
  478. #if defined(CONFIG_SLUB_DEBUG_ON)
  479. static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
  480. #else
  481. static slab_flags_t slub_debug;
  482. #endif
  483.  
  484. static char *slub_debug_slabs;
  485. static int disable_higher_order_debug;
  486.  
  487. /*
  488.  * slub is about to manipulate internal object metadata.  This memory lies
  489.  * outside the range of the allocated object, so accessing it would normally
  490.  * be reported by kasan as a bounds error.  metadata_access_enable() is used
  491.  * to tell kasan that these accesses are OK.
  492.  */
  493. static inline void metadata_access_enable(void)
  494. {
  495.     kasan_disable_current();
  496. }
  497.  
  498. static inline void metadata_access_disable(void)
  499. {
  500.     kasan_enable_current();
  501. }
  502.  
  503. /*
  504.  * Object debugging
  505.  */
  506.  
  507. /* Verify that a pointer has an address that is valid within a slab page */
  508. static inline int check_valid_pointer(struct kmem_cache *s,
  509.                 struct page *page, void *object)
  510. {
  511.     void *base;
  512.  
  513.     if (!object)
  514.         return 1;
  515.  
  516.     base = page_address(page);
  517.     object = kasan_reset_tag(object);
  518.     object = restore_red_left(s, object);
  519.     if (object < base || object >= base + page->objects * s->size ||
  520.         (object - base) % s->size) {
  521.         return 0;
  522.     }
  523.  
  524.     return 1;
  525. }
  526.  
  527. static void print_section(char *level, char *text, u8 *addr,
  528.               unsigned int length)
  529. {
  530.     metadata_access_enable();
  531.     print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
  532.             length, 1);
  533.     metadata_access_disable();
  534. }
  535.  
  536. static struct track *get_track(struct kmem_cache *s, void *object,
  537.     enum track_item alloc)
  538. {
  539.     struct track *p;
  540.  
  541.     if (s->offset)
  542.         p = object + s->offset + sizeof(void *);
  543.     else
  544.         p = object + s->inuse;
  545.  
  546.     return p + alloc;
  547. }
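/*
 * Editor's illustration (not in upstream): the two struct track records sit
 * back to back in the metadata area, so TRACK_FREE is simply one struct
 * track past TRACK_ALLOC:
 *
 *	base  = s->offset ? object + s->offset + sizeof(void *)
 *			  : object + s->inuse;
 *	alloc = (struct track *)base + TRACK_ALLOC;     base + 0
 *	free  = (struct track *)base + TRACK_FREE;      base + 1
 */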
  548.  
  549. static void set_track(struct kmem_cache *s, void *object,
  550.             enum track_item alloc, unsigned long addr)
  551. {
  552.     struct track *p = get_track(s, object, alloc);
  553.  
  554.     if (addr) {
  555. #ifdef CONFIG_STACKTRACE
  556.         unsigned int nr_entries;
  557.  
  558.         metadata_access_enable();
  559.         nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
  560.         metadata_access_disable();
  561.  
  562.         if (nr_entries < TRACK_ADDRS_COUNT)
  563.             p->addrs[nr_entries] = 0;
  564. #endif
  565.         p->addr = addr;
  566.         p->cpu = smp_processor_id();
  567.         p->pid = current->pid;
  568.         p->when = jiffies;
  569.     } else {
  570.         memset(p, 0, sizeof(struct track));
  571.     }
  572. }
  573.  
  574. static void init_tracking(struct kmem_cache *s, void *object)
  575. {
  576.     if (!(s->flags & SLAB_STORE_USER))
  577.         return;
  578.  
  579.     set_track(s, object, TRACK_FREE, 0UL);
  580.     set_track(s, object, TRACK_ALLOC, 0UL);
  581. }
  582.  
  583. static void print_track(const char *s, struct track *t, unsigned long pr_time)
  584. {
  585.     if (!t->addr)
  586.         return;
  587.  
  588.     pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
  589.            s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
  590. #ifdef CONFIG_STACKTRACE
  591.     {
  592.         int i;
  593.         for (i = 0; i < TRACK_ADDRS_COUNT; i++)
  594.             if (t->addrs[i])
  595.                 pr_err("\t%pS\n", (void *)t->addrs[i]);
  596.             else
  597.                 break;
  598.     }
  599. #endif
  600. }
  601.  
  602. static void print_tracking(struct kmem_cache *s, void *object)
  603. {
  604.     unsigned long pr_time = jiffies;
  605.     if (!(s->flags & SLAB_STORE_USER))
  606.         return;
  607.  
  608.     print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
  609.     print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
  610. }
  611.  
  612. static void print_page_info(struct page *page)
  613. {
  614.     pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
  615.            page, page->objects, page->inuse, page->freelist, page->flags);
  616.  
  617. }
  618.  
  619. static void slab_bug(struct kmem_cache *s, char *fmt, ...)
  620. {
  621.     struct va_format vaf;
  622.     va_list args;
  623.  
  624.     va_start(args, fmt);
  625.     vaf.fmt = fmt;
  626.     vaf.va = &args;
  627.     pr_err("=============================================================================\n");
  628.     pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
  629.     pr_err("-----------------------------------------------------------------------------\n\n");
  630.  
  631.     add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  632.     va_end(args);
  633. }
  634.  
  635. static void slab_fix(struct kmem_cache *s, char *fmt, ...)
  636. {
  637.     struct va_format vaf;
  638.     va_list args;
  639.  
  640.     va_start(args, fmt);
  641.     vaf.fmt = fmt;
  642.     vaf.va = &args;
  643.     pr_err("FIX %s: %pV\n", s->name, &vaf);
  644.     va_end(args);
  645. }
  646.  
  647. static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
  648.                    void **freelist, void *nextfree)
  649. {
  650.     if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
  651.         !check_valid_pointer(s, page, nextfree) && freelist) {
  652.         object_err(s, page, *freelist, "Freechain corrupt");
  653.         *freelist = NULL;
  654.         slab_fix(s, "Isolate corrupted freechain");
  655.         return true;
  656.     }
  657.  
  658.     return false;
  659. }
  660.  
  661. static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
  662. {
  663.     unsigned int off;   /* Offset of last byte */
  664.     u8 *addr = page_address(page);
  665.  
  666.     print_tracking(s, p);
  667.  
  668.     print_page_info(page);
  669.  
  670.     pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
  671.            p, p - addr, get_freepointer(s, p));
  672.  
  673.     if (s->flags & SLAB_RED_ZONE)
  674.         print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
  675.                   s->red_left_pad);
  676.     else if (p > addr + 16)
  677.         print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
  678.  
  679.     print_section(KERN_ERR, "Object ", p,
  680.               min_t(unsigned int, s->object_size, PAGE_SIZE));
  681.     if (s->flags & SLAB_RED_ZONE)
  682.         print_section(KERN_ERR, "Redzone ", p + s->object_size,
  683.             s->inuse - s->object_size);
  684.  
  685.     if (s->offset)
  686.         off = s->offset + sizeof(void *);
  687.     else
  688.         off = s->inuse;
  689.  
  690.     if (s->flags & SLAB_STORE_USER)
  691.         off += 2 * sizeof(struct track);
  692.  
  693.     off += kasan_metadata_size(s);
  694.  
  695.     if (off != size_from_object(s))
  696.         /* Beginning of the filler is the free pointer */
  697.         print_section(KERN_ERR, "Padding ", p + off,
  698.                   size_from_object(s) - off);
  699.  
  700.     dump_stack();
  701. }
  702.  
  703. void object_err(struct kmem_cache *s, struct page *page,
  704.             u8 *object, char *reason)
  705. {
  706.     slab_bug(s, "%s", reason);
  707.     print_trailer(s, page, object);
  708. }
  709.  
  710. static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
  711.             const char *fmt, ...)
  712. {
  713.     va_list args;
  714.     char buf[100];
  715.  
  716.     va_start(args, fmt);
  717.     vsnprintf(buf, sizeof(buf), fmt, args);
  718.     va_end(args);
  719.     slab_bug(s, "%s", buf);
  720.     print_page_info(page);
  721.     dump_stack();
  722. }
  723.  
  724. static void init_object(struct kmem_cache *s, void *object, u8 val)
  725. {
  726.     u8 *p = object;
  727.  
  728.     if (s->flags & SLAB_RED_ZONE)
  729.         memset(p - s->red_left_pad, val, s->red_left_pad);
  730.  
  731.     if (s->flags & __OBJECT_POISON) {
  732.         memset(p, POISON_FREE, s->object_size - 1);
  733.         p[s->object_size - 1] = POISON_END;
  734.     }
  735.  
  736.     if (s->flags & SLAB_RED_ZONE)
  737.         memset(p + s->object_size, val, s->inuse - s->object_size);
  738. }
  739.  
  740. static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
  741.                         void *from, void *to)
  742. {
  743.     slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
  744.     memset(from, data, to - from);
  745. }
  746.  
  747. static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
  748.             u8 *object, char *what,
  749.             u8 *start, unsigned int value, unsigned int bytes)
  750. {
  751.     u8 *fault;
  752.     u8 *end;
  753.  
  754.     metadata_access_enable();
  755.     fault = memchr_inv(start, value, bytes);
  756.     metadata_access_disable();
  757.     if (!fault)
  758.         return 1;
  759.  
  760.     end = start + bytes;
  761.     while (end > fault && end[-1] == value)
  762.         end--;
  763.  
  764.     slab_bug(s, "%s overwritten", what);
  765.     pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
  766.                     fault, end - 1, fault[0], value);
  767.     print_trailer(s, page, object);
  768.  
  769.     restore_bytes(s, what, value, fault, end);
  770.     return 0;
  771. }
  772.  
  773. /*
  774.  * Object layout:
  775.  *
  776.  * object address
  777.  *  Bytes of the object to be managed.
  778.  *  If the freepointer may overlay the object then the free
  779.  *  pointer is the first word of the object.
  780.  *
  781.  *  Poisoning uses 0x6b (POISON_FREE) and the last byte is
  782.  *  0xa5 (POISON_END)
  783.  *
  784.  * object + s->object_size
  785.  *  Padding to reach word boundary. This is also used for Redzoning.
  786.  *  Padding is extended by another word if Redzoning is enabled and
  787.  *  object_size == inuse.
  788.  *
  789.  *  We fill with 0xbb (RED_INACTIVE) for inactive objects and with
  790.  *  0xcc (RED_ACTIVE) for objects in use.
  791.  *
  792.  * object + s->inuse
  793.  *  Meta data starts here.
  794.  *
  795.  *  A. Free pointer (if we cannot overwrite object on free)
  796.  *  B. Tracking data for SLAB_STORE_USER
  797.  *  C. Padding to reach required alignment boundary or at minimum
  798.  *      one word if debugging is on to be able to detect writes
  799.  *      before the word boundary.
  800.  *
  801.  *  Padding is done using 0x5a (POISON_INUSE)
  802.  *
  803.  * object + s->size
  804.  *  Nothing is used beyond s->size.
  805.  *
  806.  * If slabcaches are merged then the object_size and inuse boundaries are mostly
  807.  * ignored, and therefore no slab options that rely on these boundaries
  808.  * may be used with merged slabcaches.
  809.  */
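/*
 * Editor's illustration (hypothetical cache, not from upstream): with
 * SLAB_RED_ZONE and SLAB_STORE_USER enabled on a 64-bit kernel, a single
 * object slot therefore looks roughly like
 *
 *	[ left red zone (red_left_pad)   ]
 *	[ object bytes (object_size)     ]  <- object address
 *	[ right red zone / word padding  ]  up to object + inuse
 *	[ free pointer (if not overlaid) ]
 *	[ 2 * struct track               ]  SLAB_STORE_USER data
 *	[ POISON_INUSE padding           ]  up to object + size
 *
 * The exact offsets are computed by calculate_sizes() and depend on the
 * debug flags and alignment requirements of the cache.
 */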
  810.  
  811. static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
  812. {
  813.     unsigned long off = s->inuse;   /* The end of info */
  814.  
  815.     if (s->offset)
  816.         /* Freepointer is placed after the object. */
  817.         off += sizeof(void *);
  818.  
  819.     if (s->flags & SLAB_STORE_USER)
  820.         /* We also have user information there */
  821.         off += 2 * sizeof(struct track);
  822.  
  823.     off += kasan_metadata_size(s);
  824.  
  825.     if (size_from_object(s) == off)
  826.         return 1;
  827.  
  828.     return check_bytes_and_report(s, page, p, "Object padding",
  829.             p + off, POISON_INUSE, size_from_object(s) - off);
  830. }
  831.  
  832. /* Check the pad bytes at the end of a slab page */
  833. static int slab_pad_check(struct kmem_cache *s, struct page *page)
  834. {
  835.     u8 *start;
  836.     u8 *fault;
  837.     u8 *end;
  838.     u8 *pad;
  839.     int length;
  840.     int remainder;
  841.  
  842.     if (!(s->flags & SLAB_POISON))
  843.         return 1;
  844.  
  845.     start = page_address(page);
  846.     length = page_size(page);
  847.     end = start + length;
  848.     remainder = length % s->size;
  849.     if (!remainder)
  850.         return 1;
  851.  
  852.     pad = end - remainder;
  853.     metadata_access_enable();
  854.     fault = memchr_inv(pad, POISON_INUSE, remainder);
  855.     metadata_access_disable();
  856.     if (!fault)
  857.         return 1;
  858.     while (end > fault && end[-1] == POISON_INUSE)
  859.         end--;
  860.  
  861.     slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
  862.     print_section(KERN_ERR, "Padding ", pad, remainder);
  863.  
  864.     restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
  865.     return 0;
  866. }
  867.  
  868. static int check_object(struct kmem_cache *s, struct page *page,
  869.                     void *object, u8 val)
  870. {
  871.     u8 *p = object;
  872.     u8 *endobject = object + s->object_size;
  873.  
  874.     if (s->flags & SLAB_RED_ZONE) {
  875.         if (!check_bytes_and_report(s, page, object, "Redzone",
  876.             object - s->red_left_pad, val, s->red_left_pad))
  877.             return 0;
  878.  
  879.         if (!check_bytes_and_report(s, page, object, "Redzone",
  880.             endobject, val, s->inuse - s->object_size))
  881.             return 0;
  882.     } else {
  883.         if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
  884.             check_bytes_and_report(s, page, p, "Alignment padding",
  885.                 endobject, POISON_INUSE,
  886.                 s->inuse - s->object_size);
  887.         }
  888.     }
  889.  
  890.     if (s->flags & SLAB_POISON) {
  891.         if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
  892.             (!check_bytes_and_report(s, page, p, "Poison", p,
  893.                     POISON_FREE, s->object_size - 1) ||
  894.              !check_bytes_and_report(s, page, p, "Poison",
  895.                 p + s->object_size - 1, POISON_END, 1)))
  896.             return 0;
  897.         /*
  898.          * check_pad_bytes cleans up on its own.
  899.          */
  900.         check_pad_bytes(s, page, p);
  901.     }
  902.  
  903.     if (!s->offset && val == SLUB_RED_ACTIVE)
  904.         /*
  905.          * Object and freepointer overlap. Cannot check
  906.          * freepointer while object is allocated.
  907.          */
  908.         return 1;
  909.  
  910.     /* Check free pointer validity */
  911.     if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
  912.         object_err(s, page, p, "Freepointer corrupt");
  913.         /*
  914.          * No choice but to zap it and thus lose the remainder
  915.          * of the free objects in this slab. May cause
  916.          * another error because the object count is now wrong.
  917.          */
  918.         set_freepointer(s, p, NULL);
  919.         return 0;
  920.     }
  921.     return 1;
  922. }
  923.  
  924. static int check_slab(struct kmem_cache *s, struct page *page)
  925. {
  926.     int maxobj;
  927.  
  928.     VM_BUG_ON(!irqs_disabled());
  929.  
  930.     if (!PageSlab(page)) {
  931.         slab_err(s, page, "Not a valid slab page");
  932.         return 0;
  933.     }
  934.  
  935.     maxobj = order_objects(compound_order(page), s->size);
  936.     if (page->objects > maxobj) {
  937.         slab_err(s, page, "objects %u > max %u",
  938.             page->objects, maxobj);
  939.         return 0;
  940.     }
  941.     if (page->inuse > page->objects) {
  942.         slab_err(s, page, "inuse %u > max %u",
  943.             page->inuse, page->objects);
  944.         return 0;
  945.     }
  946.     /* Slab_pad_check fixes things up after itself */
  947.     slab_pad_check(s, page);
  948.     return 1;
  949. }
  950.  
  951. /*
  952.  * Determine if a certain object on a page is on the freelist. Must hold the
  953.  * slab lock to guarantee that the chains are in a consistent state.
  954.  */
  955. static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
  956. {
  957.     int nr = 0;
  958.     void *fp;
  959.     void *object = NULL;
  960.     int max_objects;
  961.  
  962.     fp = page->freelist;
  963.     while (fp && nr <= page->objects) {
  964.         if (fp == search)
  965.             return 1;
  966.         if (!check_valid_pointer(s, page, fp)) {
  967.             if (object) {
  968.                 object_err(s, page, object,
  969.                     "Freechain corrupt");
  970.                 set_freepointer(s, object, NULL);
  971.             } else {
  972.                 slab_err(s, page, "Freepointer corrupt");
  973.                 page->freelist = NULL;
  974.                 page->inuse = page->objects;
  975.                 slab_fix(s, "Freelist cleared");
  976.                 return 0;
  977.             }
  978.             break;
  979.         }
  980.         object = fp;
  981.         fp = get_freepointer(s, object);
  982.         nr++;
  983.     }
  984.  
  985.     max_objects = order_objects(compound_order(page), s->size);
  986.     if (max_objects > MAX_OBJS_PER_PAGE)
  987.         max_objects = MAX_OBJS_PER_PAGE;
  988.  
  989.     if (page->objects != max_objects) {
  990.         slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
  991.              page->objects, max_objects);
  992.         page->objects = max_objects;
  993.         slab_fix(s, "Number of objects adjusted.");
  994.     }
  995.     if (page->inuse != page->objects - nr) {
  996.         slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
  997.              page->inuse, page->objects - nr);
  998.         page->inuse = page->objects - nr;
  999.         slab_fix(s, "Object count adjusted.");
  1000.     }
  1001.     return search == NULL;
  1002. }
  1003.  
  1004. static void trace(struct kmem_cache *s, struct page *page, void *object,
  1005.                                 int alloc)
  1006. {
  1007.     if (s->flags & SLAB_TRACE) {
  1008.         pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
  1009.             s->name,
  1010.             alloc ? "alloc" : "free",
  1011.             object, page->inuse,
  1012.             page->freelist);
  1013.  
  1014.         if (!alloc)
  1015.             print_section(KERN_INFO, "Object ", (void *)object,
  1016.                     s->object_size);
  1017.  
  1018.         dump_stack();
  1019.     }
  1020. }
  1021.  
  1022. /*
  1023.  * Tracking of fully allocated slabs for debugging purposes.
  1024.  */
  1025. static void add_full(struct kmem_cache *s,
  1026.     struct kmem_cache_node *n, struct page *page)
  1027. {
  1028.     if (!(s->flags & SLAB_STORE_USER))
  1029.         return;
  1030.  
  1031.     lockdep_assert_held(&n->list_lock);
  1032.     list_add(&page->slab_list, &n->full);
  1033. }
  1034.  
  1035. static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
  1036. {
  1037.     if (!(s->flags & SLAB_STORE_USER))
  1038.         return;
  1039.  
  1040.     lockdep_assert_held(&n->list_lock);
  1041.     list_del(&page->slab_list);
  1042. }
  1043.  
  1044. /* Tracking of the number of slabs for debugging purposes */
  1045. static inline unsigned long slabs_node(struct kmem_cache *s, int node)
  1046. {
  1047.     struct kmem_cache_node *n = get_node(s, node);
  1048.  
  1049.     return atomic_long_read(&n->nr_slabs);
  1050. }
  1051.  
  1052. static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
  1053. {
  1054.     return atomic_long_read(&n->nr_slabs);
  1055. }
  1056.  
  1057. static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
  1058. {
  1059.     struct kmem_cache_node *n = get_node(s, node);
  1060.  
  1061.     /*
  1062.      * May be called early in order to allocate a slab for the
  1063.      * kmem_cache_node structure. Solve the chicken-egg
  1064.      * dilemma by deferring the increment of the count during
  1065.      * bootstrap (see early_kmem_cache_node_alloc).
  1066.      */
  1067.     if (likely(n)) {
  1068.         atomic_long_inc(&n->nr_slabs);
  1069.         atomic_long_add(objects, &n->total_objects);
  1070.     }
  1071. }
  1072. static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
  1073. {
  1074.     struct kmem_cache_node *n = get_node(s, node);
  1075.  
  1076.     atomic_long_dec(&n->nr_slabs);
  1077.     atomic_long_sub(objects, &n->total_objects);
  1078. }
  1079.  
  1080. /* Object debug checks for alloc/free paths */
  1081. static void setup_object_debug(struct kmem_cache *s, struct page *page,
  1082.                                 void *object)
  1083. {
  1084.     if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
  1085.         return;
  1086.  
  1087.     init_object(s, object, SLUB_RED_INACTIVE);
  1088.     init_tracking(s, object);
  1089. }
  1090.  
  1091. static
  1092. void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
  1093. {
  1094.     if (!(s->flags & SLAB_POISON))
  1095.         return;
  1096.  
  1097.     metadata_access_enable();
  1098.     memset(addr, POISON_INUSE, page_size(page));
  1099.     metadata_access_disable();
  1100. }
  1101.  
  1102. static inline int alloc_consistency_checks(struct kmem_cache *s,
  1103.                     struct page *page, void *object)
  1104. {
  1105.     if (!check_slab(s, page))
  1106.         return 0;
  1107.  
  1108.     if (!check_valid_pointer(s, page, object)) {
  1109.         object_err(s, page, object, "Freelist Pointer check fails");
  1110.         return 0;
  1111.     }
  1112.  
  1113.     if (!check_object(s, page, object, SLUB_RED_INACTIVE))
  1114.         return 0;
  1115.  
  1116.     return 1;
  1117. }
  1118.  
  1119. static noinline int alloc_debug_processing(struct kmem_cache *s,
  1120.                     struct page *page,
  1121.                     void *object, unsigned long addr)
  1122. {
  1123.     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  1124.         if (!alloc_consistency_checks(s, page, object))
  1125.             goto bad;
  1126.     }
  1127.  
  1128.     /* Success. Perform special debug activities for allocs */
  1129.     if (s->flags & SLAB_STORE_USER)
  1130.         set_track(s, object, TRACK_ALLOC, addr);
  1131.     trace(s, page, object, 1);
  1132.     init_object(s, object, SLUB_RED_ACTIVE);
  1133.     return 1;
  1134.  
  1135. bad:
  1136.     if (PageSlab(page)) {
  1137.         /*
  1138.          * If this is a slab page then let's do the best we can
  1139.          * to avoid issues in the future. Marking all objects
  1140.          * as used avoids touching the remaining objects.
  1141.          */
  1142.         slab_fix(s, "Marking all objects used");
  1143.         page->inuse = page->objects;
  1144.         page->freelist = NULL;
  1145.     }
  1146.     return 0;
  1147. }
  1148.  
  1149. static inline int free_consistency_checks(struct kmem_cache *s,
  1150.         struct page *page, void *object, unsigned long addr)
  1151. {
  1152.     if (!check_valid_pointer(s, page, object)) {
  1153.         slab_err(s, page, "Invalid object pointer 0x%p", object);
  1154.         return 0;
  1155.     }
  1156.  
  1157.     if (on_freelist(s, page, object)) {
  1158.         object_err(s, page, object, "Object already free");
  1159.         return 0;
  1160.     }
  1161.  
  1162.     if (!check_object(s, page, object, SLUB_RED_ACTIVE))
  1163.         return 0;
  1164.  
  1165.     if (unlikely(s != page->slab_cache)) {
  1166.         if (!PageSlab(page)) {
  1167.             slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
  1168.                  object);
  1169.         } else if (!page->slab_cache) {
  1170.             pr_err("SLUB <none>: no slab for object 0x%p.\n",
  1171.                    object);
  1172.             dump_stack();
  1173.         } else
  1174.             object_err(s, page, object,
  1175.                     "page slab pointer corrupt.");
  1176.         return 0;
  1177.     }
  1178.     return 1;
  1179. }
  1180.  
  1181. /* Supports checking bulk free of a constructed freelist */
  1182. static noinline int free_debug_processing(
  1183.     struct kmem_cache *s, struct page *page,
  1184.     void *head, void *tail, int bulk_cnt,
  1185.     unsigned long addr)
  1186. {
  1187.     struct kmem_cache_node *n = get_node(s, page_to_nid(page));
  1188.     void *object = head;
  1189.     int cnt = 0;
  1190.     unsigned long uninitialized_var(flags);
  1191.     int ret = 0;
  1192.  
  1193.     spin_lock_irqsave(&n->list_lock, flags);
  1194.     slab_lock(page);
  1195.  
  1196.     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  1197.         if (!check_slab(s, page))
  1198.             goto out;
  1199.     }
  1200.  
  1201. next_object:
  1202.     cnt++;
  1203.  
  1204.     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  1205.         if (!free_consistency_checks(s, page, object, addr))
  1206.             goto out;
  1207.     }
  1208.  
  1209.     if (s->flags & SLAB_STORE_USER)
  1210.         set_track(s, object, TRACK_FREE, addr);
  1211.     trace(s, page, object, 0);
  1212.     /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
  1213.     init_object(s, object, SLUB_RED_INACTIVE);
  1214.  
  1215.     /* Reached end of constructed freelist yet? */
  1216.     if (object != tail) {
  1217.         object = get_freepointer(s, object);
  1218.         goto next_object;
  1219.     }
  1220.     ret = 1;
  1221.  
  1222. out:
  1223.     if (cnt != bulk_cnt)
  1224.         slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
  1225.              bulk_cnt, cnt);
  1226.  
  1227.     slab_unlock(page);
  1228.     spin_unlock_irqrestore(&n->list_lock, flags);
  1229.     if (!ret)
  1230.         slab_fix(s, "Object at 0x%p not freed", object);
  1231.     return ret;
  1232. }
  1233.  
  1234. static int __init setup_slub_debug(char *str)
  1235. {
  1236.     slub_debug = DEBUG_DEFAULT_FLAGS;
  1237.     if (*str++ != '=' || !*str)
  1238.         /*
  1239.          * No options specified. Switch on full debugging.
  1240.          */
  1241.         goto out;
  1242.  
  1243.     if (*str == ',')
  1244.         /*
  1245.          * No options but restriction on slabs. This means full
  1246.          * debugging for slabs matching a pattern.
  1247.          */
  1248.         goto check_slabs;
  1249.  
  1250.     slub_debug = 0;
  1251.     if (*str == '-')
  1252.         /*
  1253.          * Switch off all debugging measures.
  1254.          */
  1255.         goto out;
  1256.  
  1257.     /*
  1258.      * Determine which debug features should be switched on
  1259.      */
  1260.     for (; *str && *str != ','; str++) {
  1261.         switch (tolower(*str)) {
  1262.         case 'f':
  1263.             slub_debug |= SLAB_CONSISTENCY_CHECKS;
  1264.             break;
  1265.         case 'z':
  1266.             slub_debug |= SLAB_RED_ZONE;
  1267.             break;
  1268.         case 'p':
  1269.             slub_debug |= SLAB_POISON;
  1270.             break;
  1271.         case 'u':
  1272.             slub_debug |= SLAB_STORE_USER;
  1273.             break;
  1274.         case 't':
  1275.             slub_debug |= SLAB_TRACE;
  1276.             break;
  1277.         case 'a':
  1278.             slub_debug |= SLAB_FAILSLAB;
  1279.             break;
  1280.         case 'o':
  1281.             /*
  1282.              * Avoid enabling debugging on caches if their minimum
  1283.              * order would increase as a result.
  1284.              */
  1285.             disable_higher_order_debug = 1;
  1286.             break;
  1287.         default:
  1288.             pr_err("slub_debug option '%c' unknown. skipped\n",
  1289.                    *str);
  1290.         }
  1291.     }
  1292.  
  1293. check_slabs:
  1294.     if (*str == ',')
  1295.         slub_debug_slabs = str + 1;
  1296. out:
  1297.     if ((static_branch_unlikely(&init_on_alloc) ||
  1298.          static_branch_unlikely(&init_on_free)) &&
  1299.         (slub_debug & SLAB_POISON))
  1300.         pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
  1301.     return 1;
  1302. }
  1303.  
  1304. __setup("slub_debug", setup_slub_debug);
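
/*
 * Example usage (editor's note, not part of the file): booting with
 *
 *	slub_debug=FZP,dentry
 *
 * enables consistency checks (F), red zoning (Z) and poisoning (P) only for
 * caches whose name matches "dentry", while a bare "slub_debug" turns on
 * DEBUG_DEFAULT_FLAGS for every cache. A trailing '*' in the slab name acts
 * as a prefix glob in kmem_cache_flags() below, e.g. "slub_debug=Z,kmalloc-*".
 */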
  1305.  
  1306. /*
  1307.  * kmem_cache_flags - apply debugging options to the cache
  1308.  * @object_size:    the size of an object without meta data
  1309.  * @flags:      flags to set
  1310.  * @name:       name of the cache
  1311.  * @ctor:       constructor function
  1312.  *
  1313.  * Debug option(s) are applied to @flags. In addition to the debug
  1314.  * option(s), if a slab name (or multiple) is specified i.e.
  1315.  * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
  1316.  * then only the select slabs will receive the debug option(s).
  1317.  */
  1318. slab_flags_t kmem_cache_flags(unsigned int object_size,
  1319.     slab_flags_t flags, const char *name,
  1320.     void (*ctor)(void *))
  1321. {
  1322.     char *iter;
  1323.     size_t len;
  1324.  
  1325.     /* If slub_debug = 0, it folds into the if conditional. */
  1326.     if (!slub_debug_slabs)
  1327.         return flags | slub_debug;
  1328.  
  1329.     len = strlen(name);
  1330.     iter = slub_debug_slabs;
  1331.     while (*iter) {
  1332.         char *end, *glob;
  1333.         size_t cmplen;
  1334.  
  1335.         end = strchrnul(iter, ',');
  1336.  
  1337.         glob = strnchr(iter, end - iter, '*');
  1338.         if (glob)
  1339.             cmplen = glob - iter;
  1340.         else
  1341.             cmplen = max_t(size_t, len, (end - iter));
  1342.  
  1343.         if (!strncmp(name, iter, cmplen)) {
  1344.             flags |= slub_debug;
  1345.             break;
  1346.         }
  1347.  
  1348.         if (!*end)
  1349.             break;
  1350.         iter = end + 1;
  1351.     }
  1352.  
  1353.     return flags;
  1354. }
  1355. #else /* !CONFIG_SLUB_DEBUG */
  1356. static inline void setup_object_debug(struct kmem_cache *s,
  1357.             struct page *page, void *object) {}
  1358. static inline
  1359. void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
  1360.  
  1361. static inline int alloc_debug_processing(struct kmem_cache *s,
  1362.     struct page *page, void *object, unsigned long addr) { return 0; }
  1363.  
  1364. static inline int free_debug_processing(
  1365.     struct kmem_cache *s, struct page *page,
  1366.     void *head, void *tail, int bulk_cnt,
  1367.     unsigned long addr) { return 0; }
  1368.  
  1369. static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
  1370.             { return 1; }
  1371. static inline int check_object(struct kmem_cache *s, struct page *page,
  1372.             void *object, u8 val) { return 1; }
  1373. static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
  1374.                     struct page *page) {}
  1375. static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
  1376.                     struct page *page) {}
  1377. slab_flags_t kmem_cache_flags(unsigned int object_size,
  1378.     slab_flags_t flags, const char *name,
  1379.     void (*ctor)(void *))
  1380. {
  1381.     return flags;
  1382. }
  1383. #define slub_debug 0
  1384.  
  1385. #define disable_higher_order_debug 0
  1386.  
  1387. static inline unsigned long slabs_node(struct kmem_cache *s, int node)
  1388.                             { return 0; }
  1389. static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
  1390.                             { return 0; }
  1391. static inline void inc_slabs_node(struct kmem_cache *s, int node,
  1392.                             int objects) {}
  1393. static inline void dec_slabs_node(struct kmem_cache *s, int node,
  1394.                             int objects) {}
  1395.  
  1396. static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
  1397.                    void **freelist, void *nextfree)
  1398. {
  1399.     return false;
  1400. }
  1401. #endif /* CONFIG_SLUB_DEBUG */
  1402.  
  1403. /*
  1404.  * Hooks for other subsystems that check memory allocations. In a typical
  1405.  * production configuration these hooks all should produce no code at all.
  1406.  */
  1407. static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
  1408. {
  1409.     ptr = kasan_kmalloc_large(ptr, size, flags);
  1410.     /* As ptr might get tagged, call kmemleak hook after KASAN. */
  1411.     kmemleak_alloc(ptr, size, 1, flags);
  1412.     return ptr;
  1413. }
  1414.  
  1415. static __always_inline void kfree_hook(void *x)
  1416. {
  1417.     kmemleak_free(x);
  1418.     kasan_kfree_large(x, _RET_IP_);
  1419. }
  1420.  
  1421. static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
  1422. {
  1423.     kmemleak_free_recursive(x, s->flags);
  1424.  
  1425.     /*
  1426.      * Trouble is that we may no longer disable interrupts in the fast path.
  1427.      * So in order to make the debug calls that expect irqs to be
  1428.      * disabled we need to disable interrupts temporarily.
  1429.      */
  1430. #ifdef CONFIG_LOCKDEP
  1431.     {
  1432.         unsigned long flags;
  1433.  
  1434.         local_irq_save(flags);
  1435.         debug_check_no_locks_freed(x, s->object_size);
  1436.         local_irq_restore(flags);
  1437.     }
  1438. #endif
  1439.     if (!(s->flags & SLAB_DEBUG_OBJECTS))
  1440.         debug_check_no_obj_freed(x, s->object_size);
  1441.  
  1442.     /* KASAN might put x into memory quarantine, delaying its reuse */
  1443.     return kasan_slab_free(s, x, _RET_IP_);
  1444. }
  1445.  
  1446. static inline bool slab_free_freelist_hook(struct kmem_cache *s,
  1447.                        void **head, void **tail)
  1448. {
  1449.  
  1450.     void *object;
  1451.     void *next = *head;
  1452.     void *old_tail = *tail ? *tail : *head;
  1453.     int rsize;
  1454.  
  1455.     /* Head and tail of the reconstructed freelist */
  1456.     *head = NULL;
  1457.     *tail = NULL;
  1458.  
  1459.     do {
  1460.         object = next;
  1461.         next = get_freepointer(s, object);
  1462.  
  1463.         if (slab_want_init_on_free(s)) {
  1464.             /*
  1465.              * Clear the object and the metadata, but don't touch
  1466.              * the redzone.
  1467.              */
  1468.             memset(object, 0, s->object_size);
  1469.             rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
  1470.                                : 0;
  1471.             memset((char *)object + s->inuse, 0,
  1472.                    s->size - s->inuse - rsize);
  1473.  
  1474.         }
  1475.         /* If object's reuse doesn't have to be delayed */
  1476.         if (!slab_free_hook(s, object)) {
  1477.             /* Move object to the new freelist */
  1478.             set_freepointer(s, object, *head);
  1479.             *head = object;
  1480.             if (!*tail)
  1481.                 *tail = object;
  1482.         }
  1483.     } while (object != old_tail);
  1484.  
  1485.     if (*head == *tail)
  1486.         *tail = NULL;
  1487.  
  1488.     return *head != NULL;
  1489. }
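/*
 * Editor's sketch (simplified, not a verbatim excerpt): the freeing fast
 * path hands its detached freelist to the hook above and only pushes the
 * objects that survive (e.g. those not taken by the KASAN quarantine) back
 * onto the slab:
 *
 *	if (slab_free_freelist_hook(s, &head, &tail))
 *		do_slab_free(s, page, head, tail, cnt, addr);
 *
 * where a NULL tail denotes a single-object free.
 */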
  1490.  
  1491. static void *setup_object(struct kmem_cache *s, struct page *page,
  1492.                 void *object)
  1493. {
  1494.     setup_object_debug(s, page, object);
  1495.     object = kasan_init_slab_obj(s, object);
  1496.     if (unlikely(s->ctor)) {
  1497.         kasan_unpoison_object_data(s, object);
  1498.         s->ctor(object);
  1499.         kasan_poison_object_data(s, object);
  1500.     }
  1501.     return object;
  1502. }
  1503.  
  1504. /*
  1505.  * Slab allocation and freeing
  1506.  */
  1507. static inline struct page *alloc_slab_page(struct kmem_cache *s,
  1508.         gfp_t flags, int node, struct kmem_cache_order_objects oo)
  1509. {
  1510.     struct page *page;
  1511.     unsigned int order = oo_order(oo);
  1512.  
  1513.     if (node == NUMA_NO_NODE)
  1514.         page = alloc_pages(flags, order);
  1515.     else
  1516.         page = __alloc_pages_node(node, flags, order);
  1517.  
  1518.     if (page && charge_slab_page(page, flags, order, s)) {
  1519.         __free_pages(page, order);
  1520.         page = NULL;
  1521.     }
  1522.  
  1523.     return page;
  1524. }
  1525.  
  1526. #ifdef CONFIG_SLAB_FREELIST_RANDOM
  1527. /* Pre-initialize the random sequence cache */
  1528. static int init_cache_random_seq(struct kmem_cache *s)
  1529. {
  1530.     unsigned int count = oo_objects(s->oo);
  1531.     int err;
  1532.  
  1533.     /* Bailout if already initialised */
  1534.     if (s->random_seq)
  1535.         return 0;
  1536.  
  1537.     err = cache_random_seq_create(s, count, GFP_KERNEL);
  1538.     if (err) {
  1539.         pr_err("SLUB: Unable to initialize free list for %s\n",
  1540.             s->name);
  1541.         return err;
  1542.     }
  1543.  
  1544.     /* Transform to an offset on the set of pages */
  1545.     if (s->random_seq) {
  1546.         unsigned int i;
  1547.  
  1548.         for (i = 0; i < count; i++)
  1549.             s->random_seq[i] *= s->size;
  1550.     }
  1551.     return 0;
  1552. }
  1553.  
  1554. /* Initialize each random sequence freelist per cache */
  1555. static void __init init_freelist_randomization(void)
  1556. {
  1557.     struct kmem_cache *s;
  1558.  
  1559.     mutex_lock(&slab_mutex);
  1560.  
  1561.     list_for_each_entry(s, &slab_caches, list)
  1562.         init_cache_random_seq(s);
  1563.  
  1564.     mutex_unlock(&slab_mutex);
  1565. }
  1566.  
  1567. /* Get the next entry from the pre-computed, randomized freelist */
  1568. static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
  1569.                 unsigned long *pos, void *start,
  1570.                 unsigned long page_limit,
  1571.                 unsigned long freelist_count)
  1572. {
  1573.     unsigned int idx;
  1574.  
  1575.     /*
  1576.      * If the target page allocation failed, the number of objects on the
  1577.      * page might be smaller than the usual size defined by the cache.
  1578.      */
  1579.     do {
  1580.         idx = s->random_seq[*pos];
  1581.         *pos += 1;
  1582.         if (*pos >= freelist_count)
  1583.             *pos = 0;
  1584.     } while (unlikely(idx >= page_limit));
  1585.  
  1586.     return (char *)start + idx;
  1587. }
  1588.  
  1589. /* Shuffle the singly linked freelist based on a random pre-computed sequence */
  1590. static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
  1591. {
  1592.     void *start;
  1593.     void *cur;
  1594.     void *next;
  1595.     unsigned long idx, pos, page_limit, freelist_count;
  1596.  
  1597.     if (page->objects < 2 || !s->random_seq)
  1598.         return false;
  1599.  
  1600.     freelist_count = oo_objects(s->oo);
  1601.     pos = get_random_int() % freelist_count;
  1602.  
  1603.     page_limit = page->objects * s->size;
  1604.     start = fixup_red_left(s, page_address(page));
  1605.  
  1606.     /* First entry is used as the base of the freelist */
  1607.     cur = next_freelist_entry(s, page, &pos, start, page_limit,
  1608.                 freelist_count);
  1609.     cur = setup_object(s, page, cur);
  1610.     page->freelist = cur;
  1611.  
  1612.     for (idx = 1; idx < page->objects; idx++) {
  1613.         next = next_freelist_entry(s, page, &pos, start, page_limit,
  1614.             freelist_count);
  1615.         next = setup_object(s, page, next);
  1616.         set_freepointer(s, cur, next);
  1617.         cur = next;
  1618.     }
  1619.     set_freepointer(s, cur, NULL);
  1620.  
  1621.     return true;
  1622. }
  1623. #else
  1624. static inline int init_cache_random_seq(struct kmem_cache *s)
  1625. {
  1626.     return 0;
  1627. }
  1628. static inline void init_freelist_randomization(void) { }
  1629. static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
  1630. {
  1631.     return false;
  1632. }
  1633. #endif /* CONFIG_SLAB_FREELIST_RANDOM */
  1634.  
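        /*
         * Allocate a slab page and lay out its initial freelist, either shuffled
         * (CONFIG_SLAB_FREELIST_RANDOM) or in address order. The new slab is
         * returned frozen with all objects accounted as in use; the caller takes
         * over page->freelist.
         */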
  1635. static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
  1636. {
  1637.     struct page *page;
  1638.     struct kmem_cache_order_objects oo = s->oo;
  1639.     gfp_t alloc_gfp;
  1640.     void *start, *p, *next;
  1641.     int idx;
  1642.     bool shuffle;
  1643.  
  1644.     flags &= gfp_allowed_mask;
  1645.  
  1646.     if (gfpflags_allow_blocking(flags))
  1647.         local_irq_enable();
  1648.  
  1649.     flags |= s->allocflags;
  1650.  
  1651.     /*
  1652.      * Let the initial higher-order allocation fail under memory pressure
  1653.      * so we fall back to the minimum order allocation.
  1654.      */
  1655.     alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
  1656.     if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
  1657.         alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
  1658.  
  1659.     page = alloc_slab_page(s, alloc_gfp, node, oo);
  1660.     if (unlikely(!page)) {
  1661.         oo = s->min;
  1662.         alloc_gfp = flags;
  1663.         /*
  1664.          * Allocation may have failed due to fragmentation.
  1665.          * Try a lower order alloc if possible
  1666.          */
  1667.         page = alloc_slab_page(s, alloc_gfp, node, oo);
  1668.         if (unlikely(!page))
  1669.             goto out;
  1670.         stat(s, ORDER_FALLBACK);
  1671.     }
  1672.  
  1673.     page->objects = oo_objects(oo);
  1674.  
  1675.     page->slab_cache = s;
  1676.     __SetPageSlab(page);
  1677.     if (page_is_pfmemalloc(page))
  1678.         SetPageSlabPfmemalloc(page);
  1679.  
  1680.     kasan_poison_slab(page);
  1681.  
  1682.     start = page_address(page);
  1683.  
  1684.     setup_page_debug(s, page, start);
  1685.  
  1686.     shuffle = shuffle_freelist(s, page);
  1687.  
  1688.     if (!shuffle) {
  1689.         start = fixup_red_left(s, start);
  1690.         start = setup_object(s, page, start);
  1691.         page->freelist = start;
  1692.         for (idx = 0, p = start; idx < page->objects - 1; idx++) {
  1693.             next = p + s->size;
  1694.             next = setup_object(s, page, next);
  1695.             set_freepointer(s, p, next);
  1696.             p = next;
  1697.         }
  1698.         set_freepointer(s, p, NULL);
  1699.     }
  1700.  
  1701.     page->inuse = page->objects;
  1702.     page->frozen = 1;
  1703.  
  1704. out:
  1705.     if (gfpflags_allow_blocking(flags))
  1706.         local_irq_disable();
  1707.     if (!page)
  1708.         return NULL;
  1709.  
  1710.     inc_slabs_node(s, page_to_nid(page), page->objects);
  1711.  
  1712.     return page;
  1713. }
  1714.  
  1715. static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
  1716. {
  1717.     if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
  1718.         gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
  1719.         flags &= ~GFP_SLAB_BUG_MASK;
  1720.         pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
  1721.                 invalid_mask, &invalid_mask, flags, &flags);
  1722.         dump_stack();
  1723.     }
  1724.  
  1725.     return allocate_slab(s,
  1726.         flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
  1727. }
  1728.  
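        /*
         * Return a slab page to the page allocator: run the consistency checks
         * when enabled, clear the slab page state, credit reclaim_state when we
         * are reclaiming and drop the accounting charge before freeing the pages.
         */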
  1729. static void __free_slab(struct kmem_cache *s, struct page *page)
  1730. {
  1731.     int order = compound_order(page);
  1732.     int pages = 1 << order;
  1733.  
  1734.     if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  1735.         void *p;
  1736.  
  1737.         slab_pad_check(s, page);
  1738.         for_each_object(p, s, page_address(page),
  1739.                         page->objects)
  1740.             check_object(s, page, p, SLUB_RED_INACTIVE);
  1741.     }
  1742.  
  1743.     __ClearPageSlabPfmemalloc(page);
  1744.     __ClearPageSlab(page);
  1745.  
  1746.     page->mapping = NULL;
  1747.     if (current->reclaim_state)
  1748.         current->reclaim_state->reclaimed_slab += pages;
  1749.     uncharge_slab_page(page, order, s);
  1750.     __free_pages(page, order);
  1751. }
  1752.  
  1753. static void rcu_free_slab(struct rcu_head *h)
  1754. {
  1755.     struct page *page = container_of(h, struct page, rcu_head);
  1756.  
  1757.     __free_slab(page->slab_cache, page);
  1758. }
  1759.  
  1760. static void free_slab(struct kmem_cache *s, struct page *page)
  1761. {
  1762.     if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
  1763.         call_rcu(&page->rcu_head, rcu_free_slab);
  1764.     } else
  1765.         __free_slab(s, page);
  1766. }
  1767.  
  1768. static void discard_slab(struct kmem_cache *s, struct page *page)
  1769. {
  1770.     dec_slabs_node(s, page_to_nid(page), page->objects);
  1771.     free_slab(s, page);
  1772. }
  1773.  
  1774. /*
  1775.  * Management of partially allocated slabs.
  1776.  */
  1777. static inline void
  1778. __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
  1779. {
  1780.     n->nr_partial++;
  1781.     if (tail == DEACTIVATE_TO_TAIL)
  1782.         list_add_tail(&page->slab_list, &n->partial);
  1783.     else
  1784.         list_add(&page->slab_list, &n->partial);
  1785. }
  1786.  
  1787. static inline void add_partial(struct kmem_cache_node *n,
  1788.                 struct page *page, int tail)
  1789. {
  1790.     lockdep_assert_held(&n->list_lock);
  1791.     __add_partial(n, page, tail);
  1792. }
  1793.  
  1794. static inline void remove_partial(struct kmem_cache_node *n,
  1795.                     struct page *page)
  1796. {
  1797.     lockdep_assert_held(&n->list_lock);
  1798.     list_del(&page->slab_list);
  1799.     n->nr_partial--;
  1800. }
  1801.  
  1802. /*
  1803.  * Remove slab from the partial list, freeze it and
  1804.  * return the pointer to the freelist.
  1805.  *
  1806.  * Returns a list of objects or NULL if it fails.
  1807.  */
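        /*
         * 'mode' is non-zero for the first slab acquired: the whole freelist is
         * then taken over (page->freelist becomes NULL, all objects are marked in
         * use) so that it can be handed to the cpu slab. For subsequent slabs the
         * freelist is left in place and the page is only frozen for the per-cpu
         * partial list.
         */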
  1808. static inline void *acquire_slab(struct kmem_cache *s,
  1809.         struct kmem_cache_node *n, struct page *page,
  1810.         int mode, int *objects)
  1811. {
  1812.     void *freelist;
  1813.     unsigned long counters;
  1814.     struct page new;
  1815.  
  1816.     lockdep_assert_held(&n->list_lock);
  1817.  
  1818.     /*
  1819.      * Zap the freelist and set the frozen bit.
  1820.      * The old freelist is the list of objects for the
  1821.      * per cpu allocation list.
  1822.      */
  1823.     freelist = page->freelist;
  1824.     counters = page->counters;
  1825.     new.counters = counters;
  1826.     *objects = new.objects - new.inuse;
  1827.     if (mode) {
  1828.         new.inuse = page->objects;
  1829.         new.freelist = NULL;
  1830.     } else {
  1831.         new.freelist = freelist;
  1832.     }
  1833.  
  1834.     VM_BUG_ON(new.frozen);
  1835.     new.frozen = 1;
  1836.  
  1837.     if (!__cmpxchg_double_slab(s, page,
  1838.             freelist, counters,
  1839.             new.freelist, new.counters,
  1840.             "acquire_slab"))
  1841.         return NULL;
  1842.  
  1843.     remove_partial(n, page);
  1844.     WARN_ON(!freelist);
  1845.     return freelist;
  1846. }
  1847.  
  1848. static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
  1849. static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
  1850.  
  1851. /*
  1852.  * Try to allocate a partial slab from a specific node.
  1853.  */
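        /*
         * The first slab found refills the cpu slab; further slabs are parked on
         * the per-cpu partial list. Scanning stops once more than half of the
         * configured cpu_partial object count has been gathered, or right after
         * the first slab if the cache has no cpu partial lists.
         */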
  1854. static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
  1855.                 struct kmem_cache_cpu *c, gfp_t flags)
  1856. {
  1857.     struct page *page, *page2;
  1858.     void *object = NULL;
  1859.     unsigned int available = 0;
  1860.     int objects;
  1861.  
  1862.     /*
  1863.      * Racy check. If we mistakenly see no partial slabs then we
  1864.      * just allocate an empty slab. If we mistakenly try to get a
  1865.      * partial slab and there is none available then this function
  1866.      * will return NULL.
  1867.      */
  1868.     if (!n || !n->nr_partial)
  1869.         return NULL;
  1870.  
  1871.     spin_lock(&n->list_lock);
  1872.     list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
  1873.         void *t;
  1874.  
  1875.         if (!pfmemalloc_match(page, flags))
  1876.             continue;
  1877.  
  1878.         t = acquire_slab(s, n, page, object == NULL, &objects);
  1879.         if (!t)
  1880.             break;
  1881.  
  1882.         available += objects;
  1883.         if (!object) {
  1884.             c->page = page;
  1885.             stat(s, ALLOC_FROM_PARTIAL);
  1886.             object = t;
  1887.         } else {
  1888.             put_cpu_partial(s, page, 0);
  1889.             stat(s, CPU_PARTIAL_NODE);
  1890.         }
  1891.         if (!kmem_cache_has_cpu_partial(s)
  1892.             || available > slub_cpu_partial(s) / 2)
  1893.             break;
  1894.  
  1895.     }
  1896.     spin_unlock(&n->list_lock);
  1897.     return object;
  1898. }
  1899.  
  1900. /*
  1901.  * Get a page from somewhere. Search in increasing NUMA distances.
  1902.  */
  1903. static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
  1904.         struct kmem_cache_cpu *c)
  1905. {
  1906. #ifdef CONFIG_NUMA
  1907.     struct zonelist *zonelist;
  1908.     struct zoneref *z;
  1909.     struct zone *zone;
  1910.     enum zone_type high_zoneidx = gfp_zone(flags);
  1911.     void *object;
  1912.     unsigned int cpuset_mems_cookie;
  1913.  
  1914.     /*
  1915.      * The defrag ratio allows a configuration of the tradeoffs between
  1916.      * inter node defragmentation and node local allocations. A lower
  1917.      * defrag_ratio increases the tendency to do local allocations
  1918.      * instead of attempting to obtain partial slabs from other nodes.
  1919.      *
  1920.      * If the defrag_ratio is set to 0 then kmalloc() always
  1921.      * returns node local objects. If the ratio is higher then kmalloc()
  1922.      * may return off node objects because partial slabs are obtained
  1923.      * from other nodes and filled up.
  1924.      *
  1925.      * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
  1926.      * (which makes defrag_ratio = 1000) then every (well almost)
  1927.      * allocation will first attempt to defrag slab caches on other nodes.
  1928.      * This means scanning over all nodes to look for partial slabs which
  1929.      * may be expensive if we do it every time we are trying to find a slab
  1930.      * with available objects.
  1931.      */
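            /*
             * The check below skips the remote scan with probability roughly
             * (1024 - remote_node_defrag_ratio) / 1024, since get_cycles() % 1024
             * behaves as a cheap pseudo-random value.
             */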
  1932.     if (!s->remote_node_defrag_ratio ||
  1933.             get_cycles() % 1024 > s->remote_node_defrag_ratio)
  1934.         return NULL;
  1935.  
  1936.     do {
  1937.         cpuset_mems_cookie = read_mems_allowed_begin();
  1938.         zonelist = node_zonelist(mempolicy_slab_node(), flags);
  1939.         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
  1940.             struct kmem_cache_node *n;
  1941.  
  1942.             n = get_node(s, zone_to_nid(zone));
  1943.  
  1944.             if (n && cpuset_zone_allowed(zone, flags) &&
  1945.                     n->nr_partial > s->min_partial) {
  1946.                 object = get_partial_node(s, n, c, flags);
  1947.                 if (object) {
  1948.                     /*
  1949.                      * Don't check read_mems_allowed_retry()
  1950.                      * here - if mems_allowed was updated in
  1951.                      * parallel, that was a harmless race
  1952.                      * between allocation and the cpuset
  1953.                      * update
  1954.                      */
  1955.                     return object;
  1956.                 }
  1957.             }
  1958.         }
  1959.     } while (read_mems_allowed_retry(cpuset_mems_cookie));
  1960. #endif  /* CONFIG_NUMA */
  1961.     return NULL;
  1962. }
  1963.  
  1964. /*
  1965.  * Get a partial page, lock it and return it.
  1966.  */
  1967. static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
  1968.         struct kmem_cache_cpu *c)
  1969. {
  1970.     void *object;
  1971.     int searchnode = node;
  1972.  
  1973.     if (node == NUMA_NO_NODE)
  1974.         searchnode = numa_mem_id();
  1975.  
  1976.     object = get_partial_node(s, get_node(s, searchnode), c, flags);
  1977.     if (object || node != NUMA_NO_NODE)
  1978.         return object;
  1979.  
  1980.     return get_any_partial(s, flags, c);
  1981. }
  1982.  
  1983. #ifdef CONFIG_PREEMPT
  1984. /*
  1985.  * Calculate the next globally unique transaction id for disambiguation
  1986.  * during cmpxchg. The transaction ids start at the cpu number and are then
  1987.  * incremented by TID_STEP (CONFIG_NR_CPUS rounded up to a power of two).
  1988.  */
  1989. #define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
  1990. #else
  1991. /*
  1992.  * No preemption is supported, therefore there is also no need to check
  1993.  * for different cpus.
  1994.  */
  1995. #define TID_STEP 1
  1996. #endif
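        /*
         * Example (illustrative): with CONFIG_PREEMPT and 4 possible cpus,
         * TID_STEP is 4, so cpu 1 cycles through tids 1, 5, 9, ... Tids from
         * different cpus therefore never collide, and every fastpath operation
         * advances the tid, which the cmpxchg_double checks rely on.
         */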
  1997.  
  1998. static inline unsigned long next_tid(unsigned long tid)
  1999. {
  2000.     return tid + TID_STEP;
  2001. }
  2002.  
  2003. #ifdef SLUB_DEBUG_CMPXCHG
  2004. static inline unsigned int tid_to_cpu(unsigned long tid)
  2005. {
  2006.     return tid % TID_STEP;
  2007. }
  2008.  
  2009. static inline unsigned long tid_to_event(unsigned long tid)
  2010. {
  2011.     return tid / TID_STEP;
  2012. }
  2013. #endif
  2014.  
  2015. static inline unsigned int init_tid(int cpu)
  2016. {
  2017.     return cpu;
  2018. }
  2019.  
  2020. static inline void note_cmpxchg_failure(const char *n,
  2021.         const struct kmem_cache *s, unsigned long tid)
  2022. {
  2023. #ifdef SLUB_DEBUG_CMPXCHG
  2024.     unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
  2025.  
  2026.     pr_info("%s %s: cmpxchg redo ", n, s->name);
  2027.  
  2028. #ifdef CONFIG_PREEMPT
  2029.     if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
  2030.         pr_warn("due to cpu change %d -> %d\n",
  2031.             tid_to_cpu(tid), tid_to_cpu(actual_tid));
  2032.     else
  2033. #endif
  2034.     if (tid_to_event(tid) != tid_to_event(actual_tid))
  2035.         pr_warn("due to cpu running other code. Event %ld->%ld\n",
  2036.             tid_to_event(tid), tid_to_event(actual_tid));
  2037.     else
  2038.         pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
  2039.             actual_tid, tid, next_tid(tid));
  2040. #endif
  2041.     stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
  2042. }
  2043.  
  2044. static void init_kmem_cache_cpus(struct kmem_cache *s)
  2045. {
  2046.     int cpu;
  2047.  
  2048.     for_each_possible_cpu(cpu)
  2049.         per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
  2050. }
  2051.  
  2052. /*
  2053.  * Remove the cpu slab
  2054.  */
  2055. static void deactivate_slab(struct kmem_cache *s, struct page *page,
  2056.                 void *freelist, struct kmem_cache_cpu *c)
  2057. {
  2058.     enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
  2059.     struct kmem_cache_node *n = get_node(s, page_to_nid(page));
  2060.     int lock = 0;
  2061.     enum slab_modes l = M_NONE, m = M_NONE;
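            /* 'l' tracks the list the page is currently on, 'm' the target list */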
  2062.     void *nextfree;
  2063.     int tail = DEACTIVATE_TO_HEAD;
  2064.     struct page new;
  2065.     struct page old;
  2066.  
  2067.     if (page->freelist) {
  2068.         stat(s, DEACTIVATE_REMOTE_FREES);
  2069.         tail = DEACTIVATE_TO_TAIL;
  2070.     }
  2071.  
  2072.     /*
  2073.      * Stage one: Free all available per cpu objects back
  2074.      * to the page freelist while it is still frozen. Leave the
  2075.      * last one.
  2076.      *
  2077.      * There is no need to take the list->lock because the page
  2078.      * is still frozen.
  2079.      */
  2080.     while (freelist && (nextfree = get_freepointer(s, freelist))) {
  2081.         void *prior;
  2082.         unsigned long counters;
  2083.  
  2084.         /*
  2085.          * If 'nextfree' is invalid, it is possible that the object at
  2086.          * 'freelist' is already corrupted.  So isolate all objects
  2087.          * starting at 'freelist'.
  2088.          */
  2089.         if (freelist_corrupted(s, page, &freelist, nextfree))
  2090.             break;
  2091.  
  2092.         do {
  2093.             prior = page->freelist;
  2094.             counters = page->counters;
  2095.             set_freepointer(s, freelist, prior);
  2096.             new.counters = counters;
  2097.             new.inuse--;
  2098.             VM_BUG_ON(!new.frozen);
  2099.  
  2100.         } while (!__cmpxchg_double_slab(s, page,
  2101.             prior, counters,
  2102.             freelist, new.counters,
  2103.             "drain percpu freelist"));
  2104.  
  2105.         freelist = nextfree;
  2106.     }
  2107.  
  2108.     /*
  2109.      * Stage two: Ensure that the page is unfrozen while the
  2110.      * list presence reflects the actual number of objects
  2111.      * during unfreeze.
  2112.      *
  2113.      * We setup the list membership and then perform a cmpxchg
  2114.      * with the count. If there is a mismatch then the page
  2115.      * is not unfrozen but the page is on the wrong list.
  2116.      *
  2117.      * Then we restart the process which may have to remove
  2118.      * the page from the list that we just put it on again
  2119.      * because the number of objects in the slab may have
  2120.      * changed.
  2121.      */
  2122. redo:
  2123.  
  2124.     old.freelist = page->freelist;
  2125.     old.counters = page->counters;
  2126.     VM_BUG_ON(!old.frozen);
  2127.  
  2128.     /* Determine target state of the slab */
  2129.     new.counters = old.counters;
  2130.     if (freelist) {
  2131.         new.inuse--;
  2132.         set_freepointer(s, freelist, old.freelist);
  2133.         new.freelist = freelist;
  2134.     } else
  2135.         new.freelist = old.freelist;
  2136.  
  2137.     new.frozen = 0;
  2138.  
  2139.     if (!new.inuse && n->nr_partial >= s->min_partial)
  2140.         m = M_FREE;
  2141.     else if (new.freelist) {
  2142.         m = M_PARTIAL;
  2143.         if (!lock) {
  2144.             lock = 1;
  2145.             /*
  2146.              * Taking the spinlock removes the possibility
  2147.              * that acquire_slab() will see a slab page that
  2148.              * is frozen
  2149.              */
  2150.             spin_lock(&n->list_lock);
  2151.         }
  2152.     } else {
  2153.         m = M_FULL;
  2154.         if (kmem_cache_debug(s) && !lock) {
  2155.             lock = 1;
  2156.             /*
  2157.              * This also ensures that the scanning of full
  2158.              * slabs from diagnostic functions will not see
  2159.              * any frozen slabs.
  2160.              */
  2161.             spin_lock(&n->list_lock);
  2162.         }
  2163.     }
  2164.  
  2165.     if (l != m) {
  2166.         if (l == M_PARTIAL)
  2167.             remove_partial(n, page);
  2168.         else if (l == M_FULL)
  2169.             remove_full(s, n, page);
  2170.  
  2171.         if (m == M_PARTIAL)
  2172.             add_partial(n, page, tail);
  2173.         else if (m == M_FULL)
  2174.             add_full(s, n, page);
  2175.     }
  2176.  
  2177.     l = m;
  2178.     if (!__cmpxchg_double_slab(s, page,
  2179.                 old.freelist, old.counters,
  2180.                 new.freelist, new.counters,
  2181.                 "unfreezing slab"))
  2182.         goto redo;
  2183.  
  2184.     if (lock)
  2185.         spin_unlock(&n->list_lock);
  2186.  
  2187.     if (m == M_PARTIAL)
  2188.         stat(s, tail);
  2189.     else if (m == M_FULL)
  2190.         stat(s, DEACTIVATE_FULL);
  2191.     else if (m == M_FREE) {
  2192.         stat(s, DEACTIVATE_EMPTY);
  2193.         discard_slab(s, page);
  2194.         stat(s, FREE_SLAB);
  2195.     }
  2196.  
  2197.     c->page = NULL;
  2198.     c->freelist = NULL;
  2199. }
  2200.  
  2201. /*
  2202.  * Unfreeze all the cpu partial slabs.
  2203.  *
  2204.  * This function must be called with interrupts disabled
  2205.  * for the cpu that owns c (or some other guarantee must exist
  2206.  * that prevents concurrent accesses).
  2207.  */
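        /*
         * The node list_lock is taken lazily and only re-acquired when we move on
         * to a page from a different node. Slabs that ended up completely free
         * (on nodes already holding at least min_partial partial slabs) are
         * batched on a private list and discarded after the lock is dropped.
         */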
  2208. static void unfreeze_partials(struct kmem_cache *s,
  2209.         struct kmem_cache_cpu *c)
  2210. {
  2211. #ifdef CONFIG_SLUB_CPU_PARTIAL
  2212.     struct kmem_cache_node *n = NULL, *n2 = NULL;
  2213.     struct page *page, *discard_page = NULL;
  2214.  
  2215.     while ((page = c->partial)) {
  2216.         struct page new;
  2217.         struct page old;
  2218.  
  2219.         c->partial = page->next;
  2220.  
  2221.         n2 = get_node(s, page_to_nid(page));
  2222.         if (n != n2) {
  2223.             if (n)
  2224.                 spin_unlock(&n->list_lock);
  2225.  
  2226.             n = n2;
  2227.             spin_lock(&n->list_lock);
  2228.         }
  2229.  
  2230.         do {
  2231.  
  2232.             old.freelist = page->freelist;
  2233.             old.counters = page->counters;
  2234.             VM_BUG_ON(!old.frozen);
  2235.  
  2236.             new.counters = old.counters;
  2237.             new.freelist = old.freelist;
  2238.  
  2239.             new.frozen = 0;
  2240.  
  2241.         } while (!__cmpxchg_double_slab(s, page,
  2242.                 old.freelist, old.counters,
  2243.                 new.freelist, new.counters,
  2244.                 "unfreezing slab"));
  2245.  
  2246.         if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
  2247.             page->next = discard_page;
  2248.             discard_page = page;
  2249.         } else {
  2250.             add_partial(n, page, DEACTIVATE_TO_TAIL);
  2251.             stat(s, FREE_ADD_PARTIAL);
  2252.         }
  2253.     }
  2254.  
  2255.     if (n)
  2256.         spin_unlock(&n->list_lock);
  2257.  
  2258.     while (discard_page) {
  2259.         page = discard_page;
  2260.         discard_page = discard_page->next;
  2261.  
  2262.         stat(s, DEACTIVATE_EMPTY);
  2263.         discard_slab(s, page);
  2264.         stat(s, FREE_SLAB);
  2265.     }
  2266. #endif  /* CONFIG_SLUB_CPU_PARTIAL */
  2267. }
  2268.  
  2269. /*
  2270.  * Put a page that was just frozen (in __slab_free|get_partial_node) into a
  2271.  * partial page slot if available.
  2272.  *
  2273.  * If we did not find a slot then simply move all the partials to the
  2274.  * per node partial list.
  2275.  */
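        /*
         * The running totals (pages, pobjects) are kept in the head page of the
         * per-cpu partial list. When draining is allowed and pobjects exceeds
         * s->cpu_partial, the existing list is unfrozen and moved to the node
         * partial lists before the new page is added.
         */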
  2276. static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
  2277. {
  2278. #ifdef CONFIG_SLUB_CPU_PARTIAL
  2279.     struct page *oldpage;
  2280.     int pages;
  2281.     int pobjects;
  2282.  
  2283.     preempt_disable();
  2284.     do {
  2285.         pages = 0;
  2286.         pobjects = 0;
  2287.         oldpage = this_cpu_read(s->cpu_slab->partial);
  2288.  
  2289.         if (oldpage) {
  2290.             pobjects = oldpage->pobjects;
  2291.             pages = oldpage->pages;
  2292.             if (drain && pobjects > s->cpu_partial) {
  2293.                 unsigned long flags;
  2294.                 /*
  2295.                  * partial array is full. Move the existing
  2296.                  * set to the per node partial list.
  2297.                  */
  2298.                 local_irq_save(flags);
  2299.                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
  2300.                 local_irq_restore(flags);
  2301.                 oldpage = NULL;
  2302.                 pobjects = 0;
  2303.                 pages = 0;
  2304.                 stat(s, CPU_PARTIAL_DRAIN);
  2305.             }
  2306.         }
  2307.  
  2308.         pages++;
  2309.         pobjects += page->objects - page->inuse;
  2310.  
  2311.         page->pages = pages;
  2312.         page->pobjects = pobjects;
  2313.         page->next = oldpage;
  2314.  
  2315.     } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
  2316.                                 != oldpage);
  2317.     if (unlikely(!s->cpu_partial)) {
  2318.         unsigned long flags;
  2319.  
  2320.         local_irq_save(flags);
  2321.         unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
  2322.         local_irq_restore(flags);
  2323.     }
  2324.     preempt_enable();
  2325. #endif  /* CONFIG_SLUB_CPU_PARTIAL */
  2326. }
  2327.  
  2328. static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
  2329. {
  2330.     stat(s, CPUSLAB_FLUSH);
  2331.     deactivate_slab(s, c->page, c->freelist, c);
  2332.  
  2333.     c->tid = next_tid(c->tid);
  2334. }
  2335.  
  2336. /*
  2337.  * Flush cpu slab.
  2338.  *
  2339.  * Called from IPI handler with interrupts disabled.
  2340.  */
  2341. static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
  2342. {
  2343.     struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
  2344.  
  2345.     if (c->page)
  2346.         flush_slab(s, c);
  2347.  
  2348.     unfreeze_partials(s, c);
  2349. }
  2350.  
  2351. static void flush_cpu_slab(void *d)
  2352. {
  2353.     struct kmem_cache *s = d;
  2354.  
  2355.     __flush_cpu_slab(s, smp_processor_id());
  2356. }
  2357.  
  2358. static bool has_cpu_slab(int cpu, void *info)
  2359. {
  2360.     struct kmem_cache *s = info;
  2361.     struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
  2362.  
  2363.     return c->page || slub_percpu_partial(c);
  2364. }
  2365.  
  2366. static void flush_all(struct kmem_cache *s)
  2367. {
  2368.     on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
  2369. }
  2370.  
  2371. /*
  2372.  * Use the cpu hotplug callback to ensure that the cpu slabs are flushed
  2373.  * when a cpu goes offline.
  2374.  */
  2375. static int slub_cpu_dead(unsigned int cpu)
  2376. {
  2377.     struct kmem_cache *s;
  2378.     unsigned long flags;
  2379.  
  2380.     mutex_lock(&slab_mutex);
  2381.     list_for_each_entry(s, &slab_caches, list) {
  2382.         local_irq_save(flags);
  2383.         __flush_cpu_slab(s, cpu);
  2384.         local_irq_restore(flags);
  2385.     }
  2386.     mutex_unlock(&slab_mutex);
  2387.     return 0;
  2388. }
  2389.  
  2390. /*
  2391.  * Check if the objects in a per cpu structure fit numa
  2392.  * locality expectations.
  2393.  */
  2394. static inline int node_match(struct page *page, int node)
  2395. {
  2396. #ifdef CONFIG_NUMA
  2397.     if (node != NUMA_NO_NODE && page_to_nid(page) != node)
  2398.         return 0;
  2399. #endif
  2400.     return 1;
  2401. }
  2402.  
  2403. #ifdef CONFIG_SLUB_DEBUG
  2404. static int count_free(struct page *page)
  2405. {
  2406.     return page->objects - page->inuse;
  2407. }
  2408.  
  2409. static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
  2410. {
  2411.     return atomic_long_read(&n->total_objects);
  2412. }
  2413. #endif /* CONFIG_SLUB_DEBUG */
  2414.  
  2415. #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
  2416. static unsigned long count_partial(struct kmem_cache_node *n,
  2417.                     int (*get_count)(struct page *))
  2418. {
  2419.     unsigned long flags;
  2420.     unsigned long x = 0;
  2421.     struct page *page;
  2422.  
  2423.     spin_lock_irqsave(&n->list_lock, flags);
  2424.     list_for_each_entry(page, &n->partial, slab_list)
  2425.         x += get_count(page);
  2426.     spin_unlock_irqrestore(&n->list_lock, flags);
  2427.     return x;
  2428. }
  2429. #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
  2430.  
  2431. static noinline void
  2432. slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
  2433. {
  2434. #ifdef CONFIG_SLUB_DEBUG
  2435.     static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
  2436.                       DEFAULT_RATELIMIT_BURST);
  2437.     int node;
  2438.     struct kmem_cache_node *n;
  2439.  
  2440.     if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
  2441.         return;
  2442.  
  2443.     pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
  2444.         nid, gfpflags, &gfpflags);
  2445.     pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
  2446.         s->name, s->object_size, s->size, oo_order(s->oo),
  2447.         oo_order(s->min));
  2448.  
  2449.     if (oo_order(s->min) > get_order(s->object_size))
  2450.         pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
  2451.             s->name);
  2452.  
  2453.     for_each_kmem_cache_node(s, node, n) {
  2454.         unsigned long nr_slabs;
  2455.         unsigned long nr_objs;
  2456.         unsigned long nr_free;
  2457.  
  2458.         nr_free  = count_partial(n, count_free);
  2459.         nr_slabs = node_nr_slabs(n);
  2460.         nr_objs  = node_nr_objs(n);
  2461.  
  2462.         pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
  2463.             node, nr_slabs, nr_objs, nr_free);
  2464.     }
  2465. #endif
  2466. }
  2467.  
  2468. static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
  2469.             int node, struct kmem_cache_cpu **pc)
  2470. {
  2471.     void *freelist;
  2472.     struct kmem_cache_cpu *c = *pc;
  2473.     struct page *page;
  2474.  
  2475.     WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
  2476.  
  2477.     freelist = get_partial(s, flags, node, c);
  2478.  
  2479.     if (freelist)
  2480.         return freelist;
  2481.  
  2482.     page = new_slab(s, flags, node);
  2483.     if (page) {
  2484.         c = raw_cpu_ptr(s->cpu_slab);
  2485.         if (c->page)
  2486.             flush_slab(s, c);
  2487.  
  2488.         /*
  2489.          * No other reference to the page yet so we can
  2490.          * muck around with it freely without cmpxchg
  2491.          */
  2492.         freelist = page->freelist;
  2493.         page->freelist = NULL;
  2494.  
  2495.         stat(s, ALLOC_SLAB);
  2496.         c->page = page;
  2497.         *pc = c;
  2498.     }
  2499.  
  2500.     return freelist;
  2501. }
  2502.  
  2503. static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
  2504. {
  2505.     if (unlikely(PageSlabPfmemalloc(page)))
  2506.         return gfp_pfmemalloc_allowed(gfpflags);
  2507.  
  2508.     return true;
  2509. }
  2510.  
  2511. /*
  2512.  * Check the page->freelist of a page and either transfer the freelist to the
  2513.  * per cpu freelist or deactivate the page.
  2514.  *
  2515.  * The page is still frozen if the return value is not NULL.
  2516.  *
  2517.  * If this function returns NULL then the page has been unfrozen.
  2518.  *
  2519.  * This function must be called with interrupts disabled.
  2520.  */
  2521. static inline void *get_freelist(struct kmem_cache *s, struct page *page)
  2522. {
  2523.     struct page new;
  2524.     unsigned long counters;
  2525.     void *freelist;
  2526.  
  2527.     do {
  2528.         freelist = page->freelist;
  2529.         counters = page->counters;
  2530.  
  2531.         new.counters = counters;
  2532.         VM_BUG_ON(!new.frozen);
  2533.  
  2534.         new.inuse = page->objects;
  2535.         new.frozen = freelist != NULL;
  2536.  
  2537.     } while (!__cmpxchg_double_slab(s, page,
  2538.         freelist, counters,
  2539.         NULL, new.counters,
  2540.         "get_freelist"));
  2541.  
  2542.     return freelist;
  2543. }
  2544.  
  2545. /*
  2546.  * Slow path. The lockless freelist is empty or we need to perform
  2547.  * debugging duties.
  2548.  *
  2549.  * Processing is still very fast if new objects have been freed to the
  2550.  * regular freelist. In that case we simply take over the regular freelist
  2551.  * as the lockless freelist and zap the regular freelist.
  2552.  *
  2553.  * If that is not working then we fall back to the partial lists. We take the
  2554.  * first element of the freelist as the object to allocate now and move the
  2555.  * rest of the freelist to the lockless freelist.
  2556.  *
  2557.  * And if we were unable to get a new slab from the partial slab lists then
  2558.  * we need to allocate a new slab. This is the slowest path since it involves
  2559.  * a call to the page allocator and the setup of a new slab.
  2560.  *
  2561.  * Version of __slab_alloc to use when we know that interrupts are
  2562.  * already disabled (which is the case for bulk allocation).
  2563.  */
  2564. static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  2565.               unsigned long addr, struct kmem_cache_cpu *c)
  2566. {
  2567.     void *freelist;
  2568.     struct page *page;
  2569.  
  2570.     page = c->page;
  2571.     if (!page) {
  2572.         /*
  2573.          * if the node is not online or has no normal memory, just
  2574.          * ignore the node constraint
  2575.          */
  2576.         if (unlikely(node != NUMA_NO_NODE &&
  2577.                  !node_state(node, N_NORMAL_MEMORY)))
  2578.             node = NUMA_NO_NODE;
  2579.         goto new_slab;
  2580.     }
  2581. redo:
  2582.  
  2583.     if (unlikely(!node_match(page, node))) {
  2584.         /*
  2585.          * same as above but node_match() being false already
  2586.          * implies node != NUMA_NO_NODE
  2587.          */
  2588.         if (!node_state(node, N_NORMAL_MEMORY)) {
  2589.             node = NUMA_NO_NODE;
  2590.             goto redo;
  2591.         } else {
  2592.             stat(s, ALLOC_NODE_MISMATCH);
  2593.             deactivate_slab(s, page, c->freelist, c);
  2594.             goto new_slab;
  2595.         }
  2596.     }
  2597.  
  2598.     /*
  2599.      * By rights, we should be searching for a slab page that was
  2600.      * PFMEMALLOC but right now, we are losing the pfmemalloc
  2601.      * information when the page leaves the per-cpu allocator
  2602.      */
  2603.     if (unlikely(!pfmemalloc_match(page, gfpflags))) {
  2604.         deactivate_slab(s, page, c->freelist, c);
  2605.         goto new_slab;
  2606.     }
  2607.  
  2608.     /* must check again c->freelist in case of cpu migration or IRQ */
  2609.     freelist = c->freelist;
  2610.     if (freelist)
  2611.         goto load_freelist;
  2612.  
  2613.     freelist = get_freelist(s, page);
  2614.  
  2615.     if (!freelist) {
  2616.         c->page = NULL;
  2617.         stat(s, DEACTIVATE_BYPASS);
  2618.         goto new_slab;
  2619.     }
  2620.  
  2621.     stat(s, ALLOC_REFILL);
  2622.  
  2623. load_freelist:
  2624.     /*
  2625.      * freelist is pointing to the list of objects to be used.
  2626.      * page is pointing to the page from which the objects are obtained.
  2627.      * That page must be frozen for per cpu allocations to work.
  2628.      */
  2629.     VM_BUG_ON(!c->page->frozen);
  2630.     c->freelist = get_freepointer(s, freelist);
  2631.     c->tid = next_tid(c->tid);
  2632.     return freelist;
  2633.  
  2634. new_slab:
  2635.  
  2636.     if (slub_percpu_partial(c)) {
  2637.         page = c->page = slub_percpu_partial(c);
  2638.         slub_set_percpu_partial(c, page);
  2639.         stat(s, CPU_PARTIAL_ALLOC);
  2640.         goto redo;
  2641.     }
  2642.  
  2643.     freelist = new_slab_objects(s, gfpflags, node, &c);
  2644.  
  2645.     if (unlikely(!freelist)) {
  2646.         slab_out_of_memory(s, gfpflags, node);
  2647.         return NULL;
  2648.     }
  2649.  
  2650.     page = c->page;
  2651.     if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
  2652.         goto load_freelist;
  2653.  
  2654.     /* Only entered in the debug case */
  2655.     if (kmem_cache_debug(s) &&
  2656.             !alloc_debug_processing(s, page, freelist, addr))
  2657.         goto new_slab;  /* Slab failed checks. Next slab needed */
  2658.  
  2659.     deactivate_slab(s, page, get_freepointer(s, freelist), c);
  2660.     return freelist;
  2661. }
  2662.  
  2663. /*
  2664.  * Wrapper around ___slab_alloc() that disables interrupts and compensates
  2665.  * for possible cpu changes by refetching the per cpu area pointer.
  2666.  */
  2667. static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  2668.               unsigned long addr, struct kmem_cache_cpu *c)
  2669. {
  2670.     void *p;
  2671.     unsigned long flags;
  2672.  
  2673.     local_irq_save(flags);
  2674. #ifdef CONFIG_PREEMPT
  2675.     /*
  2676.      * We may have been preempted and rescheduled on a different
  2677.      * cpu before disabling interrupts. Need to reload cpu area
  2678.      * pointer.
  2679.      */
  2680.     c = this_cpu_ptr(s->cpu_slab);
  2681. #endif
  2682.  
  2683.     p = ___slab_alloc(s, gfpflags, node, addr, c);
  2684.     local_irq_restore(flags);
  2685.     return p;
  2686. }
  2687.  
  2688. /*
  2689.  * If the object has been wiped upon free, make sure it's fully initialized by
  2690.  * zeroing out freelist pointer.
  2691.  */
  2692. static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
  2693.                            void *obj)
  2694. {
  2695.     if (unlikely(slab_want_init_on_free(s)) && obj)
  2696.         memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
  2697. }
  2698.  
  2699. /*
  2700.  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
  2701.  * have the fastpath folded into their functions. So no function call
  2702.  * overhead for requests that can be satisfied on the fastpath.
  2703.  *
  2704.  * The fastpath works by first checking if the lockless freelist can be used.
  2705.  * If not then __slab_alloc is called for slow processing.
  2706.  *
  2707.  * Otherwise we can simply pick the next object from the lockless free list.
  2708.  */
  2709. static __always_inline void *slab_alloc_node(struct kmem_cache *s,
  2710.         gfp_t gfpflags, int node, unsigned long addr)
  2711. {
  2712.     void *object;
  2713.     struct kmem_cache_cpu *c;
  2714.     struct page *page;
  2715.     unsigned long tid;
  2716.  
  2717.     s = slab_pre_alloc_hook(s, gfpflags);
  2718.     if (!s)
  2719.         return NULL;
  2720. redo:
  2721.     /*
  2722.      * Must read kmem_cache cpu data via this cpu ptr. Preemption is
  2723.      * enabled. We may switch back and forth between cpus while
  2724.      * reading from one cpu area. That does not matter as long
  2725.      * as we end up on the original cpu again when doing the cmpxchg.
  2726.      *
  2727.      * We must ensure that tid and the kmem_cache_cpu pointer are retrieved
  2728.      * on the same cpu. They could come from different cpus if CONFIG_PREEMPT
  2729.      * is enabled, so we need to check whether they match.
  2730.      */
  2731.     do {
  2732.         tid = this_cpu_read(s->cpu_slab->tid);
  2733.         c = raw_cpu_ptr(s->cpu_slab);
  2734.     } while (IS_ENABLED(CONFIG_PREEMPT) &&
  2735.          unlikely(tid != READ_ONCE(c->tid)));
  2736.  
  2737.     /*
  2738.      * The irqless object alloc/free algorithm used here depends on the
  2739.      * order in which cpu_slab's data is fetched: tid must be fetched before
  2740.      * anything else on c, so that an object and page associated with a
  2741.      * previous tid are never used with the current tid. If tid is fetched
  2742.      * first, the object and page could be the ones associated with the next
  2743.      * tid; the cmpxchg then simply fails and we retry, so this is harmless.
  2744.      */
  2745.     barrier();
  2746.  
  2747.     /*
  2748.      * The transaction ids are globally unique per cpu and per operation on
  2749.      * a per cpu queue. Thus they guarantee that the cmpxchg_double
  2750.      * occurs on the right processor and that there was no operation on the
  2751.      * linked list in between.
  2752.      */
  2753.  
  2754.     object = c->freelist;
  2755.     page = c->page;
  2756.     if (unlikely(!object || !node_match(page, node))) {
  2757.         object = __slab_alloc(s, gfpflags, node, addr, c);
  2758.         stat(s, ALLOC_SLOWPATH);
  2759.     } else {
  2760.         void *next_object = get_freepointer_safe(s, object);
  2761.  
  2762.         /*
  2763.          * The cmpxchg will only match if there was no additional
  2764.          * operation and if we are on the right processor.
  2765.          *
  2766.          * The cmpxchg does the following atomically (without lock
  2767.          * semantics!)
  2768.          * 1. Relocate first pointer to the current per cpu area.
  2769.          * 2. Verify that tid and freelist have not been changed
  2770.          * 3. If they were not changed replace tid and freelist
  2771.          *
  2772.          * Since this is without lock semantics the protection is only
  2773.          * against code executing on this cpu *not* from access by
  2774.          * other cpus.
  2775.          */
  2776.         if (unlikely(!this_cpu_cmpxchg_double(
  2777.                 s->cpu_slab->freelist, s->cpu_slab->tid,
  2778.                 object, tid,
  2779.                 next_object, next_tid(tid)))) {
  2780.  
  2781.             note_cmpxchg_failure("slab_alloc", s, tid);
  2782.             goto redo;
  2783.         }
  2784.         prefetch_freepointer(s, next_object);
  2785.         stat(s, ALLOC_FASTPATH);
  2786.     }
  2787.  
  2788.     maybe_wipe_obj_freeptr(s, object);
  2789.  
  2790.     if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
  2791.         memset(object, 0, s->object_size);
  2792.  
  2793.     slab_post_alloc_hook(s, gfpflags, 1, &object);
  2794.  
  2795.     return object;
  2796. }
  2797.  
  2798. static __always_inline void *slab_alloc(struct kmem_cache *s,
  2799.         gfp_t gfpflags, unsigned long addr)
  2800. {
  2801.     return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
  2802. }
  2803.  
  2804. void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
  2805. {
  2806.     void *ret = slab_alloc(s, gfpflags, _RET_IP_);
  2807.  
  2808.     trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
  2809.                 s->size, gfpflags);
  2810.  
  2811.     return ret;
  2812. }
  2813. EXPORT_SYMBOL(kmem_cache_alloc);
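        /*
         * Illustrative caller pattern (not part of this file; 'foo_cache' is a
         * hypothetical cache created elsewhere with kmem_cache_create()):
         *
         *      struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
         *      if (f)
         *              kmem_cache_free(foo_cache, f);
         */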
  2814.  
  2815. #ifdef CONFIG_TRACING
  2816. void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
  2817. {
  2818.     void *ret = slab_alloc(s, gfpflags, _RET_IP_);
  2819.     trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
  2820.     ret = kasan_kmalloc(s, ret, size, gfpflags);
  2821.     return ret;
  2822. }
  2823. EXPORT_SYMBOL(kmem_cache_alloc_trace);
  2824. #endif
  2825.  
  2826. #ifdef CONFIG_NUMA
  2827. void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
  2828. {
  2829.     void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
  2830.  
  2831.     trace_kmem_cache_alloc_node(_RET_IP_, ret,
  2832.                     s->object_size, s->size, gfpflags, node);
  2833.  
  2834.     return ret;
  2835. }
  2836. EXPORT_SYMBOL(kmem_cache_alloc_node);
  2837.  
  2838. #ifdef CONFIG_TRACING
  2839. void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
  2840.                     gfp_t gfpflags,
  2841.                     int node, size_t size)
  2842. {
  2843.     void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
  2844.  
  2845.     trace_kmalloc_node(_RET_IP_, ret,
  2846.                size, s->size, gfpflags, node);
  2847.  
  2848.     ret = kasan_kmalloc(s, ret, size, gfpflags);
  2849.     return ret;
  2850. }
  2851. EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
  2852. #endif
  2853. #endif  /* CONFIG_NUMA */
  2854.  
  2855. /*
  2856.  * Slow path handling. This may still be called frequently since objects
  2857.  * have a longer lifetime than the cpu slabs in most processing loads.
  2858.  *
  2859.  * So we still attempt to reduce cache line usage. Just take the slab
  2860.  * lock and free the item. If there is no additional partial page
  2861.  * handling required then we can return immediately.
  2862.  */
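        /*
         * head and tail delimit a freelist of cnt objects that all belong to
         * 'page'; when a single object is freed, tail == head.
         */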
  2863. static void __slab_free(struct kmem_cache *s, struct page *page,
  2864.             void *head, void *tail, int cnt,
  2865.             unsigned long addr)
  2866.  
  2867. {
  2868.     void *prior;
  2869.     int was_frozen;
  2870.     struct page new;
  2871.     unsigned long counters;
  2872.     struct kmem_cache_node *n = NULL;
  2873.     unsigned long uninitialized_var(flags);
  2874.  
  2875.     stat(s, FREE_SLOWPATH);
  2876.  
  2877.     if (kmem_cache_debug(s) &&
  2878.         !free_debug_processing(s, page, head, tail, cnt, addr))
  2879.         return;
  2880.  
  2881.     do {
  2882.         if (unlikely(n)) {
  2883.             spin_unlock_irqrestore(&n->list_lock, flags);
  2884.             n = NULL;
  2885.         }
  2886.         prior = page->freelist;
  2887.         counters = page->counters;
  2888.         set_freepointer(s, tail, prior);
  2889.         new.counters = counters;
  2890.         was_frozen = new.frozen;
  2891.         new.inuse -= cnt;
  2892.         if ((!new.inuse || !prior) && !was_frozen) {
  2893.  
  2894.             if (kmem_cache_has_cpu_partial(s) && !prior) {
  2895.  
  2896.                 /*
  2897.                  * Slab was on no list before and will be
  2898.                  * partially empty
  2899.                  * We can defer the list move and instead
  2900.                  * freeze it.
  2901.                  */
  2902.                 new.frozen = 1;
  2903.  
  2904.             } else { /* Needs to be taken off a list */
  2905.  
  2906.                 n = get_node(s, page_to_nid(page));
  2907.                 /*
  2908.                  * Speculatively acquire the list_lock.
  2909.                  * If the cmpxchg does not succeed then we may
  2910.                  * drop the list_lock without any processing.
  2911.                  *
  2912.                  * Otherwise the list_lock will synchronize with
  2913.                  * other processors updating the list of slabs.
  2914.                  */
  2915.                 spin_lock_irqsave(&n->list_lock, flags);
  2916.  
  2917.             }
  2918.         }
  2919.  
  2920.     } while (!cmpxchg_double_slab(s, page,
  2921.         prior, counters,
  2922.         head, new.counters,
  2923.         "__slab_free"));
  2924.  
  2925.     if (likely(!n)) {
  2926.  
  2927.         /*
  2928.          * If we just froze the page then put it onto the
  2929.          * per cpu partial list.
  2930.          */
  2931.         if (new.frozen && !was_frozen) {
  2932.             put_cpu_partial(s, page, 1);
  2933.             stat(s, CPU_PARTIAL_FREE);
  2934.         }
  2935.         /*
  2936.          * The list lock was not taken therefore no list
  2937.          * activity can be necessary.
  2938.          */
  2939.         if (was_frozen)
  2940.             stat(s, FREE_FROZEN);
  2941.         return;
  2942.     }
  2943.  
  2944.     if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
  2945.         goto slab_empty;
  2946.  
  2947.     /*
  2948.      * Objects left in the slab. If it was not on the partial list before
  2949.      * then add it.
  2950.      */
  2951.     if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
  2952.         remove_full(s, n, page);
  2953.         add_partial(n, page, DEACTIVATE_TO_TAIL);
  2954.         stat(s, FREE_ADD_PARTIAL);
  2955.     }
  2956.     spin_unlock_irqrestore(&n->list_lock, flags);
  2957.     return;
  2958.  
  2959. slab_empty:
  2960.     if (prior) {
  2961.         /*
  2962.          * Slab on the partial list.
  2963.          */
  2964.         remove_partial(n, page);
  2965.         stat(s, FREE_REMOVE_PARTIAL);
  2966.     } else {
  2967.         /* Slab must be on the full list */
  2968.         remove_full(s, n, page);
  2969.     }
  2970.  
  2971.     spin_unlock_irqrestore(&n->list_lock, flags);
  2972.     stat(s, FREE_SLAB);
  2973.     discard_slab(s, page);
  2974. }
  2975.  
  2976. /*
  2977.  * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
  2978.  * can perform fastpath freeing without additional function calls.
  2979.  *
  2980.  * The fastpath is only possible if we are freeing to the current cpu slab
  2981.  * of this processor. This is typically the case if we have just allocated
  2982.  * the item before.
  2983.  *
  2984.  * If fastpath is not possible then fall back to __slab_free where we deal
  2985.  * with all sorts of special processing.
  2986.  *
  2987.  * Bulk free of a freelist with several objects (all pointing to the
  2988.  * same page) possible by specifying head and tail ptr, plus objects
  2989.  * count (cnt). Bulk free indicated by tail pointer being set.
  2990.  */
  2991. static __always_inline void do_slab_free(struct kmem_cache *s,
  2992.                 struct page *page, void *head, void *tail,
  2993.                 int cnt, unsigned long addr)
  2994. {
  2995.     void *tail_obj = tail ? : head;
  2996.     struct kmem_cache_cpu *c;
  2997.     unsigned long tid;
  2998. redo:
  2999.     /*
  3000.      * Determine the current cpu's per cpu slab.
  3001.      * The cpu may change afterward. However that does not matter since
  3002.      * data is retrieved via this pointer. If we are on the same cpu
  3003.      * during the cmpxchg then the free will succeed.
  3004.      */
  3005.     do {
  3006.         tid = this_cpu_read(s->cpu_slab->tid);
  3007.         c = raw_cpu_ptr(s->cpu_slab);
  3008.     } while (IS_ENABLED(CONFIG_PREEMPT) &&
  3009.          unlikely(tid != READ_ONCE(c->tid)));
  3010.  
  3011.     /* Same with comment on barrier() in slab_alloc_node() */
  3012.     barrier();
  3013.  
  3014.     if (likely(page == c->page)) {
  3015.         void **freelist = READ_ONCE(c->freelist);
  3016.  
  3017.         set_freepointer(s, tail_obj, freelist);
  3018.  
  3019.         if (unlikely(!this_cpu_cmpxchg_double(
  3020.                 s->cpu_slab->freelist, s->cpu_slab->tid,
  3021.                 freelist, tid,
  3022.                 head, next_tid(tid)))) {
  3023.  
  3024.             note_cmpxchg_failure("slab_free", s, tid);
  3025.             goto redo;
  3026.         }
  3027.         stat(s, FREE_FASTPATH);
  3028.     } else
  3029.         __slab_free(s, page, head, tail_obj, cnt, addr);
  3030.  
  3031. }
  3032.  
  3033. static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
  3034.                       void *head, void *tail, int cnt,
  3035.                       unsigned long addr)
  3036. {
  3037.     /*
  3038.      * With KASAN enabled slab_free_freelist_hook modifies the freelist
  3039.      * to remove objects, whose reuse must be delayed.
  3040.      */
  3041.     if (slab_free_freelist_hook(s, &head, &tail))
  3042.         do_slab_free(s, page, head, tail, cnt, addr);
  3043. }
  3044.  
  3045. #ifdef CONFIG_KASAN_GENERIC
  3046. void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
  3047. {
  3048.     do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
  3049. }
  3050. #endif
  3051.  
  3052. void kmem_cache_free(struct kmem_cache *s, void *x)
  3053. {
  3054.     s = cache_from_obj(s, x);
  3055.     if (!s)
  3056.         return;
  3057.     slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
  3058.     trace_kmem_cache_free(_RET_IP_, x);
  3059. }
  3060. EXPORT_SYMBOL(kmem_cache_free);
  3061.  
  3062. struct detached_freelist {
  3063.     struct page *page;
  3064.     void *tail;
  3065.     void *freelist;
  3066.     int cnt;
  3067.     struct kmem_cache *s;
  3068. };
  3069.  
  3070. /*
  3071.  * This function progressively scans the array with free objects (with
  3072.  * This function progressively scans the array of objects to free (with
  3073.  * a limited look ahead) and extracts the objects that belong to the same
  3074.  * page.  It builds a detached freelist directly within those objects.
  3075.  * This can happen without any need for synchronization, because the
  3076.  * objects are owned by the running process.  The freelist is built up as
  3077.  * a singly linked list threaded through the objects.  The idea is that
  3078.  * this detached freelist can then be bulk transferred to the real
  3079.  * freelist(s) while requiring only a single synchronization primitive.
  3080.  * Look ahead in the array is limited for performance reasons.
  3081.  */
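        /*
         * Example (illustrative): for p = {A1, B1, A2}, where A1/A2 come from
         * page A and B1 from page B, the first call gathers A2 and A1 into a
         * detached freelist for page A, NULLs those array slots and returns 2;
         * the next call (size == 2) frees B1 on its own and returns 0, which ends
         * the loop in kmem_cache_free_bulk().
         */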
  3082. static inline
  3083. int build_detached_freelist(struct kmem_cache *s, size_t size,
  3084.                 void **p, struct detached_freelist *df)
  3085. {
  3086.     size_t first_skipped_index = 0;
  3087.     int lookahead = 3;
  3088.     void *object;
  3089.     struct page *page;
  3090.  
  3091.     /* Always re-init detached_freelist */
  3092.     df->page = NULL;
  3093.  
  3094.     do {
  3095.         object = p[--size];
  3096.         /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
  3097.     } while (!object && size);
  3098.  
  3099.     if (!object)
  3100.         return 0;
  3101.  
  3102.     page = virt_to_head_page(object);
  3103.     if (!s) {
  3104.         /* Handle kmalloc'ed objects */
  3105.         if (unlikely(!PageSlab(page))) {
  3106.             BUG_ON(!PageCompound(page));
  3107.             kfree_hook(object);
  3108.             __free_pages(page, compound_order(page));
  3109.             p[size] = NULL; /* mark object processed */
  3110.             return size;
  3111.         }
  3112.         /* Derive kmem_cache from object */
  3113.         df->s = page->slab_cache;
  3114.     } else {
  3115.         df->s = cache_from_obj(s, object); /* Support for memcg */
  3116.     }
  3117.  
  3118.     /* Start new detached freelist */
  3119.     df->page = page;
  3120.     set_freepointer(df->s, object, NULL);
  3121.     df->tail = object;
  3122.     df->freelist = object;
  3123.     p[size] = NULL; /* mark object processed */
  3124.     df->cnt = 1;
  3125.  
  3126.     while (size) {
  3127.         object = p[--size];
  3128.         if (!object)
  3129.             continue; /* Skip processed objects */
  3130.  
  3131.         /* df->page is always set at this point */
  3132.         if (df->page == virt_to_head_page(object)) {
            /* Opportunity to build the freelist */
  3134.             set_freepointer(df->s, object, df->freelist);
  3135.             df->freelist = object;
  3136.             df->cnt++;
  3137.             p[size] = NULL; /* mark object processed */
  3138.  
  3139.             continue;
  3140.         }
  3141.  
  3142.         /* Limit look ahead search */
  3143.         if (!--lookahead)
  3144.             break;
  3145.  
  3146.         if (!first_skipped_index)
  3147.             first_skipped_index = size + 1;
  3148.     }
  3149.  
  3150.     return first_skipped_index;
  3151. }
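/*
 * Hedged walk-through (added for illustration): suppose p[] holds
 * { A1, B1, A2, A3 }, where the A objects share one slab page and B1 lives
 * on another.  Scanning from the end, A3 starts the detached list, then A2
 * and A1 are linked onto it (df->cnt ends up 3) and their slots are set to
 * NULL, while B1 does not match df->page and is skipped; the index just
 * past it is returned, so the caller's next build_detached_freelist()
 * invocation picks B1 up.  The object names are purely hypothetical.
 */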
  3152.  
  3153. /* Note that interrupts must be enabled when calling this function. */
  3154. void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
  3155. {
  3156.     if (WARN_ON(!size))
  3157.         return;
  3158.  
  3159.     do {
  3160.         struct detached_freelist df;
  3161.  
  3162.         size = build_detached_freelist(s, size, p, &df);
  3163.         if (!df.page)
  3164.             continue;
  3165.  
        slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
  3167.     } while (likely(size));
  3168. }
  3169. EXPORT_SYMBOL(kmem_cache_free_bulk);
  3170.  
  3171. /* Note that interrupts must be enabled when calling this function. */
  3172. int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
  3173.               void **p)
  3174. {
  3175.     struct kmem_cache_cpu *c;
  3176.     int i;
  3177.  
  3178.     /* memcg and kmem_cache debug support */
  3179.     s = slab_pre_alloc_hook(s, flags);
  3180.     if (unlikely(!s))
        return 0;
  3182.     /*
  3183.      * Drain objects in the per cpu slab, while disabling local
     * IRQs, which protects against PREEMPT and interrupt
     * handlers invoking the normal fastpath.
  3186.      */
  3187.     local_irq_disable();
  3188.     c = this_cpu_ptr(s->cpu_slab);
  3189.  
  3190.     for (i = 0; i < size; i++) {
  3191.         void *object = c->freelist;
  3192.  
  3193.         if (unlikely(!object)) {
  3194.             /*
  3195.              * We may have removed an object from c->freelist using
  3196.              * the fastpath in the previous iteration; in that case,
  3197.              * c->tid has not been bumped yet.
  3198.              * Since ___slab_alloc() may reenable interrupts while
  3199.              * allocating memory, we should bump c->tid now.
  3200.              */
  3201.             c->tid = next_tid(c->tid);
  3202.  
  3203.             /*
             * Invoking the slow path likely has the side effect
             * of re-populating the per-CPU c->freelist.
  3206.              */
  3207.             p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
  3208.                         _RET_IP_, c);
  3209.             if (unlikely(!p[i]))
  3210.                 goto error;
  3211.  
  3212.             c = this_cpu_ptr(s->cpu_slab);
  3213.             maybe_wipe_obj_freeptr(s, p[i]);
  3214.  
  3215.             continue; /* goto for-loop */
  3216.         }
  3217.         c->freelist = get_freepointer(s, object);
  3218.         p[i] = object;
  3219.         maybe_wipe_obj_freeptr(s, p[i]);
  3220.     }
  3221.     c->tid = next_tid(c->tid);
  3222.     local_irq_enable();
  3223.  
  3224.     /* Clear memory outside IRQ disabled fastpath loop */
  3225.     if (unlikely(slab_want_init_on_alloc(flags, s))) {
  3226.         int j;
  3227.  
  3228.         for (j = 0; j < i; j++)
  3229.             memset(p[j], 0, s->object_size);
  3230.     }
  3231.  
  3232.     /* memcg and kmem_cache debug support */
  3233.     slab_post_alloc_hook(s, flags, size, p);
  3234.     return i;
  3235. error:
  3236.     local_irq_enable();
  3237.     slab_post_alloc_hook(s, flags, i, p);
  3238.     __kmem_cache_free_bulk(s, i, p);
  3239.     return 0;
  3240. }
  3241. EXPORT_SYMBOL(kmem_cache_alloc_bulk);
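/*
 * Illustrative bulk-API sketch (added; not part of the original file):
 * kmem_cache_alloc_bulk() returns the number of objects actually placed
 * in the array (0 on failure) and kmem_cache_free_bulk() hands the whole
 * batch back, letting build_detached_freelist() group them per page.
 * The cache pointer and array size are hypothetical; kept under #if 0.
 */
#if 0
static void demo_bulk(struct kmem_cache *cachep)
{
	void *objs[16];

	if (!kmem_cache_alloc_bulk(cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
		return;

	/* ... use objs[0] .. objs[15] ... */

	kmem_cache_free_bulk(cachep, ARRAY_SIZE(objs), objs);
}
#endif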
  3242.  
  3243.  
  3244. /*
  3245.  * Object placement in a slab is made very easy because we always start at
  3246.  * offset 0. If we tune the size of the object to the alignment then we can
  3247.  * get the required alignment by putting one properly sized object after
  3248.  * another.
  3249.  *
  3250.  * Notice that the allocation order determines the sizes of the per cpu
 * caches. Each processor always has one slab available for allocations.
  3252.  * Increasing the allocation order reduces the number of times that slabs
  3253.  * must be moved on and off the partial lists and is therefore a factor in
  3254.  * locking overhead.
  3255.  */
  3256.  
  3257. /*
 * Minimum / Maximum order of slab pages. This influences locking overhead
  3259.  * and slab fragmentation. A higher order reduces the number of partial slabs
  3260.  * and increases the number of allocations possible without having to
  3261.  * take the list_lock.
  3262.  */
  3263. static unsigned int slub_min_order;
  3264. static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
  3265. static unsigned int slub_min_objects;
  3266.  
  3267. /*
 * Calculate the order of allocation given a slab object size.
  3269.  *
  3270.  * The order of allocation has significant impact on performance and other
  3271.  * system components. Generally order 0 allocations should be preferred since
  3272.  * order 0 does not cause fragmentation in the page allocator. Larger objects
  3273.  * be problematic to put into order 0 slabs because there may be too much
 * can be problematic to put into order 0 slabs because there may be too much
  3275.  * would be wasted.
  3276.  *
  3277.  * In order to reach satisfactory performance we must ensure that a minimum
  3278.  * number of objects is in one slab. Otherwise we may generate too much
  3279.  * activity on the partial lists which requires taking the list_lock. This is
 * less of a concern for large slabs, though, which are rarely used.
  3281.  *
  3282.  * slub_max_order specifies the order where we begin to stop considering the
  3283.  * number of objects in a slab as critical. If we reach slub_max_order then
  3284.  * we try to keep the page order as low as possible. So we accept more waste
  3285.  * of space in favor of a small page order.
  3286.  *
  3287.  * Higher order allocations also allow the placement of more objects in a
  3288.  * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
  3290.  * the smallest order which will fit the object.
  3291.  */
  3292. static inline unsigned int slab_order(unsigned int size,
  3293.         unsigned int min_objects, unsigned int max_order,
  3294.         unsigned int fract_leftover)
  3295. {
  3296.     unsigned int min_order = slub_min_order;
  3297.     unsigned int order;
  3298.  
  3299.     if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
  3300.         return get_order(size * MAX_OBJS_PER_PAGE) - 1;
  3301.  
  3302.     for (order = max(min_order, (unsigned int)get_order(min_objects * size));
  3303.             order <= max_order; order++) {
  3304.  
  3305.         unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
  3306.         unsigned int rem;
  3307.  
  3308.         rem = slab_size % size;
  3309.  
  3310.         if (rem <= slab_size / fract_leftover)
  3311.             break;
  3312.     }
  3313.  
  3314.     return order;
  3315. }
  3316.  
  3317. static inline int calculate_order(unsigned int size)
  3318. {
  3319.     unsigned int order;
  3320.     unsigned int min_objects;
  3321.     unsigned int max_objects;
  3322.  
  3323.     /*
  3324.      * Attempt to find best configuration for a slab. This
  3325.      * works by first attempting to generate a layout with
  3326.      * the best configuration and backing off gradually.
  3327.      *
  3328.      * First we increase the acceptable waste in a slab. Then
  3329.      * we reduce the minimum objects required in a slab.
  3330.      */
  3331.     min_objects = slub_min_objects;
  3332.     if (!min_objects)
  3333.         min_objects = 4 * (fls(nr_cpu_ids) + 1);
  3334.     max_objects = order_objects(slub_max_order, size);
  3335.     min_objects = min(min_objects, max_objects);
  3336.  
  3337.     while (min_objects > 1) {
  3338.         unsigned int fraction;
  3339.  
  3340.         fraction = 16;
  3341.         while (fraction >= 4) {
  3342.             order = slab_order(size, min_objects,
  3343.                     slub_max_order, fraction);
  3344.             if (order <= slub_max_order)
  3345.                 return order;
  3346.             fraction /= 2;
  3347.         }
  3348.         min_objects--;
  3349.     }
  3350.  
  3351.     /*
  3352.      * We were unable to place multiple objects in a slab. Now
 * let's see if we can place a single object there.
  3354.      */
  3355.     order = slab_order(size, 1, slub_max_order, 1);
  3356.     if (order <= slub_max_order)
  3357.         return order;
  3358.  
  3359.     /*
 * Doh, this slab cannot be placed using slub_max_order.
  3361.      */
  3362.     order = slab_order(size, 1, MAX_ORDER, 1);
  3363.     if (order < MAX_ORDER)
  3364.         return order;
  3365.     return -ENOSYS;
  3366. }
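/*
 * Worked example (added for illustration; assumes 4 KiB pages, a
 * hypothetical 700-byte object and ignores the min_objects lower bound on
 * the starting order): at order 0 the leftover is 4096 % 700 = 596 bytes,
 * which exceeds 4096 / 16 = 256, so the fraction-16 pass rejects order 0;
 * at order 1 the leftover is 8192 % 700 = 492 bytes, within 8192 / 16 =
 * 512, so slab_order() settles on order 1 with 11 objects per slab.
 */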
  3367.  
  3368. static void
  3369. init_kmem_cache_node(struct kmem_cache_node *n)
  3370. {
  3371.     n->nr_partial = 0;
  3372.     spin_lock_init(&n->list_lock);
  3373.     INIT_LIST_HEAD(&n->partial);
  3374. #ifdef CONFIG_SLUB_DEBUG
  3375.     atomic_long_set(&n->nr_slabs, 0);
  3376.     atomic_long_set(&n->total_objects, 0);
  3377.     INIT_LIST_HEAD(&n->full);
  3378. #endif
  3379. }
  3380.  
  3381. static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
  3382. {
  3383.     BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
  3384.             KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
  3385.  
  3386.     /*
  3387.      * Must align to double word boundary for the double cmpxchg
  3388.      * instructions to work; see __pcpu_double_call_return_bool().
  3389.      */
  3390.     s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
  3391.                      2 * sizeof(void *));
  3392.  
  3393.     if (!s->cpu_slab)
  3394.         return 0;
  3395.  
  3396.     init_kmem_cache_cpus(s);
  3397.  
  3398.     return 1;
  3399. }
  3400.  
  3401. static struct kmem_cache *kmem_cache_node;
  3402.  
  3403. /*
  3404.  * No kmalloc_node yet so do it by hand. We know that this is the first
  3405.  * slab on the node for this slabcache. There are no concurrent accesses
  3406.  * possible.
  3407.  *
 * Note that this function only works on the kmem_cache_node cache,
 * when allocating for kmem_cache_node itself. This is used for bootstrapping
  3410.  * memory on a fresh node that has no slab structures yet.
  3411.  */
  3412. static void early_kmem_cache_node_alloc(int node)
  3413. {
  3414.     struct page *page;
  3415.     struct kmem_cache_node *n;
  3416.  
  3417.     BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
  3418.  
  3419.     page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
  3420.  
  3421.     BUG_ON(!page);
  3422.     if (page_to_nid(page) != node) {
  3423.         pr_err("SLUB: Unable to allocate memory from node %d\n", node);
  3424.         pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
  3425.     }
  3426.  
  3427.     n = page->freelist;
  3428.     BUG_ON(!n);
  3429. #ifdef CONFIG_SLUB_DEBUG
  3430.     init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
  3431.     init_tracking(kmem_cache_node, n);
  3432. #endif
  3433.     n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
  3434.               GFP_KERNEL);
  3435.     page->freelist = get_freepointer(kmem_cache_node, n);
  3436.     page->inuse = 1;
  3437.     page->frozen = 0;
  3438.     kmem_cache_node->node[node] = n;
  3439.     init_kmem_cache_node(n);
  3440.     inc_slabs_node(kmem_cache_node, node, page->objects);
  3441.  
  3442.     /*
  3443.      * No locks need to be taken here as it has just been
  3444.      * initialized and there is no concurrent access.
  3445.      */
  3446.     __add_partial(n, page, DEACTIVATE_TO_HEAD);
  3447. }
  3448.  
  3449. static void free_kmem_cache_nodes(struct kmem_cache *s)
  3450. {
  3451.     int node;
  3452.     struct kmem_cache_node *n;
  3453.  
  3454.     for_each_kmem_cache_node(s, node, n) {
  3455.         s->node[node] = NULL;
  3456.         kmem_cache_free(kmem_cache_node, n);
  3457.     }
  3458. }
  3459.  
  3460. void __kmem_cache_release(struct kmem_cache *s)
  3461. {
  3462.     cache_random_seq_destroy(s);
  3463.     free_percpu(s->cpu_slab);
  3464.     free_kmem_cache_nodes(s);
  3465. }
  3466.  
  3467. static int init_kmem_cache_nodes(struct kmem_cache *s)
  3468. {
  3469.     int node;
  3470.  
  3471.     for_each_node_state(node, N_NORMAL_MEMORY) {
  3472.         struct kmem_cache_node *n;
  3473.  
  3474.         if (slab_state == DOWN) {
  3475.             early_kmem_cache_node_alloc(node);
  3476.             continue;
  3477.         }
  3478.         n = kmem_cache_alloc_node(kmem_cache_node,
  3479.                         GFP_KERNEL, node);
  3480.  
  3481.         if (!n) {
  3482.             free_kmem_cache_nodes(s);
  3483.             return 0;
  3484.         }
  3485.  
  3486.         init_kmem_cache_node(n);
  3487.         s->node[node] = n;
  3488.     }
  3489.     return 1;
  3490. }
  3491.  
  3492. static void set_min_partial(struct kmem_cache *s, unsigned long min)
  3493. {
  3494.     if (min < MIN_PARTIAL)
  3495.         min = MIN_PARTIAL;
  3496.     else if (min > MAX_PARTIAL)
  3497.         min = MAX_PARTIAL;
  3498.     s->min_partial = min;
  3499. }
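/*
 * Worked example (added for illustration): kmem_cache_open() below passes
 * ilog2(s->size) / 2, so a 4096-byte cache keeps up to 6 partial slabs per
 * node, while a 64-byte cache computes 3 and is clamped up to MIN_PARTIAL;
 * oversized values are clamped down to MAX_PARTIAL.
 */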
  3500.  
  3501. static void set_cpu_partial(struct kmem_cache *s)
  3502. {
  3503. #ifdef CONFIG_SLUB_CPU_PARTIAL
  3504.     /*
     * cpu_partial determines the maximum number of objects kept in the
  3506.      * per cpu partial lists of a processor.
  3507.      *
  3508.      * Per cpu partial lists mainly contain slabs that just have one
  3509.      * object freed. If they are used for allocation then they can be
  3510.      * filled up again with minimal effort. The slab will never hit the
  3511.      * per node partial lists and therefore no locking will be required.
  3512.      *
  3513.      * This setting also determines
  3514.      *
  3515.      * A) The number of objects from per cpu partial slabs dumped to the
  3516.      *    per node list when we reach the limit.
  3517.      * B) The number of objects in cpu partial slabs to extract from the
  3518.      *    per node list when we run out of per cpu objects. We only fetch
  3519.      *    50% to keep some capacity around for frees.
  3520.      */
  3521.     if (!kmem_cache_has_cpu_partial(s))
  3522.         s->cpu_partial = 0;
  3523.     else if (s->size >= PAGE_SIZE)
  3524.         s->cpu_partial = 2;
  3525.     else if (s->size >= 1024)
  3526.         s->cpu_partial = 6;
  3527.     else if (s->size >= 256)
  3528.         s->cpu_partial = 13;
  3529.     else
  3530.         s->cpu_partial = 30;
  3531. #endif
  3532. }
  3533.  
  3534. /*
  3535.  * calculate_sizes() determines the order and the distribution of data within
  3536.  * a slab object.
  3537.  */
  3538. static int calculate_sizes(struct kmem_cache *s, int forced_order)
  3539. {
  3540.     slab_flags_t flags = s->flags;
  3541.     unsigned int size = s->object_size;
  3542.     unsigned int order;
  3543.  
  3544.     /*
  3545.      * Round up object size to the next word boundary. We can only
  3546.      * place the free pointer at word boundaries and this determines
  3547.      * the possible location of the free pointer.
  3548.      */
  3549.     size = ALIGN(size, sizeof(void *));
  3550.  
  3551. #ifdef CONFIG_SLUB_DEBUG
  3552.     /*
  3553.      * Determine if we can poison the object itself. If the user of
  3554.      * the slab may touch the object after free or before allocation
  3555.      * then we should never poison the object itself.
  3556.      */
  3557.     if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
  3558.             !s->ctor)
  3559.         s->flags |= __OBJECT_POISON;
  3560.     else
  3561.         s->flags &= ~__OBJECT_POISON;
  3562.  
  3563.  
  3564.     /*
  3565.      * If we are Redzoning then check if there is some space between the
  3566.      * end of the object and the free pointer. If not then add an
  3567.      * additional word to have some bytes to store Redzone information.
  3568.      */
  3569.     if ((flags & SLAB_RED_ZONE) && size == s->object_size)
  3570.         size += sizeof(void *);
  3571. #endif
  3572.  
  3573.     /*
  3574.      * With that we have determined the number of bytes in actual use
  3575.      * by the object. This is the potential offset to the free pointer.
  3576.      */
  3577.     s->inuse = size;
  3578.  
  3579.     if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
  3580.         s->ctor)) {
  3581.         /*
  3582.          * Relocate free pointer after the object if it is not
  3583.          * permitted to overwrite the first word of the object on
  3584.          * kmem_cache_free.
  3585.          *
  3586.          * This is the case if we do RCU, have a constructor or
  3587.          * destructor or are poisoning the objects.
  3588.          */
  3589.         s->offset = size;
  3590.         size += sizeof(void *);
  3591.     }
  3592.  
  3593. #ifdef CONFIG_SLUB_DEBUG
  3594.     if (flags & SLAB_STORE_USER)
  3595.         /*
  3596.          * Need to store information about allocs and frees after
  3597.          * the object.
  3598.          */
  3599.         size += 2 * sizeof(struct track);
  3600. #endif
  3601.  
  3602.     kasan_cache_create(s, &size, &s->flags);
  3603. #ifdef CONFIG_SLUB_DEBUG
  3604.     if (flags & SLAB_RED_ZONE) {
  3605.         /*
  3606.          * Add some empty padding so that we can catch
  3607.          * overwrites from earlier objects rather than let
  3608.          * tracking information or the free pointer be
  3609.          * corrupted if a user writes before the start
  3610.          * of the object.
  3611.          */
  3612.         size += sizeof(void *);
  3613.  
  3614.         s->red_left_pad = sizeof(void *);
  3615.         s->red_left_pad = ALIGN(s->red_left_pad, s->align);
  3616.         size += s->red_left_pad;
  3617.     }
  3618. #endif
  3619.  
  3620.     /*
  3621.      * SLUB stores one object immediately after another beginning from
  3622.      * offset 0. In order to align the objects we have to simply size
  3623.      * each object to conform to the alignment.
  3624.      */
  3625.     size = ALIGN(size, s->align);
  3626.     s->size = size;
  3627.     if (forced_order >= 0)
  3628.         order = forced_order;
  3629.     else
  3630.         order = calculate_order(size);
  3631.  
  3632.     if ((int)order < 0)
  3633.         return 0;
  3634.  
  3635.     s->allocflags = 0;
  3636.     if (order)
  3637.         s->allocflags |= __GFP_COMP;
  3638.  
  3639.     if (s->flags & SLAB_CACHE_DMA)
  3640.         s->allocflags |= GFP_DMA;
  3641.  
  3642.     if (s->flags & SLAB_CACHE_DMA32)
  3643.         s->allocflags |= GFP_DMA32;
  3644.  
  3645.     if (s->flags & SLAB_RECLAIM_ACCOUNT)
  3646.         s->allocflags |= __GFP_RECLAIMABLE;
  3647.  
  3648.     /*
  3649.      * Determine the number of objects per slab
  3650.      */
  3651.     s->oo = oo_make(order, size);
  3652.     s->min = oo_make(get_order(size), size);
  3653.     if (oo_objects(s->oo) > oo_objects(s->max))
  3654.         s->max = s->oo;
  3655.  
  3656.     return !!oo_objects(s->oo);
  3657. }
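/*
 * Layout sketch (added for illustration): with debug options enabled the
 * metadata computed above roughly surrounds the object like this:
 *
 *   [left red zone pad][object, word aligned][red zone word]
 *   [free pointer, if relocated][2 x struct track][alignment padding]
 *
 * s->inuse marks the end of the bytes the object actually uses, s->offset
 * the location of the free pointer, and s->size the full per-object
 * stride.  Exact sizes depend on the configuration; this is only a rough
 * picture of what calculate_sizes() produces.
 */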
  3658.  
  3659. static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
  3660. {
  3661.     s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
  3662. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  3663.     s->random = get_random_long();
  3664. #endif
  3665.  
  3666.     if (!calculate_sizes(s, -1))
  3667.         goto error;
  3668.     if (disable_higher_order_debug) {
  3669.         /*
  3670.          * Disable debugging flags that store metadata if the min slab
  3671.          * order increased.
  3672.          */
  3673.         if (get_order(s->size) > get_order(s->object_size)) {
  3674.             s->flags &= ~DEBUG_METADATA_FLAGS;
  3675.             s->offset = 0;
  3676.             if (!calculate_sizes(s, -1))
  3677.                 goto error;
  3678.         }
  3679.     }
  3680.  
  3681. #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
  3682.     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
  3683.     if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
  3684.         /* Enable fast mode */
  3685.         s->flags |= __CMPXCHG_DOUBLE;
  3686. #endif
  3687.  
  3688.     /*
  3689.      * The larger the object size is, the more pages we want on the partial
  3690.      * list to avoid pounding the page allocator excessively.
  3691.      */
  3692.     set_min_partial(s, ilog2(s->size) / 2);
  3693.  
  3694.     set_cpu_partial(s);
  3695.  
  3696. #ifdef CONFIG_NUMA
  3697.     s->remote_node_defrag_ratio = 1000;
  3698. #endif
  3699.  
  3700.     /* Initialize the pre-computed randomized freelist if slab is up */
  3701.     if (slab_state >= UP) {
  3702.         if (init_cache_random_seq(s))
  3703.             goto error;
  3704.     }
  3705.  
  3706.     if (!init_kmem_cache_nodes(s))
  3707.         goto error;
  3708.  
  3709.     if (alloc_kmem_cache_cpus(s))
  3710.         return 0;
  3711.  
  3712.     free_kmem_cache_nodes(s);
  3713. error:
  3714.     return -EINVAL;
  3715. }
  3716.  
  3717. static void list_slab_objects(struct kmem_cache *s, struct page *page,
  3718.                             const char *text)
  3719. {
  3720. #ifdef CONFIG_SLUB_DEBUG
  3721.     void *addr = page_address(page);
  3722.     void *p;
  3723.     unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
  3724.     if (!map)
  3725.         return;
  3726.     slab_err(s, page, text, s->name);
  3727.     slab_lock(page);
  3728.  
  3729.     get_map(s, page, map);
  3730.     for_each_object(p, s, addr, page->objects) {
  3731.  
  3732.         if (!test_bit(slab_index(p, s, addr), map)) {
  3733.             pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
  3734.             print_tracking(s, p);
  3735.         }
  3736.     }
  3737.     slab_unlock(page);
  3738.     bitmap_free(map);
  3739. #endif
  3740. }
  3741.  
  3742. /*
  3743.  * Attempt to free all partial slabs on a node.
  3744.  * This is called from __kmem_cache_shutdown(). We must take list_lock
 * because a sysfs file might still access the partial list after shutdown.
  3746.  */
  3747. static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
  3748. {
  3749.     LIST_HEAD(discard);
  3750.     struct page *page, *h;
  3751.  
  3752.     BUG_ON(irqs_disabled());
  3753.     spin_lock_irq(&n->list_lock);
  3754.     list_for_each_entry_safe(page, h, &n->partial, slab_list) {
  3755.         if (!page->inuse) {
  3756.             remove_partial(n, page);
  3757.             list_add(&page->slab_list, &discard);
  3758.         } else {
  3759.             list_slab_objects(s, page,
  3760.             "Objects remaining in %s on __kmem_cache_shutdown()");
  3761.         }
  3762.     }
  3763.     spin_unlock_irq(&n->list_lock);
  3764.  
  3765.     list_for_each_entry_safe(page, h, &discard, slab_list)
  3766.         discard_slab(s, page);
  3767. }
  3768.  
  3769. bool __kmem_cache_empty(struct kmem_cache *s)
  3770. {
  3771.     int node;
  3772.     struct kmem_cache_node *n;
  3773.  
  3774.     for_each_kmem_cache_node(s, node, n)
  3775.         if (n->nr_partial || slabs_node(s, node))
  3776.             return false;
  3777.     return true;
  3778. }
  3779.  
  3780. /*
  3781.  * Release all resources used by a slab cache.
  3782.  */
  3783. int __kmem_cache_shutdown(struct kmem_cache *s)
  3784. {
  3785.     int node;
  3786.     struct kmem_cache_node *n;
  3787.  
  3788.     flush_all(s);
  3789.     /* Attempt to free all objects */
  3790.     for_each_kmem_cache_node(s, node, n) {
  3791.         free_partial(s, n);
  3792.         if (n->nr_partial || slabs_node(s, node))
  3793.             return 1;
  3794.     }
  3795.     sysfs_slab_remove(s);
  3796.     return 0;
  3797. }
  3798.  
  3799. /********************************************************************
  3800.  *      Kmalloc subsystem
  3801.  *******************************************************************/
  3802.  
  3803. static int __init setup_slub_min_order(char *str)
  3804. {
  3805.     get_option(&str, (int *)&slub_min_order);
  3806.  
  3807.     return 1;
  3808. }
  3809.  
  3810. __setup("slub_min_order=", setup_slub_min_order);
  3811.  
  3812. static int __init setup_slub_max_order(char *str)
  3813. {
  3814.     get_option(&str, (int *)&slub_max_order);
  3815.     slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
  3816.  
  3817.     return 1;
  3818. }
  3819.  
  3820. __setup("slub_max_order=", setup_slub_max_order);
  3821.  
  3822. static int __init setup_slub_min_objects(char *str)
  3823. {
  3824.     get_option(&str, (int *)&slub_min_objects);
  3825.  
  3826.     return 1;
  3827. }
  3828.  
  3829. __setup("slub_min_objects=", setup_slub_min_objects);
  3830.  
  3831. void *__kmalloc(size_t size, gfp_t flags)
  3832. {
  3833.     struct kmem_cache *s;
  3834.     void *ret;
  3835.  
  3836.     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
  3837.         return kmalloc_large(size, flags);
  3838.  
  3839.     s = kmalloc_slab(size, flags);
  3840.  
  3841.     if (unlikely(ZERO_OR_NULL_PTR(s)))
  3842.         return s;
  3843.  
  3844.     ret = slab_alloc(s, flags, _RET_IP_);
  3845.  
  3846.     trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
  3847.  
  3848.     ret = kasan_kmalloc(s, ret, size, flags);
  3849.  
  3850.     return ret;
  3851. }
  3852. EXPORT_SYMBOL(__kmalloc);
  3853.  
  3854. #ifdef CONFIG_NUMA
  3855. static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
  3856. {
  3857.     struct page *page;
  3858.     void *ptr = NULL;
  3859.     unsigned int order = get_order(size);
  3860.  
  3861.     flags |= __GFP_COMP;
  3862.     page = alloc_pages_node(node, flags, order);
  3863.     if (page) {
  3864.         ptr = page_address(page);
  3865.         mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
  3866.                     1 << order);
  3867.     }
  3868.  
  3869.     return kmalloc_large_node_hook(ptr, size, flags);
  3870. }
  3871.  
  3872. void *__kmalloc_node(size_t size, gfp_t flags, int node)
  3873. {
  3874.     struct kmem_cache *s;
  3875.     void *ret;
  3876.  
  3877.     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
  3878.         ret = kmalloc_large_node(size, flags, node);
  3879.  
  3880.         trace_kmalloc_node(_RET_IP_, ret,
  3881.                    size, PAGE_SIZE << get_order(size),
  3882.                    flags, node);
  3883.  
  3884.         return ret;
  3885.     }
  3886.  
  3887.     s = kmalloc_slab(size, flags);
  3888.  
  3889.     if (unlikely(ZERO_OR_NULL_PTR(s)))
  3890.         return s;
  3891.  
  3892.     ret = slab_alloc_node(s, flags, node, _RET_IP_);
  3893.  
  3894.     trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
  3895.  
  3896.     ret = kasan_kmalloc(s, ret, size, flags);
  3897.  
  3898.     return ret;
  3899. }
  3900. EXPORT_SYMBOL(__kmalloc_node);
  3901. #endif  /* CONFIG_NUMA */
  3902.  
  3903. #ifdef CONFIG_HARDENED_USERCOPY
  3904. /*
  3905.  * Rejects incorrectly sized objects and objects that are to be copied
  3906.  * to/from userspace but do not fall entirely within the containing slab
  3907.  * cache's usercopy region.
  3908.  *
 * Returns nothing when the checks pass; otherwise the violation is reported
 * via usercopy_warn() or the copy is aborted via usercopy_abort().
  3911.  */
  3912. void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
  3913.              bool to_user)
  3914. {
  3915.     struct kmem_cache *s;
  3916.     unsigned int offset;
  3917.     size_t object_size;
  3918.  
  3919.     ptr = kasan_reset_tag(ptr);
  3920.  
  3921.     /* Find object and usable object size. */
  3922.     s = page->slab_cache;
  3923.  
  3924.     /* Reject impossible pointers. */
  3925.     if (ptr < page_address(page))
  3926.         usercopy_abort("SLUB object not in SLUB page?!", NULL,
  3927.                    to_user, 0, n);
  3928.  
  3929.     /* Find offset within object. */
  3930.     offset = (ptr - page_address(page)) % s->size;
  3931.  
  3932.     /* Adjust for redzone and reject if within the redzone. */
  3933.     if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
  3934.         if (offset < s->red_left_pad)
  3935.             usercopy_abort("SLUB object in left red zone",
  3936.                        s->name, to_user, offset, n);
  3937.         offset -= s->red_left_pad;
  3938.     }
  3939.  
  3940.     /* Allow address range falling entirely within usercopy region. */
  3941.     if (offset >= s->useroffset &&
  3942.         offset - s->useroffset <= s->usersize &&
  3943.         n <= s->useroffset - offset + s->usersize)
  3944.         return;
  3945.  
  3946.     /*
  3947.      * If the copy is still within the allocated object, produce
  3948.      * a warning instead of rejecting the copy. This is intended
  3949.      * to be a temporary method to find any missing usercopy
  3950.      * whitelists.
  3951.      */
  3952.     object_size = slab_ksize(s);
  3953.     if (usercopy_fallback &&
  3954.         offset <= object_size && n <= object_size - offset) {
  3955.         usercopy_warn("SLUB object", s->name, to_user, offset, n);
  3956.         return;
  3957.     }
  3958.  
  3959.     usercopy_abort("SLUB object", s->name, to_user, offset, n);
  3960. }
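/*
 * Worked example (added for illustration; the numbers are hypothetical):
 * for a cache whitelisted with useroffset = 16 and usersize = 32, a copy
 * of n = 24 bytes at object offset 20 is allowed (20 >= 16 and
 * 20 + 24 <= 16 + 32), while the same copy at offset 8 falls outside the
 * region and is either warned about (usercopy_fallback, still within the
 * object) or rejected via usercopy_abort().
 */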
  3961. #endif /* CONFIG_HARDENED_USERCOPY */
  3962.  
  3963. size_t __ksize(const void *object)
  3964. {
  3965.     struct page *page;
  3966.  
  3967.     if (unlikely(object == ZERO_SIZE_PTR))
  3968.         return 0;
  3969.  
  3970.     page = virt_to_head_page(object);
  3971.  
  3972.     if (unlikely(!PageSlab(page))) {
  3973.         WARN_ON(!PageCompound(page));
  3974.         return page_size(page);
  3975.     }
  3976.  
  3977.     return slab_ksize(page->slab_cache);
  3978. }
  3979. EXPORT_SYMBOL(__ksize);
  3980.  
  3981. void kfree(const void *x)
  3982. {
  3983.     struct page *page;
  3984.     void *object = (void *)x;
  3985.  
  3986.     trace_kfree(_RET_IP_, x);
  3987.  
  3988.     if (unlikely(ZERO_OR_NULL_PTR(x)))
  3989.         return;
  3990.  
  3991.     page = virt_to_head_page(x);
  3992.     if (unlikely(!PageSlab(page))) {
  3993.         unsigned int order = compound_order(page);
  3994.  
  3995.         BUG_ON(!PageCompound(page));
  3996.         kfree_hook(object);
  3997.         mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
  3998.                     -(1 << order));
  3999.         __free_pages(page, order);
  4000.         return;
  4001.     }
  4002.     slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
  4003. }
  4004. EXPORT_SYMBOL(kfree);
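/*
 * Illustrative sketch (added; not part of the original file): kfree()
 * accepts both slab-backed pointers and large, page-allocator-backed
 * kmalloc() results, and ksize() reports the usable size in either case.
 * The sizes below are arbitrary; the block is kept under #if 0.
 */
#if 0
static void demo_kmalloc_paths(void)
{
	void *small = kmalloc(64, GFP_KERNEL);	/* served by a kmalloc slab */
	/* larger than KMALLOC_MAX_CACHE_SIZE on typical configs */
	void *large = kmalloc(16 * PAGE_SIZE, GFP_KERNEL);

	if (small)
		pr_info("usable size: %zu\n", ksize(small));

	kfree(small);	/* slab path: slab_free() via page->slab_cache */
	kfree(large);	/* compound-page path: __free_pages() */
}
#endif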
  4005.  
  4006. #define SHRINK_PROMOTE_MAX 32
  4007.  
  4008. /*
  4009.  * kmem_cache_shrink discards empty slabs and promotes the slabs filled
  4010.  * up most to the head of the partial lists. New allocations will then
  4011.  * fill those up and thus they can be removed from the partial lists.
  4012.  *
  4013.  * The slabs with the least items are placed last. This results in them
 * being allocated from last, increasing the chance that the last objects
 * remaining in them are freed.
  4016.  */
  4017. int __kmem_cache_shrink(struct kmem_cache *s)
  4018. {
  4019.     int node;
  4020.     int i;
  4021.     struct kmem_cache_node *n;
  4022.     struct page *page;
  4023.     struct page *t;
  4024.     struct list_head discard;
  4025.     struct list_head promote[SHRINK_PROMOTE_MAX];
  4026.     unsigned long flags;
  4027.     int ret = 0;
  4028.  
  4029.     flush_all(s);
  4030.     for_each_kmem_cache_node(s, node, n) {
  4031.         INIT_LIST_HEAD(&discard);
  4032.         for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
  4033.             INIT_LIST_HEAD(promote + i);
  4034.  
  4035.         spin_lock_irqsave(&n->list_lock, flags);
  4036.  
  4037.         /*
  4038.          * Build lists of slabs to discard or promote.
  4039.          *
  4040.          * Note that concurrent frees may occur while we hold the
  4041.          * list_lock. page->inuse here is the upper limit.
  4042.          */
  4043.         list_for_each_entry_safe(page, t, &n->partial, slab_list) {
  4044.             int free = page->objects - page->inuse;
  4045.  
  4046.             /* Do not reread page->inuse */
  4047.             barrier();
  4048.  
  4049.             /* We do not keep full slabs on the list */
  4050.             BUG_ON(free <= 0);
  4051.  
  4052.             if (free == page->objects) {
  4053.                 list_move(&page->slab_list, &discard);
  4054.                 n->nr_partial--;
  4055.             } else if (free <= SHRINK_PROMOTE_MAX)
  4056.                 list_move(&page->slab_list, promote + free - 1);
  4057.         }
  4058.  
  4059.         /*
  4060.          * Promote the slabs filled up most to the head of the
  4061.          * partial list.
  4062.          */
  4063.         for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
  4064.             list_splice(promote + i, &n->partial);
  4065.  
  4066.         spin_unlock_irqrestore(&n->list_lock, flags);
  4067.  
  4068.         /* Release empty slabs */
  4069.         list_for_each_entry_safe(page, t, &discard, slab_list)
  4070.             discard_slab(s, page);
  4071.  
  4072.         if (slabs_node(s, node))
  4073.             ret = 1;
  4074.     }
  4075.  
  4076.     return ret;
  4077. }
  4078.  
  4079. #ifdef CONFIG_MEMCG
  4080. void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
  4081. {
  4082.     /*
  4083.      * Called with all the locks held after a sched RCU grace period.
  4084.      * Even if @s becomes empty after shrinking, we can't know that @s
  4085.      * doesn't have allocations already in-flight and thus can't
  4086.      * destroy @s until the associated memcg is released.
  4087.      *
  4088.      * However, let's remove the sysfs files for empty caches here.
  4089.      * Each cache has a lot of interface files which aren't
  4090.      * particularly useful for empty draining caches; otherwise, we can
  4091.      * easily end up with millions of unnecessary sysfs files on
  4092.      * systems which have a lot of memory and transient cgroups.
  4093.      */
  4094.     if (!__kmem_cache_shrink(s))
  4095.         sysfs_slab_remove(s);
  4096. }
  4097.  
  4098. void __kmemcg_cache_deactivate(struct kmem_cache *s)
  4099. {
  4100.     /*
  4101.      * Disable empty slabs caching. Used to avoid pinning offline
  4102.      * memory cgroups by kmem pages that can be freed.
  4103.      */
  4104.     slub_set_cpu_partial(s, 0);
  4105.     s->min_partial = 0;
  4106. }
  4107. #endif  /* CONFIG_MEMCG */
  4108.  
  4109. static int slab_mem_going_offline_callback(void *arg)
  4110. {
  4111.     struct kmem_cache *s;
  4112.  
  4113.     mutex_lock(&slab_mutex);
  4114.     list_for_each_entry(s, &slab_caches, list)
  4115.         __kmem_cache_shrink(s);
  4116.     mutex_unlock(&slab_mutex);
  4117.  
  4118.     return 0;
  4119. }
  4120.  
  4121. static void slab_mem_offline_callback(void *arg)
  4122. {
  4123.     struct kmem_cache_node *n;
  4124.     struct kmem_cache *s;
  4125.     struct memory_notify *marg = arg;
  4126.     int offline_node;
  4127.  
  4128.     offline_node = marg->status_change_nid_normal;
  4129.  
  4130.     /*
     * If the node still has available memory, we still need its
     * kmem_cache_node, so there is nothing to do here.
  4133.      */
  4134.     if (offline_node < 0)
  4135.         return;
  4136.  
  4137.     mutex_lock(&slab_mutex);
  4138.     list_for_each_entry(s, &slab_caches, list) {
  4139.         n = get_node(s, offline_node);
  4140.         if (n) {
  4141.             /*
  4142.              * if n->nr_slabs > 0, slabs still exist on the node
  4143.              * that is going down. We were unable to free them,
  4144.              * and offline_pages() function shouldn't call this
  4145.              * callback. So, we must fail.
  4146.              */
  4147.             BUG_ON(slabs_node(s, offline_node));
  4148.  
  4149.             s->node[offline_node] = NULL;
  4150.             kmem_cache_free(kmem_cache_node, n);
  4151.         }
  4152.     }
  4153.     mutex_unlock(&slab_mutex);
  4154. }
  4155.  
  4156. static int slab_mem_going_online_callback(void *arg)
  4157. {
  4158.     struct kmem_cache_node *n;
  4159.     struct kmem_cache *s;
  4160.     struct memory_notify *marg = arg;
  4161.     int nid = marg->status_change_nid_normal;
  4162.     int ret = 0;
  4163.  
  4164.     /*
  4165.      * If the node's memory is already available, then kmem_cache_node is
  4166.      * already created. Nothing to do.
  4167.      */
  4168.     if (nid < 0)
  4169.         return 0;
  4170.  
  4171.     /*
  4172.      * We are bringing a node online. No memory is available yet. We must
  4173.      * allocate a kmem_cache_node structure in order to bring the node
  4174.      * online.
  4175.      */
  4176.     mutex_lock(&slab_mutex);
  4177.     list_for_each_entry(s, &slab_caches, list) {
  4178.         /*
         * XXX: kmem_cache_alloc_node will fall back to other nodes
  4180.          *      since memory is not yet available from the node that
  4181.          *      is brought up.
  4182.          */
  4183.         n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
  4184.         if (!n) {
  4185.             ret = -ENOMEM;
  4186.             goto out;
  4187.         }
  4188.         init_kmem_cache_node(n);
  4189.         s->node[nid] = n;
  4190.     }
  4191. out:
  4192.     mutex_unlock(&slab_mutex);
  4193.     return ret;
  4194. }
  4195.  
  4196. static int slab_memory_callback(struct notifier_block *self,
  4197.                 unsigned long action, void *arg)
  4198. {
  4199.     int ret = 0;
  4200.  
  4201.     switch (action) {
  4202.     case MEM_GOING_ONLINE:
  4203.         ret = slab_mem_going_online_callback(arg);
  4204.         break;
  4205.     case MEM_GOING_OFFLINE:
  4206.         ret = slab_mem_going_offline_callback(arg);
  4207.         break;
  4208.     case MEM_OFFLINE:
  4209.     case MEM_CANCEL_ONLINE:
  4210.         slab_mem_offline_callback(arg);
  4211.         break;
  4212.     case MEM_ONLINE:
  4213.     case MEM_CANCEL_OFFLINE:
  4214.         break;
  4215.     }
  4216.     if (ret)
  4217.         ret = notifier_from_errno(ret);
  4218.     else
  4219.         ret = NOTIFY_OK;
  4220.     return ret;
  4221. }
  4222.  
  4223. static struct notifier_block slab_memory_callback_nb = {
  4224.     .notifier_call = slab_memory_callback,
  4225.     .priority = SLAB_CALLBACK_PRI,
  4226. };
  4227.  
  4228. /********************************************************************
  4229.  *          Basic setup of slabs
  4230.  *******************************************************************/
  4231.  
  4232. /*
  4233.  * Used for early kmem_cache structures that were allocated using
  4234.  * the page allocator. Allocate them properly then fix up the pointers
  4235.  * that may be pointing to the wrong kmem_cache structure.
  4236.  */
  4237.  
  4238. static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
  4239. {
  4240.     int node;
  4241.     struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
  4242.     struct kmem_cache_node *n;
  4243.  
  4244.     memcpy(s, static_cache, kmem_cache->object_size);
  4245.  
  4246.     /*
  4247.      * This runs very early, and only the boot processor is supposed to be
  4248.      * up.  Even if it weren't true, IRQs are not up so we couldn't fire
  4249.      * IPIs around.
  4250.      */
  4251.     __flush_cpu_slab(s, smp_processor_id());
  4252.     for_each_kmem_cache_node(s, node, n) {
  4253.         struct page *p;
  4254.  
  4255.         list_for_each_entry(p, &n->partial, slab_list)
  4256.             p->slab_cache = s;
  4257.  
  4258. #ifdef CONFIG_SLUB_DEBUG
  4259.         list_for_each_entry(p, &n->full, slab_list)
  4260.             p->slab_cache = s;
  4261. #endif
  4262.     }
  4263.     slab_init_memcg_params(s);
  4264.     list_add(&s->list, &slab_caches);
  4265.     memcg_link_cache(s, NULL);
  4266.     return s;
  4267. }
  4268.  
  4269. void __init kmem_cache_init(void)
  4270. {
  4271.     static __initdata struct kmem_cache boot_kmem_cache,
  4272.         boot_kmem_cache_node;
  4273.  
  4274.     if (debug_guardpage_minorder())
  4275.         slub_max_order = 0;
  4276.  
  4277.     kmem_cache_node = &boot_kmem_cache_node;
  4278.     kmem_cache = &boot_kmem_cache;
  4279.  
  4280.     create_boot_cache(kmem_cache_node, "kmem_cache_node",
  4281.         sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
  4282.  
  4283.     register_hotmemory_notifier(&slab_memory_callback_nb);
  4284.  
  4285.     /* Able to allocate the per node structures */
  4286.     slab_state = PARTIAL;
  4287.  
  4288.     create_boot_cache(kmem_cache, "kmem_cache",
  4289.             offsetof(struct kmem_cache, node) +
  4290.                 nr_node_ids * sizeof(struct kmem_cache_node *),
  4291.                SLAB_HWCACHE_ALIGN, 0, 0);
  4292.  
  4293.     kmem_cache = bootstrap(&boot_kmem_cache);
  4294.     kmem_cache_node = bootstrap(&boot_kmem_cache_node);
  4295.  
  4296.     /* Now we can use the kmem_cache to allocate kmalloc slabs */
  4297.     setup_kmalloc_cache_index_table();
  4298.     create_kmalloc_caches(0);
  4299.  
  4300.     /* Setup random freelists for each cache */
  4301.     init_freelist_randomization();
  4302.  
  4303.     cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
  4304.                   slub_cpu_dead);
  4305.  
  4306.     pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
  4307.         cache_line_size(),
  4308.         slub_min_order, slub_max_order, slub_min_objects,
  4309.         nr_cpu_ids, nr_node_ids);
  4310. }
  4311.  
  4312. void __init kmem_cache_init_late(void)
  4313. {
  4314. }
  4315.  
  4316. struct kmem_cache *
  4317. __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
  4318.            slab_flags_t flags, void (*ctor)(void *))
  4319. {
  4320.     struct kmem_cache *s, *c;
  4321.  
  4322.     s = find_mergeable(size, align, flags, name, ctor);
  4323.     if (s) {
  4324.         s->refcount++;
  4325.  
  4326.         /*
  4327.          * Adjust the object sizes so that we clear
  4328.          * the complete object on kzalloc.
  4329.          */
  4330.         s->object_size = max(s->object_size, size);
  4331.         s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
  4332.  
  4333.         for_each_memcg_cache(c, s) {
  4334.             c->object_size = s->object_size;
  4335.             c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
  4336.         }
  4337.  
  4338.         if (sysfs_slab_alias(s, name)) {
  4339.             s->refcount--;
  4340.             s = NULL;
  4341.         }
  4342.     }
  4343.  
  4344.     return s;
  4345. }
  4346.  
  4347. int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
  4348. {
  4349.     int err;
  4350.  
  4351.     err = kmem_cache_open(s, flags);
  4352.     if (err)
  4353.         return err;
  4354.  
  4355.     /* Mutex is not taken during early boot */
  4356.     if (slab_state <= UP)
  4357.         return 0;
  4358.  
  4359.     memcg_propagate_slab_attrs(s);
  4360.     err = sysfs_slab_add(s);
  4361.     if (err)
  4362.         __kmem_cache_release(s);
  4363.  
  4364.     return err;
  4365. }
  4366.  
  4367. void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
  4368. {
  4369.     struct kmem_cache *s;
  4370.     void *ret;
  4371.  
  4372.     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
  4373.         return kmalloc_large(size, gfpflags);
  4374.  
  4375.     s = kmalloc_slab(size, gfpflags);
  4376.  
  4377.     if (unlikely(ZERO_OR_NULL_PTR(s)))
  4378.         return s;
  4379.  
  4380.     ret = slab_alloc(s, gfpflags, caller);
  4381.  
  4382.     /* Honor the call site pointer we received. */
  4383.     trace_kmalloc(caller, ret, size, s->size, gfpflags);
  4384.  
  4385.     return ret;
  4386. }
  4387.  
  4388. #ifdef CONFIG_NUMA
  4389. void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
  4390.                     int node, unsigned long caller)
  4391. {
  4392.     struct kmem_cache *s;
  4393.     void *ret;
  4394.  
  4395.     if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
  4396.         ret = kmalloc_large_node(size, gfpflags, node);
  4397.  
  4398.         trace_kmalloc_node(caller, ret,
  4399.                    size, PAGE_SIZE << get_order(size),
  4400.                    gfpflags, node);
  4401.  
  4402.         return ret;
  4403.     }
  4404.  
  4405.     s = kmalloc_slab(size, gfpflags);
  4406.  
  4407.     if (unlikely(ZERO_OR_NULL_PTR(s)))
  4408.         return s;
  4409.  
  4410.     ret = slab_alloc_node(s, gfpflags, node, caller);
  4411.  
  4412.     /* Honor the call site pointer we received. */
  4413.     trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
  4414.  
  4415.     return ret;
  4416. }
  4417. #endif
  4418.  
  4419. #ifdef CONFIG_SYSFS
  4420. static int count_inuse(struct page *page)
  4421. {
  4422.     return page->inuse;
  4423. }
  4424.  
  4425. static int count_total(struct page *page)
  4426. {
  4427.     return page->objects;
  4428. }
  4429. #endif
  4430.  
  4431. #ifdef CONFIG_SLUB_DEBUG
  4432. static int validate_slab(struct kmem_cache *s, struct page *page,
  4433.                         unsigned long *map)
  4434. {
  4435.     void *p;
  4436.     void *addr = page_address(page);
  4437.  
  4438.     if (!check_slab(s, page) ||
  4439.             !on_freelist(s, page, NULL))
  4440.         return 0;
  4441.  
  4442.     /* Now we know that a valid freelist exists */
  4443.     bitmap_zero(map, page->objects);
  4444.  
  4445.     get_map(s, page, map);
  4446.     for_each_object(p, s, addr, page->objects) {
  4447.         if (test_bit(slab_index(p, s, addr), map))
  4448.             if (!check_object(s, page, p, SLUB_RED_INACTIVE))
  4449.                 return 0;
  4450.     }
  4451.  
  4452.     for_each_object(p, s, addr, page->objects)
  4453.         if (!test_bit(slab_index(p, s, addr), map))
  4454.             if (!check_object(s, page, p, SLUB_RED_ACTIVE))
  4455.                 return 0;
  4456.     return 1;
  4457. }
  4458.  
  4459. static void validate_slab_slab(struct kmem_cache *s, struct page *page,
  4460.                         unsigned long *map)
  4461. {
  4462.     slab_lock(page);
  4463.     validate_slab(s, page, map);
  4464.     slab_unlock(page);
  4465. }
  4466.  
  4467. static int validate_slab_node(struct kmem_cache *s,
  4468.         struct kmem_cache_node *n, unsigned long *map)
  4469. {
  4470.     unsigned long count = 0;
  4471.     struct page *page;
  4472.     unsigned long flags;
  4473.  
  4474.     spin_lock_irqsave(&n->list_lock, flags);
  4475.  
  4476.     list_for_each_entry(page, &n->partial, slab_list) {
  4477.         validate_slab_slab(s, page, map);
  4478.         count++;
  4479.     }
  4480.     if (count != n->nr_partial)
  4481.         pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
  4482.                s->name, count, n->nr_partial);
  4483.  
  4484.     if (!(s->flags & SLAB_STORE_USER))
  4485.         goto out;
  4486.  
  4487.     list_for_each_entry(page, &n->full, slab_list) {
  4488.         validate_slab_slab(s, page, map);
  4489.         count++;
  4490.     }
  4491.     if (count != atomic_long_read(&n->nr_slabs))
  4492.         pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
  4493.                s->name, count, atomic_long_read(&n->nr_slabs));
  4494.  
  4495. out:
  4496.     spin_unlock_irqrestore(&n->list_lock, flags);
  4497.     return count;
  4498. }
  4499.  
  4500. static long validate_slab_cache(struct kmem_cache *s)
  4501. {
  4502.     int node;
  4503.     unsigned long count = 0;
  4504.     struct kmem_cache_node *n;
  4505.     unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
  4506.  
  4507.     if (!map)
  4508.         return -ENOMEM;
  4509.  
  4510.     flush_all(s);
  4511.     for_each_kmem_cache_node(s, node, n)
  4512.         count += validate_slab_node(s, n, map);
  4513.     bitmap_free(map);
  4514.     return count;
  4515. }
  4516. /*
  4517.  * Generate lists of code addresses where slabcache objects are allocated
  4518.  * and freed.
  4519.  */
  4520.  
  4521. struct location {
  4522.     unsigned long count;
  4523.     unsigned long addr;
  4524.     long long sum_time;
  4525.     long min_time;
  4526.     long max_time;
  4527.     long min_pid;
  4528.     long max_pid;
  4529.     DECLARE_BITMAP(cpus, NR_CPUS);
  4530.     nodemask_t nodes;
  4531. };
  4532.  
  4533. struct loc_track {
  4534.     unsigned long max;
  4535.     unsigned long count;
  4536.     struct location *loc;
  4537. };
  4538.  
  4539. static void free_loc_track(struct loc_track *t)
  4540. {
  4541.     if (t->max)
  4542.         free_pages((unsigned long)t->loc,
  4543.             get_order(sizeof(struct location) * t->max));
  4544. }
  4545.  
  4546. static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
  4547. {
  4548.     struct location *l;
  4549.     int order;
  4550.  
  4551.     order = get_order(sizeof(struct location) * max);
  4552.  
  4553.     l = (void *)__get_free_pages(flags, order);
  4554.     if (!l)
  4555.         return 0;
  4556.  
  4557.     if (t->count) {
  4558.         memcpy(l, t->loc, sizeof(struct location) * t->count);
  4559.         free_loc_track(t);
  4560.     }
  4561.     t->max = max;
  4562.     t->loc = l;
  4563.     return 1;
  4564. }
  4565.  
  4566. static int add_location(struct loc_track *t, struct kmem_cache *s,
  4567.                 const struct track *track)
  4568. {
  4569.     long start, end, pos;
  4570.     struct location *l;
  4571.     unsigned long caddr;
  4572.     unsigned long age = jiffies - track->when;
  4573.  
  4574.     start = -1;
  4575.     end = t->count;
  4576.  
  4577.     for ( ; ; ) {
  4578.         pos = start + (end - start + 1) / 2;
  4579.  
  4580.         /*
  4581.          * There is nothing at "end". If we end up there
         * we need to insert the new element before "end".
  4583.          */
  4584.         if (pos == end)
  4585.             break;
  4586.  
  4587.         caddr = t->loc[pos].addr;
  4588.         if (track->addr == caddr) {
  4589.  
  4590.             l = &t->loc[pos];
  4591.             l->count++;
  4592.             if (track->when) {
  4593.                 l->sum_time += age;
  4594.                 if (age < l->min_time)
  4595.                     l->min_time = age;
  4596.                 if (age > l->max_time)
  4597.                     l->max_time = age;
  4598.  
  4599.                 if (track->pid < l->min_pid)
  4600.                     l->min_pid = track->pid;
  4601.                 if (track->pid > l->max_pid)
  4602.                     l->max_pid = track->pid;
  4603.  
  4604.                 cpumask_set_cpu(track->cpu,
  4605.                         to_cpumask(l->cpus));
  4606.             }
  4607.             node_set(page_to_nid(virt_to_page(track)), l->nodes);
  4608.             return 1;
  4609.         }
  4610.  
  4611.         if (track->addr < caddr)
  4612.             end = pos;
  4613.         else
  4614.             start = pos;
  4615.     }
  4616.  
  4617.     /*
  4618.      * Not found. Insert new tracking element.
  4619.      */
  4620.     if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
  4621.         return 0;
  4622.  
  4623.     l = t->loc + pos;
  4624.     if (pos < t->count)
  4625.         memmove(l + 1, l,
  4626.             (t->count - pos) * sizeof(struct location));
  4627.     t->count++;
  4628.     l->count = 1;
  4629.     l->addr = track->addr;
  4630.     l->sum_time = age;
  4631.     l->min_time = age;
  4632.     l->max_time = age;
  4633.     l->min_pid = track->pid;
  4634.     l->max_pid = track->pid;
  4635.     cpumask_clear(to_cpumask(l->cpus));
  4636.     cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
  4637.     nodes_clear(l->nodes);
  4638.     node_set(page_to_nid(virt_to_page(track)), l->nodes);
  4639.     return 1;
  4640. }
  4641.  
  4642. static void process_slab(struct loc_track *t, struct kmem_cache *s,
  4643.         struct page *page, enum track_item alloc,
  4644.         unsigned long *map)
  4645. {
  4646.     void *addr = page_address(page);
  4647.     void *p;
  4648.  
  4649.     bitmap_zero(map, page->objects);
  4650.     get_map(s, page, map);
  4651.  
  4652.     for_each_object(p, s, addr, page->objects)
  4653.         if (!test_bit(slab_index(p, s, addr), map))
  4654.             add_location(t, s, get_track(s, p, alloc));
  4655. }
  4656.  
  4657. static int list_locations(struct kmem_cache *s, char *buf,
  4658.                     enum track_item alloc)
  4659. {
  4660.     int len = 0;
  4661.     unsigned long i;
  4662.     struct loc_track t = { 0, 0, NULL };
  4663.     int node;
  4664.     struct kmem_cache_node *n;
  4665.     unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
  4666.  
  4667.     if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
  4668.                      GFP_KERNEL)) {
  4669.         bitmap_free(map);
  4670.         return sprintf(buf, "Out of memory\n");
  4671.     }
  4672.     /* Push back cpu slabs */
  4673.     flush_all(s);
  4674.  
  4675.     for_each_kmem_cache_node(s, node, n) {
  4676.         unsigned long flags;
  4677.         struct page *page;
  4678.  
  4679.         if (!atomic_long_read(&n->nr_slabs))
  4680.             continue;
  4681.  
  4682.         spin_lock_irqsave(&n->list_lock, flags);
  4683.         list_for_each_entry(page, &n->partial, slab_list)
  4684.             process_slab(&t, s, page, alloc, map);
  4685.         list_for_each_entry(page, &n->full, slab_list)
  4686.             process_slab(&t, s, page, alloc, map);
  4687.         spin_unlock_irqrestore(&n->list_lock, flags);
  4688.     }
  4689.  
  4690.     for (i = 0; i < t.count; i++) {
  4691.         struct location *l = &t.loc[i];
  4692.  
  4693.         if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
  4694.             break;
  4695.         len += sprintf(buf + len, "%7ld ", l->count);
  4696.  
  4697.         if (l->addr)
  4698.             len += sprintf(buf + len, "%pS", (void *)l->addr);
  4699.         else
  4700.             len += sprintf(buf + len, "<not-available>");
  4701.  
  4702.         if (l->sum_time != l->min_time) {
  4703.             len += sprintf(buf + len, " age=%ld/%ld/%ld",
  4704.                 l->min_time,
  4705.                 (long)div_u64(l->sum_time, l->count),
  4706.                 l->max_time);
  4707.         } else
  4708.             len += sprintf(buf + len, " age=%ld",
  4709.                 l->min_time);
  4710.  
  4711.         if (l->min_pid != l->max_pid)
  4712.             len += sprintf(buf + len, " pid=%ld-%ld",
  4713.                 l->min_pid, l->max_pid);
  4714.         else
  4715.             len += sprintf(buf + len, " pid=%ld",
  4716.                 l->min_pid);
  4717.  
  4718.         if (num_online_cpus() > 1 &&
  4719.                 !cpumask_empty(to_cpumask(l->cpus)) &&
  4720.                 len < PAGE_SIZE - 60)
  4721.             len += scnprintf(buf + len, PAGE_SIZE - len - 50,
  4722.                      " cpus=%*pbl",
  4723.                      cpumask_pr_args(to_cpumask(l->cpus)));
  4724.  
  4725.         if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
  4726.                 len < PAGE_SIZE - 60)
  4727.             len += scnprintf(buf + len, PAGE_SIZE - len - 50,
  4728.                      " nodes=%*pbl",
  4729.                      nodemask_pr_args(&l->nodes));
  4730.  
  4731.         len += sprintf(buf + len, "\n");
  4732.     }
  4733.  
  4734.     free_loc_track(&t);
  4735.     bitmap_free(map);
  4736.     if (!t.count)
  4737.         len += sprintf(buf, "No data\n");
  4738.     return len;
  4739. }
  4740. #endif  /* CONFIG_SLUB_DEBUG */
  4741.  
  4742. #ifdef SLUB_RESILIENCY_TEST
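        /*
         * Self test for the debug checks: deliberately corrupt objects in a
         * few kmalloc caches (redzones, freelist pointers, freed objects) and
         * then run validate_slab_cache() so the corruption reports can be
         * inspected in the kernel log. Only built when SLUB_RESILIENCY_TEST
         * is defined.
         */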
  4743. static void __init resiliency_test(void)
  4744. {
  4745.     u8 *p;
  4746.     int type = KMALLOC_NORMAL;
  4747.  
  4748.     BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
  4749.  
  4750.     pr_err("SLUB resiliency testing\n");
  4751.     pr_err("-----------------------\n");
  4752.     pr_err("A. Corruption after allocation\n");
  4753.  
  4754.     p = kzalloc(16, GFP_KERNEL);
  4755.     p[16] = 0x12;
  4756.     pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
  4757.            p + 16);
  4758.  
  4759.     validate_slab_cache(kmalloc_caches[type][4]);
  4760.  
  4761.     /* Hmmm... The next two are dangerous */
  4762.     p = kzalloc(32, GFP_KERNEL);
  4763.     p[32 + sizeof(void *)] = 0x34;
  4764.     pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> 0x%p\n",
  4765.            p);
  4766.     pr_err("If allocated object is overwritten then not detectable\n\n");
  4767.  
  4768.     validate_slab_cache(kmalloc_caches[type][5]);
  4769.     p = kzalloc(64, GFP_KERNEL);
  4770.     p += 64 + (get_cycles() & 0xff) * sizeof(void *);
  4771.     *p = 0x56;
  4772.     pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
  4773.            p);
  4774.     pr_err("If allocated object is overwritten then not detectable\n\n");
  4775.     validate_slab_cache(kmalloc_caches[type][6]);
  4776.  
  4777.     pr_err("\nB. Corruption after free\n");
  4778.     p = kzalloc(128, GFP_KERNEL);
  4779.     kfree(p);
  4780.     *p = 0x78;
  4781.     pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
  4782.     validate_slab_cache(kmalloc_caches[type][7]);
  4783.  
  4784.     p = kzalloc(256, GFP_KERNEL);
  4785.     kfree(p);
  4786.     p[50] = 0x9a;
  4787.     pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
  4788.     validate_slab_cache(kmalloc_caches[type][8]);
  4789.  
  4790.     p = kzalloc(512, GFP_KERNEL);
  4791.     kfree(p);
  4792.     p[512] = 0xab;
  4793.     pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
  4794.     validate_slab_cache(kmalloc_caches[type][9]);
  4795. }
  4796. #else
  4797. #ifdef CONFIG_SYSFS
  4798. static void resiliency_test(void) {}
  4799. #endif
  4800. #endif  /* SLUB_RESILIENCY_TEST */
  4801.  
  4802. #ifdef CONFIG_SYSFS
  4803. enum slab_stat_type {
  4804.     SL_ALL,         /* All slabs */
  4805.     SL_PARTIAL,     /* Only partially allocated slabs */
  4806.     SL_CPU,         /* Only slabs used for cpu caches */
  4807.     SL_OBJECTS,     /* Determine allocated objects not slabs */
  4808.     SL_TOTAL        /* Determine object capacity not slabs */
  4809. };
  4810.  
  4811. #define SO_ALL      (1 << SL_ALL)
  4812. #define SO_PARTIAL  (1 << SL_PARTIAL)
  4813. #define SO_CPU      (1 << SL_CPU)
  4814. #define SO_OBJECTS  (1 << SL_OBJECTS)
  4815. #define SO_TOTAL    (1 << SL_TOTAL)
  4816.  
  4817. #ifdef CONFIG_MEMCG
  4818. static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
  4819.  
  4820. static int __init setup_slub_memcg_sysfs(char *str)
  4821. {
  4822.     int v;
  4823.  
  4824.     if (get_option(&str, &v) > 0)
  4825.         memcg_sysfs_enabled = v;
  4826.  
  4827.     return 1;
  4828. }
  4829.  
  4830. __setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
  4831. #endif
  4832.  
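        /*
         * Common backend for most of the counting attributes. @flags selects
         * one scope bit (SO_ALL, SO_PARTIAL or SO_CPU) optionally combined
         * with a unit bit: SO_OBJECTS counts allocated objects, SO_TOTAL
         * counts object capacity, and with neither bit set slabs are counted.
         * The output is the total followed by per-node " N<id>=<count>"
         * entries on NUMA, e.g. (illustrative) "112 N0=64 N1=48".
         */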
  4833. static ssize_t show_slab_objects(struct kmem_cache *s,
  4834.                 char *buf, unsigned long flags)
  4835. {
  4836.     unsigned long total = 0;
  4837.     int node;
  4838.     int x;
  4839.     unsigned long *nodes;
  4840.  
  4841.     nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
  4842.     if (!nodes)
  4843.         return -ENOMEM;
  4844.  
  4845.     if (flags & SO_CPU) {
  4846.         int cpu;
  4847.  
  4848.         for_each_possible_cpu(cpu) {
  4849.             struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
  4850.                                    cpu);
  4851.             int node;
  4852.             struct page *page;
  4853.  
  4854.             page = READ_ONCE(c->page);
  4855.             if (!page)
  4856.                 continue;
  4857.  
  4858.             node = page_to_nid(page);
  4859.             if (flags & SO_TOTAL)
  4860.                 x = page->objects;
  4861.             else if (flags & SO_OBJECTS)
  4862.                 x = page->inuse;
  4863.             else
  4864.                 x = 1;
  4865.  
  4866.             total += x;
  4867.             nodes[node] += x;
  4868.  
  4869.             page = slub_percpu_partial_read_once(c);
  4870.             if (page) {
  4871.                 node = page_to_nid(page);
  4872.                 if (flags & SO_TOTAL)
  4873.                     WARN_ON_ONCE(1);
  4874.                 else if (flags & SO_OBJECTS)
  4875.                     WARN_ON_ONCE(1);
  4876.                 else
  4877.                     x = page->pages;
  4878.                 total += x;
  4879.                 nodes[node] += x;
  4880.             }
  4881.         }
  4882.     }
  4883.  
  4884.     /*
  4885.      * We cannot take "mem_hotplug_lock" here with "kernfs_mutex" already
  4886.      * held, as that would conflict with the existing lock order:
  4887.      *
  4888.      * mem_hotplug_lock->slab_mutex->kernfs_mutex
  4889.      *
  4890.      * We don't really need mem_hotplug_lock (to hold off
  4891.      * slab_mem_going_offline_callback) here because slab's memory hot
  4892.      * unplug code doesn't destroy the kmem_cache->node[] data.
  4893.      */
  4894.  
  4895. #ifdef CONFIG_SLUB_DEBUG
  4896.     if (flags & SO_ALL) {
  4897.         struct kmem_cache_node *n;
  4898.  
  4899.         for_each_kmem_cache_node(s, node, n) {
  4900.  
  4901.             if (flags & SO_TOTAL)
  4902.                 x = atomic_long_read(&n->total_objects);
  4903.             else if (flags & SO_OBJECTS)
  4904.                 x = atomic_long_read(&n->total_objects) -
  4905.                     count_partial(n, count_free);
  4906.             else
  4907.                 x = atomic_long_read(&n->nr_slabs);
  4908.             total += x;
  4909.             nodes[node] += x;
  4910.         }
  4911.  
  4912.     } else
  4913. #endif
  4914.     if (flags & SO_PARTIAL) {
  4915.         struct kmem_cache_node *n;
  4916.  
  4917.         for_each_kmem_cache_node(s, node, n) {
  4918.             if (flags & SO_TOTAL)
  4919.                 x = count_partial(n, count_total);
  4920.             else if (flags & SO_OBJECTS)
  4921.                 x = count_partial(n, count_inuse);
  4922.             else
  4923.                 x = n->nr_partial;
  4924.             total += x;
  4925.             nodes[node] += x;
  4926.         }
  4927.     }
  4928.     x = sprintf(buf, "%lu", total);
  4929. #ifdef CONFIG_NUMA
  4930.     for (node = 0; node < nr_node_ids; node++)
  4931.         if (nodes[node])
  4932.             x += sprintf(buf + x, " N%d=%lu",
  4933.                     node, nodes[node]);
  4934. #endif
  4935.     kfree(nodes);
  4936.     return x + sprintf(buf + x, "\n");
  4937. }
  4938.  
  4939. #ifdef CONFIG_SLUB_DEBUG
  4940. static int any_slab_objects(struct kmem_cache *s)
  4941. {
  4942.     int node;
  4943.     struct kmem_cache_node *n;
  4944.  
  4945.     for_each_kmem_cache_node(s, node, n)
  4946.         if (atomic_long_read(&n->total_objects))
  4947.             return 1;
  4948.  
  4949.     return 0;
  4950. }
  4951. #endif
  4952.  
  4953. #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
  4954. #define to_slab(n) container_of(n, struct kmem_cache, kobj)
  4955.  
  4956. struct slab_attribute {
  4957.     struct attribute attr;
  4958.     ssize_t (*show)(struct kmem_cache *s, char *buf);
  4959.     ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
  4960. };
  4961.  
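        /*
         * Every sysfs file below is declared through one of these two macros.
         * They define a struct slab_attribute named <name>_attr that is later
         * collected in slab_attrs[]; SLAB_ATTR_RO() wires up only a show
         * routine, SLAB_ATTR() both show and store. E.g. SLAB_ATTR_RO(align)
         * expands to
         *
         *      static struct slab_attribute align_attr =
         *              __ATTR(align, 0400, align_show, NULL);
         */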
  4962. #define SLAB_ATTR_RO(_name) \
  4963.     static struct slab_attribute _name##_attr = \
  4964.     __ATTR(_name, 0400, _name##_show, NULL)
  4965.  
  4966. #define SLAB_ATTR(_name) \
  4967.     static struct slab_attribute _name##_attr =  \
  4968.     __ATTR(_name, 0600, _name##_show, _name##_store)
  4969.  
  4970. static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
  4971. {
  4972.     return sprintf(buf, "%u\n", s->size);
  4973. }
  4974. SLAB_ATTR_RO(slab_size);
  4975.  
  4976. static ssize_t align_show(struct kmem_cache *s, char *buf)
  4977. {
  4978.     return sprintf(buf, "%u\n", s->align);
  4979. }
  4980. SLAB_ATTR_RO(align);
  4981.  
  4982. static ssize_t object_size_show(struct kmem_cache *s, char *buf)
  4983. {
  4984.     return sprintf(buf, "%u\n", s->object_size);
  4985. }
  4986. SLAB_ATTR_RO(object_size);
  4987.  
  4988. static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
  4989. {
  4990.     return sprintf(buf, "%u\n", oo_objects(s->oo));
  4991. }
  4992. SLAB_ATTR_RO(objs_per_slab);
  4993.  
  4994. static ssize_t order_store(struct kmem_cache *s,
  4995.                 const char *buf, size_t length)
  4996. {
  4997.     unsigned int order;
  4998.     int err;
  4999.  
  5000.     err = kstrtouint(buf, 10, &order);
  5001.     if (err)
  5002.         return err;
  5003.  
  5004.     if (order > slub_max_order || order < slub_min_order)
  5005.         return -EINVAL;
  5006.  
  5007.     calculate_sizes(s, order);
  5008.     return length;
  5009. }
  5010.  
  5011. static ssize_t order_show(struct kmem_cache *s, char *buf)
  5012. {
  5013.     return sprintf(buf, "%u\n", oo_order(s->oo));
  5014. }
  5015. SLAB_ATTR(order);
  5016.  
  5017. static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
  5018. {
  5019.     return sprintf(buf, "%lu\n", s->min_partial);
  5020. }
  5021.  
  5022. static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
  5023.                  size_t length)
  5024. {
  5025.     unsigned long min;
  5026.     int err;
  5027.  
  5028.     err = kstrtoul(buf, 10, &min);
  5029.     if (err)
  5030.         return err;
  5031.  
  5032.     set_min_partial(s, min);
  5033.     return length;
  5034. }
  5035. SLAB_ATTR(min_partial);
  5036.  
  5037. static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
  5038. {
  5039.     return sprintf(buf, "%u\n", slub_cpu_partial(s));
  5040. }
  5041.  
  5042. static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
  5043.                  size_t length)
  5044. {
  5045.     unsigned int objects;
  5046.     int err;
  5047.  
  5048.     err = kstrtouint(buf, 10, &objects);
  5049.     if (err)
  5050.         return err;
  5051.     if (objects && !kmem_cache_has_cpu_partial(s))
  5052.         return -EINVAL;
  5053.  
  5054.     slub_set_cpu_partial(s, objects);
  5055.     flush_all(s);
  5056.     return length;
  5057. }
  5058. SLAB_ATTR(cpu_partial);
  5059.  
  5060. static ssize_t ctor_show(struct kmem_cache *s, char *buf)
  5061. {
  5062.     if (!s->ctor)
  5063.         return 0;
  5064.     return sprintf(buf, "%pS\n", s->ctor);
  5065. }
  5066. SLAB_ATTR_RO(ctor);
  5067.  
  5068. static ssize_t aliases_show(struct kmem_cache *s, char *buf)
  5069. {
  5070.     return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
  5071. }
  5072. SLAB_ATTR_RO(aliases);
  5073.  
  5074. static ssize_t partial_show(struct kmem_cache *s, char *buf)
  5075. {
  5076.     return show_slab_objects(s, buf, SO_PARTIAL);
  5077. }
  5078. SLAB_ATTR_RO(partial);
  5079.  
  5080. static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
  5081. {
  5082.     return show_slab_objects(s, buf, SO_CPU);
  5083. }
  5084. SLAB_ATTR_RO(cpu_slabs);
  5085.  
  5086. static ssize_t objects_show(struct kmem_cache *s, char *buf)
  5087. {
  5088.     return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
  5089. }
  5090. SLAB_ATTR_RO(objects);
  5091.  
  5092. static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
  5093. {
  5094.     return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
  5095. }
  5096. SLAB_ATTR_RO(objects_partial);
  5097.  
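        /*
         * Dump the per cpu partial lists: first the summed "objects(pages)"
         * over all online cpus, then (on SMP) one " C<cpu>=objects(pages)"
         * entry for every cpu that currently holds partial pages.
         */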
  5098. static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
  5099. {
  5100.     int objects = 0;
  5101.     int pages = 0;
  5102.     int cpu;
  5103.     int len;
  5104.  
  5105.     for_each_online_cpu(cpu) {
  5106.         struct page *page;
  5107.  
  5108.         page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
  5109.  
  5110.         if (page) {
  5111.             pages += page->pages;
  5112.             objects += page->pobjects;
  5113.         }
  5114.     }
  5115.  
  5116.     len = sprintf(buf, "%d(%d)", objects, pages);
  5117.  
  5118. #ifdef CONFIG_SMP
  5119.     for_each_online_cpu(cpu) {
  5120.         struct page *page;
  5121.  
  5122.         page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
  5123.  
  5124.         if (page && len < PAGE_SIZE - 20)
  5125.             len += sprintf(buf + len, " C%d=%d(%d)", cpu,
  5126.                 page->pobjects, page->pages);
  5127.     }
  5128. #endif
  5129.     return len + sprintf(buf + len, "\n");
  5130. }
  5131. SLAB_ATTR_RO(slabs_cpu_partial);
  5132.  
  5133. static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
  5134. {
  5135.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
  5136. }
  5137.  
  5138. static ssize_t reclaim_account_store(struct kmem_cache *s,
  5139.                 const char *buf, size_t length)
  5140. {
  5141.     s->flags &= ~SLAB_RECLAIM_ACCOUNT;
  5142.     if (buf[0] == '1')
  5143.         s->flags |= SLAB_RECLAIM_ACCOUNT;
  5144.     return length;
  5145. }
  5146. SLAB_ATTR(reclaim_account);
  5147.  
  5148. static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
  5149. {
  5150.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
  5151. }
  5152. SLAB_ATTR_RO(hwcache_align);
  5153.  
  5154. #ifdef CONFIG_ZONE_DMA
  5155. static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
  5156. {
  5157.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
  5158. }
  5159. SLAB_ATTR_RO(cache_dma);
  5160. #endif
  5161.  
  5162. static ssize_t usersize_show(struct kmem_cache *s, char *buf)
  5163. {
  5164.     return sprintf(buf, "%u\n", s->usersize);
  5165. }
  5166. SLAB_ATTR_RO(usersize);
  5167.  
  5168. static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
  5169. {
  5170.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
  5171. }
  5172. SLAB_ATTR_RO(destroy_by_rcu);
  5173.  
  5174. #ifdef CONFIG_SLUB_DEBUG
  5175. static ssize_t slabs_show(struct kmem_cache *s, char *buf)
  5176. {
  5177.     return show_slab_objects(s, buf, SO_ALL);
  5178. }
  5179. SLAB_ATTR_RO(slabs);
  5180.  
  5181. static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
  5182. {
  5183.     return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
  5184. }
  5185. SLAB_ATTR_RO(total_objects);
  5186.  
  5187. static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
  5188. {
  5189.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
  5190. }
  5191.  
  5192. static ssize_t sanity_checks_store(struct kmem_cache *s,
  5193.                 const char *buf, size_t length)
  5194. {
  5195.     s->flags &= ~SLAB_CONSISTENCY_CHECKS;
  5196.     if (buf[0] == '1') {
  5197.         s->flags &= ~__CMPXCHG_DOUBLE;
  5198.         s->flags |= SLAB_CONSISTENCY_CHECKS;
  5199.     }
  5200.     return length;
  5201. }
  5202. SLAB_ATTR(sanity_checks);
  5203.  
  5204. static ssize_t trace_show(struct kmem_cache *s, char *buf)
  5205. {
  5206.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
  5207. }
  5208.  
  5209. static ssize_t trace_store(struct kmem_cache *s, const char *buf,
  5210.                             size_t length)
  5211. {
  5212.     /*
  5213.      * Tracing a merged cache is going to give confusing results
  5214.      * as well as cause other issues like converting a mergeable
  5215.      * cache into an unmergeable one.
  5216.      */
  5217.     if (s->refcount > 1)
  5218.         return -EINVAL;
  5219.  
  5220.     s->flags &= ~SLAB_TRACE;
  5221.     if (buf[0] == '1') {
  5222.         s->flags &= ~__CMPXCHG_DOUBLE;
  5223.         s->flags |= SLAB_TRACE;
  5224.     }
  5225.     return length;
  5226. }
  5227. SLAB_ATTR(trace);
  5228.  
  5229. static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
  5230. {
  5231.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
  5232. }
  5233.  
  5234. static ssize_t red_zone_store(struct kmem_cache *s,
  5235.                 const char *buf, size_t length)
  5236. {
  5237.     if (any_slab_objects(s))
  5238.         return -EBUSY;
  5239.  
  5240.     s->flags &= ~SLAB_RED_ZONE;
  5241.     if (buf[0] == '1') {
  5242.         s->flags |= SLAB_RED_ZONE;
  5243.     }
  5244.     calculate_sizes(s, -1);
  5245.     return length;
  5246. }
  5247. SLAB_ATTR(red_zone);
  5248.  
  5249. static ssize_t poison_show(struct kmem_cache *s, char *buf)
  5250. {
  5251.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
  5252. }
  5253.  
  5254. static ssize_t poison_store(struct kmem_cache *s,
  5255.                 const char *buf, size_t length)
  5256. {
  5257.     if (any_slab_objects(s))
  5258.         return -EBUSY;
  5259.  
  5260.     s->flags &= ~SLAB_POISON;
  5261.     if (buf[0] == '1') {
  5262.         s->flags |= SLAB_POISON;
  5263.     }
  5264.     calculate_sizes(s, -1);
  5265.     return length;
  5266. }
  5267. SLAB_ATTR(poison);
  5268.  
  5269. static ssize_t store_user_show(struct kmem_cache *s, char *buf)
  5270. {
  5271.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
  5272. }
  5273.  
  5274. static ssize_t store_user_store(struct kmem_cache *s,
  5275.                 const char *buf, size_t length)
  5276. {
  5277.     if (any_slab_objects(s))
  5278.         return -EBUSY;
  5279.  
  5280.     s->flags &= ~SLAB_STORE_USER;
  5281.     if (buf[0] == '1') {
  5282.         s->flags &= ~__CMPXCHG_DOUBLE;
  5283.         s->flags |= SLAB_STORE_USER;
  5284.     }
  5285.     calculate_sizes(s, -1);
  5286.     return length;
  5287. }
  5288. SLAB_ATTR(store_user);
  5289.  
  5290. static ssize_t validate_show(struct kmem_cache *s, char *buf)
  5291. {
  5292.     return 0;
  5293. }
  5294.  
  5295. static ssize_t validate_store(struct kmem_cache *s,
  5296.             const char *buf, size_t length)
  5297. {
  5298.     int ret = -EINVAL;
  5299.  
  5300.     if (buf[0] == '1') {
  5301.         ret = validate_slab_cache(s);
  5302.         if (ret >= 0)
  5303.             ret = length;
  5304.     }
  5305.     return ret;
  5306. }
  5307. SLAB_ATTR(validate);
  5308.  
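        /*
         * alloc_calls/free_calls are only available when the cache records
         * the allocating/freeing callers, i.e. when SLAB_STORE_USER is set
         * (for example via the slub_debug=U boot parameter); otherwise
         * reading them fails with -ENOSYS.
         */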
  5309. static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
  5310. {
  5311.     if (!(s->flags & SLAB_STORE_USER))
  5312.         return -ENOSYS;
  5313.     return list_locations(s, buf, TRACK_ALLOC);
  5314. }
  5315. SLAB_ATTR_RO(alloc_calls);
  5316.  
  5317. static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
  5318. {
  5319.     if (!(s->flags & SLAB_STORE_USER))
  5320.         return -ENOSYS;
  5321.     return list_locations(s, buf, TRACK_FREE);
  5322. }
  5323. SLAB_ATTR_RO(free_calls);
  5324. #endif /* CONFIG_SLUB_DEBUG */
  5325.  
  5326. #ifdef CONFIG_FAILSLAB
  5327. static ssize_t failslab_show(struct kmem_cache *s, char *buf)
  5328. {
  5329.     return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
  5330. }
  5331.  
  5332. static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
  5333.                             size_t length)
  5334. {
  5335.     if (s->refcount > 1)
  5336.         return -EINVAL;
  5337.  
  5338.     s->flags &= ~SLAB_FAILSLAB;
  5339.     if (buf[0] == '1')
  5340.         s->flags |= SLAB_FAILSLAB;
  5341.     return length;
  5342. }
  5343. SLAB_ATTR(failslab);
  5344. #endif
  5345.  
  5346. static ssize_t shrink_show(struct kmem_cache *s, char *buf)
  5347. {
  5348.     return 0;
  5349. }
  5350.  
  5351. static ssize_t shrink_store(struct kmem_cache *s,
  5352.             const char *buf, size_t length)
  5353. {
  5354.     if (buf[0] == '1')
  5355.         kmem_cache_shrink_all(s);
  5356.     else
  5357.         return -EINVAL;
  5358.     return length;
  5359. }
  5360. SLAB_ATTR(shrink);
  5361.  
  5362. #ifdef CONFIG_NUMA
  5363. static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
  5364. {
  5365.     return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
  5366. }
  5367.  
  5368. static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
  5369.                 const char *buf, size_t length)
  5370. {
  5371.     unsigned int ratio;
  5372.     int err;
  5373.  
  5374.     err = kstrtouint(buf, 10, &ratio);
  5375.     if (err)
  5376.         return err;
  5377.     if (ratio > 100)
  5378.         return -ERANGE;
  5379.  
  5380.     s->remote_node_defrag_ratio = ratio * 10;
  5381.  
  5382.     return length;
  5383. }
  5384. SLAB_ATTR(remote_node_defrag_ratio);
  5385. #endif
  5386.  
  5387. #ifdef CONFIG_SLUB_STATS
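        /*
         * Sum the per cpu counter @si over all online cpus and print the
         * total, followed (on SMP) by a " C<cpu>=<count>" breakdown for every
         * cpu with a non-zero value.
         */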
  5388. static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
  5389. {
  5390.     unsigned long sum  = 0;
  5391.     int cpu;
  5392.     int len;
  5393.     int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
  5394.  
  5395.     if (!data)
  5396.         return -ENOMEM;
  5397.  
  5398.     for_each_online_cpu(cpu) {
  5399.         unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
  5400.  
  5401.         data[cpu] = x;
  5402.         sum += x;
  5403.     }
  5404.  
  5405.     len = sprintf(buf, "%lu", sum);
  5406.  
  5407. #ifdef CONFIG_SMP
  5408.     for_each_online_cpu(cpu) {
  5409.         if (data[cpu] && len < PAGE_SIZE - 20)
  5410.             len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
  5411.     }
  5412. #endif
  5413.     kfree(data);
  5414.     return len + sprintf(buf + len, "\n");
  5415. }
  5416.  
  5417. static void clear_stat(struct kmem_cache *s, enum stat_item si)
  5418. {
  5419.     int cpu;
  5420.  
  5421.     for_each_online_cpu(cpu)
  5422.         per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
  5423. }
  5424.  
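        /*
         * STAT_ATTR(si, text) generates a text##_show()/text##_store() pair
         * on top of show_stat()/clear_stat() and registers them with
         * SLAB_ATTR(). Writing '0' to the file clears the counter on every
         * online cpu; any other value is rejected with -EINVAL.
         */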
  5425. #define STAT_ATTR(si, text)                     \
  5426. static ssize_t text##_show(struct kmem_cache *s, char *buf) \
  5427. {                               \
  5428.     return show_stat(s, buf, si);               \
  5429. }                               \
  5430. static ssize_t text##_store(struct kmem_cache *s,       \
  5431.                 const char *buf, size_t length) \
  5432. {                               \
  5433.     if (buf[0] != '0')                  \
  5434.         return -EINVAL;                 \
  5435.     clear_stat(s, si);                  \
  5436.     return length;                      \
  5437. }                               \
  5438. SLAB_ATTR(text);
  5439.  
  5440. STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
  5441. STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
  5442. STAT_ATTR(FREE_FASTPATH, free_fastpath);
  5443. STAT_ATTR(FREE_SLOWPATH, free_slowpath);
  5444. STAT_ATTR(FREE_FROZEN, free_frozen);
  5445. STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
  5446. STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
  5447. STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
  5448. STAT_ATTR(ALLOC_SLAB, alloc_slab);
  5449. STAT_ATTR(ALLOC_REFILL, alloc_refill);
  5450. STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
  5451. STAT_ATTR(FREE_SLAB, free_slab);
  5452. STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
  5453. STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
  5454. STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
  5455. STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
  5456. STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
  5457. STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
  5458. STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
  5459. STAT_ATTR(ORDER_FALLBACK, order_fallback);
  5460. STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
  5461. STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
  5462. STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
  5463. STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
  5464. STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
  5465. STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
  5466. #endif  /* CONFIG_SLUB_STATS */
  5467.  
  5468. static struct attribute *slab_attrs[] = {
  5469.     &slab_size_attr.attr,
  5470.     &object_size_attr.attr,
  5471.     &objs_per_slab_attr.attr,
  5472.     &order_attr.attr,
  5473.     &min_partial_attr.attr,
  5474.     &cpu_partial_attr.attr,
  5475.     &objects_attr.attr,
  5476.     &objects_partial_attr.attr,
  5477.     &partial_attr.attr,
  5478.     &cpu_slabs_attr.attr,
  5479.     &ctor_attr.attr,
  5480.     &aliases_attr.attr,
  5481.     &align_attr.attr,
  5482.     &hwcache_align_attr.attr,
  5483.     &reclaim_account_attr.attr,
  5484.     &destroy_by_rcu_attr.attr,
  5485.     &shrink_attr.attr,
  5486.     &slabs_cpu_partial_attr.attr,
  5487. #ifdef CONFIG_SLUB_DEBUG
  5488.     &total_objects_attr.attr,
  5489.     &slabs_attr.attr,
  5490.     &sanity_checks_attr.attr,
  5491.     &trace_attr.attr,
  5492.     &red_zone_attr.attr,
  5493.     &poison_attr.attr,
  5494.     &store_user_attr.attr,
  5495.     &validate_attr.attr,
  5496.     &alloc_calls_attr.attr,
  5497.     &free_calls_attr.attr,
  5498. #endif
  5499. #ifdef CONFIG_ZONE_DMA
  5500.     &cache_dma_attr.attr,
  5501. #endif
  5502. #ifdef CONFIG_NUMA
  5503.     &remote_node_defrag_ratio_attr.attr,
  5504. #endif
  5505. #ifdef CONFIG_SLUB_STATS
  5506.     &alloc_fastpath_attr.attr,
  5507.     &alloc_slowpath_attr.attr,
  5508.     &free_fastpath_attr.attr,
  5509.     &free_slowpath_attr.attr,
  5510.     &free_frozen_attr.attr,
  5511.     &free_add_partial_attr.attr,
  5512.     &free_remove_partial_attr.attr,
  5513.     &alloc_from_partial_attr.attr,
  5514.     &alloc_slab_attr.attr,
  5515.     &alloc_refill_attr.attr,
  5516.     &alloc_node_mismatch_attr.attr,
  5517.     &free_slab_attr.attr,
  5518.     &cpuslab_flush_attr.attr,
  5519.     &deactivate_full_attr.attr,
  5520.     &deactivate_empty_attr.attr,
  5521.     &deactivate_to_head_attr.attr,
  5522.     &deactivate_to_tail_attr.attr,
  5523.     &deactivate_remote_frees_attr.attr,
  5524.     &deactivate_bypass_attr.attr,
  5525.     &order_fallback_attr.attr,
  5526.     &cmpxchg_double_fail_attr.attr,
  5527.     &cmpxchg_double_cpu_fail_attr.attr,
  5528.     &cpu_partial_alloc_attr.attr,
  5529.     &cpu_partial_free_attr.attr,
  5530.     &cpu_partial_node_attr.attr,
  5531.     &cpu_partial_drain_attr.attr,
  5532. #endif
  5533. #ifdef CONFIG_FAILSLAB
  5534.     &failslab_attr.attr,
  5535. #endif
  5536.     &usersize_attr.attr,
  5537.  
  5538.     NULL
  5539. };
  5540.  
  5541. static const struct attribute_group slab_attr_group = {
  5542.     .attrs = slab_attrs,
  5543. };
  5544.  
  5545. static ssize_t slab_attr_show(struct kobject *kobj,
  5546.                 struct attribute *attr,
  5547.                 char *buf)
  5548. {
  5549.     struct slab_attribute *attribute;
  5550.     struct kmem_cache *s;
  5551.     int err;
  5552.  
  5553.     attribute = to_slab_attr(attr);
  5554.     s = to_slab(kobj);
  5555.  
  5556.     if (!attribute->show)
  5557.         return -EIO;
  5558.  
  5559.     err = attribute->show(s, buf);
  5560.  
  5561.     return err;
  5562. }
  5563.  
  5564. static ssize_t slab_attr_store(struct kobject *kobj,
  5565.                 struct attribute *attr,
  5566.                 const char *buf, size_t len)
  5567. {
  5568.     struct slab_attribute *attribute;
  5569.     struct kmem_cache *s;
  5570.     int err;
  5571.  
  5572.     attribute = to_slab_attr(attr);
  5573.     s = to_slab(kobj);
  5574.  
  5575.     if (!attribute->store)
  5576.         return -EIO;
  5577.  
  5578.     err = attribute->store(s, buf, len);
  5579. #ifdef CONFIG_MEMCG
  5580.     if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
  5581.         struct kmem_cache *c;
  5582.  
  5583.         mutex_lock(&slab_mutex);
  5584.         if (s->max_attr_size < len)
  5585.             s->max_attr_size = len;
  5586.  
  5587.         /*
  5588.          * This is a best-effort propagation, so this function's return
  5589.          * value will be determined by the parent cache only. This is
  5590.          * basically because not all attributes have well-defined
  5591.          * semantics for rollbacks - most of the actions have
  5592.          * permanent effects.
  5593.          *
  5594.          * Returning the error value of any of the children that fail
  5595.          * is not 100% defined, in the sense that users seeing the
  5596.          * error code won't be able to know anything about the state of
  5597.          * the cache.
  5598.          *
  5599.          * Only returning the error code for the parent cache at least
  5600.          * has well-defined semantics: the cache written to directly
  5601.          * either failed or succeeded, and if it succeeded we loop
  5602.          * through the descendants with best-effort propagation.
  5603.          */
  5604.         for_each_memcg_cache(c, s)
  5605.             attribute->store(c, buf, len);
  5606.         mutex_unlock(&slab_mutex);
  5607.     }
  5608. #endif
  5609.     return err;
  5610. }
  5611.  
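        /*
         * When a memcg child cache is created, replay the attributes that
         * were written to its root cache so the child starts out with the
         * same settings. Only attributes that have both a show and a store
         * method are copied, and nothing is done if no attribute was ever
         * written to the root cache.
         */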
  5612. static void memcg_propagate_slab_attrs(struct kmem_cache *s)
  5613. {
  5614. #ifdef CONFIG_MEMCG
  5615.     int i;
  5616.     char *buffer = NULL;
  5617.     struct kmem_cache *root_cache;
  5618.  
  5619.     if (is_root_cache(s))
  5620.         return;
  5621.  
  5622.     root_cache = s->memcg_params.root_cache;
  5623.  
  5624.     /*
  5625.      * This means the root cache had no attributes written. Therefore,
  5626.      * there is no point in copying default values around.
  5627.      */
  5628.     if (!root_cache->max_attr_size)
  5629.         return;
  5630.  
  5631.     for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
  5632.         char mbuf[64];
  5633.         char *buf;
  5634.         struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
  5635.         ssize_t len;
  5636.  
  5637.         if (!attr || !attr->store || !attr->show)
  5638.             continue;
  5639.  
  5640.         /*
  5641.          * It is really bad that we have to allocate here, so we will
  5642.          * do it only as a fallback. If we actually allocate, though,
  5643.          * we can just use the allocated buffer until the end.
  5644.          *
  5645.          * Most of the slub attributes will tend to be very small in
  5646.          * size, but sysfs allows buffers up to a page, so page-sized
  5647.          * values can theoretically happen.
  5648.          */
  5649.         if (buffer)
  5650.             buf = buffer;
  5651.         else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
  5652.              !IS_ENABLED(CONFIG_SLUB_STATS))
  5653.             buf = mbuf;
  5654.         else {
  5655.             buffer = (char *) get_zeroed_page(GFP_KERNEL);
  5656.             if (WARN_ON(!buffer))
  5657.                 continue;
  5658.             buf = buffer;
  5659.         }
  5660.  
  5661.         len = attr->show(root_cache, buf);
  5662.         if (len > 0)
  5663.             attr->store(s, buf, len);
  5664.     }
  5665.  
  5666.     if (buffer)
  5667.         free_page((unsigned long)buffer);
  5668. #endif  /* CONFIG_MEMCG */
  5669. }
  5670.  
  5671. static void kmem_cache_release(struct kobject *k)
  5672. {
  5673.     slab_kmem_cache_release(to_slab(k));
  5674. }
  5675.  
  5676. static const struct sysfs_ops slab_sysfs_ops = {
  5677.     .show = slab_attr_show,
  5678.     .store = slab_attr_store,
  5679. };
  5680.  
  5681. static struct kobj_type slab_ktype = {
  5682.     .sysfs_ops = &slab_sysfs_ops,
  5683.     .release = kmem_cache_release,
  5684. };
  5685.  
  5686. static int uevent_filter(struct kset *kset, struct kobject *kobj)
  5687. {
  5688.     struct kobj_type *ktype = get_ktype(kobj);
  5689.  
  5690.     if (ktype == &slab_ktype)
  5691.         return 1;
  5692.     return 0;
  5693. }
  5694.  
  5695. static const struct kset_uevent_ops slab_uevent_ops = {
  5696.     .filter = uevent_filter,
  5697. };
  5698.  
  5699. static struct kset *slab_kset;
  5700.  
  5701. static inline struct kset *cache_kset(struct kmem_cache *s)
  5702. {
  5703. #ifdef CONFIG_MEMCG
  5704.     if (!is_root_cache(s))
  5705.         return s->memcg_params.root_cache->memcg_kset;
  5706. #endif
  5707.     return slab_kset;
  5708. }
  5709.  
  5710. #define ID_STR_LENGTH 64
  5711.  
  5712. /*
  5713.  * Create a unique string id for a slab cache.
  5714.  * Format: [flags-]size
  5715.  */
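        /*
         * E.g. (illustrative): a mergeable cache with SLAB_CACHE_DMA set and
         * an s->size of 192 bytes gets the id ":d-0000192".
         */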
  5716. static char *create_unique_id(struct kmem_cache *s)
  5717. {
  5718.     char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
  5719.     char *p = name;
  5720.  
  5721.     BUG_ON(!name);
  5722.  
  5723.     *p++ = ':';
  5724.     /*
  5725.      * First flags affecting slabcache operations. We will only
  5726.      * get here for aliasable slabs so we do not need to support
  5727.      * too many flags. The flags here must cover all flags that
  5728.      * are matched during merging to guarantee that the id is
  5729.      * unique.
  5730.      */
  5731.     if (s->flags & SLAB_CACHE_DMA)
  5732.         *p++ = 'd';
  5733.     if (s->flags & SLAB_CACHE_DMA32)
  5734.         *p++ = 'D';
  5735.     if (s->flags & SLAB_RECLAIM_ACCOUNT)
  5736.         *p++ = 'a';
  5737.     if (s->flags & SLAB_CONSISTENCY_CHECKS)
  5738.         *p++ = 'F';
  5739.     if (s->flags & SLAB_ACCOUNT)
  5740.         *p++ = 'A';
  5741.     if (p != name + 1)
  5742.         *p++ = '-';
  5743.     p += sprintf(p, "%07u", s->size);
  5744.  
  5745.     BUG_ON(p > name + ID_STR_LENGTH - 1);
  5746.     return name;
  5747. }
  5748.  
  5749. static void sysfs_slab_remove_workfn(struct work_struct *work)
  5750. {
  5751.     struct kmem_cache *s =
  5752.         container_of(work, struct kmem_cache, kobj_remove_work);
  5753.  
  5754.     if (!s->kobj.state_in_sysfs)
  5755.         /*
  5756.          * For a memcg cache, this may be called during
  5757.          * deactivation and again on shutdown.  Remove only once.
  5758.          * A cache is never shut down before deactivation is
  5759.          * complete, so no need to worry about synchronization.
  5760.          */
  5761.         goto out;
  5762.  
  5763. #ifdef CONFIG_MEMCG
  5764.     kset_unregister(s->memcg_kset);
  5765. #endif
  5766.     kobject_uevent(&s->kobj, KOBJ_REMOVE);
  5767. out:
  5768.     kobject_put(&s->kobj);
  5769. }
  5770.  
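        /*
         * Register @s under /sys/kernel/slab. Unmergeable caches (typically
         * when debugging is enabled) are added under their own name;
         * mergeable caches get a unique ":<flags>-<size>" id from
         * create_unique_id() and the human readable name is added afterwards
         * as a symlink alias.
         */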
  5771. static int sysfs_slab_add(struct kmem_cache *s)
  5772. {
  5773.     int err;
  5774.     const char *name;
  5775.     struct kset *kset = cache_kset(s);
  5776.     int unmergeable = slab_unmergeable(s);
  5777.  
  5778.     INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
  5779.  
  5780.     if (!kset) {
  5781.         kobject_init(&s->kobj, &slab_ktype);
  5782.         return 0;
  5783.     }
  5784.  
  5785.     if (!unmergeable && disable_higher_order_debug &&
  5786.             (slub_debug & DEBUG_METADATA_FLAGS))
  5787.         unmergeable = 1;
  5788.  
  5789.     if (unmergeable) {
  5790.         /*
  5791.          * Slabcache can never be merged so we can use the name proper.
  5792.          * This is typically the case for debug situations. In that
  5793.          * case we can catch duplicate names easily.
  5794.          */
  5795.         sysfs_remove_link(&slab_kset->kobj, s->name);
  5796.         name = s->name;
  5797.     } else {
  5798.         /*
  5799.          * Create a unique name for the slab as a target
  5800.          * for the symlinks.
  5801.          */
  5802.         name = create_unique_id(s);
  5803.     }
  5804.  
  5805.     s->kobj.kset = kset;
  5806.     err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
  5807.     if (err)
  5808.         goto out;
  5809.  
  5810.     err = sysfs_create_group(&s->kobj, &slab_attr_group);
  5811.     if (err)
  5812.         goto out_del_kobj;
  5813.  
  5814. #ifdef CONFIG_MEMCG
  5815.     if (is_root_cache(s) && memcg_sysfs_enabled) {
  5816.         s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
  5817.         if (!s->memcg_kset) {
  5818.             err = -ENOMEM;
  5819.             goto out_del_kobj;
  5820.         }
  5821.     }
  5822. #endif
  5823.  
  5824.     kobject_uevent(&s->kobj, KOBJ_ADD);
  5825.     if (!unmergeable) {
  5826.         /* Setup first alias */
  5827.         sysfs_slab_alias(s, s->name);
  5828.     }
  5829. out:
  5830.     if (!unmergeable)
  5831.         kfree(name);
  5832.     return err;
  5833. out_del_kobj:
  5834.     kobject_del(&s->kobj);
  5835.     goto out;
  5836. }
  5837.  
  5838. static void sysfs_slab_remove(struct kmem_cache *s)
  5839. {
  5840.     if (slab_state < FULL)
  5841.         /*
  5842.          * Sysfs has not been setup yet so no need to remove the
  5843.          * cache from sysfs.
  5844.          */
  5845.         return;
  5846.  
  5847.     kobject_get(&s->kobj);
  5848.     schedule_work(&s->kobj_remove_work);
  5849. }
  5850.  
  5851. void sysfs_slab_unlink(struct kmem_cache *s)
  5852. {
  5853.     if (slab_state >= FULL)
  5854.         kobject_del(&s->kobj);
  5855. }
  5856.  
  5857. void sysfs_slab_release(struct kmem_cache *s)
  5858. {
  5859.     if (slab_state >= FULL)
  5860.         kobject_put(&s->kobj);
  5861. }
  5862.  
  5863. /*
  5864.  * Need to buffer aliases during bootup until sysfs becomes
  5865.  * available lest we lose that information.
  5866.  */
  5867. struct saved_alias {
  5868.     struct kmem_cache *s;
  5869.     const char *name;
  5870.     struct saved_alias *next;
  5871. };
  5872.  
  5873. static struct saved_alias *alias_list;
  5874.  
  5875. static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
  5876. {
  5877.     struct saved_alias *al;
  5878.  
  5879.     if (slab_state == FULL) {
  5880.         /*
  5881.          * If we have a leftover link then remove it.
  5882.          */
  5883.         sysfs_remove_link(&slab_kset->kobj, name);
  5884.         return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
  5885.     }
  5886.  
  5887.     al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
  5888.     if (!al)
  5889.         return -ENOMEM;
  5890.  
  5891.     al->s = s;
  5892.     al->name = name;
  5893.     al->next = alias_list;
  5894.     alias_list = al;
  5895.     return 0;
  5896. }
  5897.  
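        /*
         * Run once at boot via __initcall(): create the "slab" kset, switch
         * slab_state to FULL, register every cache that was created before
         * sysfs was available and replay the aliases queued up in alias_list
         * during early boot.
         */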
  5898. static int __init slab_sysfs_init(void)
  5899. {
  5900.     struct kmem_cache *s;
  5901.     int err;
  5902.  
  5903.     mutex_lock(&slab_mutex);
  5904.  
  5905.     slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
  5906.     if (!slab_kset) {
  5907.         mutex_unlock(&slab_mutex);
  5908.         pr_err("Cannot register slab subsystem.\n");
  5909.         return -ENOSYS;
  5910.     }
  5911.  
  5912.     slab_state = FULL;
  5913.  
  5914.     list_for_each_entry(s, &slab_caches, list) {
  5915.         err = sysfs_slab_add(s);
  5916.         if (err)
  5917.             pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
  5918.                    s->name);
  5919.     }
  5920.  
  5921.     while (alias_list) {
  5922.         struct saved_alias *al = alias_list;
  5923.  
  5924.         alias_list = alias_list->next;
  5925.         err = sysfs_slab_alias(al->s, al->name);
  5926.         if (err)
  5927.             pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
  5928.                    al->name);
  5929.         kfree(al);
  5930.     }
  5931.  
  5932.     mutex_unlock(&slab_mutex);
  5933.     resiliency_test();
  5934.     return 0;
  5935. }
  5936.  
  5937. __initcall(slab_sysfs_init);
  5938. #endif /* CONFIG_SYSFS */
  5939.  
  5940. /*
  5941.  * The /proc/slabinfo ABI
  5942.  */
  5943. #ifdef CONFIG_SLUB_DEBUG
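        /*
         * Fill in the /proc/slabinfo counters. SLUB does not keep separate
         * active and total slab counts, so active_slabs is reported equal to
         * num_slabs.
         */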
  5944. void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
  5945. {
  5946.     unsigned long nr_slabs = 0;
  5947.     unsigned long nr_objs = 0;
  5948.     unsigned long nr_free = 0;
  5949.     int node;
  5950.     struct kmem_cache_node *n;
  5951.  
  5952.     for_each_kmem_cache_node(s, node, n) {
  5953.         nr_slabs += node_nr_slabs(n);
  5954.         nr_objs += node_nr_objs(n);
  5955.         nr_free += count_partial(n, count_free);
  5956.     }
  5957.  
  5958.     sinfo->active_objs = nr_objs - nr_free;
  5959.     sinfo->num_objs = nr_objs;
  5960.     sinfo->active_slabs = nr_slabs;
  5961.     sinfo->num_slabs = nr_slabs;
  5962.     sinfo->objects_per_slab = oo_objects(s->oo);
  5963.     sinfo->cache_order = oo_order(s->oo);
  5964. }
  5965.  
  5966. void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
  5967. {
  5968. }
  5969.  
  5970. ssize_t slabinfo_write(struct file *file, const char __user *buffer,
  5971.                size_t count, loff_t *ppos)
  5972. {
  5973.     return -EIO;
  5974. }
  5975. #endif /* CONFIG_SLUB_DEBUG */