mm/slub: optimize alloc fastpath code layout
author Vlastimil Babka <vbabka@suse.cz>
Mon, 13 Nov 2023 17:04:05 +0000 (18:04 +0100)
committer Vlastimil Babka <vbabka@suse.cz>
Wed, 6 Dec 2023 10:57:22 +0000 (11:57 +0100)
With allocation fastpaths no longer divided between two .c files, we
have better inlining. However, checking the disassembly of
kmem_cache_alloc() reveals we can do better: make the fastpaths smaller
and move the less common situations out of line or into separate
functions, to reduce instruction cache pressure.

- split the memcg pre/post alloc hooks into inlined checks that use likely()
  to assume there will be no objcg handling necessary, and non-inline
  functions doing the actual handling (see the sketch after this list)

- add some more likely/unlikely() to pre/post alloc hooks to indicate
  which scenarios should be out of line

- change the gfp_allowed_mask handling in slab_post_alloc_hook() so that the
  masking applies only to the flags passed to the kasan/kmsan/kmemleak hooks
  and can be optimized away when those are configured out

bloat-o-meter shows:
add/remove: 4/2 grow/shrink: 1/8 up/down: 521/-2924 (-2403)
Function                                     old     new   delta
__memcg_slab_post_alloc_hook                   -     461    +461
kmem_cache_alloc_bulk                        775     791     +16
__pfx_should_failslab.constprop                -      16     +16
__pfx___memcg_slab_post_alloc_hook             -      16     +16
should_failslab.constprop                      -      12     +12
__pfx_memcg_slab_post_alloc_hook              16       -     -16
kmem_cache_alloc_lru                        1295    1023    -272
kmem_cache_alloc_node                       1118     817    -301
kmem_cache_alloc                            1076     772    -304
kmalloc_node_trace                          1149     838    -311
kmalloc_trace                               1102     789    -313
__kmalloc_node_track_caller                 1393    1080    -313
__kmalloc_node                              1397    1082    -315
__kmalloc                                   1374    1059    -315
memcg_slab_post_alloc_hook                   464       -    -464
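
(The figures above are in the output format of scripts/bloat-o-meter;
presumably they come from comparing builds before and after this patch,
along the lines of ./scripts/bloat-o-meter vmlinux.old vmlinux.new.)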

Note that gcc still decided to inline __memcg_slab_pre_alloc_hook(), but
the inlined code ends up out of the hot path anyway; forcing noinline did
not improve the results. As a result the fastpaths are shorter and overall
code size is reduced.

Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
mm/slub.c

index 5683f1d02e4f4842dda5a99b399e68e1fcf8d294..77d259f3d592f20d400c2d023128109cb695e817 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1866,25 +1866,17 @@ static inline size_t obj_full_size(struct kmem_cache *s)
 /*
  * Returns false if the allocation should fail.
  */
-static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
-                                            struct list_lru *lru,
-                                            struct obj_cgroup **objcgp,
-                                            size_t objects, gfp_t flags)
+static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+                                       struct list_lru *lru,
+                                       struct obj_cgroup **objcgp,
+                                       size_t objects, gfp_t flags)
 {
-       struct obj_cgroup *objcg;
-
-       if (!memcg_kmem_online())
-               return true;
-
-       if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
-               return true;
-
        /*
         * The obtained objcg pointer is safe to use within the current scope,
         * defined by current task or set_active_memcg() pair.
         * obj_cgroup_get() is used to get a permanent reference.
         */
-       objcg = current_obj_cgroup();
+       struct obj_cgroup *objcg = current_obj_cgroup();
        if (!objcg)
                return true;
 
@@ -1907,17 +1899,34 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
        return true;
 }
 
-static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
-                                             struct obj_cgroup *objcg,
-                                             gfp_t flags, size_t size,
-                                             void **p)
+/*
+ * Returns false if the allocation should fail.
+ */
+static __fastpath_inline
+bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
+                              struct obj_cgroup **objcgp, size_t objects,
+                              gfp_t flags)
+{
+       if (!memcg_kmem_online())
+               return true;
+
+       if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
+               return true;
+
+       return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects,
+                                                 flags));
+}
+
+static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
+                                        struct obj_cgroup *objcg,
+                                        gfp_t flags, size_t size,
+                                        void **p)
 {
        struct slab *slab;
        unsigned long off;
        size_t i;
 
-       if (!memcg_kmem_online() || !objcg)
-               return;
+       flags &= gfp_allowed_mask;
 
        for (i = 0; i < size; i++) {
                if (likely(p[i])) {
@@ -1940,6 +1949,16 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
        }
 }
 
+static __fastpath_inline
+void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
+                               gfp_t flags, size_t size, void **p)
+{
+       if (likely(!memcg_kmem_online() || !objcg))
+               return;
+
+       return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+}
+
 static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
                                        void **p, int objects)
 {
@@ -3709,34 +3728,34 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
 }
 ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
 
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
-                                                    struct list_lru *lru,
-                                                    struct obj_cgroup **objcgp,
-                                                    size_t size, gfp_t flags)
+static __fastpath_inline
+struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+                                      struct list_lru *lru,
+                                      struct obj_cgroup **objcgp,
+                                      size_t size, gfp_t flags)
 {
        flags &= gfp_allowed_mask;
 
        might_alloc(flags);
 
-       if (should_failslab(s, flags))
+       if (unlikely(should_failslab(s, flags)))
                return NULL;
 
-       if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
+       if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)))
                return NULL;
 
        return s;
 }
 
-static inline void slab_post_alloc_hook(struct kmem_cache *s,
-                                       struct obj_cgroup *objcg, gfp_t flags,
-                                       size_t size, void **p, bool init,
-                                       unsigned int orig_size)
+static __fastpath_inline
+void slab_post_alloc_hook(struct kmem_cache *s,        struct obj_cgroup *objcg,
+                         gfp_t flags, size_t size, void **p, bool init,
+                         unsigned int orig_size)
 {
        unsigned int zero_size = s->object_size;
        bool kasan_init = init;
        size_t i;
-
-       flags &= gfp_allowed_mask;
+       gfp_t init_flags = flags & gfp_allowed_mask;
 
        /*
         * For kmalloc object, the allocated memory size(object_size) is likely
@@ -3769,13 +3788,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
         * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
         */
        for (i = 0; i < size; i++) {
-               p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init);
+               p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
                if (p[i] && init && (!kasan_init ||
                                     !kasan_has_integrated_init()))
                        memset(p[i], 0, zero_size);
                kmemleak_alloc_recursive(p[i], s->object_size, 1,
-                                        s->flags, flags);
-               kmsan_slab_alloc(s, p[i], flags);
+                                        s->flags, init_flags);
+               kmsan_slab_alloc(s, p[i], init_flags);
        }
 
        memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
@@ -3799,7 +3818,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
        bool init = false;
 
        s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
-       if (!s)
+       if (unlikely(!s))
                return NULL;
 
        object = kfence_alloc(s, orig_size, gfpflags);