cgroup: Reorganize css_set_lock and kernfs path processing
author: Michal Koutný <mkoutny@suse.com>
Mon, 10 Oct 2022 08:29:18 +0000 (10:29 +0200)
committer: Tejun Heo <tj@kernel.org>
Mon, 10 Oct 2022 20:23:18 +0000 (10:23 -1000)
Commit 74e4b956eb1c incorrectly called kernfs_walk_and_get() (which
might sleep) while holding css_set_lock (a spinlock). css_set_lock is
needed by __cset_cgroup_from_root() to ensure stable cset->cgrp_links,
but not by kernfs_walk_and_get().

We only need to make sure that the returned root_cgrp won't be freed
under us. This is guaranteed in the case of the global root because it
is static (cgrp_dfl_root.cgrp). When the root_cgrp is lower in the
hierarchy, it is pinned by cgroup_ns->root_cset (and the `current` task
cannot switch namespaces asynchronously, so nsproxy pins cgroup_ns).

Note this reasoning won't hold for root cgroups in v1 hierarchies,
therefore create a special-cased helper function just for the default
hierarchy.

Fixes: 74e4b956eb1c ("cgroup: Honor caller's cgroup NS when resolving path")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/cgroup/cgroup.c

index 764bdd5fd8d14ecd24287715983800ee621a8871..ecf409e3c3a72a838540a8676ed96ab34b8aed86 100644 (file)
@@ -1392,6 +1392,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
        cgroup_free_root(root);
 }
 
+/*
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
+ */
 static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
 {
@@ -1403,6 +1406,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
                res_cgroup = cset->dfl_cgrp;
        } else {
                struct cgrp_cset_link *link;
+               lockdep_assert_held(&css_set_lock);
 
                list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                        struct cgroup *c = link->cgrp;
@@ -1414,6 +1418,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
                }
        }
 
+       BUG_ON(!res_cgroup);
        return res_cgroup;
 }
 
@@ -1436,23 +1441,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
 
        rcu_read_unlock();
 
-       BUG_ON(!res);
        return res;
 }
 
+/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ *   pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ *   switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+       struct css_set *cset;
+
+       cset = current->nsproxy->cgroup_ns->root_cset;
+       return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+}
+
 /* look up cgroup associated with given css_set on the specified hierarchy */
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
 {
-       struct cgroup *res = NULL;
-
        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);
 
-       res = __cset_cgroup_from_root(cset, root);
-
-       BUG_ON(!res);
-       return res;
+       return __cset_cgroup_from_root(cset, root);
 }
 
 /*
@@ -6105,9 +6123,7 @@ struct cgroup *cgroup_get_from_id(u64 id)
        if (!cgrp)
                return ERR_PTR(-ENOENT);
 
-       spin_lock_irq(&css_set_lock);
-       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
-       spin_unlock_irq(&css_set_lock);
+       root_cgrp = current_cgns_cgroup_dfl();
        if (!cgroup_is_descendant(cgrp, root_cgrp)) {
                cgroup_put(cgrp);
                return ERR_PTR(-ENOENT);
@@ -6686,10 +6702,8 @@ struct cgroup *cgroup_get_from_path(const char *path)
        struct cgroup *cgrp = ERR_PTR(-ENOENT);
        struct cgroup *root_cgrp;
 
-       spin_lock_irq(&css_set_lock);
-       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+       root_cgrp = current_cgns_cgroup_dfl();
        kn = kernfs_walk_and_get(root_cgrp->kn, path);
-       spin_unlock_irq(&css_set_lock);
        if (!kn)
                goto out;