Reimplement RLIMIT_NPROC on top of ucounts
author: Alexey Gladkov <legion@kernel.org>
Thu, 22 Apr 2021 12:27:11 +0000 (14:27 +0200)
committer: Eric W. Biederman <ebiederm@xmission.com>
Fri, 30 Apr 2021 19:14:01 +0000 (14:14 -0500)
The rlimit counter is tied to the uid in the user_namespace. This allows
rlimit values to be specified in a userns even if they are already
globally exceeded by the user. However, the limits set in the parent
user_namespaces still cannot be exceeded.

To illustrate the impact of rlimits, let's say there is a program that
does not fork. Some service-A wants to run this program as user X in
multiple containers. Since the program never forks, the service wants to
set RLIMIT_NPROC=1.

service-A
 \- program (uid=1000, container1, rlimit_nproc=1)
 \- program (uid=1000, container2, rlimit_nproc=1)

Service-A sets RLIMIT_NPROC=1 and runs the program in container1.
When service-A then tries to run the program with RLIMIT_NPROC=1 in
container2, it fails because user X already has one running process.

We cannot use the existing inc_ucount() / dec_ucount() because they do not
allow us to exceed the maximum for the counter. Some rlimits may be
exceeded by root or by a user that has the appropriate capability.

Changelog

v11:
* Change inc_rlimit_ucounts() which now returns top value of ucounts.
* Drop inc_rlimit_ucounts_and_test() because the return code of
  inc_rlimit_ucounts() can be checked.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
fs/exec.c
include/linux/cred.h
include/linux/sched/user.h
include/linux/user_namespace.h
kernel/cred.c
kernel/exit.c
kernel/fork.c
kernel/sys.c
kernel/ucount.c
kernel/user.c
kernel/user_namespace.c

index d7c4187ca023e2e96b4295574efb8ec528b678f1..f2bcdbeb3afb7a970eb7dab58de3134014f6b25e 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
         * whether NPROC limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
-           atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+           is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                retval = -EAGAIN;
                goto out_ret;
        }
index 66436e655032854babefaee943f4510743b19966..5ca1e8a1d03541453c3c2427f91c634e9f83eeb6 100644 (file)
@@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
 
 #define task_uid(task)         (task_cred_xxx((task), uid))
 #define task_euid(task)                (task_cred_xxx((task), euid))
+#define task_ucounts(task)     (task_cred_xxx((task), ucounts))
 
 #define current_cred_xxx(xxx)                  \
 ({                                             \
@@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred)
 #define current_fsgid()        (current_cred_xxx(fsgid))
 #define current_cap()          (current_cred_xxx(cap_effective))
 #define current_user()         (current_cred_xxx(user))
+#define current_ucounts()      (current_cred_xxx(ucounts))
 
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
index a8ec3b6093fcbde053f49b09c4f91108bfd66e5d..d33d867ad6c12fa13c74625e9b372c462febe9cb 100644 (file)
@@ -12,7 +12,6 @@
  */
 struct user_struct {
        refcount_t __count;     /* reference count */
-       atomic_t processes;     /* How many processes does this user have? */
        atomic_t sigpending;    /* How many pending signals does this user have? */
 #ifdef CONFIG_FANOTIFY
        atomic_t fanotify_listeners;
index 80b5bf12feaeb7965248b828d8cb8f8002919344..4a97acc359903881b579cfa0525246773115bce4 100644 (file)
@@ -50,9 +50,12 @@ enum ucount_type {
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
 #endif
+       UCOUNT_RLIMIT_NPROC,
        UCOUNT_COUNTS,
 };
 
+#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
+
 struct user_namespace {
        struct uid_gid_map      uid_map;
        struct uid_gid_map      gid_map;
@@ -110,6 +113,15 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
 struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
 void put_ucounts(struct ucounts *ucounts);
 
+static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
+{
+       return atomic_long_read(&ucounts->ucount[type]);
+}
+
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
+
 #ifdef CONFIG_USER_NS
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
index 58a8a9e24347d4202a96a412e17ce2288db5c273..dcfa30b337c5a75cb9f78f17419e220c03d79e11 100644 (file)
@@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
                kdebug("share_creds(%p{%d,%d})",
                       p->cred, atomic_read(&p->cred->usage),
                       read_cred_subscribers(p->cred));
-               atomic_inc(&p->cred->user->processes);
+               inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
                return 0;
        }
 
@@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
        }
 #endif
 
-       atomic_inc(&new->user->processes);
        p->cred = p->real_cred = get_cred(new);
+       inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        alter_cred_subscribers(new, 2);
        validate_creds(new);
        return 0;
@@ -496,12 +496,12 @@ int commit_creds(struct cred *new)
         * in set_user().
         */
        alter_cred_subscribers(new, 2);
-       if (new->user != old->user)
-               atomic_inc(&new->user->processes);
+       if (new->user != old->user || new->user_ns != old->user_ns)
+               inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
        rcu_assign_pointer(task->real_cred, new);
        rcu_assign_pointer(task->cred, new);
        if (new->user != old->user)
-               atomic_dec(&old->user->processes);
+               dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
        alter_cred_subscribers(old, -2);
 
        /* send notifications */
index 04029e35e69af40942815b96e39ebb576f8394c3..61c0fe902b508e4ade6bf7fa1e5311e91b16a78c 100644 (file)
@@ -188,7 +188,7 @@ repeat:
        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
-       atomic_dec(&__task_cred(p)->user->processes);
+       dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        rcu_read_unlock();
 
        cgroup_release(p);
index 321a5e31d817e17a266ea36ab60614436913af0a..ed7dfb07178d346d21c5e385a570e71e1262acbb 100644 (file)
@@ -819,9 +819,11 @@ void __init fork_init(void)
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
 
-       for (i = 0; i < UCOUNT_COUNTS; i++)
+       for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;
 
+       init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
+
 #ifdef CONFIG_VMAP_STACK
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
@@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process(
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
        retval = -EAGAIN;
-       if (atomic_read(&p->real_cred->user->processes) >=
-                       task_rlimit(p, RLIMIT_NPROC)) {
+       if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_free;
@@ -2382,7 +2383,7 @@ bad_fork_cleanup_threadgroup_lock:
 #endif
        delayacct_tsk_free(p);
 bad_fork_cleanup_count:
-       atomic_dec(&p->cred->user->processes);
+       dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
 bad_fork_free:
        p->state = TASK_DEAD;
index cabfc5b861754887fdc44222e3465898e2e3b4ea..00266a65a0006a0c93efede2a27a2d5753eb6031 100644 (file)
@@ -473,7 +473,7 @@ static int set_user(struct cred *new)
         * for programs doing set*uid()+execve() by harmlessly deferring the
         * failure to the execve() stage.
         */
-       if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
+       if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
                        new_user != INIT_USER)
                current->flags |= PF_NPROC_EXCEEDED;
        else
index 365865f368ecd96118556eebe4e49ea0ad74760f..6caa56f7dec855d7c0bdb71b3af55596d98927b0 100644 (file)
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
        UCOUNT_ENTRY("max_inotify_instances"),
        UCOUNT_ENTRY("max_inotify_watches"),
 #endif
+       { },
        { }
 };
 #endif /* CONFIG_SYSCTL */
@@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
        put_ucounts(ucounts);
 }
 
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+       struct ucounts *iter;
+       long ret = 0;
+
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               long max = READ_ONCE(iter->ns->ucount_max[type]);
+               long new = atomic_long_add_return(v, &iter->ucount[type]);
+               if (new < 0 || new > max)
+                       ret = LONG_MAX;
+               else if (iter == ucounts)
+                       ret = new;
+       }
+       return ret;
+}
+
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+       struct ucounts *iter;
+       long new;
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+               WARN_ON_ONCE(dec < 0);
+               if (iter == ucounts)
+                       new = dec;
+       }
+       return (new == 0);
+}
+
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+{
+       struct ucounts *iter;
+       if (get_ucounts_value(ucounts, type) > max)
+               return true;
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               max = READ_ONCE(iter->ns->ucount_max[type]);
+               if (get_ucounts_value(iter, type) > max)
+                       return true;
+       }
+       return false;
+}
+
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
@@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void)
        BUG_ON(!setup_userns_sysctls(&init_user_ns));
 #endif
        hlist_add_ucounts(&init_ucounts);
+       inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
        return 0;
 }
 subsys_initcall(user_namespace_sysctl_init);
index a2478cddf536e440664a53d1e0f730c29c68f581..7f5ff498207a77cb4f542085feea75a4a7f7a8b1 100644 (file)
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
 /* root_user.__count is 1, for init task cred */
 struct user_struct root_user = {
        .__count        = REFCOUNT_INIT(1),
-       .processes      = ATOMIC_INIT(1),
        .sigpending     = ATOMIC_INIT(0),
        .locked_shm     = 0,
        .uid            = GLOBAL_ROOT_UID,
index f1b7b4b8ffa256f6912494a3f26c6b1c3b82c249..e6577c83507207c51b6b053add38eda901eb6be3 100644 (file)
@@ -119,9 +119,10 @@ int create_user_ns(struct cred *new)
        ns->owner = owner;
        ns->group = group;
        INIT_WORK(&ns->work, free_user_ns);
-       for (i = 0; i < UCOUNT_COUNTS; i++) {
+       for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
+       ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
        ns->ucounts = ucounts;
 
        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */