kernel: Implement selective syscall userspace redirection
authorGabriel Krisman Bertazi <krisman@collabora.com>
Fri, 27 Nov 2020 19:32:34 +0000 (14:32 -0500)
committerThomas Gleixner <tglx@linutronix.de>
Wed, 2 Dec 2020 14:07:56 +0000 (15:07 +0100)
Introduce a mechanism to quickly disable/enable syscall handling for a
specific process and redirect to userspace via SIGSYS.  This is useful
for processes with parts that require syscall redirection and parts that
don't, but who need to perform this boundary crossing really fast,
without paying the cost of a system call to reconfigure syscall handling
on each boundary transition.  This is particularly important for Windows
games running over Wine.

The proposed interface looks like this:

  prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <off>, <length>, [selector])

The range [<offset>,<offset>+<length>) is a part of the process memory
map that is allowed to by-pass the redirection code and dispatch
syscalls directly, such that in fast paths a process doesn't need to
disable the trap nor the kernel has to check the selector.  This is
essential to return from SIGSYS to a blocked area without triggering
another SIGSYS from rt_sigreturn.

selector is an optional pointer to a char-sized userspace memory region
that has a key switch for the mechanism. This key switch is set to
either PR_SYS_DISPATCH_ON, PR_SYS_DISPATCH_OFF to enable and disable the
redirection without calling the kernel.

The feature is meant to be set per-thread and it is disabled on
fork/clone/execv.

Internally, this doesn't add overhead to the syscall hot path, and it
requires very little per-architecture support.  I avoided using seccomp,
even though it duplicates some functionality, due to previous feedback
that maybe it shouldn't mix with seccomp since it is not a security
mechanism.  And obviously, this should never be considered a security
mechanism, since any part of the program can by-pass it by using the
syscall dispatcher.

For the sysinfo benchmark, which measures the overhead added to
executing a native syscall that doesn't require interception, the
overhead using only the direct dispatcher region to issue syscalls is
pretty much irrelevant.  The overhead of using the selector goes around
40ns for a native (unredirected) syscall in my system, and it is (as
expected) dominated by the supervisor-mode user-address access.  In
fact, with SMAP off, the overhead is consistently less than 5ns on my
test box.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com
fs/exec.c
include/linux/sched.h
include/linux/syscall_user_dispatch.h [new file with mode: 0644]
include/linux/thread_info.h
include/uapi/linux/prctl.h
kernel/entry/Makefile
kernel/entry/common.h [new file with mode: 0644]
kernel/entry/syscall_user_dispatch.c [new file with mode: 0644]
kernel/fork.c
kernel/sys.c

index 547a2390baf54f4e285fb41d3488025f1954aed9..aee36e5733ce4aee7202759a0d0279f93ba7f0b4 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -64,6 +64,7 @@
 #include <linux/compat.h>
 #include <linux/vmalloc.h>
 #include <linux/io_uring.h>
+#include <linux/syscall_user_dispatch.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm)
        flush_thread();
        me->personality &= ~bprm->per_clear;
 
+       clear_syscall_work_syscall_user_dispatch(me);
+
        /*
         * We have to apply CLOEXEC before we change whether the process is
         * dumpable (in setup_new_exec) to avoid a race with a process in userspace
index 063cd120b459339443db7b003df5cd22140316c6..5a24a033b3f8316f94517aba6aa9a2202ade9d67 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/types.h>
 #include <linux/signal_types.h>
+#include <linux/syscall_user_dispatch.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers.h>
@@ -965,6 +966,7 @@ struct task_struct {
        unsigned int                    sessionid;
 #endif
        struct seccomp                  seccomp;
+       struct syscall_user_dispatch    syscall_dispatch;
 
        /* Thread group tracking: */
        u64                             parent_exec_id;
diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h
new file mode 100644 (file)
index 0000000..a0ae443
--- /dev/null
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Collabora Ltd.
+ */
+#ifndef _SYSCALL_USER_DISPATCH_H
+#define _SYSCALL_USER_DISPATCH_H
+
+#include <linux/thread_info.h>
+
+#ifdef CONFIG_GENERIC_ENTRY
+
+struct syscall_user_dispatch {
+       char __user     *selector;
+       unsigned long   offset;
+       unsigned long   len;
+       bool            on_dispatch;
+};
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+                             unsigned long len, char __user *selector);
+
+#define clear_syscall_work_syscall_user_dispatch(tsk) \
+       clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH)
+
+#else
+struct syscall_user_dispatch {};
+
+static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+                                           unsigned long len, char __user *selector)
+{
+       return -EINVAL;
+}
+
+static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk)
+{
+}
+
+#endif /* CONFIG_GENERIC_ENTRY */
+
+#endif /* _SYSCALL_USER_DISPATCH_H */
index ca80a214df090576117cc2ef5ba9813f4267e469..c8a974cead730da476d67210898d04624fc97aad 100644 (file)
@@ -42,6 +42,7 @@ enum syscall_work_bit {
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
+       SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
 };
 
 #define SYSCALL_WORK_SECCOMP           BIT(SYSCALL_WORK_BIT_SECCOMP)
@@ -49,6 +50,7 @@ enum syscall_work_bit {
 #define SYSCALL_WORK_SYSCALL_TRACE     BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
 #define SYSCALL_WORK_SYSCALL_EMU       BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
 #define SYSCALL_WORK_SYSCALL_AUDIT     BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
 #endif
 
 #include <asm/thread_info.h>
index 7f0827705c9a461fcf0af8bf070a283f4bd708c1..90deb41c8a34637bc3d5bf5704f49729ef51c1e9 100644 (file)
@@ -247,4 +247,9 @@ struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER              57
 #define PR_GET_IO_FLUSHER              58
 
+/* Dispatch syscalls to a userspace handler */
+#define PR_SET_SYSCALL_USER_DISPATCH   59
+# define PR_SYS_DISPATCH_OFF           0
+# define PR_SYS_DISPATCH_ON            1
+
 #endif /* _LINUX_PRCTL_H */
index 34c8a3f1c735aac7254d5c63231891d4158ee9e7..095c775e001e279fccf0f64be469c859f7c0edf4 100644 (file)
@@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n
 CFLAGS_REMOVE_common.o  = -fstack-protector -fstack-protector-strong
 CFLAGS_common.o                += -fno-stack-protector
 
-obj-$(CONFIG_GENERIC_ENTRY)            += common.o
+obj-$(CONFIG_GENERIC_ENTRY)            += common.o syscall_user_dispatch.o
 obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK)   += kvm.o
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
new file mode 100644 (file)
index 0000000..f6e6d02
--- /dev/null
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _COMMON_H
+#define _COMMON_H
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+
+#endif
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
new file mode 100644 (file)
index 0000000..b0338a5
--- /dev/null
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Collabora Ltd.
+ */
+#include <linux/sched.h>
+#include <linux/prctl.h>
+#include <linux/syscall_user_dispatch.h>
+#include <linux/uaccess.h>
+#include <linux/signal.h>
+#include <linux/elf.h>
+
+#include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/syscall.h>
+
+#include "common.h"
+
+static void trigger_sigsys(struct pt_regs *regs)
+{
+       struct kernel_siginfo info;
+
+       clear_siginfo(&info);
+       info.si_signo = SIGSYS;
+       info.si_code = SYS_USER_DISPATCH;
+       info.si_call_addr = (void __user *)KSTK_EIP(current);
+       info.si_errno = 0;
+       info.si_arch = syscall_get_arch(current);
+       info.si_syscall = syscall_get_nr(current, regs);
+
+       force_sig_info(&info);
+}
+
+bool syscall_user_dispatch(struct pt_regs *regs)
+{
+       struct syscall_user_dispatch *sd = &current->syscall_dispatch;
+       char state;
+
+       if (likely(instruction_pointer(regs) - sd->offset < sd->len))
+               return false;
+
+       if (unlikely(arch_syscall_is_vdso_sigreturn(regs)))
+               return false;
+
+       if (likely(sd->selector)) {
+               /*
+                * access_ok() is performed once, at prctl time, when
+                * the selector is loaded by userspace.
+                */
+               if (unlikely(__get_user(state, sd->selector)))
+                       do_exit(SIGSEGV);
+
+               if (likely(state == PR_SYS_DISPATCH_OFF))
+                       return false;
+
+               if (state != PR_SYS_DISPATCH_ON)
+                       do_exit(SIGSYS);
+       }
+
+       sd->on_dispatch = true;
+       syscall_rollback(current, regs);
+       trigger_sigsys(regs);
+
+       return true;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+                             unsigned long len, char __user *selector)
+{
+       switch (mode) {
+       case PR_SYS_DISPATCH_OFF:
+               if (offset || len || selector)
+                       return -EINVAL;
+               break;
+       case PR_SYS_DISPATCH_ON:
+               /*
+                * Validate the direct dispatcher region just for basic
+                * sanity against overflow and a 0-sized dispatcher
+                * region.  If the user is able to submit a syscall from
+                * an address, that address is obviously valid.
+                */
+               if (offset && offset + len <= offset)
+                       return -EINVAL;
+
+               if (selector && !access_ok(selector, sizeof(*selector)))
+                       return -EFAULT;
+
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       current->syscall_dispatch.selector = selector;
+       current->syscall_dispatch.offset = offset;
+       current->syscall_dispatch.len = len;
+       current->syscall_dispatch.on_dispatch = false;
+
+       if (mode == PR_SYS_DISPATCH_ON)
+               set_syscall_work(SYSCALL_USER_DISPATCH);
+       else
+               clear_syscall_work(SYSCALL_USER_DISPATCH);
+
+       return 0;
+}
index 02b689a23457f26e0e8beafb29938b31c0c613f0..4a5ecb41f4400e97c721cc76f6853a1f20691684 100644 (file)
@@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);
+       clear_syscall_work_syscall_user_dispatch(tsk);
 
 #ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
index a730c03ee607cc5941628ed6fc1800f512d41467..51f00fe20e4d15fbf37a38d4a66d2a62166f8cce 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
 #include <linux/ctype.h>
+#include <linux/syscall_user_dispatch.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
                error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
                break;
+       case PR_SET_SYSCALL_USER_DISPATCH:
+               error = set_syscall_user_dispatch(arg2, arg3, arg4,
+                                                 (char __user *) arg5);
+               break;
        default:
                error = -EINVAL;
                break;