mm: implement memory-deny-write-execute as a prctl
author Joey Gouly <joey.gouly@arm.com>
Thu, 19 Jan 2023 16:03:43 +0000 (16:03 +0000)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 3 Feb 2023 06:33:24 +0000 (22:33 -0800)
Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)",
v2.

The background to this is that systemd has a configuration option called
MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter.  Its aim
is to prevent a user task from inadvertently creating an executable
mapping that is (or was) writeable.  Since such a BPF filter is stateless,
it cannot detect mappings that were previously writeable but subsequently
changed to read-only.  Therefore the filter simply rejects any
mprotect(PROT_EXEC).  The side-effect is that on arm64 with BTI support
(Branch Target Identification), the dynamic loader cannot change an ELF
section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect().  For
libraries, it can resort to unmapping and re-mapping but for the main
executable it does not have a file descriptor.  See the original bug report
in the Red Hat bugzilla [3] and the subsequent glibc workaround for
libraries [4].

This series adds in-kernel support for this feature as a prctl
PR_SET_MDWE, that is inherited on fork().  The prctl denies PROT_WRITE |
PROT_EXEC mappings.  Like the systemd BPF filter it also denies adding
PROT_EXEC to mappings.  However, unlike the BPF filter, it only denies it if
the mapping didn't previously have PROT_EXEC.  This allows changing PROT_EXEC
-> PROT_EXEC | PROT_BTI with mprotect(), which the BPF filter does not
permit.

This patch (of 2):

The aim of such policy is to prevent a user task from creating an
executable mapping that is also writeable.

An example of mmap() returning -EACCES if the policy is enabled:

mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0);

Similarly, mprotect() would return -EACCES below:

addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);

The BPF filter that systemd MDWE uses is stateless, and disallows
mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to
be enabled if it was already PROT_EXEC, which allows the following case:

addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI);

where PROT_BTI enables Branch Target Identification on arm64.

Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com
Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Co-developed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jeremy Linton <jeremy.linton@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: nd <nd@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Szabolcs Nagy <szabolcs.nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Cc: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mman.h
include/linux/sched/coredump.h
include/uapi/linux/prctl.h
kernel/sys.c
mm/mmap.c
mm/mprotect.c

index 58b3abd457a38df455a870eca02cb0e63c6a3fc0..cee1e4b566d80095822b516eeab1d42837772d77 100644 (file)
@@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags)
 }
 
 unsigned long vm_commit_limit(void);
+
+/*
+ * Denies creating a writable executable mapping or gaining executable permissions.
+ * Returns true if @vm_flags must be refused for @vma, false otherwise (always
+ * false when MMF_HAS_MDWE is not set on current->mm). This denies the following:
+ *
+ *     a)      mmap(PROT_WRITE | PROT_EXEC)
+ *
+ *     b)      mmap(PROT_WRITE)
+ *             mprotect(PROT_EXEC)
+ *
+ *     c)      mmap(PROT_WRITE)
+ *             mprotect(PROT_READ)
+ *             mprotect(PROT_EXEC)
+ *
+ * But allows the following:
+ *
+ *     d)      mmap(PROT_READ | PROT_EXEC)
+ *             mprotect(PROT_READ | PROT_EXEC | PROT_BTI)
+ */
+static inline bool map_deny_write_exec(struct vm_area_struct *vma,  unsigned long vm_flags)
+{
+       if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) /* policy not enabled for this mm */
+               return false;
+
+       if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) /* case a): W+X requested */
+               return true;
+
+       if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) /* cases b), c): gaining exec */
+               return true;
+
+       return false; /* e.g. case d): vma already has VM_EXEC and no VM_WRITE is requested */
+}
+
 #endif /* _LINUX_MMAN_H */
index 8270ad7ae14c2ab8f70c9d1b1687476dd8e06d56..0e17ae7fbfd357aced9b38f9e4ab780b4cfb743d 100644 (file)
@@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
  * lifecycle of this mm, just for simplicity.
  */
 #define MMF_HAS_PINNED         27      /* FOLL_PIN has run, never cleared */
+
+#define MMF_HAS_MDWE           28      /* memory-deny-write-execute enabled for this mm */
+#define MMF_HAS_MDWE_MASK      (1 << MMF_HAS_MDWE)     /* bit mask form, used in MMF_INIT_MASK */
+
 #define MMF_DISABLE_THP_MASK   (1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
-                                MMF_DISABLE_THP_MASK)
+                                MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
 #endif /* _LINUX_SCHED_COREDUMP_H */
index a5e06dcbba136d618c6dcf61f0cbb6e3bfe9ea2c..1312a137f7fb85e1b042139f7762b3bb9775fa8e 100644 (file)
@@ -281,6 +281,12 @@ struct prctl_mm_map {
 # define PR_SME_VL_LEN_MASK            0xffff
 # define PR_SME_VL_INHERIT             (1 << 17) /* inherit across exec */
 
+/* Memory deny write / execute */
+#define PR_SET_MDWE                    65
+# define PR_MDWE_REFUSE_EXEC_GAIN      1 /* arg2 flag for PR_SET_MDWE; also returned by PR_GET_MDWE */
+
+#define PR_GET_MDWE                    66
+
 #define PR_SET_VMA             0x53564d41
 # define PR_SET_VMA_ANON_NAME          0
 
index 5fd54bf0e886726dfb9fb955b0099239aedc9464..b3cab94545ed3f8b0289dc58825a0ae98e48d5ab 100644 (file)
@@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
+static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
+                                unsigned long arg4, unsigned long arg5) /* PR_SET_MDWE handler */
+{
+       if (arg3 || arg4 || arg5) /* unused arguments must be zero */
+               return -EINVAL;
+
+       if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) /* reject any unknown flag bit */
+               return -EINVAL;
+
+       if (bits & PR_MDWE_REFUSE_EXEC_GAIN) /* enable MDWE on the caller's mm */
+               set_bit(MMF_HAS_MDWE, &current->mm->flags);
+       else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+               return -EPERM; /* Cannot unset the flag */
+
+       return 0; /* flag now set, or bits == 0 while the flag was already clear */
+}
+
+static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
+                                unsigned long arg4, unsigned long arg5) /* PR_GET_MDWE handler */
+{
+       if (arg2 || arg3 || arg4 || arg5) /* takes no arguments; all must be zero */
+               return -EINVAL;
+
+       return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
+               PR_MDWE_REFUSE_EXEC_GAIN : 0; /* same flag value that PR_SET_MDWE accepts */
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
 {
@@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                error = sched_core_share_pid(arg2, arg3, arg4, arg5);
                break;
 #endif
+       case PR_SET_MDWE:
+               error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
+               break;
+       case PR_GET_MDWE:
+               error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
+               break;
        case PR_SET_VMA:
                error = prctl_set_vma(arg2, arg3, arg4, arg5);
                break;
index 335ba3df9898e4da0251a5375f5d823183d529f2..ffc0815cd7fb71ed3a402d1286e701c9ec9a956d 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2669,6 +2669,16 @@ cannot_expand:
                vma_set_anonymous(vma);
        }
 
+       if (map_deny_write_exec(vma, vma->vm_flags)) {
+               error = -EACCES;
+               if (file)
+                       goto close_and_free_vma;
+               else if (vma->vm_file)
+                       goto unmap_and_free_vma;
+               else
+                       goto free_vma;
+       }
+
        /* Allow architectures to sanity-check the vm_flags */
        if (!arch_validate_flags(vma->vm_flags)) {
                error = -EINVAL;
index 6ecdf0671b810c49286cc107c7e19e117ba5d83c..6a22f3ad9b84d2a68005b031e17ab63361193758 100644 (file)
@@ -799,6 +799,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
                        break;
                }
 
+               if (map_deny_write_exec(vma, newflags)) {
+                       error = -EACCES;
+                       goto out;
+               }
+
                /* Allow architectures to sanity-check the new flags */
                if (!arch_validate_flags(newflags)) {
                        error = -EINVAL;