mm: make PTE_MARKER_SWAPIN_ERROR more general
author Axel Rasmussen <axelrasmussen@google.com>
Fri, 7 Jul 2023 21:55:33 +0000 (14:55 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 18 Aug 2023 17:12:16 +0000 (10:12 -0700)
Patch series "add UFFDIO_POISON to simulate memory poisoning with UFFD",
v4.

This series adds a new userfaultfd feature, UFFDIO_POISON. See commit 4
for a detailed description of the feature.

This patch (of 8):

Future patches will reuse PTE_MARKER_SWAPIN_ERROR to implement
UFFDIO_POISON, so make several preparations for that:

First, rename it to just PTE_MARKER_POISONED.  The "SWAPIN" can be
confusing since we're going to re-use it for something not really related
to swap.  This can be particularly confusing for things like hugetlbfs,
which doesn't support swap whatsoever.  Also rename the related helper
functions to match.

Next, fix pte marker copying for hugetlbfs.  Previously, it would WARN on
seeing a PTE_MARKER_SWAPIN_ERROR, since hugetlbfs doesn't support swap.
But, since we're going to re-use it, we want it to go ahead and copy it
just like non-hugetlbfs memory does today.  Since the code to do this is
more complicated now, pull it out into a helper which can be re-used in
both places.  While we're at it, also make it slightly more explicit in
its handling of e.g.  uffd wp markers.
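
As a worked example of the new helper's semantics, here is a minimal
userspace mock of its bit logic (the marker bits are taken from this
patch; the mock_ names and the dst_wp flag, standing in for
userfaultfd_wp(dst_vma), are illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    #define BIT(n)               (1UL << (n))
    #define PTE_MARKER_UFFD_WP   BIT(0)
    #define PTE_MARKER_POISONED  BIT(1)

    typedef unsigned long pte_marker;

    /* Mirrors copy_pte_marker(): poison is always propagated; the
     * uffd-wp bit is propagated only if the dst vma is registered. */
    static pte_marker mock_copy_pte_marker(pte_marker srcm, bool dst_wp)
    {
            pte_marker dstm = srcm & PTE_MARKER_POISONED;

            if ((srcm & PTE_MARKER_UFFD_WP) && dst_wp)
                    dstm |= PTE_MARKER_UFFD_WP;

            return dstm;
    }

    int main(void)
    {
            /* Poisoned + wp source copied into a non-uffd-wp dst vma:
             * only the poison bit survives, so this prints 2. */
            printf("%lu\n", mock_copy_pte_marker(
                    PTE_MARKER_POISONED | PTE_MARKER_UFFD_WP, false));
            return 0;
    }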

For non-hugetlbfs page faults, instead of returning VM_FAULT_SIGBUS for an
error entry, return VM_FAULT_HWPOISON.  For most cases this change doesn't
matter, e.g.  a userspace program would receive a SIGBUS either way.  But
for UFFDIO_POISON, this change will let KVM guests get an MCE out of the
box, instead of giving a SIGBUS to the hypervisor and requiring it to
somehow inject an MCE.
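
For reference on what that difference looks like from userspace: a
VM_FAULT_HWPOISON fault is delivered as SIGBUS with si_code
BUS_MCEERR_AR, while a plain VM_FAULT_SIGBUS arrives as BUS_ADRERR.  A
minimal sketch of how a VMM might tell the two apart (not part of this
patch; this relies only on standard Linux signal semantics):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdlib.h>

    static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
    {
            /* BUS_MCEERR_AR is the memory-failure ("hwpoison") code;
             * anything else is an ordinary SIGBUS. */
            _exit(info->si_code == BUS_MCEERR_AR ? 2 : 1);
    }

    int main(void)
    {
            struct sigaction sa = {
                    .sa_sigaction = sigbus_handler,
                    .sa_flags = SA_SIGINFO,
            };

            sigaction(SIGBUS, &sa, NULL);
            /* ... touch memory that may carry PTE_MARKER_POISONED ... */
            return 0;
    }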

Finally, for hugetlbfs faults, handle PTE_MARKER_POISONED, and return
VM_FAULT_HWPOISON_LARGE in such cases.  Note that this can't happen today
because the lack of swap support means we'll never end up with such a PTE
anyway, but this behavior will be needed once such entries *can* show up
via UFFDIO_POISON.
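
For context, the userspace side that will eventually create such entries
looks roughly like the below.  This is a hedged sketch: the UFFDIO_POISON
ioctl and struct uffdio_poison are introduced by later patches in this
series, so the exact field names here are assumptions, not something
defined by this patch:

    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    /* Install PTE_MARKER_POISONED over [start, start + len); later
     * faults on the range then report VM_FAULT_HWPOISON(_LARGE). */
    static int poison_range(int uffd, unsigned long start,
                            unsigned long len)
    {
            struct uffdio_poison args = {
                    .range = { .start = start, .len = len },
                    .mode = 0,
            };

            return ioctl(uffd, UFFDIO_POISON, &args);
    }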

Link: https://lkml.kernel.org/r/20230707215540.2324998-1-axelrasmussen@google.com
Link: https://lkml.kernel.org/r/20230707215540.2324998-2-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gaosheng Cui <cuigaosheng1@huawei.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: ZhangPeng <zhangpeng362@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm_inline.h
include/linux/swapops.h
mm/hugetlb.c
mm/madvise.c
mm/memory.c
mm/mprotect.c
mm/shmem.c
mm/swapfile.c

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 21d6c72bcc71e334070df8b72a20b187543ebba2..a86c84600787763359e3e588009a1b29ef44f91a 100644
@@ -523,6 +523,25 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
        return atomic_read(&mm->tlb_flush_pending) > 1;
 }
 
+/*
+ * Computes the pte marker to copy from the given source entry into dst_vma.
+ * If no marker should be copied, returns 0.
+ * The caller should insert a new pte created with make_pte_marker().
+ */
+static inline pte_marker copy_pte_marker(
+               swp_entry_t entry, struct vm_area_struct *dst_vma)
+{
+       pte_marker srcm = pte_marker_get(entry);
+       /* Always copy error entries. */
+       pte_marker dstm = srcm & PTE_MARKER_POISONED;
+
+       /* Only copy PTE markers if UFFD register matches. */
+       if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
+               dstm |= PTE_MARKER_UFFD_WP;
+
+       return dstm;
+}
+
 /*
  * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
  * replace a none pte.  NOTE!  This should only be called when *pte is already
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4c932cb45e0b410b7e89f431698f054e93a96f0f..bff1e8d97de0e089a70ed3f9aa872bf050955087 100644
@@ -393,7 +393,12 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
 typedef unsigned long pte_marker;
 
 #define  PTE_MARKER_UFFD_WP                    BIT(0)
-#define  PTE_MARKER_SWAPIN_ERROR               BIT(1)
+/*
+ * "Poisoned" here is meant in the very general sense of "future accesses are
+ * invalid", instead of referring very specifically to hardware memory errors.
+ * This marker is meant to represent any of various different causes of this.
+ */
+#define  PTE_MARKER_POISONED                   BIT(1)
 #define  PTE_MARKER_MASK                       (BIT(2) - 1)
 
 static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
@@ -421,15 +426,15 @@ static inline pte_t make_pte_marker(pte_marker marker)
        return swp_entry_to_pte(make_pte_marker_entry(marker));
 }
 
-static inline swp_entry_t make_swapin_error_entry(void)
+static inline swp_entry_t make_poisoned_swp_entry(void)
 {
-       return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR);
+       return make_pte_marker_entry(PTE_MARKER_POISONED);
 }
 
-static inline int is_swapin_error_entry(swp_entry_t entry)
+static inline int is_poisoned_swp_entry(swp_entry_t entry)
 {
        return is_pte_marker_entry(entry) &&
-           (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR);
+           (pte_marker_get(entry) & PTE_MARKER_POISONED);
 }
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e3839eee465790acd8b8400f44fe97e2d2d8266a..ffee2978dfed4ba367f466977c51e414e022fe6c 100644
@@ -34,6 +34,7 @@
 #include <linux/nospec.h>
 #include <linux/delayacct.h>
 #include <linux/memory.h>
+#include <linux/mm_inline.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -5101,15 +5102,12 @@ again:
                                entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                } else if (unlikely(is_pte_marker(entry))) {
-                       /* No swap on hugetlb */
-                       WARN_ON_ONCE(
-                           is_swapin_error_entry(pte_to_swp_entry(entry)));
-                       /*
-                        * We copy the pte marker only if the dst vma has
-                        * uffd-wp enabled.
-                        */
-                       if (userfaultfd_wp(dst_vma))
-                               set_huge_pte_at(dst, addr, dst_pte, entry);
+                       pte_marker marker = copy_pte_marker(
+                               pte_to_swp_entry(entry), dst_vma);
+
+                       if (marker)
+                               set_huge_pte_at(dst, addr, dst_pte,
+                                               make_pte_marker(marker));
                } else {
                        entry = huge_ptep_get(src_pte);
                        pte_folio = page_folio(pte_page(entry));
@@ -6089,14 +6087,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        entry = huge_ptep_get(ptep);
-       /* PTE markers should be handled the same way as none pte */
-       if (huge_pte_none_mostly(entry))
+       if (huge_pte_none_mostly(entry)) {
+               if (is_pte_marker(entry)) {
+                       pte_marker marker =
+                               pte_marker_get(pte_to_swp_entry(entry));
+
+                       if (marker & PTE_MARKER_POISONED) {
+                               ret = VM_FAULT_HWPOISON_LARGE;
+                               goto out_mutex;
+                       }
+               }
+
                /*
+                * Other PTE markers should be handled the same way as none PTE.
+                *
                 * hugetlb_no_page will drop vma lock and hugetlb fault
                 * mutex internally, which make us return immediately.
                 */
                return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
                                      entry, flags);
+       }
 
        ret = 0;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 05f97038eac3da08ea18cbe50815816faa12ba0b..da65f8bd9ac33b80c1f1bf58ce4502f8e7de4f0f 100644
@@ -664,7 +664,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                free_swap_and_cache(entry);
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
-                                  is_swapin_error_entry(entry)) {
+                                  is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
diff --git a/mm/memory.c b/mm/memory.c
index ff19719da03261d7fd5e8f30bf04e37b75c86aa1..36b164ee9ffb0b752af565f9dee7ab09b5c85cd5 100644
@@ -860,8 +860,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        return -EBUSY;
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
-               if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
-                       set_pte_at(dst_mm, addr, dst_pte, pte);
+               pte_marker marker = copy_pte_marker(entry, dst_vma);
+
+               if (marker)
+                       set_pte_at(dst_mm, addr, dst_pte,
+                                  make_pte_marker(marker));
                return 0;
        }
        if (!userfaultfd_wp(dst_vma))
@@ -1502,7 +1505,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                            !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
-                          is_swapin_error_entry(entry)) {
+                          is_poisoned_swp_entry(entry)) {
                        if (!should_zap_cows(details))
                                continue;
                } else {
@@ -3651,7 +3654,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
         * none pte.  Otherwise it means the pte could have changed, so retry.
         *
         * This should also cover the case where e.g. the pte changed
-        * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+        * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
         * So is_pte_marker() check is not enough to safely drop the pte.
         */
        if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
@@ -3697,8 +3700,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
                return VM_FAULT_SIGBUS;
 
        /* Higher priority than uffd-wp when data corrupted */
-       if (marker & PTE_MARKER_SWAPIN_ERROR)
-               return VM_FAULT_SIGBUS;
+       if (marker & PTE_MARKER_POISONED)
+               return VM_FAULT_HWPOISON;
 
        if (pte_marker_entry_uffd_wp(entry))
                return pte_marker_handle_uffd_wp(vmf);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6f658d4837047836e5a97c7ccf6e373929fc1a74..5c3112d9246648af560b7e7dd619b415fbef3495 100644
@@ -230,10 +230,10 @@ static long change_pte_range(struct mmu_gather *tlb,
                                        newpte = pte_swp_mkuffd_wp(newpte);
                        } else if (is_pte_marker_entry(entry)) {
                                /*
-                                * Ignore swapin errors unconditionally,
+                                * Ignore error swap entries unconditionally,
                                 * because any access should sigbus anyway.
                                 */
-                               if (is_swapin_error_entry(entry))
+                               if (is_poisoned_swp_entry(entry))
                                        continue;
                                /*
                                 * If this is uffd-wp pte marker and we'd like
diff --git a/mm/shmem.c b/mm/shmem.c
index 8dfd72bdc86ab88b20670e2e145456145fe42239..235f2b2fd20251e9794148fcb762fd55a2e0afcc 100644
@@ -1707,7 +1707,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
        swp_entry_t swapin_error;
        void *old;
 
-       swapin_error = make_swapin_error_entry();
+       swapin_error = make_poisoned_swp_entry();
        old = xa_cmpxchg_irq(&mapping->i_pages, index,
                             swp_to_radix_entry(swap),
                             swp_to_radix_entry(swapin_error), 0);
@@ -1752,7 +1752,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
        swap = radix_to_swp_entry(*foliop);
        *foliop = NULL;
 
-       if (is_swapin_error_entry(swap))
+       if (is_poisoned_swp_entry(swap))
                return -EIO;
 
        si = get_swap_device(swap);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d996c335fc3c2afd6d513eb081c0b11c199e4a07..346e22b8ae970cbeed27665cccc64872aa480c15 100644
@@ -1771,7 +1771,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                        swp_entry = make_hwpoison_entry(swapcache);
                        page = swapcache;
                } else {
-                       swp_entry = make_swapin_error_entry();
+                       swp_entry = make_poisoned_swp_entry();
                }
                new_pte = swp_entry_to_pte(swp_entry);
                ret = 0;