mm: allow ->huge_fault() to be called without the mmap_lock held
author Matthew Wilcox (Oracle) <willy@infradead.org>
Fri, 18 Aug 2023 20:23:34 +0000 (21:23 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
Thu, 24 Aug 2023 23:20:29 +0000 (16:20 -0700)
Remove the checks for the VMA lock being held, allowing the page fault
path to call into the filesystem instead of retrying with the mmap_lock
held.  This will improve scalability for DAX page faults.  Also update the
documentation to match (and to catch up with some other changes that have
happened recently).

Link: https://lkml.kernel.org/r/20230818202335.2739663-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/filesystems/locking.rst
Documentation/filesystems/porting.rst
mm/memory.c

index 211a0305399223c6cb1a732c5cc1a3f28cfcca30..1a2cb60b24992c05dd81060defbdf73f62216584 100644 (file)
@@ -628,26 +628,29 @@ vm_operations_struct
 
 prototypes::
 
-       void (*open)(struct vm_area_struct*);
-       void (*close)(struct vm_area_struct*);
-       vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *);
+       void (*open)(struct vm_area_struct *);
+       void (*close)(struct vm_area_struct *);
+       vm_fault_t (*fault)(struct vm_fault *);
+       vm_fault_t (*huge_fault)(struct vm_fault *, unsigned int order);
+       vm_fault_t (*map_pages)(struct vm_fault *, pgoff_t start, pgoff_t end);
        vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
        vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *);
        int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
 
 locking rules:
 
-=============  =========       ===========================
+=============  ==========      ===========================
 ops            mmap_lock       PageLocked(page)
-=============  =========       ===========================
-open:          yes
-close:         yes
-fault:         yes             can return with page locked
-map_pages:     read
-page_mkwrite:  yes             can return with page locked
-pfn_mkwrite:   yes
-access:        yes
-=============  =========       ===========================
+=============  ==========      ===========================
+open:          write
+close:         read/write
+fault:         read            can return with page locked
+huge_fault:    maybe-read
+map_pages:     maybe-read
+page_mkwrite:  read            can return with page locked
+pfn_mkwrite:   read
+access:        read
+=============  ==========      ===========================
 
 ->fault() is called when a previously not present pte is about to be faulted
 in. The filesystem must find and return the page associated with the passed in
@@ -657,6 +660,13 @@ then ensure the page is not already truncated (invalidate_lock will block
 subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
 locked. The VM will unlock the page.
 
+->huge_fault() is called when there is no PUD or PMD entry present.  This
+gives the filesystem the opportunity to install a PUD or PMD sized page.
+Filesystems can also use the ->fault method to return a PMD sized page,
+so implementing this function may not be necessary.  In particular,
+filesystems should not call filemap_fault() from ->huge_fault().
+The mmap_lock is not necessarily held when this method is called.
+
 ->map_pages() is called when the VM asks to map easily accessible pages.
 Filesystem should find and map pages associated with offsets from "start_pgoff"
 till "end_pgoff". ->map_pages() is called with the RCU lock held and must
index d2d684ae77984f066b96fe028477fc5aec2e31ec..7ce352265de103ef96933b5b14d22c5327f2dd6a 100644 (file)
@@ -943,3 +943,14 @@ file pointer instead of struct dentry pointer.  d_tmpfile() is similarly
 changed to simplify callers.  The passed file is in a non-open state and on
 success must be opened before returning (e.g. by calling
 finish_open_simple()).
+
+---
+
+**mandatory**
+
+Calling convention for ->huge_fault has changed.  It now takes a page
+order instead of an enum page_entry_size, and it may be called without the
+mmap_lock held.  All in-tree users have been audited and do not seem to
+depend on the mmap_lock being held, but out-of-tree users should verify
+for themselves.  If they do need it, they can return VM_FAULT_RETRY to
+be called with the mmap_lock held.
index 50f44c1bfa195b8baeb88bbab35e778338ae7ce4..7a7e58729510df5daab3d4ffd481b3f2902edd74 100644 (file)
@@ -4854,13 +4854,8 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);
-       if (vma->vm_ops->huge_fault) {
-               if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-                       vma_end_read(vma);
-                       return VM_FAULT_RETRY;
-               }
+       if (vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
-       }
        return VM_FAULT_FALLBACK;
 }
 
@@ -4880,10 +4875,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                if (vma->vm_ops->huge_fault) {
-                       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-                               vma_end_read(vma);
-                               return VM_FAULT_RETRY;
-                       }
                        ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;
@@ -4904,13 +4895,8 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
        /* No support for anonymous transparent PUD pages yet */
        if (vma_is_anonymous(vma))
                return VM_FAULT_FALLBACK;
-       if (vma->vm_ops->huge_fault) {
-               if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-                       vma_end_read(vma);
-                       return VM_FAULT_RETRY;
-               }
+       if (vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-       }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        return VM_FAULT_FALLBACK;
 }
@@ -4927,10 +4913,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
                goto split;
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                if (vma->vm_ops->huge_fault) {
-                       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-                               vma_end_read(vma);
-                               return VM_FAULT_RETRY;
-                       }
                        ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
                        if (!(ret & VM_FAULT_FALLBACK))
                                return ret;