mm: handling Non-LRU pages returned by vm_normal_pages
authorAlex Sierra <alex.sierra@amd.com>
Fri, 15 Jul 2022 15:05:11 +0000 (10:05 -0500)
committerakpm <akpm@linux-foundation.org>
Mon, 18 Jul 2022 00:14:28 +0000 (17:14 -0700)
With DEVICE_COHERENT, we'll soon have vm_normal_page() return
device-managed anonymous pages that are not LRU pages.  Although they
behave like normal pages for purposes of mapping in CPU page tables and
for COW, they do not support LRU lists, NUMA migration or THP.

Callers to follow_page() currently don't expect ZONE_DEVICE pages,
however, with DEVICE_COHERENT we might now return ZONE_DEVICE.  Check for
ZONE_DEVICE pages in applicable users of follow_page() as well.

Link: https://lkml.kernel.org/r/20220715150521.18165-5-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> [v2]
Reviewed-by: Alistair Popple <apopple@nvidia.com> [v6]
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/proc/task_mmu.c
mm/huge_memory.c
mm/khugepaged.c
mm/ksm.c
mm/madvise.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/mlock.c
mm/mprotect.c

index 751c19d5bfdd926cd8624a22660f5140772c5281..1d7fd832123b44c3469b505bdfaa80fb132b6329 100644 (file)
@@ -1795,7 +1795,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
                return NULL;
 
        page = vm_normal_page(vma, addr, pte);
-       if (!page)
+       if (!page || is_zone_device_page(page))
                return NULL;
 
        if (PageReserved(page))
index 60d742c33de34d72511008bd71ad6ee910b58ab5..a563de8234c140c3f91083d9c6b7ddfd89278fd2 100644 (file)
@@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 
                if (IS_ERR(page))
                        continue;
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
 
                if (!is_transparent_hugepage(page))
index 01e0d6336754ece59f91a05eea787828f0ecf6a6..dea102170ab36d3676a9e20bc8760da2e636545c 100644 (file)
@@ -611,7 +611,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        goto out;
                }
                page = vm_normal_page(vma, address, pteval);
-               if (unlikely(!page)) {
+               if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out;
                }
@@ -1261,7 +1261,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        writable = true;
 
                page = vm_normal_page(vma, _address, pteval);
-               if (unlikely(!page)) {
+               if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out_unmap;
                }
@@ -1472,7 +1472,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                        goto abort;
 
                page = vm_normal_page(vma, addr, *pte);
-
+               if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+                       page = NULL;
                /*
                 * Note that uprobe, debugger, or MAP_PRIVATE may change the
                 * page table, but the new page will not be a subpage of hpage.
@@ -1490,6 +1491,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
                if (pte_none(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
+               if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+                       goto abort;
                page_remove_rmap(page, vma, false);
        }
 
index 8d2dc501c92c6f6e2009ee334745a2b56e576400..55f1d96348690ea55752d1f6d2adc1898d00bc26 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
                cond_resched();
                page = follow_page(vma, addr,
                                FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-               if (IS_ERR_OR_NULL(page))
+               if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
                        break;
                if (PageKsm(page))
                        ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
                goto out;
 
        page = follow_page(vma, addr, FOLL_GET);
-       if (IS_ERR_OR_NULL(page))
+       if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
                goto out;
        if (PageAnon(page)) {
                flush_anon_page(vma, page, addr);
@@ -2308,7 +2308,7 @@ next_mm:
                        if (ksm_test_exit(mm))
                                break;
                        *page = follow_page(vma, ksm_scan.address, FOLL_GET);
-                       if (IS_ERR_OR_NULL(*page)) {
+                       if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
                                ksm_scan.address += PAGE_SIZE;
                                cond_resched();
                                continue;
index e55108d4e4b2c1dd78cf3310a3557276f221068e..5f0f0948a50e4399a3115f6cd12ae8deae25f1bd 100644 (file)
@@ -421,7 +421,7 @@ regular_page:
                        continue;
 
                page = vm_normal_page(vma, addr, ptent);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
 
                /*
@@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                }
 
                page = vm_normal_page(vma, addr, ptent);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
 
                /*
index 580c62febe42eb8ea06b35b7de27de68c92dec89..dce0b2e686eb327dd57428b17fb536ca01b8f27c 100644 (file)
@@ -624,6 +624,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                if (is_zero_pfn(pfn))
                        return NULL;
                if (pte_devmap(pte))
+               /*
+                * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
+                * and will have refcounts incremented on their struct pages
+                * when they are inserted into PTEs, thus they are safe to
+                * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
+                * do not have refcounts. Example of legacy ZONE_DEVICE is
+                * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+                */
                        return NULL;
 
                print_bad_pte(vma, addr, pte, NULL);
@@ -4693,7 +4701,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        pte = pte_modify(old_pte, vma->vm_page_prot);
 
        page = vm_normal_page(vma, vmf->address, pte);
-       if (!page)
+       if (!page || is_zone_device_page(page))
                goto out_map;
 
        /* TODO: handle PTE-mapped THP */
index f4cd963550c1c5905b371b7ae76f0cf84fa7e050..88a5173c6ff077b74bad2e95a34a60a5575fbf42 100644 (file)
@@ -523,7 +523,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
                /*
                 * vm_normal_page() filters out zero pages, but there might
index 7934eebf168912cc98421b691ee195afb1a96111..1649270bc1a777019e4d647ea81fe3331b556179 100644 (file)
@@ -1630,7 +1630,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
                goto out;
 
        err = -ENOENT;
-       if (!page)
+       if (!page || is_zone_device_page(page))
                goto out;
 
        err = 0;
@@ -1821,7 +1821,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                if (IS_ERR(page))
                        goto set_status;
 
-               if (page) {
+               if (page && !is_zone_device_page(page)) {
                        err = page_to_nid(page);
                        put_page(page);
                } else {
index 716caf851043121dcc8bd94542b6ada0a2acd7c7..b14e929084ccaa5b86d5c9c199d7054b10e0ccab 100644 (file)
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
-               if (!page)
+               if (!page || is_zone_device_page(page))
                        continue;
                if (PageTransCompound(page))
                        continue;
index 996a97e213adcc32fc1ac30d9cfd4648c763ffc0..5ef478b06a7d38d3ee5de8c6851ba8363a4a889a 100644 (file)
@@ -127,7 +127,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
                                        continue;
 
                                page = vm_normal_page(vma, addr, oldpte);
-                               if (!page || PageKsm(page))
+                               if (!page || is_zone_device_page(page) || PageKsm(page))
                                        continue;
 
                                /* Also skip shared copy-on-write pages */