* PageAnonExclusive() has to protect against concurrent GUP:
* * Ordinary GUP: Using the PT lock
* * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration):
- * clear/invalidate+flush of the page table entry
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
*
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
*/
if (!PageAnon(page))
return false;
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
/*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
* @page: the exclusive anonymous page to try marking possibly shared
*
* The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ * cleared/invalidated.
*
* This is similar to page_try_dup_anon_rmap(), however, not used during fork()
* to duplicate a mapping, but instead to prepare for KSM or temporarily
{
VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
- /* See page_try_dup_anon_rmap(). */
- if (likely(!is_device_private_page(page) &&
- unlikely(page_maybe_dma_pinned(page))))
- return -EBUSY;
+ /* device private pages cannot get pinned via GUP. */
+ if (unlikely(is_device_private_page(page))) {
+ ClearPageAnonExclusive(page);
+ return 0;
+ }
+ /*
+ * We have to make sure that when we clear PageAnonExclusive, that
+ * the page is not pinned and that concurrent GUP-fast won't succeed in
+ * concurrently pinning the page.
+ *
+ * Conceptually, PageAnonExclusive clearing consists of:
+ * (A1) Clear PTE
+ * (A2) Check if the page is pinned; back off if so.
+ * (A3) Clear PageAnonExclusive
+ * (A4) Restore PTE (optional, but certainly not writable)
+ *
+ * When clearing PageAnonExclusive, we cannot possibly map the page
+ * writable again, because anon pages that may be shared must never
+ * be writable. So in any case, if the PTE was writable it cannot
+ * be writable anymore afterwards and there would be a PTE change. Only
+ * if the PTE wasn't writable, there might not be a PTE change.
+ *
+ * Conceptually, GUP-fast pinning of an anon page consists of:
+ * (B1) Read the PTE
+ * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
+ * (B3) Pin the mapped page
+ * (B4) Check if the PTE changed by re-reading it; back off if so.
+ * (B5) If the original PTE is not writable, check if
+ * PageAnonExclusive is not set; back off if so.
+ *
+ * If the PTE was writable, we only have to make sure that GUP-fast
+ * observes a PTE change and properly backs off.
+ *
+ * If the PTE was not writable, we have to make sure that GUP-fast either
+ * detects a (temporary) PTE change or that PageAnonExclusive is cleared
+ * and properly backs off.
+ *
+ * Consequently, when clearing PageAnonExclusive(), we have to make
+ * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
+ * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
+ * and (B5) happen in the right memory order.
+ *
+ * We assume that there might not be a memory barrier after
+ * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
+ * so we use explicit ones here.
+ */
+
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+
+ if (unlikely(page_maybe_dma_pinned(page)))
+ return -EBUSY;
ClearPageAnonExclusive(page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
return 0;
}
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- /*
- * Nuke the page table entry. When having to clear
- * PageAnonExclusive(), we always have to flush.
- */
- if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
page_vma_mapped_walk_done(&pvmw);
break;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
swap_free(entry);
}
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
!anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))