iommu/amd: Initial support for AMD IOMMU v2 page table
author    Vasant Hegde <vasant.hegde@amd.com>
Thu, 25 Aug 2022 06:39:36 +0000 (06:39 +0000)
committer Joerg Roedel <jroedel@suse.de>
Wed, 7 Sep 2022 14:12:35 +0000 (16:12 +0200)
Introduce IO page table framework support for the AMD IOMMU v2 page table.
This patch implements a 4-level page table within the AMD IOMMU driver and
supports the 4K/2M/1G page sizes.
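
A rough usage sketch (illustrative only; this patch adds no caller, and the
function name below is made up): a consumer inside the AMD driver would pick
this format through the generic io-pgtable framework. It assumes a protection
domain whose GCR3 table is already set up (PD_IOMMUV2), and it must pass the
io_pgtable_cfg embedded in the domain's amd_io_pgtable (pgtbl_cfg, not shown
in this hunk), because v2_alloc_pgtable() recovers its amd_io_pgtable from
the cfg pointer.

  static int example_v2_map_2m(struct protection_domain *pdom,
                               unsigned long iova, phys_addr_t paddr)
  {
          struct io_pgtable_ops *ops;
          size_t mapped = 0;

          /* cookie is the protection domain; cfg must be pdom->iop.pgtbl_cfg */
          ops = alloc_io_pgtable_ops(AMD_IOMMU_V2, &pdom->iop.pgtbl_cfg, pdom);
          if (!ops)
                  return -ENOMEM;

          /* IOMMU_PROT_* are the AMD driver's protection flags */
          return ops->map_pages(ops, iova, paddr, SZ_2M, 1,
                                IOMMU_PROT_IR | IOMMU_PROT_IW,
                                GFP_KERNEL, &mapped);
  }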

Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Link: https://lore.kernel.org/r/20220825063939.8360-7-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
drivers/iommu/amd/Makefile
drivers/iommu/amd/amd_iommu_types.h
drivers/iommu/amd/io_pgtable_v2.c [new file with mode: 0644]
drivers/iommu/io-pgtable.c
include/linux/io-pgtable.h

index a935f8f4b9744d7d1fb8405e35672d3025686520..773d8aa002837eebce6d25c3cfea5e9b28bb85c3 100644 (file)
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o
+obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o
 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
 obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o
index 5b1019dab328569b7e50740715756c257b76e68a..660c1f044912a91aa4fee3eb082a1b229190eb8f 100644 (file)
  * 512GB Pages are not supported due to a hardware bug
  */
 #define AMD_IOMMU_PGSIZES      ((~0xFFFUL) & ~(2ULL << 38))
+/* 4K, 2MB, 1G page sizes are supported */
+#define AMD_IOMMU_PGSIZES_V2   (PAGE_SIZE | (1ULL << 21) | (1ULL << 30))
 
 /* Bit value definition for dte irq remapping fields*/
 #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6)
@@ -526,7 +528,8 @@ struct amd_io_pgtable {
        struct io_pgtable       iop;
        int                     mode;
        u64                     *root;
-       atomic64_t              pt_root;    /* pgtable root and pgtable mode */
+       atomic64_t              pt_root;        /* pgtable root and pgtable mode */
+       u64                     *pgd;           /* v2 pgtable pgd pointer */
 };
 
 /*
diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c
new file mode 100644 (file)
index 0000000..8638ddf
--- /dev/null
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU-agnostic AMD IO page table v2 allocator.
+ *
+ * Copyright (C) 2022 Advanced Micro Devices, Inc.
+ * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+ * Author: Vasant Hegde <vasant.hegde@amd.com>
+ */
+
+#define pr_fmt(fmt)    "AMD-Vi: " fmt
+#define dev_fmt(fmt)   pr_fmt(fmt)
+
+#include <linux/bitops.h>
+#include <linux/io-pgtable.h>
+#include <linux/kernel.h>
+
+#include <asm/barrier.h>
+
+#include "amd_iommu_types.h"
+#include "amd_iommu.h"
+
+#define IOMMU_PAGE_PRESENT     BIT_ULL(0)      /* Is present */
+#define IOMMU_PAGE_RW          BIT_ULL(1)      /* Writeable */
+#define IOMMU_PAGE_USER                BIT_ULL(2)      /* Userspace addressable */
+#define IOMMU_PAGE_PWT         BIT_ULL(3)      /* Page write through */
+#define IOMMU_PAGE_PCD         BIT_ULL(4)      /* Page cache disabled */
+#define IOMMU_PAGE_ACCESS      BIT_ULL(5)      /* Was accessed (updated by IOMMU) */
+#define IOMMU_PAGE_DIRTY       BIT_ULL(6)      /* Was written to (updated by IOMMU) */
+#define IOMMU_PAGE_PSE         BIT_ULL(7)      /* Page Size Extensions */
+#define IOMMU_PAGE_NX          BIT_ULL(63)     /* No execute */
+
+#define MAX_PTRS_PER_PAGE      512
+
+#define IOMMU_PAGE_SIZE_2M     BIT_ULL(21)
+#define IOMMU_PAGE_SIZE_1G     BIT_ULL(30)
+
+
+static inline int get_pgtable_level(void)
+{
+       /* 5 level page table is not supported */
+       return PAGE_MODE_4_LEVEL;
+}
+
+static inline bool is_large_pte(u64 pte)
+{
+       return (pte & IOMMU_PAGE_PSE);
+}
+
+static inline void *alloc_pgtable_page(void)
+{
+       return (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static inline u64 set_pgtable_attr(u64 *page)
+{
+       u64 prot;
+
+       prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER;
+       prot |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
+
+       return (iommu_virt_to_phys(page) | prot);
+}
+
+static inline void *get_pgtable_pte(u64 pte)
+{
+       return iommu_phys_to_virt(pte & PM_ADDR_MASK);
+}
+
+static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot)
+{
+       u64 pte;
+
+       pte = __sme_set(paddr & PM_ADDR_MASK);
+       pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER;
+       pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY;
+
+       if (prot & IOMMU_PROT_IW)
+               pte |= IOMMU_PAGE_RW;
+
+       /* Large page */
+       if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M)
+               pte |= IOMMU_PAGE_PSE;
+
+       return pte;
+}
+
+static inline u64 get_alloc_page_size(u64 size)
+{
+       if (size >= IOMMU_PAGE_SIZE_1G)
+               return IOMMU_PAGE_SIZE_1G;
+
+       if (size >= IOMMU_PAGE_SIZE_2M)
+               return IOMMU_PAGE_SIZE_2M;
+
+       return PAGE_SIZE;
+}
+
+static inline int page_size_to_level(u64 pg_size)
+{
+       if (pg_size == IOMMU_PAGE_SIZE_1G)
+               return PAGE_MODE_3_LEVEL;
+       if (pg_size == IOMMU_PAGE_SIZE_2M)
+               return PAGE_MODE_2_LEVEL;
+
+       return PAGE_MODE_1_LEVEL;
+}
+
+static inline void free_pgtable_page(u64 *pt)
+{
+       free_page((unsigned long)pt);
+}
+
+static void free_pgtable(u64 *pt, int level)
+{
+       u64 *p;
+       int i;
+
+       for (i = 0; i < MAX_PTRS_PER_PAGE; i++) {
+               /* PTE present? */
+               if (!IOMMU_PTE_PRESENT(pt[i]))
+                       continue;
+
+               if (is_large_pte(pt[i]))
+                       continue;
+
+               /*
+                * Free the next level. No need to look at l1 tables here since
+                * they can only contain leaf PTEs; just free them directly.
+                */
+               p = get_pgtable_pte(pt[i]);
+               if (level > 2)
+                       free_pgtable(p, level - 1);
+               else
+                       free_pgtable_page(p);
+       }
+
+       free_pgtable_page(pt);
+}
+
+/* Allocate page table */
+static u64 *v2_alloc_pte(u64 *pgd, unsigned long iova,
+                        unsigned long pg_size, bool *updated)
+{
+       u64 *pte, *page;
+       int level, end_level;
+
+       level = get_pgtable_level() - 1;
+       end_level = page_size_to_level(pg_size);
+       pte = &pgd[PM_LEVEL_INDEX(level, iova)];
+       iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE);
+
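+       /*
+        * Walk from the top level down to the level that holds the leaf for
+        * pg_size. Missing intermediate tables are installed with cmpxchg64()
+        * so that concurrent mappers cannot clobber each other; a large PTE
+        * found on the way is cleared and the level retried.
+        */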
+       while (level >= end_level) {
+               u64 __pte, __npte;
+
+               __pte = *pte;
+
+               if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) {
+                       /* Unmap large pte */
+                       cmpxchg64(pte, *pte, 0ULL);
+                       *updated = true;
+                       continue;
+               }
+
+               if (!IOMMU_PTE_PRESENT(__pte)) {
+                       page = alloc_pgtable_page();
+                       if (!page)
+                               return NULL;
+
+                       __npte = set_pgtable_attr(page);
+                       /* pte could have been changed somewhere. */
+                       if (cmpxchg64(pte, __pte, __npte) != __pte)
+                               free_pgtable_page(page);
+                       else if (IOMMU_PTE_PRESENT(__pte))
+                               *updated = true;
+
+                       continue;
+               }
+
+               level -= 1;
+               pte = get_pgtable_pte(__pte);
+               pte = &pte[PM_LEVEL_INDEX(level, iova)];
+       }
+
+       /* Tear down existing pte entries */
+       if (IOMMU_PTE_PRESENT(*pte)) {
+               u64 *__pte;
+
+               *updated = true;
+               __pte = get_pgtable_pte(*pte);
+               cmpxchg64(pte, *pte, 0ULL);
+               if (pg_size == IOMMU_PAGE_SIZE_1G)
+                       free_pgtable(__pte, end_level - 1);
+               else if (pg_size == IOMMU_PAGE_SIZE_2M)
+                       free_pgtable_page(__pte);
+       }
+
+       return pte;
+}
+
+/*
+ * This function checks if there is a PTE for a given dma address.
+ * If there is one, it returns the pointer to it.
+ */
+static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
+                     unsigned long iova, unsigned long *page_size)
+{
+       u64 *pte;
+       int level;
+
+       level = get_pgtable_level() - 1;
+       pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)];
+       /* Default page size is 4K */
+       *page_size = PAGE_SIZE;
+
+       while (level) {
+               /* Not present */
+               if (!IOMMU_PTE_PRESENT(*pte))
+                       return NULL;
+
+               /* Walk to the next level */
+               pte = get_pgtable_pte(*pte);
+               pte = &pte[PM_LEVEL_INDEX(level - 1, iova)];
+
+               /* Large page */
+               if (is_large_pte(*pte)) {
+                       if (level == PAGE_MODE_3_LEVEL)
+                               *page_size = IOMMU_PAGE_SIZE_1G;
+                       else if (level == PAGE_MODE_2_LEVEL)
+                               *page_size = IOMMU_PAGE_SIZE_2M;
+                       else
+                               return NULL;    /* Wrongly set PSE bit in PTE */
+
+                       break;
+               }
+
+               level -= 1;
+       }
+
+       return pte;
+}
+
+static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
+                             phys_addr_t paddr, size_t pgsize, size_t pgcount,
+                             int prot, gfp_t gfp, size_t *mapped)
+{
+       struct protection_domain *pdom = io_pgtable_ops_to_domain(ops);
+       struct io_pgtable_cfg *cfg = &pdom->iop.iop.cfg;
+       u64 *pte;
+       unsigned long map_size;
+       unsigned long mapped_size = 0;
+       unsigned long o_iova = iova;
+       size_t size = pgcount << __ffs(pgsize);
+       int count = 0;
+       int ret = 0;
+       bool updated = false;
+
+       if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount)
+               return -EINVAL;
+
+       if (!(prot & IOMMU_PROT_MASK))
+               return -EINVAL;
+
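+       /*
+        * v2_alloc_pte() sets @updated when it has to tear down an existing
+        * translation; stale IOTLB entries are then flushed below.
+        */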
+       while (mapped_size < size) {
+               map_size = get_alloc_page_size(pgsize);
+               pte = v2_alloc_pte(pdom->iop.pgd, iova, map_size, &updated);
+               if (!pte) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               *pte = set_pte_attr(paddr, map_size, prot);
+
+               count++;
+               iova += map_size;
+               paddr += map_size;
+               mapped_size += map_size;
+       }
+
+out:
+       if (updated) {
+               if (count > 1)
+                       amd_iommu_flush_tlb(&pdom->domain, 0);
+               else
+                       amd_iommu_flush_page(&pdom->domain, 0, o_iova);
+       }
+
+       if (mapped)
+               *mapped += mapped_size;
+
+       return ret;
+}
+
+static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops,
+                                         unsigned long iova,
+                                         size_t pgsize, size_t pgcount,
+                                         struct iommu_iotlb_gather *gather)
+{
+       struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+       struct io_pgtable_cfg *cfg = &pgtable->iop.cfg;
+       unsigned long unmap_size;
+       unsigned long unmapped = 0;
+       size_t size = pgcount << __ffs(pgsize);
+       u64 *pte;
+
+       if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
+               return 0;
+
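+       /*
+        * fetch_pte() reports the size of the leaf actually mapped at @iova
+        * (4K, 2M or 1G); clear that leaf and advance past it.
+        */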
+       while (unmapped < size) {
+               pte = fetch_pte(pgtable, iova, &unmap_size);
+               if (!pte)
+                       return unmapped;
+
+               *pte = 0ULL;
+
+               iova = (iova & ~(unmap_size - 1)) + unmap_size;
+               unmapped += unmap_size;
+       }
+
+       return unmapped;
+}
+
+static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
+{
+       struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+       unsigned long offset_mask, pte_pgsize;
+       u64 *pte, __pte;
+
+       pte = fetch_pte(pgtable, iova, &pte_pgsize);
+       if (!pte || !IOMMU_PTE_PRESENT(*pte))
+               return 0;
+
+       offset_mask = pte_pgsize - 1;
+       __pte = __sme_clr(*pte & PM_ADDR_MASK);
+
+       return (__pte & ~offset_mask) | (iova & offset_mask);
+}
+
+/*
+ * IOTLB flush callbacks: the AMD driver performs its own invalidation
+ * (e.g. in iommu_v2_map_pages() and via amd_iommu_domain_update()), so
+ * the io-pgtable flush hooks below are no-ops.
+ */
+static void v2_tlb_flush_all(void *cookie)
+{
+}
+
+static void v2_tlb_flush_walk(unsigned long iova, size_t size,
+                             size_t granule, void *cookie)
+{
+}
+
+static void v2_tlb_add_page(struct iommu_iotlb_gather *gather,
+                           unsigned long iova, size_t granule,
+                           void *cookie)
+{
+}
+
+static const struct iommu_flush_ops v2_flush_ops = {
+       .tlb_flush_all  = v2_tlb_flush_all,
+       .tlb_flush_walk = v2_tlb_flush_walk,
+       .tlb_add_page   = v2_tlb_add_page,
+};
+
+static void v2_free_pgtable(struct io_pgtable *iop)
+{
+       struct protection_domain *pdom;
+       struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
+
+       pdom = container_of(pgtable, struct protection_domain, iop);
+       if (!(pdom->flags & PD_IOMMUV2_MASK))
+               return;
+
+       /*
+        * Make changes visible to IOMMUs. No need to clear gcr3 entry
+        * as gcr3 table is already freed.
+        */
+       amd_iommu_domain_update(pdom);
+
+       /* Free page table */
+       free_pgtable(pgtable->pgd, get_pgtable_level());
+}
+
+static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+       struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
+       struct protection_domain *pdom = (struct protection_domain *)cookie;
+       int ret;
+
+       pgtable->pgd = alloc_pgtable_page();
+       if (!pgtable->pgd)
+               return NULL;
+
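+       /* Point the GCR3 table entry for PASID 0 at the new pgd */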
+       ret = amd_iommu_domain_set_gcr3(&pdom->domain, 0, iommu_virt_to_phys(pgtable->pgd));
+       if (ret)
+               goto err_free_pgd;
+
+       pgtable->iop.ops.map_pages    = iommu_v2_map_pages;
+       pgtable->iop.ops.unmap_pages  = iommu_v2_unmap_pages;
+       pgtable->iop.ops.iova_to_phys = iommu_v2_iova_to_phys;
+
+       cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
+       cfg->ias           = IOMMU_IN_ADDR_BIT_SIZE;
+       cfg->oas           = IOMMU_OUT_ADDR_BIT_SIZE;
+       cfg->tlb           = &v2_flush_ops;
+
+       return &pgtable->iop;
+
+err_free_pgd:
+       free_pgtable_page(pgtable->pgd);
+
+       return NULL;
+}
+
+struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = {
+       .alloc  = v2_alloc_pgtable,
+       .free   = v2_free_pgtable,
+};
index f4bfcef98297f4158f32ace205bbd50430d63421..ac02a45ed79860670683815bc97395865cbfd908 100644 (file)
@@ -27,6 +27,7 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #endif
 #ifdef CONFIG_AMD_IOMMU
        [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns,
+       [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns,
 #endif
 };
 
index ca98aeadcc80485d6e005d66ae925c773e792fa1..ffe616db9409fe183e0977d8ad717a49843a8c66 100644 (file)
@@ -16,6 +16,7 @@ enum io_pgtable_fmt {
        ARM_V7S,
        ARM_MALI_LPAE,
        AMD_IOMMU_V1,
+       AMD_IOMMU_V2,
        APPLE_DART,
        IO_PGTABLE_NUM_FMTS,
 };
@@ -260,6 +261,7 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns;
 
 #endif /* __IO_PGTABLE_H */