KVM: arm64: Move VTCR_EL2 into struct s2_mmu
author Marc Zyngier <maz@kernel.org>
Thu, 12 Oct 2023 20:51:08 +0000 (21:51 +0100)
committer Oliver Upton <oliver.upton@linux.dev>
Mon, 23 Oct 2023 18:48:46 +0000 (18:48 +0000)
We currently have a global VTCR_EL2 value for each guest, even
if the guest uses NV. This implies that the guest's own S2 must
fit in the host's. This is odd, for multiple reasons:

- the PARange values and the number of IPA bits don't necessarily
  match: you can have 33 bits of IPA space, and yet you can only
  describe 32 or 36 bits of PARange

- When userspace sets the IPA space, it creates a contract with the
  kernel saying "this is the IPA space I'm prepared to handle".
  At no point does it constrain the guest's own IPA space as
  long as the guest doesn't try to use a [I]PA outside of the
  IPA space set by userspace

- We don't even try to hide the value of ID_AA64MMFR0_EL1.PARange.

And then there is the consequence of the above: if a guest tries
to create a S2 that has for input address something that is larger
than the IPA space defined by the host, we inject a fatal exception.

This is no good. For all intents and purposes, a guest should be
able to have the S2 it really wants, as long as the *output* address
of that S2 isn't outside of the IPA space.

For that, we need to have a per-s2_mmu VTCR_EL2 setting, which
allows us to represent the full PARange. Move the vtcr field into
the s2_mmu structure, which has no impact whatsoever, except for NV.

Note that once we are able to override ID_AA64MMFR0_EL1.PARange
from userspace, we'll also be able to restrict the size of the
shadow S2 that NV uses.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231012205108.3937270-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/stage2_pgtable.h
arch/arm64/kvm/hyp/nvhe/mem_protect.c
arch/arm64/kvm/hyp/nvhe/pkvm.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/pkvm.c
arch/arm64/kvm/vgic/vgic-kvm-device.c

index 759adee420182e3288140c5a2fa69339ed9c3869..b6b10eb7543f1d8f4d7596dffad02689db8ed115 100644 (file)
@@ -158,6 +158,16 @@ struct kvm_s2_mmu {
        phys_addr_t     pgd_phys;
        struct kvm_pgtable *pgt;
 
+       /*
+        * VTCR value used on the host. For a non-NV guest (or a NV
+        * guest that runs in a context where its own S2 doesn't
+        * apply), its T0SZ value reflects that of the IPA size.
+        *
+        * For a shadow S2 MMU, T0SZ reflects the PARange exposed to
+        * the guest.
+        */
+       u64     vtcr;
+
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
 
@@ -205,9 +215,6 @@ struct kvm_protected_vm {
 struct kvm_arch {
        struct kvm_s2_mmu mmu;
 
-       /* VTCR_EL2 value for this VM */
-       u64    vtcr;
-
        /* Interrupt controller */
        struct vgic_dist        vgic;
 
index 96a80e8f62263eb30d04d8e0107807b75199f18a..caa29c16ac50b5003e955663f4c1b4b056e9b49f 100644 (file)
@@ -150,9 +150,9 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
  */
 #define KVM_PHYS_SHIFT (40)
 
-#define kvm_phys_shift(kvm)            VTCR_EL2_IPA(kvm->arch.vtcr)
-#define kvm_phys_size(kvm)             (_AC(1, ULL) << kvm_phys_shift(kvm))
-#define kvm_phys_mask(kvm)             (kvm_phys_size(kvm) - _AC(1, ULL))
+#define kvm_phys_shift(mmu)            VTCR_EL2_IPA((mmu)->vtcr)
+#define kvm_phys_size(mmu)             (_AC(1, ULL) << kvm_phys_shift(mmu))
+#define kvm_phys_mask(mmu)             (kvm_phys_size(mmu) - _AC(1, ULL))
 
 #include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
@@ -299,7 +299,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
 static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
                                          struct kvm_arch *arch)
 {
-       write_sysreg(arch->vtcr, vtcr_el2);
+       write_sysreg(mmu->vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
 
        /*
index c8dca8ae359cd25c3bac6017b1b8bb4e85eb992a..23d27623e478b8d4443387676fd3cf04c6c38614 100644 (file)
  * (IPA_SHIFT - 4).
  */
 #define stage2_pgtable_levels(ipa)     ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
-#define kvm_stage2_levels(kvm)         VTCR_EL2_LVLS(kvm->arch.vtcr)
+#define kvm_stage2_levels(mmu)         VTCR_EL2_LVLS((mmu)->vtcr)
 
 /*
  * kvm_mmmu_cache_min_pages() is the number of pages required to install
  * a stage-2 translation. We pre-allocate the entry level page table at
  * the VM creation.
  */
-#define kvm_mmu_cache_min_pages(kvm)   (kvm_stage2_levels(kvm) - 1)
+#define kvm_mmu_cache_min_pages(mmu)   (kvm_stage2_levels(mmu) - 1)
 
 #endif /* __ARM64_S2_PGTABLE_H_ */
index 9d703441278bd72e33cec10adba5b9b0afb145bf..8d0a5834e8830059d43d464c4187ff63990e9770 100644 (file)
@@ -129,8 +129,8 @@ static void prepare_host_vtcr(void)
        parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
        phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
 
-       host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
-                                         id_aa64mmfr1_el1_sys_val, phys_shift);
+       host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+                                             id_aa64mmfr1_el1_sys_val, phys_shift);
 }
 
 static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
@@ -235,7 +235,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
        unsigned long nr_pages;
        int ret;
 
-       nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
+       nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT;
        ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
        if (ret)
                return ret;
@@ -295,7 +295,7 @@ int __pkvm_prot_finalize(void)
                return -EPERM;
 
        params->vttbr = kvm_get_vttbr(mmu);
-       params->vtcr = host_mmu.arch.vtcr;
+       params->vtcr = mmu->vtcr;
        params->hcr_el2 |= HCR_VM;
 
        /*
index 8033ef353a5da406dba355ab73854dfa39e93c27..9d23a51d7f7525d50558adefdc402c906ea2b82a 100644 (file)
@@ -303,7 +303,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
 {
        hyp_vm->host_kvm = host_kvm;
        hyp_vm->kvm.created_vcpus = nr_vcpus;
-       hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr;
+       hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
 }
 
 static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
@@ -483,7 +483,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
        }
 
        vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
-       pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr);
+       pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
 
        ret = -ENOMEM;
 
index f155b8c9e98c7fbf1298f4ecf64c6826c76fdb23..0c84872fd89d3a25b4c2c724a5023d4412d56622 100644 (file)
@@ -1511,7 +1511,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                              kvm_pgtable_force_pte_cb_t force_pte_cb)
 {
        size_t pgd_sz;
-       u64 vtcr = mmu->arch->vtcr;
+       u64 vtcr = mmu->vtcr;
        u32 ia_bits = VTCR_EL2_IPA(vtcr);
        u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
index 482280fe22d7c59d2539b9f0e758cfc95afad0ff..551f21936eaabfa6483b711dae0ce61becaad61f 100644 (file)
@@ -892,7 +892,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
-       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+       mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
        if (mmu->pgt != NULL) {
                kvm_err("kvm_arch already initialized?\n");
@@ -1067,7 +1067,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
        phys_addr_t addr;
        int ret = 0;
        struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
-       struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+       struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+       struct kvm_pgtable *pgt = mmu->pgt;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
                                     KVM_PGTABLE_PROT_R |
                                     (writable ? KVM_PGTABLE_PROT_W : 0);
@@ -1080,7 +1081,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
        for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
                ret = kvm_mmu_topup_memory_cache(&cache,
-                                                kvm_mmu_cache_min_pages(kvm));
+                                                kvm_mmu_cache_min_pages(mmu));
                if (ret)
                        break;
 
@@ -1431,7 +1432,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (fault_status != ESR_ELx_FSC_PERM ||
            (logging_active && write_fault)) {
                ret = kvm_mmu_topup_memory_cache(memcache,
-                                                kvm_mmu_cache_min_pages(kvm));
+                                                kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
                if (ret)
                        return ret;
        }
@@ -1747,7 +1748,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
        }
 
        /* Userspace should not be able to register out-of-bounds IPAs */
-       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
 
        if (fault_status == ESR_ELx_FSC_ACCESS) {
                handle_access_fault(vcpu, fault_ipa);
@@ -2021,7 +2022,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         * Prevent userspace from creating a memory region outside of the IPA
         * space addressable by the KVM guest IPA space.
         */
-       if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
+       if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                return -EFAULT;
 
        hva = new->userspace_addr;
index 6ff3ec18c92584376e941a87b9e88e62fa99c2ff..8350fb8fee0b998ccf27dca4b7bf2e858846ccd3 100644 (file)
@@ -123,7 +123,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
        if (host_kvm->created_vcpus < 1)
                return -EINVAL;
 
-       pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
+       pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr);
 
        /*
         * The PGD pages will be reclaimed using a hyp_memcache which implies
index 212b73a715c1c245540e12636085bb0d4ac1d741..64f8e2e1c443bd217c2c5133bf54c23506c7cb43 100644 (file)
@@ -27,7 +27,8 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
        if (addr + size < addr)
                return -EINVAL;
 
-       if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm))
+       if (addr & ~kvm_phys_mask(&kvm->arch.mmu) ||
+           (addr + size) > kvm_phys_size(&kvm->arch.mmu))
                return -E2BIG;
 
        return 0;