KVM: x86: Morph pending exceptions to pending VM-Exits at queue time

author Sean Christopherson <seanjc@google.com>

Tue, 30 Aug 2022 23:16:08 +0000 (23:16 +0000)

committer Paolo Bonzini <pbonzini@redhat.com>

Mon, 26 Sep 2022 16:03:10 +0000 (12:03 -0400)
author Sean Christopherson <seanjc@google.com>
Tue, 30 Aug 2022 23:16:08 +0000 (23:16 +0000)
committer Paolo Bonzini <pbonzini@redhat.com>
Mon, 26 Sep 2022 16:03:10 +0000 (12:03 -0400)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 76fda10baa3d49ac5890676675e2852f220d295e..b3ce723efb4391e2b1ad0334c0593d092e1794b3 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -649,7 +649,6 @@ struct kvm_queued_exception {
         u32 error_code;
         unsigned long payload;
         bool has_payload;
-       u8 nested_apf;
  };
  
  struct kvm_vcpu_arch {
@@ -750,8 +749,12 @@ struct kvm_vcpu_arch {
  
         u8 event_exit_inst_len;
  
+       bool exception_from_userspace;
+
         /* Exceptions to be injected to the guest. */
         struct kvm_queued_exception exception;
+       /* Exception VM-Exits to be synthesized to L1. */
+       struct kvm_queued_exception exception_vmexit;
  
         struct kvm_queued_interrupt {
                 bool injected;
@@ -862,7 +865,6 @@ struct kvm_vcpu_arch {
                 u32 id;
                 bool send_user_only;
                 u32 host_apf_flags;
-               unsigned long nested_apf_token;
                 bool delivery_as_pf_vmexit;
                 bool pageready_pending;
         } apf;
@@ -1638,9 +1640,9 @@ struct kvm_x86_ops {
  
  struct kvm_x86_nested_ops {
         void (*leave_nested)(struct kvm_vcpu *vcpu);
+       bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
+                                   u32 error_code);
         int (*check_events)(struct kvm_vcpu *vcpu);
-       bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
-                                            struct x86_exception *fault);
         bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
         void (*triple_fault)(struct kvm_vcpu *vcpu);
         int (*get_state)(struct kvm_vcpu *vcpu,
@@ -1867,7 +1869,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long pay
  void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
  void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
-bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                     struct x86_exception *fault);
  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c

index 2ecc64c3f6ee4adb7fd0d6cf517353d0f8437f79..cf22900f7c5468ccf4234fdbefdc74c1b3040c8e 100644 (file)
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -55,28 +55,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
         nested_svm_vmexit(svm);
  }
  
-static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
-                                                   struct x86_exception *fault)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *vmcb = svm->vmcb;
-
-       WARN_ON(!is_guest_mode(vcpu));
-
-       if (vmcb12_is_intercept(&svm->nested.ctl,
-                               INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
-           !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
-               vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
-               vmcb->control.exit_code_hi = 0;
-               vmcb->control.exit_info_1 = fault->error_code;
-               vmcb->control.exit_info_2 = fault->address;
-               nested_svm_vmexit(svm);
-               return true;
-       }
-
-       return false;
-}
-
  static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
  {
         struct vcpu_svm *svm = to_svm(vcpu);
@@ -1308,16 +1286,17 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
         return 0;
  }
  
-static bool nested_exit_on_exception(struct vcpu_svm *svm)
+static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+                                          u32 error_code)
  {
-       unsigned int vector = svm->vcpu.arch.exception.vector;
+       struct vcpu_svm *svm = to_svm(vcpu);
  
         return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
  }
  
  static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
  {
-       struct kvm_queued_exception *ex = &vcpu->arch.exception;
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
         struct vcpu_svm *svm = to_svm(vcpu);
         struct vmcb *vmcb = svm->vmcb;
  
@@ -1332,9 +1311,7 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
          * than #PF.
          */
         if (ex->vector == PF_VECTOR) {
-               if (ex->nested_apf)
-                       vmcb->control.exit_info_2 = vcpu->arch.apf.nested_apf_token;
-               else if (ex->has_payload)
+               if (ex->has_payload)
                         vmcb->control.exit_info_2 = ex->payload;
                 else
                         vmcb->control.exit_info_2 = vcpu->arch.cr2;
@@ -1387,15 +1364,19 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                 return 0;
         }
  
-       if (vcpu->arch.exception.pending) {
+       if (vcpu->arch.exception_vmexit.pending) {
                 if (block_nested_exceptions)
                          return -EBUSY;
-               if (!nested_exit_on_exception(svm))
-                       return 0;
                 nested_svm_inject_exception_vmexit(vcpu);
                 return 0;
         }
  
+       if (vcpu->arch.exception.pending) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+               return 0;
+       }
+
         if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
                 if (block_nested_events)
                         return -EBUSY;
@@ -1733,8 +1714,8 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
  
  struct kvm_x86_nested_ops svm_nested_ops = {
         .leave_nested = svm_leave_nested,
+       .is_exception_vmexit = nested_svm_is_exception_vmexit,
         .check_events = svm_check_nested_events,
-       .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
         .triple_fault = nested_svm_triple_fault,
         .get_nested_state_pages = svm_get_nested_state_pages,
         .get_state = svm_get_nested_state,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c

index e0a93d974829ff10d2cf641050d0c2811a6257b8..4da0558943ce506110ae87201edf904e59bf65d7 100644 (file)
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -439,59 +439,22 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
         return inequality ^ bit;
  }
  
-
-/*
- * KVM wants to inject page-faults which it got to the guest. This function
- * checks whether in a nested guest, we need to inject them to L1 or L2.
- */
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
-{
-       struct kvm_queued_exception *ex = &vcpu->arch.exception;
-       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-       if (ex->vector == PF_VECTOR) {
-               if (ex->nested_apf) {
-                       *exit_qual = vcpu->arch.apf.nested_apf_token;
-                       return 1;
-               }
-               if (nested_vmx_is_page_fault_vmexit(vmcs12, ex->error_code)) {
-                       *exit_qual = ex->has_payload ? ex->payload : vcpu->arch.cr2;
-                       return 1;
-               }
-       } else if (vmcs12->exception_bitmap & (1u << ex->vector)) {
-               if (ex->vector == DB_VECTOR) {
-                       if (ex->has_payload) {
-                               *exit_qual = ex->payload;
-                       } else {
-                               *exit_qual = vcpu->arch.dr6;
-                               *exit_qual &= ~DR6_BT;
-                               *exit_qual ^= DR6_ACTIVE_LOW;
-                       }
-               } else
-                       *exit_qual = 0;
-               return 1;
-       }
-
-       return 0;
-}
-
-static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
-                                                   struct x86_exception *fault)
+static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+                                          u32 error_code)
  {
         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
-       WARN_ON(!is_guest_mode(vcpu));
+       /*
+        * Drop bits 31:16 of the error code when performing the #PF mask+match
+        * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
+        * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+        * error code.  Including the to-be-dropped bits in the check might
+        * result in an "impossible" or missed exit from L1's perspective.
+        */
+       if (vector == PF_VECTOR)
+               return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
  
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
-           !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
-               vmcs12->vm_exit_intr_error_code = fault->error_code;
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                                 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-                                 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
-                                 fault->address);
-               return true;
-       }
-       return false;
+       return (vmcs12->exception_bitmap & (1u << vector));
  }
  
  static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@ -3863,12 +3826,24 @@ mmio_needed:
         return -ENXIO;
  }
  
-static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
-                                              unsigned long exit_qual)
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
  {
-       struct kvm_queued_exception *ex = &vcpu->arch.exception;
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
         u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       unsigned long exit_qual;
+
+       if (ex->has_payload) {
+               exit_qual = ex->payload;
+       } else if (ex->vector == PF_VECTOR) {
+               exit_qual = vcpu->arch.cr2;
+       } else if (ex->vector == DB_VECTOR) {
+               exit_qual = vcpu->arch.dr6;
+               exit_qual &= ~DR6_BT;
+               exit_qual ^= DR6_ACTIVE_LOW;
+       } else {
+               exit_qual = 0;
+       }
  
         if (ex->has_error_code) {
                 /*
@@ -4041,7 +4016,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
  {
         struct kvm_lapic *apic = vcpu->arch.apic;
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long exit_qual;
         /*
          * Only a pending nested run blocks a pending exception.  If there is a
          * previously injected event, the pending exception occurred while said
@@ -4095,14 +4069,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
          * across SMI/RSM as it should; that needs to be addressed in order to
          * prioritize SMI over MTF and trap-like #DBs.
          */
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+
+               nested_vmx_inject_exception_vmexit(vcpu);
+               return 0;
+       }
+
         if (vcpu->arch.exception.pending &&
             !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
                 if (block_nested_exceptions)
                         return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-               return 0;
+               goto no_vmexit;
         }
  
         if (vmx->nested.mtf_pending) {
@@ -4113,15 +4093,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
                 return 0;
         }
  
-       if (vcpu->arch.exception.pending) {
+       if (vcpu->arch.exception_vmexit.pending) {
                 if (block_nested_exceptions)
                         return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+
+               nested_vmx_inject_exception_vmexit(vcpu);
                 return 0;
         }
  
+       if (vcpu->arch.exception.pending) {
+               if (block_nested_exceptions)
+                       return -EBUSY;
+               goto no_vmexit;
+       }
+
         if (nested_vmx_preemption_timer_pending(vcpu)) {
                 if (block_nested_events)
                         return -EBUSY;
@@ -6984,8 +6969,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
  
  struct kvm_x86_nested_ops vmx_nested_ops = {
         .leave_nested = vmx_leave_nested,
+       .is_exception_vmexit = nested_vmx_is_exception_vmexit,
         .check_events = vmx_check_nested_events,
-       .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
         .hv_timer_pending = nested_vmx_preemption_timer_pending,
         .triple_fault = nested_vmx_triple_fault,
         .get_state = vmx_get_nested_state,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c

index f555be2be993198f77d36f5834ff0d8734134116..8d793260db6a5c92c1a5d7f4887ce2bbb65aa768 100644 (file)
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1659,7 +1659,9 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
          */
         if (nested_cpu_has_mtf(vmcs12) &&
             (!vcpu->arch.exception.pending ||
-            vcpu->arch.exception.vector == DB_VECTOR))
+            vcpu->arch.exception.vector == DB_VECTOR) &&
+           (!vcpu->arch.exception_vmexit.pending ||
+            vcpu->arch.exception_vmexit.vector == DB_VECTOR))
                 vmx->nested.mtf_pending = true;
         else
                 vmx->nested.mtf_pending = false;
@@ -5692,7 +5694,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
         struct vcpu_vmx *vmx = to_vmx(vcpu);
  
         return vmx->emulation_required && !vmx->rmode.vm86_active &&
-              (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
+              (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
  }
  
  static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 458c7e35731a04f8a09e348bf0b3384c004bbd99..734b206ab8b79903ff2f3bca4b606d83f08c5743 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -613,6 +613,21 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
  }
  EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
  
+static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+                                      bool has_error_code, u32 error_code,
+                                      bool has_payload, unsigned long payload)
+{
+       struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+
+       ex->vector = vector;
+       ex->injected = false;
+       ex->pending = true;
+       ex->has_error_code = has_error_code;
+       ex->error_code = error_code;
+       ex->has_payload = has_payload;
+       ex->payload = payload;
+}
+
  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                 unsigned nr, bool has_error, u32 error_code,
                 bool has_payload, unsigned long payload, bool reinject)
@@ -622,18 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
  
         kvm_make_request(KVM_REQ_EVENT, vcpu);
  
+       /*
+        * If the exception is destined for L2 and isn't being reinjected,
+        * morph it to a VM-Exit if L1 wants to intercept the exception.  A
+        * previously injected exception is not checked because it was checked
+        * when it was original queued, and re-checking is incorrect if _L1_
+        * injected the exception, in which case it's exempt from interception.
+        */
+       if (!reinject && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+               kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+                                          has_payload, payload);
+               return;
+       }
+
         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
         queue:
                 if (reinject) {
                         /*
-                        * On vmentry, vcpu->arch.exception.pending is only
-                        * true if an event injection was blocked by
-                        * nested_run_pending.  In that case, however,
-                        * vcpu_enter_guest requests an immediate exit,
-                        * and the guest shouldn't proceed far enough to
-                        * need reinjection.
+                        * On VM-Entry, an exception can be pending if and only
+                        * if event injection was blocked by nested_run_pending.
+                        * In that case, however, vcpu_enter_guest() requests an
+                        * immediate exit, and the guest shouldn't proceed far
+                        * enough to need reinjection.
                          */
-                       WARN_ON_ONCE(vcpu->arch.exception.pending);
+                       WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
                         vcpu->arch.exception.injected = true;
                         if (WARN_ON_ONCE(has_payload)) {
                                 /*
@@ -736,20 +764,22 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
  {
         ++vcpu->stat.pf_guest;
-       vcpu->arch.exception.nested_apf =
-               is_guest_mode(vcpu) && fault->async_page_fault;
-       if (vcpu->arch.exception.nested_apf) {
-               vcpu->arch.apf.nested_apf_token = fault->address;
-               kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
-       } else {
+
+       /*
+        * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+        * whether or not L1 wants to intercept "regular" #PF.
+        */
+       if (is_guest_mode(vcpu) && fault->async_page_fault)
+               kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+                                          true, fault->error_code,
+                                          true, fault->address);
+       else
                 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
                                         fault->address);
-       }
  }
  EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
  
-/* Returns true if the page fault was immediately morphed into a VM-Exit. */
-bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                     struct x86_exception *fault)
  {
         struct kvm_mmu *fault_mmu;
@@ -767,26 +797,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                 kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
                                        fault_mmu->root.hpa);
  
-       /*
-        * A workaround for KVM's bad exception handling.  If KVM injected an
-        * exception into L2, and L2 encountered a #PF while vectoring the
-        * injected exception, manually check to see if L1 wants to intercept
-        * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
-        * In all other cases, defer the check to nested_ops->check_events(),
-        * which will correctly handle priority (this does not).  Note, other
-        * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
-        * most problematic, e.g. when L0 and L1 are both intercepting #PF for
-        * shadow paging.
-        *
-        * TODO: Rewrite exception handling to track injected and pending
-        *       (VM-Exit) exceptions separately.
-        */
-       if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
-           kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
-               return true;
-
         fault_mmu->inject_page_fault(vcpu, fault);
-       return false;
  }
  EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
  
@@ -4835,7 +4846,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
         return (kvm_arch_interrupt_allowed(vcpu) &&
                 kvm_cpu_accept_dm_intr(vcpu) &&
                 !kvm_event_needs_reinjection(vcpu) &&
-               !vcpu->arch.exception.pending);
+               !kvm_is_exception_pending(vcpu));
  }
  
  static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@ -5010,13 +5021,27 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                                                struct kvm_vcpu_events *events)
  {
-       struct kvm_queued_exception *ex = &vcpu->arch.exception;
+       struct kvm_queued_exception *ex;
  
         process_nmi(vcpu);
  
         if (kvm_check_request(KVM_REQ_SMI, vcpu))
                 process_smi(vcpu);
  
+       /*
+        * KVM's ABI only allows for one exception to be migrated.  Luckily,
+        * the only time there can be two queued exceptions is if there's a
+        * non-exiting _injected_ exception, and a pending exiting exception.
+        * In that case, ignore the VM-Exiting exception as it's an extension
+        * of the injected exception.
+        */
+       if (vcpu->arch.exception_vmexit.pending &&
+           !vcpu->arch.exception.pending &&
+           !vcpu->arch.exception.injected)
+               ex = &vcpu->arch.exception_vmexit;
+       else
+               ex = &vcpu->arch.exception;
+
         /*
          * In guest mode, payload delivery should be deferred if the exception
          * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
@@ -5123,6 +5148,19 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                 return -EINVAL;
  
         process_nmi(vcpu);
+
+       /*
+        * Flag that userspace is stuffing an exception, the next KVM_RUN will
+        * morph the exception to a VM-Exit if appropriate.  Do this only for
+        * pending exceptions, already-injected exceptions are not subject to
+        * intercpetion.  Note, userspace that conflates pending and injected
+        * is hosed, and will incorrectly convert an injected exception into a
+        * pending exception, which in turn may cause a spurious VM-Exit.
+        */
+       vcpu->arch.exception_from_userspace = events->exception.pending;
+
+       vcpu->arch.exception_vmexit.pending = false;
+
         vcpu->arch.exception.injected = events->exception.injected;
         vcpu->arch.exception.pending = events->exception.pending;
         vcpu->arch.exception.vector = events->exception.nr;
@@ -8167,18 +8205,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
         }
  }
  
-static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
  {
         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
-       if (ctxt->exception.vector == PF_VECTOR)
-               return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
  
-       if (ctxt->exception.error_code_valid)
+       if (ctxt->exception.vector == PF_VECTOR)
+               kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+       else if (ctxt->exception.error_code_valid)
                 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
                                       ctxt->exception.error_code);
         else
                 kvm_queue_exception(vcpu, ctxt->exception.vector);
-       return false;
  }
  
  static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -8813,8 +8850,7 @@ restart:
  
         if (ctxt->have_exception) {
                 r = 1;
-               if (inject_emulated_exception(vcpu))
-                       return r;
+               inject_emulated_exception(vcpu);
         } else if (vcpu->arch.pio.count) {
                 if (!vcpu->arch.pio.in) {
                         /* FIXME: return into emulator if single-stepping.  */
@@ -9761,7 +9797,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
          */
         if (vcpu->arch.exception.injected)
                 kvm_inject_exception(vcpu);
-       else if (vcpu->arch.exception.pending)
+       else if (kvm_is_exception_pending(vcpu))
                 ; /* see above */
         else if (vcpu->arch.nmi_injected)
                 static_call(kvm_x86_inject_nmi)(vcpu);
@@ -9788,6 +9824,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
         if (r < 0)
                 goto out;
  
+       /*
+        * A pending exception VM-Exit should either result in nested VM-Exit
+        * or force an immediate re-entry and exit to/from L2, and exception
+        * VM-Exits cannot be injected (flag should _never_ be set).
+        */
+       WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+                    vcpu->arch.exception_vmexit.pending);
+
         /*
          * New events, other than exceptions, cannot be injected if KVM needs
          * to re-inject a previous event.  See above comments on re-injecting
@@ -9887,7 +9931,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
             kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
                 *req_immediate_exit = true;
  
-       WARN_ON(vcpu->arch.exception.pending);
+       WARN_ON(kvm_is_exception_pending(vcpu));
         return 0;
  
  out:
@@ -10905,6 +10949,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
  
  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
  {
+       struct kvm_queued_exception *ex = &vcpu->arch.exception;
         struct kvm_run *kvm_run = vcpu->run;
         int r;
  
@@ -10963,6 +11008,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                 }
         }
  
+       /*
+        * If userspace set a pending exception and L2 is active, convert it to
+        * a pending VM-Exit if L1 wants to intercept the exception.
+        */
+       if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+           kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+                                                       ex->error_code)) {
+               kvm_queue_exception_vmexit(vcpu, ex->vector,
+                                          ex->has_error_code, ex->error_code,
+                                          ex->has_payload, ex->payload);
+               ex->injected = false;
+               ex->pending = false;
+       }
+       vcpu->arch.exception_from_userspace = false;
+
         if (unlikely(vcpu->arch.complete_userspace_io)) {
                 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
                 vcpu->arch.complete_userspace_io = NULL;
@@ -11069,6 +11129,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
         kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
  
         vcpu->arch.exception.pending = false;
+       vcpu->arch.exception_vmexit.pending = false;
  
         kvm_make_request(KVM_REQ_EVENT, vcpu);
  }
@@ -11436,7 +11497,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  
         if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
                 r = -EBUSY;
-               if (vcpu->arch.exception.pending)
+               if (kvm_is_exception_pending(vcpu))
                         goto out;
                 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
                         kvm_queue_exception(vcpu, DB_VECTOR);
@@ -12670,7 +12731,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
         if (vcpu->arch.pv.pv_unhalted)
                 return true;
  
-       if (vcpu->arch.exception.pending)
+       if (kvm_is_exception_pending(vcpu))
                 return true;
  
         if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
@@ -12925,7 +12986,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
  {
         if (unlikely(!lapic_in_kernel(vcpu) ||
                      kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
+                    kvm_is_exception_pending(vcpu)))
                 return false;
  
         if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h

index 4147d27f9fbc0892c2477ea9684cb84d61710c66..256745d1a2c3119f5343a5381b1d78c02c133caa 100644 (file)
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -82,10 +82,17 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
  void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
  int kvm_check_nested_events(struct kvm_vcpu *vcpu);
  
+static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.exception.pending ||
+              vcpu->arch.exception_vmexit.pending;
+}
+
  static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
  {
         vcpu->arch.exception.pending = false;
         vcpu->arch.exception.injected = false;
+       vcpu->arch.exception_vmexit.pending = false;
  }
  
  static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
author	Sean Christopherson <seanjc@google.com>
	Tue, 30 Aug 2022 23:16:08 +0000 (23:16 +0000)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Mon, 26 Sep 2022 16:03:10 +0000 (12:03 -0400)
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/kvm/svm/nested.c		patch \| blob \| history
arch/x86/kvm/vmx/nested.c		patch \| blob \| history
arch/x86/kvm/vmx/vmx.c		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
arch/x86/kvm/x86.h		patch \| blob \| history