x86/kvm: Handle async page faults directly through do_page_fault()
authorAndy Lutomirski <luto@kernel.org>
Fri, 28 Feb 2020 18:42:48 +0000 (10:42 -0800)
committerThomas Gleixner <tglx@linutronix.de>
Tue, 19 May 2020 13:53:57 +0000 (15:53 +0200)
KVM overloads #PF to indicate two types of not-actually-page-fault
events.  Right now, the KVM guest code intercepts them by modifying
the IDT and hooking the #PF vector.  This makes the already fragile
fault code even harder to understand, and it also pollutes call
traces with async_page_fault and do_async_page_fault for normal page
faults.

Clean it up by moving the logic into do_page_fault() using a static
branch.  This gets rid of the platform trap_init override mechanism
completely.

[ tglx: Fixed up 32bit, removed error code from the async functions and
   massaged coding style ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200505134059.169270470@linutronix.de
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/x86_init.h
arch/x86/kernel/kvm.c
arch/x86/kernel/traps.c
arch/x86/kernel/x86_init.c
arch/x86/mm/fault.c

index b67bae7091d7ecf9d5319b976c10be5e6045caa2..8ba0985f501655da3bbad4b0d3d1b7bc2197f3c7 100644 (file)
@@ -1693,14 +1693,6 @@ SYM_CODE_START(general_protection)
        jmp     common_exception
 SYM_CODE_END(general_protection)
 
-#ifdef CONFIG_KVM_GUEST
-SYM_CODE_START(async_page_fault)
-       ASM_CLAC
-       pushl   $do_async_page_fault
-       jmp     common_exception_read_cr2
-SYM_CODE_END(async_page_fault)
-#endif
-
 SYM_CODE_START(rewind_stack_do_exit)
        /* Prevent any naive code from trying to unwind to our caller. */
        xorl    %ebp, %ebp
index 3063aa9090f9a7927143fb929fde7ec7a8150d81..9ab3ea6d02fc64c0726536ef9b90509c5e39c081 100644 (file)
@@ -1202,10 +1202,6 @@ idtentry xendebug                do_debug                has_error_code=0
 idtentry general_protection    do_general_protection   has_error_code=1
 idtentry page_fault            do_page_fault           has_error_code=1        read_cr2=1
 
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault      do_async_page_fault     has_error_code=1        read_cr2=1
-#endif
-
 #ifdef CONFIG_X86_MCE
 idtentry machine_check         do_mce                  has_error_code=0        paranoid=1
 #endif
index 9b4df6eaa11a6c9f9e6f74b45b8006f1ad71cce3..5261363adda31a13f74f4779ed8ae2b4c486911c 100644 (file)
@@ -91,8 +91,18 @@ unsigned int kvm_arch_para_hints(void);
 void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
-extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+void kvm_disable_steal_time(void);
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
+
+DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+       if (static_branch_unlikely(&kvm_async_pf_enabled))
+               return __kvm_handle_async_pf(regs, token);
+       else
+               return false;
+}
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
@@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void)
 {
        return;
 }
+
+static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+       return false;
+}
 #endif
 
 #endif /* _ASM_X86_KVM_PARA_H */
index 96d9cd2086104c9cccddedec7a7bf1bc108d32be..6807153c04105ddc2a53dd69562308776fc74000 100644 (file)
@@ -50,14 +50,12 @@ struct x86_init_resources {
  * @pre_vector_init:           init code to run before interrupt vectors
  *                             are set up.
  * @intr_init:                 interrupt init code
- * @trap_init:                 platform specific trap setup
  * @intr_mode_select:          interrupt delivery mode selection
  * @intr_mode_init:            interrupt delivery mode setup
  */
 struct x86_init_irqs {
        void (*pre_vector_init)(void);
        void (*intr_init)(void);
-       void (*trap_init)(void);
        void (*intr_mode_select)(void);
        void (*intr_mode_init)(void);
 };
index 6efe0410fb728995ea8944ecd984840b6f30778e..5ad3fcca2309442b2b7d83bf0769892a5bf4ca78 100644 (file)
@@ -35,6 +35,8 @@
 #include <asm/tlb.h>
 #include <asm/cpuidle_haltpoll.h>
 
+DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
 static int kvmapf = 1;
 
 static int __init parse_no_kvmapf(char *arg)
@@ -242,25 +244,27 @@ u32 kvm_read_and_reset_pf_reason(void)
 EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
-dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
+       /*
+        * If we get a page fault right here, the pf_reason seems likely
+        * to be clobbered.  Bummer.
+        */
        switch (kvm_read_and_reset_pf_reason()) {
        default:
-               do_page_fault(regs, error_code, address);
-               break;
+               return false;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
-               kvm_async_pf_task_wait((u32)address, !user_mode(regs));
-               break;
+               kvm_async_pf_task_wait(token, !user_mode(regs));
+               return true;
        case KVM_PV_REASON_PAGE_READY:
                rcu_irq_enter();
-               kvm_async_pf_task_wake((u32)address);
+               kvm_async_pf_task_wake(token);
                rcu_irq_exit();
-               break;
+               return true;
        }
 }
-NOKPROBE_SYMBOL(do_async_page_fault);
+NOKPROBE_SYMBOL(__kvm_handle_async_pf);
 
 static void __init paravirt_ops_setup(void)
 {
@@ -306,7 +310,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 static void kvm_guest_cpu_init(void)
 {
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
-               u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+               u64 pa;
+
+               WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
+
+               pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
 
 #ifdef CONFIG_PREEMPTION
                pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -592,12 +600,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 }
 #endif
 
-static void __init kvm_apf_trap_init(void)
-{
-       update_intr_gate(X86_TRAP_PF, async_page_fault);
-}
-
-
 static void kvm_flush_tlb_others(const struct cpumask *cpumask,
                        const struct flush_tlb_info *info)
 {
@@ -632,8 +634,6 @@ static void __init kvm_guest_init(void)
        register_reboot_notifier(&kvm_pv_reboot_nb);
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
                raw_spin_lock_init(&async_pf_sleepers[i].lock);
-       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
-               x86_init.irqs.trap_init = kvm_apf_trap_init;
 
        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
@@ -649,6 +649,9 @@ static void __init kvm_guest_init(void)
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+       if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+               static_branch_enable(&kvm_async_pf_enabled);
+
 #ifdef CONFIG_SMP
        smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
index d54cffdc7cac2b5c55b272c7df3a8756fbd024c1..821fac47eef6fc983871bb7bb21c2e575b3f4ee5 100644 (file)
@@ -983,7 +983,5 @@ void __init trap_init(void)
 
        idt_setup_ist_traps();
 
-       x86_init.irqs.trap_init();
-
        idt_setup_debugidt_traps();
 }
index 85f1a90c55cd817cf994ff5c0b1da6c227380a5d..123f1c1f1788e4749109648346a074f816abe687 100644 (file)
@@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = {
        .irqs = {
                .pre_vector_init        = init_ISA_irqs,
                .intr_init              = native_init_IRQ,
-               .trap_init              = x86_init_noop,
                .intr_mode_select       = apic_intr_mode_select,
                .intr_mode_init         = apic_intr_mode_init
        },
index a51df516b87bf1e174e2170b2c642406d2f28633..6486ccec1b0eb21d43392b88af2152377d560468 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/desc.h>                  /* store_idt(), ...             */
 #include <asm/cpu_entry_area.h>                /* exception stack              */
 #include <asm/pgtable_areas.h>         /* VMALLOC_START, ...           */
+#include <asm/kvm_para.h>              /* kvm_handle_async_pf          */
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -1523,6 +1524,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
                unsigned long address)
 {
        prefetchw(&current->mm->mmap_sem);
+       /*
+        * KVM has two types of events that are, logically, interrupts, but
+        * are unfortunately delivered using the #PF vector.  These events are
+        * "you just accessed valid memory, but the host doesn't have it right
+        * now, so I'll put you to sleep if you continue" and "that memory
+        * you tried to access earlier is available now."
+        *
+        * We are relying on the interrupted context being sane (valid RSP,
+        * relevant locks not held, etc.), which is fine as long as the
+        * interrupted context had IF=1.  We are also relying on the KVM
+        * async pf type field and CR2 being read consistently instead of
+        * getting values from real and async page faults mixed up.
+        *
+        * Fingers crossed.
+        */
+       if (kvm_handle_async_pf(regs, (u32)address))
+               return;
+
        trace_page_fault_entries(regs, hw_error_code, address);
 
        if (unlikely(kmmio_fault(regs, address)))