x86/mce: Implement recovery for errors in TDX/SEAM non-root mode

author Tony Luck <tony.luck@intel.com>

Mon, 8 Apr 2024 18:09:44 +0000 (11:09 -0700)

committer Ingo Molnar <mingo@kernel.org>

Tue, 9 Apr 2024 07:30:36 +0000 (09:30 +0200)
author Tony Luck <tony.luck@intel.com>
Mon, 8 Apr 2024 18:09:44 +0000 (11:09 -0700)
committer Ingo Molnar <mingo@kernel.org>
Tue, 9 Apr 2024 07:30:36 +0000 (09:30 +0200)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h

index de3118305838e9ee2e80f5d432a21d15c49edf61..dfd2e9699bd7318010ecf6dfec7066f43005057a 100644 (file)
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -13,6 +13,7 @@
  #define MCG_CTL_P              BIT_ULL(8)   /* MCG_CTL register available */
  #define MCG_EXT_P              BIT_ULL(9)   /* Extended registers available */
  #define MCG_CMCI_P             BIT_ULL(10)  /* CMCI supported */
+#define MCG_SEAM_NR            BIT_ULL(12)  /* MCG_STATUS_SEAM_NR supported */
  #define MCG_EXT_CNT_MASK       0xff0000     /* Number of Extended registers */
  #define MCG_EXT_CNT_SHIFT      16
  #define MCG_EXT_CNT(c)         (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
@@ -25,6 +26,7 @@
  #define MCG_STATUS_EIPV                BIT_ULL(1)   /* ip points to correct instruction */
  #define MCG_STATUS_MCIP                BIT_ULL(2)   /* machine check in progress */
  #define MCG_STATUS_LMCES       BIT_ULL(3)   /* LMCE signaled */
+#define MCG_STATUS_SEAM_NR     BIT_ULL(12)  /* Machine check inside SEAM non-root mode */
  
  /* MCG_EXT_CTL register defines */
  #define MCG_EXT_CTL_LMCE_EN    BIT_ULL(0) /* Enable LMCE */
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c

index 84d41be6d06ba4e79f49ae069f5f1d5ae20b00de..771a9f1832607ece76dfdd72f9ca04d80ef7b572 100644 (file)
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1593,6 +1593,24 @@ noinstr void do_machine_check(struct pt_regs *regs)
                 else
                         queue_task_work(&m, msg, kill_me_maybe);
  
+       } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) {
+               /*
+                * Saved RIP on stack makes it look like the machine check
+                * was taken in the kernel on the instruction following
+                * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+                * that the machine check was taken inside SEAM non-root
+                * mode.  CPU core has already marked that guest as dead.
+                * It is OK for the kernel to resume execution at the
+                * apparent point of the machine check as the fault did
+                * not occur there. Mark the page as poisoned so it won't
+                * be added to free list when the guest is terminated.
+                */
+               if (mce_usable_address(&m)) {
+                       struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT);
+
+                       if (p)
+                               SetPageHWPoison(p);
+               }
         } else {
                 /*
                  * Handle an MCE which has happened in kernel space but from
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c

index c4477162c07d134e977437d35251c506e4d1ee8b..fc8988cfe1c3644dce15eb2bdd28715beae778c6 100644 (file)
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -39,8 +39,8 @@ static struct severity {
         u64 mask;
         u64 result;
         unsigned char sev;
-       unsigned char mcgmask;
-       unsigned char mcgres;
+       unsigned short mcgmask;
+       unsigned short mcgres;
         unsigned char ser;
         unsigned char context;
         unsigned char excp;
@@ -173,6 +173,18 @@ static struct severity {
                 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
                 USER
                 ),
+       MCESEV(
+               AR, "Data load error in SEAM non-root mode",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+               MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+               KERNEL
+               ),
+       MCESEV(
+               AR, "Instruction fetch error in SEAM non-root mode",
+               SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+               MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+               KERNEL
+               ),
         MCESEV(
                 PANIC, "Data load in unrecoverable area of kernel",
                 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
author	Tony Luck <tony.luck@intel.com>
	Mon, 8 Apr 2024 18:09:44 +0000 (11:09 -0700)
committer	Ingo Molnar <mingo@kernel.org>
	Tue, 9 Apr 2024 07:30:36 +0000 (09:30 +0200)
arch/x86/include/asm/mce.h		patch | blob | history
arch/x86/kernel/cpu/mce/core.c		patch | blob | history
arch/x86/kernel/cpu/mce/severity.c		patch | blob | history