x86/retpoline: Simplify retpolines
authorPeter Zijlstra <peterz@infradead.org>
Fri, 26 Mar 2021 15:12:02 +0000 (16:12 +0100)
committerIngo Molnar <mingo@kernel.org>
Fri, 2 Apr 2021 10:42:04 +0000 (12:42 +0200)
Due to:

  c9c324dc22aa ("objtool: Support stack layout changes in alternatives")

it is now possible to simplify the retpolines.

Currently our retpolines consist of 2 symbols:

 - __x86_indirect_thunk_\reg: the compiler target
 - __x86_retpoline_\reg:  the actual retpoline.

Both are consecutive in code and aligned such that for any one register
they both live in the same cacheline:

  0000000000000000 <__x86_indirect_thunk_rax>:
   0:   ff e0                   jmpq   *%rax
   2:   90                      nop
   3:   90                      nop
   4:   90                      nop

  0000000000000005 <__x86_retpoline_rax>:
   5:   e8 07 00 00 00          callq  11 <__x86_retpoline_rax+0xc>
   a:   f3 90                   pause
   c:   0f ae e8                lfence
   f:   eb f9                   jmp    a <__x86_retpoline_rax+0x5>
  11:   48 89 04 24             mov    %rax,(%rsp)
  15:   c3                      retq
  16:   66 2e 0f 1f 84 00 00 00 00 00   nopw   %cs:0x0(%rax,%rax,1)

The thunk is an alternative_2, where one option is a JMP to the
retpoline. This was done so that objtool didn't need to deal with
alternatives with stack ops. But that problem has been solved, so now
it is possible to fold the entire retpoline into the alternative to
simplify and consolidate unused bytes:

  0000000000000000 <__x86_indirect_thunk_rax>:
   0:   ff e0                   jmpq   *%rax
   2:   90                      nop
   3:   90                      nop
   4:   90                      nop
   5:   90                      nop
   6:   90                      nop
   7:   90                      nop
   8:   90                      nop
   9:   90                      nop
   a:   90                      nop
   b:   90                      nop
   c:   90                      nop
   d:   90                      nop
   e:   90                      nop
   f:   90                      nop
  10:   90                      nop
  11:   66 66 2e 0f 1f 84 00 00 00 00 00        data16 nopw %cs:0x0(%rax,%rax,1)
  1c:   0f 1f 40 00             nopl   0x0(%rax)

Notice that since the longest alternative sequence is now:

   0:   e8 07 00 00 00          callq  c <.altinstr_replacement+0xc>
   5:   f3 90                   pause
   7:   0f ae e8                lfence
   a:   eb f9                   jmp    5 <.altinstr_replacement+0x5>
   c:   48 89 04 24             mov    %rax,(%rsp)
  10:   c3                      retq

17 bytes, we have 15 bytes NOP at the end of our 32 byte slot. (IOW, if
we can shrink the retpoline by 1 byte we can pack it more densely).

 [ bp: Massage commit message. ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20210326151259.506071949@infradead.org
arch/x86/include/asm/asm-prototypes.h
arch/x86/include/asm/nospec-branch.h
arch/x86/lib/retpoline.S
tools/objtool/check.c

index 51e2bf27cc9b0d9e078c0940645f1b1e8ef037d0..0545b07895a52483130f5308f4424006352adb35 100644 (file)
@@ -22,15 +22,8 @@ extern void cmpxchg8b_emu(void);
 #define DECL_INDIRECT_THUNK(reg) \
        extern asmlinkage void __x86_indirect_thunk_ ## reg (void);
 
-#define DECL_RETPOLINE(reg) \
-       extern asmlinkage void __x86_retpoline_ ## reg (void);
-
 #undef GEN
 #define GEN(reg) DECL_INDIRECT_THUNK(reg)
 #include <asm/GEN-for-each-reg.h>
 
-#undef GEN
-#define GEN(reg) DECL_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
-
 #endif /* CONFIG_RETPOLINE */
index 529f8e9380d8e240c50701776a4e25bd32f9603c..664be73daa8c1eeaa2d3432ccaacad8056f2a492 100644 (file)
@@ -80,7 +80,7 @@
 .macro JMP_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
        ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
-                     __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+                     __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
                      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
 #else
        jmp     *%\reg
@@ -90,7 +90,7 @@
 .macro CALL_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
        ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
-                     __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+                     __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
                      __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD
 #else
        call    *%\reg
        ALTERNATIVE_2(                                          \
        ANNOTATE_RETPOLINE_SAFE                                 \
        "call *%[thunk_target]\n",                              \
-       "call __x86_retpoline_%V[thunk_target]\n",              \
+       "call __x86_indirect_thunk_%V[thunk_target]\n",         \
        X86_FEATURE_RETPOLINE,                                  \
        "lfence;\n"                                             \
        ANNOTATE_RETPOLINE_SAFE                                 \
index 6bb74b5c238c61db1c23b8ec33152561bf1deaba..d2c0d14c1e221e20ef009d42a25e0e78e42ecc1d 100644 (file)
 #include <asm/unwind_hints.h>
 #include <asm/frame.h>
 
-.macro THUNK reg
-       .section .text.__x86.indirect_thunk
-
-       .align 32
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
-       JMP_NOSPEC \reg
-SYM_FUNC_END(__x86_indirect_thunk_\reg)
-
-SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
+.macro RETPOLINE reg
        ANNOTATE_INTRA_FUNCTION_CALL
-       call    .Ldo_rop_\@
+       call    .Ldo_rop_\@
 .Lspec_trap_\@:
        UNWIND_HINT_EMPTY
        pause
        lfence
-       jmp     .Lspec_trap_\@
+       jmp .Lspec_trap_\@
 .Ldo_rop_\@:
-       mov     %\reg, (%_ASM_SP)
+       mov     %\reg, (%_ASM_SP)
        UNWIND_HINT_FUNC
        ret
-SYM_FUNC_END(__x86_retpoline_\reg)
+.endm
+
+.macro THUNK reg
+       .section .text.__x86.indirect_thunk
+
+       .align 32
+SYM_FUNC_START(__x86_indirect_thunk_\reg)
+
+       ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
+                     __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
+                     __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+
+SYM_FUNC_END(__x86_indirect_thunk_\reg)
 
 .endm
 
@@ -48,7 +52,6 @@ SYM_FUNC_END(__x86_retpoline_\reg)
 
 #define __EXPORT_THUNK(sym)    _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
 #define EXPORT_THUNK(reg)      __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
-#define EXPORT_RETPOLINE(reg)  __EXPORT_THUNK(__x86_retpoline_ ## reg)
 
 #undef GEN
 #define GEN(reg) THUNK reg
@@ -58,6 +61,3 @@ SYM_FUNC_END(__x86_retpoline_\reg)
 #define GEN(reg) EXPORT_THUNK(reg)
 #include <asm/GEN-for-each-reg.h>
 
-#undef GEN
-#define GEN(reg) EXPORT_RETPOLINE(reg)
-#include <asm/GEN-for-each-reg.h>
index 5e5388a38e2a73878c9665285a72b6c43982adab..d45f0181f1332f838c6c8464e5ce7a15a6800493 100644 (file)
@@ -872,8 +872,7 @@ static int add_jump_destinations(struct objtool_file *file)
                } else if (reloc->sym->type == STT_SECTION) {
                        dest_sec = reloc->sym->sec;
                        dest_off = arch_dest_reloc_offset(reloc->addend);
-               } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) ||
-                          !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) {
+               } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21)) {
                        /*
                         * Retpoline jumps are really dynamic jumps in
                         * disguise, so convert them accordingly.