git.maquefel.me Git - linux.git/blob

1 // SPDX-License-Identifier: GPL-2.0-only

2 /*

3 * Kernel-based Virtual Machine driver for Linux

4 *

5 * This module enables machines with Intel VT-x extensions to run virtual

6 * machines without emulation or binary translation.

7 *

10 *

11 * Authors:

12 * Avi Kivity <avi@qumranet.com>

13 * Yaniv Kamay <yaniv@qumranet.com>

14 */

15

16 #include <linux/highmem.h>

17 #include <linux/hrtimer.h>

18 #include <linux/kernel.h>

19 #include <linux/kvm_host.h>

20 #include <linux/module.h>

21 #include <linux/moduleparam.h>

22 #include <linux/mod_devicetable.h>

23 #include <linux/mm.h>

24 #include <linux/objtool.h>

25 #include <linux/sched.h>

26 #include <linux/sched/smt.h>

27 #include <linux/slab.h>

28 #include <linux/tboot.h>

29 #include <linux/trace_events.h>

30 #include <linux/entry-kvm.h>

31

32 #include <asm/apic.h>

33 #include <asm/asm.h>

34 #include <asm/cpu.h>

35 #include <asm/cpu_device_id.h>

36 #include <asm/debugreg.h>

37 #include <asm/desc.h>

38 #include <asm/fpu/internal.h>

39 #include <asm/idtentry.h>

40 #include <asm/io.h>

41 #include <asm/irq_remapping.h>

42 #include <asm/kexec.h>

43 #include <asm/perf_event.h>

44 #include <asm/mmu_context.h>

45 #include <asm/mshyperv.h>

46 #include <asm/mwait.h>

47 #include <asm/spec-ctrl.h>

48 #include <asm/virtext.h>

49 #include <asm/vmx.h>

50

51 #include "capabilities.h"

52 #include "cpuid.h"

53 #include "evmcs.h"

54 #include "hyperv.h"

55 #include "kvm_onhyperv.h"

56 #include "irq.h"

57 #include "kvm_cache_regs.h"

58 #include "lapic.h"

59 #include "mmu.h"

60 #include "nested.h"

61 #include "pmu.h"

62 #include "sgx.h"

63 #include "trace.h"

64 #include "vmcs.h"

65 #include "vmcs12.h"

66 #include "vmx.h"

67 #include "x86.h"

68

69 MODULE_AUTHOR("Qumranet");

70 MODULE_LICENSE("GPL");

71

#ifdef MODULE
/*
 * x86 CPU match table with a single entry for CPUs that have
 * X86_FEATURE_VMX, terminated by an empty entry and published as this
 * module's x86cpu device table.  Only built when compiled as a module.
 */
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

79

80 bool __read_mostly enable_vpid = 1;

81 module_param_named(vpid, enable_vpid, bool, 0444);

82

83 static bool __read_mostly enable_vnmi = 1;

84 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

85

86 bool __read_mostly flexpriority_enabled = 1;

87 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

88

89 bool __read_mostly enable_ept = 1;

90 module_param_named(ept, enable_ept, bool, S_IRUGO);

91

92 bool __read_mostly enable_unrestricted_guest = 1;

93 module_param_named(unrestricted_guest,

94 enable_unrestricted_guest, bool, S_IRUGO);

95

96 bool __read_mostly enable_ept_ad_bits = 1;

97 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

98

99 static bool __read_mostly emulate_invalid_guest_state = true;

100 module_param(emulate_invalid_guest_state, bool, S_IRUGO);

101

102 static bool __read_mostly fasteoi = 1;

103 module_param(fasteoi, bool, S_IRUGO);

104

105 module_param(enable_apicv, bool, S_IRUGO);

106

107 /*

108 * If nested=1, nested virtualization is supported, i.e., guests may use

109 * VMX and be a hypervisor for its own guests. If nested=0, guests may not

110 * use VMX instructions.

111 */

112 static bool __read_mostly nested = 1;

113 module_param(nested, bool, S_IRUGO);

114

115 bool __read_mostly enable_pml = 1;

116 module_param_named(pml, enable_pml, bool, S_IRUGO);

117

118 static bool __read_mostly dump_invalid_vmcs = 0;

119 module_param(dump_invalid_vmcs, bool, 0644);

120

/*
 * MSR bitmap mode values.
 * NOTE(review): presumably bit flags describing whether x2APIC / APICv
 * pass-through is reflected in the MSR bitmap -- confirm at the users.
 */
#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

/* Largest TSC multiplier value: all 64 bits set. */
#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/*
 * Guest_tsc -> host_tsc conversion requires 64-bit division, hence the
 * "preemption_timer" parameter is only exposed on CONFIG_X86_64 builds.
 */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

/* Defined elsewhere; only exposed as a module parameter here. */
extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

/*
 * CR0/CR4 bit masks.
 * NOTE(review): judging by the names these are the bits KVM forces off/on
 * in the hardware CR0/CR4 used while running a guest, with separate sets
 * for unrestricted guest, protected mode and real mode -- confirm at the
 * users.
 */
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/* Complement of the RTIT_STATUS bits named below. */
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

152

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these, the x2APIC, PT and LBR MSRs are handled specially
 * (see is_valid_passthrough_msr()).
 *
 * The FS/GS base MSRs are only listed on CONFIG_X86_64 builds.  Entries
 * not named here are zero-initialized because the array is sized by
 * MAX_POSSIBLE_PASSTHROUGH_MSRS.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_TSC,
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

174

/*
 * These two parameters configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop.  Also indicates whether PLE is
 *             enabled.  According to tests, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to
 *             execute in a PAUSE loop.  Tests indicate that most spinlocks
 *             are held for less than 2^12 cycles.
 * Time is measured with a counter that runs at the same rate as the TSC;
 * refer to SDM volume 3b, sections 21.6.13 and 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles the per-vcpu window on every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets the per-vcpu window to ple_window on every exit. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so that we can never overflow. */
static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

207

/*
 * Static keys toggled by vmx_setup_l1d_flush(): the first is enabled for
 * every mode other than "never", the second only for "cond".
 */
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
/* Held around vmx_setup_l1d_flush() when the parameter is set at runtime. */
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

/*
 * Names of the L1D flush states, indexed by enum vmx_l1d_flush_state.
 * Only entries with for_parse set are accepted as user input; the others
 * are used solely when reporting the current state.
 */
static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

/* The flush buffer is 2^L1D_CACHE_ORDER pages, allocated lazily. */
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

229

230 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)

231 {

232 struct page *page;

233 unsigned int i;

234

235 if (!boot_cpu_has_bug(X86_BUG_L1TF)) {

236 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;

237 return 0;

238 }

239

240 if (!enable_ept) {

241 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;

242 return 0;

243 }

244

245 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {

246 u64 msr;

247

248 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);

249 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {

250 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;

251 return 0;

252 }

253 }

254

255 /* If set to auto use the default l1tf mitigation method */

256 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {

257 switch (l1tf_mitigation) {

258 case L1TF_MITIGATION_OFF:

259 l1tf = VMENTER_L1D_FLUSH_NEVER;

260 break;

261 case L1TF_MITIGATION_FLUSH_NOWARN:

262 case L1TF_MITIGATION_FLUSH:

263 case L1TF_MITIGATION_FLUSH_NOSMT:

264 l1tf = VMENTER_L1D_FLUSH_COND;

265 break;

266 case L1TF_MITIGATION_FULL:

267 case L1TF_MITIGATION_FULL_FORCE:

268 l1tf = VMENTER_L1D_FLUSH_ALWAYS;

269 break;

270 }

271 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {

272 l1tf = VMENTER_L1D_FLUSH_ALWAYS;

273 }

274

275 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&

276 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {

277 /*

278 * This allocation for vmx_l1d_flush_pages is not tied to a VM

279 * lifetime and so should not be charged to a memcg.

280 */

281 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);

282 if (!page)

283 return -ENOMEM;

284 vmx_l1d_flush_pages = page_address(page);

285

286 /*

287 * Initialize each page with a different pattern in

288 * order to protect against KSM in the nested

289 * virtualization case.

290 */

291 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {

292 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,

293 PAGE_SIZE);

294 }

295 }

296

297 l1tf_vmx_mitigation = l1tf;

298

299 if (l1tf != VMENTER_L1D_FLUSH_NEVER)

300 static_branch_enable(&vmx_l1d_should_flush);

301 else

302 static_branch_disable(&vmx_l1d_should_flush);

303

304 if (l1tf == VMENTER_L1D_FLUSH_COND)

305 static_branch_enable(&vmx_l1d_flush_cond);

306 else

307 static_branch_disable(&vmx_l1d_flush_cond);

308 return 0;

309 }

310

311 static int vmentry_l1d_flush_parse(const char *s)

312 {

313 unsigned int i;

314

315 if (s) {

316 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {

317 if (vmentry_l1d_param[i].for_parse &&

318 sysfs_streq(s, vmentry_l1d_param[i].option))

319 return i;

320 }

321 }

322 return -EINVAL;

323 }

324

325 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)

326 {

327 int l1tf, ret;

328

329 l1tf = vmentry_l1d_flush_parse(s);

330 if (l1tf < 0)

331 return l1tf;

332

333 if (!boot_cpu_has(X86_BUG_L1TF))

334 return 0;

335

336 /*

337 * Has vmx_init() run already? If not then this is the pre init

338 * parameter parsing. In that case just store the value and let

339 * vmx_init() do the proper setup after enable_ept has been

340 * established.

341 */

342 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {

343 vmentry_l1d_flush_param = l1tf;

344 return 0;

345 }

346

347 mutex_lock(&vmx_l1d_flush_mutex);

348 ret = vmx_setup_l1d_flush(l1tf);

349 mutex_unlock(&vmx_l1d_flush_mutex);

350 return ret;

351 }

352

353 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)

354 {

355 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))

356 return sprintf(s, "???\n");

357

358 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);

359 }

360

/*
 * Callbacks backing the "vmentry_l1d_flush" module parameter, which is
 * registered with mode 0644 and no private argument.
 */
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

366

367 static u32 vmx_segment_access_rights(struct kvm_segment *var);

368

369 void vmx_vmexit(void);

370

/*
 * Report a failed VMX instruction: WARN once, and additionally emit a
 * rate-limited pr_warn.  The format and its arguments are expanded twice,
 * so any argument expression may be evaluated more than once.
 */
#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

376

377 asmlinkage void vmread_error(unsigned long field, bool fault)

378 {

379 if (fault)

380 kvm_spurious_fault();

381 else

382 vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);

383 }

384

/*
 * Report a failed VMWRITE: logs the field, the value that was passed in and
 * the VM_INSTRUCTION_ERROR field read back from the VMCS.
 */
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

390

/* Report a failed VMCLEAR, logging the VMCS pointer and @phys_addr. */
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
}

395

/* Report a failed VMPTRLD, logging the VMCS pointer and @phys_addr. */
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
}

400

/* Report a failed INVVPID, logging the extent type, the VPID and the GVA. */
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

406

/* Report a failed INVEPT, logging the extent type, the EPTP and the GPA. */
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}

412

/* NOTE(review): presumably the per-CPU VMXON region -- confirm at the users. */
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
/*
 * Per-CPU pointer to a VMCS; __loaded_vmcs_clear() resets it to NULL when
 * it points at the VMCS being cleared.
 */
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/* Bitmap with VMX_NR_VPIDS bits and a spinlock declared alongside it. */
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

/* Global (non-static) VMCS configuration and VMX capability state. */
struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;

426

/*
 * Build the table entry for segment @seg: indexed by VCPU_SREG_<seg>, it
 * holds the VMCS field encodings GUEST_<seg>_{SELECTOR,BASE,LIMIT,AR_BYTES}.
 */
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

/*
 * VMCS field encodings of each guest segment register, indexed by
 * VCPU_SREG_*.  Used by the vmx_read_guest_seg_*() helpers.
 */
static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

450

/*
 * Invalidate the whole segment cache by clearing its validity bitmask; the
 * next vmx_read_guest_seg_*() call for any field re-reads it from the VMCS.
 */
static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

455

456 static unsigned long host_idt_base;

457

458 #if IS_ENABLED(CONFIG_HYPERV)

459 static bool __read_mostly enlightened_vmcs = true;

460 module_param(enlightened_vmcs, bool, 0444);

461

462 static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)

463 {

464 struct hv_enlightened_vmcs *evmcs;

465 struct hv_partition_assist_pg **p_hv_pa_pg =

466 &to_kvm_hv(vcpu->kvm)->hv_pa_pg;

467 /*

468 * Synthetic VM-Exit is not enabled in current code and so All

469 * evmcs in singe VM shares same assist page.

470 */

471 if (!*p_hv_pa_pg)

472 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);

473

474 if (!*p_hv_pa_pg)

475 return -ENOMEM;

476

477 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

478

479 evmcs->partition_assist_page =

480 __pa(*p_hv_pa_pg);

481 evmcs->hv_vm_id = (unsigned long)vcpu->kvm;

482 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

483

484 return 0;

485 }

486

487 #endif /* IS_ENABLED(CONFIG_HYPERV) */

488

/*
 * CPUID.1:EAX signatures matched by cpu_has_broken_vmx_preemption_timer().
 *
 * Format of the per-entry comments:
 *   document - erratum name - stepping - processor name.
 * Taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
 /* Xeon E3-1220 V2 */
0x000306A8,
};

523

524 static inline bool cpu_has_broken_vmx_preemption_timer(void)

525 {

526 u32 eax = cpuid_eax(0x00000001), i;

527

528 /* Clear the reserved bits */

529 eax &= ~(0x3U << 14 | 0xfU << 28);

530 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)

531 if (eax == vmx_preemption_cpu_tfms[i])

532 return true;

533

534 return false;

535 }

536

/*
 * True when flexpriority is enabled and the vCPU's local APIC is emulated
 * in the kernel.
 */
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

541

/* Report the current value of the flexpriority_enabled switch. */
static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

546

547 static int possible_passthrough_msr_slot(u32 msr)

548 {

549 u32 i;

550

551 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)

552 if (vmx_possible_passthrough_msrs[i] == msr)

553 return i;

554

555 return -ENOENT;

556 }

557

558 static bool is_valid_passthrough_msr(u32 msr)

559 {

560 bool r;

561

562 switch (msr) {

563 case 0x800 ... 0x8ff:

564 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */

565 return true;

566 case MSR_IA32_RTIT_STATUS:

567 case MSR_IA32_RTIT_OUTPUT_BASE:

568 case MSR_IA32_RTIT_OUTPUT_MASK:

569 case MSR_IA32_RTIT_CR3_MATCH:

570 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:

571 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */

572 case MSR_LBR_SELECT:

573 case MSR_LBR_TOS:

574 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:

575 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:

576 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:

577 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:

578 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:

579 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */

580 return true;

581 }

582

583 r = possible_passthrough_msr_slot(msr) != -ENOENT;

584

585 WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);

586

587 return r;

588 }

589

590 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)

591 {

592 int i;

593

594 i = kvm_find_user_return_msr(msr);

595 if (i >= 0)

596 return &vmx->guest_uret_msrs[i];

597 return NULL;

598 }

599

600 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,

601 struct vmx_uret_msr *msr, u64 data)

602 {

603 unsigned int slot = msr - vmx->guest_uret_msrs;

604 int ret = 0;

605

606 u64 old_msr_data = msr->data;

607 msr->data = data;

608 if (msr->load_into_hardware) {

609 preempt_disable();

610 ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);

611 preempt_enable();

612 if (ret)

613 msr->data = old_msr_data;

614 }

615 return ret;

616 }

617

618 #ifdef CONFIG_KEXEC_CORE

619 static void crash_vmclear_local_loaded_vmcss(void)

620 {

621 int cpu = raw_smp_processor_id();

622 struct loaded_vmcs *v;

623

624 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),

625 loaded_vmcss_on_cpu_link)

626 vmcs_clear(v->vmcs);

627 }

628 #endif /* CONFIG_KEXEC_CORE */

629

/*
 * VMCLEAR a loaded VMCS on the CPU it is currently loaded on and detach it
 * from that CPU's list.  Invoked through smp_call_function_single() by
 * loaded_vmcs_clear().
 *
 * @arg: the struct loaded_vmcs to clear.
 */
static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	/* Do not leave this CPU's current_vmcs pointing at a cleared VMCS. */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	/* The shadow VMCS is cleared only if it exists and was launched. */
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see cpu == -1 first and add
	 * loaded_vmcs to its percpu list before it's deleted from this cpu's
	 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

658

659 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)

660 {

661 int cpu = loaded_vmcs->cpu;

662

663 if (cpu != -1)

664 smp_call_function_single(cpu,

665 __loaded_vmcs_clear, loaded_vmcs, 1);

666 }

667

668 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,

669 unsigned field)

670 {

671 bool ret;

672 u32 mask = 1 << (seg * SEG_FIELD_NR + field);

673

674 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {

675 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);

676 vmx->segment_cache.bitmask = 0;

677 }

678 ret = vmx->segment_cache.bitmask & mask;

679 vmx->segment_cache.bitmask |= mask;

680 return ret;

681 }

682

683 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)

684 {

685 u16 *p = &vmx->segment_cache.seg[seg].selector;

686

687 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))

688 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);

689 return *p;

690 }

691

692 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)

693 {

694 ulong *p = &vmx->segment_cache.seg[seg].base;

695

696 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))

697 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);

698 return *p;

699 }

700

701 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)

702 {

703 u32 *p = &vmx->segment_cache.seg[seg].limit;

704

705 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))

706 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);

707 return *p;

708 }

709

710 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)

711 {

712 u32 *p = &vmx->segment_cache.seg[seg].ar;

713

714 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))

715 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);

716 return *p;

717 }

718

/*
 * Recompute the exception bitmap for @vcpu and write it to the current
 * VMCS.  When the vCPU is not in guest (nested) mode, the page-fault error
 * code mask/match fields are written as well.
 */
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	/* Exceptions that are intercepted by default. */
	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	/* #BP is intercepted only when software breakpoint debugging is on. */
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	/* With vm86 emulation of real mode active, intercept everything. */
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	/* Applied after the vm86 override, so it can still drop #PF. */
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/*
	 * When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;
	else {
		int mask = 0, match = 0;

		if (enable_ept && (eb & (1u << PF_VECTOR))) {
			/*
			 * If EPT is enabled, #PF is currently only intercepted
			 * if MAXPHYADDR is smaller on the guest than on the
			 * host.  In that case we only care about present,
			 * non-reserved faults.  For vmcs02, however, PFEC_MASK
			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
			 */
			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
			match = PFERR_PRESENT_MASK;
		}
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
	}

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

769

770 /*

771 * Check if MSR is intercepted for currently loaded MSR bitmap.

772 */

773 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)

774 {

775 unsigned long *msr_bitmap;

776 int f = sizeof(unsigned long);

777

778 if (!cpu_has_vmx_msr_bitmap())

779 return true;

780

781 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

782

783 if (msr <= 0x1fff) {

784 return !!test_bit(msr, msr_bitmap + 0x800 / f);

785 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {

786 msr &= 0x1fff;

787 return !!test_bit(msr, msr_bitmap + 0xc00 / f);

788 }

789

790 return true;

791 }

792

/*
 * Clear the given VM-entry control bit(s) @entry and VM-exit control
 * bit(s) @exit for @vmx.
 */
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

799

800 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)

801 {

802 unsigned int i;

803

804 for (i = 0; i < m->nr; ++i) {

805 if (m->val[i].index == msr)

806 return i;

807 }

808 return -ENOENT;

809 }

810

811 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)

812 {

813 int i;

814 struct msr_autoload *m = &vmx->msr_autoload;

815

816 switch (msr) {

817 case MSR_EFER:

818 if (cpu_has_load_ia32_efer()) {

819 clear_atomic_switch_msr_special(vmx,

820 VM_ENTRY_LOAD_IA32_EFER,

821 VM_EXIT_LOAD_IA32_EFER);

822 return;

823 }

824 break;

825 case MSR_CORE_PERF_GLOBAL_CTRL:

826 if (cpu_has_load_perf_global_ctrl()) {

827 clear_atomic_switch_msr_special(vmx,

828 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,

829 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);

830 return;

831 }

832 break;

833 }

834 i = vmx_find_loadstore_msr_slot(&m->guest, msr);

835 if (i < 0)

836 goto skip_guest;

837 --m->guest.nr;

838 m->guest.val[i] = m->guest.val[m->guest.nr];

839 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

840

841 skip_guest:

842 i = vmx_find_loadstore_msr_slot(&m->host, msr);

843 if (i < 0)

844 return;

845

846 --m->host.nr;

847 m->host.val[i] = m->host.val[m->host.nr];

848 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);

849 }

850

/*
 * Switch an MSR through dedicated VMCS fields and controls instead of the
 * autoload lists: write the guest value to @guest_val_vmcs, the host value
 * to @host_val_vmcs, then set the VM-entry control @entry and the VM-exit
 * control @exit.
 */
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	/*
	 * The host value is not written for HOST_IA32_EFER.
	 * NOTE(review): presumably that field is programmed elsewhere --
	 * confirm.
	 */
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

862

/*
 * Arrange for @msr to be switched atomically: @guest_val is loaded on VM
 * entry and, unless @entry_only is set, @host_val is loaded on VM exit.
 *
 * EFER and PERF_GLOBAL_CTRL use the dedicated VMCS fields/controls when the
 * CPU supports them.  Everything else goes through the guest/host autoload
 * lists; an existing entry for @msr is updated in place, otherwise a new
 * one is appended.  If a required list is already full
 * (MAX_NR_LOADSTORE_MSRS), a one-time warning is printed and nothing is
 * added.
 */
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	/*
	 * j stays 0 when entry_only is set, so the "host list full" check
	 * below can never trigger for entry-only MSRs.
	 */
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/*
		 * PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	/* Look for existing entries so they are updated rather than duplicated. */
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		/* Append a new guest entry and publish the new count. */
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		/* Append a new host entry and publish the new count. */
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

928

/*
 * Decide how EFER is switched between host and guest for @vmx and program
 * the chosen mechanism.
 *
 * Returns true when the guest EFER value and mask were stored in the
 * vCPU's user-return MSR entry for MSR_EFER.  Returns false when EFER is
 * handled through the atomic switch path (set up or cleared, depending on
 * whether guest and host values differ), or when MSR_EFER has no
 * user-return slot.
 */
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		/* Outside long mode, do not expose LME either. */
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	/* Drop any stale atomic-switch setup for EFER. */
	clear_atomic_switch_msr(vmx, MSR_EFER);

	/* Take the ignored bits from the host value. */
	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

981

982 #ifdef CONFIG_X86_32

983 /*

984 * On 32-bit kernels, VM exits still load the FS and GS bases from the

985 * VMCS rather than the segment table. KVM uses this helper to figure

986 * out the current bases to poke them into the VMCS before entry.

987 */

988 static unsigned long segment_base(u16 selector)

989 {

990 struct desc_struct *table;

991 unsigned long v;

992

993 if (!(selector & ~SEGMENT_RPL_MASK))

994 return 0;

995

996 table = get_current_gdt_ro();

997

998 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {

999 u16 ldt_selector = kvm_read_ldt();

1000

1001 if (!(ldt_selector & ~SEGMENT_RPL_MASK))

1002 return 0;

1003

1004 table = (struct desc_struct *)segment_base(ldt_selector);

1005 }

1006 v = get_desc_base(&table[selector >> 3]);

1007 return v;

1008 }

1009 #endif

1010

/*
 * True when PT runs in host-guest mode and RTIT_CTL_TRACEEN is clear in
 * the guest's cached RTIT_CTL value.
 */
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

1016

/* Validate a guest-provided PT output base address for @vcpu. */
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

1022

/*
 * Write the PT context @ctx to the hardware RTIT MSRs: status, output
 * base/mask, CR3 match and the first @addr_range address range pairs.
 */
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	/* The ADDRn_A/ADDRn_B MSRs of range n are at offset 2 * n. */
	for (i = 0; i < addr_range; i++) {
		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

1036

1037 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)

1038 {

1039 u32 i;

1040

1041 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);

1042 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);

1043 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);

1044 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);

1045 for (i = 0; i < addr_range; i++) {

1046 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);

1047 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);

1048 }

1049 }

1050

1051 static void pt_guest_enter(struct vcpu_vmx *vmx)

1052 {

1053 if (vmx_pt_mode_is_system())

1054 return;

1055

1056 /*

1057 * GUEST_IA32_RTIT_CTL is already set in the VMCS.

1058 * Save host state before VM entry.

1059 */

1060 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);

1061 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {

1062 wrmsrl(MSR_IA32_RTIT_CTL, 0);

1063 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);

1064 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);

1065 }

1066 }

1067

1068 static void pt_guest_exit(struct vcpu_vmx *vmx)

1069 {

1070 if (vmx_pt_mode_is_system())

1071 return;

1072

1073 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {

1074 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);

1075 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);

1076 }

1077

1078 /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */

1079 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);

1080 }

1081

1082 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,

1083 unsigned long fs_base, unsigned long gs_base)

1084 {

1085 if (unlikely(fs_sel != host->fs_sel)) {

1086 if (!(fs_sel & 7))

1087 vmcs_write16(HOST_FS_SELECTOR, fs_sel);

1088 else

1089 vmcs_write16(HOST_FS_SELECTOR, 0);

1090 host->fs_sel = fs_sel;

1091 }

1092 if (unlikely(gs_sel != host->gs_sel)) {

1093 if (!(gs_sel & 7))

1094 vmcs_write16(HOST_GS_SELECTOR, gs_sel);

1095 else

1096 vmcs_write16(HOST_GS_SELECTOR, 0);

1097 host->gs_sel = gs_sel;

1098 }

1099 if (unlikely(fs_base != host->fs_base)) {

1100 vmcs_writel(HOST_FS_BASE, fs_base);

1101 host->fs_base = fs_base;

1102 }

1103 if (unlikely(gs_base != host->gs_base)) {

1104 vmcs_writel(HOST_GS_BASE, gs_base);

1105 host->gs_base = gs_base;

1106 }

1107 }

1108

1109 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)

1110 {

1111 struct vcpu_vmx *vmx = to_vmx(vcpu);

1112 struct vmcs_host_state *host_state;

1113 #ifdef CONFIG_X86_64

1114 int cpu = raw_smp_processor_id();

1115 #endif

1116 unsigned long fs_base, gs_base;

1117 u16 fs_sel, gs_sel;

1118 int i;

1119

1120 vmx->req_immediate_exit = false;

1121

1122 /*

1123 * Note that guest MSRs to be saved/restored can also be changed

1124 * when guest state is loaded. This happens when guest transitions

1125 * to/from long-mode by setting MSR_EFER.LMA.

1126 */

1127 if (!vmx->guest_uret_msrs_loaded) {

1128 vmx->guest_uret_msrs_loaded = true;

1129 for (i = 0; i < kvm_nr_uret_msrs; ++i) {

1130 if (!vmx->guest_uret_msrs[i].load_into_hardware)

1131 continue;

1132

1133 kvm_set_user_return_msr(i,

1134 vmx->guest_uret_msrs[i].data,

1135 vmx->guest_uret_msrs[i].mask);

1136 }

1137 }

1138

1139 if (vmx->nested.need_vmcs12_to_shadow_sync)

1140 nested_sync_vmcs12_to_shadow(vcpu);

1141

1142 if (vmx->guest_state_loaded)

1143 return;

1144

1145 host_state = &vmx->loaded_vmcs->host_state;

1146

1147 /*

1148 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not

1149 * allow segment selectors with cpl > 0 or ti == 1.

1150 */

1151 host_state->ldt_sel = kvm_read_ldt();

1152

1153 #ifdef CONFIG_X86_64

1154 savesegment(ds, host_state->ds_sel);

1155 savesegment(es, host_state->es_sel);

1156

1157 gs_base = cpu_kernelmode_gs_base(cpu);

1158 if (likely(is_64bit_mm(current->mm))) {

1159 current_save_fsgs();

1160 fs_sel = current->thread.fsindex;

1161 gs_sel = current->thread.gsindex;

1162 fs_base = current->thread.fsbase;

1163 vmx->msr_host_kernel_gs_base = current->thread.gsbase;

1164 } else {

1165 savesegment(fs, fs_sel);

1166 savesegment(gs, gs_sel);

1167 fs_base = read_msr(MSR_FS_BASE);

1168 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);

1169 }

1170

1171 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);

1172 #else

1173 savesegment(fs, fs_sel);

1174 savesegment(gs, gs_sel);

1175 fs_base = segment_base(fs_sel);

1176 gs_base = segment_base(gs_sel);

1177 #endif

1178

1179 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);

1180 vmx->guest_state_loaded = true;

1181 }

1182

1183 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)

1184 {

1185 struct vmcs_host_state *host_state;

1186

1187 if (!vmx->guest_state_loaded)

1188 return;

1189

1190 host_state = &vmx->loaded_vmcs->host_state;

1191

1192 ++vmx->vcpu.stat.host_state_reload;

1193

1194 #ifdef CONFIG_X86_64

1195 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);

1196 #endif

1197 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {

1198 kvm_load_ldt(host_state->ldt_sel);

1199 #ifdef CONFIG_X86_64

1200 load_gs_index(host_state->gs_sel);

1201 #else

1202 loadsegment(gs, host_state->gs_sel);

1203 #endif

1204 }

1205 if (host_state->fs_sel & 7)

1206 loadsegment(fs, host_state->fs_sel);

1207 #ifdef CONFIG_X86_64

1208 if (unlikely(host_state->ds_sel | host_state->es_sel)) {

1209 loadsegment(ds, host_state->ds_sel);

1210 loadsegment(es, host_state->es_sel);

1211 }

1212 #endif

1213 invalidate_tss_limit();

1214 #ifdef CONFIG_X86_64

1215 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);

1216 #endif

1217 load_fixmap_gdt(raw_smp_processor_id());

1218 vmx->guest_state_loaded = false;

1219 vmx->guest_uret_msrs_loaded = false;

1220 }

1221

1222 #ifdef CONFIG_X86_64

1223 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)

1224 {

1225 preempt_disable();

1226 if (vmx->guest_state_loaded)

1227 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);

1228 preempt_enable();

1229 return vmx->msr_guest_kernel_gs_base;

1230 }

1231

1232 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)

1233 {

1234 preempt_disable();

1235 if (vmx->guest_state_loaded)

1236 wrmsrl(MSR_KERNEL_GS_BASE, data);

1237 preempt_enable();

1238 vmx->msr_guest_kernel_gs_base = data;

1239 }

1240 #endif

1241

1242 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,

1243 struct loaded_vmcs *buddy)

1244 {

1245 struct vcpu_vmx *vmx = to_vmx(vcpu);

1246 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;

1247 struct vmcs *prev;

1248

1249 if (!already_loaded) {

1250 loaded_vmcs_clear(vmx->loaded_vmcs);

1251 local_irq_disable();

1252

1253 /*

1254 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to

1255 * this cpu's percpu list, otherwise it may not yet be deleted

1256 * from its previous cpu's percpu list. Pairs with the

1257 * smb_wmb() in __loaded_vmcs_clear().

1258 */

1259 smp_rmb();

1260

1261 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,

1262 &per_cpu(loaded_vmcss_on_cpu, cpu));

1263 local_irq_enable();

1264 }

1265

1266 prev = per_cpu(current_vmcs, cpu);

1267 if (prev != vmx->loaded_vmcs->vmcs) {

1268 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;

1269 vmcs_load(vmx->loaded_vmcs->vmcs);

1270

1271 /*

1272 * No indirect branch prediction barrier needed when switching

1273 * the active VMCS within a guest, e.g. on nested VM-Enter.

1274 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.

1275 */

1276 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))

1277 indirect_branch_prediction_barrier();

1278 }

1279

1280 if (!already_loaded) {

1281 void *gdt = get_current_gdt_ro();

1282 unsigned long sysenter_esp;

1283

1284 /*

1285 * Flush all EPTP/VPID contexts, the new pCPU may have stale

1286 * TLB entries from its previous association with the vCPU.

1287 */

1288 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

1289

1290 /*

1291 * Linux uses per-cpu TSS and GDT, so set these when switching

1292 * processors. See 22.2.4.

1293 */

1294 vmcs_writel(HOST_TR_BASE,

1295 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);

1296 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */

1297

1298 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);

1299 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

1300

1301 vmx->loaded_vmcs->cpu = cpu;

1302 }

1303 }

1304

1305 /*

1306 * Switches to specified vcpu, until a matching vcpu_put(), but assumes

1307 * vcpu mutex is already taken.

1308 */

1309 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)

1310 {

1311 struct vcpu_vmx *vmx = to_vmx(vcpu);

1312

1313 vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

1314

1315 vmx_vcpu_pi_load(vcpu, cpu);

1316

1317 vmx->host_debugctlmsr = get_debugctlmsr();

1318 }

1319

/*
 * Counterpart of vmx_vcpu_load(): run the posted-interrupt put hook and
 * then restore host state for this vCPU.
 */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_vcpu_pi_put(vcpu);
	vmx_prepare_switch_to_host(vmx);
}

1326

1327 static bool emulation_required(struct kvm_vcpu *vcpu)

1328 {

1329 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);

1330 }

1331

1332 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)

1333 {

1334 struct vcpu_vmx *vmx = to_vmx(vcpu);

1335 unsigned long rflags, save_rflags;

1336

1337 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {

1338 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);

1339 rflags = vmcs_readl(GUEST_RFLAGS);

1340 if (vmx->rmode.vm86_active) {

1341 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;

1342 save_rflags = vmx->rmode.save_rflags;

1343 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;

1344 }

1345 vmx->rflags = rflags;

1346 }

1347 return vmx->rflags;

1348 }

1349

1350 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)

1351 {

1352 struct vcpu_vmx *vmx = to_vmx(vcpu);

1353 unsigned long old_rflags;

1354

1355 if (is_unrestricted_guest(vcpu)) {

1356 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);

1357 vmx->rflags = rflags;

1358 vmcs_writel(GUEST_RFLAGS, rflags);

1359 return;

1360 }

1361

1362 old_rflags = vmx_get_rflags(vcpu);

1363 vmx->rflags = rflags;

1364 if (vmx->rmode.vm86_active) {

1365 vmx->rmode.save_rflags = rflags;

1366 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

1367 }

1368 vmcs_writel(GUEST_RFLAGS, rflags);

1369

1370 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)

1371 vmx->emulation_required = emulation_required(vcpu);

1372 }

1373

1374 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)

1375 {

1376 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);

1377 int ret = 0;

1378

1379 if (interruptibility & GUEST_INTR_STATE_STI)

1380 ret |= KVM_X86_SHADOW_INT_STI;

1381 if (interruptibility & GUEST_INTR_STATE_MOV_SS)

1382 ret |= KVM_X86_SHADOW_INT_MOV_SS;

1383

1384 return ret;

1385 }

1386

1387 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)

1388 {

1389 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);

1390 u32 interruptibility = interruptibility_old;

1391

1392 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

1393

1394 if (mask & KVM_X86_SHADOW_INT_MOV_SS)

1395 interruptibility |= GUEST_INTR_STATE_MOV_SS;

1396 else if (mask & KVM_X86_SHADOW_INT_STI)

1397 interruptibility |= GUEST_INTR_STATE_STI;

1398

1399 if ((interruptibility != interruptibility_old))

1400 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);

1401 }

1402

1403 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)

1404 {

1405 struct vcpu_vmx *vmx = to_vmx(vcpu);

1406 unsigned long value;

1407

1408 /*

1409 * Any MSR write that attempts to change bits marked reserved will

1410 * case a #GP fault.

1411 */

1412 if (data & vmx->pt_desc.ctl_bitmask)

1413 return 1;

1414

1415 /*

1416 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will

1417 * result in a #GP unless the same write also clears TraceEn.

1418 */

1419 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&

1420 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))

1421 return 1;

1422

1423 /*

1424 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit

1425 * and FabricEn would cause #GP, if

1426 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0

1427 */

1428 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&

1429 !(data & RTIT_CTL_FABRIC_EN) &&

1430 !intel_pt_validate_cap(vmx->pt_desc.caps,

1431 PT_CAP_single_range_output))

1432 return 1;

1433

1434 /*

1435 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that

1436 * utilize encodings marked reserved will cause a #GP fault.

1437 */

1438 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);

1439 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&

1440 !test_bit((data & RTIT_CTL_MTC_RANGE) >>

1441 RTIT_CTL_MTC_RANGE_OFFSET, &value))

1442 return 1;

1443 value = intel_pt_validate_cap(vmx->pt_desc.caps,

1444 PT_CAP_cycle_thresholds);

1445 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&

1446 !test_bit((data & RTIT_CTL_CYC_THRESH) >>

1447 RTIT_CTL_CYC_THRESH_OFFSET, &value))

1448 return 1;

1449 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);

1450 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&

1451 !test_bit((data & RTIT_CTL_PSB_FREQ) >>

1452 RTIT_CTL_PSB_FREQ_OFFSET, &value))

1453 return 1;

1454

1455 /*

1456 * If ADDRx_CFG is reserved or the encodings is >2 will

1457 * cause a #GP fault.

1458 */

1459 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;

1460 if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))

1461 return 1;

1462 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;

1463 if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))

1464 return 1;

1465 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;

1466 if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))

1467 return 1;

1468 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;

1469 if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))

1470 return 1;

1471

1472 return 0;

1473 }

1474

1475 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)

1476 {

1477 /*

1478 * Emulation of instructions in SGX enclaves is impossible as RIP does

1479 * not point tthe failing instruction, and even if it did, the code

1480 * stream is inaccessible. Inject #UD instead of exiting to userspace

1481 * so that guest userspace can't DoS the guest simply by triggering

1482 * emulation (enclaves are CPL3 only).

1483 */

1484 if (to_vmx(vcpu)->exit_reason.enclave_mode) {

1485 kvm_queue_exception(vcpu, UD_VECTOR);

1486 return false;

1487 }

1488 return true;

1489 }

1490

1491 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)

1492 {

1493 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;

1494 unsigned long rip, orig_rip;

1495 u32 instr_len;

1496

1497 /*

1498 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on

1499 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be

1500 * set when EPT misconfig occurs. In practice, real hardware updates

1501 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors

1502 * (namely Hyper-V) don't set it due to it being undefined behavior,

1503 * i.e. we end up advancing IP with some random value.

1504 */

1505 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||

1506 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {

1507 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

1508

1509 /*

1510 * Emulating an enclave's instructions isn't supported as KVM

1511 * cannot access the enclave's memory or its true RIP, e.g. the

1512 * vmcs.GUEST_RIP points at the exit point of the enclave, not

1513 * the RIP that actually triggered the VM-Exit. But, because

1514 * most instructions that cause VM-Exit will #UD in an enclave,

1515 * most instruction-based VM-Exits simply do not occur.

1516 *

1517 * There are a few exceptions, notably the debug instructions

1518 * INT1ICEBRK and INT3, as they are allowed in debug enclaves

1519 * and generate #DB/#BP as expected, which KVM might intercept.

1520 * But again, the CPU does the dirty work and saves an instr

1521 * length of zero so VMMs don't shoot themselves in the foot.

1522 * WARN if KVM tries to skip a non-zero length instruction on

1523 * a VM-Exit from an enclave.

1524 */

1525 if (!instr_len)

1526 goto rip_updated;

1527

1528 WARN(exit_reason.enclave_mode,

1529 "KVM: skipping instruction after SGX enclave VM-Exit");

1530

1531 orig_rip = kvm_rip_read(vcpu);

1532 rip = orig_rip + instr_len;

1533 #ifdef CONFIG_X86_64

1534 /*

1535 * We need to mask out the high 32 bits of RIP if not in 64-bit

1536 * mode, but just finding out that we are in 64-bit mode is

1537 * quite expensive. Only do it if there was a carry.

1538 */

1539 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))

1540 rip = (u32)rip;

1541 #endif

1542 kvm_rip_write(vcpu, rip);

1543 } else {

1544 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))

1545 return 0;

1546 }

1547

1548 rip_updated:

1549 /* skipping an emulated instruction also counts */

1550 vmx_set_interrupt_shadow(vcpu, 0);

1551

1552 return 1;

1553 }

1554

1555 /*

1556 * Recognizes a pending MTF VM-exit and records the nested state for later

1557 * delivery.

1558 */

1559 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)

1560 {

1561 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

1562 struct vcpu_vmx *vmx = to_vmx(vcpu);

1563

1564 if (!is_guest_mode(vcpu))

1565 return;

1566

1567 /*

1568 * Per the SDM, MTF takes priority over debug-trap exceptions besides

1569 * T-bit traps. As instruction emulation is completed (i.e. at the

1570 * instruction boundary), any #DB exception pending delivery must be a

1571 * debug-trap. Record the pending MTF state to be delivered in

1572 * vmx_check_nested_events().

1573 */

1574 if (nested_cpu_has_mtf(vmcs12) &&

1575 (!vcpu->arch.exception.pending ||

1576 vcpu->arch.exception.nr == DB_VECTOR))

1577 vmx->nested.mtf_pending = true;

1578 else

1579 vmx->nested.mtf_pending = false;

1580 }

1581

/*
 * Skip the current instruction, first updating the pending-MTF state for
 * nested guests.  Returns whatever skip_emulated_instruction() returns.
 */
static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	int ret;

	vmx_update_emulated_instruction(vcpu);
	ret = skip_emulated_instruction(vcpu);

	return ret;
}

1587

1588 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)

1589 {

1590 /*

1591 * Ensure that we clear the HLT state in the VMCS. We don't need to

1592 * explicitly skip the instruction because if the HLT state is set,

1593 * then the instruction is already executing and RIP has already been

1594 * advanced.

1595 */

1596 if (kvm_hlt_in_guest(vcpu->kvm) &&

1597 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)

1598 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);

1599 }

1600

1601 static void vmx_queue_exception(struct kvm_vcpu *vcpu)

1602 {

1603 struct vcpu_vmx *vmx = to_vmx(vcpu);

1604 unsigned nr = vcpu->arch.exception.nr;

1605 bool has_error_code = vcpu->arch.exception.has_error_code;

1606 u32 error_code = vcpu->arch.exception.error_code;

1607 u32 intr_info = nr | INTR_INFO_VALID_MASK;

1608

1609 kvm_deliver_exception_payload(vcpu);

1610

1611 if (has_error_code) {

1612 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);

1613 intr_info |= INTR_INFO_DELIVER_CODE_MASK;

1614 }

1615

1616 if (vmx->rmode.vm86_active) {

1617 int inc_eip = 0;

1618 if (kvm_exception_is_soft(nr))

1619 inc_eip = vcpu->arch.event_exit_inst_len;

1620 kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);

1621 return;

1622 }

1623

1624 WARN_ON_ONCE(vmx->emulation_required);

1625

1626 if (kvm_exception_is_soft(nr)) {

1627 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,

1628 vmx->vcpu.arch.event_exit_inst_len);

1629 intr_info |= INTR_TYPE_SOFT_EXCEPTION;

1630 } else

1631 intr_info |= INTR_TYPE_HARD_EXCEPTION;

1632

1633 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

1634

1635 vmx_clear_hlt(vcpu);

1636 }

1637

1638 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,

1639 bool load_into_hardware)

1640 {

1641 struct vmx_uret_msr *uret_msr;

1642

1643 uret_msr = vmx_find_uret_msr(vmx, msr);

1644 if (!uret_msr)

1645 return;

1646

1647 uret_msr->load_into_hardware = load_into_hardware;

1648 }

1649

1650 /*

1651 * Set up the vmcs to automatically save and restore system

1652 * msrs. Don't touch the 64-bit msrs if the guest is in legacy

1653 * mode, as fiddling with msrs is very expensive.

1654 */

1655 static void setup_msrs(struct vcpu_vmx *vmx)

1656 {

1657 #ifdef CONFIG_X86_64

1658 bool load_syscall_msrs;

1659

1660 /*

1661 * The SYSCALL MSRs are only needed on long mode guests, and only

1662 * when EFER.SCE is set.

1663 */

1664 load_syscall_msrs = is_long_mode(&vmx->vcpu) &&

1665 (vmx->vcpu.arch.efer & EFER_SCE);

1666

1667 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);

1668 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);

1669 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);

1670 #endif

1671 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

1672

1673 vmx_setup_uret_msr(vmx, MSR_TSC_AUX,

1674 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||

1675 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));

1676

1677 /*

1678 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new

1679 * kernel and old userspace. If those guests run on a tsx=off host, do

1680 * allow guests to use TSX_CTRL, but don't change the value in hardware

1681 * so that TSX remains always disabled.

1682 */

1683 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));

1684

1685 if (cpu_has_vmx_msr_bitmap())

1686 vmx_update_msr_bitmap(&vmx->vcpu);

1687

1688 /*

1689 * The set of MSRs to load may have changed, reload MSRs before the

1690 * next VM-Enter.

1691 */

1692 vmx->guest_uret_msrs_loaded = false;

1693 }

1694

1695 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)

1696 {

1697 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

1698

1699 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))

1700 return vmcs12->tsc_offset;

1701

1702 return 0;

1703 }

1704

1705 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)

1706 {

1707 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

1708

1709 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&

1710 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))

1711 return vmcs12->tsc_multiplier;

1712

1713 return kvm_default_tsc_scaling_ratio;

1714 }

1715

1716 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)

1717 {

1718 vmcs_write64(TSC_OFFSET, offset);

1719 }

1720

1721 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)

1722 {

1723 vmcs_write64(TSC_MULTIPLIER, multiplier);

1724 }

1725

1726 /*

1727 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX

1728 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for

1729 * all guests if the "nested" module option is off, and can also be disabled

1730 * for a single guest by disabling its VMX cpuid bit.

1731 */

1732 bool nested_vmx_allowed(struct kvm_vcpu *vcpu)

1733 {

1734 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);

1735 }

1736

1737 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,

1738 uint64_t val)

1739 {

1740 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;

1741

1742 return !(val & ~valid_bits);

1743 }

1744

1745 static int vmx_get_msr_feature(struct kvm_msr_entry *msr)

1746 {

1747 switch (msr->index) {

1748 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:

1749 if (!nested)

1750 return 1;

1751 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);

1752 case MSR_IA32_PERF_CAPABILITIES:

1753 msr->data = vmx_get_perf_capabilities();

1754 return 0;

1755 default:

1756 return KVM_MSR_RET_INVALID;

1757 }

1758 }

1759

1760 /*

1761 * Reads an msr value (of 'msr_index') into 'pdata'.

1762 * Returns 0 on success, non-0 otherwise.

1763 * Assumes vcpu_load() was already called.

1764 */

1765 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

1766 {

1767 struct vcpu_vmx *vmx = to_vmx(vcpu);

1768 struct vmx_uret_msr *msr;

1769 u32 index;

1770

1771 switch (msr_info->index) {

1772 #ifdef CONFIG_X86_64

1773 case MSR_FS_BASE:

1774 msr_info->data = vmcs_readl(GUEST_FS_BASE);

1775 break;

1776 case MSR_GS_BASE:

1777 msr_info->data = vmcs_readl(GUEST_GS_BASE);

1778 break;

1779 case MSR_KERNEL_GS_BASE:

1780 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);

1781 break;

1782 #endif

1783 case MSR_EFER:

1784 return kvm_get_msr_common(vcpu, msr_info);

1785 case MSR_IA32_TSX_CTRL:

1786 if (!msr_info->host_initiated &&

1787 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))

1788 return 1;

1789 goto find_uret_msr;

1790 case MSR_IA32_UMWAIT_CONTROL:

1791 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))

1792 return 1;

1793

1794 msr_info->data = vmx->msr_ia32_umwait_control;

1795 break;

1796 case MSR_IA32_SPEC_CTRL:

1797 if (!msr_info->host_initiated &&

1798 !guest_has_spec_ctrl_msr(vcpu))

1799 return 1;

1800

1801 msr_info->data = to_vmx(vcpu)->spec_ctrl;

1802 break;

1803 case MSR_IA32_SYSENTER_CS:

1804 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);

1805 break;

1806 case MSR_IA32_SYSENTER_EIP:

1807 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);

1808 break;

1809 case MSR_IA32_SYSENTER_ESP:

1810 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);

1811 break;

1812 case MSR_IA32_BNDCFGS:

1813 if (!kvm_mpx_supported() ||

1814 (!msr_info->host_initiated &&

1815 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))

1816 return 1;

1817 msr_info->data = vmcs_read64(GUEST_BNDCFGS);

1818 break;

1819 case MSR_IA32_MCG_EXT_CTL:

1820 if (!msr_info->host_initiated &&

1821 !(vmx->msr_ia32_feature_control &

1822 FEAT_CTL_LMCE_ENABLED))

1823 return 1;

1824 msr_info->data = vcpu->arch.mcg_ext_ctl;

1825 break;

1826 case MSR_IA32_FEAT_CTL:

1827 msr_info->data = vmx->msr_ia32_feature_control;

1828 break;

1829 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:

1830 if (!msr_info->host_initiated &&

1831 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))

1832 return 1;

1833 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash

1834 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];

1835 break;

1836 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:

1837 if (!nested_vmx_allowed(vcpu))

1838 return 1;

1839 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,

1840 &msr_info->data))

1841 return 1;

1842 /*

1843 * Enlightened VMCS v1 doesn't have certain fields, but buggy

1844 * Hyper-V versions are still trying to use corresponding

1845 * features when they are exposed. Filter out the essential

1846 * minimum.

1847 */

1848 if (!msr_info->host_initiated &&

1849 vmx->nested.enlightened_vmcs_enabled)

1850 nested_evmcs_filter_control_msr(msr_info->index,

1851 &msr_info->data);

1852 break;

1853 case MSR_IA32_RTIT_CTL:

1854 if (!vmx_pt_mode_is_host_guest())

1855 return 1;

1856 msr_info->data = vmx->pt_desc.guest.ctl;

1857 break;

1858 case MSR_IA32_RTIT_STATUS:

1859 if (!vmx_pt_mode_is_host_guest())

1860 return 1;

1861 msr_info->data = vmx->pt_desc.guest.status;

1862 break;

1863 case MSR_IA32_RTIT_CR3_MATCH:

1864 if (!vmx_pt_mode_is_host_guest() ||

1865 !intel_pt_validate_cap(vmx->pt_desc.caps,

1866 PT_CAP_cr3_filtering))

1867 return 1;

1868 msr_info->data = vmx->pt_desc.guest.cr3_match;

1869 break;

1870 case MSR_IA32_RTIT_OUTPUT_BASE:

1871 if (!vmx_pt_mode_is_host_guest() ||

1872 (!intel_pt_validate_cap(vmx->pt_desc.caps,

1873 PT_CAP_topa_output) &&

1874 !intel_pt_validate_cap(vmx->pt_desc.caps,

1875 PT_CAP_single_range_output)))

1876 return 1;

1877 msr_info->data = vmx->pt_desc.guest.output_base;

1878 break;

1879 case MSR_IA32_RTIT_OUTPUT_MASK:

1880 if (!vmx_pt_mode_is_host_guest() ||

1881 (!intel_pt_validate_cap(vmx->pt_desc.caps,

1882 PT_CAP_topa_output) &&

1883 !intel_pt_validate_cap(vmx->pt_desc.caps,

1884 PT_CAP_single_range_output)))

1885 return 1;

1886 msr_info->data = vmx->pt_desc.guest.output_mask;

1887 break;

1888 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:

1889 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;

1890 if (!vmx_pt_mode_is_host_guest() ||

1891 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,

1892 PT_CAP_num_address_ranges)))

1893 return 1;

1894 if (index % 2)

1895 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];

1896 else

1897 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];

1898 break;

1899 case MSR_IA32_DEBUGCTLMSR:

1900 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);

1901 break;

1902 default:

1903 find_uret_msr:

1904 msr = vmx_find_uret_msr(vmx, msr_info->index);

1905 if (msr) {

1906 msr_info->data = msr->data;

1907 break;

1908 }

1909 return kvm_get_msr_common(vcpu, msr_info);

1910 }

1911

1912 return 0;

1913 }

1914

1915 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,

1916 u64 data)

1917 {

1918 #ifdef CONFIG_X86_64

1919 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))

1920 return (u32)data;

1921 #endif

1922 return (unsigned long)data;

1923 }

1924

1925 static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)

1926 {

1927 u64 debugctl = vmx_supported_debugctl();

1928

1929 if (!intel_pmu_lbr_is_enabled(vcpu))

1930 debugctl &= ~DEBUGCTLMSR_LBR_MASK;

1931

1932 if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))

1933 debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;

1934

1935 return debugctl;

1936 }

1937

1938 /*

1939 * Writes msr value into the appropriate "register".

1940 * Returns 0 on success, non-0 otherwise.

1941 * Assumes vcpu_load() was already called.

1942 */

static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	int ret = 0;
	u32 msr_index = msr_info->index;
	u64 data = msr_info->data;
	u32 index;

	/*
	 * Dispatch on the MSR number.  Cases either "break" (falling out to
	 * the final "return ret"), return 1 to reject the write, or return
	 * directly.  Anything not matched here is looked up in the
	 * user-return MSR table and, failing that, handed to the common x86
	 * handler.
	 */
	switch (msr_index) {
	case MSR_EFER:
		/* EFER writes are delegated to the common x86 code. */
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		/* Clear the segment cache, then write the new base to the VMCS. */
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_write_guest_kernel_gs_base(vmx, data);
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		/* While in guest (nested) mode, mirror the value into vmcs12. */
		if (is_guest_mode(vcpu))
			get_vmcs12(vcpu)->guest_sysenter_cs = data;
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		/*
		 * In guest mode the address is first passed through
		 * nested_vmx_truncate_sysenter_addr(); the adjusted value is
		 * what ends up in both vmcs12 and the hardware VMCS.
		 */
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_eip = data;
		}
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		/* Same handling as MSR_IA32_SYSENTER_EIP above. */
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_esp = data;
		}
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_DEBUGCTLMSR: {
		/* Bits set in @data that vcpu_supported_debugctl() does not allow. */
		u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
		/*
		 * Unsupported BTF/LBR bits are tolerated: they are optionally
		 * logged, then stripped from both the value and the invalid
		 * mask instead of failing the write.
		 */
		if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
			if (report_ignored_msrs)
				vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
					    __func__, data);
			data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
			invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
		}

		/* Any other unsupported bit rejects the write. */
		if (invalid)
			return 1;

		/* Mirror into vmcs12 only if it asks for debug controls to be saved. */
		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
						VM_EXIT_SAVE_DEBUG_CONTROLS)
			get_vmcs12(vcpu)->guest_ia32_debugctl = data;

		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
		/*
		 * Create the guest LBR event when LBR is being turned on, the
		 * vPMU has LBR enabled and no event exists yet.
		 */
		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
		    (data & DEBUGCTLMSR_LBR))
			intel_pmu_create_guest_lbr_event(vcpu);
		return 0;
	}
	case MSR_IA32_BNDCFGS:
		/*
		 * Needs kvm_mpx_supported(); guest-initiated writes also need
		 * MPX in the guest CPUID.
		 */
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		/* The page-aligned part must be canonical; reserved bits must be 0. */
		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
		    (data & MSR_IA32_BNDCFGS_RSVD))
			return 1;
		vmcs_write64(GUEST_BNDCFGS, data);
		break;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
			return 1;

		vmx->msr_ia32_umwait_control = data;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		if (kvm_spec_ctrl_test_value(data))
			return 1;

		vmx->spec_ctrl = data;
		/* A zero value is recorded but does not trigger pass-through. */
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging. We update the vmcs01 here for L1 as well
		 * since it will end up touching the MSR anyway now.
		 */
		vmx_disable_intercept_for_msr(vcpu,
					      MSR_IA32_SPEC_CTRL,
					      MSR_TYPE_RW);
		break;
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		/* Only RTM_DISABLE and CPUID_CLEAR may be set. */
		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
			return 1;
		/* The value itself is stored via the user-return MSR path. */
		goto find_uret_msr;
	case MSR_IA32_PRED_CMD:
		if (!msr_info->host_initiated &&
		    !guest_has_pred_cmd_msr(vcpu))
			return 1;

		/* IBPB is the only accepted command bit, and the host must have it. */
		if (data & ~PRED_CMD_IBPB)
			return 1;
		if (!boot_cpu_has(X86_FEATURE_IBPB))
			return 1;
		if (!data)
			break;

		/* Issue the barrier on the host CPU on behalf of the writer. */
		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging.
		 */
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
		break;
	case MSR_IA32_CR_PAT:
		if (!kvm_pat_valid(data))
			return 1;

		/* Mirror into vmcs12 only if it asks for PAT to be saved on exit. */
		if (is_guest_mode(vcpu) &&
		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
			get_vmcs12(vcpu)->guest_ia32_pat = data;

		/*
		 * If VM-entry can load PAT, keep it in the VMCS and in
		 * vcpu->arch.pat; otherwise fall back to the common handler.
		 */
		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
			vmcs_write64(GUEST_IA32_PAT, data);
			vcpu->arch.pat = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_TSC_ADJUST:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		/*
		 * Guest writes require LMCE to be enabled in the feature
		 * control MSR; no bit other than LMCE_EN may be set.
		 */
		if ((!msr_info->host_initiated &&
		     !(to_vmx(vcpu)->msr_ia32_feature_control &
		       FEAT_CTL_LMCE_ENABLED)) ||
		    (data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = data;
		break;
	case MSR_IA32_FEAT_CTL:
		/*
		 * Reject invalid values, and reject guest writes once the
		 * LOCKED bit is set (host-initiated writes bypass the lock).
		 */
		if (!vmx_feature_control_msr_valid(vcpu, data) ||
		    (to_vmx(vcpu)->msr_ia32_feature_control &
		     FEAT_CTL_LOCKED && !msr_info->host_initiated))
			return 1;
		vmx->msr_ia32_feature_control = data;
		/* A host write of zero also forces the vCPU out of nested mode. */
		if (msr_info->host_initiated && data == 0)
			vmx_leave_nested(vcpu);

		/* SGX may be enabled/disabled by guest's firmware */
		vmx_write_encls_bitmap(vcpu, NULL);
		break;
	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
		/*
		 * On real hardware, the LE hash MSRs are writable before
		 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
		 * at which point SGX related bits in IA32_FEATURE_CONTROL
		 * become writable.
		 *
		 * KVM does not emulate SGX activation for simplicity, so
		 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
		 * is unlocked.  This is technically not architectural
		 * behavior, but it's close enough.
		 */
		if (!msr_info->host_initiated &&
		    (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
		    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
		    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
			return 1;
		/* The four hash MSRs are consecutive; index by offset from HASH0. */
		vmx->msr_ia32_sgxlepubkeyhash
			[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!msr_info->host_initiated)
			return 1; /* they are read-only */
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_set_vmx_msr(vcpu, msr_index, data);
	case MSR_IA32_RTIT_CTL:
		/*
		 * Rejected unless PT runs in host-guest mode, the value passes
		 * vmx_rtit_ctl_check() and vmx->nested.vmxon is clear.
		 */
		if (!vmx_pt_mode_is_host_guest() ||
			vmx_rtit_ctl_check(vcpu, data) ||
			vmx->nested.vmxon)
			return 1;
		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
		vmx->pt_desc.guest.ctl = data;
		pt_update_intercept_for_msr(vcpu);
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (data & MSR_IA32_RTIT_STATUS_MASK)
			return 1;
		vmx->pt_desc.guest.status = data;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_cr3_filtering))
			return 1;
		vmx->pt_desc.guest.cr3_match = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!pt_can_write_msr(vmx))
			return 1;
		/* Needs at least one of the ToPA / single-range output caps. */
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		if (!pt_output_base_valid(vcpu, data))
			return 1;
		vmx->pt_desc.guest.output_base = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		vmx->pt_desc.guest.output_mask = data;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		if (!pt_can_write_msr(vmx))
			return 1;
		/*
		 * @index counts MSRs from ADDR0_A: even values go to addr_a[],
		 * odd values to addr_b[], and index / 2 is the range number.
		 * Two MSRs exist per supported address range.
		 */
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
						       PT_CAP_num_address_ranges))
			return 1;
		if (is_noncanonical_address(data, vcpu))
			return 1;
		if (index % 2)
			vmx->pt_desc.guest.addr_b[index / 2] = data;
		else
			vmx->pt_desc.guest.addr_a[index / 2] = data;
		break;
	case MSR_IA32_PERF_CAPABILITIES:
		/* A non-zero value is only accepted when the vPMU has a version. */
		if (data && !vcpu_to_pmu(vcpu)->version)
			return 1;
		/*
		 * A non-zero LBR format must equal the one reported by
		 * vmx_get_perf_capabilities() and pass the compatibility check.
		 */
		if (data & PMU_CAP_LBR_FMT) {
			if ((data & PMU_CAP_LBR_FMT) !=
			    (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
				return 1;
			if (!intel_pmu_lbr_is_compatible(vcpu))
				return 1;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;

	default:
	find_uret_msr:
		/*
		 * Prefer the user-return MSR slot if one exists for this
		 * index; otherwise let the common x86 code handle the write.
		 */
		msr = vmx_find_uret_msr(vmx, msr_index);
		if (msr)
			ret = vmx_set_guest_uret_msr(vmx, msr, data);
		else
			ret = kvm_set_msr_common(vcpu, msr_info);
	}

	return ret;
}

2241

2242 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)

2243 {

2244 unsigned long guest_owned_bits;

2245

2246 kvm_register_mark_available(vcpu, reg);

2247

2248 switch (reg) {

2249 case VCPU_REGS_RSP:

2250 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);

2251 break;

2252 case VCPU_REGS_RIP:

2253 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);

2254 break;

2255 case VCPU_EXREG_PDPTR:

2256 if (enable_ept)

2257 ept_save_pdptrs(vcpu);

2258 break;

2259 case VCPU_EXREG_CR0:

2260 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

2261

2262 vcpu->arch.cr0 &= ~guest_owned_bits;

2263 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;

2264 break;

2265 case VCPU_EXREG_CR3:

2266 if (is_unrestricted_guest(vcpu) ||

2267 (enable_ept && is_paging(vcpu)))

2268 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);

2269 break;

2270 case VCPU_EXREG_CR4:

2271 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

2272

2273 vcpu->arch.cr4 &= ~guest_owned_bits;

2274 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;

2275 break;

2276 default:

2277 KVM_BUG_ON(1, vcpu->kvm);

2278 break;

2279 }

2280 }

2281

2282 static __init int cpu_has_kvm_support(void)

2283 {

2284 return cpu_has_vmx();

2285 }

2286

2287 static __init int vmx_disabled_by_bios(void)

2288 {

2289 return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||

2290 !boot_cpu_has(X86_FEATURE_VMX);

2291 }

2292

/*
 * Set CR4.VMXE and execute VMXON with @vmxon_pointer as its memory operand.
 *
 * Returns 0 on success.  If the VMXON instruction faults, the exception
 * table entry redirects execution to the "fault" label, where a one-time
 * warning is emitted, CR4.VMXE is cleared again and -EFAULT is returned.
 */
static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
	/* Only written on the fault path, by rdmsrl_safe() below. */
	u64 msr;

	cr4_set_bits(X86_CR4_VMXE);

	/* "1:" labels the VMXON so the extable entry can map it to "fault". */
	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
			  _ASM_EXTABLE(1b, %l[fault])
			  : : [vmxon_pointer] "m"(vmxon_pointer)
			  : : fault);
	return 0;

fault:
	/*
	 * Include MSR_IA32_FEAT_CTL in the warning to aid diagnosis; if even
	 * reading the MSR fails, print the 0xdeadbeef placeholder instead.
	 */
	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
		  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
	/* Undo the CR4 change made above. */
	cr4_clear_bits(X86_CR4_VMXE);

	return -EFAULT;
}

2312

2313 static int hardware_enable(void)

2314 {

2315 int cpu = raw_smp_processor_id();

2316 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));

2317 int r;

2318

2319 if (cr4_read_shadow() & X86_CR4_VMXE)

2320 return -EBUSY;

2321

2322 /*

2323 * This can happen if we hot-added a CPU but failed to allocate

2324 * VP assist page for it.

2325 */

2326 if (static_branch_unlikely(&enable_evmcs) &&

2327 !hv_get_vp_assist_page(cpu))

2328 return -EFAULT;

2329

2330 intel_pt_handle_vmx(1);

2331

2332 r = kvm_cpu_vmxon(phys_addr);

2333 if (r) {

2334 intel_pt_handle_vmx(0);

2335 return r;

2336 }

2337

2338 if (enable_ept)

2339 ept_sync_global();

2340

2341 return 0;

2342 }

2343

2344 static void vmclear_local_loaded_vmcss(void)

2345 {

2346 int cpu = raw_smp_processor_id();

2347 struct loaded_vmcs *v, *n;

2348

2349 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),

2350 loaded_vmcss_on_cpu_link)

2351 __loaded_vmcs_clear(v);

2352 }

2353

/*
 * Disable VMX operation on the current CPU: clear every VMCS loaded on it,
 * execute VMXOFF (reporting a spurious fault if that fails) and finally
 * notify Intel PT that VMX is off.
 */
static void hardware_disable(void)
{
	int vmxoff_failed;

	vmclear_local_loaded_vmcss();

	vmxoff_failed = cpu_vmxoff();
	if (vmxoff_failed)
		kvm_spurious_fault();

	intel_pt_handle_vmx(0);
}

2363

2364 /*

2365 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID

2366 * directly instead of going through cpu_has(), to ensure KVM is trapping

2367 * ENCLS whenever it's supported in hardware. It does not matter whether

2368 * the host OS supports or has enabled SGX.

2369 */

2370 static bool cpu_has_sgx(void)

2371 {

2372 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));

2373 }

2374

2375 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,

2376 u32 msr, u32 *result)

2377 {

2378 u32 vmx_msr_low, vmx_msr_high;

2379 u32 ctl = ctl_min | ctl_opt;

2380

2381 rdmsr(msr, vmx_msr_low, vmx_msr_high);

2382

2383 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */

2384 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */

2385

2386 /* Ensure minimum (required) set of control bits are supported. */

2387 if (ctl_min & ~ctl)

2388 return -EIO;

2389

2390 *result = ctl;

2391 return 0;

2392 }

2393

/*
 * Probe the VMX capability MSRs and fill in @vmcs_conf (control settings,
 * VMCS size/order, revision id, basic capabilities) and @vmx_cap (EPT/VPID
 * capabilities).
 *
 * For each control field a "min" set of required bits and an "opt" set of
 * desired bits is passed to adjust_vmx_controls().
 *
 * Returns 0 on success, or -EIO if a required control is unavailable or
 * MSR_IA32_VMX_BASIC reports values this code does not accept.
 */
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
				    struct vmx_capability *vmx_cap)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 min, opt, min2, opt2;
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
	u32 _cpu_based_2nd_exec_control = 0;
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;

	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
	/* Primary processor-based controls: required bits... */
	min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
	      CPU_BASED_CR8_LOAD_EXITING |
	      CPU_BASED_CR8_STORE_EXITING |
#endif
	      CPU_BASED_CR3_LOAD_EXITING |
	      CPU_BASED_CR3_STORE_EXITING |
	      CPU_BASED_UNCOND_IO_EXITING |
	      CPU_BASED_MOV_DR_EXITING |
	      CPU_BASED_USE_TSC_OFFSETTING |
	      CPU_BASED_MWAIT_EXITING |
	      CPU_BASED_MONITOR_EXITING |
	      CPU_BASED_INVLPG_EXITING |
	      CPU_BASED_RDPMC_EXITING;

	/* ...and optional bits. */
	opt = CPU_BASED_TPR_SHADOW |
	      CPU_BASED_USE_MSR_BITMAPS |
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control) < 0)
		return -EIO;
#ifdef CONFIG_X86_64
	/* With a TPR shadow available, CR8 accesses need not exit. */
	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
					   ~CPU_BASED_CR8_STORE_EXITING;
#endif
	/* Secondary controls are all optional (min2 == 0). */
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
		min2 = 0;
		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
			SECONDARY_EXEC_WBINVD_EXITING |
			SECONDARY_EXEC_ENABLE_VPID |
			SECONDARY_EXEC_ENABLE_EPT |
			SECONDARY_EXEC_UNRESTRICTED_GUEST |
			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
			SECONDARY_EXEC_DESC |
			SECONDARY_EXEC_ENABLE_RDTSCP |
			SECONDARY_EXEC_ENABLE_INVPCID |
			SECONDARY_EXEC_APIC_REGISTER_VIRT |
			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
			SECONDARY_EXEC_SHADOW_VMCS |
			SECONDARY_EXEC_XSAVES |
			SECONDARY_EXEC_RDSEED_EXITING |
			SECONDARY_EXEC_RDRAND_EXITING |
			SECONDARY_EXEC_ENABLE_PML |
			SECONDARY_EXEC_TSC_SCALING |
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
			SECONDARY_EXEC_PT_USE_GPA |
			SECONDARY_EXEC_PT_CONCEAL_VMX |
			SECONDARY_EXEC_ENABLE_VMFUNC |
			SECONDARY_EXEC_BUS_LOCK_DETECTION;
		/* ENCLS exiting is only requested when the CPU reports SGX. */
		if (cpu_has_sgx())
			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
		if (adjust_vmx_controls(min2, opt2,
					MSR_IA32_VMX_PROCBASED_CTLS2,
					&_cpu_based_2nd_exec_control) < 0)
			return -EIO;
	}
#ifndef CONFIG_X86_64
	/* On 32-bit, the TPR shadow is dropped unless APIC accesses can be virtualized. */
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

	/* These APIC virtualization features are dropped without a TPR shadow. */
	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_2nd_exec_control &= ~(
				SECONDARY_EXEC_APIC_REGISTER_VIRT |
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

	/* Low word -> EPT capabilities, high word -> VPID capabilities. */
	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
		&vmx_cap->ept, &vmx_cap->vpid);

	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
		   enabled */
		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
					     CPU_BASED_CR3_STORE_EXITING |
					     CPU_BASED_INVLPG_EXITING);
	} else if (vmx_cap->ept) {
		/* Capabilities reported without the enabling control: ignore them. */
		vmx_cap->ept = 0;
		pr_warn_once("EPT CAP should not exist if not support "
				"1-setting enable EPT VM-execution control\n");
	}
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
		vmx_cap->vpid) {
		vmx_cap->vpid = 0;
		pr_warn_once("VPID CAP should not exist if not support "
				"1-setting enable VPID VM-execution control\n");
	}

	/* VM-exit controls. */
	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
	opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
	      VM_EXIT_LOAD_IA32_PAT |
	      VM_EXIT_LOAD_IA32_EFER |
	      VM_EXIT_CLEAR_BNDCFGS |
	      VM_EXIT_PT_CONCEAL_PIP |
	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control) < 0)
		return -EIO;

	/* Pin-based controls. */
	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
		 PIN_BASED_VMX_PREEMPTION_TIMER;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control) < 0)
		return -EIO;

	if (cpu_has_broken_vmx_preemption_timer())
		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	/* Posted interrupts are dropped without virtual interrupt delivery. */
	if (!(_cpu_based_2nd_exec_control &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

	/* VM-entry controls. */
	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
	opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
	      VM_ENTRY_LOAD_IA32_PAT |
	      VM_ENTRY_LOAD_IA32_EFER |
	      VM_ENTRY_LOAD_BNDCFGS |
	      VM_ENTRY_PT_CONCEAL_PIP |
	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control) < 0)
		return -EIO;

	/*
	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
	 * can't be used due to an errata where VM Exit may incorrectly clear
	 * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
	 */
	if (boot_cpu_data.x86 == 0x6) {
		switch (boot_cpu_data.x86_model) {
		case 26: /* AAK155 */
		case 30: /* AAP115 */
		case 37: /* AAT100 */
		case 44: /* BC86,AAY89,BD102 */
		case 46: /* BA97 */
			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
			_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
			pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
					"does not work properly. Using workaround\n");
			break;
		default:
			break;
		}
	}


	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
		return -EIO;

#ifdef CONFIG_X86_64
	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
	if (vmx_msr_high & (1u<<16))
		return -EIO;
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses. */
	if (((vmx_msr_high >> 18) & 15) != 6)
		return -EIO;

	/*
	 * The low 13 bits of the high word hold the VMCS size; the remaining
	 * high-word bits are kept as basic_cap, and the low word is stored as
	 * the VMCS revision id.
	 */
	vmcs_conf->size = vmx_msr_high & 0x1fff;
	vmcs_conf->order = get_order(vmcs_conf->size);
	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;

	vmcs_conf->revision_id = vmx_msr_low;

	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
	vmcs_conf->vmexit_ctrl         = _vmexit_control;
	vmcs_conf->vmentry_ctrl        = _vmentry_control;

#if IS_ENABLED(CONFIG_HYPERV)
	/* With enlightened VMCS, let evmcs_sanitize_exec_ctrls() adjust the result. */
	if (enlightened_vmcs)
		evmcs_sanitize_exec_ctrls(vmcs_conf);
#endif

	return 0;
}

2594

2595 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)

2596 {

2597 int node = cpu_to_node(cpu);

2598 struct page *pages;

2599 struct vmcs *vmcs;

2600

2601 pages = __alloc_pages_node(node, flags, vmcs_config.order);

2602 if (!pages)

2603 return NULL;

2604 vmcs = page_address(pages);

2605 memset(vmcs, 0, vmcs_config.size);

2606

2607 /* KVM supports Enlightened VMCS v1 only */

2608 if (static_branch_unlikely(&enable_evmcs))

2609 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;

2610 else

2611 vmcs->hdr.revision_id = vmcs_config.revision_id;

2612

2613 if (shadow)

2614 vmcs->hdr.shadow_vmcs = 1;

2615 return vmcs;

2616 }

2617

2618 void free_vmcs(struct vmcs *vmcs)

2619 {

2620 free_pages((unsigned long)vmcs, vmcs_config.order);

2621 }

2622

2623 /*

2624 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded

2625 */

2626 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)

2627 {

2628 if (!loaded_vmcs->vmcs)

2629 return;

2630 loaded_vmcs_clear(loaded_vmcs);

2631 free_vmcs(loaded_vmcs->vmcs);

2632 loaded_vmcs->vmcs = NULL;

2633 if (loaded_vmcs->msr_bitmap)

2634 free_page((unsigned long)loaded_vmcs->msr_bitmap);

2635 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);

2636 }

2637

2638 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)

2639 {

2640 loaded_vmcs->vmcs = alloc_vmcs(false);

2641 if (!loaded_vmcs->vmcs)

2642 return -ENOMEM;

2643

2644 vmcs_clear(loaded_vmcs->vmcs);

2645

2646 loaded_vmcs->shadow_vmcs = NULL;

2647 loaded_vmcs->hv_timer_soft_disabled = false;

2648 loaded_vmcs->cpu = -1;

2649 loaded_vmcs->launched = 0;

2650

2651 if (cpu_has_vmx_msr_bitmap()) {

2652 loaded_vmcs->msr_bitmap = (unsigned long *)

2653 __get_free_page(GFP_KERNEL_ACCOUNT);

2654 if (!loaded_vmcs->msr_bitmap)

2655 goto out_vmcs;

2656 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);

2657

2658 if (IS_ENABLED(CONFIG_HYPERV) &&

2659 static_branch_unlikely(&enable_evmcs) &&

2660 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {

2661 struct hv_enlightened_vmcs *evmcs =

2662 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;

2663

2664 evmcs->hv_enlightenments_control.msr_bitmap = 1;

2665 }

2666 }

2667

2668 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));

2669 memset(&loaded_vmcs->controls_shadow, 0,

2670 sizeof(struct vmcs_controls_shadow));

2671

2672 return 0;

2673

2674 out_vmcs:

2675 free_loaded_vmcs(loaded_vmcs);

2676 return -ENOMEM;

2677 }

2678

2679 static void free_kvm_area(void)

2680 {

2681 int cpu;

2682

2683 for_each_possible_cpu(cpu) {

2684 free_vmcs(per_cpu(vmxarea, cpu));

2685 per_cpu(vmxarea, cpu) = NULL;

2686 }

2687 }

2688

2689 static __init int alloc_kvm_area(void)

2690 {

2691 int cpu;

2692

2693 for_each_possible_cpu(cpu) {

2694 struct vmcs *vmcs;

2695

2696 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);

2697 if (!vmcs) {

2698 free_kvm_area();

2699 return -ENOMEM;

2700 }

2701

2702 /*

2703 * When eVMCS is enabled, alloc_vmcs_cpu() sets

2704 * vmcs->revision_id to KVM_EVMCS_VERSION instead of

2705 * revision_id reported by MSR_IA32_VMX_BASIC.

2706 *

2707 * However, even though not explicitly documented by

2708 * TLFS, VMXArea passed as VMXON argument should

2709 * still be marked with revision_id reported by

2710 * physical CPU.

2711 */

2712 if (static_branch_unlikely(&enable_evmcs))

2713 vmcs->hdr.revision_id = vmcs_config.revision_id;

2714

2715 per_cpu(vmxarea, cpu) = vmcs;

2716 }

2717 return 0;

2718 }

2719

2720 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,

2721 struct kvm_segment *save)

2722 {

2723 if (!emulate_invalid_guest_state) {

2724 /*

2725 * CS and SS RPL should be equal during guest entry according

2726 * to VMX spec, but in reality it is not always so. Since vcpu

2727 * is in the middle of the transition from real mode to

2728 * protected mode it is safe to assume that RPL 0 is a good

2729 * default value.

2730 */

2731 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)

2732 save->selector &= ~SEGMENT_RPL_MASK;

2733 save->dpl = save->selector & SEGMENT_RPL_MASK;

2734 save->s = 1;

2735 }

2736 vmx_set_segment(vcpu, save, seg);

2737 }

2738

2739 static void enter_pmode(struct kvm_vcpu *vcpu)

2740 {

2741 unsigned long flags;

2742 struct vcpu_vmx *vmx = to_vmx(vcpu);

2743

2744 /*

2745 * Update real mode segment cache. It may be not up-to-date if segment

2746 * register was written while vcpu was in a guest mode.

2747 */

2748 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);

2749 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);

2750 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);

2751 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);

2752 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);

2753 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

2754

2755 vmx->rmode.vm86_active = 0;

2756

2757 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

2758

2759 flags = vmcs_readl(GUEST_RFLAGS);

2760 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;

2761 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;

2762 vmcs_writel(GUEST_RFLAGS, flags);

2763

2764 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |

2765 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

2766

2767 vmx_update_exception_bitmap(vcpu);

2768

2769 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);

2770 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);

2771 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);

2772 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);

2773 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);

2774 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);

2775 }

2776

2777 static void fix_rmode_seg(int seg, struct kvm_segment *save)

2778 {

2779 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

2780 struct kvm_segment var = *save;

2781

2782 var.dpl = 0x3;

2783 if (seg == VCPU_SREG_CS)

2784 var.type = 0x3;

2785

2786 if (!emulate_invalid_guest_state) {

2787 var.selector = var.base >> 4;

2788 var.base = var.base & 0xffff0;

2789 var.limit = 0xffff;

2790 var.g = 0;

2791 var.db = 0;

2792 var.present = 1;

2793 var.s = 1;

2794 var.l = 0;

2795 var.unusable = 0;

2796 var.type = 0x3;

2797 var.avl = 0;

2798 if (save->base & 0xf)

2799 printk_once(KERN_WARNING "kvm: segment base is not "

2800 "paragraph aligned when entering "

2801 "protected mode (seg=%d)", seg);

2802 }

2803

2804 vmcs_write16(sf->selector, var.selector);

2805 vmcs_writel(sf->base, var.base);

2806 vmcs_write32(sf->limit, var.limit);

2807 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));

2808 }

2809

2810 static void enter_rmode(struct kvm_vcpu *vcpu)

2811 {

2812 unsigned long flags;

2813 struct vcpu_vmx *vmx = to_vmx(vcpu);

2814 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);

2815

2816 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

2817 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);

2818 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);

2819 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);

2820 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);

2821 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);

2822 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

2823

2824 vmx->rmode.vm86_active = 1;

2825

2826 /*

2827 * Very old userspace does not call KVM_SET_TSS_ADDR before entering

2828 * vcpu. Warn the user that an update is overdue.

2829 */

2830 if (!kvm_vmx->tss_addr)

2831 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "

2832 "called before entering vcpu\n");

2833

2834 vmx_segment_cache_clear(vmx);

2835

2836 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);

2837 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);

2838 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

2839

2840 flags = vmcs_readl(GUEST_RFLAGS);

2841 vmx->rmode.save_rflags = flags;

2842

2843 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

2844

2845 vmcs_writel(GUEST_RFLAGS, flags);

2846 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);

2847 vmx_update_exception_bitmap(vcpu);

2848

2849 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);

2850 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);

2851 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);

2852 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);

2853 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);

2854 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);

2855 }

2856

2857 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)

2858 {

2859 struct vcpu_vmx *vmx = to_vmx(vcpu);

2860 struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);

2861

2862 /* Nothing to do if hardware doesn't support EFER. */

2863 if (!msr)

2864 return 0;

2865

2866 vcpu->arch.efer = efer;

2867 if (efer & EFER_LMA) {

2868 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);

2869 msr->data = efer;

2870 } else {

2871 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);

2872

2873 msr->data = efer & ~EFER_LME;

2874 }

2875 setup_msrs(vmx);

2876 return 0;

2877 }

2878

2879 #ifdef CONFIG_X86_64

2880

2881 static void enter_lmode(struct kvm_vcpu *vcpu)

2882 {

2883 u32 guest_tr_ar;

2884

2885 vmx_segment_cache_clear(to_vmx(vcpu));

2886

2887 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);

2888 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {

2889 pr_debug_ratelimited("%s: tss fixup for long mode. \n",

2890 __func__);

2891 vmcs_write32(GUEST_TR_AR_BYTES,

2892 (guest_tr_ar & ~VMX_AR_TYPE_MASK)

2893 | VMX_AR_TYPE_BUSY_64_TSS);

2894 }

2895 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);

2896 }

2897

2898 static void exit_lmode(struct kvm_vcpu *vcpu)

2899 {

2900 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);

2901 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);

2902 }

2903

2904 #endif

2905

2906 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)

2907 {

2908 struct vcpu_vmx *vmx = to_vmx(vcpu);

2909

2910 /*

2911 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as

2912 * the CPU is not required to invalidate guest-physical mappings on

2913 * VM-Entry, even if VPID is disabled. Guest-physical mappings are

2914 * associated with the root EPT structure and not any particular VPID

2915 * (INVVPID also isn't required to invalidate guest-physical mappings).

2916 */

2917 if (enable_ept) {

2918 ept_sync_global();

2919 } else if (enable_vpid) {

2920 if (cpu_has_vmx_invvpid_global()) {

2921 vpid_sync_vcpu_global();

2922 } else {

2923 vpid_sync_vcpu_single(vmx->vpid);

2924 vpid_sync_vcpu_single(vmx->nested.vpid02);

2925 }

2926 }

2927 }

2928

2929 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)

2930 {

2931 struct kvm_mmu *mmu = vcpu->arch.mmu;

2932 u64 root_hpa = mmu->root_hpa;

2933

2934 /* No flush required if the current context is invalid. */

2935 if (!VALID_PAGE(root_hpa))

2936 return;

2937

2938 if (enable_ept)

2939 ept_sync_context(construct_eptp(vcpu, root_hpa,

2940 mmu->shadow_root_level));

2941 else if (!is_guest_mode(vcpu))

2942 vpid_sync_context(to_vmx(vcpu)->vpid);

2943 else

2944 vpid_sync_context(nested_get_vpid02(vcpu));

2945 }

2946

2947 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)

2948 {

2949 /*

2950 * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in

2951 * vmx_flush_tlb_guest() for an explanation of why this is ok.

2952 */

2953 vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);

2954 }

2955

2956 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)

2957 {

2958 /*

2959 * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0

2960 * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit

2961 * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is

2962 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),

2963 * i.e. no explicit INVVPID is necessary.

2964 */

2965 vpid_sync_context(to_vmx(vcpu)->vpid);

2966 }

2967

2968 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)

2969 {

2970 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

2971

2972 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))

2973 return;

2974

2975 if (is_pae_paging(vcpu)) {

2976 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);

2977 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);

2978 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);

2979 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);

2980 }

2981 }

2982

2983 void ept_save_pdptrs(struct kvm_vcpu *vcpu)

2984 {

2985 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

2986

2987 if (WARN_ON_ONCE(!is_pae_paging(vcpu)))

2988 return;

2989

2990 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);

2991 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);

2992 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);

2993 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);

2994

2995 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);

2996 }

2997

2998 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,

2999 unsigned long cr0,

3000 struct kvm_vcpu *vcpu)

3001 {

3002 struct vcpu_vmx *vmx = to_vmx(vcpu);

3003

3004 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))

3005 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);

3006 if (!(cr0 & X86_CR0_PG)) {

3007 /* From paging/starting to nonpaging */

3008 exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |

3009 CPU_BASED_CR3_STORE_EXITING);

3010 vcpu->arch.cr0 = cr0;

3011 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

3012 } else if (!is_paging(vcpu)) {

3013 /* From nonpaging to paging */

3014 exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |

3015 CPU_BASED_CR3_STORE_EXITING);

3016 vcpu->arch.cr0 = cr0;

3017 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

3018 }

3019

3020 if (!(cr0 & X86_CR0_WP))

3021 *hw_cr0 &= ~X86_CR0_WP;

3022 }

3023

3024 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)

3025 {

3026 struct vcpu_vmx *vmx = to_vmx(vcpu);

3027 unsigned long hw_cr0;

3028

3029 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);

3030 if (is_unrestricted_guest(vcpu))

3031 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;

3032 else {

3033 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;

3034

3035 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))

3036 enter_pmode(vcpu);

3037

3038 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))

3039 enter_rmode(vcpu);

3040 }

3041

3042 #ifdef CONFIG_X86_64

3043 if (vcpu->arch.efer & EFER_LME) {

3044 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))

3045 enter_lmode(vcpu);

3046 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))

3047 exit_lmode(vcpu);

3048 }

3049 #endif

3050

3051 if (enable_ept && !is_unrestricted_guest(vcpu))

3052 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);

3053

3054 vmcs_writel(CR0_READ_SHADOW, cr0);

3055 vmcs_writel(GUEST_CR0, hw_cr0);

3056 vcpu->arch.cr0 = cr0;

3057 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

3058

3059 /* depends on vcpu->arch.cr0 to be set to a new value */

3060 vmx->emulation_required = emulation_required(vcpu);

3061 }

3062

/* Return 5 when the CPU reports 5-level EPT support, otherwise 4. */
static int vmx_get_max_tdp_level(void)
{
	return cpu_has_vmx_ept_5levels() ? 5 : 4;
}

3069

3070 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)

3071 {

3072 u64 eptp = VMX_EPTP_MT_WB;

3073

3074 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

3075

3076 if (enable_ept_ad_bits &&

3077 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))

3078 eptp |= VMX_EPTP_AD_ENABLE_BIT;

3079 eptp |= root_hpa;

3080

3081 return eptp;

3082 }

3083

3084 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,

3085 int root_level)

3086 {

3087 struct kvm *kvm = vcpu->kvm;

3088 bool update_guest_cr3 = true;

3089 unsigned long guest_cr3;

3090 u64 eptp;

3091

3092 if (enable_ept) {

3093 eptp = construct_eptp(vcpu, root_hpa, root_level);

3094 vmcs_write64(EPT_POINTER, eptp);

3095

3096 hv_track_root_tdp(vcpu, root_hpa);

3097

3098 if (!enable_unrestricted_guest && !is_paging(vcpu))

3099 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;

3100 else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))

3101 guest_cr3 = vcpu->arch.cr3;

3102 else /* vmcs01.GUEST_CR3 is already up-to-date. */

3103 update_guest_cr3 = false;

3104 vmx_ept_load_pdptrs(vcpu);

3105 } else {

3106 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);

3107 }

3108

3109 if (update_guest_cr3)

3110 vmcs_writel(GUEST_CR3, guest_cr3);

3111 }

3112

3113 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)

3114 {

3115 /*

3116 * We operate under the default treatment of SMM, so VMX cannot be

3117 * enabled under SMM. Note, whether or not VMXE is allowed at all is

3118 * handled by kvm_is_valid_cr4().

3119 */

3120 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))

3121 return false;

3122

3123 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))

3124 return false;

3125

3126 return true;

3127 }

3128

3129 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)

3130 {

3131 unsigned long old_cr4 = vcpu->arch.cr4;

3132 struct vcpu_vmx *vmx = to_vmx(vcpu);

3133 /*

3134 * Pass through host's Machine Check Enable value to hw_cr4, which

3135 * is in force while we are in guest mode. Do not let guests control

3136 * this bit, even if host CR4.MCE == 0.

3137 */

3138 unsigned long hw_cr4;

3139

3140 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);

3141 if (is_unrestricted_guest(vcpu))

3142 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;

3143 else if (vmx->rmode.vm86_active)

3144 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;

3145 else

3146 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

3147

3148 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {

3149 if (cr4 & X86_CR4_UMIP) {

3150 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);

3151 hw_cr4 &= ~X86_CR4_UMIP;

3152 } else if (!is_guest_mode(vcpu) ||

3153 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {

3154 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);

3155 }

3156 }

3157

3158 vcpu->arch.cr4 = cr4;

3159 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

3160

3161 if (!is_unrestricted_guest(vcpu)) {

3162 if (enable_ept) {

3163 if (!is_paging(vcpu)) {

3164 hw_cr4 &= ~X86_CR4_PAE;

3165 hw_cr4 |= X86_CR4_PSE;

3166 } else if (!(cr4 & X86_CR4_PAE)) {

3167 hw_cr4 &= ~X86_CR4_PAE;

3168 }

3169 }

3170

3171 /*

3172 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in

3173 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs

3174 * to be manually disabled when guest switches to non-paging

3175 * mode.

3176 *

3177 * If !enable_unrestricted_guest, the CPU is always running

3178 * with CR0.PG=1 and CR4 needs to be modified.

3179 * If enable_unrestricted_guest, the CPU automatically

3180 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.

3181 */

3182 if (!is_paging(vcpu))

3183 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);

3184 }

3185

3186 vmcs_writel(CR4_READ_SHADOW, cr4);

3187 vmcs_writel(GUEST_CR4, hw_cr4);

3188

3189 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))

3190 kvm_update_cpuid_runtime(vcpu);

3191 }

3192

3193 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)

3194 {

3195 struct vcpu_vmx *vmx = to_vmx(vcpu);

3196 u32 ar;

3197

3198 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {

3199 *var = vmx->rmode.segs[seg];

3200 if (seg == VCPU_SREG_TR

3201 || var->selector == vmx_read_guest_seg_selector(vmx, seg))

3202 return;

3203 var->base = vmx_read_guest_seg_base(vmx, seg);

3204 var->selector = vmx_read_guest_seg_selector(vmx, seg);

3205 return;

3206 }

3207 var->base = vmx_read_guest_seg_base(vmx, seg);

3208 var->limit = vmx_read_guest_seg_limit(vmx, seg);

3209 var->selector = vmx_read_guest_seg_selector(vmx, seg);

3210 ar = vmx_read_guest_seg_ar(vmx, seg);

3211 var->unusable = (ar >> 16) & 1;

3212 var->type = ar & 15;

3213 var->s = (ar >> 4) & 1;

3214 var->dpl = (ar >> 5) & 3;

3215 /*

3216 * Some userspaces do not preserve unusable property. Since usable

3217 * segment has to be present according to VMX spec we can use present

3218 * property to amend userspace bug by making unusable segment always

3219 * nonpresent. vmx_segment_access_rights() already marks nonpresent

3220 * segment as unusable.

3221 */

3222 var->present = !var->unusable;

3223 var->avl = (ar >> 12) & 1;

3224 var->l = (ar >> 13) & 1;

3225 var->db = (ar >> 14) & 1;

3226 var->g = (ar >> 15) & 1;

3227 }

3228

3229 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)

3230 {

3231 struct kvm_segment s;

3232

3233 if (to_vmx(vcpu)->rmode.vm86_active) {

3234 vmx_get_segment(vcpu, &s, seg);

3235 return s.base;

3236 }

3237 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);

3238 }

3239

3240 int vmx_get_cpl(struct kvm_vcpu *vcpu)

3241 {

3242 struct vcpu_vmx *vmx = to_vmx(vcpu);

3243

3244 if (unlikely(vmx->rmode.vm86_active))

3245 return 0;

3246 else {

3247 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);

3248 return VMX_AR_DPL(ar);

3249 }

3250 }

3251

3252 static u32 vmx_segment_access_rights(struct kvm_segment *var)

3253 {

3254 u32 ar;

3255

3256 if (var->unusable || !var->present)

3257 ar = 1 << 16;

3258 else {

3259 ar = var->type & 15;

3260 ar |= (var->s & 1) << 4;

3261 ar |= (var->dpl & 3) << 5;

3262 ar |= (var->present & 1) << 7;

3263 ar |= (var->avl & 1) << 12;

3264 ar |= (var->l & 1) << 13;

3265 ar |= (var->db & 1) << 14;

3266 ar |= (var->g & 1) << 15;

3267 }

3268

3269 return ar;

3270 }

3271

3272 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)

3273 {

3274 struct vcpu_vmx *vmx = to_vmx(vcpu);

3275 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

3276

3277 vmx_segment_cache_clear(vmx);

3278

3279 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {

3280 vmx->rmode.segs[seg] = *var;

3281 if (seg == VCPU_SREG_TR)

3282 vmcs_write16(sf->selector, var->selector);

3283 else if (var->s)

3284 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);

3285 goto out;

3286 }

3287

3288 vmcs_writel(sf->base, var->base);

3289 vmcs_write32(sf->limit, var->limit);

3290 vmcs_write16(sf->selector, var->selector);

3291

3292 /*

3293 * Fix the "Accessed" bit in AR field of segment registers for older

3294 * qemu binaries.

3295 * IA32 arch specifies that at the time of processor reset the

3296 * "Accessed" bit in the AR field of segment registers is 1. And qemu

3297 * is setting it to 0 in the userland code. This causes invalid guest

3298 * state vmexit when "unrestricted guest" mode is turned on.

3299 * Fix for this setup issue in cpu_reset is being pushed in the qemu

3300 * tree. Newer qemu binaries with that qemu fix would not need this

3301 * kvm hack.

3302 */

3303 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))

3304 var->type |= 0x1; /* Accessed */

3305

3306 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));

3307

3308 out:

3309 vmx->emulation_required = emulation_required(vcpu);

3310 }

3311

3312 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)

3313 {

3314 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);

3315

3316 *db = (ar >> 14) & 1;

3317 *l = (ar >> 13) & 1;

3318 }

3319

3320 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)

3321 {

3322 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);

3323 dt->address = vmcs_readl(GUEST_IDTR_BASE);

3324 }

3325

3326 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)

3327 {

3328 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);

3329 vmcs_writel(GUEST_IDTR_BASE, dt->address);

3330 }

3331

3332 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)

3333 {

3334 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);

3335 dt->address = vmcs_readl(GUEST_GDTR_BASE);

3336 }

3337

3338 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)

3339 {

3340 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);

3341 vmcs_writel(GUEST_GDTR_BASE, dt->address);

3342 }

3343

3344 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)

3345 {

3346 struct kvm_segment var;

3347 u32 ar;

3348

3349 vmx_get_segment(vcpu, &var, seg);

3350 var.dpl = 0x3;

3351 if (seg == VCPU_SREG_CS)

3352 var.type = 0x3;

3353 ar = vmx_segment_access_rights(&var);

3354

3355 if (var.base != (var.selector << 4))

3356 return false;

3357 if (var.limit != 0xffff)

3358 return false;

3359 if (ar != 0xf3)

3360 return false;

3361

3362 return true;

3363 }

3364

3365 static bool code_segment_valid(struct kvm_vcpu *vcpu)

3366 {

3367 struct kvm_segment cs;

3368 unsigned int cs_rpl;

3369

3370 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);

3371 cs_rpl = cs.selector & SEGMENT_RPL_MASK;

3372

3373 if (cs.unusable)

3374 return false;

3375 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))

3376 return false;

3377 if (!cs.s)

3378 return false;

3379 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {

3380 if (cs.dpl > cs_rpl)

3381 return false;

3382 } else {

3383 if (cs.dpl != cs_rpl)

3384 return false;

3385 }

3386 if (!cs.present)

3387 return false;

3388

3389 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */

3390 return true;

3391 }

3392

3393 static bool stack_segment_valid(struct kvm_vcpu *vcpu)

3394 {

3395 struct kvm_segment ss;

3396 unsigned int ss_rpl;

3397

3398 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);

3399 ss_rpl = ss.selector & SEGMENT_RPL_MASK;

3400

3401 if (ss.unusable)

3402 return true;

3403 if (ss.type != 3 && ss.type != 7)

3404 return false;

3405 if (!ss.s)

3406 return false;

3407 if (ss.dpl != ss_rpl) /* DPL != RPL */

3408 return false;

3409 if (!ss.present)

3410 return false;

3411

3412 return true;

3413 }

3414

3415 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)

3416 {

3417 struct kvm_segment var;

3418 unsigned int rpl;

3419

3420 vmx_get_segment(vcpu, &var, seg);

3421 rpl = var.selector & SEGMENT_RPL_MASK;

3422

3423 if (var.unusable)

3424 return true;

3425 if (!var.s)

3426 return false;

3427 if (!var.present)

3428 return false;

3429 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {

3430 if (var.dpl < rpl) /* DPL < RPL */

3431 return false;

3432 }

3433

3434 /* TODO: Add other members to kvm_segment_field to allow checking for other access

3435 * rights flags

3436 */

3437 return true;

3438 }

3439

3440 static bool tr_valid(struct kvm_vcpu *vcpu)

3441 {

3442 struct kvm_segment tr;

3443

3444 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);

3445

3446 if (tr.unusable)

3447 return false;

3448 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */

3449 return false;

3450 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */

3451 return false;

3452 if (!tr.present)

3453 return false;

3454

3455 return true;

3456 }

3457

3458 static bool ldtr_valid(struct kvm_vcpu *vcpu)

3459 {

3460 struct kvm_segment ldtr;

3461

3462 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);

3463

3464 if (ldtr.unusable)

3465 return true;

3466 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */

3467 return false;

3468 if (ldtr.type != 2)

3469 return false;

3470 if (!ldtr.present)

3471 return false;

3472

3473 return true;

3474 }

3475

3476 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)

3477 {

3478 struct kvm_segment cs, ss;

3479

3480 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);

3481 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);

3482

3483 return ((cs.selector & SEGMENT_RPL_MASK) ==

3484 (ss.selector & SEGMENT_RPL_MASK));

3485 }

3486

3487 /*

3488 * Check if guest state is valid. Returns true if valid, false if

3489 * not.

3490 * We assume that registers are always usable

3491 */

3492 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)

3493 {

3494 /* real mode guest state checks */

3495 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {

3496 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))

3497 return false;

3498 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))

3499 return false;

3500 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))

3501 return false;

3502 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))

3503 return false;

3504 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))

3505 return false;

3506 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))

3507 return false;

3508 } else {

3509 /* protected mode guest state checks */

3510 if (!cs_ss_rpl_check(vcpu))

3511 return false;

3512 if (!code_segment_valid(vcpu))

3513 return false;

3514 if (!stack_segment_valid(vcpu))

3515 return false;

3516 if (!data_segment_valid(vcpu, VCPU_SREG_DS))

3517 return false;

3518 if (!data_segment_valid(vcpu, VCPU_SREG_ES))

3519 return false;

3520 if (!data_segment_valid(vcpu, VCPU_SREG_FS))

3521 return false;

3522 if (!data_segment_valid(vcpu, VCPU_SREG_GS))

3523 return false;

3524 if (!tr_valid(vcpu))

3525 return false;

3526 if (!ldtr_valid(vcpu))

3527 return false;

3528 }

3529 /* TODO:

3530 * - Add checks on RIP

3531 * - Add checks on RFLAGS

3532 */

3533

3534 return true;

3535 }

3536

3537 static int init_rmode_tss(struct kvm *kvm, void __user *ua)

3538 {

3539 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));

3540 u16 data;

3541 int i;

3542

3543 for (i = 0; i < 3; i++) {

3544 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))

3545 return -EFAULT;

3546 }

3547

3548 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;

3549 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))

3550 return -EFAULT;

3551

3552 data = ~0;

3553 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))

3554 return -EFAULT;

3555

3556 return 0;

3557 }

3558

3559 static int init_rmode_identity_map(struct kvm *kvm)

3560 {

3561 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

3562 int i, r = 0;

3563 void __user *uaddr;

3564 u32 tmp;

3565

3566 /* Protect kvm_vmx->ept_identity_pagetable_done. */

3567 mutex_lock(&kvm->slots_lock);

3568

3569 if (likely(kvm_vmx->ept_identity_pagetable_done))

3570 goto out;

3571

3572 if (!kvm_vmx->ept_identity_map_addr)

3573 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;

3574

3575 uaddr = __x86_set_memory_region(kvm,

3576 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,

3577 kvm_vmx->ept_identity_map_addr,

3578 PAGE_SIZE);

3579 if (IS_ERR(uaddr)) {

3580 r = PTR_ERR(uaddr);

3581 goto out;

3582 }

3583

3584 /* Set up identity-mapping pagetable for EPT in real mode */

3585 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {

3586 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |

3587 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);

3588 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {

3589 r = -EFAULT;

3590 goto out;

3591 }

3592 }

3593 kvm_vmx->ept_identity_pagetable_done = true;

3594

3595 out:

3596 mutex_unlock(&kvm->slots_lock);

3597 return r;

3598 }

3599

3600 static void seg_setup(int seg)

3601 {

3602 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

3603 unsigned int ar;

3604

3605 vmcs_write16(sf->selector, 0);

3606 vmcs_writel(sf->base, 0);

3607 vmcs_write32(sf->limit, 0xffff);

3608 ar = 0x93;

3609 if (seg == VCPU_SREG_CS)

3610 ar |= 0x08; /* code segment */

3611

3612 vmcs_write32(sf->ar_bytes, ar);

3613 }

3614

3615 static int alloc_apic_access_page(struct kvm *kvm)

3616 {

3617 struct page *page;

3618 void __user *hva;

3619 int ret = 0;

3620

3621 mutex_lock(&kvm->slots_lock);

3622 if (kvm->arch.apic_access_memslot_enabled)

3623 goto out;

3624 hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,

3625 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);

3626 if (IS_ERR(hva)) {

3627 ret = PTR_ERR(hva);

3628 goto out;

3629 }

3630

3631 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);

3632 if (is_error_page(page)) {

3633 ret = -EFAULT;

3634 goto out;

3635 }

3636

3637 /*

3638 * Do not pin the page in memory, so that memory hot-unplug

3639 * is able to migrate it.

3640 */

3641 put_page(page);

3642 kvm->arch.apic_access_memslot_enabled = true;

3643 out:

3644 mutex_unlock(&kvm->slots_lock);

3645 return ret;

3646 }

3647

3648 int allocate_vpid(void)

3649 {

3650 int vpid;

3651

3652 if (!enable_vpid)

3653 return 0;

3654 spin_lock(&vmx_vpid_lock);

3655 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);

3656 if (vpid < VMX_NR_VPIDS)

3657 __set_bit(vpid, vmx_vpid_bitmap);

3658 else

3659 vpid = 0;

3660 spin_unlock(&vmx_vpid_lock);

3661 return vpid;

3662 }

3663

3664 void free_vpid(int vpid)

3665 {

3666 if (!enable_vpid || vpid == 0)

3667 return;

3668 spin_lock(&vmx_vpid_lock);

3669 __clear_bit(vpid, vmx_vpid_bitmap);

3670 spin_unlock(&vmx_vpid_lock);

3671 }

3672

3673 static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)

3674 {

3675 int f = sizeof(unsigned long);

3676

3677 if (msr <= 0x1fff)

3678 __clear_bit(msr, msr_bitmap + 0x000 / f);

3679 else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))

3680 __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);

3681 }

3682

3683 static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)

3684 {

3685 int f = sizeof(unsigned long);

3686

3687 if (msr <= 0x1fff)

3688 __clear_bit(msr, msr_bitmap + 0x800 / f);

3689 else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))

3690 __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);

3691 }

3692

3693 static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)

3694 {

3695 int f = sizeof(unsigned long);

3696

3697 if (msr <= 0x1fff)

3698 __set_bit(msr, msr_bitmap + 0x000 / f);

3699 else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))

3700 __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);

3701 }

3702

3703 static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)

3704 {

3705 int f = sizeof(unsigned long);

3706

3707 if (msr <= 0x1fff)

3708 __set_bit(msr, msr_bitmap + 0x800 / f);

3709 else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))

3710 __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);

3711 }

3712

3713 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)

3714 {

3715 struct vcpu_vmx *vmx = to_vmx(vcpu);

3716 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;

3717

3718 if (!cpu_has_vmx_msr_bitmap())

3719 return;

3720

3721 if (static_branch_unlikely(&enable_evmcs))

3722 evmcs_touch_msr_bitmap();

3723

3724 /*

3725 * Mark the desired intercept state in shadow bitmap, this is needed

3726 * for resync when the MSR filters change.

3727 */

3728 if (is_valid_passthrough_msr(msr)) {

3729 int idx = possible_passthrough_msr_slot(msr);

3730

3731 if (idx != -ENOENT) {

3732 if (type & MSR_TYPE_R)

3733 clear_bit(idx, vmx->shadow_msr_intercept.read);

3734 if (type & MSR_TYPE_W)

3735 clear_bit(idx, vmx->shadow_msr_intercept.write);

3736 }

3737 }

3738

3739 if ((type & MSR_TYPE_R) &&

3740 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {

3741 vmx_set_msr_bitmap_read(msr_bitmap, msr);

3742 type &= ~MSR_TYPE_R;

3743 }

3744

3745 if ((type & MSR_TYPE_W) &&

3746 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {

3747 vmx_set_msr_bitmap_write(msr_bitmap, msr);

3748 type &= ~MSR_TYPE_W;

3749 }

3750

3751 if (type & MSR_TYPE_R)

3752 vmx_clear_msr_bitmap_read(msr_bitmap, msr);

3753

3754 if (type & MSR_TYPE_W)

3755 vmx_clear_msr_bitmap_write(msr_bitmap, msr);

3756 }

3757

3758 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)

3759 {

3760 struct vcpu_vmx *vmx = to_vmx(vcpu);

3761 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;

3762

3763 if (!cpu_has_vmx_msr_bitmap())

3764 return;

3765

3766 if (static_branch_unlikely(&enable_evmcs))

3767 evmcs_touch_msr_bitmap();

3768

3769 /*

3770 * Mark the desired intercept state in shadow bitmap, this is needed

3771 * for resync when the MSR filter changes.

3772 */

3773 if (is_valid_passthrough_msr(msr)) {

3774 int idx = possible_passthrough_msr_slot(msr);

3775

3776 if (idx != -ENOENT) {

3777 if (type & MSR_TYPE_R)

3778 set_bit(idx, vmx->shadow_msr_intercept.read);

3779 if (type & MSR_TYPE_W)

3780 set_bit(idx, vmx->shadow_msr_intercept.write);

3781 }

3782 }

3783

3784 if (type & MSR_TYPE_R)

3785 vmx_set_msr_bitmap_read(msr_bitmap, msr);

3786

3787 if (type & MSR_TYPE_W)

3788 vmx_set_msr_bitmap_write(msr_bitmap, msr);

3789 }

3790

3791 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)

3792 {

3793 u8 mode = 0;

3794

3795 if (cpu_has_secondary_exec_ctrls() &&

3796 (secondary_exec_controls_get(to_vmx(vcpu)) &

3797 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {

3798 mode |= MSR_BITMAP_MODE_X2APIC;

3799 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))

3800 mode |= MSR_BITMAP_MODE_X2APIC_APICV;

3801 }

3802

3803 return mode;

3804 }

3805

3806 static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)

3807 {

3808 unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

3809 unsigned long read_intercept;

3810 int msr;

3811

3812 read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;

3813

3814 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {

3815 unsigned int read_idx = msr / BITS_PER_LONG;

3816 unsigned int write_idx = read_idx + (0x800 / sizeof(long));

3817

3818 msr_bitmap[read_idx] = read_intercept;

3819 msr_bitmap[write_idx] = ~0ul;

3820 }

3821 }

3822

3823 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)

3824 {

3825 if (!cpu_has_vmx_msr_bitmap())

3826 return;

3827

3828 vmx_reset_x2apic_msrs(vcpu, mode);

3829

3830 /*

3831 * TPR reads and writes can be virtualized even if virtual interrupt

3832 * delivery is not in use.

3833 */

3834 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,

3835 !(mode & MSR_BITMAP_MODE_X2APIC));

3836

3837 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {

3838 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);

3839 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);

3840 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);

3841 }

3842 }

3843

3844 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)

3845 {

3846 struct vcpu_vmx *vmx = to_vmx(vcpu);

3847 u8 mode = vmx_msr_bitmap_mode(vcpu);

3848 u8 changed = mode ^ vmx->msr_bitmap_mode;

3849

3850 if (!changed)

3851 return;

3852

3853 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))

3854 vmx_update_msr_bitmap_x2apic(vcpu, mode);

3855

3856 vmx->msr_bitmap_mode = mode;

3857 }

3858

3859 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)

3860 {

3861 struct vcpu_vmx *vmx = to_vmx(vcpu);

3862 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);

3863 u32 i;

3864

3865 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);

3866 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);

3867 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);

3868 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);

3869 for (i = 0; i < vmx->pt_desc.addr_range; i++) {

3870 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);

3871 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);

3872 }

3873 }

3874

3875 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)

3876 {

3877 struct vcpu_vmx *vmx = to_vmx(vcpu);

3878 void *vapic_page;

3879 u32 vppr;

3880 int rvi;

3881

3882 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||

3883 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||

3884 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))

3885 return false;

3886

3887 rvi = vmx_get_rvi();

3888

3889 vapic_page = vmx->nested.virtual_apic_map.hva;

3890 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));

3891

3892 return ((rvi & 0xf0) > (vppr & 0xf0));

3893 }

3894

3895 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)

3896 {

3897 struct vcpu_vmx *vmx = to_vmx(vcpu);

3898 u32 i;

3899

3900 /*

3901 * Set intercept permissions for all potentially passed through MSRs

3902 * again. They will automatically get filtered through the MSR filter,

3903 * so we are back in sync after this.

3904 */

3905 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {

3906 u32 msr = vmx_possible_passthrough_msrs[i];

3907 bool read = test_bit(i, vmx->shadow_msr_intercept.read);

3908 bool write = test_bit(i, vmx->shadow_msr_intercept.write);

3909

3910 vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);

3911 vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);

3912 }

3913

3914 pt_update_intercept_for_msr(vcpu);

3915 vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));

3916 }

3917

3918 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,

3919 bool nested)

3920 {

3921 #ifdef CONFIG_SMP

3922 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;

3923

3924 if (vcpu->mode == IN_GUEST_MODE) {

3925 /*

3926 * The vector of interrupt to be delivered to vcpu had

3927 * been set in PIR before this function.

3928 *

3929 * Following cases will be reached in this block, and

3930 * we always send a notification event in all cases as

3931 * explained below.

3932 *

3933 * Case 1: vcpu keeps in non-root mode. Sending a

3934 * notification event posts the interrupt to vcpu.

3935 *

3936 * Case 2: vcpu exits to root mode and is still

3937 * runnable. PIR will be synced to vIRR before the

3938 * next vcpu entry. Sending a notification event in

3939 * this case has no effect, as vcpu is not in root

3940 * mode.

3941 *

3942 * Case 3: vcpu exits to root mode and is blocked.

3943 * vcpu_block() has already synced PIR to vIRR and

3944 * never blocks vcpu if vIRR is not cleared. Therefore,

3945 * a blocked vcpu here does not wait for any requested

3946 * interrupts in PIR, and sending a notification event

3947 * which has no effect is safe here.

3948 */

3949

3950 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);

3951 return true;

3952 }

3953 #endif

3954 return false;

3955 }

3956

3957 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,

3958 int vector)

3959 {

3960 struct vcpu_vmx *vmx = to_vmx(vcpu);

3961

3962 if (is_guest_mode(vcpu) &&

3963 vector == vmx->nested.posted_intr_nv) {

3964 /*

3965 * If a posted intr is not recognized by hardware,

3966 * we will accomplish it in the next vmentry.

3967 */

3968 vmx->nested.pi_pending = true;

3969 kvm_make_request(KVM_REQ_EVENT, vcpu);

3970 /* the PIR and ON have been set by L1. */

3971 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))

3972 kvm_vcpu_kick(vcpu);

3973 return 0;

3974 }

3975 return -1;

3976 }

3977 /*

3978 * Send interrupt to vcpu via posted interrupt way.

3979 * 1. If target vcpu is running(non-root mode), send posted interrupt

3980 * notification to vcpu and hardware will sync PIR to vIRR atomically.

3981 * 2. If target vcpu isn't running(root mode), kick it to pick up the

3982 * interrupt from PIR in next vmentry.

3983 */

3984 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)

3985 {

3986 struct vcpu_vmx *vmx = to_vmx(vcpu);

3987 int r;

3988

3989 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);

3990 if (!r)

3991 return 0;

3992

3993 if (!vcpu->arch.apicv_active)

3994 return -1;

3995

3996 if (pi_test_and_set_pir(vector, &vmx->pi_desc))

3997 return 0;

3998

3999 /* If a previous notification has sent the IPI, nothing to do. */

4000 if (pi_test_and_set_on(&vmx->pi_desc))

4001 return 0;

4002

4003 if (vcpu != kvm_get_running_vcpu() &&

4004 !kvm_vcpu_trigger_posted_interrupt(vcpu, false))

4005 kvm_vcpu_kick(vcpu);

4006

4007 return 0;

4008 }

4009

4010 /*

4011 * Set up the vmcs's constant host-state fields, i.e., host-state fields that

4012 * will not change in the lifetime of the guest.

4013 * Note that host-state that does change is set elsewhere. E.g., host-state

4014 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.

4015 */

4016 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)

4017 {

4018 u32 low32, high32;

4019 unsigned long tmpl;

4020 unsigned long cr0, cr3, cr4;

4021

4022 cr0 = read_cr0();

4023 WARN_ON(cr0 & X86_CR0_TS);

4024 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */

4025

4026 /*

4027 * Save the most likely value for this task's CR3 in the VMCS.

4028 * We can't use __get_current_cr3_fast() because we're not atomic.

4029 */

4030 cr3 = __read_cr3();

4031 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */

4032 vmx->loaded_vmcs->host_state.cr3 = cr3;

4033

4034 /* Save the most likely value for this task's CR4 in the VMCS. */

4035 cr4 = cr4_read_shadow();

4036 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */

4037 vmx->loaded_vmcs->host_state.cr4 = cr4;

4038

4039 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */

4040 #ifdef CONFIG_X86_64

4041 /*

4042 * Load null selectors, so we can avoid reloading them in

4043 * vmx_prepare_switch_to_host(), in case userspace uses

4044 * the null selectors too (the expected case).

4045 */

4046 vmcs_write16(HOST_DS_SELECTOR, 0);

4047 vmcs_write16(HOST_ES_SELECTOR, 0);

4048 #else

4049 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */

4050 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */

4051 #endif

4052 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */

4053 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */

4054

4055 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */

4056

4057 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

4058

4059 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);

4060 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);

4061 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);

4062 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */

4063

4064 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {

4065 rdmsr(MSR_IA32_CR_PAT, low32, high32);

4066 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));

4067 }

4068

4069 if (cpu_has_load_ia32_efer())

4070 vmcs_write64(HOST_IA32_EFER, host_efer);

4071 }

4072

4073 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)

4074 {

4075 struct kvm_vcpu *vcpu = &vmx->vcpu;

4076

4077 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &

4078 ~vcpu->arch.cr4_guest_rsvd_bits;

4079 if (!enable_ept)

4080 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;

4081 if (is_guest_mode(&vmx->vcpu))

4082 vcpu->arch.cr4_guest_owned_bits &=

4083 ~get_vmcs12(vcpu)->cr4_guest_host_mask;

4084 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);

4085 }

4086

4087 u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)

4088 {

4089 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

4090

4091 if (!kvm_vcpu_apicv_active(&vmx->vcpu))

4092 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;

4093

4094 if (!enable_vnmi)

4095 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

4096

4097 if (!enable_preemption_timer)

4098 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;

4099

4100 return pin_based_exec_ctrl;

4101 }

4102

4103 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)

4104 {

4105 struct vcpu_vmx *vmx = to_vmx(vcpu);

4106

4107 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

4108 if (cpu_has_secondary_exec_ctrls()) {

4109 if (kvm_vcpu_apicv_active(vcpu))

4110 secondary_exec_controls_setbit(vmx,

4111 SECONDARY_EXEC_APIC_REGISTER_VIRT |

4112 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

4113 else

4114 secondary_exec_controls_clearbit(vmx,

4115 SECONDARY_EXEC_APIC_REGISTER_VIRT |

4116 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

4117 }

4118

4119 if (cpu_has_vmx_msr_bitmap())

4120 vmx_update_msr_bitmap(vcpu);

4121 }

4122

4123 u32 vmx_exec_control(struct vcpu_vmx *vmx)

4124 {

4125 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

4126

4127 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)

4128 exec_control &= ~CPU_BASED_MOV_DR_EXITING;

4129

4130 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {

4131 exec_control &= ~CPU_BASED_TPR_SHADOW;

4132 #ifdef CONFIG_X86_64

4133 exec_control |= CPU_BASED_CR8_STORE_EXITING |

4134 CPU_BASED_CR8_LOAD_EXITING;

4135 #endif

4136 }

4137 if (!enable_ept)

4138 exec_control |= CPU_BASED_CR3_STORE_EXITING |

4139 CPU_BASED_CR3_LOAD_EXITING |

4140 CPU_BASED_INVLPG_EXITING;

4141 if (kvm_mwait_in_guest(vmx->vcpu.kvm))

4142 exec_control &= ~(CPU_BASED_MWAIT_EXITING |

4143 CPU_BASED_MONITOR_EXITING);

4144 if (kvm_hlt_in_guest(vmx->vcpu.kvm))

4145 exec_control &= ~CPU_BASED_HLT_EXITING;

4146 return exec_control;

4147 }

4148

4149 /*

4150 * Adjust a single secondary execution control bit to intercept/allow an

4151 * instruction in the guest. This is usually done based on whether or not a

4152 * feature has been exposed to the guest in order to correctly emulate faults.

4153 */

4154 static inline void

4155 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,

4156 u32 control, bool enabled, bool exiting)

4157 {

4158 /*

4159 * If the control is for an opt-in feature, clear the control if the

4160 * feature is not exposed to the guest, i.e. not enabled. If the

4161 * control is opt-out, i.e. an exiting control, clear the control if

4162 * the feature _is_ exposed to the guest, i.e. exiting/interception is

4163 * disabled for the associated instruction. Note, the caller is

4164 * responsible presetting exec_control to set all supported bits.

4165 */

4166 if (enabled == exiting)

4167 *exec_control &= ~control;

4168

4169 /*

4170 * Update the nested MSR settings so that a nested VMM can/can't set

4171 * controls for features that are/aren't exposed to the guest.

4172 */

4173 if (nested) {

4174 if (enabled)

4175 vmx->nested.msrs.secondary_ctls_high |= control;

4176 else

4177 vmx->nested.msrs.secondary_ctls_high &= ~control;

4178 }

4179 }

4180

/*
 * Convenience wrapper for the common case where a secondary execution
 * control follows a single guest CPUID bit that has a dedicated feature
 * flag.  Nothing is adjusted unless cpu_has_vmx_<name>() reports that the
 * control is supported.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({										\
	if (cpu_has_vmx_##name()) {						\
		bool __guest_has_feat =						\
			guest_cpuid_has(&(vmx)->vcpu,				\
					X86_FEATURE_##feat_name);		\
										\
		vmx_adjust_secondary_exec_control(vmx, exec_control,		\
						  SECONDARY_EXEC_##ctrl_name,	\
						  __guest_has_feat, exiting);	\
	}									\
})

/* Opt-in flavour: the control is named SECONDARY_EXEC_ENABLE_<uname>. */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

/* Opt-out flavour: the control is named SECONDARY_EXEC_<uname>_EXITING. */
#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)

4204

4205 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)

4206 {

4207 struct kvm_vcpu *vcpu = &vmx->vcpu;

4208

4209 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

4210

4211 if (vmx_pt_mode_is_system())

4212 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);

4213 if (!cpu_need_virtualize_apic_accesses(vcpu))

4214 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

4215 if (vmx->vpid == 0)

4216 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;

4217 if (!enable_ept) {

4218 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;

4219 enable_unrestricted_guest = 0;

4220 }

4221 if (!enable_unrestricted_guest)

4222 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

4223 if (kvm_pause_in_guest(vmx->vcpu.kvm))

4224 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;

4225 if (!kvm_vcpu_apicv_active(vcpu))

4226 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |

4227 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

4228 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

4229

4230 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,

4231 * in vmx_set_cr4. */

4232 exec_control &= ~SECONDARY_EXEC_DESC;

4233

4234 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD

4235 (handle_vmptrld).

4236 We can NOT enable shadow_vmcs here because we don't have yet

4237 a current VMCS12

4238 */

4239 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

4240

4241 /*

4242 * PML is enabled/disabled when dirty logging of memsmlots changes, but

4243 * it needs to be set here when dirty logging is already active, e.g.

4244 * if this vCPU was created after dirty logging was enabled.

4245 */

4246 if (!vcpu->kvm->arch.cpu_dirty_logging_count)

4247 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

4248

4249 if (cpu_has_vmx_xsaves()) {

4250 /* Exposing XSAVES only when XSAVE is exposed */

4251 bool xsaves_enabled =

4252 boot_cpu_has(X86_FEATURE_XSAVE) &&

4253 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&

4254 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);

4255

4256 vcpu->arch.xsaves_enabled = xsaves_enabled;

4257

4258 vmx_adjust_secondary_exec_control(vmx, &exec_control,

4259 SECONDARY_EXEC_XSAVES,

4260 xsaves_enabled, false);

4261 }

4262

4263 /*

4264 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either

4265 * feature is exposed to the guest. This creates a virtualization hole

4266 * if both are supported in hardware but only one is exposed to the

4267 * guest, but letting the guest execute RDTSCP or RDPID when either one

4268 * is advertised is preferable to emulating the advertised instruction

4269 * in KVM on #UD, and obviously better than incorrectly injecting #UD.

4270 */

4271 if (cpu_has_vmx_rdtscp()) {

4272 bool rdpid_or_rdtscp_enabled =

4273 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||

4274 guest_cpuid_has(vcpu, X86_FEATURE_RDPID);

4275

4276 vmx_adjust_secondary_exec_control(vmx, &exec_control,

4277 SECONDARY_EXEC_ENABLE_RDTSCP,

4278 rdpid_or_rdtscp_enabled, false);

4279 }

4280 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);

4281

4282 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);

4283 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

4284

4285 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,

4286 ENABLE_USR_WAIT_PAUSE, false);

4287

4288 if (!vcpu->kvm->arch.bus_lock_detection_enabled)

4289 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;

4290

4291 vmx->secondary_exec_control = exec_control;

4292 }

4293

4294 #define VMX_XSS_EXIT_BITMAP 0

4295

4296 /*

4297 * Noting that the initialization of Guest-state Area of VMCS is in

4298 * vmx_vcpu_reset().

4299 */

4300 static void init_vmcs(struct vcpu_vmx *vmx)

4301 {

4302 if (nested)

4303 nested_vmx_set_vmcs_shadowing_bitmap();

4304

4305 if (cpu_has_vmx_msr_bitmap())

4306 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

4307

4308 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

4309

4310 /* Control */

4311 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

4312

4313 exec_controls_set(vmx, vmx_exec_control(vmx));

4314

4315 if (cpu_has_secondary_exec_ctrls()) {

4316 vmx_compute_secondary_exec_control(vmx);

4317 secondary_exec_controls_set(vmx, vmx->secondary_exec_control);

4318 }

4319

4320 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {

4321 vmcs_write64(EOI_EXIT_BITMAP0, 0);

4322 vmcs_write64(EOI_EXIT_BITMAP1, 0);

4323 vmcs_write64(EOI_EXIT_BITMAP2, 0);

4324 vmcs_write64(EOI_EXIT_BITMAP3, 0);

4325

4326 vmcs_write16(GUEST_INTR_STATUS, 0);

4327

4328 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);

4329 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));

4330 }

4331

4332 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {

4333 vmcs_write32(PLE_GAP, ple_gap);

4334 vmx->ple_window = ple_window;

4335 vmx->ple_window_dirty = true;

4336 }

4337

4338 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);

4339 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);

4340 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */

4341

4342 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */

4343 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */

4344 vmx_set_constant_host_state(vmx);

4345 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */

4346 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

4347

4348 if (cpu_has_vmx_vmfunc())

4349 vmcs_write64(VM_FUNCTION_CONTROL, 0);

4350

4351 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);

4352 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);

4353 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));

4354 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

4355 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

4356

4357 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)

4358 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

4359

4360 vm_exit_controls_set(vmx, vmx_vmexit_ctrl());

4361

4362 /* 22.2.1, 20.8.1 */

4363 vm_entry_controls_set(vmx, vmx_vmentry_ctrl());

4364

4365 vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;

4366 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

4367

4368 set_cr4_guest_host_mask(vmx);

4369

4370 if (vmx->vpid != 0)

4371 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

4372

4373 if (cpu_has_vmx_xsaves())

4374 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

4375

4376 if (enable_pml) {

4377 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));

4378 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);

4379 }

4380

4381 vmx_write_encls_bitmap(&vmx->vcpu, NULL);

4382

4383 if (vmx_pt_mode_is_host_guest()) {

4384 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));

4385 /* Bit[6~0] are forced to 1, writes are ignored. */

4386 vmx->pt_desc.guest.output_mask = 0x7F;

4387 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);

4388 }

4389 }

4390

4391 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)

4392 {

4393 struct vcpu_vmx *vmx = to_vmx(vcpu);

4394 struct msr_data apic_base_msr;

4395 u32 eax, dummy;

4396 u64 cr0;

4397

4398 vmx->rmode.vm86_active = 0;

4399 vmx->spec_ctrl = 0;

4400

4401 vmx->msr_ia32_umwait_control = 0;

4402

4403 eax = 1;

4404 if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))

4405 eax = get_rdx_init_val();

4406 kvm_rdx_write(vcpu, eax);

4407

4408 vmx->hv_deadline_tsc = -1;

4409 kvm_set_cr8(vcpu, 0);

4410

4411 if (!init_event) {

4412 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |

4413 MSR_IA32_APICBASE_ENABLE;

4414 if (kvm_vcpu_is_reset_bsp(vcpu))

4415 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;

4416 apic_base_msr.host_initiated = true;

4417 kvm_set_apic_base(vcpu, &apic_base_msr);

4418 }

4419

4420 vmx_segment_cache_clear(vmx);

4421

4422 seg_setup(VCPU_SREG_CS);

4423 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);

4424 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

4425

4426 seg_setup(VCPU_SREG_DS);

4427 seg_setup(VCPU_SREG_ES);

4428 seg_setup(VCPU_SREG_FS);

4429 seg_setup(VCPU_SREG_GS);

4430 seg_setup(VCPU_SREG_SS);

4431

4432 vmcs_write16(GUEST_TR_SELECTOR, 0);

4433 vmcs_writel(GUEST_TR_BASE, 0);

4434 vmcs_write32(GUEST_TR_LIMIT, 0xffff);

4435 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

4436

4437 vmcs_write16(GUEST_LDTR_SELECTOR, 0);

4438 vmcs_writel(GUEST_LDTR_BASE, 0);

4439 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);

4440 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

4441

4442 if (!init_event) {

4443 vmcs_write32(GUEST_SYSENTER_CS, 0);

4444 vmcs_writel(GUEST_SYSENTER_ESP, 0);

4445 vmcs_writel(GUEST_SYSENTER_EIP, 0);

4446 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

4447 }

4448

4449 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);

4450 kvm_rip_write(vcpu, 0xfff0);

4451

4452 vmcs_writel(GUEST_GDTR_BASE, 0);

4453 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

4454

4455 vmcs_writel(GUEST_IDTR_BASE, 0);

4456 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

4457

4458 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);

4459 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);

4460 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);

4461 if (kvm_mpx_supported())

4462 vmcs_write64(GUEST_BNDCFGS, 0);

4463

4464 setup_msrs(vmx);

4465

4466 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */

4467

4468 if (cpu_has_vmx_tpr_shadow() && !init_event) {

4469 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);

4470 if (cpu_need_tpr_shadow(vcpu))

4471 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,

4472 __pa(vcpu->arch.apic->regs));

4473 vmcs_write32(TPR_THRESHOLD, 0);

4474 }

4475

4476 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

4477

4478 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;

4479 vmx->vcpu.arch.cr0 = cr0;

4480 vmx_set_cr0(vcpu, cr0); /* enter rmode */

4481 vmx_set_cr4(vcpu, 0);

4482 vmx_set_efer(vcpu, 0);

4483

4484 vmx_update_exception_bitmap(vcpu);

4485

4486 vpid_sync_context(vmx->vpid);

4487 if (init_event)

4488 vmx_clear_hlt(vcpu);

4489 }

4490

4491 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)

4492 {

4493 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);

4494 }

4495

4496 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)

4497 {

4498 if (!enable_vnmi ||

4499 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {

4500 vmx_enable_irq_window(vcpu);

4501 return;

4502 }

4503

4504 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);

4505 }

4506

4507 static void vmx_inject_irq(struct kvm_vcpu *vcpu)

4508 {

4509 struct vcpu_vmx *vmx = to_vmx(vcpu);

4510 uint32_t intr;

4511 int irq = vcpu->arch.interrupt.nr;

4512

4513 trace_kvm_inj_virq(irq);

4514

4515 ++vcpu->stat.irq_injections;

4516 if (vmx->rmode.vm86_active) {

4517 int inc_eip = 0;

4518 if (vcpu->arch.interrupt.soft)

4519 inc_eip = vcpu->arch.event_exit_inst_len;

4520 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);

4521 return;

4522 }

4523 intr = irq | INTR_INFO_VALID_MASK;

4524 if (vcpu->arch.interrupt.soft) {

4525 intr |= INTR_TYPE_SOFT_INTR;

4526 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,

4527 vmx->vcpu.arch.event_exit_inst_len);

4528 } else

4529 intr |= INTR_TYPE_EXT_INTR;

4530 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

4531

4532 vmx_clear_hlt(vcpu);

4533 }

4534

4535 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)

4536 {

4537 struct vcpu_vmx *vmx = to_vmx(vcpu);

4538

4539 if (!enable_vnmi) {

4540 /*

4541 * Tracking the NMI-blocked state in software is built upon

4542 * finding the next open IRQ window. This, in turn, depends on

4543 * well-behaving guests: They have to keep IRQs disabled at

4544 * least as long as the NMI handler runs. Otherwise we may

4545 * cause NMI nesting, maybe breaking the guest. But as this is

4546 * highly unlikely, we can live with the residual risk.

4547 */

4548 vmx->loaded_vmcs->soft_vnmi_blocked = 1;

4549 vmx->loaded_vmcs->vnmi_blocked_time = 0;

4550 }

4551

4552 ++vcpu->stat.nmi_injections;

4553 vmx->loaded_vmcs->nmi_known_unmasked = false;

4554

4555 if (vmx->rmode.vm86_active) {

4556 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);

4557 return;

4558 }

4559

4560 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,

4561 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

4562

4563 vmx_clear_hlt(vcpu);

4564 }

4565

4566 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)

4567 {

4568 struct vcpu_vmx *vmx = to_vmx(vcpu);

4569 bool masked;

4570

4571 if (!enable_vnmi)

4572 return vmx->loaded_vmcs->soft_vnmi_blocked;

4573 if (vmx->loaded_vmcs->nmi_known_unmasked)

4574 return false;

4575 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;

4576 vmx->loaded_vmcs->nmi_known_unmasked = !masked;

4577 return masked;

4578 }

4579

4580 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)

4581 {

4582 struct vcpu_vmx *vmx = to_vmx(vcpu);

4583

4584 if (!enable_vnmi) {

4585 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {

4586 vmx->loaded_vmcs->soft_vnmi_blocked = masked;

4587 vmx->loaded_vmcs->vnmi_blocked_time = 0;

4588 }

4589 } else {

4590 vmx->loaded_vmcs->nmi_known_unmasked = !masked;

4591 if (masked)

4592 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,

4593 GUEST_INTR_STATE_NMI);

4594 else

4595 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,

4596 GUEST_INTR_STATE_NMI);

4597 }

4598 }

4599

4600 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)

4601 {

4602 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))

4603 return false;

4604

4605 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)

4606 return true;

4607

4608 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &

4609 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |

4610 GUEST_INTR_STATE_NMI));

4611 }

4612

4613 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)

4614 {

4615 if (to_vmx(vcpu)->nested.nested_run_pending)

4616 return -EBUSY;

4617

4618 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */

4619 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))

4620 return -EBUSY;

4621

4622 return !vmx_nmi_blocked(vcpu);

4623 }

4624

4625 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)

4626 {

4627 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))

4628 return false;

4629

4630 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||

4631 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &

4632 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));

4633 }

4634

4635 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)

4636 {

4637 if (to_vmx(vcpu)->nested.nested_run_pending)

4638 return -EBUSY;

4639

4640 /*

4641 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,

4642 * e.g. if the IRQ arrived asynchronously after checking nested events.

4643 */

4644 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))

4645 return -EBUSY;

4646

4647 return !vmx_interrupt_blocked(vcpu);

4648 }

4649

4650 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)

4651 {

4652 void __user *ret;

4653

4654 if (enable_unrestricted_guest)

4655 return 0;

4656

4657 mutex_lock(&kvm->slots_lock);

4658 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,

4659 PAGE_SIZE * 3);

4660 mutex_unlock(&kvm->slots_lock);

4661

4662 if (IS_ERR(ret))

4663 return PTR_ERR(ret);

4664

4665 to_kvm_vmx(kvm)->tss_addr = addr;

4666

4667 return init_rmode_tss(kvm, ret);

4668 }

4669

4670 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)

4671 {

4672 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;

4673 return 0;

4674 }

4675

4676 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)

4677 {

4678 switch (vec) {

4679 case BP_VECTOR:

4680 /*

4681 * Update instruction length as we may reinject the exception

4682 * from user space while in guest debugging mode.

4683 */

4684 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =

4685 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

4686 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)

4687 return false;

4688 fallthrough;

4689 case DB_VECTOR:

4690 return !(vcpu->guest_debug &

4691 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));

4692 case DE_VECTOR:

4693 case OF_VECTOR:

4694 case BR_VECTOR:

4695 case UD_VECTOR:

4696 case DF_VECTOR:

4697 case SS_VECTOR:

4698 case GP_VECTOR:

4699 case MF_VECTOR:

4700 return true;

4701 }

4702 return false;

4703 }

4704

4705 static int handle_rmode_exception(struct kvm_vcpu *vcpu,

4706 int vec, u32 err_code)

4707 {

4708 /*

4709 * Instruction with address size override prefix opcode 0x67

4710 * Cause the #SS fault with 0 error code in VM86 mode.

4711 */

4712 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {

4713 if (kvm_emulate_instruction(vcpu, 0)) {

4714 if (vcpu->arch.halt_request) {

4715 vcpu->arch.halt_request = 0;

4716 return kvm_vcpu_halt(vcpu);

4717 }

4718 return 1;

4719 }

4720 return 0;

4721 }

4722

4723 /*

4724 * Forward all other exceptions that are valid in real mode.

4725 * FIXME: Breaks guest debugging in real mode, needs to be fixed with

4726 * the required debugging infrastructure rework.

4727 */

4728 kvm_queue_exception(vcpu, vec);

4729 return 1;

4730 }

4731

/*
 * Exit handler for machine checks.  Nothing is done here because the event
 * is handled by vmx_vcpu_run(); this only returns 1.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	return 1;
}

4737

4738 /*

4739 * If the host has split lock detection disabled, then #AC is

4740 * unconditionally injected into the guest, which is the pre split lock

4741 * detection behaviour.

4742 *

4743 * If the host has split lock detection enabled then #AC is

4744 * only injected into the guest when:

4745 * - Guest CPL == 3 (user mode)

4746 * - Guest has #AC detection enabled in CR0

4747 * - Guest EFLAGS has AC bit set

4748 */

4749 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)

4750 {

4751 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))

4752 return true;

4753

4754 return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&

4755 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);

4756 }

4757

4758 static int handle_exception_nmi(struct kvm_vcpu *vcpu)

4759 {

4760 struct vcpu_vmx *vmx = to_vmx(vcpu);

4761 struct kvm_run *kvm_run = vcpu->run;

4762 u32 intr_info, ex_no, error_code;

4763 unsigned long cr2, dr6;

4764 u32 vect_info;

4765

4766 vect_info = vmx->idt_vectoring_info;

4767 intr_info = vmx_get_intr_info(vcpu);

4768

4769 if (is_machine_check(intr_info) || is_nmi(intr_info))

4770 return 1; /* handled by handle_exception_nmi_irqoff() */

4771

4772 if (is_invalid_opcode(intr_info))

4773 return handle_ud(vcpu);

4774

4775 error_code = 0;

4776 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)

4777 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

4778

4779 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {

4780 WARN_ON_ONCE(!enable_vmware_backdoor);

4781

4782 /*

4783 * VMware backdoor emulation on #GP interception only handles

4784 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero

4785 * error code on #GP.

4786 */

4787 if (error_code) {

4788 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);

4789 return 1;

4790 }

4791 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);

4792 }

4793

4794 /*

4795 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing

4796 * MMIO, it is better to report an internal error.

4797 * See the comments in vmx_handle_exit.

4798 */

4799 if ((vect_info & VECTORING_INFO_VALID_MASK) &&

4800 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {

4801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;

4802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;

4803 vcpu->run->internal.ndata = 4;

4804 vcpu->run->internal.data[0] = vect_info;

4805 vcpu->run->internal.data[1] = intr_info;

4806 vcpu->run->internal.data[2] = error_code;

4807 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;

4808 return 0;

4809 }

4810

4811 if (is_page_fault(intr_info)) {

4812 cr2 = vmx_get_exit_qual(vcpu);

4813 if (enable_ept && !vcpu->arch.apf.host_apf_flags) {

4814 /*

4815 * EPT will cause page fault only if we need to

4816 * detect illegal GPAs.

4817 */

4818 WARN_ON_ONCE(!allow_smaller_maxphyaddr);

4819 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);

4820 return 1;

4821 } else

4822 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);

4823 }

4824

4825 ex_no = intr_info & INTR_INFO_VECTOR_MASK;

4826

4827 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))

4828 return handle_rmode_exception(vcpu, ex_no, error_code);

4829

4830 switch (ex_no) {

4831 case DB_VECTOR:

4832 dr6 = vmx_get_exit_qual(vcpu);

4833 if (!(vcpu->guest_debug &

4834 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {

4835 if (is_icebp(intr_info))

4836 WARN_ON(!skip_emulated_instruction(vcpu));

4837

4838 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);

4839 return 1;

4840 }

4841 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;

4842 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);

4843 fallthrough;

4844 case BP_VECTOR:

4845 /*

4846 * Update instruction length as we may reinject #BP from

4847 * user space while in guest debugging mode. Reading it for

4848 * #DB as well causes no harm, it is not used in that case.

4849 */

4850 vmx->vcpu.arch.event_exit_inst_len =

4851 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

4852 kvm_run->exit_reason = KVM_EXIT_DEBUG;

4853 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);

4854 kvm_run->debug.arch.exception = ex_no;

4855 break;

4856 case AC_VECTOR:

4857 if (vmx_guest_inject_ac(vcpu)) {

4858 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);

4859 return 1;

4860 }

4861

4862 /*

4863 * Handle split lock. Depending on detection mode this will

4864 * either warn and disable split lock detection for this

4865 * task or force SIGBUS on it.

4866 */

4867 if (handle_guest_split_lock(kvm_rip_read(vcpu)))

4868 return 1;

4869 fallthrough;

4870 default:

4871 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;

4872 kvm_run->ex.exception = ex_no;

4873 kvm_run->ex.error_code = error_code;

4874 break;

4875 }

4876 return 0;

4877 }

4878

4879 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)

4880 {

4881 ++vcpu->stat.irq_exits;

4882 return 1;

4883 }

4884

4885 static int handle_triple_fault(struct kvm_vcpu *vcpu)

4886 {

4887 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;

4888 vcpu->mmio_needed = 0;

4889 return 0;

4890 }

4891

4892 static int handle_io(struct kvm_vcpu *vcpu)

4893 {

4894 unsigned long exit_qualification;

4895 int size, in, string;

4896 unsigned port;

4897

4898 exit_qualification = vmx_get_exit_qual(vcpu);

4899 string = (exit_qualification & 16) != 0;

4900

4901 ++vcpu->stat.io_exits;

4902

4903 if (string)

4904 return kvm_emulate_instruction(vcpu, 0);

4905

4906 port = exit_qualification >> 16;

4907 size = (exit_qualification & 7) + 1;

4908 in = (exit_qualification & 8) != 0;

4909

4910 return kvm_fast_pio(vcpu, size, port, in);

4911 }

4912

/*
 * Write the three-byte VMCALL instruction (0f 01 c1) into @hypercall.
 * @vcpu is not used.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	unsigned char *insn = hypercall;

	*insn++ = 0x0f;
	*insn++ = 0x01;
	*insn = 0xc1;
}

4923

4924 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */

4925 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)

4926 {

4927 if (is_guest_mode(vcpu)) {

4928 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

4929 unsigned long orig_val = val;

4930

4931 /*

4932 * We get here when L2 changed cr0 in a way that did not change

4933 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),

4934 * but did change L0 shadowed bits. So we first calculate the

4935 * effective cr0 value that L1 would like to write into the

4936 * hardware. It consists of the L2-owned bits from the new

4937 * value combined with the L1-owned bits from L1's guest_cr0.

4938 */

4939 val = (val & ~vmcs12->cr0_guest_host_mask) |

4940 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);

4941

4942 if (!nested_guest_cr0_valid(vcpu, val))

4943 return 1;

4944

4945 if (kvm_set_cr0(vcpu, val))

4946 return 1;

4947 vmcs_writel(CR0_READ_SHADOW, orig_val);

4948 return 0;

4949 } else {

4950 if (to_vmx(vcpu)->nested.vmxon &&

4951 !nested_host_cr0_valid(vcpu, val))

4952 return 1;

4953

4954 return kvm_set_cr0(vcpu, val);

4955 }

4956 }

4957

4958 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)

4959 {

4960 if (is_guest_mode(vcpu)) {

4961 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

4962 unsigned long orig_val = val;

4963

4964 /* analogously to handle_set_cr0 */

4965 val = (val & ~vmcs12->cr4_guest_host_mask) |

4966 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);

4967 if (kvm_set_cr4(vcpu, val))

4968 return 1;

4969 vmcs_writel(CR4_READ_SHADOW, orig_val);

4970 return 0;

4971 } else

4972 return kvm_set_cr4(vcpu, val);

4973 }

4974

4975 static int handle_desc(struct kvm_vcpu *vcpu)

4976 {

4977 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));

4978 return kvm_emulate_instruction(vcpu, 0);

4979 }

4980

4981 static int handle_cr(struct kvm_vcpu *vcpu)

4982 {

4983 unsigned long exit_qualification, val;

4984 int cr;

4985 int reg;

4986 int err;

4987 int ret;

4988

4989 exit_qualification = vmx_get_exit_qual(vcpu);

4990 cr = exit_qualification & 15;

4991 reg = (exit_qualification >> 8) & 15;

4992 switch ((exit_qualification >> 4) & 3) {

4993 case 0: /* mov to cr */

4994 val = kvm_register_read(vcpu, reg);

4995 trace_kvm_cr_write(cr, val);

4996 switch (cr) {

4997 case 0:

4998 err = handle_set_cr0(vcpu, val);

4999 return kvm_complete_insn_gp(vcpu, err);

5000 case 3:

5001 WARN_ON_ONCE(enable_unrestricted_guest);

5002

5003 err = kvm_set_cr3(vcpu, val);

5004 return kvm_complete_insn_gp(vcpu, err);

5005 case 4:

5006 err = handle_set_cr4(vcpu, val);

5007 return kvm_complete_insn_gp(vcpu, err);

5008 case 8: {

5009 u8 cr8_prev = kvm_get_cr8(vcpu);

5010 u8 cr8 = (u8)val;

5011 err = kvm_set_cr8(vcpu, cr8);

5012 ret = kvm_complete_insn_gp(vcpu, err);

5013 if (lapic_in_kernel(vcpu))

5014 return ret;

5015 if (cr8_prev <= cr8)

5016 return ret;

5017 /*

5018 * TODO: we might be squashing a

5019 * KVM_GUESTDBG_SINGLESTEP-triggered

5020 * KVM_EXIT_DEBUG here.

5021 */

5022 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;

5023 return 0;

5024 }

5025 }

5026 break;

5027 case 2: /* clts */

5028 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");

5029 return -EIO;

5030 case 1: /*mov from cr*/

5031 switch (cr) {

5032 case 3:

5033 WARN_ON_ONCE(enable_unrestricted_guest);

5034

5035 val = kvm_read_cr3(vcpu);

5036 kvm_register_write(vcpu, reg, val);

5037 trace_kvm_cr_read(cr, val);

5038 return kvm_skip_emulated_instruction(vcpu);

5039 case 8:

5040 val = kvm_get_cr8(vcpu);

5041 kvm_register_write(vcpu, reg, val);

5042 trace_kvm_cr_read(cr, val);

5043 return kvm_skip_emulated_instruction(vcpu);

5044 }

5045 break;

5046 case 3: /* lmsw */

5047 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;

5048 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);

5049 kvm_lmsw(vcpu, val);

5050

5051 return kvm_skip_emulated_instruction(vcpu);

5052 default:

5053 break;

5054 }

5055 vcpu->run->exit_reason = 0;

5056 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",

5057 (int)(exit_qualification >> 4) & 3, cr);

5058 return 0;

5059 }

5060

5061 static int handle_dr(struct kvm_vcpu *vcpu)

5062 {

5063 unsigned long exit_qualification;

5064 int dr, dr7, reg;

5065 int err = 1;

5066

5067 exit_qualification = vmx_get_exit_qual(vcpu);

5068 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

5069

5070 /* First, if DR does not exist, trigger UD */

5071 if (!kvm_require_dr(vcpu, dr))

5072 return 1;

5073

5074 if (kvm_x86_ops.get_cpl(vcpu) > 0)

5075 goto out;

5076

5077 dr7 = vmcs_readl(GUEST_DR7);

5078 if (dr7 & DR7_GD) {

5079 /*

5080 * As the vm-exit takes precedence over the debug trap, we

5081 * need to emulate the latter, either for the host or the

5082 * guest debugging itself.

5083 */

5084 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {

5085 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;

5086 vcpu->run->debug.arch.dr7 = dr7;

5087 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);

5088 vcpu->run->debug.arch.exception = DB_VECTOR;

5089 vcpu->run->exit_reason = KVM_EXIT_DEBUG;

5090 return 0;

5091 } else {

5092 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);

5093 return 1;

5094 }

5095 }

5096

5097 if (vcpu->guest_debug == 0) {

5098 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

5099

5100 /*

5101 * No more DR vmexits; force a reload of the debug registers

5102 * and reenter on this instruction. The next vmexit will

5103 * retrieve the full state of the debug registers.

5104 */

5105 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;

5106 return 1;

5107 }

5108

5109 reg = DEBUG_REG_ACCESS_REG(exit_qualification);

5110 if (exit_qualification & TYPE_MOV_FROM_DR) {

5111 unsigned long val;

5112

5113 kvm_get_dr(vcpu, dr, &val);

5114 kvm_register_write(vcpu, reg, val);

5115 err = 0;

5116 } else {

5117 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));

5118 }

5119

5120 out:

5121 return kvm_complete_insn_gp(vcpu, err);

5122 }

5123

5124 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)

5125 {

5126 get_debugreg(vcpu->arch.db[0], 0);

5127 get_debugreg(vcpu->arch.db[1], 1);

5128 get_debugreg(vcpu->arch.db[2], 2);

5129 get_debugreg(vcpu->arch.db[3], 3);

5130 get_debugreg(vcpu->arch.dr6, 6);

5131 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

5132

5133 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;

5134 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

5135 }

5136

5137 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)

5138 {

5139 vmcs_writel(GUEST_DR7, val);

5140 }

5141

/*
 * TPR-below-threshold exit: refresh the APIC's PPR through
 * kvm_apic_update_ppr() and resume the guest.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
	kvm_apic_update_ppr(vcpu);

	return 1;
}

5147

5148 static int handle_interrupt_window(struct kvm_vcpu *vcpu)

5149 {

5150 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);

5151

5152 kvm_make_request(KVM_REQ_EVENT, vcpu);

5153

5154 ++vcpu->stat.irq_window_exits;

5155 return 1;

5156 }

5157

/*
 * INVLPG exit: pass the exit qualification to kvm_mmu_invlpg() and skip the
 * instruction.
 */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
	kvm_mmu_invlpg(vcpu, vmx_get_exit_qual(vcpu));

	return kvm_skip_emulated_instruction(vcpu);
}

5165

5166 static int handle_apic_access(struct kvm_vcpu *vcpu)

5167 {

5168 if (likely(fasteoi)) {

5169 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

5170 int access_type, offset;

5171

5172 access_type = exit_qualification & APIC_ACCESS_TYPE;

5173 offset = exit_qualification & APIC_ACCESS_OFFSET;

5174 /*

5175 * Sane guest uses MOV to write EOI, with written value

5176 * not cared. So make a short-circuit here by avoiding

5177 * heavy instruction emulation.

5178 */

5179 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&

5180 (offset == APIC_EOI)) {

5181 kvm_lapic_set_eoi(vcpu);

5182 return kvm_skip_emulated_instruction(vcpu);

5183 }

5184 }

5185 return kvm_emulate_instruction(vcpu, 0);

5186 }

5187

/*
 * EOI-induced exit: the vector is the low byte of the exit qualification and
 * is forwarded to kvm_apic_set_eoi_accelerated().
 */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	int vector = vmx_get_exit_qual(vcpu) & 0xff;

	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_set_eoi_accelerated(vcpu, vector);

	return 1;
}

5197

5198 static int handle_apic_write(struct kvm_vcpu *vcpu)

5199 {

5200 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

5201 u32 offset = exit_qualification & 0xfff;

5202

5203 /* APIC-write VM exit is trap-like and thus no need to adjust IP */

5204 kvm_apic_write_nodecode(vcpu, offset);

5205 return 1;

5206 }

5207

5208 static int handle_task_switch(struct kvm_vcpu *vcpu)

5209 {

5210 struct vcpu_vmx *vmx = to_vmx(vcpu);

5211 unsigned long exit_qualification;

5212 bool has_error_code = false;

5213 u32 error_code = 0;

5214 u16 tss_selector;

5215 int reason, type, idt_v, idt_index;

5216

5217 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);

5218 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);

5219 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

5220

5221 exit_qualification = vmx_get_exit_qual(vcpu);

5222

5223 reason = (u32)exit_qualification >> 30;

5224 if (reason == TASK_SWITCH_GATE && idt_v) {

5225 switch (type) {

5226 case INTR_TYPE_NMI_INTR:

5227 vcpu->arch.nmi_injected = false;

5228 vmx_set_nmi_mask(vcpu, true);

5229 break;

5230 case INTR_TYPE_EXT_INTR:

5231 case INTR_TYPE_SOFT_INTR:

5232 kvm_clear_interrupt_queue(vcpu);

5233 break;

5234 case INTR_TYPE_HARD_EXCEPTION:

5235 if (vmx->idt_vectoring_info &

5236 VECTORING_INFO_DELIVER_CODE_MASK) {

5237 has_error_code = true;

5238 error_code =

5239 vmcs_read32(IDT_VECTORING_ERROR_CODE);

5240 }

5241 fallthrough;

5242 case INTR_TYPE_SOFT_EXCEPTION:

5243 kvm_clear_exception_queue(vcpu);

5244 break;

5245 default:

5246 break;

5247 }

5248 }

5249 tss_selector = exit_qualification;

5250

5251 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&

5252 type != INTR_TYPE_EXT_INTR &&

5253 type != INTR_TYPE_NMI_INTR))

5254 WARN_ON(!skip_emulated_instruction(vcpu));

5255

5256 /*

5257 * TODO: What about debug traps on tss switch?

5258 * Are we supposed to inject them and update dr6?

5259 */

5260 return kvm_task_switch(vcpu, tss_selector,

5261 type == INTR_TYPE_SOFT_INTR ? idt_index : -1,

5262 reason, has_error_code, error_code);

5263 }

5264

5265 static int handle_ept_violation(struct kvm_vcpu *vcpu)

5266 {

5267 unsigned long exit_qualification;

5268 gpa_t gpa;

5269 u64 error_code;

5270

5271 exit_qualification = vmx_get_exit_qual(vcpu);

5272

5273 /*

5274 * EPT violation happened while executing iret from NMI,

5275 * "blocked by NMI" bit has to be set before next VM entry.

5276 * There are errata that may cause this bit to not be set:

5277 * AAK134, BY25.

5278 */

5279 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&

5280 enable_vnmi &&

5281 (exit_qualification & INTR_INFO_UNBLOCK_NMI))

5282 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

5283

5284 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);

5285 trace_kvm_page_fault(gpa, exit_qualification);

5286

5287 /* Is it a read fault? */

5288 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)

5289 ? PFERR_USER_MASK : 0;

5290 /* Is it a write fault? */

5291 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)

5292 ? PFERR_WRITE_MASK : 0;

5293 /* Is it a fetch fault? */

5294 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)

5295 ? PFERR_FETCH_MASK : 0;

5296 /* ept page table entry is present? */

5297 error_code |= (exit_qualification &

5298 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |

5299 EPT_VIOLATION_EXECUTABLE))

5300 ? PFERR_PRESENT_MASK : 0;

5301

5302 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?

5303 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;

5304

5305 vcpu->arch.exit_qualification = exit_qualification;

5306

5307 /*

5308 * Check that the GPA doesn't exceed physical memory limits, as that is

5309 * a guest page fault. We have to emulate the instruction here, because

5310 * if the illegal address is that of a paging structure, then

5311 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we

5312 * would also use advanced VM-exit information for EPT violations to

5313 * reconstruct the page fault error code.

5314 */

5315 if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))

5316 return kvm_emulate_instruction(vcpu, 0);

5317

5318 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);

5319 }

5320

5321 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)

5322 {

5323 gpa_t gpa;

5324

5325 if (!vmx_can_emulate_instruction(vcpu, NULL, 0))

5326 return 1;

5327

5328 /*

5329 * A nested guest cannot optimize MMIO vmexits, because we have an

5330 * nGPA here instead of the required GPA.

5331 */

5332 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);

5333 if (!is_guest_mode(vcpu) &&

5334 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {

5335 trace_kvm_fast_mmio(gpa);

5336 return kvm_skip_emulated_instruction(vcpu);

5337 }

5338

5339 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);

5340 }

5341

5342 static int handle_nmi_window(struct kvm_vcpu *vcpu)

5343 {

5344 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))

5345 return -EIO;

5346

5347 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);

5348 ++vcpu->stat.nmi_window_exits;

5349 kvm_make_request(KVM_REQ_EVENT, vcpu);

5350

5351 return 1;

5352 }

5353

5354 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)

5355 {

5356 struct vcpu_vmx *vmx = to_vmx(vcpu);

5357 bool intr_window_requested;

5358 unsigned count = 130;

5359

5360 intr_window_requested = exec_controls_get(vmx) &

5361 CPU_BASED_INTR_WINDOW_EXITING;

5362

5363 while (vmx->emulation_required && count-- != 0) {

5364 if (intr_window_requested && !vmx_interrupt_blocked(vcpu))

5365 return handle_interrupt_window(&vmx->vcpu);

5366

5367 if (kvm_test_request(KVM_REQ_EVENT, vcpu))

5368 return 1;

5369

5370 if (!kvm_emulate_instruction(vcpu, 0))

5371 return 0;

5372

5373 if (vmx->emulation_required && !vmx->rmode.vm86_active &&

5374 vcpu->arch.exception.pending) {

5375 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;

5376 vcpu->run->internal.suberror =

5377 KVM_INTERNAL_ERROR_EMULATION;

5378 vcpu->run->internal.ndata = 0;

5379 return 0;

5380 }

5381

5382 if (vcpu->arch.halt_request) {

5383 vcpu->arch.halt_request = 0;

5384 return kvm_vcpu_halt(vcpu);

5385 }

5386

5387 /*

5388 * Note, return 1 and not 0, vcpu_run() will invoke

5389 * xfer_to_guest_mode() which will create a proper return

5390 * code.

5391 */

5392 if (__xfer_to_guest_mode_work_pending())

5393 return 1;

5394 }

5395

5396 return 1;

5397 }

5398

5399 static void grow_ple_window(struct kvm_vcpu *vcpu)

5400 {

5401 struct vcpu_vmx *vmx = to_vmx(vcpu);

5402 unsigned int old = vmx->ple_window;

5403

5404 vmx->ple_window = __grow_ple_window(old, ple_window,

5405 ple_window_grow,

5406 ple_window_max);

5407

5408 if (vmx->ple_window != old) {

5409 vmx->ple_window_dirty = true;

5410 trace_kvm_ple_window_update(vcpu->vcpu_id,

5411 vmx->ple_window, old);

5412 }

5413 }

5414

5415 static void shrink_ple_window(struct kvm_vcpu *vcpu)

5416 {

5417 struct vcpu_vmx *vmx = to_vmx(vcpu);

5418 unsigned int old = vmx->ple_window;

5419

5420 vmx->ple_window = __shrink_ple_window(old, ple_window,

5421 ple_window_shrink,

5422 ple_window);

5423

5424 if (vmx->ple_window != old) {

5425 vmx->ple_window_dirty = true;

5426 trace_kvm_ple_window_update(vcpu->vcpu_id,

5427 vmx->ple_window, old);

5428 }

5429 }

5430

5431 /*

5432 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE

5433 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.

5434 */

5435 static int handle_pause(struct kvm_vcpu *vcpu)

5436 {

5437 if (!kvm_pause_in_guest(vcpu->kvm))

5438 grow_ple_window(vcpu);

5439

5440 /*

5441 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"

5442 * VM-execution control is ignored if CPL > 0. OTOH, KVM

5443 * never set PAUSE_EXITING and just set PLE if supported,

5444 * so the vcpu must be CPL=0 if it gets a PAUSE exit.

5445 */

5446 kvm_vcpu_on_spin(vcpu, true);

5447 return kvm_skip_emulated_instruction(vcpu);

5448 }

5449

/* Monitor-trap-flag exit: nothing to do, resume the guest. */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}

5454

5455 static int handle_invpcid(struct kvm_vcpu *vcpu)

5456 {

5457 u32 vmx_instruction_info;

5458 unsigned long type;

5459 gva_t gva;

5460 struct {

5461 u64 pcid;

5462 u64 gla;

5463 } operand;

5464

5465 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {

5466 kvm_queue_exception(vcpu, UD_VECTOR);

5467 return 1;

5468 }

5469

5470 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

5471 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);

5472

5473 if (type > 3) {

5474 kvm_inject_gp(vcpu, 0);

5475 return 1;

5476 }

5477

5478 /* According to the Intel instruction reference, the memory operand

5479 * is read even if it isn't needed (e.g., for type==all)

5480 */

5481 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),

5482 vmx_instruction_info, false,

5483 sizeof(operand), &gva))

5484 return 1;

5485

5486 return kvm_handle_invpcid(vcpu, type, gva);

5487 }

5488

5489 static int handle_pml_full(struct kvm_vcpu *vcpu)

5490 {

5491 unsigned long exit_qualification;

5492

5493 trace_kvm_pml_full(vcpu->vcpu_id);

5494

5495 exit_qualification = vmx_get_exit_qual(vcpu);

5496

5497 /*

5498 * PML buffer FULL happened while executing iret from NMI,

5499 * "blocked by NMI" bit has to be set before next VM entry.

5500 */

5501 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&

5502 enable_vnmi &&

5503 (exit_qualification & INTR_INFO_UNBLOCK_NMI))

5504 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,

5505 GUEST_INTR_STATE_NMI);

5506

5507 /*

5508 * PML buffer already flushed at beginning of VMEXIT. Nothing to do

5509 * here.., and there's no userspace involvement needed for PML.

5510 */

5511 return 1;

5512 }

5513

5514 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)

5515 {

5516 struct vcpu_vmx *vmx = to_vmx(vcpu);

5517

5518 if (!vmx->req_immediate_exit &&

5519 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {

5520 kvm_lapic_expired_hv_timer(vcpu);

5521 return EXIT_FASTPATH_REENTER_GUEST;

5522 }

5523

5524 return EXIT_FASTPATH_NONE;

5525 }

5526

/*
 * Preemption-timer exit: run the fastpath handler for its side effects,
 * ignore its verdict and resume the guest.
 */
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	handle_fastpath_preemption_timer(vcpu);

	return 1;
}

5532

5533 /*

5534 * When nested=0, all VMX instruction VM Exits filter here. The handlers

5535 * are overwritten by nested_vmx_setup() when nested=1.

5536 */

5537 static int handle_vmx_instruction(struct kvm_vcpu *vcpu)

5538 {

5539 kvm_queue_exception(vcpu, UD_VECTOR);

5540 return 1;

5541 }

5542

5543 #ifndef CONFIG_X86_SGX_KVM

5544 static int handle_encls(struct kvm_vcpu *vcpu)

5545 {

5546 /*

5547 * SGX virtualization is disabled. There is no software enable bit for

5548 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent

5549 * the guest from executing ENCLS (when SGX is supported by hardware).

5550 */

5551 kvm_queue_exception(vcpu, UD_VECTOR);

5552 return 1;

5553 }

5554 #endif /* CONFIG_X86_SGX_KVM */

5555

5556 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)

5557 {

5558 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

5559 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;

5560 return 0;

5561 }

5562

5563 /*

5564 * The exit handlers return 1 if the exit was handled fully and guest execution

5565 * may resume. Otherwise they set the kvm_run parameter to indicate what needs

5566 * to be done to userspace and return 0.

5567 */

5568 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {

5569 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,

5570 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,

5571 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,

5572 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,

5573 [EXIT_REASON_IO_INSTRUCTION] = handle_io,

5574 [EXIT_REASON_CR_ACCESS] = handle_cr,

5575 [EXIT_REASON_DR_ACCESS] = handle_dr,

5576 [EXIT_REASON_CPUID] = kvm_emulate_cpuid,

5577 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,

5578 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,

5579 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,

5580 [EXIT_REASON_HLT] = kvm_emulate_halt,

5581 [EXIT_REASON_INVD] = kvm_emulate_invd,

5582 [EXIT_REASON_INVLPG] = handle_invlpg,

5583 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,

5584 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,

5585 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,

5586 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,

5587 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,

5588 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,

5589 [EXIT_REASON_VMREAD] = handle_vmx_instruction,

5590 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,

5591 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,

5592 [EXIT_REASON_VMOFF] = handle_vmx_instruction,

5593 [EXIT_REASON_VMON] = handle_vmx_instruction,

5594 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,

5595 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,

5596 [EXIT_REASON_APIC_WRITE] = handle_apic_write,

5597 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,

5598 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,

5599 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,

5600 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,

5601 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,

5602 [EXIT_REASON_GDTR_IDTR] = handle_desc,

5603 [EXIT_REASON_LDTR_TR] = handle_desc,

5604 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,

5605 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,

5606 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,

5607 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,

5608 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,

5609 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,

5610 [EXIT_REASON_INVEPT] = handle_vmx_instruction,

5611 [EXIT_REASON_INVVPID] = handle_vmx_instruction,

5612 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,

5613 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,

5614 [EXIT_REASON_PML_FULL] = handle_pml_full,

5615 [EXIT_REASON_INVPCID] = handle_invpcid,

5616 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,

5617 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,

5618 [EXIT_REASON_ENCLS] = handle_encls,

5619 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,

5620 };

5621

5622 static const int kvm_vmx_max_exit_handlers =

5623 ARRAY_SIZE(kvm_vmx_exit_handlers);

5624

5625 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,

5626 u32 *intr_info, u32 *error_code)

5627 {

5628 struct vcpu_vmx *vmx = to_vmx(vcpu);

5629

5630 *info1 = vmx_get_exit_qual(vcpu);

5631 if (!(vmx->exit_reason.failed_vmentry)) {

5632 *info2 = vmx->idt_vectoring_info;

5633 *intr_info = vmx_get_intr_info(vcpu);

5634 if (is_exception_with_error_code(*intr_info))

5635 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

5636 else

5637 *error_code = 0;

5638 } else {

5639 *info2 = 0;

5640 *intr_info = 0;

5641 *error_code = 0;

5642 }

5643 }

5644

5645 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)

5646 {

5647 if (vmx->pml_pg) {

5648 __free_page(vmx->pml_pg);

5649 vmx->pml_pg = NULL;

5650 }

5651 }

5652

5653 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)

5654 {

5655 struct vcpu_vmx *vmx = to_vmx(vcpu);

5656 u64 *pml_buf;

5657 u16 pml_idx;

5658

5659 pml_idx = vmcs_read16(GUEST_PML_INDEX);

5660

5661 /* Do nothing if PML buffer is empty */

5662 if (pml_idx == (PML_ENTITY_NUM - 1))

5663 return;

5664

5665 /* PML index always points to next available PML buffer entity */

5666 if (pml_idx >= PML_ENTITY_NUM)

5667 pml_idx = 0;

5668 else

5669 pml_idx++;

5670

5671 pml_buf = page_address(vmx->pml_pg);

5672 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {

5673 u64 gpa;

5674

5675 gpa = pml_buf[pml_idx];

5676 WARN_ON(gpa & (PAGE_SIZE - 1));

5677 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);

5678 }

5679

5680 /* reset PML index */

5681 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);

5682 }

5683

5684 static void vmx_dump_sel(char *name, uint32_t sel)

5685 {

5686 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",

5687 name, vmcs_read16(sel),

5688 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),

5689 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),

5690 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));

5691 }

5692

5693 static void vmx_dump_dtsel(char *name, uint32_t limit)

5694 {

5695 pr_err("%s limit=0x%08x, base=0x%016lx\n",

5696 name, vmcs_read32(limit),

5697 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));

5698 }

5699

5700 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)

5701 {

5702 unsigned int i;

5703 struct vmx_msr_entry *e;

5704

5705 pr_err("MSR %s:\n", name);

5706 for (i = 0, e = m->val; i < m->nr; ++i, ++e)

5707 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);

5708 }

5709

5710 void dump_vmcs(struct kvm_vcpu *vcpu)

5711 {

5712 struct vcpu_vmx *vmx = to_vmx(vcpu);

5713 u32 vmentry_ctl, vmexit_ctl;

5714 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;

5715 unsigned long cr4;

5716 int efer_slot;

5717

5718 if (!dump_invalid_vmcs) {

5719 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");

5720 return;

5721 }

5722

5723 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);

5724 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);

5725 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);

5726 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);

5727 cr4 = vmcs_readl(GUEST_CR4);

5728 secondary_exec_control = 0;

5729 if (cpu_has_secondary_exec_ctrls())

5730 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

5731

5732 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",

5733 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);

5734 pr_err("*** Guest State ***\n");

5735 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",

5736 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),

5737 vmcs_readl(CR0_GUEST_HOST_MASK));

5738 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",

5739 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));

5740 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));

5741 if (cpu_has_vmx_ept()) {

5742 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",

5743 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));

5744 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",

5745 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));

5746 }

5747 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",

5748 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));

5749 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",

5750 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));

5751 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",

5752 vmcs_readl(GUEST_SYSENTER_ESP),

5753 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));

5754 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);

5755 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);

5756 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);

5757 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);

5758 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);

5759 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);

5760 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);

5761 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);

5762 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);

5763 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);

5764 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);

5765 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)

5766 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));

5767 else if (efer_slot >= 0)

5768 pr_err("EFER= 0x%016llx (autoload)\n",

5769 vmx->msr_autoload.guest.val[efer_slot].value);

5770 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)

5771 pr_err("EFER= 0x%016llx (effective)\n",

5772 vcpu->arch.efer | (EFER_LMA | EFER_LME));

5773 else

5774 pr_err("EFER= 0x%016llx (effective)\n",

5775 vcpu->arch.efer & ~(EFER_LMA | EFER_LME));

5776 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)

5777 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));

5778 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",

5779 vmcs_read64(GUEST_IA32_DEBUGCTL),

5780 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));

5781 if (cpu_has_load_perf_global_ctrl() &&

5782 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)

5783 pr_err("PerfGlobCtl = 0x%016llx\n",

5784 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));

5785 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)

5786 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));

5787 pr_err("Interruptibility = %08x ActivityState = %08x\n",

5788 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),

5789 vmcs_read32(GUEST_ACTIVITY_STATE));

5790 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)

5791 pr_err("InterruptStatus = %04x\n",

5792 vmcs_read16(GUEST_INTR_STATUS));

5793 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)

5794 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);

5795 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)

5796 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);

5797

5798 pr_err("*** Host State ***\n");

5799 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",

5800 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));

5801 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",

5802 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),

5803 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),

5804 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),

5805 vmcs_read16(HOST_TR_SELECTOR));

5806 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",

5807 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),

5808 vmcs_readl(HOST_TR_BASE));

5809 pr_err("GDTBase=%016lx IDTBase=%016lx\n",

5810 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));

5811 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",

5812 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),

5813 vmcs_readl(HOST_CR4));

5814 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",

5815 vmcs_readl(HOST_IA32_SYSENTER_ESP),

5816 vmcs_read32(HOST_IA32_SYSENTER_CS),

5817 vmcs_readl(HOST_IA32_SYSENTER_EIP));

5818 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)

5819 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));

5820 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)

5821 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));

5822 if (cpu_has_load_perf_global_ctrl() &&

5823 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)

5824 pr_err("PerfGlobCtl = 0x%016llx\n",

5825 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));

5826 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)

5827 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);

5828

5829 pr_err("*** Control State ***\n");

5830 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",

5831 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);

5832 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);

5833 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",

5834 vmcs_read32(EXCEPTION_BITMAP),

5835 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),

5836 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));

5837 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",

5838 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),

5839 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),

5840 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));

5841 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",

5842 vmcs_read32(VM_EXIT_INTR_INFO),

5843 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),

5844 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));

5845 pr_err(" reason=%08x qualification=%016lx\n",

5846 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));

5847 pr_err("IDTVectoring: info=%08x errcode=%08x\n",

5848 vmcs_read32(IDT_VECTORING_INFO_FIELD),

5849 vmcs_read32(IDT_VECTORING_ERROR_CODE));

5850 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));

5851 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)

5852 pr_err("TSC Multiplier = 0x%016llx\n",

5853 vmcs_read64(TSC_MULTIPLIER));

5854 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {

5855 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {

5856 u16 status = vmcs_read16(GUEST_INTR_STATUS);

5857 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);

5858 }

5859 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));

5860 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)

5861 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));

5862 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));

5863 }

5864 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)

5865 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));

5866 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))

5867 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));

5868 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)

5869 pr_err("PLE Gap=%08x Window=%08x\n",

5870 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));

5871 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)

5872 pr_err("Virtual processor ID = 0x%04x\n",

5873 vmcs_read16(VIRTUAL_PROCESSOR_ID));

5874 }

5875

5876 /*

5877 * The guest has exited. See if we can fix it or if we need userspace

5878 * assistance.

5879 */

5880 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)

5881 {

5882 struct vcpu_vmx *vmx = to_vmx(vcpu);

5883 union vmx_exit_reason exit_reason = vmx->exit_reason;

5884 u32 vectoring_info = vmx->idt_vectoring_info;

5885 u16 exit_handler_index;

5886

5887 /*

5888 * Flush logged GPAs PML buffer, this will make dirty_bitmap more

5889 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before

5890 * querying dirty_bitmap, we only need to kick all vcpus out of guest

5891 * mode as if vcpus is in root mode, the PML buffer must has been

5892 * flushed already. Note, PML is never enabled in hardware while

5893 * running L2.

5894 */

5895 if (enable_pml && !is_guest_mode(vcpu))

5896 vmx_flush_pml_buffer(vcpu);

5897

5898 /*

5899 * We should never reach this point with a pending nested VM-Enter, and

5900 * more specifically emulation of L2 due to invalid guest state (see

5901 * below) should never happen as that means we incorrectly allowed a

5902 * nested VM-Enter with an invalid vmcs12.

5903 */

5904 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))

5905 return -EIO;

5906

5907 /* If guest state is invalid, start emulating */

5908 if (vmx->emulation_required)

5909 return handle_invalid_guest_state(vcpu);

5910

5911 if (is_guest_mode(vcpu)) {

5912 /*

5913 * PML is never enabled when running L2, bail immediately if a

5914 * PML full exit occurs as something is horribly wrong.

5915 */

5916 if (exit_reason.basic == EXIT_REASON_PML_FULL)

5917 goto unexpected_vmexit;

5918

5919 /*

5920 * The host physical addresses of some pages of guest memory

5921 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC

5922 * Page). The CPU may write to these pages via their host

5923 * physical address while L2 is running, bypassing any

5924 * address-translation-based dirty tracking (e.g. EPT write

5925 * protection).

5926 *

5927 * Mark them dirty on every exit from L2 to prevent them from

5928 * getting out of sync with dirty tracking.

5929 */

5930 nested_mark_vmcs12_pages_dirty(vcpu);

5931

5932 if (nested_vmx_reflect_vmexit(vcpu))

5933 return 1;

5934 }

5935

5936 if (exit_reason.failed_vmentry) {

5937 dump_vmcs(vcpu);

5938 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;

5939 vcpu->run->fail_entry.hardware_entry_failure_reason

5940 = exit_reason.full;

5941 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;

5942 return 0;

5943 }

5944

5945 if (unlikely(vmx->fail)) {

5946 dump_vmcs(vcpu);

5947 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;

5948 vcpu->run->fail_entry.hardware_entry_failure_reason

5949 = vmcs_read32(VM_INSTRUCTION_ERROR);

5950 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;

5951 return 0;

5952 }

5953

5954 /*

5955 * Note:

5956 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by

5957 * delivery event since it indicates guest is accessing MMIO.

5958 * The vm-exit can be triggered again after return to guest that

5959 * will cause infinite loop.

5960 */

5961 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&

5962 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&

5963 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&

5964 exit_reason.basic != EXIT_REASON_PML_FULL &&

5965 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&

5966 exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {

5967 int ndata = 3;

5968

5969 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;

5970 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;

5971 vcpu->run->internal.data[0] = vectoring_info;

5972 vcpu->run->internal.data[1] = exit_reason.full;

5973 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;

5974 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {

5975 vcpu->run->internal.data[ndata++] =

5976 vmcs_read64(GUEST_PHYSICAL_ADDRESS);

5977 }

5978 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;

5979 vcpu->run->internal.ndata = ndata;

5980 return 0;

5981 }

5982

5983 if (unlikely(!enable_vnmi &&

5984 vmx->loaded_vmcs->soft_vnmi_blocked)) {

5985 if (!vmx_interrupt_blocked(vcpu)) {

5986 vmx->loaded_vmcs->soft_vnmi_blocked = 0;

5987 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&

5988 vcpu->arch.nmi_pending) {

5989 /*

5990 * This CPU don't support us in finding the end of an

5991 * NMI-blocked window if the guest runs with IRQs

5992 * disabled. So we pull the trigger after 1 s of

5993 * futile waiting, but inform the user about this.

5994 */

5995 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "

5996 "state on VCPU %d after 1 s timeout\n",

5997 __func__, vcpu->vcpu_id);

5998 vmx->loaded_vmcs->soft_vnmi_blocked = 0;

5999 }

6000 }

6001

6002 if (exit_fastpath != EXIT_FASTPATH_NONE)

6003 return 1;

6004

6005 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)

6006 goto unexpected_vmexit;

6007 #ifdef CONFIG_RETPOLINE

6008 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)

6009 return kvm_emulate_wrmsr(vcpu);

6010 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)

6011 return handle_preemption_timer(vcpu);

6012 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)

6013 return handle_interrupt_window(vcpu);

6014 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)

6015 return handle_external_interrupt(vcpu);

6016 else if (exit_reason.basic == EXIT_REASON_HLT)

6017 return kvm_emulate_halt(vcpu);

6018 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)

6019 return handle_ept_misconfig(vcpu);

6020 #endif

6021

6022 exit_handler_index = array_index_nospec((u16)exit_reason.basic,

6023 kvm_vmx_max_exit_handlers);

6024 if (!kvm_vmx_exit_handlers[exit_handler_index])

6025 goto unexpected_vmexit;

6026

6027 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

6028

6029 unexpected_vmexit:

6030 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",

6031 exit_reason.full);

6032 dump_vmcs(vcpu);

6033 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;

6034 vcpu->run->internal.suberror =

6035 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;

6036 vcpu->run->internal.ndata = 2;

6037 vcpu->run->internal.data[0] = exit_reason.full;

6038 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;

6039 return 0;

6040 }

6041

6042 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)

6043 {

6044 int ret = __vmx_handle_exit(vcpu, exit_fastpath);

6045

6046 /*

6047 * Even when current exit reason is handled by KVM internally, we

6048 * still need to exit to user space when bus lock detected to inform

6049 * that there is a bus lock in guest.

6050 */

6051 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {

6052 if (ret > 0)

6053 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

6054

6055 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;

6056 return 0;

6057 }

6058 return ret;

6059 }

6060

6061 /*

6062 * Software based L1D cache flush which is used when microcode providing

6063 * the cache control MSR is not loaded.

6064 *

6065 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to

6066 * flush it is required to read in 64 KiB because the replacement algorithm

6067 * is not exactly LRU. This could be sized at runtime via topology

6068 * information but as all relevant affected CPUs have 32KiB L1D cache size

6069 * there is no point in doing so.

6070 */

6071 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)

6072 {

6073 int size = PAGE_SIZE << L1D_CACHE_ORDER;

6074

6075 /*

6076 * This code is only executed when the the flush mode is 'cond' or

6077 * 'always'

6078 */

6079 if (static_branch_likely(&vmx_l1d_flush_cond)) {

6080 bool flush_l1d;

6081

6082 /*

6083 * Clear the per-vcpu flush bit, it gets set again

6084 * either from vcpu_run() or from one of the unsafe

6085 * VMEXIT handlers.

6086 */

6087 flush_l1d = vcpu->arch.l1tf_flush_l1d;

6088 vcpu->arch.l1tf_flush_l1d = false;

6089

6090 /*

6091 * Clear the per-cpu flush bit, it gets set again from

6092 * the interrupt handlers.

6093 */

6094 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();

6095 kvm_clear_cpu_l1tf_flush_l1d();

6096

6097 if (!flush_l1d)

6098 return;

6099 }

6100

6101 vcpu->stat.l1d_flush++;

6102

6103 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {

6104 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);

6105 return;

6106 }

6107

6108 asm volatile(

6109 /* First ensure the pages are in the TLB */

6110 "xorl %%eax, %%eax\n"

6111 ".Lpopulate_tlb:\n\t"

6112 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"

6113 "addl $4096, %%eax\n\t"

6114 "cmpl %%eax, %[size]\n\t"

6115 "jne .Lpopulate_tlb\n\t"

6116 "xorl %%eax, %%eax\n\t"

6117 "cpuid\n\t"

6118 /* Now fill the cache */

6119 "xorl %%eax, %%eax\n"

6120 ".Lfill_cache:\n"

6121 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"

6122 "addl $64, %%eax\n\t"

6123 "cmpl %%eax, %[size]\n\t"

6124 "jne .Lfill_cache\n\t"

6125 "lfence\n"

6126 :: [flush_pages] "r" (vmx_l1d_flush_pages),

6127 [size] "r" (size)

6128 : "eax", "ebx", "ecx", "edx");

6129 }

6130

6131 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)

6132 {

6133 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

6134 int tpr_threshold;

6135

6136 if (is_guest_mode(vcpu) &&

6137 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))

6138 return;

6139

6140 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;

6141 if (is_guest_mode(vcpu))

6142 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;

6143 else

6144 vmcs_write32(TPR_THRESHOLD, tpr_threshold);

6145 }

6146

6147 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)

6148 {

6149 struct vcpu_vmx *vmx = to_vmx(vcpu);

6150 u32 sec_exec_control;

6151

6152 if (!lapic_in_kernel(vcpu))

6153 return;

6154

6155 if (!flexpriority_enabled &&

6156 !cpu_has_vmx_virtualize_x2apic_mode())

6157 return;

6158

6159 /* Postpone execution until vmcs01 is the current VMCS. */

6160 if (is_guest_mode(vcpu)) {

6161 vmx->nested.change_vmcs01_virtual_apic_mode = true;

6162 return;

6163 }

6164

6165 sec_exec_control = secondary_exec_controls_get(vmx);

6166 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |

6167 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

6168

6169 switch (kvm_get_apic_mode(vcpu)) {

6170 case LAPIC_MODE_INVALID:

6171 WARN_ONCE(true, "Invalid local APIC state");

6172 break;

6173 case LAPIC_MODE_DISABLED:

6174 break;

6175 case LAPIC_MODE_XAPIC:

6176 if (flexpriority_enabled) {

6177 sec_exec_control |=

6178 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

6179 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

6180

6181 /*

6182 * Flush the TLB, reloading the APIC access page will

6183 * only do so if its physical address has changed, but

6184 * the guest may have inserted a non-APIC mapping into

6185 * the TLB while the APIC access page was disabled.

6186 */

6187 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);

6188 }

6189 break;

6190 case LAPIC_MODE_X2APIC:

6191 if (cpu_has_vmx_virtualize_x2apic_mode())

6192 sec_exec_control |=

6193 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

6194 break;

6195 }

6196 secondary_exec_controls_set(vmx, sec_exec_control);

6197

6198 vmx_update_msr_bitmap(vcpu);

6199 }

6200

6201 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)

6202 {

6203 struct page *page;

6204

6205 /* Defer reload until vmcs01 is the current VMCS. */

6206 if (is_guest_mode(vcpu)) {

6207 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;

6208 return;

6209 }

6210

6211 if (!(secondary_exec_controls_get(to_vmx(vcpu)) &

6212 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))

6213 return;

6214

6215 page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);

6216 if (is_error_page(page))

6217 return;

6218

6219 vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));

6220 vmx_flush_tlb_current(vcpu);

6221

6222 /*

6223 * Do not pin apic access page in memory, the MMU notifier

6224 * will call us again if it is migrated or swapped out.

6225 */

6226 put_page(page);

6227 }

6228

6229 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)

6230 {

6231 u16 status;

6232 u8 old;

6233

6234 if (max_isr == -1)

6235 max_isr = 0;

6236

6237 status = vmcs_read16(GUEST_INTR_STATUS);

6238 old = status >> 8;

6239 if (max_isr != old) {

6240 status &= 0xff;

6241 status |= max_isr << 8;

6242 vmcs_write16(GUEST_INTR_STATUS, status);

6243 }

6244 }

6245

6246 static void vmx_set_rvi(int vector)

6247 {

6248 u16 status;

6249 u8 old;

6250

6251 if (vector == -1)

6252 vector = 0;

6253

6254 status = vmcs_read16(GUEST_INTR_STATUS);

6255 old = (u8)status & 0xff;

6256 if ((u8)vector != old) {

6257 status &= ~0xff;

6258 status |= (u8)vector;

6259 vmcs_write16(GUEST_INTR_STATUS, status);

6260 }

6261 }

6262

/* Propagate @max_irr into RVI, but only while not running L2. */
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
	/*
	 * While running L2, updating RVI only matters if vmcs12 has
	 * virtual-interrupt-delivery enabled.  That can only be enabled when
	 * L1 also intercepts external interrupts, and in that case vmcs02 RVI
	 * must not be updated; the interrupt is intercepted instead.  Hence
	 * there is nothing to do when running L2.
	 */
	if (is_guest_mode(vcpu))
		return;

	vmx_set_rvi(max_irr);
}

6276

6277 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)

6278 {

6279 struct vcpu_vmx *vmx = to_vmx(vcpu);

6280 int max_irr;

6281 bool max_irr_updated;

6282

6283 if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))

6284 return -EIO;

6285

6286 if (pi_test_on(&vmx->pi_desc)) {

6287 pi_clear_on(&vmx->pi_desc);

6288 /*

6289 * IOMMU can write to PID.ON, so the barrier matters even on UP.

6290 * But on x86 this is just a compiler barrier anyway.

6291 */

6292 smp_mb__after_atomic();

6293 max_irr_updated =

6294 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);

6295

6296 /*

6297 * If we are running L2 and L1 has a new pending interrupt

6298 * which can be injected, we should re-evaluate

6299 * what should be done with this new L1 interrupt.

6300 * If L1 intercepts external-interrupts, we should

6301 * exit from L2 to L1. Otherwise, interrupt should be

6302 * delivered directly to L2.

6303 */

6304 if (is_guest_mode(vcpu) && max_irr_updated) {

6305 if (nested_exit_on_intr(vcpu))

6306 kvm_vcpu_exiting_guest_mode(vcpu);

6307 else

6308 kvm_make_request(KVM_REQ_EVENT, vcpu);

6309 }

6310 } else {

6311 max_irr = kvm_lapic_find_highest_irr(vcpu);

6312 }

6313 vmx_hwapic_irr_update(vcpu, max_irr);

6314 return max_irr;

6315 }

6316

6317 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)

6318 {

6319 if (!kvm_vcpu_apicv_active(vcpu))

6320 return;

6321

6322 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);

6323 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);

6324 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);

6325 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);

6326 }

6327

6328 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)

6329 {

6330 struct vcpu_vmx *vmx = to_vmx(vcpu);

6331

6332 pi_clear_on(&vmx->pi_desc);

6333 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));

6334 }

6335

6336 void vmx_do_interrupt_nmi_irqoff(unsigned long entry);

6337

/*
 * Invoke the host handler located at @entry via
 * vmx_do_interrupt_nmi_irqoff(), bracketed by kvm_before_interrupt() and
 * kvm_after_interrupt().
 */
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
					unsigned long entry)
{
	kvm_before_interrupt(vcpu);

	vmx_do_interrupt_nmi_irqoff(entry);

	kvm_after_interrupt(vcpu);
}

6345

6346 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)

6347 {

6348 const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;

6349 u32 intr_info = vmx_get_intr_info(&vmx->vcpu);

6350

6351 /* if exit due to PF check for async PF */

6352 if (is_page_fault(intr_info))

6353 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();

6354 /* Handle machine checks before interrupts are enabled */

6355 else if (is_machine_check(intr_info))

6356 kvm_machine_check();

6357 /* We need to handle NMIs before interrupts are enabled */

6358 else if (is_nmi(intr_info))

6359 handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);

6360 }

6361

6362 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)

6363 {

6364 u32 intr_info = vmx_get_intr_info(vcpu);

6365 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;

6366 gate_desc *desc = (gate_desc *)host_idt_base + vector;

6367

6368 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,

6369 "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))

6370 return;

6371

6372 handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));

6373 }

6374

6375 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)

6376 {

6377 struct vcpu_vmx *vmx = to_vmx(vcpu);

6378

6379 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)

6380 handle_external_interrupt_irqoff(vcpu);

6381 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)

6382 handle_exception_nmi_irqoff(vmx);

6383 }

6384

6385 /*

6386 * The kvm parameter can be NULL (module initialization, or invocation before

6387 * VM creation). Be sure to check the kvm parameter before using it.

6388 */

6389 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)

6390 {

6391 switch (index) {

6392 case MSR_IA32_SMBASE:

6393 /*

6394 * We cannot do SMM unless we can run the guest in big

6395 * real mode.

6396 */

6397 return enable_unrestricted_guest || emulate_invalid_guest_state;

6398 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:

6399 return nested;

6400 case MSR_AMD64_VIRT_SPEC_CTRL:

6401 /* This is AMD only. */

6402 return false;

6403 default:

6404 return true;

6405 }

6406 }

6407

6408 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)

6409 {

6410 u32 exit_intr_info;

6411 bool unblock_nmi;

6412 u8 vector;

6413 bool idtv_info_valid;

6414

6415 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

6416

6417 if (enable_vnmi) {

6418 if (vmx->loaded_vmcs->nmi_known_unmasked)

6419 return;

6420

6421 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);

6422 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;

6423 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;

6424 /*

6425 * SDM 3: 27.7.1.2 (September 2008)

6426 * Re-set bit "block by NMI" before VM entry if vmexit caused by

6427 * a guest IRET fault.

6428 * SDM 3: 23.2.2 (September 2008)

6429 * Bit 12 is undefined in any of the following cases:

6430 * If the VM exit sets the valid bit in the IDT-vectoring

6431 * information field.

6432 * If the VM exit is due to a double fault.

6433 */

6434 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&

6435 vector != DF_VECTOR && !idtv_info_valid)

6436 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,

6437 GUEST_INTR_STATE_NMI);

6438 else

6439 vmx->loaded_vmcs->nmi_known_unmasked =

6440 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)

6441 & GUEST_INTR_STATE_NMI);

6442 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))

6443 vmx->loaded_vmcs->vnmi_blocked_time +=

6444 ktime_to_ns(ktime_sub(ktime_get(),

6445 vmx->loaded_vmcs->entry_time));

6446 }

6447

6448 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,

6449 u32 idt_vectoring_info,

6450 int instr_len_field,

6451 int error_code_field)

6452 {

6453 u8 vector;

6454 int type;

6455 bool idtv_info_valid;

6456

6457 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

6458

6459 vcpu->arch.nmi_injected = false;

6460 kvm_clear_exception_queue(vcpu);

6461 kvm_clear_interrupt_queue(vcpu);

6462

6463 if (!idtv_info_valid)

6464 return;

6465

6466 kvm_make_request(KVM_REQ_EVENT, vcpu);

6467

6468 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;

6469 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

6470

6471 switch (type) {

6472 case INTR_TYPE_NMI_INTR:

6473 vcpu->arch.nmi_injected = true;

6474 /*

6475 * SDM 3: 27.7.1.2 (September 2008)

6476 * Clear bit "block by NMI" before VM entry if a NMI

6477 * delivery faulted.

6478 */

6479 vmx_set_nmi_mask(vcpu, false);

6480 break;

6481 case INTR_TYPE_SOFT_EXCEPTION:

6482 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);

6483 fallthrough;

6484 case INTR_TYPE_HARD_EXCEPTION:

6485 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {

6486 u32 err = vmcs_read32(error_code_field);

6487 kvm_requeue_exception_e(vcpu, vector, err);

6488 } else

6489 kvm_requeue_exception(vcpu, vector);

6490 break;

6491 case INTR_TYPE_SOFT_INTR:

6492 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);

6493 fallthrough;

6494 case INTR_TYPE_EXT_INTR:

6495 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);

6496 break;

6497 default:

6498 break;

6499 }

6500 }

6501

6502 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)

6503 {

6504 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,

6505 VM_EXIT_INSTRUCTION_LEN,

6506 IDT_VECTORING_ERROR_CODE);

6507 }

6508

6509 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)

6510 {

6511 __vmx_complete_interrupts(vcpu,

6512 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),

6513 VM_ENTRY_INSTRUCTION_LEN,

6514 VM_ENTRY_EXCEPTION_ERROR_CODE);

6515

6516 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);

6517 }

6518

6519 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)

6520 {

6521 int i, nr_msrs;

6522 struct perf_guest_switch_msr *msrs;

6523

6524 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */

6525 msrs = perf_guest_get_msrs(&nr_msrs);

6526 if (!msrs)

6527 return;

6528

6529 for (i = 0; i < nr_msrs; i++)

6530 if (msrs[i].host == msrs[i].guest)

6531 clear_atomic_switch_msr(vmx, msrs[i].msr);

6532 else

6533 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,

6534 msrs[i].host, false);

6535 }

6536

6537 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)

6538 {

6539 struct vcpu_vmx *vmx = to_vmx(vcpu);

6540 u64 tscl;

6541 u32 delta_tsc;

6542

6543 if (vmx->req_immediate_exit) {

6544 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);

6545 vmx->loaded_vmcs->hv_timer_soft_disabled = false;

6546 } else if (vmx->hv_deadline_tsc != -1) {

6547 tscl = rdtsc();

6548 if (vmx->hv_deadline_tsc > tscl)

6549 /* set_hv_timer ensures the delta fits in 32-bits */

6550 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>

6551 cpu_preemption_timer_multi);

6552 else

6553 delta_tsc = 0;

6554

6555 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);

6556 vmx->loaded_vmcs->hv_timer_soft_disabled = false;

6557 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {

6558 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);

6559 vmx->loaded_vmcs->hv_timer_soft_disabled = true;

6560 }

6561 }

6562

6563 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)

6564 {

6565 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {

6566 vmx->loaded_vmcs->host_state.rsp = host_rsp;

6567 vmcs_writel(HOST_RSP, host_rsp);

6568 }

6569 }

6570

6571 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)

6572 {

6573 switch (to_vmx(vcpu)->exit_reason.basic) {

6574 case EXIT_REASON_MSR_WRITE:

6575 return handle_fastpath_set_msr_irqoff(vcpu);

6576 case EXIT_REASON_PREEMPTION_TIMER:

6577 return handle_fastpath_preemption_timer(vcpu);

6578 default:

6579 return EXIT_FASTPATH_NONE;

6580 }

6581 }

6582

6583 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,

6584 struct vcpu_vmx *vmx)

6585 {

6586 kvm_guest_enter_irqoff();

6587

6588 /* L1D Flush includes CPU buffer clear to mitigate MDS */

6589 if (static_branch_unlikely(&vmx_l1d_should_flush))

6590 vmx_l1d_flush(vcpu);

6591 else if (static_branch_unlikely(&mds_user_clear))

6592 mds_clear_cpu_buffers();

6593

6594 if (vcpu->arch.cr2 != native_read_cr2())

6595 native_write_cr2(vcpu->arch.cr2);

6596

6597 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,

6598 vmx->loaded_vmcs->launched);

6599

6600 vcpu->arch.cr2 = native_read_cr2();

6601

6602 kvm_guest_exit_irqoff();

6603 }

6604

6605 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)

6606 {

6607 struct vcpu_vmx *vmx = to_vmx(vcpu);

6608 unsigned long cr3, cr4;

6609

6610 /* Record the guest's net vcpu time for enforced NMI injections. */

6611 if (unlikely(!enable_vnmi &&

6612 vmx->loaded_vmcs->soft_vnmi_blocked))

6613 vmx->loaded_vmcs->entry_time = ktime_get();

6614

6615 /* Don't enter VMX if guest state is invalid, let the exit handler

6616 start emulation until we arrive back to a valid state */

6617 if (vmx->emulation_required)

6618 return EXIT_FASTPATH_NONE;

6619

6620 trace_kvm_entry(vcpu);

6621

6622 if (vmx->ple_window_dirty) {

6623 vmx->ple_window_dirty = false;

6624 vmcs_write32(PLE_WINDOW, vmx->ple_window);

6625 }

6626

6627 /*

6628 * We did this in prepare_switch_to_guest, because it needs to

6629 * be within srcu_read_lock.

6630 */

6631 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

6632

6633 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))

6634 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);

6635 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))

6636 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

6637

6638 cr3 = __get_current_cr3_fast();

6639 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {

6640 vmcs_writel(HOST_CR3, cr3);

6641 vmx->loaded_vmcs->host_state.cr3 = cr3;

6642 }

6643

6644 cr4 = cr4_read_shadow();

6645 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {

6646 vmcs_writel(HOST_CR4, cr4);

6647 vmx->loaded_vmcs->host_state.cr4 = cr4;

6648 }

6649

6650 /* When single-stepping over STI and MOV SS, we must clear the

6651 * corresponding interruptibility bits in the guest state. Otherwise

6652 * vmentry fails as it then expects bit 14 (BS) in pending debug

6653 * exceptions being set, but that's not correct for the guest debugging

6654 * case. */

6655 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)

6656 vmx_set_interrupt_shadow(vcpu, 0);

6657

6658 kvm_load_guest_xsave_state(vcpu);

6659

6660 pt_guest_enter(vmx);

6661

6662 atomic_switch_perf_msrs(vmx);

6663 if (intel_pmu_lbr_is_enabled(vcpu))

6664 vmx_passthrough_lbr_msrs(vcpu);

6665

6666 if (enable_preemption_timer)

6667 vmx_update_hv_timer(vcpu);

6668

6669 kvm_wait_lapic_expire(vcpu);

6670

6671 /*

6672 * If this vCPU has touched SPEC_CTRL, restore the guest's value if

6673 * it's non-zero. Since vmentry is serialising on affected CPUs, there

6674 * is no need to worry about the conditional branch over the wrmsr

6675 * being speculatively taken.

6676 */

6677 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);

6678

6679 /* The actual VMENTER/EXIT is in the .noinstr.text section. */

6680 vmx_vcpu_enter_exit(vcpu, vmx);

6681

6682 /*

6683 * We do not use IBRS in the kernel. If this vCPU has used the

6684 * SPEC_CTRL MSR it may have left it on; save the value and

6685 * turn it off. This is much more efficient than blindly adding

6686 * it to the atomic save/restore list. Especially as the former

6687 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.

6688 *

6689 * For non-nested case:

6690 * If the L01 MSR bitmap does not intercept the MSR, then we need to

6691 * save it.

6692 *

6693 * For nested case:

6694 * If the L02 MSR bitmap does not intercept the MSR, then we need to

6695 * save it.

6696 */

6697 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))

6698 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

6699

6700 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);

6701

6702 /* All fields are clean at this point */

6703 if (static_branch_unlikely(&enable_evmcs)) {

6704 current_evmcs->hv_clean_fields |=

6705 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

6706

6707 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);

6708 }

6709

6710 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */

6711 if (vmx->host_debugctlmsr)

6712 update_debugctlmsr(vmx->host_debugctlmsr);

6713

6714 #ifndef CONFIG_X86_64

6715 /*

6716 * The sysexit path does not restore ds/es, so we must set them to

6717 * a reasonable value ourselves.

6718 *

6719 * We can't defer this to vmx_prepare_switch_to_host() since that

6720 * function may be executed in interrupt context, which saves and

6721 * restore segments around it, nullifying its effect.

6722 */

6723 loadsegment(ds, __USER_DS);

6724 loadsegment(es, __USER_DS);

6725 #endif

6726

6727 vmx_register_cache_reset(vcpu);

6728

6729 pt_guest_exit(vmx);

6730

6731 kvm_load_host_xsave_state(vcpu);

6732

6733 if (is_guest_mode(vcpu)) {

6734 /*

6735 * Track VMLAUNCH/VMRESUME that have made past guest state

6736 * checking.

6737 */

6738 if (vmx->nested.nested_run_pending &&

6739 !vmx->exit_reason.failed_vmentry)

6740 ++vcpu->stat.nested_run;

6741

6742 vmx->nested.nested_run_pending = 0;

6743 }

6744

6745 vmx->idt_vectoring_info = 0;

6746

6747 if (unlikely(vmx->fail)) {

6748 vmx->exit_reason.full = 0xdead;

6749 return EXIT_FASTPATH_NONE;

6750 }

6751

6752 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);

6753 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))

6754 kvm_machine_check();

6755

6756 if (likely(!vmx->exit_reason.failed_vmentry))

6757 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

6758

6759 trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);

6760

6761 if (unlikely(vmx->exit_reason.failed_vmentry))

6762 return EXIT_FASTPATH_NONE;

6763

6764 vmx->loaded_vmcs->launched = 1;

6765

6766 vmx_recover_nmi_blocking(vmx);

6767 vmx_complete_interrupts(vmx);

6768

6769 if (is_guest_mode(vcpu))

6770 return EXIT_FASTPATH_NONE;

6771

6772 return vmx_exit_handlers_fastpath(vcpu);

6773 }

6774

6775 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)

6776 {

6777 struct vcpu_vmx *vmx = to_vmx(vcpu);

6778

6779 if (enable_pml)

6780 vmx_destroy_pml_buffer(vmx);

6781 free_vpid(vmx->vpid);

6782 nested_vmx_free_vcpu(vcpu);

6783 free_loaded_vmcs(vmx->loaded_vmcs);

6784 }

6785

6786 static int vmx_create_vcpu(struct kvm_vcpu *vcpu)

6787 {

6788 struct vmx_uret_msr *tsx_ctrl;

6789 struct vcpu_vmx *vmx;

6790 int i, cpu, err;

6791

6792 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);

6793 vmx = to_vmx(vcpu);

6794

6795 err = -ENOMEM;

6796

6797 vmx->vpid = allocate_vpid();

6798

6799 /*

6800 * If PML is turned on, failure on enabling PML just results in failure

6801 * of creating the vcpu, therefore we can simplify PML logic (by

6802 * avoiding dealing with cases, such as enabling PML partially on vcpus

6803 * for the guest), etc.

6804 */

6805 if (enable_pml) {

6806 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);

6807 if (!vmx->pml_pg)

6808 goto free_vpid;

6809 }

6810

6811 for (i = 0; i < kvm_nr_uret_msrs; ++i) {

6812 vmx->guest_uret_msrs[i].data = 0;

6813 vmx->guest_uret_msrs[i].mask = -1ull;

6814 }

6815 if (boot_cpu_has(X86_FEATURE_RTM)) {

6816 /*

6817 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.

6818 * Keep the host value unchanged to avoid changing CPUID bits

6819 * under the host kernel's feet.

6820 */

6821 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);

6822 if (tsx_ctrl)

6823 vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;

6824 }

6825

6826 err = alloc_loaded_vmcs(&vmx->vmcs01);

6827 if (err < 0)

6828 goto free_pml;

6829

6830 /* The MSR bitmap starts with all ones */

6831 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);

6832 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

6833

6834 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);

6835 #ifdef CONFIG_X86_64

6836 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);

6837 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);

6838 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

6839 #endif

6840 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);

6841 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);

6842 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);

6843 if (kvm_cstate_in_guest(vcpu->kvm)) {

6844 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);

6845 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);

6846 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);

6847 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);

6848 }

6849 vmx->msr_bitmap_mode = 0;

6850

6851 vmx->loaded_vmcs = &vmx->vmcs01;

6852 cpu = get_cpu();

6853 vmx_vcpu_load(vcpu, cpu);

6854 vcpu->cpu = cpu;

6855 init_vmcs(vmx);

6856 vmx_vcpu_put(vcpu);

6857 put_cpu();

6858 if (cpu_need_virtualize_apic_accesses(vcpu)) {

6859 err = alloc_apic_access_page(vcpu->kvm);

6860 if (err)

6861 goto free_vmcs;

6862 }

6863

6864 if (enable_ept && !enable_unrestricted_guest) {

6865 err = init_rmode_identity_map(vcpu->kvm);

6866 if (err)

6867 goto free_vmcs;

6868 }

6869

6870 if (nested)

6871 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));

6872 else

6873 memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));

6874

6875 vcpu_setup_sgx_lepubkeyhash(vcpu);

6876

6877 vmx->nested.posted_intr_nv = -1;

6878 vmx->nested.current_vmptr = -1ull;

6879 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

6880

6881 vcpu->arch.microcode_version = 0x100000000ULL;

6882 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

6883

6884 /*

6885 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR

6886 * or POSTED_INTR_WAKEUP_VECTOR.

6887 */

6888 vmx->pi_desc.nv = POSTED_INTR_VECTOR;

6889 vmx->pi_desc.sn = 1;

6890

6891 return 0;

6892

6893 free_vmcs:

6894 free_loaded_vmcs(vmx->loaded_vmcs);

6895 free_pml:

6896 vmx_destroy_pml_buffer(vmx);

6897 free_vpid:

6898 free_vpid(vmx->vpid);

6899 return err;

6900 }

6901

/*
 * Newline-terminated warning texts about the L1TF CPU bug (CVE-2018-3646).
 * Both point the reader at the l1tf admin-guide page for details.
 *
 *   L1TF_MSG_SMT - wording for "bug present and SMT is on".
 *   L1TF_MSG_L1D - wording for "bug present and the virtualization
 *                  mitigation is disabled".
 *
 * NOTE(review): presumably emitted from the l1tf_mitigation switch in
 * vmx_vm_init() below; the call sites are not visible here -- confirm.
 */
6902 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

6903 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

6904

6905 static int vmx_vm_init(struct kvm *kvm)

6906 {

6907 if (!ple_gap)

6908 kvm->arch.pause_in_guest = true;

6909

6910 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {

6911 switch (l1tf_mitigation) {

6912 case L1TF_MITIGATION_OFF:

6913 case L1TF_MITIGATION_FLUSH_NOWARN:

6914 /* 'I explicitly don't care' is set */

6915 break;