From: Paolo Bonzini Date: Tue, 6 Dec 2022 17:27:39 +0000 (-0500) Subject: Merge tag 'kvmarm-6.2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm... X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=eb5618911af0ac069d2313b289d4c19ca3379401;p=linux.git Merge tag 'kvmarm-6.2' of https://git./linux/kernel/git/kvmarm/kvmarm into HEAD KVM/arm64 updates for 6.2 - Enable the per-vcpu dirty-ring tracking mechanism, together with an option to keep the good old dirty log around for pages that are dirtied by something other than a vcpu. - Switch to the relaxed parallel fault handling, using RCU to delay page table reclaim and giving better performance under load. - Relax the MTE ABI, allowing a VMM to use the MAP_SHARED mapping option, which multi-process VMMs such as crosvm rely on. - Merge the pKVM shadow vcpu state tracking that allows the hypervisor to have its own view of a vcpu, keeping that state private. - Add support for the PMUv3p5 architecture revision, bringing support for 64bit counters on systems that support it, and fix the no-quite-compliant CHAIN-ed counter support for the machines that actually exist out there. - Fix a handful of minor issues around 52bit VA/PA support (64kB pages only) as a prefix of the oncoming support for 4kB and 16kB pages. - Add/Enable/Fix a bunch of selftests covering memslots, breakpoints, stage-2 faults and access tracking. You name it, we got it, we probably broke it. - Pick a small set of documentation and spelling fixes, because no good merge window would be complete without those. As a side effect, this tag also drags: - The 'kvmarm-fixes-6.1-3' tag as a dependency to the dirty-ring series - A shared branch with the arm64 tree that repaints all the system registers to match the ARM ARM's naming, and resulting in interesting conflicts --- eb5618911af0ac069d2313b289d4c19ca3379401 diff --cc arch/x86/include/asm/kvm_host.h index d1013c4f673ca,b4dbde7d9eb1d..ad9f8b02071de --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@@ -2154,8 -2084,12 +2154,6 @@@ static inline int kvm_cpu_get_apicid(in #endif } - int kvm_cpu_dirty_log_size(void); -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) -- int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ diff --cc include/uapi/linux/kvm.h index 88448397642cb,c87b5882d7aef..820efdf9fef80 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@@ -1181,7 -1178,7 +1181,8 @@@ struct kvm_ppc_resize_hpt #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 -#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224 +#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 ++#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #ifdef KVM_CAP_IRQ_ROUTING diff --cc tools/testing/selftests/kvm/Makefile index 2275ba861e0e5,1d85b8e218a02..947676983da1f --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@@ -47,7 -47,7 +47,8 @@@ LIBKVM += lib/memstress. LIBKVM += lib/rbtree.c LIBKVM += lib/sparsebit.c LIBKVM += lib/test_util.c +LIBKVM += lib/ucall_common.c + LIBKVM += lib/userfaultfd_util.c LIBKVM_STRING += lib/string_override.c diff --cc tools/testing/selftests/kvm/aarch64/debug-exceptions.c index d86c4e4d1c826,b30add3e77269..8a3fb212084a0 --- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@@ -289,9 -425,9 +422,8 @@@ static void test_guest_debug_exceptions struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - int stage; vm = vm_create_with_one_vcpu(&vcpu, guest_code); - ucall_init(vm, NULL); vm_init_descriptor_tables(vm); vcpu_init_descriptor_tables(vcpu); diff --cc tools/testing/selftests/kvm/aarch64/page_fault_test.c index 0000000000000,05bb6a6369c25..95d22cfb7b41a mode 000000,100644..100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@@ -1,0 -1,1112 +1,1117 @@@ + // SPDX-License-Identifier: GPL-2.0 + /* + * page_fault_test.c - Test stage 2 faults. + * + * This test tries different combinations of guest accesses (e.g., write, + * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on + * hugetlbfs with a hole). It checks that the expected handling method is + * called (e.g., uffd faults with the right address and write/read flag). + */ + + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + #include "guest_modes.h" + #include "userfaultfd_util.h" + + /* Guest virtual addresses that point to the test page and its PTE. */ + #define TEST_GVA 0xc0000000 + #define TEST_EXEC_GVA (TEST_GVA + 0x8) + #define TEST_PTE_GVA 0xb0000000 + #define TEST_DATA 0x0123456789ABCDEF + + static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; + + #define CMD_NONE (0) + #define CMD_SKIP_TEST (1ULL << 1) + #define CMD_HOLE_PT (1ULL << 2) + #define CMD_HOLE_DATA (1ULL << 3) + #define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4) + #define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5) + #define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6) + #define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7) + #define CMD_SET_PTE_AF (1ULL << 8) + + #define PREPARE_FN_NR 10 + #define CHECK_FN_NR 10 + + static struct event_cnt { + int mmio_exits; + int fail_vcpu_runs; + int uffd_faults; + /* uffd_faults is incremented from multiple threads. */ + pthread_mutex_t uffd_faults_mutex; + } events; + + struct test_desc { + const char *name; + uint64_t mem_mark_cmd; + /* Skip the test if any prepare function returns false */ + bool (*guest_prepare[PREPARE_FN_NR])(void); + void (*guest_test)(void); + void (*guest_test_check[CHECK_FN_NR])(void); + uffd_handler_t uffd_pt_handler; + uffd_handler_t uffd_data_handler; + void (*dabt_handler)(struct ex_regs *regs); + void (*iabt_handler)(struct ex_regs *regs); + void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run); + void (*fail_vcpu_run_handler)(int ret); + uint32_t pt_memslot_flags; + uint32_t data_memslot_flags; + bool skip; + struct event_cnt expected_events; + }; + + struct test_params { + enum vm_mem_backing_src_type src_type; + struct test_desc *test_desc; + }; + + static inline void flush_tlb_page(uint64_t vaddr) + { + uint64_t page = vaddr >> 12; + + dsb(ishst); + asm volatile("tlbi vaae1is, %0" :: "r" (page)); + dsb(ish); + isb(); + } + + static void guest_write64(void) + { + uint64_t val; + + WRITE_ONCE(*guest_test_memory, TEST_DATA); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); + } + + /* Check the system for atomic instructions. */ + static bool guest_check_lse(void) + { + uint64_t isar0 = read_sysreg(id_aa64isar0_el1); + uint64_t atomic; + + atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS), isar0); + return atomic >= 2; + } + + static bool guest_check_dc_zva(void) + { + uint64_t dczid = read_sysreg(dczid_el0); + uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_DZP), dczid); + + return dzp == 0; + } + + /* Compare and swap instruction. */ + static void guest_cas(void) + { + uint64_t val; + + GUEST_ASSERT(guest_check_lse()); + asm volatile(".arch_extension lse\n" + "casal %0, %1, [%2]\n" + :: "r" (0), "r" (TEST_DATA), "r" (guest_test_memory)); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); + } + + static void guest_read64(void) + { + uint64_t val; + + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); + } + + /* Address translation instruction */ + static void guest_at(void) + { + uint64_t par; + + asm volatile("at s1e1r, %0" :: "r" (guest_test_memory)); + par = read_sysreg(par_el1); + isb(); + + /* Bit 1 indicates whether the AT was successful */ + GUEST_ASSERT_EQ(par & 1, 0); + } + + /* + * The size of the block written by "dc zva" is guaranteed to be between (2 << + * 0) and (2 << 9), which is safe in our case as we need the write to happen + * for at least a word, and not more than a page. + */ + static void guest_dc_zva(void) + { + uint16_t val; + + asm volatile("dc zva, %0" :: "r" (guest_test_memory)); + dsb(ish); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); + } + + /* + * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0). + * And that's special because KVM must take special care with those: they + * should still count as accesses for dirty logging or user-faulting, but + * should be handled differently on mmio. + */ + static void guest_ld_preidx(void) + { + uint64_t val; + uint64_t addr = TEST_GVA - 8; + + /* + * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is + * in a gap between memslots not backing by anything. + */ + asm volatile("ldr %0, [%1, #8]!" + : "=r" (val), "+r" (addr)); + GUEST_ASSERT_EQ(val, 0); + GUEST_ASSERT_EQ(addr, TEST_GVA); + } + + static void guest_st_preidx(void) + { + uint64_t val = TEST_DATA; + uint64_t addr = TEST_GVA - 8; + + asm volatile("str %0, [%1, #8]!" + : "+r" (val), "+r" (addr)); + + GUEST_ASSERT_EQ(addr, TEST_GVA); + val = READ_ONCE(*guest_test_memory); + } + + static bool guest_set_ha(void) + { + uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1); + uint64_t hadbs, tcr; + + /* Skip if HA is not supported. */ + hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS), mmfr1); + if (hadbs == 0) + return false; + + tcr = read_sysreg(tcr_el1) | TCR_EL1_HA; + write_sysreg(tcr, tcr_el1); + isb(); + + return true; + } + + static bool guest_clear_pte_af(void) + { + *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF; + flush_tlb_page(TEST_GVA); + + return true; + } + + static void guest_check_pte_af(void) + { + dsb(ish); + GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF); + } + + static void guest_check_write_in_dirty_log(void) + { + GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG); + } + + static void guest_check_no_write_in_dirty_log(void) + { + GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG); + } + + static void guest_check_s1ptw_wr_in_dirty_log(void) + { + GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); + } + + static void guest_exec(void) + { + int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; + int ret; + + ret = code(); + GUEST_ASSERT_EQ(ret, 0x77); + } + + static bool guest_prepare(struct test_desc *test) + { + bool (*prepare_fn)(void); + int i; + + for (i = 0; i < PREPARE_FN_NR; i++) { + prepare_fn = test->guest_prepare[i]; + if (prepare_fn && !prepare_fn()) + return false; + } + + return true; + } + + static void guest_test_check(struct test_desc *test) + { + void (*check_fn)(void); + int i; + + for (i = 0; i < CHECK_FN_NR; i++) { + check_fn = test->guest_test_check[i]; + if (check_fn) + check_fn(); + } + } + + static void guest_code(struct test_desc *test) + { + if (!guest_prepare(test)) + GUEST_SYNC(CMD_SKIP_TEST); + + GUEST_SYNC(test->mem_mark_cmd); + + if (test->guest_test) + test->guest_test(); + + guest_test_check(test); + GUEST_DONE(); + } + + static void no_dabt_handler(struct ex_regs *regs) + { + GUEST_ASSERT_1(false, read_sysreg(far_el1)); + } + + static void no_iabt_handler(struct ex_regs *regs) + { + GUEST_ASSERT_1(false, regs->pc); + } + + static struct uffd_args { + char *copy; + void *hva; + uint64_t paging_size; + } pt_args, data_args; + + /* Returns true to continue the test, and false if it should be skipped. */ + static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, + struct uffd_args *args, bool expect_write) + { + uint64_t addr = msg->arg.pagefault.address; + uint64_t flags = msg->arg.pagefault.flags; + struct uffdio_copy copy; + int ret; + + TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, + "The only expected UFFD mode is MISSING"); + ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write); + ASSERT_EQ(addr, (uint64_t)args->hva); + + pr_debug("uffd fault: addr=%p write=%d\n", + (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE)); + + copy.src = (uint64_t)args->copy; + copy.dst = addr; + copy.len = args->paging_size; + copy.mode = 0; + + ret = ioctl(uffd, UFFDIO_COPY, ©); + if (ret == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n", + addr, errno); + return ret; + } + + pthread_mutex_lock(&events.uffd_faults_mutex); + events.uffd_faults += 1; + pthread_mutex_unlock(&events.uffd_faults_mutex); + return 0; + } + + static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg) + { + return uffd_generic_handler(mode, uffd, msg, &pt_args, true); + } + + static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg) + { + return uffd_generic_handler(mode, uffd, msg, &data_args, true); + } + + static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg) + { + return uffd_generic_handler(mode, uffd, msg, &data_args, false); + } + + static void setup_uffd_args(struct userspace_mem_region *region, + struct uffd_args *args) + { + args->hva = (void *)region->region.userspace_addr; + args->paging_size = region->region.memory_size; + + args->copy = malloc(args->paging_size); + TEST_ASSERT(args->copy, "Failed to allocate data copy."); + memcpy(args->copy, args->hva, args->paging_size); + } + + static void setup_uffd(struct kvm_vm *vm, struct test_params *p, + struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd) + { + struct test_desc *test = p->test_desc; + int uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args); + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args); + + *pt_uffd = NULL; + if (test->uffd_pt_handler) + *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, + pt_args.hva, + pt_args.paging_size, + test->uffd_pt_handler); + + *data_uffd = NULL; + if (test->uffd_data_handler) + *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, + data_args.hva, + data_args.paging_size, + test->uffd_data_handler); + } + + static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, + struct uffd_desc *data_uffd) + { + if (test->uffd_pt_handler) + uffd_stop_demand_paging(pt_uffd); + if (test->uffd_data_handler) + uffd_stop_demand_paging(data_uffd); + + free(pt_args.copy); + free(data_args.copy); + } + + static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg) + { + TEST_FAIL("There was no UFFD fault expected."); + return -1; + } + + /* Returns false if the test should be skipped. */ + static bool punch_hole_in_backing_store(struct kvm_vm *vm, + struct userspace_mem_region *region) + { + void *hva = (void *)region->region.userspace_addr; + uint64_t paging_size = region->region.memory_size; + int ret, fd = region->fd; + + if (fd != -1) { + ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, paging_size); + TEST_ASSERT(ret == 0, "fallocate failed\n"); + } else { + ret = madvise(hva, paging_size, MADV_DONTNEED); + TEST_ASSERT(ret == 0, "madvise failed\n"); + } + + return true; + } + + static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run) + { + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr); + + memcpy(hva, run->mmio.data, run->mmio.len); + events.mmio_exits += 1; + } + + static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run) + { + uint64_t data; + + memcpy(&data, run->mmio.data, sizeof(data)); + pr_debug("addr=%lld len=%d w=%d data=%lx\n", + run->mmio.phys_addr, run->mmio.len, + run->mmio.is_write, data); + TEST_FAIL("There was no MMIO exit expected."); + } + + static bool check_write_in_dirty_log(struct kvm_vm *vm, + struct userspace_mem_region *region, + uint64_t host_pg_nr) + { + unsigned long *bmap; + bool first_page_dirty; + uint64_t size = region->region.memory_size; + + /* getpage_size() is not always equal to vm->page_size */ + bmap = bitmap_zalloc(size / getpagesize()); + kvm_vm_get_dirty_log(vm, region->region.slot, bmap); + first_page_dirty = test_bit(host_pg_nr, bmap); + free(bmap); + return first_page_dirty; + } + + /* Returns true to continue the test, and false if it should be skipped. */ + static bool handle_cmd(struct kvm_vm *vm, int cmd) + { + struct userspace_mem_region *data_region, *pt_region; + bool continue_test = true; + + data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + pt_region = vm_get_mem_region(vm, MEM_REGION_PT); + + if (cmd == CMD_SKIP_TEST) + continue_test = false; + + if (cmd & CMD_HOLE_PT) + continue_test = punch_hole_in_backing_store(vm, pt_region); + if (cmd & CMD_HOLE_DATA) + continue_test = punch_hole_in_backing_store(vm, data_region); + if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), + "Missing write in dirty log"); + if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0), + "Missing s1ptw write in dirty log"); + if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), + "Unexpected write in dirty log"); + if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0), + "Unexpected s1ptw write in dirty log"); + + return continue_test; + } + + void fail_vcpu_run_no_handler(int ret) + { + TEST_FAIL("Unexpected vcpu run failure\n"); + } + + void fail_vcpu_run_mmio_no_syndrome_handler(int ret) + { + TEST_ASSERT(errno == ENOSYS, + "The mmio handler should have returned not implemented."); + events.fail_vcpu_runs += 1; + } + + typedef uint32_t aarch64_insn_t; + extern aarch64_insn_t __exec_test[2]; + + noinline void __return_0x77(void) + { + asm volatile("__exec_test: mov x0, #0x77\n" + "ret\n"); + } + + /* + * Note that this function runs on the host before the test VM starts: there's + * no need to sync the D$ and I$ caches. + */ + static void load_exec_code_for_test(struct kvm_vm *vm) + { + uint64_t *code; + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + assert(TEST_EXEC_GVA > TEST_GVA); + code = hva + TEST_EXEC_GVA - TEST_GVA; + memcpy(code, __exec_test, sizeof(__exec_test)); + } + + static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) + { + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_DABT, no_dabt_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_IABT, no_iabt_handler); + } + + static void setup_gva_maps(struct kvm_vm *vm) + { + struct userspace_mem_region *region; + uint64_t pte_gpa; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + /* Map TEST_GVA first. This will install a new PTE. */ + virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr); + /* Then map TEST_PTE_GVA to the above PTE. */ + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + virt_pg_map(vm, TEST_PTE_GVA, pte_gpa); + } + + enum pf_test_memslots { + CODE_AND_DATA_MEMSLOT, + PAGE_TABLE_MEMSLOT, + TEST_DATA_MEMSLOT, + }; + + /* + * Create a memslot for code and data at pfn=0, and test-data and PT ones + * at max_gfn. + */ + static void setup_memslots(struct kvm_vm *vm, struct test_params *p) + { + uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type); + uint64_t guest_page_size = vm->page_size; + uint64_t max_gfn = vm_compute_max_gfn(vm); + /* Enough for 2M of code when using 4K guest pages. */ + uint64_t code_npages = 512; + uint64_t pt_size, data_size, data_gpa; + + /* + * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using + * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs. That's 13 + * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use + * twice that just in case. + */ + pt_size = 26 * guest_page_size; + + /* memslot sizes and gpa's must be aligned to the backing page size */ + pt_size = align_up(pt_size, backing_src_pagesz); + data_size = align_up(guest_page_size, backing_src_pagesz); + data_gpa = (max_gfn * guest_page_size) - data_size; + data_gpa = align_down(data_gpa, backing_src_pagesz); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, + CODE_AND_DATA_MEMSLOT, code_npages, 0); + vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT; + vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size, + PAGE_TABLE_MEMSLOT, pt_size / guest_page_size, + p->test_desc->pt_memslot_flags); + vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT, + data_size / guest_page_size, + p->test_desc->data_memslot_flags); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; + } + ++static void setup_ucall(struct kvm_vm *vm) ++{ ++ struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); ++ ++ ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size); ++} ++ + static void setup_default_handlers(struct test_desc *test) + { + if (!test->mmio_handler) + test->mmio_handler = mmio_no_handler; + + if (!test->fail_vcpu_run_handler) + test->fail_vcpu_run_handler = fail_vcpu_run_no_handler; + } + + static void check_event_counts(struct test_desc *test) + { + ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults); + ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits); + ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs); + } + + static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) + { + struct test_desc *test = p->test_desc; + + pr_debug("Test: %s\n", test->name); + pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_debug("Testing memory backing src type: %s\n", + vm_mem_backing_src_alias(p->src_type)->name); + } + + static void reset_event_counts(void) + { + memset(&events, 0, sizeof(events)); + } + + /* + * This function either succeeds, skips the test (after setting test->skip), or + * fails with a TEST_FAIL that aborts all tests. + */ + static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) + { + struct kvm_run *run; + struct ucall uc; + int ret; + + run = vcpu->run; + + for (;;) { + ret = _vcpu_run(vcpu); + if (ret) { + test->fail_vcpu_run_handler(ret); + goto done; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + if (!handle_cmd(vm, uc.args[1])) { + test->skip = true; + goto done; + } + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx"); + break; + case UCALL_DONE: + goto done; + case UCALL_NONE: + if (run->exit_reason == KVM_EXIT_MMIO) + test->mmio_handler(vm, run); + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + done: + pr_debug(test->skip ? "Skipped.\n" : "Done.\n"); + } + + static void run_test(enum vm_guest_mode mode, void *arg) + { + struct test_params *p = (struct test_params *)arg; + struct test_desc *test = p->test_desc; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct uffd_desc *pt_uffd, *data_uffd; + + print_test_banner(mode, p); + + vm = ____vm_create(mode); + setup_memslots(vm, p); + kvm_vm_elf_load(vm, program_invocation_name); ++ setup_ucall(vm); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + setup_gva_maps(vm); + - ucall_init(vm, NULL); - + reset_event_counts(); + + /* + * Set some code in the data memslot for the guest to execute (only + * applicable to the EXEC tests). This has to be done before + * setup_uffd() as that function copies the memslot data for the uffd + * handler. + */ + load_exec_code_for_test(vm); + setup_uffd(vm, p, &pt_uffd, &data_uffd); + setup_abort_handlers(vm, vcpu, test); + setup_default_handlers(test); + vcpu_args_set(vcpu, 1, test); + + vcpu_run_loop(vm, vcpu, test); + - ucall_uninit(vm); + kvm_vm_free(vm); + free_uffd(test, pt_uffd, data_uffd); + + /* + * Make sure we check the events after the uffd threads have exited, + * which means they updated their respective event counters. + */ + if (!test->skip) + check_event_counts(test); + } + + static void help(char *name) + { + puts(""); + printf("usage: %s [-h] [-s mem-type]\n", name); + puts(""); + guest_modes_help(); + backing_src_help("-s"); + puts(""); + } + + #define SNAME(s) #s + #define SCAT2(a, b) SNAME(a ## _ ## b) + #define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c)) + #define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d)) + + #define _CHECK(_test) _CHECK_##_test + #define _PREPARE(_test) _PREPARE_##_test + #define _PREPARE_guest_read64 NULL + #define _PREPARE_guest_ld_preidx NULL + #define _PREPARE_guest_write64 NULL + #define _PREPARE_guest_st_preidx NULL + #define _PREPARE_guest_exec NULL + #define _PREPARE_guest_at NULL + #define _PREPARE_guest_dc_zva guest_check_dc_zva + #define _PREPARE_guest_cas guest_check_lse + + /* With or without access flag checks */ + #define _PREPARE_with_af guest_set_ha, guest_clear_pte_af + #define _PREPARE_no_af NULL + #define _CHECK_with_af guest_check_pte_af + #define _CHECK_no_af NULL + + /* Performs an access and checks that no faults were triggered. */ + #define TEST_ACCESS(_access, _with_af, _mark_cmd) \ + { \ + .name = SCAT3(_access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .expected_events = { 0 }, \ + } + + #define TEST_UFFD(_access, _with_af, _mark_cmd, \ + _uffd_data_handler, _uffd_pt_handler, _uffd_faults) \ + { \ + .name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = _uffd_pt_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ + } + + #define TEST_DIRTY_LOG(_access, _with_af, _test_check) \ + { \ + .name = SCAT3(dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af), _test_check, \ + guest_check_s1ptw_wr_in_dirty_log}, \ + .expected_events = { 0 }, \ + } + + #define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ + _uffd_faults, _test_check) \ + { \ + .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_test_check = { _CHECK(_with_af), _test_check }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_write_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ + } + + #define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ + { \ + .name = SCAT3(ro_memslot, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits }, \ + } + + #define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \ + { \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .guest_test = _access, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1 }, \ + } + + #define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ + _test_check) \ + { \ + .name = SCAT3(ro_memslot, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits}, \ + } + + #define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \ + { \ + .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1 }, \ + } + + #define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \ + _uffd_data_handler, _uffd_faults) \ + { \ + .name = SCAT2(ro_memslot_uffd, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_write_handler, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits, \ + .uffd_faults = _uffd_faults }, \ + } + + #define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \ + _uffd_faults) \ + { \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_write_handler, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1, \ + .uffd_faults = _uffd_faults }, \ + } + + static struct test_desc tests[] = { + + /* Check that HW is setting the Access Flag (AF) (sanity checks). */ + TEST_ACCESS(guest_read64, with_af, CMD_NONE), + TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_cas, with_af, CMD_NONE), + TEST_ACCESS(guest_write64, with_af, CMD_NONE), + TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE), + TEST_ACCESS(guest_exec, with_af, CMD_NONE), + + /* + * Punch a hole in the data backing store, and then try multiple + * accesses: reads should rturn zeroes, and writes should + * re-populate the page. Moreover, the test also check that no + * exception was generated in the guest. Note that this + * reading/writing behavior is the same as reading/writing a + * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from + * userspace. + */ + TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA), + + /* + * Punch holes in the data and PT backing stores and mark them for + * userfaultfd handling. This should result in 2 faults: the access + * on the data backing store, and its respective S1 page table walk + * (S1PTW). + */ + TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + /* no_af should also lead to a PT write. */ + TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + /* Note how that cas invokes the read handler. */ + TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + /* + * Can't test guest_at with_af as it's IMPDEF whether the AF is set. + * The S1PTW fault should still be marked as a write. + */ + TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 1), + TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_write_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_write_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_write_handler, uffd_pt_write_handler, 2), + TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_read_handler, uffd_pt_write_handler, 2), + + /* + * Try accesses when the data and PT memory regions are both + * tracked for dirty logging. + */ + TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log), + /* no_af should also lead to a PT write. */ + TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log), + TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log), + TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log), + + /* + * Access when the data and PT memory regions are both marked for + * dirty logging and UFFD at the same time. The expected result is + * that writes should mark the dirty log and trigger a userfaultfd + * write fault. Reads/execs should result in a read userfaultfd + * fault, and nothing in the dirty log. Any S1PTW should result in + * a write in the dirty log and a userfaultfd write. + */ + TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2, + guest_check_no_write_in_dirty_log), + /* no_af should also lead to a PT write. */ + TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2, + guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler, + 2, guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1, + guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2, + guest_check_no_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler, + 2, guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2, + guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler, + 2, guest_check_write_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, + uffd_data_write_handler, 2, + guest_check_write_in_dirty_log), + + /* + * Try accesses when the data memory region is marked read-only + * (with KVM_MEM_READONLY). Writes with a syndrome result in an + * MMIO exit, writes with no syndrome (e.g., CAS) result in a + * failed vcpu run, and reads/execs with and without syndroms do + * not fault. + */ + TEST_RO_MEMSLOT(guest_read64, 0, 0), + TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0), + TEST_RO_MEMSLOT(guest_at, 0, 0), + TEST_RO_MEMSLOT(guest_exec, 0, 0), + TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), + + /* + * Access when both the data region is both read-only and marked + * for dirty logging at the same time. The expected result is that + * for writes there should be no write in the dirty log. The + * readonly handling is the same as if the memslot was not marked + * for dirty logging: writes with a syndrome result in an MMIO + * exit, and writes with no syndrome result in a failed vcpu run. + */ + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler, + 1, guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx, + guest_check_no_write_in_dirty_log), + + /* + * Access when the data region is both read-only and punched with + * holes tracked with userfaultfd. The expected result is the + * union of both userfaultfd and read-only behaviors. For example, + * write accesses result in a userfaultfd write fault and an MMIO + * exit. Writes with no syndrome result in a failed vcpu run and + * no userfaultfd write fault. Reads result in userfaultfd getting + * triggered. + */ + TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, + uffd_no_handler, 1), + TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, + uffd_data_write_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, + uffd_data_read_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, + uffd_no_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, + uffd_no_handler, 1), + + { 0 } + }; + + static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type) + { + struct test_desc *t; + + for (t = &tests[0]; t->name; t++) { + if (t->skip) + continue; + + struct test_params p = { + .src_type = src_type, + .test_desc = t, + }; + + for_each_guest_mode(run_test, &p); + } + } + + int main(int argc, char *argv[]) + { + enum vm_mem_backing_src_type src_type; + int opt; + + setbuf(stdout, NULL); + + src_type = DEFAULT_VM_MEM_SRC; + + while ((opt = getopt(argc, argv, "hm:s:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + exit(0); + } + } + + for_each_test_and_guest_mode(src_type); + return 0; + } diff --cc tools/testing/selftests/kvm/access_tracking_perf_test.c index 02d3587cab0a3,942370d573925..57a16371e9c29 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@@ -211,7 -208,7 +208,7 @@@ static bool spin_wait_for_next_iteratio int last_iteration = *current_iteration; do { - if (READ_ONCE(done)) - if (READ_ONCE(perf_test_args.stop_vcpus)) ++ if (READ_ONCE(memstress_args.stop_vcpus)) return false; *current_iteration = READ_ONCE(iteration); @@@ -321,11 -318,8 +318,8 @@@ static void run_test(enum vm_guest_mod mark_memory_idle(vm, nr_vcpus); access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory"); - /* Set done to signal the vCPU threads to exit */ - done = true; - - perf_test_join_vcpu_threads(nr_vcpus); - perf_test_destroy_vm(vm); + memstress_join_vcpu_threads(nr_vcpus); + memstress_destroy_vm(vm); } static void help(char *name) diff --cc tools/testing/selftests/kvm/demand_paging_test.c index 3a977ddf07b20,8e1fe4ffcccdf..b0e1fc4de9e29 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@@ -20,8 -20,9 +20,9 @@@ #include "kvm_util.h" #include "test_util.h" -#include "perf_test_util.h" +#include "memstress.h" #include "guest_modes.h" + #include "userfaultfd_util.h" #ifdef __NR_userfaultfd @@@ -270,22 -129,13 +129,13 @@@ static void prefault_mem(void *alias, u static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; - pthread_t *uffd_handler_threads = NULL; - struct uffd_handler_args *uffd_args = NULL; + struct uffd_desc **uffd_descs = NULL; struct timespec start; struct timespec ts_diff; - int *pipefds = NULL; struct kvm_vm *vm; - int r, i; + int i; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); demand_paging_size = get_backing_src_pagesz(p->src_type); @@@ -296,18 -146,11 +146,11 @@@ memset(guest_data_prototype, 0xAB, demand_paging_size); if (p->uffd_mode) { - uffd_handler_threads = - malloc(nr_vcpus * sizeof(*uffd_handler_threads)); - TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); - - uffd_args = malloc(nr_vcpus * sizeof(*uffd_args)); - TEST_ASSERT(uffd_args, "Memory allocation failed"); - - pipefds = malloc(sizeof(int) * nr_vcpus * 2); - TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); + uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); + TEST_ASSERT(uffd_descs, "Memory allocation failed"); for (i = 0; i < nr_vcpus; i++) { - struct perf_test_vcpu_args *vcpu_args; + struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; void *vcpu_alias; @@@ -317,19 -160,17 +160,17 @@@ vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa); + prefault_mem(vcpu_alias, - vcpu_args->pages * perf_test_args.guest_page_size); ++ vcpu_args->pages * memstress_args.guest_page_size); + /* * Set up user fault fd to handle demand paging * requests. */ - r = pipe2(&pipefds[i * 2], - O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!r, "Failed to set up pipefd"); - - setup_demand_paging(vm, &uffd_handler_threads[i], - pipefds[i * 2], p->uffd_mode, - p->uffd_delay, &uffd_args[i], - vcpu_hva, vcpu_alias, - vcpu_args->pages * memstress_args.guest_page_size); + uffd_descs[i] = uffd_setup_demand_paging( + p->uffd_mode, p->uffd_delay, vcpu_hva, - vcpu_args->pages * perf_test_args.guest_page_size, ++ vcpu_args->pages * memstress_args.guest_page_size, + &handle_uffd_page_request); } } @@@ -358,17 -193,14 +193,14 @@@ pr_info("Total guest execution time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); pr_info("Overall demand paging rate: %f pgs/sec\n", - perf_test_args.vcpu_args[0].pages * nr_vcpus / + memstress_args.vcpu_args[0].pages * nr_vcpus / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); - perf_test_destroy_vm(vm); + memstress_destroy_vm(vm); free(guest_data_prototype); - if (p->uffd_mode) { - free(uffd_handler_threads); - free(uffd_args); - free(pipefds); - } + if (p->uffd_mode) + free(uffd_descs); } static void help(char *name) diff --cc tools/testing/selftests/kvm/include/kvm_util_base.h index c7685c7038ff0,b0da75af1ff33..37500c92dd0a6 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@@ -385,9 -406,12 +408,13 @@@ void vm_mem_region_set_flags(struct kvm void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); + vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); + vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, + enum kvm_mem_region_type type); vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, diff --cc tools/testing/selftests/kvm/include/memstress.h index bbd2a302df100,0000000000000..72e3e358ef7bd mode 100644,000000..100644 --- a/tools/testing/selftests/kvm/include/memstress.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@@ -1,72 -1,0 +1,75 @@@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tools/testing/selftests/kvm/include/memstress.h + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef SELFTEST_KVM_MEMSTRESS_H +#define SELFTEST_KVM_MEMSTRESS_H + +#include + +#include "kvm_util.h" + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ + +#define MEMSTRESS_MEM_SLOT_INDEX 1 + +struct memstress_vcpu_args { + uint64_t gpa; + uint64_t gva; + uint64_t pages; + + /* Only used by the host userspace part of the vCPU thread */ + struct kvm_vcpu *vcpu; + int vcpu_idx; +}; + +struct memstress_args { + struct kvm_vm *vm; + /* The starting address and size of the guest test region. */ + uint64_t gpa; + uint64_t size; + uint64_t guest_page_size; + uint32_t random_seed; + uint32_t write_percent; + + /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ + bool nested; + /* Randomize which pages are accessed by the guest. */ + bool random_access; + /* True if all vCPUs are pinned to pCPUs */ + bool pin_vcpus; + /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */ + uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS]; + ++ /* Test is done, stop running vCPUs. */ ++ bool stop_vcpus; ++ + struct memstress_vcpu_args vcpu_args[KVM_MAX_VCPUS]; +}; + +extern struct memstress_args memstress_args; + +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, + uint64_t vcpu_memory_bytes, int slots, + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access); +void memstress_destroy_vm(struct kvm_vm *vm); + +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed); +void memstress_set_random_access(struct kvm_vm *vm, bool random_access); + +void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *)); +void memstress_join_vcpu_threads(int vcpus); +void memstress_guest_code(uint32_t vcpu_id); + +uint64_t memstress_nested_pages(int nr_vcpus); +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); + +#endif /* SELFTEST_KVM_MEMSTRESS_H */ diff --cc tools/testing/selftests/kvm/lib/kvm_util.c index 1d26a21601785,e3daa97ab0f4a..e9607eb089bee --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@@ -335,10 -328,17 +329,18 @@@ struct kvm_vm *__vm_create(enum vm_gues { uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, nr_extra_pages); + struct userspace_mem_region *slot0; struct kvm_vm *vm; + int i; + + pr_debug("%s: mode='%s' pages='%ld'\n", __func__, + vm_guest_mode_string(mode), nr_pages); - vm = ____vm_create(mode, nr_pages); + vm = ____vm_create(mode); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); + for (i = 0; i < NR_MEM_REGIONS; i++) + vm->memslots[i] = 0; kvm_vm_elf_load(vm, program_invocation_name); diff --cc tools/testing/selftests/kvm/lib/memstress.c index 2de8a5d527b3a,0000000000000..5f1d3173c238c mode 100644,000000..100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@@ -1,319 -1,0 +1,322 @@@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + */ +#define _GNU_SOURCE + +#include + +#include "kvm_util.h" +#include "memstress.h" +#include "processor.h" + +struct memstress_args memstress_args; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +struct vcpu_thread { + /* The index of the vCPU. */ + int vcpu_idx; + + /* The pthread backing the vCPU. */ + pthread_t thread; + + /* Set to true once the vCPU thread is up and running. */ + bool running; +}; + +/* The vCPU threads involved in this test. */ +static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS]; + +/* The function run by each vCPU thread, as provided by the test. */ +static void (*vcpu_thread_fn)(struct memstress_vcpu_args *); + +/* Set to true once all vCPU threads are up and running. */ +static bool all_vcpu_threads_running; + +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +void memstress_guest_code(uint32_t vcpu_idx) +{ + struct memstress_args *args = &memstress_args; + struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx]; + struct guest_random_state rand_state; + uint64_t gva; + uint64_t pages; + uint64_t addr; + uint64_t page; + int i; + + rand_state = new_guest_random_state(args->random_seed + vcpu_idx); + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx); + + while (true) { + for (i = 0; i < pages; i++) { + if (args->random_access) + page = guest_random_u32(&rand_state) % pages; + else + page = i; + + addr = gva + (page * args->guest_page_size); + + if (guest_random_u32(&rand_state) % 100 < args->write_percent) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } + + GUEST_SYNC(1); + } +} + +void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, + struct kvm_vcpu *vcpus[], + uint64_t vcpu_memory_bytes, + bool partition_vcpu_memory_access) +{ + struct memstress_args *args = &memstress_args; + struct memstress_vcpu_args *vcpu_args; + int i; + + for (i = 0; i < nr_vcpus; i++) { + vcpu_args = &args->vcpu_args[i]; + + vcpu_args->vcpu = vcpus[i]; + vcpu_args->vcpu_idx = i; + + if (partition_vcpu_memory_access) { + vcpu_args->gva = guest_test_virt_mem + + (i * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + args->guest_page_size; + vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes); + } else { + vcpu_args->gva = guest_test_virt_mem; + vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) / + args->guest_page_size; + vcpu_args->gpa = args->gpa; + } + + vcpu_args_set(vcpus[i], 1, i); + + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + i, vcpu_args->gpa, vcpu_args->gpa + + (vcpu_args->pages * args->guest_page_size)); + } +} + +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, + uint64_t vcpu_memory_bytes, int slots, + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access) +{ + struct memstress_args *args = &memstress_args; + struct kvm_vm *vm; + uint64_t guest_num_pages, slot0_pages = 0; + uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); + uint64_t region_end_gfn; + int i; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + /* By default vCPUs will write to memory. */ + args->write_percent = 100; + + /* + * Snapshot the non-huge page size. This is used by the guest code to + * access/dirty pages at the logging granularity. + */ + args->guest_page_size = vm_guest_mode_params[mode].page_size; + + guest_num_pages = vm_adjust_num_guest_pages(mode, + (nr_vcpus * vcpu_memory_bytes) / args->guest_page_size); + + TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0, + "Guest memory size is not host page size aligned."); + TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); + + /* + * If using nested, allocate extra pages for the nested page tables and + * in-memory data structures. + */ + if (args->nested) + slot0_pages += memstress_nested_pages(nr_vcpus); + + /* + * Pass guest_num_pages to populate the page tables for test memory. + * The memory is also added to memslot 0, but that's a benign side + * effect as KVM allows aliasing HVAs in meslots. + */ + vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, + memstress_guest_code, vcpus); + + args->vm = vm; + + /* Put the test region at the top guest physical memory. */ + region_end_gfn = vm->max_gfn + 1; + +#ifdef __x86_64__ + /* + * When running vCPUs in L2, restrict the test region to 48 bits to + * avoid needing 5-level page tables to identity map L2. + */ + if (args->nested) + region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size); +#endif + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < region_end_gfn, + "Requested more guest memory than address space allows.\n" + " guest pages: %" PRIx64 " max gfn: %" PRIx64 + " nr_vcpus: %d wss: %" PRIx64 "]\n", + guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes); + + args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size; + args->gpa = align_down(args->gpa, backing_src_pagesz); +#ifdef __s390x__ + /* Align to 1M (segment size) */ + args->gpa = align_down(args->gpa, 1 << 20); +#endif + args->size = guest_num_pages * args->guest_page_size; + pr_info("guest physical test memory: [0x%lx, 0x%lx)\n", + args->gpa, args->gpa + args->size); + + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + MEMSTRESS_MEM_SLOT_INDEX + i, + region_pages, 0); + } + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages); + + memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, + partition_vcpu_memory_access); + + if (args->nested) { + pr_info("Configuring vCPUs to run in L2 (nested).\n"); + memstress_setup_nested(vm, nr_vcpus, vcpus); + } + + /* Export the shared variables to the guest. */ + sync_global_to_guest(vm, memstress_args); + + return vm; +} + +void memstress_destroy_vm(struct kvm_vm *vm) +{ + kvm_vm_free(vm); +} + +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) +{ + memstress_args.write_percent = write_percent; + sync_global_to_guest(vm, memstress_args.write_percent); +} + +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed) +{ + memstress_args.random_seed = random_seed; + sync_global_to_guest(vm, memstress_args.random_seed); +} + +void memstress_set_random_access(struct kvm_vm *vm, bool random_access) +{ + memstress_args.random_access = random_access; + sync_global_to_guest(vm, memstress_args.random_access); +} + +uint64_t __weak memstress_nested_pages(int nr_vcpus) +{ + return 0; +} + +void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus) +{ + pr_info("%s() not support on this architecture, skipping.\n", __func__); + exit(KSFT_SKIP); +} + +static void *vcpu_thread_main(void *data) +{ + struct vcpu_thread *vcpu = data; + int vcpu_idx = vcpu->vcpu_idx; + + if (memstress_args.pin_vcpus) + kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]); + + WRITE_ONCE(vcpu->running, true); + + /* + * Wait for all vCPU threads to be up and running before calling the test- + * provided vCPU thread function. This prevents thread creation (which + * requires taking the mmap_sem in write mode) from interfering with the + * guest faulting in its memory. + */ + while (!READ_ONCE(all_vcpu_threads_running)) + ; + + vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]); + + return NULL; +} + +void memstress_start_vcpu_threads(int nr_vcpus, + void (*vcpu_fn)(struct memstress_vcpu_args *)) +{ + int i; + + vcpu_thread_fn = vcpu_fn; + WRITE_ONCE(all_vcpu_threads_running, false); ++ WRITE_ONCE(memstress_args.stop_vcpus, false); + + for (i = 0; i < nr_vcpus; i++) { + struct vcpu_thread *vcpu = &vcpu_threads[i]; + + vcpu->vcpu_idx = i; + WRITE_ONCE(vcpu->running, false); + + pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu); + } + + for (i = 0; i < nr_vcpus; i++) { + while (!READ_ONCE(vcpu_threads[i].running)) + ; + } + + WRITE_ONCE(all_vcpu_threads_running, true); +} + +void memstress_join_vcpu_threads(int nr_vcpus) +{ + int i; + ++ WRITE_ONCE(memstress_args.stop_vcpus, true); ++ + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i].thread, NULL); +} diff --cc tools/testing/selftests/kvm/lib/userfaultfd_util.c index 0000000000000,3b44846fc277e..92cef20902f1f mode 000000,100644..100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@@ -1,0 -1,186 +1,186 @@@ + // SPDX-License-Identifier: GPL-2.0 + /* + * KVM userfaultfd util + * Adapted from demand_paging_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2019-2022 Google LLC + */ + + #define _GNU_SOURCE /* for pipe2 */ + + #include + #include + #include + #include + #include + #include + #include + #include + + #include "kvm_util.h" + #include "test_util.h" -#include "perf_test_util.h" ++#include "memstress.h" + #include "userfaultfd_util.h" + + #ifdef __NR_userfaultfd + + static void *uffd_handler_thread_fn(void *arg) + { + struct uffd_desc *uffd_desc = (struct uffd_desc *)arg; + int uffd = uffd_desc->uffd; + int pipefd = uffd_desc->pipefds[0]; + useconds_t delay = uffd_desc->delay; + int64_t pages = 0; + struct timespec start; + struct timespec ts_diff; + + clock_gettime(CLOCK_MONOTONIC, &start); + while (1) { + struct uffd_msg msg; + struct pollfd pollfd[2]; + char tmp_chr; + int r; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd; + pollfd[1].events = POLLIN; + + r = poll(pollfd, 2, -1); + switch (r) { + case -1: + pr_info("poll err"); + continue; + case 0: + continue; + case 1: + break; + default: + pr_info("Polling uffd returned %d", r); + return NULL; + } + + if (pollfd[0].revents & POLLERR) { + pr_info("uffd revents has POLLERR"); + return NULL; + } + + if (pollfd[1].revents & POLLIN) { + r = read(pollfd[1].fd, &tmp_chr, 1); + TEST_ASSERT(r == 1, + "Error reading pipefd in UFFD thread\n"); + return NULL; + } + + if (!(pollfd[0].revents & POLLIN)) + continue; + + r = read(uffd, &msg, sizeof(msg)); + if (r == -1) { + if (errno == EAGAIN) + continue; + pr_info("Read of uffd got errno %d\n", errno); + return NULL; + } + + if (r != sizeof(msg)) { + pr_info("Read on uffd returned unexpected size: %d bytes", r); + return NULL; + } + + if (!(msg.event & UFFD_EVENT_PAGEFAULT)) + continue; + + if (delay) + usleep(delay); + r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg); + if (r < 0) + return NULL; + pages++; + } + + ts_diff = timespec_elapsed(start); + PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", + pages, ts_diff.tv_sec, ts_diff.tv_nsec, + pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + + return NULL; + } + + struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, + void *hva, uint64_t len, + uffd_handler_t handler) + { + struct uffd_desc *uffd_desc; + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); + int uffd; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; + int ret; + + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", + is_minor ? "MINOR" : "MISSING", + is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); + + uffd_desc = malloc(sizeof(struct uffd_desc)); + TEST_ASSERT(uffd_desc, "malloc failed"); + + /* In order to get minor faults, prefault via the alias. */ + if (is_minor) + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, + "ioctl UFFDIO_API failed: %" PRIu64, + (uint64_t)uffdio_api.api); + + uffdio_register.range.start = (uint64_t)hva; + uffdio_register.range.len = len; + uffdio_register.mode = uffd_mode; + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, + "ioctl UFFDIO_REGISTER failed"); + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == + expected_ioctls, "missing userfaultfd ioctls"); + + ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd"); + + uffd_desc->uffd_mode = uffd_mode; + uffd_desc->uffd = uffd; + uffd_desc->delay = delay; + uffd_desc->handler = handler; + pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn, + uffd_desc); + + PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", + hva, hva + len); + + return uffd_desc; + } + + void uffd_stop_demand_paging(struct uffd_desc *uffd) + { + char c = 0; + int ret; + + ret = write(uffd->pipefds[1], &c, 1); + TEST_ASSERT(ret == 1, "Unable to write to pipefd"); + + ret = pthread_join(uffd->thread, NULL); + TEST_ASSERT(ret == 0, "Pthread_join failed."); + + close(uffd->uffd); + + close(uffd->pipefds[1]); + close(uffd->pipefds[0]); + + free(uffd); + } + + #endif /* __NR_userfaultfd */ diff --cc tools/testing/selftests/kvm/memslot_modification_stress_test.c index d07e921bfcc53,3a5e4518307c2..9855c41ca811f --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@@ -34,9 -34,7 +34,7 @@@ static int nr_vcpus = 1; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; - static bool run_vcpus = true; - -static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) { struct kvm_vcpu *vcpu = vcpu_args->vcpu; struct kvm_run *run; @@@ -45,7 -43,7 +43,7 @@@ run = vcpu->run; /* Let the guest access its memory until a stop signal is received */ - while (READ_ONCE(run_vcpus)) { - while (!READ_ONCE(perf_test_args.stop_vcpus)) { ++ while (!READ_ONCE(memstress_args.stop_vcpus)) { ret = _vcpu_run(vcpu); TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); @@@ -107,14 -105,13 +105,12 @@@ static void run_test(enum vm_guest_mod pr_info("Started all vCPUs\n"); - add_remove_memslot(vm, p->memslot_modification_delay, - p->nr_memslot_modifications); + add_remove_memslot(vm, p->delay, p->nr_iterations); - run_vcpus = false; - - perf_test_join_vcpu_threads(nr_vcpus); + memstress_join_vcpu_threads(nr_vcpus); pr_info("All vCPU threads joined\n"); - perf_test_destroy_vm(vm); + memstress_destroy_vm(vm); } static void help(char *name) diff --cc tools/testing/selftests/kvm/memslot_perf_test.c index 36b20abfb948e,2ad40f7c9c08e..e698306bf49d1 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@@ -252,37 -284,33 +284,34 @@@ static bool prepare_vm(struct vm_data * struct timespec tstart; struct sync_area *sync; - max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); - TEST_ASSERT(max_mem_slots > 1, - "KVM_CAP_NR_MEMSLOTS should be greater than 1"); - TEST_ASSERT(nslots > 1 || nslots == -1, - "Slot count cap should be greater than 1"); - if (nslots != -1) - max_mem_slots = min(max_mem_slots, (uint32_t)nslots); - pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots); + host_page_size = getpagesize(); + guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + mempages = mem_size / guest_page_size; - TEST_ASSERT(mempages > 1, - "Can't test without any memory"); + data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); - ucall_init(data->vm, NULL); + TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size"); data->npages = mempages; - data->nslots = max_mem_slots - 1; - data->pages_per_slot = mempages / data->nslots; - if (!data->pages_per_slot) { - *maxslots = mempages + 1; + TEST_ASSERT(data->npages > 1, "Can't test without any memory"); + data->nslots = nslots; + data->pages_per_slot = data->npages / data->nslots; + rempages = data->npages % data->nslots; + if (!check_slot_pages(host_page_size, guest_page_size, + data->pages_per_slot, rempages)) { + *maxslots = get_max_slots(data, host_page_size); return false; } data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); TEST_ASSERT(data->hva_slots, "malloc() fail"); + data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); + pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", - max_mem_slots - 1, data->pages_per_slot, rempages); + data->nslots, data->pages_per_slot, rempages); clock_gettime(CLOCK_MONOTONIC, &tstart); - for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) { + for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) { uint64_t npages; npages = data->pages_per_slot; @@@ -884,9 -966,9 +967,9 @@@ static bool parse_args(int argc, char * map_unmap_verify = true; break; case 's': - targs->nslots = atoi(optarg); + targs->nslots = atoi_paranoid(optarg); - if (targs->nslots <= 0 && targs->nslots != -1) { - pr_info("Slot count cap has to be positive or -1 for no cap\n"); + if (targs->nslots <= 1 && targs->nslots != -1) { + pr_info("Slot count cap must be larger than 1 or -1 for no cap\n"); return false; } break; @@@ -994,6 -1103,12 +1092,9 @@@ int main(int argc, char *argv[] struct test_result rbestslottime; int tctr; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - + if (!check_memory_sizes()) + return -1; + if (!parse_args(argc, argv, &targs)) return -1;