Merge tag 'kvmarm-6.2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm...
author Paolo Bonzini <pbonzini@redhat.com>
Tue, 6 Dec 2022 17:27:39 +0000 (12:27 -0500)
committer Paolo Bonzini <pbonzini@redhat.com>
Fri, 9 Dec 2022 08:12:12 +0000 (09:12 +0100)
KVM/arm64 updates for 6.2

- Enable the per-vcpu dirty-ring tracking mechanism, together with an
  option to keep the good old dirty log around for pages that are
  dirtied by something other than a vcpu (see the first sketch after
  this list).

- Switch to the relaxed parallel fault handling, using RCU to delay
  page table reclaim and giving better performance under load.

- Relax the MTE ABI, allowing a VMM to use the MAP_SHARED mapping
  option, which multi-process VMMs such as crosvm rely on (see the
  second sketch after this list).

- Merge the pKVM shadow vcpu state tracking that allows the hypervisor
  to have its own view of a vcpu, keeping that state private.

- Add support for the PMUv3p5 architecture revision, bringing support
  for 64bit counters on systems that support it, and fix the
  not-quite-compliant CHAINed counter support for the machines that
  actually exist out there.

- Fix a handful of minor issues around 52bit VA/PA support (64kB pages
  only), ahead of the upcoming support for 4kB and 16kB pages.

- Add/Enable/Fix a bunch of selftests covering memslots, breakpoints,
  stage-2 faults and access tracking. You name it, we got it, we
  probably broke it.

- Pick a small set of documentation and spelling fixes, because no
  good merge window would be complete without those.
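
As a rough, hypothetical userspace sketch (not part of this merge and
not taken from any VMM), this is how a VMM could enable the dirty ring
together with the backup bitmap on the VM file descriptor before
creating any vcpus. The capability names match the uapi additions in
this pull; the helper name, ring size and error handling are
illustrative only:

    #include <err.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void enable_dirty_ring_with_bitmap(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL,
                    /* Ring size in bytes, covering a power-of-two number
                     * of struct kvm_dirty_gfn entries. */
                    .args = { 4096 * sizeof(struct kvm_dirty_gfn) },
            };

            /* Enable the per-vcpu ring; must happen before vcpus exist. */
            if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                    err(1, "KVM_ENABLE_CAP(DIRTY_LOG_RING_ACQ_REL)");

            /*
             * Keep the per-memslot bitmap for pages dirtied outside of a
             * vcpu context, e.g. the ITS tables saved on migration.
             */
            cap = (struct kvm_enable_cap) {
                    .cap = KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP,
            };
            if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                    err(1, "KVM_ENABLE_CAP(DIRTY_LOG_RING_WITH_BITMAP)");
    }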
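
And, equally hypothetical, a sketch of the MAP_SHARED case that the MTE
relaxation is about: guest RAM backed by a shareable memfd mapping on a
VM with KVM_CAP_ARM_MTE enabled. The helper name, slot number and sizes
are made up for illustration:

    #define _GNU_SOURCE
    #include <err.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    static void add_shared_guest_ram(int vm_fd, __u64 gpa, __u64 size)
    {
            struct kvm_enable_cap mte = { .cap = KVM_CAP_ARM_MTE };
            struct kvm_userspace_memory_region region;
            void *mem;
            int memfd;

            /* Let the guest use MTE; must be done before creating vcpus. */
            if (ioctl(vm_fd, KVM_ENABLE_CAP, &mte))
                    err(1, "KVM_ENABLE_CAP(ARM_MTE)");

            /* A multi-process VMM shares guest RAM via MAP_SHARED. */
            memfd = memfd_create("guest-ram", 0);
            if (memfd < 0 || ftruncate(memfd, size))
                    err(1, "memfd");
            mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0);
            if (mem == MAP_FAILED)
                    err(1, "mmap");

            region = (struct kvm_userspace_memory_region) {
                    .slot = 0,
                    .guest_phys_addr = gpa,
                    .memory_size = size,
                    .userspace_addr = (unsigned long)mem,
            };
            if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region))
                    err(1, "KVM_SET_USER_MEMORY_REGION");
    }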

As a side effect, this tag also drags:

- The 'kvmarm-fixes-6.1-3' tag as a dependency of the dirty-ring
  series

- A shared branch with the arm64 tree that repaints all the system
  registers to match the ARM ARM's naming, resulting in some
  interesting conflicts

26 files changed:
Documentation/virt/kvm/api.rst
arch/arm64/kvm/arm.c
arch/arm64/kvm/mmu.c
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/x86.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
tools/testing/selftests/kvm/aarch64/debug-exceptions.c
tools/testing/selftests/kvm/aarch64/page_fault_test.c
tools/testing/selftests/kvm/access_tracking_perf_test.c
tools/testing/selftests/kvm/demand_paging_test.c
tools/testing/selftests/kvm/dirty_log_test.c
tools/testing/selftests/kvm/include/kvm_util_base.h
tools/testing/selftests/kvm/include/memstress.h
tools/testing/selftests/kvm/lib/aarch64/processor.c
tools/testing/selftests/kvm/lib/elf.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/memstress.c
tools/testing/selftests/kvm/lib/userfaultfd_util.c
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/memslot_modification_stress_test.c
tools/testing/selftests/kvm/memslot_perf_test.c
virt/kvm/kvm_main.c

Simple merge
Simple merge
Simple merge
index d1013c4f673cac9c65bc05a7a315e7d0f84e6367,b4dbde7d9eb1ddabe43347b873bc29d425206834..ad9f8b02071deb722fcd253386948569cd4e324a
@@@ -2154,8 -2084,12 +2154,6 @@@ static inline int kvm_cpu_get_apicid(in
  #endif
  }
  
- int kvm_cpu_dirty_log_size(void);
 -#define put_smstate(type, buf, offset, val)                      \
 -      *(type *)((buf) + (offset) - 0x7e00) = val
 -
 -#define GET_SMSTATE(type, buf, offset)                \
 -      (*(type *)((buf) + (offset) - 0x7e00))
--
  int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
  
  #define KVM_CLOCK_VALID_FLAGS                                         \
Simple merge
Simple merge
index 88448397642cbdc7df8d1506a49110055dcc02d9,c87b5882d7aef7b4d353d1389b3e2cc7ffccf136..820efdf9fef809fa59d57f97e1a12c59efd319e5
@@@ -1181,7 -1178,7 +1181,8 @@@ struct kvm_ppc_resize_hpt 
  #define KVM_CAP_S390_ZPCI_OP 221
  #define KVM_CAP_S390_CPU_TOPOLOGY 222
  #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
 -#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224
 +#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
++#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
index 2275ba861e0e5a0ec1d9622ea851b63a9b0d82c8,1d85b8e218a0219921761b5c9fb0ecb5978a54c4..947676983da1f2bff7102d66e18450861a6a97e9
@@@ -47,7 -47,7 +47,8 @@@ LIBKVM += lib/memstress.
  LIBKVM += lib/rbtree.c
  LIBKVM += lib/sparsebit.c
  LIBKVM += lib/test_util.c
 +LIBKVM += lib/ucall_common.c
+ LIBKVM += lib/userfaultfd_util.c
  
  LIBKVM_STRING += lib/string_override.c
  
index d86c4e4d1c8268e80488845fa72e035bf446e2b7,b30add3e77269a331a82f1c0bacf4f0d1da8188c..8a3fb212084a05fbc85d0d82fe01f0eacaf9187e
@@@ -289,9 -425,9 +422,8 @@@ static void test_guest_debug_exceptions
        struct kvm_vcpu *vcpu;
        struct kvm_vm *vm;
        struct ucall uc;
-       int stage;
  
        vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 -      ucall_init(vm, NULL);
  
        vm_init_descriptor_tables(vm);
        vcpu_init_descriptor_tables(vcpu);
index 0000000000000000000000000000000000000000,05bb6a6369c25bb0f32b22445f4435f8b295986d..95d22cfb7b41a2cf72ffd66b1113856250946eb9
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,1112 +1,1117 @@@
 -      ucall_init(vm, NULL);
 -
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  * page_fault_test.c - Test stage 2 faults.
+  *
+  * This test tries different combinations of guest accesses (e.g., write,
+  * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on
+  * hugetlbfs with a hole). It checks that the expected handling method is
+  * called (e.g., uffd faults with the right address and write/read flag).
+  */
+ #define _GNU_SOURCE
+ #include <linux/bitmap.h>
+ #include <fcntl.h>
+ #include <test_util.h>
+ #include <kvm_util.h>
+ #include <processor.h>
+ #include <asm/sysreg.h>
+ #include <linux/bitfield.h>
+ #include "guest_modes.h"
+ #include "userfaultfd_util.h"
+ /* Guest virtual addresses that point to the test page and its PTE. */
+ #define TEST_GVA                              0xc0000000
+ #define TEST_EXEC_GVA                         (TEST_GVA + 0x8)
+ #define TEST_PTE_GVA                          0xb0000000
+ #define TEST_DATA                             0x0123456789ABCDEF
+ static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA;
+ #define CMD_NONE                              (0)
+ #define CMD_SKIP_TEST                         (1ULL << 1)
+ #define CMD_HOLE_PT                           (1ULL << 2)
+ #define CMD_HOLE_DATA                         (1ULL << 3)
+ #define CMD_CHECK_WRITE_IN_DIRTY_LOG          (1ULL << 4)
+ #define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG               (1ULL << 5)
+ #define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG               (1ULL << 6)
+ #define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG    (1ULL << 7)
+ #define CMD_SET_PTE_AF                                (1ULL << 8)
+ #define PREPARE_FN_NR                         10
+ #define CHECK_FN_NR                           10
+ static struct event_cnt {
+       int mmio_exits;
+       int fail_vcpu_runs;
+       int uffd_faults;
+       /* uffd_faults is incremented from multiple threads. */
+       pthread_mutex_t uffd_faults_mutex;
+ } events;
+ struct test_desc {
+       const char *name;
+       uint64_t mem_mark_cmd;
+       /* Skip the test if any prepare function returns false */
+       bool (*guest_prepare[PREPARE_FN_NR])(void);
+       void (*guest_test)(void);
+       void (*guest_test_check[CHECK_FN_NR])(void);
+       uffd_handler_t uffd_pt_handler;
+       uffd_handler_t uffd_data_handler;
+       void (*dabt_handler)(struct ex_regs *regs);
+       void (*iabt_handler)(struct ex_regs *regs);
+       void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run);
+       void (*fail_vcpu_run_handler)(int ret);
+       uint32_t pt_memslot_flags;
+       uint32_t data_memslot_flags;
+       bool skip;
+       struct event_cnt expected_events;
+ };
+ struct test_params {
+       enum vm_mem_backing_src_type src_type;
+       struct test_desc *test_desc;
+ };
+ static inline void flush_tlb_page(uint64_t vaddr)
+ {
+       uint64_t page = vaddr >> 12;
+       dsb(ishst);
+       asm volatile("tlbi vaae1is, %0" :: "r" (page));
+       dsb(ish);
+       isb();
+ }
+ static void guest_write64(void)
+ {
+       uint64_t val;
+       WRITE_ONCE(*guest_test_memory, TEST_DATA);
+       val = READ_ONCE(*guest_test_memory);
+       GUEST_ASSERT_EQ(val, TEST_DATA);
+ }
+ /* Check the system for atomic instructions. */
+ static bool guest_check_lse(void)
+ {
+       uint64_t isar0 = read_sysreg(id_aa64isar0_el1);
+       uint64_t atomic;
+       atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS), isar0);
+       return atomic >= 2;
+ }
+ static bool guest_check_dc_zva(void)
+ {
+       uint64_t dczid = read_sysreg(dczid_el0);
+       uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_DZP), dczid);
+       return dzp == 0;
+ }
+ /* Compare and swap instruction. */
+ static void guest_cas(void)
+ {
+       uint64_t val;
+       GUEST_ASSERT(guest_check_lse());
+       asm volatile(".arch_extension lse\n"
+                    "casal %0, %1, [%2]\n"
+                    :: "r" (0), "r" (TEST_DATA), "r" (guest_test_memory));
+       val = READ_ONCE(*guest_test_memory);
+       GUEST_ASSERT_EQ(val, TEST_DATA);
+ }
+ static void guest_read64(void)
+ {
+       uint64_t val;
+       val = READ_ONCE(*guest_test_memory);
+       GUEST_ASSERT_EQ(val, 0);
+ }
+ /* Address translation instruction */
+ static void guest_at(void)
+ {
+       uint64_t par;
+       asm volatile("at s1e1r, %0" :: "r" (guest_test_memory));
+       par = read_sysreg(par_el1);
+       isb();
+       /* Bit 1 indicates whether the AT was successful */
+       GUEST_ASSERT_EQ(par & 1, 0);
+ }
+ /*
+  * The size of the block written by "dc zva" is guaranteed to be between (2 <<
+  * 0) and (2 << 9), which is safe in our case as we need the write to happen
+  * for at least a word, and not more than a page.
+  */
+ static void guest_dc_zva(void)
+ {
+       uint16_t val;
+       asm volatile("dc zva, %0" :: "r" (guest_test_memory));
+       dsb(ish);
+       val = READ_ONCE(*guest_test_memory);
+       GUEST_ASSERT_EQ(val, 0);
+ }
+ /*
+  * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0).
+  * And that's special because KVM must take special care with those: they
+  * should still count as accesses for dirty logging or user-faulting, but
+  * should be handled differently on mmio.
+  */
+ static void guest_ld_preidx(void)
+ {
+       uint64_t val;
+       uint64_t addr = TEST_GVA - 8;
+       /*
+        * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is
+        * in a gap between memslots not backed by anything.
+        */
+       asm volatile("ldr %0, [%1, #8]!"
+                    : "=r" (val), "+r" (addr));
+       GUEST_ASSERT_EQ(val, 0);
+       GUEST_ASSERT_EQ(addr, TEST_GVA);
+ }
+ static void guest_st_preidx(void)
+ {
+       uint64_t val = TEST_DATA;
+       uint64_t addr = TEST_GVA - 8;
+       asm volatile("str %0, [%1, #8]!"
+                    : "+r" (val), "+r" (addr));
+       GUEST_ASSERT_EQ(addr, TEST_GVA);
+       val = READ_ONCE(*guest_test_memory);
+ }
+ static bool guest_set_ha(void)
+ {
+       uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1);
+       uint64_t hadbs, tcr;
+       /* Skip if HA is not supported. */
+       hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS), mmfr1);
+       if (hadbs == 0)
+               return false;
+       tcr = read_sysreg(tcr_el1) | TCR_EL1_HA;
+       write_sysreg(tcr, tcr_el1);
+       isb();
+       return true;
+ }
+ static bool guest_clear_pte_af(void)
+ {
+       *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF;
+       flush_tlb_page(TEST_GVA);
+       return true;
+ }
+ static void guest_check_pte_af(void)
+ {
+       dsb(ish);
+       GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF);
+ }
+ static void guest_check_write_in_dirty_log(void)
+ {
+       GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG);
+ }
+ static void guest_check_no_write_in_dirty_log(void)
+ {
+       GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG);
+ }
+ static void guest_check_s1ptw_wr_in_dirty_log(void)
+ {
+       GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG);
+ }
+ static void guest_exec(void)
+ {
+       int (*code)(void) = (int (*)(void))TEST_EXEC_GVA;
+       int ret;
+       ret = code();
+       GUEST_ASSERT_EQ(ret, 0x77);
+ }
+ static bool guest_prepare(struct test_desc *test)
+ {
+       bool (*prepare_fn)(void);
+       int i;
+       for (i = 0; i < PREPARE_FN_NR; i++) {
+               prepare_fn = test->guest_prepare[i];
+               if (prepare_fn && !prepare_fn())
+                       return false;
+       }
+       return true;
+ }
+ static void guest_test_check(struct test_desc *test)
+ {
+       void (*check_fn)(void);
+       int i;
+       for (i = 0; i < CHECK_FN_NR; i++) {
+               check_fn = test->guest_test_check[i];
+               if (check_fn)
+                       check_fn();
+       }
+ }
+ static void guest_code(struct test_desc *test)
+ {
+       if (!guest_prepare(test))
+               GUEST_SYNC(CMD_SKIP_TEST);
+       GUEST_SYNC(test->mem_mark_cmd);
+       if (test->guest_test)
+               test->guest_test();
+       guest_test_check(test);
+       GUEST_DONE();
+ }
+ static void no_dabt_handler(struct ex_regs *regs)
+ {
+       GUEST_ASSERT_1(false, read_sysreg(far_el1));
+ }
+ static void no_iabt_handler(struct ex_regs *regs)
+ {
+       GUEST_ASSERT_1(false, regs->pc);
+ }
+ static struct uffd_args {
+       char *copy;
+       void *hva;
+       uint64_t paging_size;
+ } pt_args, data_args;
+ /* Returns 0 on success, and -1 (with errno set) if UFFDIO_COPY failed. */
+ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg,
+                               struct uffd_args *args, bool expect_write)
+ {
+       uint64_t addr = msg->arg.pagefault.address;
+       uint64_t flags = msg->arg.pagefault.flags;
+       struct uffdio_copy copy;
+       int ret;
+       TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING,
+                   "The only expected UFFD mode is MISSING");
+       ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write);
+       ASSERT_EQ(addr, (uint64_t)args->hva);
+       pr_debug("uffd fault: addr=%p write=%d\n",
+                (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE));
+       copy.src = (uint64_t)args->copy;
+       copy.dst = addr;
+       copy.len = args->paging_size;
+       copy.mode = 0;
+       ret = ioctl(uffd, UFFDIO_COPY, &copy);
+       if (ret == -1) {
+               pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n",
+                       addr, errno);
+               return ret;
+       }
+       pthread_mutex_lock(&events.uffd_faults_mutex);
+       events.uffd_faults += 1;
+       pthread_mutex_unlock(&events.uffd_faults_mutex);
+       return 0;
+ }
+ static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+       return uffd_generic_handler(mode, uffd, msg, &pt_args, true);
+ }
+ static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+       return uffd_generic_handler(mode, uffd, msg, &data_args, true);
+ }
+ static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+       return uffd_generic_handler(mode, uffd, msg, &data_args, false);
+ }
+ static void setup_uffd_args(struct userspace_mem_region *region,
+                           struct uffd_args *args)
+ {
+       args->hva = (void *)region->region.userspace_addr;
+       args->paging_size = region->region.memory_size;
+       args->copy = malloc(args->paging_size);
+       TEST_ASSERT(args->copy, "Failed to allocate data copy.");
+       memcpy(args->copy, args->hva, args->paging_size);
+ }
+ static void setup_uffd(struct kvm_vm *vm, struct test_params *p,
+                      struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd)
+ {
+       struct test_desc *test = p->test_desc;
+       int uffd_mode = UFFDIO_REGISTER_MODE_MISSING;
+       setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args);
+       setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args);
+       *pt_uffd = NULL;
+       if (test->uffd_pt_handler)
+               *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0,
+                                                   pt_args.hva,
+                                                   pt_args.paging_size,
+                                                   test->uffd_pt_handler);
+       *data_uffd = NULL;
+       if (test->uffd_data_handler)
+               *data_uffd = uffd_setup_demand_paging(uffd_mode, 0,
+                                                     data_args.hva,
+                                                     data_args.paging_size,
+                                                     test->uffd_data_handler);
+ }
+ static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd,
+                     struct uffd_desc *data_uffd)
+ {
+       if (test->uffd_pt_handler)
+               uffd_stop_demand_paging(pt_uffd);
+       if (test->uffd_data_handler)
+               uffd_stop_demand_paging(data_uffd);
+       free(pt_args.copy);
+       free(data_args.copy);
+ }
+ static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+       TEST_FAIL("There was no UFFD fault expected.");
+       return -1;
+ }
+ /* Returns false if the test should be skipped. */
+ static bool punch_hole_in_backing_store(struct kvm_vm *vm,
+                                       struct userspace_mem_region *region)
+ {
+       void *hva = (void *)region->region.userspace_addr;
+       uint64_t paging_size = region->region.memory_size;
+       int ret, fd = region->fd;
+       if (fd != -1) {
+               ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                               0, paging_size);
+               TEST_ASSERT(ret == 0, "fallocate failed\n");
+       } else {
+               ret = madvise(hva, paging_size, MADV_DONTNEED);
+               TEST_ASSERT(ret == 0, "madvise failed\n");
+       }
+       return true;
+ }
+ static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run)
+ {
+       struct userspace_mem_region *region;
+       void *hva;
+       region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+       hva = (void *)region->region.userspace_addr;
+       ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr);
+       memcpy(hva, run->mmio.data, run->mmio.len);
+       events.mmio_exits += 1;
+ }
+ static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run)
+ {
+       uint64_t data;
+       memcpy(&data, run->mmio.data, sizeof(data));
+       pr_debug("addr=%lld len=%d w=%d data=%lx\n",
+                run->mmio.phys_addr, run->mmio.len,
+                run->mmio.is_write, data);
+       TEST_FAIL("There was no MMIO exit expected.");
+ }
+ static bool check_write_in_dirty_log(struct kvm_vm *vm,
+                                    struct userspace_mem_region *region,
+                                    uint64_t host_pg_nr)
+ {
+       unsigned long *bmap;
+       bool first_page_dirty;
+       uint64_t size = region->region.memory_size;
+       /* getpagesize() is not always equal to vm->page_size */
+       bmap = bitmap_zalloc(size / getpagesize());
+       kvm_vm_get_dirty_log(vm, region->region.slot, bmap);
+       first_page_dirty = test_bit(host_pg_nr, bmap);
+       free(bmap);
+       return first_page_dirty;
+ }
+ /* Returns true to continue the test, and false if it should be skipped. */
+ static bool handle_cmd(struct kvm_vm *vm, int cmd)
+ {
+       struct userspace_mem_region *data_region, *pt_region;
+       bool continue_test = true;
+       data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+       pt_region = vm_get_mem_region(vm, MEM_REGION_PT);
+       if (cmd == CMD_SKIP_TEST)
+               continue_test = false;
+       if (cmd & CMD_HOLE_PT)
+               continue_test = punch_hole_in_backing_store(vm, pt_region);
+       if (cmd & CMD_HOLE_DATA)
+               continue_test = punch_hole_in_backing_store(vm, data_region);
+       if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG)
+               TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0),
+                           "Missing write in dirty log");
+       if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG)
+               TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0),
+                           "Missing s1ptw write in dirty log");
+       if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG)
+               TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0),
+                           "Unexpected write in dirty log");
+       if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG)
+               TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0),
+                           "Unexpected s1ptw write in dirty log");
+       return continue_test;
+ }
+ void fail_vcpu_run_no_handler(int ret)
+ {
+       TEST_FAIL("Unexpected vcpu run failure\n");
+ }
+ void fail_vcpu_run_mmio_no_syndrome_handler(int ret)
+ {
+       TEST_ASSERT(errno == ENOSYS,
+                   "The mmio handler should have returned not implemented.");
+       events.fail_vcpu_runs += 1;
+ }
+ typedef uint32_t aarch64_insn_t;
+ extern aarch64_insn_t __exec_test[2];
+ noinline void __return_0x77(void)
+ {
+       asm volatile("__exec_test: mov x0, #0x77\n"
+                    "ret\n");
+ }
+ /*
+  * Note that this function runs on the host before the test VM starts: there's
+  * no need to sync the D$ and I$ caches.
+  */
+ static void load_exec_code_for_test(struct kvm_vm *vm)
+ {
+       uint64_t *code;
+       struct userspace_mem_region *region;
+       void *hva;
+       region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+       hva = (void *)region->region.userspace_addr;
+       assert(TEST_EXEC_GVA > TEST_GVA);
+       code = hva + TEST_EXEC_GVA - TEST_GVA;
+       memcpy(code, __exec_test, sizeof(__exec_test));
+ }
+ static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+                                struct test_desc *test)
+ {
+       vm_init_descriptor_tables(vm);
+       vcpu_init_descriptor_tables(vcpu);
+       vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+                               ESR_EC_DABT, no_dabt_handler);
+       vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+                               ESR_EC_IABT, no_iabt_handler);
+ }
+ static void setup_gva_maps(struct kvm_vm *vm)
+ {
+       struct userspace_mem_region *region;
+       uint64_t pte_gpa;
+       region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+       /* Map TEST_GVA first. This will install a new PTE. */
+       virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr);
+       /* Then map TEST_PTE_GVA to the above PTE. */
+       pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA));
+       virt_pg_map(vm, TEST_PTE_GVA, pte_gpa);
+ }
+ enum pf_test_memslots {
+       CODE_AND_DATA_MEMSLOT,
+       PAGE_TABLE_MEMSLOT,
+       TEST_DATA_MEMSLOT,
+ };
+ /*
+  * Create a memslot for code and data at pfn=0, and test-data and PT ones
+  * at max_gfn.
+  */
+ static void setup_memslots(struct kvm_vm *vm, struct test_params *p)
+ {
+       uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type);
+       uint64_t guest_page_size = vm->page_size;
+       uint64_t max_gfn = vm_compute_max_gfn(vm);
+       /* Enough for 2M of code when using 4K guest pages. */
+       uint64_t code_npages = 512;
+       uint64_t pt_size, data_size, data_gpa;
+       /*
+        * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using
+        * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs.  That's 13
+        * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use
+        * twice that just in case.
+        */
+       pt_size = 26 * guest_page_size;
+       /* memslot sizes and gpa's must be aligned to the backing page size */
+       pt_size = align_up(pt_size, backing_src_pagesz);
+       data_size = align_up(guest_page_size, backing_src_pagesz);
+       data_gpa = (max_gfn * guest_page_size) - data_size;
+       data_gpa = align_down(data_gpa, backing_src_pagesz);
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0,
+                                   CODE_AND_DATA_MEMSLOT, code_npages, 0);
+       vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT;
+       vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT;
+       vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size,
+                                   PAGE_TABLE_MEMSLOT, pt_size / guest_page_size,
+                                   p->test_desc->pt_memslot_flags);
+       vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT;
+       vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT,
+                                   data_size / guest_page_size,
+                                   p->test_desc->data_memslot_flags);
+       vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
+ }
++static void setup_ucall(struct kvm_vm *vm)
++{
++      struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
++
++      ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size);
++}
++
+ static void setup_default_handlers(struct test_desc *test)
+ {
+       if (!test->mmio_handler)
+               test->mmio_handler = mmio_no_handler;
+       if (!test->fail_vcpu_run_handler)
+               test->fail_vcpu_run_handler = fail_vcpu_run_no_handler;
+ }
+ static void check_event_counts(struct test_desc *test)
+ {
+       ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults);
+       ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits);
+       ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs);
+ }
+ static void print_test_banner(enum vm_guest_mode mode, struct test_params *p)
+ {
+       struct test_desc *test = p->test_desc;
+       pr_debug("Test: %s\n", test->name);
+       pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+       pr_debug("Testing memory backing src type: %s\n",
+                vm_mem_backing_src_alias(p->src_type)->name);
+ }
+ static void reset_event_counts(void)
+ {
+       memset(&events, 0, sizeof(events));
+ }
+ /*
+  * This function either succeeds, skips the test (after setting test->skip), or
+  * fails with a TEST_FAIL that aborts all tests.
+  */
+ static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+                         struct test_desc *test)
+ {
+       struct kvm_run *run;
+       struct ucall uc;
+       int ret;
+       run = vcpu->run;
+       for (;;) {
+               ret = _vcpu_run(vcpu);
+               if (ret) {
+                       test->fail_vcpu_run_handler(ret);
+                       goto done;
+               }
+               switch (get_ucall(vcpu, &uc)) {
+               case UCALL_SYNC:
+                       if (!handle_cmd(vm, uc.args[1])) {
+                               test->skip = true;
+                               goto done;
+                       }
+                       break;
+               case UCALL_ABORT:
+                       REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
+                       break;
+               case UCALL_DONE:
+                       goto done;
+               case UCALL_NONE:
+                       if (run->exit_reason == KVM_EXIT_MMIO)
+                               test->mmio_handler(vm, run);
+                       break;
+               default:
+                       TEST_FAIL("Unknown ucall %lu", uc.cmd);
+               }
+       }
+ done:
+       pr_debug(test->skip ? "Skipped.\n" : "Done.\n");
+ }
+ static void run_test(enum vm_guest_mode mode, void *arg)
+ {
+       struct test_params *p = (struct test_params *)arg;
+       struct test_desc *test = p->test_desc;
+       struct kvm_vm *vm;
+       struct kvm_vcpu *vcpu;
+       struct uffd_desc *pt_uffd, *data_uffd;
+       print_test_banner(mode, p);
+       vm = ____vm_create(mode);
+       setup_memslots(vm, p);
+       kvm_vm_elf_load(vm, program_invocation_name);
++      setup_ucall(vm);
+       vcpu = vm_vcpu_add(vm, 0, guest_code);
+       setup_gva_maps(vm);
 -      ucall_uninit(vm);
+       reset_event_counts();
+       /*
+        * Set some code in the data memslot for the guest to execute (only
+        * applicable to the EXEC tests). This has to be done before
+        * setup_uffd() as that function copies the memslot data for the uffd
+        * handler.
+        */
+       load_exec_code_for_test(vm);
+       setup_uffd(vm, p, &pt_uffd, &data_uffd);
+       setup_abort_handlers(vm, vcpu, test);
+       setup_default_handlers(test);
+       vcpu_args_set(vcpu, 1, test);
+       vcpu_run_loop(vm, vcpu, test);
+       kvm_vm_free(vm);
+       free_uffd(test, pt_uffd, data_uffd);
+       /*
+        * Make sure we check the events after the uffd threads have exited,
+        * which means they updated their respective event counters.
+        */
+       if (!test->skip)
+               check_event_counts(test);
+ }
+ static void help(char *name)
+ {
+       puts("");
+       printf("usage: %s [-h] [-s mem-type]\n", name);
+       puts("");
+       guest_modes_help();
+       backing_src_help("-s");
+       puts("");
+ }
+ #define SNAME(s)                      #s
+ #define SCAT2(a, b)                   SNAME(a ## _ ## b)
+ #define SCAT3(a, b, c)                        SCAT2(a, SCAT2(b, c))
+ #define SCAT4(a, b, c, d)             SCAT2(a, SCAT3(b, c, d))
+ #define _CHECK(_test)                 _CHECK_##_test
+ #define _PREPARE(_test)                       _PREPARE_##_test
+ #define _PREPARE_guest_read64         NULL
+ #define _PREPARE_guest_ld_preidx      NULL
+ #define _PREPARE_guest_write64                NULL
+ #define _PREPARE_guest_st_preidx      NULL
+ #define _PREPARE_guest_exec           NULL
+ #define _PREPARE_guest_at             NULL
+ #define _PREPARE_guest_dc_zva         guest_check_dc_zva
+ #define _PREPARE_guest_cas            guest_check_lse
+ /* With or without access flag checks */
+ #define _PREPARE_with_af              guest_set_ha, guest_clear_pte_af
+ #define _PREPARE_no_af                        NULL
+ #define _CHECK_with_af                        guest_check_pte_af
+ #define _CHECK_no_af                  NULL
+ /* Performs an access and checks that no faults were triggered. */
+ #define TEST_ACCESS(_access, _with_af, _mark_cmd)                             \
+ {                                                                             \
+       .name                   = SCAT3(_access, _with_af, #_mark_cmd),         \
+       .guest_prepare          = { _PREPARE(_with_af),                         \
+                                   _PREPARE(_access) },                        \
+       .mem_mark_cmd           = _mark_cmd,                                    \
+       .guest_test             = _access,                                      \
+       .guest_test_check       = { _CHECK(_with_af) },                         \
+       .expected_events        = { 0 },                                        \
+ }
+ #define TEST_UFFD(_access, _with_af, _mark_cmd,                                       \
+                 _uffd_data_handler, _uffd_pt_handler, _uffd_faults)           \
+ {                                                                             \
+       .name                   = SCAT4(uffd, _access, _with_af, #_mark_cmd),   \
+       .guest_prepare          = { _PREPARE(_with_af),                         \
+                                   _PREPARE(_access) },                        \
+       .guest_test             = _access,                                      \
+       .mem_mark_cmd           = _mark_cmd,                                    \
+       .guest_test_check       = { _CHECK(_with_af) },                         \
+       .uffd_data_handler      = _uffd_data_handler,                           \
+       .uffd_pt_handler        = _uffd_pt_handler,                             \
+       .expected_events        = { .uffd_faults = _uffd_faults, },             \
+ }
+ #define TEST_DIRTY_LOG(_access, _with_af, _test_check)                                \
+ {                                                                             \
+       .name                   = SCAT3(dirty_log, _access, _with_af),          \
+       .data_memslot_flags     = KVM_MEM_LOG_DIRTY_PAGES,                      \
+       .pt_memslot_flags       = KVM_MEM_LOG_DIRTY_PAGES,                      \
+       .guest_prepare          = { _PREPARE(_with_af),                         \
+                                   _PREPARE(_access) },                        \
+       .guest_test             = _access,                                      \
+       .guest_test_check       = { _CHECK(_with_af), _test_check,              \
+                                   guest_check_s1ptw_wr_in_dirty_log},         \
+       .expected_events        = { 0 },                                        \
+ }
+ #define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler,                \
+                               _uffd_faults, _test_check)                      \
+ {                                                                             \
+       .name                   = SCAT3(uffd_and_dirty_log, _access, _with_af), \
+       .data_memslot_flags     = KVM_MEM_LOG_DIRTY_PAGES,                      \
+       .pt_memslot_flags       = KVM_MEM_LOG_DIRTY_PAGES,                      \
+       .guest_prepare          = { _PREPARE(_with_af),                         \
+                                   _PREPARE(_access) },                        \
+       .guest_test             = _access,                                      \
+       .mem_mark_cmd           = CMD_HOLE_DATA | CMD_HOLE_PT,                  \
+       .guest_test_check       = { _CHECK(_with_af), _test_check },            \
+       .uffd_data_handler      = _uffd_data_handler,                           \
+       .uffd_pt_handler        = uffd_pt_write_handler,                        \
+       .expected_events        = { .uffd_faults = _uffd_faults, },             \
+ }
+ #define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits)                  \
+ {                                                                             \
+       .name                   = SCAT3(ro_memslot, _access, _with_af),         \
+       .data_memslot_flags     = KVM_MEM_READONLY,                             \
+       .guest_prepare          = { _PREPARE(_access) },                        \
+       .guest_test             = _access,                                      \
+       .mmio_handler           = _mmio_handler,                                \
+       .expected_events        = { .mmio_exits = _mmio_exits },                \
+ }
+ #define TEST_RO_MEMSLOT_NO_SYNDROME(_access)                                  \
+ {                                                                             \
+       .name                   = SCAT2(ro_memslot_no_syndrome, _access),       \
+       .data_memslot_flags     = KVM_MEM_READONLY,                             \
+       .guest_test             = _access,                                      \
+       .fail_vcpu_run_handler  = fail_vcpu_run_mmio_no_syndrome_handler,       \
+       .expected_events        = { .fail_vcpu_runs = 1 },                      \
+ }
+ #define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits,    \
+                                     _test_check)                              \
+ {                                                                             \
+       .name                   = SCAT3(ro_memslot, _access, _with_af),         \
+       .data_memslot_flags     = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES,   \
+       .pt_memslot_flags       = KVM_MEM_LOG_DIRTY_PAGES,                      \
+       .guest_prepare          = { _PREPARE(_access) },                        \
+       .guest_test             = _access,                                      \
+       .guest_test_check       = { _test_check },                              \
+       .mmio_handler           = _mmio_handler,                                \
+       .expected_events        = { .mmio_exits = _mmio_exits},                 \
+ }
+ #define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check)               \
+ {                                                                             \
+       .name                   = SCAT2(ro_memslot_no_syn_and_dlog, _access),   \
+       .data_memslot_flags     = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES,   \
+       .pt_memslot_flags       = KVM_MEM_LOG_DIRTY_PAGES,                      \
+       .guest_test             = _access,                                      \
+       .guest_test_check       = { _test_check },                              \
+       .fail_vcpu_run_handler  = fail_vcpu_run_mmio_no_syndrome_handler,       \
+       .expected_events        = { .fail_vcpu_runs = 1 },                      \
+ }
+ #define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits,         \
+                                _uffd_data_handler, _uffd_faults)              \
+ {                                                                             \
+       .name                   = SCAT2(ro_memslot_uffd, _access),              \
+       .data_memslot_flags     = KVM_MEM_READONLY,                             \
+       .mem_mark_cmd           = CMD_HOLE_DATA | CMD_HOLE_PT,                  \
+       .guest_prepare          = { _PREPARE(_access) },                        \
+       .guest_test             = _access,                                      \
+       .uffd_data_handler      = _uffd_data_handler,                           \
+       .uffd_pt_handler        = uffd_pt_write_handler,                        \
+       .mmio_handler           = _mmio_handler,                                \
+       .expected_events        = { .mmio_exits = _mmio_exits,                  \
+                                   .uffd_faults = _uffd_faults },              \
+ }
+ #define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler,     \
+                                            _uffd_faults)                      \
+ {                                                                             \
+       .name                   = SCAT2(ro_memslot_no_syndrome, _access),       \
+       .data_memslot_flags     = KVM_MEM_READONLY,                             \
+       .mem_mark_cmd           = CMD_HOLE_DATA | CMD_HOLE_PT,                  \
+       .guest_test             = _access,                                      \
+       .uffd_data_handler      = _uffd_data_handler,                           \
+       .uffd_pt_handler        = uffd_pt_write_handler,                        \
+       .fail_vcpu_run_handler  = fail_vcpu_run_mmio_no_syndrome_handler,       \
+       .expected_events        = { .fail_vcpu_runs = 1,                        \
+                                   .uffd_faults = _uffd_faults },              \
+ }
+ static struct test_desc tests[] = {
+       /* Check that HW is setting the Access Flag (AF) (sanity checks). */
+       TEST_ACCESS(guest_read64, with_af, CMD_NONE),
+       TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE),
+       TEST_ACCESS(guest_cas, with_af, CMD_NONE),
+       TEST_ACCESS(guest_write64, with_af, CMD_NONE),
+       TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE),
+       TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE),
+       TEST_ACCESS(guest_exec, with_af, CMD_NONE),
+       /*
+        * Punch a hole in the data backing store, and then try multiple
+        * accesses: reads should return zeroes, and writes should
+        * re-populate the page. Moreover, the test also checks that no
+        * exception was generated in the guest.  Note that this
+        * reading/writing behavior is the same as reading/writing a
+        * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from
+        * userspace.
+        */
+       TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA),
+       TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA),
+       TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA),
+       TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA),
+       TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA),
+       TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA),
+       TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA),
+       /*
+        * Punch holes in the data and PT backing stores and mark them for
+        * userfaultfd handling. This should result in 2 faults: the access
+        * on the data backing store, and its respective S1 page table walk
+        * (S1PTW).
+        */
+       TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_read_handler, uffd_pt_write_handler, 2),
+       /* no_af should also lead to a PT write. */
+       TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_read_handler, uffd_pt_write_handler, 2),
+       /* Note that cas invokes the read handler. */
+       TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_read_handler, uffd_pt_write_handler, 2),
+       /*
+        * Can't test guest_at with_af as it's IMPDEF whether the AF is set.
+        * The S1PTW fault should still be marked as a write.
+        */
+       TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_read_handler, uffd_pt_write_handler, 1),
+       TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_read_handler, uffd_pt_write_handler, 2),
+       TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_write_handler, uffd_pt_write_handler, 2),
+       TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_write_handler, uffd_pt_write_handler, 2),
+       TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_write_handler, uffd_pt_write_handler, 2),
+       TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+                 uffd_data_read_handler, uffd_pt_write_handler, 2),
+       /*
+        * Try accesses when the data and PT memory regions are both
+        * tracked for dirty logging.
+        */
+       TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log),
+       /* no_af should also lead to a PT write. */
+       TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log),
+       TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log),
+       /*
+        * Access when the data and PT memory regions are both marked for
+        * dirty logging and UFFD at the same time. The expected result is
+        * that writes should mark the dirty log and trigger a userfaultfd
+        * write fault.  Reads/execs should result in a read userfaultfd
+        * fault, and nothing in the dirty log.  Any S1PTW should result in
+        * a write in the dirty log and a userfaultfd write.
+        */
+       TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2,
+                               guest_check_no_write_in_dirty_log),
+       /* no_af should also lead to a PT write. */
+       TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2,
+                               guest_check_no_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler,
+                               2, guest_check_no_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1,
+                               guest_check_no_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2,
+                               guest_check_no_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler,
+                               2, guest_check_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2,
+                               guest_check_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler,
+                               2, guest_check_write_in_dirty_log),
+       TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af,
+                               uffd_data_write_handler, 2,
+                               guest_check_write_in_dirty_log),
+       /*
+        * Try accesses when the data memory region is marked read-only
+        * (with KVM_MEM_READONLY). Writes with a syndrome result in an
+        * MMIO exit, writes with no syndrome (e.g., CAS) result in a
+        * failed vcpu run, and reads/execs with and without syndromes do
+        * not fault.
+        */
+       TEST_RO_MEMSLOT(guest_read64, 0, 0),
+       TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0),
+       TEST_RO_MEMSLOT(guest_at, 0, 0),
+       TEST_RO_MEMSLOT(guest_exec, 0, 0),
+       TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1),
+       TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva),
+       TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas),
+       TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx),
+       /*
+        * Access when the data region is both read-only and marked
+        * for dirty logging at the same time. The expected result is that
+        * for writes there should be no write in the dirty log. The
+        * readonly handling is the same as if the memslot was not marked
+        * for dirty logging: writes with a syndrome result in an MMIO
+        * exit, and writes with no syndrome result in a failed vcpu run.
+        */
+       TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0,
+                                     guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0,
+                                     guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0,
+                                     guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0,
+                                     guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler,
+                                     1, guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva,
+                                                 guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas,
+                                                 guest_check_no_write_in_dirty_log),
+       TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx,
+                                                 guest_check_no_write_in_dirty_log),
+       /*
+        * Access when the data region is both read-only and punched with
+        * holes tracked with userfaultfd.  The expected result is the
+        * union of both userfaultfd and read-only behaviors. For example,
+        * write accesses result in a userfaultfd write fault and an MMIO
+        * exit.  Writes with no syndrome result in a failed vcpu run and
+        * no userfaultfd write fault. Reads result in userfaultfd getting
+        * triggered.
+        */
+       TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0,
+                                uffd_data_read_handler, 2),
+       TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0,
+                                uffd_data_read_handler, 2),
+       TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0,
+                                uffd_no_handler, 1),
+       TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0,
+                                uffd_data_read_handler, 2),
+       TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1,
+                                uffd_data_write_handler, 2),
+       TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas,
+                                            uffd_data_read_handler, 2),
+       TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva,
+                                            uffd_no_handler, 1),
+       TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx,
+                                            uffd_no_handler, 1),
+       { 0 }
+ };
+ static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type)
+ {
+       struct test_desc *t;
+       for (t = &tests[0]; t->name; t++) {
+               if (t->skip)
+                       continue;
+               struct test_params p = {
+                       .src_type = src_type,
+                       .test_desc = t,
+               };
+               for_each_guest_mode(run_test, &p);
+       }
+ }
+ int main(int argc, char *argv[])
+ {
+       enum vm_mem_backing_src_type src_type;
+       int opt;
+       setbuf(stdout, NULL);
+       src_type = DEFAULT_VM_MEM_SRC;
+       while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
+               switch (opt) {
+               case 'm':
+                       guest_modes_cmdline(optarg);
+                       break;
+               case 's':
+                       src_type = parse_backing_src_type(optarg);
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       exit(0);
+               }
+       }
+       for_each_test_and_guest_mode(src_type);
+       return 0;
+ }
index 02d3587cab0a3549c3de4918f2db528d9e724d9a,942370d5739253a42571cdfccbea5c477a7d4640..57a16371e9c293d987f7f5cf86a3e27c929aa8de
@@@ -211,7 -208,7 +208,7 @@@ static bool spin_wait_for_next_iteratio
        int last_iteration = *current_iteration;
  
        do {
-               if (READ_ONCE(done))
 -              if (READ_ONCE(perf_test_args.stop_vcpus))
++              if (READ_ONCE(memstress_args.stop_vcpus))
                        return false;
  
                *current_iteration = READ_ONCE(iteration);
@@@ -321,11 -318,8 +318,8 @@@ static void run_test(enum vm_guest_mod
        mark_memory_idle(vm, nr_vcpus);
        access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");
  
-       /* Set done to signal the vCPU threads to exit */
-       done = true;
 -      perf_test_join_vcpu_threads(nr_vcpus);
 -      perf_test_destroy_vm(vm);
 +      memstress_join_vcpu_threads(nr_vcpus);
 +      memstress_destroy_vm(vm);
  }
  
  static void help(char *name)
index 3a977ddf07b2024bf04d41d0c72e90b1350788a5,8e1fe4ffcccdfa60d7d833e6e70418151a2a886a..b0e1fc4de9e29ed551a27f120913def718fc0331
@@@ -20,8 -20,9 +20,9 @@@
  
  #include "kvm_util.h"
  #include "test_util.h"
 -#include "perf_test_util.h"
 +#include "memstress.h"
  #include "guest_modes.h"
+ #include "userfaultfd_util.h"
  
  #ifdef __NR_userfaultfd
  
@@@ -270,22 -129,13 +129,13 @@@ static void prefault_mem(void *alias, u
  static void run_test(enum vm_guest_mode mode, void *arg)
  {
        struct test_params *p = arg;
-       pthread_t *uffd_handler_threads = NULL;
-       struct uffd_handler_args *uffd_args = NULL;
+       struct uffd_desc **uffd_descs = NULL;
        struct timespec start;
        struct timespec ts_diff;
-       int *pipefds = NULL;
        struct kvm_vm *vm;
-       int r, i;
+       int i;
  
 -      vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
 +      vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
                                 p->src_type, p->partition_vcpu_memory_access);
  
        demand_paging_size = get_backing_src_pagesz(p->src_type);
        memset(guest_data_prototype, 0xAB, demand_paging_size);
  
        if (p->uffd_mode) {
-               uffd_handler_threads =
-                       malloc(nr_vcpus * sizeof(*uffd_handler_threads));
-               TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
-               uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
-               TEST_ASSERT(uffd_args, "Memory allocation failed");
-               pipefds = malloc(sizeof(int) * nr_vcpus * 2);
-               TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+               uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *));
+               TEST_ASSERT(uffd_descs, "Memory allocation failed");
  
                for (i = 0; i < nr_vcpus; i++) {
 -                      struct perf_test_vcpu_args *vcpu_args;
 +                      struct memstress_vcpu_args *vcpu_args;
                        void *vcpu_hva;
                        void *vcpu_alias;
  
                        vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
                        vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
  
 -                              vcpu_args->pages * perf_test_args.guest_page_size);
+                       prefault_mem(vcpu_alias,
++                              vcpu_args->pages * memstress_args.guest_page_size);
                        /*
                         * Set up user fault fd to handle demand paging
                         * requests.
                         */
-                       r = pipe2(&pipefds[i * 2],
-                                 O_CLOEXEC | O_NONBLOCK);
-                       TEST_ASSERT(!r, "Failed to set up pipefd");
-                       setup_demand_paging(vm, &uffd_handler_threads[i],
-                                           pipefds[i * 2], p->uffd_mode,
-                                           p->uffd_delay, &uffd_args[i],
-                                           vcpu_hva, vcpu_alias,
-                                           vcpu_args->pages * memstress_args.guest_page_size);
+                       uffd_descs[i] = uffd_setup_demand_paging(
+                               p->uffd_mode, p->uffd_delay, vcpu_hva,
 -                              vcpu_args->pages * perf_test_args.guest_page_size,
++                              vcpu_args->pages * memstress_args.guest_page_size,
+                               &handle_uffd_page_request);
                }
        }
  
        pr_info("Total guest execution time: %ld.%.9lds\n",
                ts_diff.tv_sec, ts_diff.tv_nsec);
        pr_info("Overall demand paging rate: %f pgs/sec\n",
 -              perf_test_args.vcpu_args[0].pages * nr_vcpus /
 +              memstress_args.vcpu_args[0].pages * nr_vcpus /
                ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
  
 -      perf_test_destroy_vm(vm);
 +      memstress_destroy_vm(vm);
  
        free(guest_data_prototype);
-       if (p->uffd_mode) {
-               free(uffd_handler_threads);
-               free(uffd_args);
-               free(pipefds);
-       }
+       if (p->uffd_mode)
+               free(uffd_descs);
  }
  
  static void help(char *name)
index c7685c7038ff0ce73739e5d1e2afebf0c3c68060,b0da75af1ff337cf1c7beefb80d163b4ea9a58cc..37500c92dd0a608f1451c30463bd5b2b27c332e3
@@@ -385,9 -406,12 +408,13 @@@ void vm_mem_region_set_flags(struct kvm
  void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
  void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
  struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
 +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
  vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
+ vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+                           enum kvm_mem_region_type type);
  vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
+ vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm,
+                                enum kvm_mem_region_type type);
  vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm);
  
  void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
index bbd2a302df1003707bac3faec2c119237d755659,0000000000000000000000000000000000000000..72e3e358ef7bd35d8c4ce910961090d593a30934
mode 100644,000000..100644
--- /dev/null
@@@ -1,72 -1,0 +1,75 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * tools/testing/selftests/kvm/include/memstress.h
 + *
 + * Copyright (C) 2020, Google LLC.
 + */
 +
 +#ifndef SELFTEST_KVM_MEMSTRESS_H
 +#define SELFTEST_KVM_MEMSTRESS_H
 +
 +#include <pthread.h>
 +
 +#include "kvm_util.h"
 +
 +/* Default guest test virtual memory offset */
 +#define DEFAULT_GUEST_TEST_MEM                0xc0000000
 +
 +#define DEFAULT_PER_VCPU_MEM_SIZE     (1 << 30) /* 1G */
 +
 +#define MEMSTRESS_MEM_SLOT_INDEX      1
 +
 +struct memstress_vcpu_args {
 +      uint64_t gpa;
 +      uint64_t gva;
 +      uint64_t pages;
 +
 +      /* Only used by the host userspace part of the vCPU thread */
 +      struct kvm_vcpu *vcpu;
 +      int vcpu_idx;
 +};
 +
 +struct memstress_args {
 +      struct kvm_vm *vm;
 +      /* The starting address and size of the guest test region. */
 +      uint64_t gpa;
 +      uint64_t size;
 +      uint64_t guest_page_size;
 +      uint32_t random_seed;
 +      uint32_t write_percent;
 +
 +      /* Run vCPUs in L2 instead of L1, if the architecture supports it. */
 +      bool nested;
 +      /* Randomize which pages are accessed by the guest. */
 +      bool random_access;
 +      /* True if all vCPUs are pinned to pCPUs */
 +      bool pin_vcpus;
 +      /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */
 +      uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS];
 +
++      /* Test is done, stop running vCPUs. */
++      bool stop_vcpus;
++
 +      struct memstress_vcpu_args vcpu_args[KVM_MAX_VCPUS];
 +};
 +
 +extern struct memstress_args memstress_args;
 +
 +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
 +                                 uint64_t vcpu_memory_bytes, int slots,
 +                                 enum vm_mem_backing_src_type backing_src,
 +                                 bool partition_vcpu_memory_access);
 +void memstress_destroy_vm(struct kvm_vm *vm);
 +
 +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent);
 +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed);
 +void memstress_set_random_access(struct kvm_vm *vm, bool random_access);
 +
 +void memstress_start_vcpu_threads(int nr_vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *));
 +void memstress_join_vcpu_threads(int nr_vcpus);
 +void memstress_guest_code(uint32_t vcpu_idx);
 +
 +uint64_t memstress_nested_pages(int nr_vcpus);
 +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]);
 +
 +#endif /* SELFTEST_KVM_MEMSTRESS_H */
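
A rough end-to-end sketch of how a test drives this API (illustrative only; handle_vcpu, run_test and the chosen parameters are hypothetical, and error handling is omitted):

    #include "kvm_util.h"
    #include "memstress.h"

    static void handle_vcpu(struct memstress_vcpu_args *vcpu_args)
    {
            /* Run the vCPU until memstress_join_vcpu_threads() sets stop_vcpus. */
            while (!READ_ONCE(memstress_args.stop_vcpus))
                    vcpu_run(vcpu_args->vcpu);
    }

    static void run_test(enum vm_guest_mode mode, int nr_vcpus,
                         uint64_t vcpu_mem_bytes)
    {
            struct kvm_vm *vm;

            vm = memstress_create_vm(mode, nr_vcpus, vcpu_mem_bytes,
                                     1 /* slots */, VM_MEM_SRC_ANONYMOUS,
                                     true /* partition memory per vCPU */);
            memstress_set_write_percent(vm, 50);
            memstress_start_vcpu_threads(nr_vcpus, handle_vcpu);
            /* The workload of interest would be measured here. */
            memstress_join_vcpu_threads(nr_vcpus);
            memstress_destroy_vm(vm);
    }
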
Simple merge
index 1d26a21601785357b5980504fee8953da2315641,e3daa97ab0f4a01edb88718fe0901e2d496f1794..e9607eb089bee551a9a214465cc69e33fe2c799a
@@@ -335,10 -328,17 +329,18 @@@ struct kvm_vm *__vm_create(enum vm_gues
  {
        uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
                                                 nr_extra_pages);
 +      struct userspace_mem_region *slot0;
        struct kvm_vm *vm;
+       int i;
+       pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
+                vm_guest_mode_string(mode), nr_pages);
  
-       vm = ____vm_create(mode, nr_pages);
+       vm = ____vm_create(mode);
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
+       for (i = 0; i < NR_MEM_REGIONS; i++)
+               vm->memslots[i] = 0;
  
        kvm_vm_elf_load(vm, program_invocation_name);
  
index 2de8a5d527b3a7af886960b748ca5640b8cf85bb,0000000000000000000000000000000000000000..5f1d3173c238cb7d570b3c391945276a316b8336
mode 100644,000000..100644
--- /dev/null
@@@ -1,319 -1,0 +1,322 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * Copyright (C) 2020, Google LLC.
 + */
 +#define _GNU_SOURCE
 +
 +#include <inttypes.h>
 +
 +#include "kvm_util.h"
 +#include "memstress.h"
 +#include "processor.h"
 +
 +struct memstress_args memstress_args;
 +
 +/*
 + * Guest virtual memory offset of the testing memory slot.
 + * Must not conflict with identity mapped test code.
 + */
 +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
 +
 +struct vcpu_thread {
 +      /* The index of the vCPU. */
 +      int vcpu_idx;
 +
 +      /* The pthread backing the vCPU. */
 +      pthread_t thread;
 +
 +      /* Set to true once the vCPU thread is up and running. */
 +      bool running;
 +};
 +
 +/* The vCPU threads involved in this test. */
 +static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
 +
 +/* The function run by each vCPU thread, as provided by the test. */
 +static void (*vcpu_thread_fn)(struct memstress_vcpu_args *);
 +
 +/* Set to true once all vCPU threads are up and running. */
 +static bool all_vcpu_threads_running;
 +
 +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 +
 +/*
 + * Continuously write to the first 8 bytes of each page in the
 + * specified region.
 + */
 +void memstress_guest_code(uint32_t vcpu_idx)
 +{
 +      struct memstress_args *args = &memstress_args;
 +      struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx];
 +      struct guest_random_state rand_state;
 +      uint64_t gva;
 +      uint64_t pages;
 +      uint64_t addr;
 +      uint64_t page;
 +      int i;
 +
 +      rand_state = new_guest_random_state(args->random_seed + vcpu_idx);
 +
 +      gva = vcpu_args->gva;
 +      pages = vcpu_args->pages;
 +
 +      /* Make sure vCPU args data structure is not corrupt. */
 +      GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx);
 +
 +      while (true) {
 +              for (i = 0; i < pages; i++) {
 +                      if (args->random_access)
 +                              page = guest_random_u32(&rand_state) % pages;
 +                      else
 +                              page = i;
 +
 +                      addr = gva + (page * args->guest_page_size);
 +
 +                      if (guest_random_u32(&rand_state) % 100 < args->write_percent)
 +                              *(uint64_t *)addr = 0x0123456789ABCDEF;
 +                      else
 +                              READ_ONCE(*(uint64_t *)addr);
 +              }
 +
 +              GUEST_SYNC(1);
 +      }
 +}
 +
 +void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
 +                         struct kvm_vcpu *vcpus[],
 +                         uint64_t vcpu_memory_bytes,
 +                         bool partition_vcpu_memory_access)
 +{
 +      struct memstress_args *args = &memstress_args;
 +      struct memstress_vcpu_args *vcpu_args;
 +      int i;
 +
 +      for (i = 0; i < nr_vcpus; i++) {
 +              vcpu_args = &args->vcpu_args[i];
 +
 +              vcpu_args->vcpu = vcpus[i];
 +              vcpu_args->vcpu_idx = i;
 +
 +              if (partition_vcpu_memory_access) {
 +                      vcpu_args->gva = guest_test_virt_mem +
 +                                       (i * vcpu_memory_bytes);
 +                      vcpu_args->pages = vcpu_memory_bytes /
 +                                         args->guest_page_size;
 +                      vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes);
 +              } else {
 +                      vcpu_args->gva = guest_test_virt_mem;
 +                      vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) /
 +                                         args->guest_page_size;
 +                      vcpu_args->gpa = args->gpa;
 +              }
 +
 +              vcpu_args_set(vcpus[i], 1, i);
 +
 +              pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
 +                       i, vcpu_args->gpa, vcpu_args->gpa +
 +                       (vcpu_args->pages * args->guest_page_size));
 +      }
 +}
 +
 +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
 +                                 uint64_t vcpu_memory_bytes, int slots,
 +                                 enum vm_mem_backing_src_type backing_src,
 +                                 bool partition_vcpu_memory_access)
 +{
 +      struct memstress_args *args = &memstress_args;
 +      struct kvm_vm *vm;
 +      uint64_t guest_num_pages, slot0_pages = 0;
 +      uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
 +      uint64_t region_end_gfn;
 +      int i;
 +
 +      pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
 +
 +      /* By default vCPUs will write to memory. */
 +      args->write_percent = 100;
 +
 +      /*
 +       * Snapshot the non-huge page size.  This is used by the guest code to
 +       * access/dirty pages at the logging granularity.
 +       */
 +      args->guest_page_size = vm_guest_mode_params[mode].page_size;
 +
 +      guest_num_pages = vm_adjust_num_guest_pages(mode,
 +                              (nr_vcpus * vcpu_memory_bytes) / args->guest_page_size);
 +
 +      TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0,
 +                  "Guest memory size is not host page size aligned.");
 +      TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0,
 +                  "Guest memory size is not guest page size aligned.");
 +      TEST_ASSERT(guest_num_pages % slots == 0,
 +                  "Guest memory cannot be evenly divided into %d slots.",
 +                  slots);
 +
 +      /*
 +       * If using nested, allocate extra pages for the nested page tables and
 +       * in-memory data structures.
 +       */
 +      if (args->nested)
 +              slot0_pages += memstress_nested_pages(nr_vcpus);
 +
 +      /*
 +       * Pass guest_num_pages to populate the page tables for test memory.
 +       * The memory is also added to memslot 0, but that's a benign side
 +       * effect as KVM allows aliasing HVAs in memslots.
 +       */
 +      vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages,
 +                                  memstress_guest_code, vcpus);
 +
 +      args->vm = vm;
 +
 +      /* Put the test region at the top of guest physical memory. */
 +      region_end_gfn = vm->max_gfn + 1;
 +
 +#ifdef __x86_64__
 +      /*
 +       * When running vCPUs in L2, restrict the test region to 48 bits to
 +       * avoid needing 5-level page tables to identity map L2.
 +       */
 +      if (args->nested)
 +              region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size);
 +#endif
 +      /*
 +       * If the test region would need more guest pages than the guest
 +       * address space can hold, the test cannot work, so assert early.
 +       */
 +      TEST_ASSERT(guest_num_pages < region_end_gfn,
 +                  "Requested more guest memory than address space allows.\n"
 +                  "    guest pages: %" PRIx64 " max gfn: %" PRIx64
 +                  " nr_vcpus: %d wss: %" PRIx64 "\n",
 +                  guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes);
 +
 +      args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size;
 +      args->gpa = align_down(args->gpa, backing_src_pagesz);
 +#ifdef __s390x__
 +      /* Align to 1M (segment size) */
 +      args->gpa = align_down(args->gpa, 1 << 20);
 +#endif
 +      args->size = guest_num_pages * args->guest_page_size;
 +      pr_info("guest physical test memory: [0x%lx, 0x%lx)\n",
 +              args->gpa, args->gpa + args->size);
 +
 +      /* Add extra memory slots for testing */
 +      for (i = 0; i < slots; i++) {
 +              uint64_t region_pages = guest_num_pages / slots;
 +              vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i;
 +
 +              vm_userspace_mem_region_add(vm, backing_src, region_start,
 +                                          MEMSTRESS_MEM_SLOT_INDEX + i,
 +                                          region_pages, 0);
 +      }
 +
 +      /* Map the test memory region into the guest's virtual address space. */
 +      virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages);
 +
 +      memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes,
 +                            partition_vcpu_memory_access);
 +
 +      if (args->nested) {
 +              pr_info("Configuring vCPUs to run in L2 (nested).\n");
 +              memstress_setup_nested(vm, nr_vcpus, vcpus);
 +      }
 +
 +      /* Export the shared variables to the guest. */
 +      sync_global_to_guest(vm, memstress_args);
 +
 +      return vm;
 +}
 +
 +void memstress_destroy_vm(struct kvm_vm *vm)
 +{
 +      kvm_vm_free(vm);
 +}
 +
 +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent)
 +{
 +      memstress_args.write_percent = write_percent;
 +      sync_global_to_guest(vm, memstress_args.write_percent);
 +}
 +
 +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed)
 +{
 +      memstress_args.random_seed = random_seed;
 +      sync_global_to_guest(vm, memstress_args.random_seed);
 +}
 +
 +void memstress_set_random_access(struct kvm_vm *vm, bool random_access)
 +{
 +      memstress_args.random_access = random_access;
 +      sync_global_to_guest(vm, memstress_args.random_access);
 +}
 +
 +uint64_t __weak memstress_nested_pages(int nr_vcpus)
 +{
 +      return 0;
 +}
 +
 +void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus)
 +{
 +      pr_info("%s() not supported on this architecture, skipping.\n", __func__);
 +      exit(KSFT_SKIP);
 +}
 +
 +static void *vcpu_thread_main(void *data)
 +{
 +      struct vcpu_thread *vcpu = data;
 +      int vcpu_idx = vcpu->vcpu_idx;
 +
 +      if (memstress_args.pin_vcpus)
 +              kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]);
 +
 +      WRITE_ONCE(vcpu->running, true);
 +
 +      /*
 +       * Wait for all vCPU threads to be up and running before calling the test-
 +       * provided vCPU thread function. This prevents thread creation (which
 +       * requires taking the mmap_sem in write mode) from interfering with the
 +       * guest faulting in its memory.
 +       */
 +      while (!READ_ONCE(all_vcpu_threads_running))
 +              ;
 +
 +      vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]);
 +
 +      return NULL;
 +}
 +
 +void memstress_start_vcpu_threads(int nr_vcpus,
 +                                void (*vcpu_fn)(struct memstress_vcpu_args *))
 +{
 +      int i;
 +
 +      vcpu_thread_fn = vcpu_fn;
 +      WRITE_ONCE(all_vcpu_threads_running, false);
++      WRITE_ONCE(memstress_args.stop_vcpus, false);
 +
 +      for (i = 0; i < nr_vcpus; i++) {
 +              struct vcpu_thread *vcpu = &vcpu_threads[i];
 +
 +              vcpu->vcpu_idx = i;
 +              WRITE_ONCE(vcpu->running, false);
 +
 +              pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
 +      }
 +
 +      for (i = 0; i < nr_vcpus; i++) {
 +              while (!READ_ONCE(vcpu_threads[i].running))
 +                      ;
 +      }
 +
 +      WRITE_ONCE(all_vcpu_threads_running, true);
 +}
 +
 +void memstress_join_vcpu_threads(int nr_vcpus)
 +{
 +      int i;
 +
++      WRITE_ONCE(memstress_args.stop_vcpus, true);
++
 +      for (i = 0; i < nr_vcpus; i++)
 +              pthread_join(vcpu_threads[i].thread, NULL);
 +}
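
Pinning is opt-in: vcpu_thread_main() only calls kvm_pin_this_task_to_pcpu() when the pin map has been filled in. A hedged sketch of how a test could arrange that before starting the threads (the 1:1 mapping below is purely illustrative):

    /* Pin vCPU i to pCPU i before the worker threads are created. */
    for (i = 0; i < nr_vcpus; i++)
            memstress_args.vcpu_to_pcpu[i] = i;
    memstress_args.pin_vcpus = true;
    memstress_start_vcpu_threads(nr_vcpus, vcpu_worker);
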
index 0000000000000000000000000000000000000000,3b44846fc277e96ffc5fe66acbbdea6224be61b9..92cef20902f1f901abba2947de27e315694a6bd7
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,186 +1,186 @@@
 -#include "perf_test_util.h"
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  * KVM userfaultfd util
+  * Adapted from demand_paging_test.c
+  *
+  * Copyright (C) 2018, Red Hat, Inc.
+  * Copyright (C) 2019-2022 Google LLC
+  */
+ #define _GNU_SOURCE /* for pipe2 */
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <time.h>
+ #include <poll.h>
+ #include <pthread.h>
+ #include <linux/userfaultfd.h>
+ #include <sys/syscall.h>
+ #include "kvm_util.h"
+ #include "test_util.h"
++#include "memstress.h"
+ #include "userfaultfd_util.h"
+ #ifdef __NR_userfaultfd
+ static void *uffd_handler_thread_fn(void *arg)
+ {
+       struct uffd_desc *uffd_desc = (struct uffd_desc *)arg;
+       int uffd = uffd_desc->uffd;
+       int pipefd = uffd_desc->pipefds[0];
+       useconds_t delay = uffd_desc->delay;
+       int64_t pages = 0;
+       struct timespec start;
+       struct timespec ts_diff;
+       clock_gettime(CLOCK_MONOTONIC, &start);
+       while (1) {
+               struct uffd_msg msg;
+               struct pollfd pollfd[2];
+               char tmp_chr;
+               int r;
+               pollfd[0].fd = uffd;
+               pollfd[0].events = POLLIN;
+               pollfd[1].fd = pipefd;
+               pollfd[1].events = POLLIN;
+               r = poll(pollfd, 2, -1);
+               switch (r) {
+               case -1:
+                       pr_info("poll err");
+                       continue;
+               case 0:
+                       continue;
+               case 1:
+                       break;
+               default:
+                       pr_info("Polling uffd returned %d", r);
+                       return NULL;
+               }
+               if (pollfd[0].revents & POLLERR) {
+                       pr_info("uffd revents has POLLERR");
+                       return NULL;
+               }
+               if (pollfd[1].revents & POLLIN) {
+                       r = read(pollfd[1].fd, &tmp_chr, 1);
+                       TEST_ASSERT(r == 1,
+                                   "Error reading pipefd in UFFD thread\n");
+                       return NULL;
+               }
+               if (!(pollfd[0].revents & POLLIN))
+                       continue;
+               r = read(uffd, &msg, sizeof(msg));
+               if (r == -1) {
+                       if (errno == EAGAIN)
+                               continue;
+                       pr_info("Read of uffd got errno %d\n", errno);
+                       return NULL;
+               }
+               if (r != sizeof(msg)) {
+                       pr_info("Read on uffd returned unexpected size: %d bytes", r);
+                       return NULL;
+               }
+               if (!(msg.event & UFFD_EVENT_PAGEFAULT))
+                       continue;
+               if (delay)
+                       usleep(delay);
+               r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg);
+               if (r < 0)
+                       return NULL;
+               pages++;
+       }
+       ts_diff = timespec_elapsed(start);
+       PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
+                      pages, ts_diff.tv_sec, ts_diff.tv_nsec,
+                      pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+       return NULL;
+ }
+ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
+                                          void *hva, uint64_t len,
+                                          uffd_handler_t handler)
+ {
+       struct uffd_desc *uffd_desc;
+       bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
+       int uffd;
+       struct uffdio_api uffdio_api;
+       struct uffdio_register uffdio_register;
+       uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
+       int ret;
+       PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
+                      is_minor ? "MINOR" : "MISSING",
+                      is_minor ? "UFFDIO_CONTINUE" : "UFFDIO_COPY");
+       uffd_desc = malloc(sizeof(struct uffd_desc));
+       TEST_ASSERT(uffd_desc, "malloc failed");
+       /* In order to get minor faults, prefault via the alias. */
+       if (is_minor)
+               expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
+       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+       TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
+       uffdio_api.api = UFFD_API;
+       uffdio_api.features = 0;
+       TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
+                   "ioctl UFFDIO_API failed: %" PRIu64,
+                   (uint64_t)uffdio_api.api);
+       uffdio_register.range.start = (uint64_t)hva;
+       uffdio_register.range.len = len;
+       uffdio_register.mode = uffd_mode;
+       TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
+                   "ioctl UFFDIO_REGISTER failed");
+       TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
+                   expected_ioctls, "missing userfaultfd ioctls");
+       ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK);
+       TEST_ASSERT(!ret, "Failed to set up pipefd");
+       uffd_desc->uffd_mode = uffd_mode;
+       uffd_desc->uffd = uffd;
+       uffd_desc->delay = delay;
+       uffd_desc->handler = handler;
+       pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn,
+                      uffd_desc);
+       PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+                      hva, hva + len);
+       return uffd_desc;
+ }
+ void uffd_stop_demand_paging(struct uffd_desc *uffd)
+ {
+       char c = 0;
+       int ret;
+       ret = write(uffd->pipefds[1], &c, 1);
+       TEST_ASSERT(ret == 1, "Unable to write to pipefd");
+       ret = pthread_join(uffd->thread, NULL);
+       TEST_ASSERT(ret == 0, "Pthread_join failed.");
+       close(uffd->uffd);
+       close(uffd->pipefds[1]);
+       close(uffd->pipefds[0]);
+       free(uffd);
+ }
+ #endif /* __NR_userfaultfd */
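
The intended calling pattern for the helpers above, mirroring how demand_paging_test.c uses them (a hedged sketch; vcpu_hva, region_bytes and handle_uffd_page_request stand in for the caller's own values):

    struct uffd_desc *desc;

    /* Spawn the handler thread; it polls the uffd until the pipe is written. */
    desc = uffd_setup_demand_paging(UFFDIO_REGISTER_MODE_MISSING,
                                    0 /* no artificial delay */,
                                    vcpu_hva, region_bytes,
                                    &handle_uffd_page_request);

    /* ... run the workload that takes the demand-paging faults ... */

    /* Write to the pipe, join the handler thread and close the fds. */
    uffd_stop_demand_paging(desc);
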
index d07e921bfcc5350d7d2e51eedc35a2342d68bcbb,3a5e4518307c20ab631f8263e2549f5780519ee8..9855c41ca811fa69a77f41f212ddc6086d47467c
@@@ -34,9 -34,7 +34,7 @@@
  static int nr_vcpus = 1;
  static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
  
- static bool run_vcpus = true;
 -static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
 +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
  {
        struct kvm_vcpu *vcpu = vcpu_args->vcpu;
        struct kvm_run *run;
@@@ -45,7 -43,7 +43,7 @@@
        run = vcpu->run;
  
        /* Let the guest access its memory until a stop signal is received */
-       while (READ_ONCE(run_vcpus)) {
 -      while (!READ_ONCE(perf_test_args.stop_vcpus)) {
++      while (!READ_ONCE(memstress_args.stop_vcpus)) {
                ret = _vcpu_run(vcpu);
                TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
  
@@@ -107,14 -105,13 +105,12 @@@ static void run_test(enum vm_guest_mod
  
        pr_info("Started all vCPUs\n");
  
 -      add_remove_memslot(vm, p->memslot_modification_delay,
 -                         p->nr_memslot_modifications);
 +      add_remove_memslot(vm, p->delay, p->nr_iterations);
  
-       run_vcpus = false;
 -      perf_test_join_vcpu_threads(nr_vcpus);
 +      memstress_join_vcpu_threads(nr_vcpus);
        pr_info("All vCPU threads joined\n");
  
 -      perf_test_destroy_vm(vm);
 +      memstress_destroy_vm(vm);
  }
  
  static void help(char *name)
index 36b20abfb948e3185db20d2eac7b77b34a74d04a,2ad40f7c9c08e4ec2b05196e03174e749d48e788..e698306bf49d1d7745d0337f7f0350911c886add
@@@ -252,37 -284,33 +284,34 @@@ static bool prepare_vm(struct vm_data *
        struct timespec tstart;
        struct sync_area *sync;
  
-       max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
-       TEST_ASSERT(max_mem_slots > 1,
-                   "KVM_CAP_NR_MEMSLOTS should be greater than 1");
-       TEST_ASSERT(nslots > 1 || nslots == -1,
-                   "Slot count cap should be greater than 1");
-       if (nslots != -1)
-               max_mem_slots = min(max_mem_slots, (uint32_t)nslots);
-       pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots);
+       host_page_size = getpagesize();
+       guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+       mempages = mem_size / guest_page_size;
  
-       TEST_ASSERT(mempages > 1,
-                   "Can't test without any memory");
+       data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
 -      ucall_init(data->vm, NULL);
+       TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size");
  
        data->npages = mempages;
-       data->nslots = max_mem_slots - 1;
-       data->pages_per_slot = mempages / data->nslots;
-       if (!data->pages_per_slot) {
-               *maxslots = mempages + 1;
+       TEST_ASSERT(data->npages > 1, "Can't test without any memory");
+       data->nslots = nslots;
+       data->pages_per_slot = data->npages / data->nslots;
+       rempages = data->npages % data->nslots;
+       if (!check_slot_pages(host_page_size, guest_page_size,
+                             data->pages_per_slot, rempages)) {
+               *maxslots = get_max_slots(data, host_page_size);
                return false;
        }
  
        data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
        TEST_ASSERT(data->hva_slots, "malloc() fail");
  
 +      data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
 +
        pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
-               max_mem_slots - 1, data->pages_per_slot, rempages);
+               data->nslots, data->pages_per_slot, rempages);
  
        clock_gettime(CLOCK_MONOTONIC, &tstart);
-       for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) {
+       for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) {
                uint64_t npages;
  
                npages = data->pages_per_slot;
@@@ -884,9 -966,9 +967,9 @@@ static bool parse_args(int argc, char *
                        map_unmap_verify = true;
                        break;
                case 's':
 -                      targs->nslots = atoi(optarg);
 +                      targs->nslots = atoi_paranoid(optarg);
-                       if (targs->nslots <= 0 && targs->nslots != -1) {
-                               pr_info("Slot count cap has to be positive or -1 for no cap\n");
+                       if (targs->nslots <= 1 && targs->nslots != -1) {
+                               pr_info("Slot count cap must be larger than 1, or -1 for no cap\n");
                                return false;
                        }
                        break;
@@@ -994,6 -1103,12 +1092,9 @@@ int main(int argc, char *argv[]
        struct test_result rbestslottime;
        int tctr;
  
 -      /* Tell stdout not to buffer its content */
 -      setbuf(stdout, NULL);
 -
+       if (!check_memory_sizes())
+               return -1;
        if (!parse_args(argc, argv, &targs))
                return -1;
  
Simple merge