From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Dec 2022 17:27:39 +0000 (-0500)
Subject: Merge tag 'kvmarm-6.2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm... 
X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=eb5618911af0ac069d2313b289d4c19ca3379401;p=linux.git

Merge tag 'kvmarm-6.2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 updates for 6.2

- Enable the per-vcpu dirty-ring tracking mechanism, together with an
  option to keep the good old dirty log around for pages that are
  dirtied by something other than a vcpu.

- Switch to the relaxed parallel fault handling, using RCU to delay
  page table reclaim and giving better performance under load.

- Relax the MTE ABI, allowing a VMM to use the MAP_SHARED mapping
  option, which multi-process VMMs such as crosvm rely on.

- Merge the pKVM shadow vcpu state tracking that allows the hypervisor
  to have its own view of a vcpu, keeping that state private.

- Add support for the PMUv3p5 architecture revision, bringing support
  for 64bit counters on systems that support it, and fix the
  not-quite-compliant CHAIN-ed counter support for the machines that
  actually exist out there.

- Fix a handful of minor issues around 52bit VA/PA support (64kB pages
  only) as a prelude to the upcoming support for 4kB and 16kB pages.

- Add/Enable/Fix a bunch of selftests covering memslots, breakpoints,
  stage-2 faults and access tracking. You name it, we got it, we
  probably broke it.

- Pick a small set of documentation and spelling fixes, because no
  good merge window would be complete without those.

As a side effect, this tag also drags:

- The 'kvmarm-fixes-6.1-3' tag as a dependency to the dirty-ring
  series

- A shared branch with the arm64 tree that repaints all the system
  registers to match the ARM ARM's naming, which results in
  interesting conflicts
---

eb5618911af0ac069d2313b289d4c19ca3379401
diff --cc arch/x86/include/asm/kvm_host.h
index d1013c4f673ca,b4dbde7d9eb1d..ad9f8b02071de
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@@ -2154,8 -2084,12 +2154,6 @@@ static inline int kvm_cpu_get_apicid(in
  #endif
  }
  
- int kvm_cpu_dirty_log_size(void);
 -#define put_smstate(type, buf, offset, val)                      \
 -	*(type *)((buf) + (offset) - 0x7e00) = val
 -
 -#define GET_SMSTATE(type, buf, offset)		\
 -	(*(type *)((buf) + (offset) - 0x7e00))
--
  int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
  
  #define KVM_CLOCK_VALID_FLAGS						\
diff --cc include/uapi/linux/kvm.h
index 88448397642cb,c87b5882d7aef..820efdf9fef80
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@@ -1181,7 -1178,7 +1181,8 @@@ struct kvm_ppc_resize_hpt 
  #define KVM_CAP_S390_ZPCI_OP 221
  #define KVM_CAP_S390_CPU_TOPOLOGY 222
  #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
 -#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224
 +#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
++#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
diff --cc tools/testing/selftests/kvm/Makefile
index 2275ba861e0e5,1d85b8e218a02..947676983da1f
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@@ -47,7 -47,7 +47,8 @@@ LIBKVM += lib/memstress.
  LIBKVM += lib/rbtree.c
  LIBKVM += lib/sparsebit.c
  LIBKVM += lib/test_util.c
 +LIBKVM += lib/ucall_common.c
+ LIBKVM += lib/userfaultfd_util.c
  
  LIBKVM_STRING += lib/string_override.c
  
diff --cc tools/testing/selftests/kvm/aarch64/debug-exceptions.c
index d86c4e4d1c826,b30add3e77269..8a3fb212084a0
--- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
+++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
@@@ -289,9 -425,9 +422,8 @@@ static void test_guest_debug_exceptions
  	struct kvm_vcpu *vcpu;
  	struct kvm_vm *vm;
  	struct ucall uc;
- 	int stage;
  
  	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 -	ucall_init(vm, NULL);
  
  	vm_init_descriptor_tables(vm);
  	vcpu_init_descriptor_tables(vcpu);
diff --cc tools/testing/selftests/kvm/aarch64/page_fault_test.c
index 0000000000000,05bb6a6369c25..95d22cfb7b41a
mode 000000,100644..100644
--- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@@ -1,0 -1,1112 +1,1117 @@@
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  * page_fault_test.c - Test stage 2 faults.
+  *
+  * This test tries different combinations of guest accesses (e.g., write,
+  * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on
+  * hugetlbfs with a hole). It checks that the expected handling method is
+  * called (e.g., uffd faults with the right address and write/read flag).
+  */
+ 
+ #define _GNU_SOURCE
+ #include <linux/bitmap.h>
+ #include <fcntl.h>
+ #include <test_util.h>
+ #include <kvm_util.h>
+ #include <processor.h>
+ #include <asm/sysreg.h>
+ #include <linux/bitfield.h>
+ #include "guest_modes.h"
+ #include "userfaultfd_util.h"
+ 
+ /* Guest virtual addresses that point to the test page and its PTE. */
+ #define TEST_GVA				0xc0000000
+ #define TEST_EXEC_GVA				(TEST_GVA + 0x8)
+ #define TEST_PTE_GVA				0xb0000000
+ #define TEST_DATA				0x0123456789ABCDEF
+ 
+ static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA;
+ 
+ #define CMD_NONE				(0)
+ #define CMD_SKIP_TEST				(1ULL << 1)
+ #define CMD_HOLE_PT				(1ULL << 2)
+ #define CMD_HOLE_DATA				(1ULL << 3)
+ #define CMD_CHECK_WRITE_IN_DIRTY_LOG		(1ULL << 4)
+ #define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG		(1ULL << 5)
+ #define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG		(1ULL << 6)
+ #define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG	(1ULL << 7)
+ #define CMD_SET_PTE_AF				(1ULL << 8)
+ 
+ #define PREPARE_FN_NR				10
+ #define CHECK_FN_NR				10
+ 
+ static struct event_cnt {
+ 	int mmio_exits;
+ 	int fail_vcpu_runs;
+ 	int uffd_faults;
+ 	/* uffd_faults is incremented from multiple threads. */
+ 	pthread_mutex_t uffd_faults_mutex;
+ } events;
+ 
+ struct test_desc {
+ 	const char *name;
+ 	uint64_t mem_mark_cmd;
+ 	/* Skip the test if any prepare function returns false */
+ 	bool (*guest_prepare[PREPARE_FN_NR])(void);
+ 	void (*guest_test)(void);
+ 	void (*guest_test_check[CHECK_FN_NR])(void);
+ 	uffd_handler_t uffd_pt_handler;
+ 	uffd_handler_t uffd_data_handler;
+ 	void (*dabt_handler)(struct ex_regs *regs);
+ 	void (*iabt_handler)(struct ex_regs *regs);
+ 	void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run);
+ 	void (*fail_vcpu_run_handler)(int ret);
+ 	uint32_t pt_memslot_flags;
+ 	uint32_t data_memslot_flags;
+ 	bool skip;
+ 	struct event_cnt expected_events;
+ };
+ 
+ struct test_params {
+ 	enum vm_mem_backing_src_type src_type;
+ 	struct test_desc *test_desc;
+ };
+ 
+ static inline void flush_tlb_page(uint64_t vaddr)
+ {
+ 	uint64_t page = vaddr >> 12;
+ 
+ 	dsb(ishst);
+ 	asm volatile("tlbi vaae1is, %0" :: "r" (page));
+ 	dsb(ish);
+ 	isb();
+ }
+ 
+ static void guest_write64(void)
+ {
+ 	uint64_t val;
+ 
+ 	WRITE_ONCE(*guest_test_memory, TEST_DATA);
+ 	val = READ_ONCE(*guest_test_memory);
+ 	GUEST_ASSERT_EQ(val, TEST_DATA);
+ }
+ 
+ /* Check the system for atomic instructions. */
+ static bool guest_check_lse(void)
+ {
+ 	uint64_t isar0 = read_sysreg(id_aa64isar0_el1);
+ 	uint64_t atomic;
+ 
+ 	atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS), isar0);
+ 	return atomic >= 2;
+ }
+ 
+ static bool guest_check_dc_zva(void)
+ {
+ 	uint64_t dczid = read_sysreg(dczid_el0);
+ 	uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_DZP), dczid);
+ 
+ 	return dzp == 0;
+ }
+ 
+ /* Compare and swap instruction. */
+ static void guest_cas(void)
+ {
+ 	uint64_t val;
+ 
+ 	GUEST_ASSERT(guest_check_lse());
+ 	asm volatile(".arch_extension lse\n"
+ 		     "casal %0, %1, [%2]\n"
+ 		     :: "r" (0), "r" (TEST_DATA), "r" (guest_test_memory));
+ 	val = READ_ONCE(*guest_test_memory);
+ 	GUEST_ASSERT_EQ(val, TEST_DATA);
+ }
+ 
+ static void guest_read64(void)
+ {
+ 	uint64_t val;
+ 
+ 	val = READ_ONCE(*guest_test_memory);
+ 	GUEST_ASSERT_EQ(val, 0);
+ }
+ 
+ /* Address translation instruction; the ISB must precede the PAR_EL1 read. */
+ static void guest_at(void)
+ {
+ 	uint64_t par;
+ 
+ 	asm volatile("at s1e1r, %0" :: "r" (guest_test_memory));
+ 	isb();
+ 	par = read_sysreg(par_el1);
+ 
+ 	/* PAR_EL1.F (bit 0) is clear when the AT was successful */
+ 	GUEST_ASSERT_EQ(par & 1, 0);
+ }
+ 
+ /*
+  * The size of the block written by "dc zva" is guaranteed to be between (4 <<
+  * 0) and (4 << 9) bytes, which is safe in our case as we need the write to
+  * happen for at least a word, and not more than a page.
+  */
+ static void guest_dc_zva(void)
+ {
+ 	uint16_t val;
+ 
+ 	asm volatile("dc zva, %0" :: "r" (guest_test_memory));
+ 	dsb(ish);
+ 	val = READ_ONCE(*guest_test_memory);
+ 	GUEST_ASSERT_EQ(val, 0);
+ }
+ 
+ /*
+  * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0).
+  * And that's special because KVM must take special care with those: they
+  * should still count as accesses for dirty logging or user-faulting, but
+  * should be handled differently on mmio.
+  */
+ static void guest_ld_preidx(void)
+ {
+ 	uint64_t val;
+ 	uint64_t addr = TEST_GVA - 8;
+ 
+ 	/*
+ 	 * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is
+ 	 * in a gap between memslots not backed by anything.
+ 	 */
+ 	asm volatile("ldr %0, [%1, #8]!"
+ 		     : "=r" (val), "+r" (addr));
+ 	GUEST_ASSERT_EQ(val, 0);
+ 	GUEST_ASSERT_EQ(addr, TEST_GVA);
+ }
+ 
+ static void guest_st_preidx(void)
+ {
+ 	uint64_t val = TEST_DATA;
+ 	uint64_t addr = TEST_GVA - 8;
+ 
+ 	asm volatile("str %0, [%1, #8]!"
+ 		     : "+r" (val), "+r" (addr));
+ 
+ 	GUEST_ASSERT_EQ(addr, TEST_GVA);
+ 	val = READ_ONCE(*guest_test_memory);
+ }
+ 
+ static bool guest_set_ha(void)
+ {
+ 	uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1);
+ 	uint64_t hadbs, tcr;
+ 
+ 	/* Skip if HA is not supported. */
+ 	hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS), mmfr1);
+ 	if (hadbs == 0)
+ 		return false;
+ 
+ 	tcr = read_sysreg(tcr_el1) | TCR_EL1_HA;
+ 	write_sysreg(tcr, tcr_el1);
+ 	isb();
+ 
+ 	return true;
+ }
+ 
+ static bool guest_clear_pte_af(void)
+ {
+ 	*((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF;
+ 	flush_tlb_page(TEST_GVA);
+ 
+ 	return true;
+ }
+ 
+ static void guest_check_pte_af(void)
+ {
+ 	dsb(ish);
+ 	GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF);
+ }
+ 
+ static void guest_check_write_in_dirty_log(void)
+ {
+ 	GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG);
+ }
+ 
+ static void guest_check_no_write_in_dirty_log(void)
+ {
+ 	GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG);
+ }
+ 
+ static void guest_check_s1ptw_wr_in_dirty_log(void)
+ {
+ 	GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG);
+ }
+ 
+ static void guest_exec(void)
+ {
+ 	int (*code)(void) = (int (*)(void))TEST_EXEC_GVA;
+ 	int ret;
+ 
+ 	ret = code();
+ 	GUEST_ASSERT_EQ(ret, 0x77);
+ }
+ 
+ static bool guest_prepare(struct test_desc *test)
+ {
+ 	bool (*prepare_fn)(void);
+ 	int i;
+ 
+ 	for (i = 0; i < PREPARE_FN_NR; i++) {
+ 		prepare_fn = test->guest_prepare[i];
+ 		if (prepare_fn && !prepare_fn())
+ 			return false;
+ 	}
+ 
+ 	return true;
+ }
+ 
+ static void guest_test_check(struct test_desc *test)
+ {
+ 	void (*check_fn)(void);
+ 	int i;
+ 
+ 	for (i = 0; i < CHECK_FN_NR; i++) {
+ 		check_fn = test->guest_test_check[i];
+ 		if (check_fn)
+ 			check_fn();
+ 	}
+ }
+ 
+ static void guest_code(struct test_desc *test)
+ {
+ 	if (!guest_prepare(test))
+ 		GUEST_SYNC(CMD_SKIP_TEST);
+ 
+ 	GUEST_SYNC(test->mem_mark_cmd);
+ 
+ 	if (test->guest_test)
+ 		test->guest_test();
+ 
+ 	guest_test_check(test);
+ 	GUEST_DONE();
+ }
+ 
+ static void no_dabt_handler(struct ex_regs *regs)
+ {
+ 	GUEST_ASSERT_1(false, read_sysreg(far_el1));
+ }
+ 
+ static void no_iabt_handler(struct ex_regs *regs)
+ {
+ 	GUEST_ASSERT_1(false, regs->pc);
+ }
+ 
+ static struct uffd_args {
+ 	char *copy;
+ 	void *hva;
+ 	uint64_t paging_size;
+ } pt_args, data_args;
+ 
+ /* Returns 0 on success, or -1 if the UFFDIO_COPY ioctl failed. */
+ static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg,
+ 				struct uffd_args *args, bool expect_write)
+ {
+ 	uint64_t addr = msg->arg.pagefault.address;
+ 	uint64_t flags = msg->arg.pagefault.flags;
+ 	struct uffdio_copy copy;
+ 	int ret;
+ 
+ 	TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING,
+ 		    "The only expected UFFD mode is MISSING");
+ 	ASSERT_EQ(!!(flags & UFFD_PAGEFAULT_FLAG_WRITE), expect_write);
+ 	ASSERT_EQ(addr, (uint64_t)args->hva);
+ 
+ 	pr_debug("uffd fault: addr=%p write=%d\n",
+ 		 (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE));
+ 
+ 	copy.src = (uint64_t)args->copy;
+ 	copy.dst = addr;
+ 	copy.len = args->paging_size;
+ 	copy.mode = 0;
+ 
+ 	ret = ioctl(uffd, UFFDIO_COPY, &copy);
+ 	if (ret == -1) {
+ 		pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n",
+ 			addr, errno);
+ 		return ret;
+ 	}
+ 
+ 	pthread_mutex_lock(&events.uffd_faults_mutex);
+ 	events.uffd_faults += 1;
+ 	pthread_mutex_unlock(&events.uffd_faults_mutex);
+ 	return 0;
+ }
+ 
+ static int uffd_pt_write_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+ 	return uffd_generic_handler(mode, uffd, msg, &pt_args, true);
+ }
+ 
+ static int uffd_data_write_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+ 	return uffd_generic_handler(mode, uffd, msg, &data_args, true);
+ }
+ 
+ static int uffd_data_read_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+ 	return uffd_generic_handler(mode, uffd, msg, &data_args, false);
+ }
+ 
+ static void setup_uffd_args(struct userspace_mem_region *region,
+ 			    struct uffd_args *args)
+ {
+ 	args->hva = (void *)region->region.userspace_addr;
+ 	args->paging_size = region->region.memory_size;
+ 
+ 	args->copy = malloc(args->paging_size);
+ 	TEST_ASSERT(args->copy, "Failed to allocate data copy.");
+ 	memcpy(args->copy, args->hva, args->paging_size);
+ }
+ 
+ static void setup_uffd(struct kvm_vm *vm, struct test_params *p,
+ 		       struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd)
+ {
+ 	struct test_desc *test = p->test_desc;
+ 	int uffd_mode = UFFDIO_REGISTER_MODE_MISSING;
+ 
+ 	setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args);
+ 	setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args);
+ 
+ 	*pt_uffd = NULL;
+ 	if (test->uffd_pt_handler)
+ 		*pt_uffd = uffd_setup_demand_paging(uffd_mode, 0,
+ 						    pt_args.hva,
+ 						    pt_args.paging_size,
+ 						    test->uffd_pt_handler);
+ 
+ 	*data_uffd = NULL;
+ 	if (test->uffd_data_handler)
+ 		*data_uffd = uffd_setup_demand_paging(uffd_mode, 0,
+ 						      data_args.hva,
+ 						      data_args.paging_size,
+ 						      test->uffd_data_handler);
+ }
+ 
+ static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd,
+ 		      struct uffd_desc *data_uffd)
+ {
+ 	if (test->uffd_pt_handler)
+ 		uffd_stop_demand_paging(pt_uffd);
+ 	if (test->uffd_data_handler)
+ 		uffd_stop_demand_paging(data_uffd);
+ 
+ 	free(pt_args.copy);
+ 	free(data_args.copy);
+ }
+ 
+ static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg)
+ {
+ 	TEST_FAIL("There was no UFFD fault expected.");
+ 	return -1;
+ }
+ 
+ /* Returns false if the test should be skipped. */
+ static bool punch_hole_in_backing_store(struct kvm_vm *vm,
+ 					struct userspace_mem_region *region)
+ {
+ 	void *hva = (void *)region->region.userspace_addr;
+ 	uint64_t paging_size = region->region.memory_size;
+ 	int ret, fd = region->fd;
+ 
+ 	if (fd != -1) {
+ 		ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ 				0, paging_size);
+ 		TEST_ASSERT(ret == 0, "fallocate failed\n");
+ 	} else {
+ 		ret = madvise(hva, paging_size, MADV_DONTNEED);
+ 		TEST_ASSERT(ret == 0, "madvise failed\n");
+ 	}
+ 
+ 	return true;
+ }
+ 
+ static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run)
+ {
+ 	struct userspace_mem_region *region;
+ 	void *hva;
+ 
+ 	region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ 	hva = (void *)region->region.userspace_addr;
+ 
+ 	ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr);
+ 
+ 	memcpy(hva, run->mmio.data, run->mmio.len);
+ 	events.mmio_exits += 1;
+ }
+ 
+ static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run)
+ {
+ 	uint64_t data;
+ 
+ 	memcpy(&data, run->mmio.data, sizeof(data));
+ 	pr_debug("addr=%lld len=%d w=%d data=%lx\n",
+ 		 run->mmio.phys_addr, run->mmio.len,
+ 		 run->mmio.is_write, data);
+ 	TEST_FAIL("There was no MMIO exit expected.");
+ }
+ 
+ static bool check_write_in_dirty_log(struct kvm_vm *vm,
+ 				     struct userspace_mem_region *region,
+ 				     uint64_t host_pg_nr)
+ {
+ 	unsigned long *bmap;
+ 	bool first_page_dirty;
+ 	uint64_t size = region->region.memory_size;
+ 
+ 	/* getpagesize() is not always equal to vm->page_size */
+ 	bmap = bitmap_zalloc(size / getpagesize());
+ 	kvm_vm_get_dirty_log(vm, region->region.slot, bmap);
+ 	first_page_dirty = test_bit(host_pg_nr, bmap);
+ 	free(bmap);
+ 	return first_page_dirty;
+ }
+ 
+ /* Returns true to continue the test, and false if it should be skipped. */
+ static bool handle_cmd(struct kvm_vm *vm, int cmd)
+ {
+ 	struct userspace_mem_region *data_region, *pt_region;
+ 	bool continue_test = true;
+ 
+ 	data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ 	pt_region = vm_get_mem_region(vm, MEM_REGION_PT);
+ 
+ 	if (cmd == CMD_SKIP_TEST)
+ 		continue_test = false;
+ 
+ 	if (cmd & CMD_HOLE_PT)
+ 		continue_test = punch_hole_in_backing_store(vm, pt_region);
+ 	if (cmd & CMD_HOLE_DATA)
+ 		continue_test = punch_hole_in_backing_store(vm, data_region);
+ 	if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG)
+ 		TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0),
+ 			    "Missing write in dirty log");
+ 	if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG)
+ 		TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, 0),
+ 			    "Missing s1ptw write in dirty log");
+ 	if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG)
+ 		TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0),
+ 			    "Unexpected write in dirty log");
+ 	if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG)
+ 		TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, 0),
+ 			    "Unexpected s1ptw write in dirty log");
+ 
+ 	return continue_test;
+ }
+ 
+ void fail_vcpu_run_no_handler(int ret)
+ {
+ 	TEST_FAIL("Unexpected vcpu run failure\n");
+ }
+ 
+ void fail_vcpu_run_mmio_no_syndrome_handler(int ret)
+ {
+ 	TEST_ASSERT(errno == ENOSYS,
+ 		    "The mmio handler should have returned not implemented.");
+ 	events.fail_vcpu_runs += 1;
+ }
+ 
+ typedef uint32_t aarch64_insn_t;
+ extern aarch64_insn_t __exec_test[2];
+ 
+ noinline void __return_0x77(void)
+ {
+ 	asm volatile("__exec_test: mov x0, #0x77\n"
+ 		     "ret\n");
+ }
+ 
+ /*
+  * Note that this function runs on the host before the test VM starts: there's
+  * no need to sync the D$ and I$ caches.
+  */
+ static void load_exec_code_for_test(struct kvm_vm *vm)
+ {
+ 	uint64_t *code;
+ 	struct userspace_mem_region *region;
+ 	void *hva;
+ 
+ 	region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ 	hva = (void *)region->region.userspace_addr;
+ 
+ 	assert(TEST_EXEC_GVA > TEST_GVA);
+ 	code = hva + TEST_EXEC_GVA - TEST_GVA;
+ 	memcpy(code, __exec_test, sizeof(__exec_test));
+ }
+ 
+ static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+ 				 struct test_desc *test)
+ {
+ 	vm_init_descriptor_tables(vm);
+ 	vcpu_init_descriptor_tables(vcpu);
+ 
+ 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+ 				ESR_EC_DABT, no_dabt_handler);
+ 	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+ 				ESR_EC_IABT, no_iabt_handler);
+ }
+ 
+ static void setup_gva_maps(struct kvm_vm *vm)
+ {
+ 	struct userspace_mem_region *region;
+ 	uint64_t pte_gpa;
+ 
+ 	region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+ 	/* Map TEST_GVA first. This will install a new PTE. */
+ 	virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr);
+ 	/* Then map TEST_PTE_GVA to the above PTE. */
+ 	pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA));
+ 	virt_pg_map(vm, TEST_PTE_GVA, pte_gpa);
+ }
+ 
+ enum pf_test_memslots {
+ 	CODE_AND_DATA_MEMSLOT,
+ 	PAGE_TABLE_MEMSLOT,
+ 	TEST_DATA_MEMSLOT,
+ };
+ 
+ /*
+  * Create a memslot for code and data at pfn=0, and test-data and PT ones
+  * at max_gfn.
+  */
+ static void setup_memslots(struct kvm_vm *vm, struct test_params *p)
+ {
+ 	uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type);
+ 	uint64_t guest_page_size = vm->page_size;
+ 	uint64_t max_gfn = vm_compute_max_gfn(vm);
+ 	/* Enough for 2M of code when using 4K guest pages. */
+ 	uint64_t code_npages = 512;
+ 	uint64_t pt_size, data_size, data_gpa;
+ 
+ 	/*
+ 	 * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using
+ 	 * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs.  That's 13
+ 	 * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use
+ 	 * twice that just in case.
+ 	 */
+ 	pt_size = 26 * guest_page_size;
+ 
+ 	/* memslot sizes and gpa's must be aligned to the backing page size */
+ 	pt_size = align_up(pt_size, backing_src_pagesz);
+ 	data_size = align_up(guest_page_size, backing_src_pagesz);
+ 	data_gpa = (max_gfn * guest_page_size) - data_size;
+ 	data_gpa = align_down(data_gpa, backing_src_pagesz);
+ 
+ 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0,
+ 				    CODE_AND_DATA_MEMSLOT, code_npages, 0);
+ 	vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT;
+ 	vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT;
+ 
+ 	vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size,
+ 				    PAGE_TABLE_MEMSLOT, pt_size / guest_page_size,
+ 				    p->test_desc->pt_memslot_flags);
+ 	vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT;
+ 
+ 	vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT,
+ 				    data_size / guest_page_size,
+ 				    p->test_desc->data_memslot_flags);
+ 	vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
+ }
+ 
++static void setup_ucall(struct kvm_vm *vm)
++{
++	struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
++
++	ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size);
++}
++
+ static void setup_default_handlers(struct test_desc *test)
+ {
+ 	if (!test->mmio_handler)
+ 		test->mmio_handler = mmio_no_handler;
+ 
+ 	if (!test->fail_vcpu_run_handler)
+ 		test->fail_vcpu_run_handler = fail_vcpu_run_no_handler;
+ }
+ 
+ static void check_event_counts(struct test_desc *test)
+ {
+ 	ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults);
+ 	ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits);
+ 	ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs);
+ }
+ 
+ static void print_test_banner(enum vm_guest_mode mode, struct test_params *p)
+ {
+ 	struct test_desc *test = p->test_desc;
+ 
+ 	pr_debug("Test: %s\n", test->name);
+ 	pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+ 	pr_debug("Testing memory backing src type: %s\n",
+ 		 vm_mem_backing_src_alias(p->src_type)->name);
+ }
+ 
+ static void reset_event_counts(void)
+ {
+ 	memset(&events, 0, sizeof(events));
+ }
+ 
+ /*
+  * This function either succeeds, skips the test (after setting test->skip), or
+  * fails with a TEST_FAIL that aborts all tests.
+  */
+ static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
+ 			  struct test_desc *test)
+ {
+ 	struct kvm_run *run;
+ 	struct ucall uc;
+ 	int ret;
+ 
+ 	run = vcpu->run;
+ 
+ 	for (;;) {
+ 		ret = _vcpu_run(vcpu);
+ 		if (ret) {
+ 			test->fail_vcpu_run_handler(ret);
+ 			goto done;
+ 		}
+ 
+ 		switch (get_ucall(vcpu, &uc)) {
+ 		case UCALL_SYNC:
+ 			if (!handle_cmd(vm, uc.args[1])) {
+ 				test->skip = true;
+ 				goto done;
+ 			}
+ 			break;
+ 		case UCALL_ABORT:
+ 			REPORT_GUEST_ASSERT_2(uc, "values: %#lx, %#lx");
+ 			break;
+ 		case UCALL_DONE:
+ 			goto done;
+ 		case UCALL_NONE:
+ 			if (run->exit_reason == KVM_EXIT_MMIO)
+ 				test->mmio_handler(vm, run);
+ 			break;
+ 		default:
+ 			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ 		}
+ 	}
+ 
+ done:
+ 	pr_debug(test->skip ? "Skipped.\n" : "Done.\n");
+ }
+ 
+ static void run_test(enum vm_guest_mode mode, void *arg)
+ {
+ 	struct test_params *p = (struct test_params *)arg;
+ 	struct test_desc *test = p->test_desc;
+ 	struct kvm_vm *vm;
+ 	struct kvm_vcpu *vcpu;
+ 	struct uffd_desc *pt_uffd, *data_uffd;
+ 
+ 	print_test_banner(mode, p);
+ 
+ 	vm = ____vm_create(mode);
+ 	setup_memslots(vm, p);
+ 	kvm_vm_elf_load(vm, program_invocation_name);
++	setup_ucall(vm);
+ 	vcpu = vm_vcpu_add(vm, 0, guest_code);
+ 
+ 	setup_gva_maps(vm);
+ 
 -	ucall_init(vm, NULL);
 -
+ 	reset_event_counts();
+ 
+ 	/*
+ 	 * Set some code in the data memslot for the guest to execute (only
+ 	 * applicable to the EXEC tests). This has to be done before
+ 	 * setup_uffd() as that function copies the memslot data for the uffd
+ 	 * handler.
+ 	 */
+ 	load_exec_code_for_test(vm);
+ 	setup_uffd(vm, p, &pt_uffd, &data_uffd);
+ 	setup_abort_handlers(vm, vcpu, test);
+ 	setup_default_handlers(test);
+ 	vcpu_args_set(vcpu, 1, test);
+ 
+ 	vcpu_run_loop(vm, vcpu, test);
+ 
 -	ucall_uninit(vm);
+ 	kvm_vm_free(vm);
+ 	free_uffd(test, pt_uffd, data_uffd);
+ 
+ 	/*
+ 	 * Make sure we check the events after the uffd threads have exited,
+ 	 * which means they updated their respective event counters.
+ 	 */
+ 	if (!test->skip)
+ 		check_event_counts(test);
+ }
+ 
+ static void help(char *name)
+ {
+ 	puts("");
+ 	printf("usage: %s [-h] [-s mem-type]\n", name);
+ 	puts("");
+ 	guest_modes_help();
+ 	backing_src_help("-s");
+ 	puts("");
+ }
+ 
+ #define SNAME(s)			#s
+ #define SCAT2(a, b)			SNAME(a ## _ ## b)
+ #define SCAT3(a, b, c)			SCAT2(a, SCAT2(b, c))
+ #define SCAT4(a, b, c, d)		SCAT2(a, SCAT3(b, c, d))
+ 
+ #define _CHECK(_test)			_CHECK_##_test
+ #define _PREPARE(_test)			_PREPARE_##_test
+ #define _PREPARE_guest_read64		NULL
+ #define _PREPARE_guest_ld_preidx	NULL
+ #define _PREPARE_guest_write64		NULL
+ #define _PREPARE_guest_st_preidx	NULL
+ #define _PREPARE_guest_exec		NULL
+ #define _PREPARE_guest_at		NULL
+ #define _PREPARE_guest_dc_zva		guest_check_dc_zva
+ #define _PREPARE_guest_cas		guest_check_lse
+ 
+ /* With or without access flag checks */
+ #define _PREPARE_with_af		guest_set_ha, guest_clear_pte_af
+ #define _PREPARE_no_af			NULL
+ #define _CHECK_with_af			guest_check_pte_af
+ #define _CHECK_no_af			NULL
+ 
+ /* Performs an access and checks that no faults were triggered. */
+ #define TEST_ACCESS(_access, _with_af, _mark_cmd)				\
+ {										\
+ 	.name			= SCAT3(_access, _with_af, #_mark_cmd),		\
+ 	.guest_prepare		= { _PREPARE(_with_af),				\
+ 				    _PREPARE(_access) },			\
+ 	.mem_mark_cmd		= _mark_cmd,					\
+ 	.guest_test		= _access,					\
+ 	.guest_test_check	= { _CHECK(_with_af) },				\
+ 	.expected_events	= { 0 },					\
+ }
+ 
+ #define TEST_UFFD(_access, _with_af, _mark_cmd,					\
+ 		  _uffd_data_handler, _uffd_pt_handler, _uffd_faults)		\
+ {										\
+ 	.name			= SCAT4(uffd, _access, _with_af, #_mark_cmd),	\
+ 	.guest_prepare		= { _PREPARE(_with_af),				\
+ 				    _PREPARE(_access) },			\
+ 	.guest_test		= _access,					\
+ 	.mem_mark_cmd		= _mark_cmd,					\
+ 	.guest_test_check	= { _CHECK(_with_af) },				\
+ 	.uffd_data_handler	= _uffd_data_handler,				\
+ 	.uffd_pt_handler	= _uffd_pt_handler,				\
+ 	.expected_events	= { .uffd_faults = _uffd_faults, },		\
+ }
+ 
+ #define TEST_DIRTY_LOG(_access, _with_af, _test_check)				\
+ {										\
+ 	.name			= SCAT3(dirty_log, _access, _with_af),		\
+ 	.data_memslot_flags	= KVM_MEM_LOG_DIRTY_PAGES,			\
+ 	.pt_memslot_flags	= KVM_MEM_LOG_DIRTY_PAGES,			\
+ 	.guest_prepare		= { _PREPARE(_with_af),				\
+ 				    _PREPARE(_access) },			\
+ 	.guest_test		= _access,					\
+ 	.guest_test_check	= { _CHECK(_with_af), _test_check,		\
+ 				    guest_check_s1ptw_wr_in_dirty_log},		\
+ 	.expected_events	= { 0 },					\
+ }
+ 
+ #define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler,		\
+ 				_uffd_faults, _test_check)			\
+ {										\
+ 	.name			= SCAT3(uffd_and_dirty_log, _access, _with_af),	\
+ 	.data_memslot_flags	= KVM_MEM_LOG_DIRTY_PAGES,			\
+ 	.pt_memslot_flags	= KVM_MEM_LOG_DIRTY_PAGES,			\
+ 	.guest_prepare		= { _PREPARE(_with_af),				\
+ 				    _PREPARE(_access) },			\
+ 	.guest_test		= _access,					\
+ 	.mem_mark_cmd		= CMD_HOLE_DATA | CMD_HOLE_PT,			\
+ 	.guest_test_check	= { _CHECK(_with_af), _test_check },		\
+ 	.uffd_data_handler	= _uffd_data_handler,				\
+ 	.uffd_pt_handler	= uffd_pt_write_handler,			\
+ 	.expected_events	= { .uffd_faults = _uffd_faults, },		\
+ }
+ 
+ #define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits)			\
+ {										\
+ 	.name			= SCAT3(ro_memslot, _access, _with_af),		\
+ 	.data_memslot_flags	= KVM_MEM_READONLY,				\
+ 	.guest_prepare		= { _PREPARE(_access) },			\
+ 	.guest_test		= _access,					\
+ 	.mmio_handler		= _mmio_handler,				\
+ 	.expected_events	= { .mmio_exits = _mmio_exits },		\
+ }
+ 
+ #define TEST_RO_MEMSLOT_NO_SYNDROME(_access)					\
+ {										\
+ 	.name			= SCAT2(ro_memslot_no_syndrome, _access),	\
+ 	.data_memslot_flags	= KVM_MEM_READONLY,				\
+ 	.guest_test		= _access,					\
+ 	.fail_vcpu_run_handler	= fail_vcpu_run_mmio_no_syndrome_handler,	\
+ 	.expected_events	= { .fail_vcpu_runs = 1 },			\
+ }
+ 
+ #define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits,	\
+ 				      _test_check)				\
+ {										\
+ 	.name			= SCAT3(ro_memslot, _access, _with_af),		\
+ 	.data_memslot_flags	= KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES,	\
+ 	.pt_memslot_flags	= KVM_MEM_LOG_DIRTY_PAGES,			\
+ 	.guest_prepare		= { _PREPARE(_access) },			\
+ 	.guest_test		= _access,					\
+ 	.guest_test_check	= { _test_check },				\
+ 	.mmio_handler		= _mmio_handler,				\
+ 	.expected_events	= { .mmio_exits = _mmio_exits},			\
+ }
+ 
+ #define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check)		\
+ {										\
+ 	.name			= SCAT2(ro_memslot_no_syn_and_dlog, _access),	\
+ 	.data_memslot_flags	= KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES,	\
+ 	.pt_memslot_flags	= KVM_MEM_LOG_DIRTY_PAGES,			\
+ 	.guest_test		= _access,					\
+ 	.guest_test_check	= { _test_check },				\
+ 	.fail_vcpu_run_handler	= fail_vcpu_run_mmio_no_syndrome_handler,	\
+ 	.expected_events	= { .fail_vcpu_runs = 1 },			\
+ }
+ 
+ #define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits,		\
+ 				 _uffd_data_handler, _uffd_faults)		\
+ {										\
+ 	.name			= SCAT2(ro_memslot_uffd, _access),		\
+ 	.data_memslot_flags	= KVM_MEM_READONLY,				\
+ 	.mem_mark_cmd		= CMD_HOLE_DATA | CMD_HOLE_PT,			\
+ 	.guest_prepare		= { _PREPARE(_access) },			\
+ 	.guest_test		= _access,					\
+ 	.uffd_data_handler	= _uffd_data_handler,				\
+ 	.uffd_pt_handler	= uffd_pt_write_handler,			\
+ 	.mmio_handler		= _mmio_handler,				\
+ 	.expected_events	= { .mmio_exits = _mmio_exits,			\
+ 				    .uffd_faults = _uffd_faults },		\
+ }
+ 
+ #define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler,	\
+ 					     _uffd_faults)			\
+ {										\
+ 	.name			= SCAT2(ro_memslot_no_syndrome, _access),	\
+ 	.data_memslot_flags	= KVM_MEM_READONLY,				\
+ 	.mem_mark_cmd		= CMD_HOLE_DATA | CMD_HOLE_PT,			\
+ 	.guest_test		= _access,					\
+ 	.uffd_data_handler	= _uffd_data_handler,				\
+ 	.uffd_pt_handler	= uffd_pt_write_handler,			\
+ 	.fail_vcpu_run_handler	= fail_vcpu_run_mmio_no_syndrome_handler,	\
+ 	.expected_events	= { .fail_vcpu_runs = 1,			\
+ 				    .uffd_faults = _uffd_faults },		\
+ }
+ 
+ static struct test_desc tests[] = {
+ 
+ 	/* Check that HW is setting the Access Flag (AF) (sanity checks). */
+ 	TEST_ACCESS(guest_read64, with_af, CMD_NONE),
+ 	TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE),
+ 	TEST_ACCESS(guest_cas, with_af, CMD_NONE),
+ 	TEST_ACCESS(guest_write64, with_af, CMD_NONE),
+ 	TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE),
+ 	TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE),
+ 	TEST_ACCESS(guest_exec, with_af, CMD_NONE),
+ 
+ 	/*
+ 	 * Punch a hole in the data backing store, and then try multiple
+ 	 * accesses: reads should return zeroes, and writes should
+ 	 * re-populate the page. Moreover, the test also checks that no
+ 	 * exception was generated in the guest.  Note that this
+ 	 * reading/writing behavior is the same as reading/writing a
+ 	 * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from
+ 	 * userspace.
+ 	 */
+ 	TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA),
+ 	TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA),
+ 	TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA),
+ 	TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA),
+ 	TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA),
+ 	TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA),
+ 	TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA),
+ 
+ 	/*
+ 	 * Punch holes in the data and PT backing stores and mark them for
+ 	 * userfaultfd handling. This should result in 2 faults: the access
+ 	 * on the data backing store, and its respective S1 page table walk
+ 	 * (S1PTW).
+ 	 */
+ 	TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_read_handler, uffd_pt_write_handler, 2),
+ 	/* no_af should also lead to a PT write. */
+ 	TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_read_handler, uffd_pt_write_handler, 2),
+ 	/* Note that cas invokes the read handler. */
+ 	TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_read_handler, uffd_pt_write_handler, 2),
+ 	/*
+ 	 * Can't test guest_at with_af as it's IMPDEF whether the AF is set.
+ 	 * The S1PTW fault should still be marked as a write.
+ 	 */
+ 	TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_read_handler, uffd_pt_write_handler, 1),
+ 	TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_read_handler, uffd_pt_write_handler, 2),
+ 	TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_write_handler, uffd_pt_write_handler, 2),
+ 	TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_write_handler, uffd_pt_write_handler, 2),
+ 	TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_write_handler, uffd_pt_write_handler, 2),
+ 	TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT,
+ 		  uffd_data_read_handler, uffd_pt_write_handler, 2),
+ 
+ 	/*
+ 	 * Try accesses when the data and PT memory regions are both
+ 	 * tracked for dirty logging.
+ 	 */
+ 	TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log),
+ 	/* no_af should also lead to a PT write. */
+ 	TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_ld_preidx, with_af, guest_check_no_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log),
+ 	TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log),
+ 
+ 	/*
+ 	 * Access when the data and PT memory regions are both marked for
+ 	 * dirty logging and UFFD at the same time. The expected result is
+ 	 * that writes should mark the dirty log and trigger a userfaultfd
+ 	 * write fault.  Reads/execs should result in a read userfaultfd
+ 	 * fault, and nothing in the dirty log.  Any S1PTW should result in
+ 	 * a write in the dirty log and a userfaultfd write.
+ 	 */
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, uffd_data_read_handler, 2,
+ 				guest_check_no_write_in_dirty_log),
+ 	/* no_af should also lead to a PT write. */
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, uffd_data_read_handler, 2,
+ 				guest_check_no_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, uffd_data_read_handler,
+ 				2, guest_check_no_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, 0, 1,
+ 				guest_check_no_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, uffd_data_read_handler, 2,
+ 				guest_check_no_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, uffd_data_write_handler,
+ 				2, guest_check_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, uffd_data_read_handler, 2,
+ 				guest_check_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, uffd_data_write_handler,
+ 				2, guest_check_write_in_dirty_log),
+ 	TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af,
+ 				uffd_data_write_handler, 2,
+ 				guest_check_write_in_dirty_log),
+ 
+ 	/*
+ 	 * Try accesses when the data memory region is marked read-only
+ 	 * (with KVM_MEM_READONLY). Writes with a syndrome result in an
+ 	 * MMIO exit, writes with no syndrome (e.g., CAS) result in a
+ 	 * failed vcpu run, and reads/execs with and without syndromes do
+ 	 * not fault.
+ 	 */
+ 	TEST_RO_MEMSLOT(guest_read64, 0, 0),
+ 	TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0),
+ 	TEST_RO_MEMSLOT(guest_at, 0, 0),
+ 	TEST_RO_MEMSLOT(guest_exec, 0, 0),
+ 	TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx),
+ 
+ 	/*
+ 	 * Access when the data region is both read-only and marked
+ 	 * for dirty logging at the same time. The expected result is that
+ 	 * for writes there should be no write in the dirty log. The
+ 	 * readonly handling is the same as if the memslot was not marked
+ 	 * for dirty logging: writes with a syndrome result in an MMIO
+ 	 * exit, and writes with no syndrome result in a failed vcpu run.
+ 	 */
+ 	TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0,
+ 				      guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0,
+ 				      guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0,
+ 				      guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0,
+ 				      guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler,
+ 				      1, guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva,
+ 						  guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas,
+ 						  guest_check_no_write_in_dirty_log),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx,
+ 						  guest_check_no_write_in_dirty_log),
+ 
+ 	/*
+ 	 * Access when the data region is both read-only and punched with
+ 	 * holes tracked with userfaultfd.  The expected result is the
+ 	 * union of both userfaultfd and read-only behaviors. For example,
+ 	 * write accesses result in a userfaultfd write fault and an MMIO
+ 	 * exit.  Writes with no syndrome result in a failed vcpu run and
+ 	 * no userfaultfd write fault. Reads result in userfaultfd getting
+ 	 * triggered.
+ 	 */
+ 	TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0,
+ 				 uffd_data_read_handler, 2),
+ 	TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0,
+ 				 uffd_data_read_handler, 2),
+ 	TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0,
+ 				 uffd_no_handler, 1),
+ 	TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0,
+ 				 uffd_data_read_handler, 2),
+ 	TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1,
+ 				 uffd_data_write_handler, 2),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas,
+ 					     uffd_data_read_handler, 2),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva,
+ 					     uffd_no_handler, 1),
+ 	TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx,
+ 					     uffd_no_handler, 1),
+ 
+ 	{ 0 }
+ };
+ 
+ /*
+  * Walk tests[] up to its terminating entry (the one with a NULL name) and
+  * hand every entry whose ->skip is not set to run_test() through
+  * for_each_guest_mode(), using src_type as the memory backing source.
+  */
+ static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type)
+ {
+ 	struct test_desc *t;
+ 
+ 	for (t = &tests[0]; t->name; t++) {
+ 		if (t->skip)
+ 			continue;
+ 
+ 		/* Per-test parameter bundle passed to run_test() as its arg. */
+ 		struct test_params p = {
+ 			.src_type = src_type,
+ 			.test_desc = t,
+ 		};
+ 
+ 		for_each_guest_mode(run_test, &p);
+ 	}
+ }
+ 
+ /*
+  * Entry point.  Options:
+  *   -m <arg>  forwarded to guest_modes_cmdline()
+  *   -s <arg>  memory backing source, parsed by parse_backing_src_type()
+  *   -h        print help and exit
+  * Any unrecognized option also prints help and exits with status 0.
+  */
+ int main(int argc, char *argv[])
+ {
+ 	enum vm_mem_backing_src_type src_type;
+ 	int opt;
+ 
+ 	/* Make stdout unbuffered. */
+ 	setbuf(stdout, NULL);
+ 
+ 	src_type = DEFAULT_VM_MEM_SRC;
+ 
+ 	while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
+ 		switch (opt) {
+ 		case 'm':
+ 			guest_modes_cmdline(optarg);
+ 			break;
+ 		case 's':
+ 			src_type = parse_backing_src_type(optarg);
+ 			break;
+ 		case 'h':
+ 		default:
+ 			help(argv[0]);
+ 			exit(0);
+ 		}
+ 	}
+ 
+ 	for_each_test_and_guest_mode(src_type);
+ 	return 0;
+ }
diff --cc tools/testing/selftests/kvm/access_tracking_perf_test.c
index 02d3587cab0a3,942370d573925..57a16371e9c29
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@@ -211,7 -208,7 +208,7 @@@ static bool spin_wait_for_next_iteratio
  	int last_iteration = *current_iteration;
  
  	do {
- 		if (READ_ONCE(done))
 -		if (READ_ONCE(perf_test_args.stop_vcpus))
++		if (READ_ONCE(memstress_args.stop_vcpus))
  			return false;
  
  		*current_iteration = READ_ONCE(iteration);
@@@ -321,11 -318,8 +318,8 @@@ static void run_test(enum vm_guest_mod
  	mark_memory_idle(vm, nr_vcpus);
  	access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");
  
- 	/* Set done to signal the vCPU threads to exit */
- 	done = true;
- 
 -	perf_test_join_vcpu_threads(nr_vcpus);
 -	perf_test_destroy_vm(vm);
 +	memstress_join_vcpu_threads(nr_vcpus);
 +	memstress_destroy_vm(vm);
  }
  
  static void help(char *name)
diff --cc tools/testing/selftests/kvm/demand_paging_test.c
index 3a977ddf07b20,8e1fe4ffcccdf..b0e1fc4de9e29
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@@ -20,8 -20,9 +20,9 @@@
  
  #include "kvm_util.h"
  #include "test_util.h"
 -#include "perf_test_util.h"
 +#include "memstress.h"
  #include "guest_modes.h"
+ #include "userfaultfd_util.h"
  
  #ifdef __NR_userfaultfd
  
@@@ -270,22 -129,13 +129,13 @@@ static void prefault_mem(void *alias, u
  static void run_test(enum vm_guest_mode mode, void *arg)
  {
  	struct test_params *p = arg;
- 	pthread_t *uffd_handler_threads = NULL;
- 	struct uffd_handler_args *uffd_args = NULL;
+ 	struct uffd_desc **uffd_descs = NULL;
  	struct timespec start;
  	struct timespec ts_diff;
- 	int *pipefds = NULL;
  	struct kvm_vm *vm;
- 	int r, i;
+ 	int i;
  
 -	vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
 +	vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
  				 p->src_type, p->partition_vcpu_memory_access);
  
  	demand_paging_size = get_backing_src_pagesz(p->src_type);
@@@ -296,18 -146,11 +146,11 @@@
  	memset(guest_data_prototype, 0xAB, demand_paging_size);
  
  	if (p->uffd_mode) {
- 		uffd_handler_threads =
- 			malloc(nr_vcpus * sizeof(*uffd_handler_threads));
- 		TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
- 
- 		uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
- 		TEST_ASSERT(uffd_args, "Memory allocation failed");
- 
- 		pipefds = malloc(sizeof(int) * nr_vcpus * 2);
- 		TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+ 		uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *));
+ 		TEST_ASSERT(uffd_descs, "Memory allocation failed");
  
  		for (i = 0; i < nr_vcpus; i++) {
 -			struct perf_test_vcpu_args *vcpu_args;
 +			struct memstress_vcpu_args *vcpu_args;
  			void *vcpu_hva;
  			void *vcpu_alias;
  
@@@ -317,19 -160,17 +160,17 @@@
  			vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
  			vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
  
+ 			prefault_mem(vcpu_alias,
 -				vcpu_args->pages * perf_test_args.guest_page_size);
++				vcpu_args->pages * memstress_args.guest_page_size);
+ 
  			/*
  			 * Set up user fault fd to handle demand paging
  			 * requests.
  			 */
- 			r = pipe2(&pipefds[i * 2],
- 				  O_CLOEXEC | O_NONBLOCK);
- 			TEST_ASSERT(!r, "Failed to set up pipefd");
- 
- 			setup_demand_paging(vm, &uffd_handler_threads[i],
- 					    pipefds[i * 2], p->uffd_mode,
- 					    p->uffd_delay, &uffd_args[i],
- 					    vcpu_hva, vcpu_alias,
- 					    vcpu_args->pages * memstress_args.guest_page_size);
+ 			uffd_descs[i] = uffd_setup_demand_paging(
+ 				p->uffd_mode, p->uffd_delay, vcpu_hva,
 -				vcpu_args->pages * perf_test_args.guest_page_size,
++				vcpu_args->pages * memstress_args.guest_page_size,
+ 				&handle_uffd_page_request);
  		}
  	}
  
@@@ -358,17 -193,14 +193,14 @@@
  	pr_info("Total guest execution time: %ld.%.9lds\n",
  		ts_diff.tv_sec, ts_diff.tv_nsec);
  	pr_info("Overall demand paging rate: %f pgs/sec\n",
 -		perf_test_args.vcpu_args[0].pages * nr_vcpus /
 +		memstress_args.vcpu_args[0].pages * nr_vcpus /
  		((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
  
 -	perf_test_destroy_vm(vm);
 +	memstress_destroy_vm(vm);
  
  	free(guest_data_prototype);
- 	if (p->uffd_mode) {
- 		free(uffd_handler_threads);
- 		free(uffd_args);
- 		free(pipefds);
- 	}
+ 	if (p->uffd_mode)
+ 		free(uffd_descs);
  }
  
  static void help(char *name)
diff --cc tools/testing/selftests/kvm/include/kvm_util_base.h
index c7685c7038ff0,b0da75af1ff33..37500c92dd0a6
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@@ -385,9 -406,12 +408,13 @@@ void vm_mem_region_set_flags(struct kvm
  void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
  void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
  struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
 +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
  vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
+ vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ 			    enum kvm_mem_region_type type);
  vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages);
+ vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm,
+ 				 enum kvm_mem_region_type type);
  vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm);
  
  void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
diff --cc tools/testing/selftests/kvm/include/memstress.h
index bbd2a302df100,0000000000000..72e3e358ef7bd
mode 100644,000000..100644
--- a/tools/testing/selftests/kvm/include/memstress.h
+++ b/tools/testing/selftests/kvm/include/memstress.h
@@@ -1,72 -1,0 +1,75 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * tools/testing/selftests/kvm/include/memstress.h
 + *
 + * Copyright (C) 2020, Google LLC.
 + */
 +
 +#ifndef SELFTEST_KVM_MEMSTRESS_H
 +#define SELFTEST_KVM_MEMSTRESS_H
 +
 +#include <pthread.h>
 +
 +#include "kvm_util.h"
 +
 +/* Default guest test virtual memory offset */
 +#define DEFAULT_GUEST_TEST_MEM		0xc0000000
 +
 +#define DEFAULT_PER_VCPU_MEM_SIZE	(1 << 30) /* 1G */
 +
 +#define MEMSTRESS_MEM_SLOT_INDEX	1
 +
 +/* Per-vCPU arguments; one element of memstress_args.vcpu_args[] per vCPU. */
 +struct memstress_vcpu_args {
 +	/* Guest physical and virtual base of the memory this vCPU accesses. */
 +	uint64_t gpa;
 +	uint64_t gva;
 +	/* Size of that memory, in units of memstress_args.guest_page_size. */
 +	uint64_t pages;
 +
 +	/* Only used by the host userspace part of the vCPU thread */
 +	struct kvm_vcpu *vcpu;
 +	int vcpu_idx;
 +};
 +
 +/* Test-wide configuration; a single global instance is shared with the guest. */
 +struct memstress_args {
 +	struct kvm_vm *vm;
 +	/* The starting address and size of the guest test region. */
 +	uint64_t gpa;
 +	uint64_t size;
 +	/* Page size of the guest mode; the stride of the guest's accesses. */
 +	uint64_t guest_page_size;
 +	/* Base seed; each vCPU seeds its generator with random_seed + vcpu_idx. */
 +	uint32_t random_seed;
 +	/* Percentage (0-100) of guest accesses that are writes rather than reads. */
 +	uint32_t write_percent;
 +
 +	/* Run vCPUs in L2 instead of L1, if the architecture supports it. */
 +	bool nested;
 +	/* Randomize which pages are accessed by the guest. */
 +	bool random_access;
 +	/* True if all vCPUs are pinned to pCPUs */
 +	bool pin_vcpus;
 +	/* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */
 +	uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS];
 +
++ 	/* Test is done, stop running vCPUs. */
++ 	bool stop_vcpus;
++
 +	struct memstress_vcpu_args vcpu_args[KVM_MAX_VCPUS];
 +};
 +
 +extern struct memstress_args memstress_args;
 +
 +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
 +				   uint64_t vcpu_memory_bytes, int slots,
 +				   enum vm_mem_backing_src_type backing_src,
 +				   bool partition_vcpu_memory_access);
 +void memstress_destroy_vm(struct kvm_vm *vm);
 +
 +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent);
 +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed);
 +void memstress_set_random_access(struct kvm_vm *vm, bool random_access);
 +
 +void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *));
 +void memstress_join_vcpu_threads(int vcpus);
 +void memstress_guest_code(uint32_t vcpu_id);
 +
 +uint64_t memstress_nested_pages(int nr_vcpus);
 +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]);
 +
 +#endif /* SELFTEST_KVM_MEMSTRESS_H */
diff --cc tools/testing/selftests/kvm/lib/kvm_util.c
index 1d26a21601785,e3daa97ab0f4a..e9607eb089bee
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@@ -335,10 -328,17 +329,18 @@@ struct kvm_vm *__vm_create(enum vm_gues
  {
  	uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
  						 nr_extra_pages);
 +	struct userspace_mem_region *slot0;
  	struct kvm_vm *vm;
+ 	int i;
+ 
+ 	pr_debug("%s: mode='%s' pages='%ld'\n", __func__,
+ 		 vm_guest_mode_string(mode), nr_pages);
  
- 	vm = ____vm_create(mode, nr_pages);
+ 	vm = ____vm_create(mode);
+ 
+ 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
+ 	for (i = 0; i < NR_MEM_REGIONS; i++)
+ 		vm->memslots[i] = 0;
  
  	kvm_vm_elf_load(vm, program_invocation_name);
  
diff --cc tools/testing/selftests/kvm/lib/memstress.c
index 2de8a5d527b3a,0000000000000..5f1d3173c238c
mode 100644,000000..100644
--- a/tools/testing/selftests/kvm/lib/memstress.c
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@@ -1,319 -1,0 +1,322 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * Copyright (C) 2020, Google LLC.
 + */
 +#define _GNU_SOURCE
 +
 +#include <inttypes.h>
 +
 +#include "kvm_util.h"
 +#include "memstress.h"
 +#include "processor.h"
 +
 +struct memstress_args memstress_args;
 +
 +/*
 + * Guest virtual memory offset of the testing memory slot.
 + * Must not conflict with identity mapped test code.
 + */
 +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
 +
 +/* Host-side bookkeeping for the pthread that drives one vCPU. */
 +struct vcpu_thread {
 +	/* The index of the vCPU. */
 +	int vcpu_idx;
 +
 +	/* The pthread backing the vCPU. */
 +	pthread_t thread;
 +
 +	/* Set to true once the vCPU thread is up and running. */
 +	bool running;
 +};
 +
 +/* The vCPU threads involved in this test. */
 +static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
 +
 +/* The function run by each vCPU thread, as provided by the test. */
 +static void (*vcpu_thread_fn)(struct memstress_vcpu_args *);
 +
 +/* Set to true once all vCPU threads are up and running. */
 +static bool all_vcpu_threads_running;
 +
 +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 +
 +/*
 + * Guest entry point: loop forever over this vCPU's memory, touching the
 + * first 8 bytes of each visited page.  Every pass visits 'pages' pages,
 + * either in order or at random (args->random_access); each touch is a write
 + * with a probability of args->write_percent percent and a read otherwise.
 + * GUEST_SYNC(1) is issued at the end of every pass.
 + */
 +void memstress_guest_code(uint32_t vcpu_idx)
 +{
 +	struct memstress_args *args = &memstress_args;
 +	struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx];
 +	struct guest_random_state rand_state;
 +	uint64_t gva;
 +	uint64_t pages;
 +	uint64_t addr;
 +	uint64_t page;
 +	/* Same width as 'pages': an int index would overflow past INT_MAX. */
 +	uint64_t i;
 +
 +	/* Give every vCPU its own random sequence. */
 +	rand_state = new_guest_random_state(args->random_seed + vcpu_idx);
 +
 +	gva = vcpu_args->gva;
 +	pages = vcpu_args->pages;
 +
 +	/* Make sure vCPU args data structure is not corrupt. */
 +	GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx);
 +
 +	while (true) {
 +		for (i = 0; i < pages; i++) {
 +			if (args->random_access)
 +				page = guest_random_u32(&rand_state) % pages;
 +			else
 +				page = i;
 +
 +			addr = gva + (page * args->guest_page_size);
 +
 +			if (guest_random_u32(&rand_state) % 100 < args->write_percent)
 +				*(uint64_t *)addr = 0x0123456789ABCDEF;
 +			else
 +				READ_ONCE(*(uint64_t *)addr);
 +		}
 +
 +		GUEST_SYNC(1);
 +	}
 +}
 +
 +/*
 + * Fill memstress_args.vcpu_args[i] for each of the nr_vcpus vCPUs.
 + *
 + * With partition_vcpu_memory_access each vCPU gets its own
 + * vcpu_memory_bytes-sized slice of the test region (offset by
 + * i * vcpu_memory_bytes); otherwise every vCPU is pointed at the whole
 + * nr_vcpus * vcpu_memory_bytes region.  Relies on args->gpa and
 + * args->guest_page_size having been set by the caller.
 + */
 +void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
 +			   struct kvm_vcpu *vcpus[],
 +			   uint64_t vcpu_memory_bytes,
 +			   bool partition_vcpu_memory_access)
 +{
 +	struct memstress_args *args = &memstress_args;
 +	struct memstress_vcpu_args *vcpu_args;
 +	int i;
 +
 +	for (i = 0; i < nr_vcpus; i++) {
 +		vcpu_args = &args->vcpu_args[i];
 +
 +		vcpu_args->vcpu = vcpus[i];
 +		vcpu_args->vcpu_idx = i;
 +
 +		if (partition_vcpu_memory_access) {
 +			vcpu_args->gva = guest_test_virt_mem +
 +					 (i * vcpu_memory_bytes);
 +			vcpu_args->pages = vcpu_memory_bytes /
 +					   args->guest_page_size;
 +			vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes);
 +		} else {
 +			vcpu_args->gva = guest_test_virt_mem;
 +			vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) /
 +					   args->guest_page_size;
 +			vcpu_args->gpa = args->gpa;
 +		}
 +
 +		/* Presumably hands 'i' to the guest entry point as vcpu_idx. */
 +		vcpu_args_set(vcpus[i], 1, i);
 +
 +		pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
 +			 i, vcpu_args->gpa, vcpu_args->gpa +
 +			 (vcpu_args->pages * args->guest_page_size));
 +	}
 +}
 +
 +/*
 + * Create a VM with nr_vcpus vCPUs that run memstress_guest_code().
 + *
 + * A test region of nr_vcpus * vcpu_memory_bytes is placed near the top of
 + * guest physical memory, split evenly across 'slots' memslots (indexes
 + * starting at MEMSTRESS_MEM_SLOT_INDEX) backed by backing_src, and mapped at
 + * guest_test_virt_mem.  The per-vCPU arguments are then set up and the whole
 + * memstress_args structure is synced to the guest.
 + *
 + * vcpu_memory_bytes must be a multiple of both the host and guest page size,
 + * and the resulting page count must be divisible by 'slots'.
 + *
 + * Returns the new VM.
 + */
 +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
 +				   uint64_t vcpu_memory_bytes, int slots,
 +				   enum vm_mem_backing_src_type backing_src,
 +				   bool partition_vcpu_memory_access)
 +{
 +	struct memstress_args *args = &memstress_args;
 +	struct kvm_vm *vm;
 +	uint64_t guest_num_pages, slot0_pages = 0;
 +	uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
 +	uint64_t region_end_gfn;
 +	int i;
 +
 +	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
 +
 +	/* By default vCPUs will write to memory. */
 +	args->write_percent = 100;
 +
 +	/*
 +	 * Snapshot the non-huge page size.  This is used by the guest code to
 +	 * access/dirty pages at the logging granularity.
 +	 */
 +	args->guest_page_size = vm_guest_mode_params[mode].page_size;
 +
 +	guest_num_pages = vm_adjust_num_guest_pages(mode,
 +				(nr_vcpus * vcpu_memory_bytes) / args->guest_page_size);
 +
 +	TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0,
 +		    "Guest memory size is not host page size aligned.");
 +	TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0,
 +		    "Guest memory size is not guest page size aligned.");
 +	TEST_ASSERT(guest_num_pages % slots == 0,
 +		    "Guest memory cannot be evenly divided into %d slots.",
 +		    slots);
 +
 +	/*
 +	 * If using nested, allocate extra pages for the nested page tables and
 +	 * in-memory data structures.
 +	 */
 +	if (args->nested)
 +		slot0_pages += memstress_nested_pages(nr_vcpus);
 +
 +	/*
 +	 * Pass guest_num_pages to populate the page tables for test memory.
 +	 * The memory is also added to memslot 0, but that's a benign side
 +	 * effect as KVM allows aliasing HVAs in memslots.
 +	 */
 +	vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages,
 +				    memstress_guest_code, vcpus);
 +
 +	args->vm = vm;
 +
 +	/* Put the test region at the top guest physical memory. */
 +	region_end_gfn = vm->max_gfn + 1;
 +
 +#ifdef __x86_64__
 +	/*
 +	 * When running vCPUs in L2, restrict the test region to 48 bits to
 +	 * avoid needing 5-level page tables to identity map L2.
 +	 */
 +	if (args->nested)
 +		region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size);
 +#endif
 +	/*
 +	 * If there should be more memory in the guest test region than there
 +	 * can be pages in the guest, it will definitely cause problems.
 +	 */
 +	TEST_ASSERT(guest_num_pages < region_end_gfn,
 +		    "Requested more guest memory than address space allows.\n"
 +		    "    guest pages: %" PRIx64 " max gfn: %" PRIx64
 +		    " nr_vcpus: %d wss: %" PRIx64 "]\n",
 +		    guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes);
 +
 +	/* Start of the region, rounded down to the backing source page size. */
 +	args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size;
 +	args->gpa = align_down(args->gpa, backing_src_pagesz);
 +#ifdef __s390x__
 +	/* Align to 1M (segment size) */
 +	args->gpa = align_down(args->gpa, 1 << 20);
 +#endif
 +	args->size = guest_num_pages * args->guest_page_size;
 +	pr_info("guest physical test memory: [0x%lx, 0x%lx)\n",
 +		args->gpa, args->gpa + args->size);
 +
 +	/* Add extra memory slots for testing */
 +	for (i = 0; i < slots; i++) {
 +		uint64_t region_pages = guest_num_pages / slots;
 +		vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i;
 +
 +		vm_userspace_mem_region_add(vm, backing_src, region_start,
 +					    MEMSTRESS_MEM_SLOT_INDEX + i,
 +					    region_pages, 0);
 +	}
 +
 +	/* Do mapping for the demand paging memory slot */
 +	virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages);
 +
 +	memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes,
 +			      partition_vcpu_memory_access);
 +
 +	if (args->nested) {
 +		pr_info("Configuring vCPUs to run in L2 (nested).\n");
 +		memstress_setup_nested(vm, nr_vcpus, vcpus);
 +	}
 +
 +	/* Export the shared variables to the guest. */
 +	sync_global_to_guest(vm, memstress_args);
 +
 +	return vm;
 +}
 +
 +/* Release a VM obtained from memstress_create_vm() via kvm_vm_free(). */
 +void memstress_destroy_vm(struct kvm_vm *vm)
 +{
 +	kvm_vm_free(vm);
 +}
 +
 +/* Set the host copy of write_percent and sync that field to the guest. */
 +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent)
 +{
 +	memstress_args.write_percent = write_percent;
 +	sync_global_to_guest(vm, memstress_args.write_percent);
 +}
 +
 +/* Set the host copy of random_seed and sync that field to the guest. */
 +void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed)
 +{
 +	memstress_args.random_seed = random_seed;
 +	sync_global_to_guest(vm, memstress_args.random_seed);
 +}
 +
 +/* Set the host copy of random_access and sync that field to the guest. */
 +void memstress_set_random_access(struct kvm_vm *vm, bool random_access)
 +{
 +	memstress_args.random_access = random_access;
 +	sync_global_to_guest(vm, memstress_args.random_access);
 +}
 +
 +/*
 + * Weak default: no extra slot-0 pages are requested for nested setup.
 + * Being __weak, a non-weak definition elsewhere replaces this one.
 + */
 +uint64_t __weak memstress_nested_pages(int nr_vcpus)
 +{
 +	return 0;
 +}
 +
 +/*
 + * Weak default: nested setup is unavailable, so print a notice and end the
 + * whole test program with KSFT_SKIP.  Does not return.
 + */
 +void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus)
 +{
 +	pr_info("%s() not support on this architecture, skipping.\n", __func__);
 +	exit(KSFT_SKIP);
 +}
 +
 +/*
 + * pthread entry point for one vCPU thread; 'data' is its struct vcpu_thread.
 + * Optionally pins the thread, announces itself as running, waits for the
 + * global go-ahead and then calls the test-provided vcpu_thread_fn with this
 + * vCPU's arguments.  Always returns NULL.
 + */
 +static void *vcpu_thread_main(void *data)
 +{
 +	struct vcpu_thread *vcpu = data;
 +	int vcpu_idx = vcpu->vcpu_idx;
 +
 +	if (memstress_args.pin_vcpus)
 +		kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]);
 +
 +	/* Tell memstress_start_vcpu_threads() that this thread exists. */
 +	WRITE_ONCE(vcpu->running, true);
 +
 +	/*
 +	 * Wait for all vCPU threads to be up and running before calling the test-
 +	 * provided vCPU thread function. This prevents thread creation (which
 +	 * requires taking the mmap_sem in write mode) from interfering with the
 +	 * guest faulting in its memory.
 +	 */
 +	while (!READ_ONCE(all_vcpu_threads_running))
 +		;
 +
 +	vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]);
 +
 +	return NULL;
 +}
 +
 +/*
 + * Spawn one thread per vCPU that will run vcpu_fn, and return only after
 + * every thread has reported itself running.  The threads are released to
 + * call vcpu_fn together, once all of them exist.  Also clears
 + * memstress_args.stop_vcpus so a previous run's stop request is forgotten.
 + */
 +void memstress_start_vcpu_threads(int nr_vcpus,
 +				  void (*vcpu_fn)(struct memstress_vcpu_args *))
 +{
 +	int i, r;
 +
 +	vcpu_thread_fn = vcpu_fn;
 +	WRITE_ONCE(all_vcpu_threads_running, false);
++	WRITE_ONCE(memstress_args.stop_vcpus, false);
 +
 +	for (i = 0; i < nr_vcpus; i++) {
 +		struct vcpu_thread *vcpu = &vcpu_threads[i];
 +
 +		vcpu->vcpu_idx = i;
 +		WRITE_ONCE(vcpu->running, false);
 +
 +		/*
 +		 * A thread that failed to start never sets ->running, which
 +		 * would make the wait loop below spin forever; fail loudly.
 +		 */
 +		r = pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
 +		TEST_ASSERT(!r, "pthread_create() failed for vCPU %d: %d", i, r);
 +	}
 +
 +	/* Busy-wait until each thread has checked in. */
 +	for (i = 0; i < nr_vcpus; i++) {
 +		while (!READ_ONCE(vcpu_threads[i].running))
 +			;
 +	}
 +
 +	WRITE_ONCE(all_vcpu_threads_running, true);
 +}
 +
 +/*
 + * Raise memstress_args.stop_vcpus and wait for the first nr_vcpus vCPU
 + * threads to exit.  The flag is only a request: the test's vcpu_fn is
 + * presumably expected to poll it and return.
 + */
 +void memstress_join_vcpu_threads(int nr_vcpus)
 +{
 +	int i;
 +
++	WRITE_ONCE(memstress_args.stop_vcpus, true);
++
 +	for (i = 0; i < nr_vcpus; i++)
 +		pthread_join(vcpu_threads[i].thread, NULL);
 +}
diff --cc tools/testing/selftests/kvm/lib/userfaultfd_util.c
index 0000000000000,3b44846fc277e..92cef20902f1f
mode 000000,100644..100644
--- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c
+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
@@@ -1,0 -1,186 +1,186 @@@
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  * KVM userfaultfd util
+  * Adapted from demand_paging_test.c
+  *
+  * Copyright (C) 2018, Red Hat, Inc.
+  * Copyright (C) 2019-2022 Google LLC
+  */
+ 
+ #define _GNU_SOURCE /* for pipe2 */
+ 
+ #include <inttypes.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <time.h>
+ #include <poll.h>
+ #include <pthread.h>
+ #include <linux/userfaultfd.h>
+ #include <sys/syscall.h>
+ 
+ #include "kvm_util.h"
+ #include "test_util.h"
 -#include "perf_test_util.h"
++#include "memstress.h"
+ #include "userfaultfd_util.h"
+ 
+ #ifdef __NR_userfaultfd
+ 
+ /*
+  * Thread body servicing one userfaultfd; 'arg' is the struct uffd_desc.
+  *
+  * Polls the uffd together with the read end of the descriptor's pipe.  Each
+  * page-fault message is passed to uffd_desc->handler, after sleeping for
+  * uffd_desc->delay microseconds when that is non-zero.  A byte arriving on
+  * the pipe is the stop request: the loop ends and the number of handled
+  * faults and the fault rate are reported.  Any error, including a negative
+  * return from the handler, ends the thread immediately without a report.
+  *
+  * Always returns NULL.
+  */
+ static void *uffd_handler_thread_fn(void *arg)
+ {
+ 	struct uffd_desc *uffd_desc = (struct uffd_desc *)arg;
+ 	int uffd = uffd_desc->uffd;
+ 	int pipefd = uffd_desc->pipefds[0];
+ 	useconds_t delay = uffd_desc->delay;
+ 	int64_t pages = 0;
+ 	struct timespec start;
+ 	struct timespec ts_diff;
+ 
+ 	clock_gettime(CLOCK_MONOTONIC, &start);
+ 	while (1) {
+ 		struct uffd_msg msg;
+ 		struct pollfd pollfd[2];
+ 		char tmp_chr;
+ 		int r;
+ 
+ 		pollfd[0].fd = uffd;
+ 		pollfd[0].events = POLLIN;
+ 		pollfd[1].fd = pipefd;
+ 		pollfd[1].events = POLLIN;
+ 
+ 		r = poll(pollfd, 2, -1);
+ 		switch (r) {
+ 		case -1:
+ 			pr_info("poll err");
+ 			continue;
+ 		case 0:
+ 			continue;
+ 		case 1:
+ 		case 2:
+ 			/* One or both of the two polled fds are ready. */
+ 			break;
+ 		default:
+ 			pr_info("Polling uffd returned %d", r);
+ 			return NULL;
+ 		}
+ 
+ 		if (pollfd[0].revents & POLLERR) {
+ 			pr_info("uffd revents has POLLERR");
+ 			return NULL;
+ 		}
+ 
+ 		if (pollfd[1].revents & POLLIN) {
+ 			r = read(pollfd[1].fd, &tmp_chr, 1);
+ 			TEST_ASSERT(r == 1,
+ 				    "Error reading pipefd in UFFD thread\n");
+ 			/* Stop request: leave the loop so the stats get printed. */
+ 			break;
+ 		}
+ 
+ 		if (!(pollfd[0].revents & POLLIN))
+ 			continue;
+ 
+ 		r = read(uffd, &msg, sizeof(msg));
+ 		if (r == -1) {
+ 			/* The uffd is non-blocking; nothing to read after all. */
+ 			if (errno == EAGAIN)
+ 				continue;
+ 			pr_info("Read of uffd got errno %d\n", errno);
+ 			return NULL;
+ 		}
+ 
+ 		if (r != sizeof(msg)) {
+ 			pr_info("Read on uffd returned unexpected size: %d bytes", r);
+ 			return NULL;
+ 		}
+ 
+ 		if (!(msg.event & UFFD_EVENT_PAGEFAULT))
+ 			continue;
+ 
+ 		if (delay)
+ 			usleep(delay);
+ 		r = uffd_desc->handler(uffd_desc->uffd_mode, uffd, &msg);
+ 		if (r < 0)
+ 			return NULL;
+ 		pages++;
+ 	}
+ 
+ 	ts_diff = timespec_elapsed(start);
+ 	/* tv_nsec counts nanoseconds: 1e9 of them per second. */
+ 	PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
+ 		       pages, ts_diff.tv_sec, ts_diff.tv_nsec,
+ 		       pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+ 
+ 	return NULL;
+ }
+ 
+ /*
+  * Create a userfaultfd, register [hva, hva + len) with it in uffd_mode and
+  * start a thread that passes every page fault on that range to 'handler',
+  * sleeping 'delay' microseconds before each call when delay is non-zero.
+  *
+  * Any failure trips a TEST_ASSERT.  Returns a malloc'ed descriptor that
+  * uffd_stop_demand_paging() tears down and frees.
+  */
+ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay,
+ 					   void *hva, uint64_t len,
+ 					   uffd_handler_t handler)
+ {
+ 	struct uffd_desc *uffd_desc;
+ 	bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
+ 	int uffd;
+ 	struct uffdio_api uffdio_api;
+ 	struct uffdio_register uffdio_register;
+ 	uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY;
+ 	int ret;
+ 
+ 	PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n",
+ 		       is_minor ? "MINOR" : "MISSING",
+ 		       is_minor ? "UFFDIO_CONTINUE" : "UFFDIO_COPY");
+ 
+ 	uffd_desc = malloc(sizeof(struct uffd_desc));
+ 	TEST_ASSERT(uffd_desc, "malloc failed");
+ 
+ 	/* Minor faults are resolved with UFFDIO_CONTINUE instead of UFFDIO_COPY. */
+ 	if (is_minor)
+ 		expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE;
+ 
+ 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ 	TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
+ 
+ 	uffdio_api.api = UFFD_API;
+ 	uffdio_api.features = 0;
+ 	TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
+ 		    "ioctl UFFDIO_API failed: %" PRIu64,
+ 		    (uint64_t)uffdio_api.api);
+ 
+ 	uffdio_register.range.start = (uint64_t)hva;
+ 	uffdio_register.range.len = len;
+ 	uffdio_register.mode = uffd_mode;
+ 	TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1,
+ 		    "ioctl UFFDIO_REGISTER failed");
+ 	TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
+ 		    expected_ioctls, "missing userfaultfd ioctls");
+ 
+ 	/* Pipe used by uffd_stop_demand_paging() to wake the handler thread. */
+ 	ret = pipe2(uffd_desc->pipefds, O_CLOEXEC | O_NONBLOCK);
+ 	TEST_ASSERT(!ret, "Failed to set up pipefd");
+ 
+ 	uffd_desc->uffd_mode = uffd_mode;
+ 	uffd_desc->uffd = uffd;
+ 	uffd_desc->delay = delay;
+ 	uffd_desc->handler = handler;
+ 	/*
+ 	 * The thread is joined in uffd_stop_demand_paging(), so a failed
+ 	 * creation must not go unnoticed.
+ 	 */
+ 	ret = pthread_create(&uffd_desc->thread, NULL, uffd_handler_thread_fn,
+ 			     uffd_desc);
+ 	TEST_ASSERT(!ret, "pthread_create() failed: %d", ret);
+ 
+ 	PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+ 		       hva, hva + len);
+ 
+ 	return uffd_desc;
+ }
+ 
+ /*
+  * Stop and tear down a descriptor returned by uffd_setup_demand_paging():
+  * write one byte to the pipe to make the handler thread exit, join it,
+  * close the uffd and both pipe ends, and free the descriptor.  The pointer
+  * must not be used afterwards.
+  */
+ void uffd_stop_demand_paging(struct uffd_desc *uffd)
+ {
+ 	char c = 0;
+ 	int ret;
+ 
+ 	/* The handler thread polls the read end and quits when it is readable. */
+ 	ret = write(uffd->pipefds[1], &c, 1);
+ 	TEST_ASSERT(ret == 1, "Unable to write to pipefd");
+ 
+ 	ret = pthread_join(uffd->thread, NULL);
+ 	TEST_ASSERT(ret == 0, "Pthread_join failed.");
+ 
+ 	close(uffd->uffd);
+ 
+ 	close(uffd->pipefds[1]);
+ 	close(uffd->pipefds[0]);
+ 
+ 	free(uffd);
+ }
+ 
+ #endif /* __NR_userfaultfd */
diff --cc tools/testing/selftests/kvm/memslot_modification_stress_test.c
index d07e921bfcc53,3a5e4518307c2..9855c41ca811f
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@@ -34,9 -34,7 +34,7 @@@
  static int nr_vcpus = 1;
  static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
  
- static bool run_vcpus = true;
- 
 -static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
 +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
  {
  	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
  	struct kvm_run *run;
@@@ -45,7 -43,7 +43,7 @@@
  	run = vcpu->run;
  
  	/* Let the guest access its memory until a stop signal is received */
- 	while (READ_ONCE(run_vcpus)) {
 -	while (!READ_ONCE(perf_test_args.stop_vcpus)) {
++	while (!READ_ONCE(memstress_args.stop_vcpus)) {
  		ret = _vcpu_run(vcpu);
  		TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
  
@@@ -107,14 -105,13 +105,12 @@@ static void run_test(enum vm_guest_mod
  
  	pr_info("Started all vCPUs\n");
  
 -	add_remove_memslot(vm, p->memslot_modification_delay,
 -			   p->nr_memslot_modifications);
 +	add_remove_memslot(vm, p->delay, p->nr_iterations);
  
- 	run_vcpus = false;
- 
 -	perf_test_join_vcpu_threads(nr_vcpus);
 +	memstress_join_vcpu_threads(nr_vcpus);
  	pr_info("All vCPU threads joined\n");
  
 -	perf_test_destroy_vm(vm);
 +	memstress_destroy_vm(vm);
  }
  
  static void help(char *name)
diff --cc tools/testing/selftests/kvm/memslot_perf_test.c
index 36b20abfb948e,2ad40f7c9c08e..e698306bf49d1
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@@ -252,37 -284,33 +284,34 @@@ static bool prepare_vm(struct vm_data *
  	struct timespec tstart;
  	struct sync_area *sync;
  
- 	max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
- 	TEST_ASSERT(max_mem_slots > 1,
- 		    "KVM_CAP_NR_MEMSLOTS should be greater than 1");
- 	TEST_ASSERT(nslots > 1 || nslots == -1,
- 		    "Slot count cap should be greater than 1");
- 	if (nslots != -1)
- 		max_mem_slots = min(max_mem_slots, (uint32_t)nslots);
- 	pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots);
+ 	host_page_size = getpagesize();
+ 	guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+ 	mempages = mem_size / guest_page_size;
  
- 	TEST_ASSERT(mempages > 1,
- 		    "Can't test without any memory");
+ 	data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
 -	ucall_init(data->vm, NULL);
+ 	TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size");
  
  	data->npages = mempages;
- 	data->nslots = max_mem_slots - 1;
- 	data->pages_per_slot = mempages / data->nslots;
- 	if (!data->pages_per_slot) {
- 		*maxslots = mempages + 1;
+ 	TEST_ASSERT(data->npages > 1, "Can't test without any memory");
+ 	data->nslots = nslots;
+ 	data->pages_per_slot = data->npages / data->nslots;
+ 	rempages = data->npages % data->nslots;
+ 	if (!check_slot_pages(host_page_size, guest_page_size,
+ 			      data->pages_per_slot, rempages)) {
+ 		*maxslots = get_max_slots(data, host_page_size);
  		return false;
  	}
  
  	data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
  	TEST_ASSERT(data->hva_slots, "malloc() fail");
  
 +	data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
 +
  	pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
- 		max_mem_slots - 1, data->pages_per_slot, rempages);
+ 		data->nslots, data->pages_per_slot, rempages);
  
  	clock_gettime(CLOCK_MONOTONIC, &tstart);
- 	for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) {
+ 	for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) {
  		uint64_t npages;
  
  		npages = data->pages_per_slot;
@@@ -884,9 -966,9 +967,9 @@@ static bool parse_args(int argc, char *
  			map_unmap_verify = true;
  			break;
  		case 's':
 -			targs->nslots = atoi(optarg);
 +			targs->nslots = atoi_paranoid(optarg);
- 			if (targs->nslots <= 0 && targs->nslots != -1) {
- 				pr_info("Slot count cap has to be positive or -1 for no cap\n");
+ 			if (targs->nslots <= 1 && targs->nslots != -1) {
+ 				pr_info("Slot count cap must be larger than 1 or -1 for no cap\n");
  				return false;
  			}
  			break;
@@@ -994,6 -1103,12 +1092,9 @@@ int main(int argc, char *argv[]
  	struct test_result rbestslottime;
  	int tctr;
  
 -	/* Tell stdout not to buffer its content */
 -	setbuf(stdout, NULL);
 -
+ 	if (!check_memory_sizes())
+ 		return -1;
+ 
  	if (!parse_args(argc, argv, &targs))
  		return -1;