return ret;
 }
 
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
 
 static int __init parse_disable_1tb_segments(char *p)
 {
 	disable_1tb_segments = true;
 	return 0;
 }
 early_param("disable_1tb_segments", parse_disable_1tb_segments);
 
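+/*
+ * "stress_hpt" boot parameter: aggressively remove HPTEs to increase the
+ * hash fault rate, for stress testing the hash MMU code.
+ */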
+static bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+       stress_hpt_enabled = true;
+       return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
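+/* Static key so the stress path costs nothing unless "stress_hpt" is given. */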
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * Per-CPU array, allocated only if stress_hpt is enabled.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
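+       /* Recently used HPTE groups, newest first; -1UL marks an unused slot. */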
+       unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+       /*
+        * LPAR H_REMOVE flushes the TLB, so more than one entry is needed
+        * to make practical forward progress. Bare metal returns 1, which
+        * seems to help uncover more bugs.
+        */
+       if (firmware_has_feature(FW_FEATURE_LPAR))
+               return STRESS_MAX_GROUPS;
+       else
+               return 1;
+}
+
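+/* NR_CPUS entries, allocated from memblock in htab_initialize(). */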
+static struct stress_hpt_struct *stress_hpt_struct;
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
                                         const char *uname, int depth,
                                         void *data)
        pr_info("Partition table %p\n", partition_tb);
 }
 
+static void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
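+/*
+ * Periodically flush the groups recorded on this CPU, then move the timer
+ * to the next online CPU so every CPU gets cleared in turn.
+ */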
+static void stress_hpt_timer_fn(struct timer_list *timer)
+{
+       int next_cpu;
+
+       hpt_clear_stress();
+       if (!firmware_has_feature(FW_FEATURE_LPAR))
+               tlbiel_all();
+
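+       /* Re-arm on the next online CPU, wrapping around at the end. */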
+       next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+       if (next_cpu >= nr_cpu_ids)
+               next_cpu = cpumask_first(cpu_online_mask);
+       stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+       add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
 static void __init htab_initialize(void)
 {
        unsigned long table;
        if (stress_slb_enabled)
                static_branch_enable(&stress_slb_key);
 
+       if (stress_hpt_enabled) {
+               unsigned long tmp;
+               static_branch_enable(&stress_hpt_key);
+               // Too early to use nr_cpu_ids, so use NR_CPUS
+               tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+                                               __alignof__(struct stress_hpt_struct),
+                                               0, MEMBLOCK_ALLOC_ANYWHERE);
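+               /* 0xff fill sets every last_group entry to -1UL, i.e. nothing recorded. */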
+               memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+               stress_hpt_struct = __va(tmp);
+
+               timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+               stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+               add_timer(&stress_hpt_timer);
+       }
+
        /*
         * Calculate the required size of the htab.  We want the number of
         * PTEGs to equal one half the number of real pages.
        return slot;
 }
 
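+/* Remove any HPTEs in the groups recorded for this CPU, then forget them. */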
+static void hpt_clear_stress(void)
+{
+       int cpu = raw_smp_processor_id();
+       int g;
+
+       for (g = 0; g < stress_nr_groups(); g++) {
+               unsigned long last_group;
+               last_group = stress_hpt_struct[cpu].last_group[g];
+
+               if (last_group != -1UL) {
+                       int i;
+                       for (i = 0; i < HPTES_PER_GROUP; i++) {
+                               if (mmu_hash_ops.hpte_remove(last_group) == -1)
+                                       break;
+                       }
+                       stress_hpt_struct[cpu].last_group[g] = -1;
+               }
+       }
+}
+
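+/*
+ * Evict the oldest group recorded on this CPU and, for kernel addresses
+ * (ea >= PAGE_OFFSET), record hpte_group so a later hash fault removes the
+ * newly inserted entry again.
+ */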
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+       unsigned long last_group;
+       int cpu = raw_smp_processor_id();
+
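+       /*
+        * If this is the group we are about to evict, leave it alone: removing
+        * the entry we just inserted would prevent forward progress.
+        */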
+       last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+       if (hpte_group == last_group)
+               return;
+
+       if (last_group != -1UL) {
+               int i;
+               /*
+                * Concurrent CPUs might be inserting into this group, so
+                * give up after a number of iterations, to prevent a live
+                * lock.
+                */
+               for (i = 0; i < HPTES_PER_GROUP; i++) {
+                       if (mmu_hash_ops.hpte_remove(last_group) == -1)
+                               break;
+               }
+               stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+       }
+
+       if (ea >= PAGE_OFFSET) {
+               /*
+                * We would really like to prefetch to get the TLB loaded, then
+                * remove the PTE before returning from fault interrupt, to
+                * increase the hash fault rate.
+                *
+                * Unfortunately QEMU TCG does not model the TLB in a way that
+                * makes this possible, and systemsim (mambo) emulator does not
+                * bring in TLBs with prefetches (although loads/stores do
+                * work for non-CI PTEs).
+                *
+                * So remember this PTE and clear it on the next hash fault.
+                */
+               memmove(&stress_hpt_struct[cpu].last_group[1],
+                       &stress_hpt_struct[cpu].last_group[0],
+                       (stress_nr_groups() - 1) * sizeof(unsigned long));
+               stress_hpt_struct[cpu].last_group[0] = hpte_group;
+       }
+}
+
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);