resctrl has one mutex that is taken by both the architecture-specific code
and the filesystem parts. The two interact via cpuhp, where the architecture
code updates the domain list. Filesystem handlers that walk the domain list
should not run concurrently with the cpuhp callback modifying the list.
Exposing a lock from the filesystem code means the interface is not cleanly
defined, and creates the possibility of cross-architecture lock ordering
headaches. The interaction only exists so that certain filesystem paths are
serialised against CPU hotplug. The CPU hotplug code already has a mechanism to
do this using cpus_read_lock().
MPAM's monitors have an overflow interrupt, so it needs to be possible to walk
the domains list in irq context. RCU is ideal for this, but some paths need to
be able to sleep to allocate memory.
Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp
callback, cpus_read_lock() must always be taken first.
rdtgroup_schemata_write() already does this.
Most of the filesystem code's domain list walkers are currently protected by
the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). The exceptions are
rdt_bit_usage_show() and the mon_config helpers which take the lock directly.
Make the domain list protected by RCU. An architecture-specific lock prevents
concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU,
but to keep all the filesystem operations the same, this is changed to call
cpus_read_lock(). The mon_config helpers send multiple IPIs; take
cpus_read_lock() in these cases.
The other filesystem list walkers need to be able to sleep. Add
cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't
be invoked when filesystem operations are occurring.
Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live()
call isn't obvious.
Resctrl's domain online/offline calls now need to take the rdtgroup_mutex
themselves.
[ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ]
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Babu Moger <babu.moger@amd.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Tested-by: Carl Worth <carl@os.amperecomputing.com> # arm64
Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
#define pr_fmt(fmt) "resctrl: " fmt
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/cacheinfo.h>
#include <asm/resctrl.h>
#include "internal.h"
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
/*
* The cached resctrl_pqr_state is strictly per CPU and can never be
{
struct rdt_domain *d;
+ /*
+ * Walking r->domains, ensure it can't race with cpuhp.
+ * Because this is called via IPI by rdt_ctrl_update(), assertions
+ * about locks this thread holds will lead to false positives. Check
+ * someone is holding the CPUs lock.
+ */
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && IS_ENABLED(CONFIG_LOCKDEP))
+ WARN_ON_ONCE(!lockdep_is_cpus_held());
+
list_for_each_entry(d, &r->domains, list) {
/* Find the domain that contains this CPU */
if (cpumask_test_cpu(cpu, &d->cpu_mask))
struct rdt_domain *d;
int err;
+ lockdep_assert_held(&domain_list_lock);
+
d = rdt_find_domain(r, id, &add_pos);
if (IS_ERR(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu);
return;
}
- list_add_tail(&d->list, add_pos);
+ list_add_tail_rcu(&d->list, add_pos);
err = resctrl_online_domain(r, d);
if (err) {
- list_del(&d->list);
+ list_del_rcu(&d->list);
+ synchronize_rcu();
domain_free(hw_dom);
}
}
struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
+ lockdep_assert_held(&domain_list_lock);
+
d = rdt_find_domain(r, id, NULL);
if (IS_ERR_OR_NULL(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu);
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
resctrl_offline_domain(r, d);
- list_del(&d->list);
+ list_del_rcu(&d->list);
+ synchronize_rcu();
/*
* rdt_domain "d" is going to be freed below, so clear
{
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_add_cpu(cpu, r);
- clear_closid_rmid(cpu);
+ mutex_unlock(&domain_list_lock);
+ clear_closid_rmid(cpu);
resctrl_online_cpu(cpu);
- mutex_unlock(&rdtgroup_mutex);
return 0;
}
{
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
resctrl_offline_cpu(cpu);
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
return 0;
}
struct rdt_domain *d;
unsigned long dom_id;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
struct rdt_domain *d;
u32 idx;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
return -EINVAL;
buf[nbytes - 1] = '\0';
- cpus_read_lock();
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
return -ENOENT;
}
rdt_last_cmd_clear();
out:
rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
return ret ?: nbytes;
}
bool sep = false;
u32 ctrl_val;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
seq_printf(s, "%*s:", max_name_width, schema->name);
list_for_each_entry(dom, &r->domains, list) {
if (sep)
{
int cpu;
+ /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
/*
* Setup the parameters to pass to mon_event_count() to read the data.
*/
* Software Developer Manual June 2016, volume 3, section 17.17.
*/
+#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>
lockdep_assert_held(&rdtgroup_mutex);
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
entry->busy = 0;
unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
struct rdt_domain *d;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
d = container_of(work, struct rdt_domain, cqm_limbo.work);
}
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
/**
struct rdt_resource *r;
struct rdt_domain *d;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
/*
out_unlock:
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
/**
struct rdt_domain *d_i;
bool ret = false;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
return true;
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
static struct kernfs_root *rdt_root;
struct rdtgroup rdtgroup_default;
LIST_HEAD(rdt_all_groups);
bool sep = false;
u32 ctrl_val;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
hw_shareable = r->cache.shareable_bits;
list_for_each_entry(dom, &r->domains, list) {
}
seq_putc(seq, '\n');
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return 0;
}
struct rdt_domain *d;
u32 ctrl;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
struct rdt_domain *dom;
bool sep = false;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
list_for_each_entry(dom, &r->domains, list) {
seq_puts(s, "\n");
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return 0;
}
unsigned long dom_id, val;
struct rdt_domain *d;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
next:
if (!tok || tok[0] == '\0')
return 0;
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
rdt_last_cmd_clear();
ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return ret ?: nbytes;
}
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
rdt_last_cmd_clear();
ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return ret ?: nbytes;
}
struct rdt_domain *d;
int cpu;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
rdtgroup_kn_get(rdtgrp, kn);
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
/* Was this group deleted while we waited? */
return;
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
rdtgroup_kn_put(rdtgrp, kn);
}
struct rdt_domain *d;
int i;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
struct rdt_domain *dom;
int ret;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
list_for_each_entry(dom, &r->domains, list) {
ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
if (ret)
void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
{
- lockdep_assert_held(&rdtgroup_mutex);
+ mutex_lock(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
mba_sc_domain_destroy(r, d);
if (!r->mon_capable)
- return;
+ goto out_unlock;
/*
* If resctrl is mounted, remove all the
}
domain_destroy_mon_state(d);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
}
static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
{
- int err;
+ int err = 0;
- lockdep_assert_held(&rdtgroup_mutex);
+ mutex_lock(&rdtgroup_mutex);
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
/* RDT_RESOURCE_MBA is never mon_capable */
- return mba_sc_domain_allocate(r, d);
+ err = mba_sc_domain_allocate(r, d);
+ goto out_unlock;
+ }
if (!r->mon_capable)
- return 0;
+ goto out_unlock;
err = domain_setup_mon_state(r, d);
if (err)
- return err;
+ goto out_unlock;
if (is_mbm_enabled()) {
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
if (resctrl_mounted && resctrl_arch_mon_capable())
mkdir_mondata_subdir_allrdtgrp(r, d);
- return 0;
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
}
void resctrl_online_cpu(unsigned int cpu)
{
- lockdep_assert_held(&rdtgroup_mutex);
-
+ mutex_lock(&rdtgroup_mutex);
/* The CPU is set in default rdtgroup after online. */
cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&rdtgroup_mutex);
}
static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
struct rdtgroup *rdtgrp;
struct rdt_domain *d;
- lockdep_assert_held(&rdtgroup_mutex);
-
+ mutex_lock(&rdtgroup_mutex);
list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
clear_childcpus(rdtgrp, cpu);
}
if (!l3->mon_capable)
- return;
+ goto out_unlock;
d = get_domain_from_cpu(cpu, l3);
if (d) {
cqm_setup_limbo_handler(d, 0, cpu);
}
}
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
}
/*
* @cache_level: Which cache level defines scope of this resource
* @cache: Cache allocation related data
* @membw: If the component has bandwidth controls, their properties.
- * @domains: All domains for this resource
+ * @domains: RCU list of all domains for this resource
* @name: Name to use in "schemata" file.
* @data_width: Character width of data when displaying
* @default_ctrl: Specifies default cache cbm or memory B/W percent.