perf metrics: Compute unmerged uncore metrics individually
authorIan Rogers <irogers@google.com>
Wed, 21 Feb 2024 07:07:53 +0000 (23:07 -0800)
committerNamhyung Kim <namhyung@kernel.org>
Thu, 22 Feb 2024 16:57:09 +0000 (08:57 -0800)
When merging counts from multiple uncore PMUs the metric is only
computed for the metric leader. When merging/aggregation is disabled,
prior to this patch just the leader's metric would be computed. Fix
this by computing the metric for each PMU.

On a SkylakeX:
Before:
```
$ perf stat -A -M memory_bandwidth_total -a sleep 1

 Performance counter stats for 'system wide':

CPU0               82,217      UNC_M_CAS_COUNT.RD [uncore_imc_0] #      9.2 MB/s  memory_bandwidth_total
CPU18                   0      UNC_M_CAS_COUNT.RD [uncore_imc_0] #      0.0 MB/s  memory_bandwidth_total
CPU0               61,395      UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU18                   0      UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU0                    0      UNC_M_CAS_COUNT.RD [uncore_imc_1]
CPU18                   0      UNC_M_CAS_COUNT.RD [uncore_imc_1]
CPU0                    0      UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU18                   0      UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU0               81,570      UNC_M_CAS_COUNT.RD [uncore_imc_2]
CPU18             113,886      UNC_M_CAS_COUNT.RD [uncore_imc_2]
CPU0               62,330      UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU18              66,942      UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU0               75,489      UNC_M_CAS_COUNT.RD [uncore_imc_3]
CPU18              27,958      UNC_M_CAS_COUNT.RD [uncore_imc_3]
CPU0               55,864      UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU18              38,727      UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU0                    0      UNC_M_CAS_COUNT.RD [uncore_imc_4]
CPU18                   0      UNC_M_CAS_COUNT.RD [uncore_imc_4]
CPU0                    0      UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU18                   0      UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU0               75,423      UNC_M_CAS_COUNT.RD [uncore_imc_5]
CPU18             104,527      UNC_M_CAS_COUNT.RD [uncore_imc_5]
CPU0               57,596      UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU18              56,777      UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU0        1,003,440,851 ns   duration_time

       1.003440851 seconds time elapsed
```

After:
```
$ perf stat -A -M memory_bandwidth_total -a sleep 1

 Performance counter stats for 'system wide':

CPU0               88,968      UNC_M_CAS_COUNT.RD [uncore_imc_0] #      9.5 MB/s  memory_bandwidth_total
CPU18                   0      UNC_M_CAS_COUNT.RD [uncore_imc_0] #      0.0 MB/s  memory_bandwidth_total
CPU0               59,498      UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU18                   0      UNC_M_CAS_COUNT.WR [uncore_imc_0]
CPU0                    0      UNC_M_CAS_COUNT.RD [uncore_imc_1] #      0.0 MB/s  memory_bandwidth_total
CPU18                   0      UNC_M_CAS_COUNT.RD [uncore_imc_1] #      0.0 MB/s  memory_bandwidth_total
CPU0                    0      UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU18                   0      UNC_M_CAS_COUNT.WR [uncore_imc_1]
CPU0               88,635      UNC_M_CAS_COUNT.RD [uncore_imc_2] #      9.5 MB/s  memory_bandwidth_total
CPU18             117,975      UNC_M_CAS_COUNT.RD [uncore_imc_2] #     11.5 MB/s  memory_bandwidth_total
CPU0               60,829      UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU18              62,105      UNC_M_CAS_COUNT.WR [uncore_imc_2]
CPU0               82,238      UNC_M_CAS_COUNT.RD [uncore_imc_3] #      8.7 MB/s  memory_bandwidth_total
CPU18              22,906      UNC_M_CAS_COUNT.RD [uncore_imc_3] #      3.6 MB/s  memory_bandwidth_total
CPU0               53,959      UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU18              32,990      UNC_M_CAS_COUNT.WR [uncore_imc_3]
CPU0                    0      UNC_M_CAS_COUNT.RD [uncore_imc_4] #      0.0 MB/s  memory_bandwidth_total
CPU18                   0      UNC_M_CAS_COUNT.RD [uncore_imc_4] #      0.0 MB/s  memory_bandwidth_total
CPU0                    0      UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU18                   0      UNC_M_CAS_COUNT.WR [uncore_imc_4]
CPU0               83,595      UNC_M_CAS_COUNT.RD [uncore_imc_5] #      8.9 MB/s  memory_bandwidth_total
CPU18             110,151      UNC_M_CAS_COUNT.RD [uncore_imc_5] #     10.5 MB/s  memory_bandwidth_total
CPU0               56,540      UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU18              53,816      UNC_M_CAS_COUNT.WR [uncore_imc_5]
CPU0        1,003,353,416 ns   duration_time
```

Signed-off-by: Ian Rogers <irogers@google.com> |
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Kaige Ye <ye@kaige.org>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: John Garry <john.g.garry@oracle.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240221070754.4163916-2-irogers@google.com
tools/perf/util/metricgroup.c
tools/perf/util/stat-shadow.c

index 966cca5a3e88cd94b78c27ce8429f820f1fa86b9..b24a1c177a80537eb7b6e7fdb820be43a5c295d5 100644 (file)
@@ -44,6 +44,8 @@ struct metric_event *metricgroup__lookup(struct rblist *metric_events,
        if (!metric_events)
                return NULL;
 
+       if (evsel->metric_leader)
+               me.evsel = evsel->metric_leader;
        nd = rblist__find(metric_events, &me);
        if (nd)
                return container_of(nd, struct metric_event, nd);
index 10b45279203701a1ea1a3adf92d0640b8bc48c84..3466aa9524421d734f5e4ef4e82979bb1954308d 100644 (file)
@@ -356,6 +356,7 @@ static void print_nsecs(struct perf_stat_config *config,
 }
 
 static int prepare_metric(const struct metric_expr *mexp,
+                         const struct evsel *evsel,
                          struct expr_parse_ctx *pctx,
                          int aggr_idx)
 {
@@ -399,8 +400,29 @@ static int prepare_metric(const struct metric_expr *mexp,
                        source_count = 1;
                } else {
                        struct perf_stat_evsel *ps = metric_events[i]->stats;
-                       struct perf_stat_aggr *aggr = &ps->aggr[aggr_idx];
+                       struct perf_stat_aggr *aggr;
 
+                       /*
+                        * If there are multiple uncore PMUs and we're not
+                        * reading the leader's stats, determine the stats for
+                        * the appropriate uncore PMU.
+                        */
+                       if (evsel && evsel->metric_leader &&
+                           evsel->pmu != evsel->metric_leader->pmu &&
+                           mexp->metric_events[i]->pmu == evsel->metric_leader->pmu) {
+                               struct evsel *pos;
+
+                               evlist__for_each_entry(evsel->evlist, pos) {
+                                       if (pos->pmu != evsel->pmu)
+                                               continue;
+                                       if (pos->metric_leader != mexp->metric_events[i])
+                                               continue;
+                                       ps = pos->stats;
+                                       source_count = 1;
+                                       break;
+                               }
+                       }
+                       aggr = &ps->aggr[aggr_idx];
                        if (!aggr)
                                break;
 
@@ -416,7 +438,8 @@ static int prepare_metric(const struct metric_expr *mexp,
                                source_count = 0;
                        } else {
                                val = aggr->counts.val;
-                               source_count = evsel__source_count(metric_events[i]);
+                               if (!source_count)
+                                       source_count = evsel__source_count(metric_events[i]);
                        }
                }
                n = strdup(evsel__metric_id(metric_events[i]));
@@ -463,7 +486,7 @@ static void generic_metric(struct perf_stat_config *config,
                pctx->sctx.user_requested_cpu_list = strdup(config->user_requested_cpu_list);
        pctx->sctx.runtime = runtime;
        pctx->sctx.system_wide = config->system_wide;
-       i = prepare_metric(mexp, pctx, aggr_idx);
+       i = prepare_metric(mexp, evsel, pctx, aggr_idx);
        if (i < 0) {
                expr__ctx_free(pctx);
                return;
@@ -524,7 +547,7 @@ double test_generic_metric(struct metric_expr *mexp, int aggr_idx)
        if (!pctx)
                return NAN;
 
-       if (prepare_metric(mexp, pctx, aggr_idx) < 0)
+       if (prepare_metric(mexp, /*evsel=*/NULL, pctx, aggr_idx) < 0)
                goto out;
 
        if (expr__parse(&ratio, pctx, mexp->metric_expr))