perf report: Support instruction latency
author Kan Liang <kan.liang@linux.intel.com>
Tue, 2 Feb 2021 20:09:10 +0000 (12:09 -0800)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Mon, 8 Feb 2021 19:25:00 +0000 (16:25 -0300)
The instruction latency information can be recorded on some platforms,
e.g., the Intel Sapphire Rapids server. With both memory latency
(weight) and the new instruction latency information, users can easily
locate the expensive load instructions, and also understand the time
spent in different stages. The users can optimize their applications in
different pipeline stages.

The 'weight' field is shared among different architectures. Reusing the
'weight' field may impact other architectures. Add a new field to store
the instruction latency.

Like the 'weight' support, introduce 'ins_lat' for the global
instruction latency and 'local_ins_lat' for the local instruction
latency.

Add new sort functions, INSTR Latency and Local INSTR Latency,
accordingly.

Add local_ins_lat to the default_mem_sort_order[].

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/1612296553-21962-7-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/Documentation/perf-report.txt
tools/perf/util/event.h
tools/perf/util/evsel.c
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/intel-pt.c
tools/perf/util/session.c
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/synthetic-events.c

index b9686a131ed8d4b64be8621a03b67df2fef06400..f546b5e9db056559ad808ee605d7ecd1900f9936 100644 (file)
@@ -109,6 +109,9 @@ OPTIONS
        - time: Separate the samples by time stamp with the resolution specified by
        --time-quantum (default 100ms). Specify with overhead and before it.
        - code_page_size: the code page size of sampled code address (ip)
+       - ins_lat: Instruction latency in core cycles. This is the global instruction
+         latency, i.e. the sum over all samples of the entry
+       - local_ins_lat: Local instruction latency, i.e. the average per sample
+         of the entry
 
        By default, comm, dso and symbol keys are used.
        (i.e. --sort comm,dso,symbol)
@@ -155,7 +158,8 @@ OPTIONS
        - blocked: reason of blocked load access for the data at the time of the sample
 
        And the default sort keys are changed to local_weight, mem, sym, dso,
-       symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, see '--mem-mode'.
+       symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat,
+       see '--mem-mode'.
 
        If the data file has tracepoint event(s), following (dynamic) sort keys
        are also available:
index 2afea7247dd30aa007e806a6fb4b2b9e79b20b0b..60752e4c97274e3fc259825b5b55786f2c054c34 100644 (file)
@@ -142,6 +142,7 @@ struct perf_sample {
        u16 insn_len;
        u8  cpumode;
        u16 misc;
+       u16 ins_lat;
        bool no_hw_idx;         /* No hw_idx collected in branch_stack */
        char insn[MAX_INSN];
        void *raw_data;
index fa49d15edc35add8ccc0f20e6a2123d90f9a1744..844aebd9c306d71b446fc33e3e9987685d68232b 100644 (file)
@@ -2352,8 +2352,10 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
                weight.full = *array;
                if (type & PERF_SAMPLE_WEIGHT)
                        data->weight = weight.full;
-               else
+               else {
                        data->weight = weight.var1_dw;
+                       data->ins_lat = weight.var2_w;
+               }
                array++;
        }
 
index 4038b086cb80dc84e8014376afe880a9c7579089..c82f5fc26af85e3cfb40f0a39c9bcce7fdc04391 100644 (file)
@@ -209,6 +209,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12);
        hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12);
        hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
+       hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
+       hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
        if (symbol_conf.nanosecs)
                hists__new_col_len(hists, HISTC_TIME, 16);
        else
@@ -287,12 +289,13 @@ static long hist_time(unsigned long htime)
 }
 
 static void he_stat__add_period(struct he_stat *he_stat, u64 period,
-                               u64 weight)
+                               u64 weight, u64 ins_lat)
 {
        /* Account one sample: accumulate its period, weight and ins_lat, and count one event. */
        he_stat->period         += period;
        he_stat->weight         += weight;
        he_stat->nr_events      += 1;
+       he_stat->ins_lat        += ins_lat;
 }
 
 static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -304,6 +307,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
        dest->period_guest_us   += src->period_guest_us;
        dest->nr_events         += src->nr_events;
        dest->weight            += src->weight;
+       dest->ins_lat           += src->ins_lat;
 }
 
 static void he_stat__decay(struct he_stat *he_stat)
@@ -592,6 +596,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
        int64_t cmp;
        u64 period = entry->stat.period;
        u64 weight = entry->stat.weight;
+       u64 ins_lat = entry->stat.ins_lat;
        bool leftmost = true;
 
        p = &hists->entries_in->rb_root.rb_node;
@@ -610,11 +615,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 
                if (!cmp) {
                        if (sample_self) {
-                               he_stat__add_period(&he->stat, period, weight);
+                               he_stat__add_period(&he->stat, period, weight, ins_lat);
                                hist_entry__add_callchain_period(he, period);
                        }
                        if (symbol_conf.cumulate_callchain)
-                               he_stat__add_period(he->stat_acc, period, weight);
+                               he_stat__add_period(he->stat_acc, period, weight, ins_lat);
 
                        /*
                         * This mem info was allocated from sample__resolve_mem
@@ -725,6 +730,7 @@ __hists__add_entry(struct hists *hists,
                        .nr_events = 1,
                        .period = sample->period,
                        .weight = sample->weight,
+                       .ins_lat = sample->ins_lat,
                },
                .parent = sym_parent,
                .filtered = symbol__parent_filter(sym_parent) | al->filtered,
index 3788391c50c795afde9be8231aab7bb2cd4b0398..3c537232294bdea41b9626bff6d25787f4e42a15 100644 (file)
@@ -73,6 +73,8 @@ enum hist_column {
        HISTC_DSO_SIZE,
        HISTC_SYMBOL_IPC,
        HISTC_MEM_BLOCKED,
+       HISTC_LOCAL_INS_LAT,
+       HISTC_GLOBAL_INS_LAT,
        HISTC_NR_COLS, /* Last entry */
 };
 
index a929f6dbdf433eecfc5b0e0bf20a5c717d6192ce..c9477d0216879bb478b377f18f0d37bcd3880d1a 100644 (file)
@@ -1871,9 +1871,10 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq)
                         * cycles. Use latency >> 32 to distinguish the
                         * different format of the mem access latency field.
                         */
-                       if (weight > 0)
+                       if (weight > 0) {
                                sample.weight = weight & 0xffff;
-                       else
+                               sample.ins_lat = items->mem_access_latency & 0xffff;
+                       } else
                                sample.weight = items->mem_access_latency;
                }
                if (!sample.weight && items->has_tsx_aux_info) {
index 053c08c8c850d54f1cd14f24fde6f85e49b4b038..f4aeb1af05d89acdbc761dfa9762074d0d6f7927 100644 (file)
@@ -1300,8 +1300,12 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
        if (sample_type & PERF_SAMPLE_STACK_USER)
                stack_user__printf(&sample->user_stack);
 
-       if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
-               printf("... weight: %" PRIu64 "\n", sample->weight);
+       if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+               printf("... weight: %" PRIu64 "", sample->weight);
+                       if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+                               printf(",0x%"PRIx16"", sample->ins_lat);
+               printf("\n");
+       }
 
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                printf(" . data_src: 0x%"PRIx64"\n", sample->data_src);
index e29a24b41b67b4dea0fd210a3f8546798137ee76..0d5ad42812b9d784cb6b67a17f632df9e76c9281 100644 (file)
@@ -36,7 +36,7 @@ const char    default_parent_pattern[] = "^sys_|^do_page_fault";
 const char     *parent_pattern = default_parent_pattern;
 const char     *default_sort_order = "comm,dso,symbol";
 const char     default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char     default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked";
+const char     default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
 const char     default_top_sort_order[] = "dso,symbol";
 const char     default_diff_sort_order[] = "dso,symbol";
 const char     default_tracepoint_sort_order[] = "trace";
@@ -1365,6 +1365,49 @@ struct sort_entry sort_global_weight = {
        .se_width_idx   = HISTC_GLOBAL_WEIGHT,
 };
 
+/*
+ * Average instruction latency per sample for @he: the accumulated
+ * stat.ins_lat divided by stat.nr_events, or 0 when no events were counted.
+ */
+static u64 he_ins_lat(struct hist_entry *he)
+{
+       if (!he->stat.nr_events)
+               return 0;
+
+       return he->stat.ins_lat / he->stat.nr_events;
+}
+
+/*
+ * Order two entries by their per-sample average instruction latency.
+ * The unsigned difference of the two averages is returned as int64_t.
+ */
+static int64_t
+sort__local_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       u64 lhs = he_ins_lat(left);
+       u64 rhs = he_ins_lat(right);
+
+       return lhs - rhs;
+}
+
+/*
+ * Format the per-sample average instruction latency of @he into @bf
+ * (at most @size bytes), left-justified in a field of @width characters.
+ * Returns the value returned by repsep_snprintf().
+ *
+ * he_ins_lat() yields a u64, so it is printed with a 64-bit conversion;
+ * "%u" would fetch only an unsigned int from the variable arguments.
+ */
+static int hist_entry__local_ins_lat_snprintf(struct hist_entry *he, char *bf,
+                                             size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*llu", width,
+                              (unsigned long long)he_ins_lat(he));
+}
+
+struct sort_entry sort_local_ins_lat = { /* registered as the "local_ins_lat" sort key */
+       .se_header      = "Local INSTR Latency",
+       .se_cmp         = sort__local_ins_lat_cmp, /* compares per-sample averages */
+       .se_snprintf    = hist_entry__local_ins_lat_snprintf,
+       .se_width_idx   = HISTC_LOCAL_INS_LAT,
+};
+
+/*
+ * Order two entries by their accumulated instruction latency
+ * (stat.ins_lat). The unsigned difference of the two totals is
+ * returned as int64_t.
+ */
+static int64_t
+sort__global_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       u64 lhs = left->stat.ins_lat;
+       u64 rhs = right->stat.ins_lat;
+
+       return lhs - rhs;
+}
+
+/*
+ * Format the accumulated instruction latency of @he (stat.ins_lat) into
+ * @bf (at most @size bytes), left-justified in a field of @width
+ * characters. Returns the value returned by repsep_snprintf().
+ *
+ * stat.ins_lat is a u64 running sum, so it is printed with a 64-bit
+ * conversion; "%u" would fetch only an unsigned int from the variable
+ * arguments and could truncate large totals.
+ */
+static int hist_entry__global_ins_lat_snprintf(struct hist_entry *he, char *bf,
+                                              size_t size, unsigned int width)
+{
+       return repsep_snprintf(bf, size, "%-*llu", width,
+                              (unsigned long long)he->stat.ins_lat);
+}
+
+struct sort_entry sort_global_ins_lat = { /* registered as the "ins_lat" sort key */
+       .se_header      = "INSTR Latency",
+       .se_cmp         = sort__global_ins_lat_cmp, /* compares accumulated stat.ins_lat */
+       .se_snprintf    = hist_entry__global_ins_lat_snprintf,
+       .se_width_idx   = HISTC_GLOBAL_INS_LAT,
+};
+
 struct sort_entry sort_mem_daddr_sym = {
        .se_header      = "Data Symbol",
        .se_cmp         = sort__daddr_cmp,
@@ -1796,6 +1839,8 @@ static struct sort_dimension common_sort_dimensions[] = {
        DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null),
        DIM(SORT_TIME, "time", sort_time),
        DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
+       DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
+       DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
 };
 
 #undef DIM
index 984e54533ae114cc569ae4140a83457155905a9e..63f67a3f36308239420ed4dec870592caccf7c1b 100644 (file)
@@ -50,6 +50,7 @@ struct he_stat {
        u64                     period_guest_sys;
        u64                     period_guest_us;
        u64                     weight;
+       u64                     ins_lat;
        u32                     nr_events;
 };
 
@@ -231,6 +232,8 @@ enum sort_type {
        SORT_SYM_IPC_NULL,
        SORT_TIME,
        SORT_CODE_PAGE_SIZE,
+       SORT_LOCAL_INS_LAT,
+       SORT_GLOBAL_INS_LAT,
 
        /* branch stack specific sort keys */
        __SORT_BRANCH_STACK,
index 4e9266f751754af0141a24d64ec90ecfdcab5e6e..c6f9db3faf83baff404718667e6eb64b1eea8673 100644 (file)
@@ -1644,8 +1644,10 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
 
        if (type & PERF_SAMPLE_WEIGHT_TYPE) {
                *array = sample->weight;
-               if (type & PERF_SAMPLE_WEIGHT_STRUCT)
+               if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
                        *array &= 0xffffffff;
+                       *array |= ((u64)sample->ins_lat << 32);
+               }
                array++;
        }