struct hist_entry *he;
        struct addr_location al;
        struct mem_info *mi, *mi_dup;
+       struct callchain_cursor *cursor;
        int ret;
 
        addr_location__init(&al);
        if (c2c.stitch_lbr)
                thread__set_lbr_stitch_enable(al.thread, true);
 
-       ret = sample__resolve_callchain(sample, &callchain_cursor, NULL,
+       cursor = get_tls_callchain_cursor();
+       ret = sample__resolve_callchain(sample, cursor, NULL,
                                        evsel, &al, sysctl_perf_event_max_stack);
        if (ret)
                goto out;
 
        struct addr_location al;
        struct machine *machine = &kmem_session->machines.host;
        struct callchain_cursor_node *node;
+       struct callchain_cursor *cursor;
        u64 result = sample->ip;
 
        addr_location__init(&al);
        }
 
        al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
-       sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16);
 
-       callchain_cursor_commit(&callchain_cursor);
+       cursor = get_tls_callchain_cursor();
+       if (cursor == NULL)
+               goto out;
+
+       sample__resolve_callchain(sample, cursor, NULL, evsel, &al, 16);
+
+       callchain_cursor_commit(cursor);
        while (true) {
                struct alloc_func key, *caller;
                u64 addr;
 
-               node = callchain_cursor_current(&callchain_cursor);
+               node = callchain_cursor_current(cursor);
                if (node == NULL)
                        break;
 
                } else
                        pr_debug3("skipping alloc function: %s\n", caller->name);
 
-               callchain_cursor_advance(&callchain_cursor);
+               callchain_cursor_advance(cursor);
        }
 
        pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
 
        struct symbol *sym;
        struct thread *thread;
        struct callchain_cursor_node *node;
-       struct callchain_cursor *cursor = &callchain_cursor;
+       struct callchain_cursor *cursor;
 
        if (!kwork->show_callchain || sample->callchain == NULL)
                return;
                return;
        }
 
+       cursor = get_tls_callchain_cursor();
+
        if (thread__resolve_callchain(thread, cursor, evsel, sample,
                                      NULL, NULL, kwork->max_stack + 2) != 0) {
                pr_debug("Failed to resolve callchain, skipping\n");
         * callchain
         */
        if (kwork->show_callchain) {
+               struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+               if (cursor == NULL)
+                       return;
+
                printf(" ");
+
                sample__fprintf_sym(sample, al, 0,
                                    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
                                    EVSEL__PRINT_CALLCHAIN_ARROW |
                                    EVSEL__PRINT_SKIP_IGNORED,
-                                   &callchain_cursor, symbol_conf.bt_stop_list,
+                                   cursor, symbol_conf.bt_stop_list,
                                    stdout);
        }
 
 
                                  char *buf, int size)
 {
        struct thread *thread;
-       struct callchain_cursor *cursor = &callchain_cursor;
+       struct callchain_cursor *cursor;
        struct machine *machine = &session->machines.host;
        struct symbol *sym;
        int skip = 0;
        if (thread == NULL)
                return -1;
 
+       cursor = get_tls_callchain_cursor();
+
        /* use caller function name from the callchain */
        ret = thread__resolve_callchain(thread, cursor, evsel, sample,
                                        NULL, NULL, max_stack_depth);
 
 static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample)
 {
-       struct callchain_cursor *cursor = &callchain_cursor;
+       struct callchain_cursor *cursor;
        struct machine *machine = &session->machines.host;
        struct thread *thread;
        u64 hash = 0;
        if (thread == NULL)
                return -1;
 
+       cursor = get_tls_callchain_cursor();
        /* use caller function name from the callchain */
        ret = thread__resolve_callchain(thread, cursor, evsel, sample,
                                        NULL, NULL, max_stack_depth);
 
                            EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
                            EVSEL__PRINT_CALLCHAIN_ARROW |
                            EVSEL__PRINT_SKIP_IGNORED,
-                           &callchain_cursor, symbol_conf.bt_stop_list,  stdout);
+                           get_tls_callchain_cursor(), symbol_conf.bt_stop_list,  stdout);
 
 out:
        printf("\n");
                                struct evsel *evsel,
                                struct machine *machine)
 {
-       struct callchain_cursor *cursor = &callchain_cursor;
+       struct callchain_cursor *cursor;
        struct thread *thread;
 
        /* want main thread for process - has maps */
        if (!sched->show_callchain || sample->callchain == NULL)
                return;
 
+       cursor = get_tls_callchain_cursor();
+
        if (thread__resolve_callchain(thread, cursor, evsel, sample,
                                      NULL, NULL, sched->max_stack + 2) != 0) {
                if (verbose > 0)
                                struct idle_thread_runtime *itr,
                                struct perf_sample *sample)
 {
+       struct callchain_cursor *cursor;
+
        if (!sched->show_callchain || sample->callchain == NULL)
                return;
 
-       callchain_cursor__copy(&itr->cursor, &callchain_cursor);
+       cursor = get_tls_callchain_cursor();
+       if (cursor == NULL)
+               return;
+
+       callchain_cursor__copy(&itr->cursor, cursor);
 }
 
 static struct thread *timehist_get_thread(struct perf_sched *sched,
 
                unsigned int print_opts = output[type].print_ip_opts;
                struct callchain_cursor *cursor = NULL;
 
-               if (symbol_conf.use_callchain && sample->callchain &&
-                   thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
-                                             sample, NULL, NULL, scripting_max_stack) == 0)
-                       cursor = &callchain_cursor;
-
+               if (symbol_conf.use_callchain && sample->callchain) {
+                       cursor = get_tls_callchain_cursor();
+                       if (thread__resolve_callchain(al->thread, cursor, evsel,
+                                                     sample, NULL, NULL,
+                                                     scripting_max_stack))
+                               cursor = NULL;
+               }
                if (cursor == NULL) {
                        printed += fprintf(fp, " ");
                        if (print_opts & EVSEL__PRINT_SRCLINE) {
                if (script->stitch_lbr)
                        thread__set_lbr_stitch_enable(al->thread, true);
 
-               if (symbol_conf.use_callchain && sample->callchain &&
-                   thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
-                                             sample, NULL, NULL, scripting_max_stack) == 0)
-                       cursor = &callchain_cursor;
-
+               if (symbol_conf.use_callchain && sample->callchain) {
+                       cursor = get_tls_callchain_cursor();
+                       if (thread__resolve_callchain(al->thread, cursor, evsel,
+                                                     sample, NULL, NULL,
+                                                     scripting_max_stack))
+                               cursor = NULL;
+               }
                fputc(cursor ? '\n' : ' ', fp);
                sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor,
                                    symbol_conf.bt_stop_list, fp);
 
                                        EVSEL__PRINT_DSO |
                                        EVSEL__PRINT_UNKNOWN_AS_ADDR;
 
-       return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output);
+       return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
 }
 
 static const char *errno_to_name(struct evsel *evsel, int err)
                goto out;
 
        if (sample->callchain) {
-               callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+               struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+               callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
                if (callchain_ret == 0) {
-                       if (callchain_cursor.nr < trace->min_stack)
+                       if (cursor->nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 
        if (sample->callchain) {
-               callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+               struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+               callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
                if (callchain_ret == 0) {
-                       if (callchain_cursor.nr < trace->min_stack)
+                       if (cursor->nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 
        if (sample->callchain) {
-               callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+               struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+               callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
                if (callchain_ret == 0) {
-                       if (callchain_cursor.nr < trace->min_stack)
+                       if (cursor->nr < trace->min_stack)
                                goto out_put;
                        callchain_ret = 1;
                }
 
        CALLCHAIN_PARAM_DEFAULT
 };
 
-__thread struct callchain_cursor callchain_cursor;
+/* pthread key under which each thread's struct callchain_cursor is stored. */
+static pthread_key_t callchain_cursor;
 
 int parse_callchain_record_opt(const char *arg, struct callchain_param *param)
 {
                     struct callchain_cursor *cursor,
                     u64 period)
 {
+       if (cursor == NULL)
+               return -1;
+
        if (!cursor->nr)
                return 0;
 
        if ((!symbol_conf.use_callchain || sample->callchain == NULL) &&
                !symbol_conf.show_branchflag_count)
                return 0;
-       return callchain_append(he->callchain, &callchain_cursor, sample->period);
+       return callchain_append(he->callchain, get_tls_callchain_cursor(), sample->period);
 }
 
 int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node,
        return -ENOMEM;
 }
 
+/*
+ * Destructor registered for the callchain_cursor key: reset the cursor,
+ * free every node on its list and finally the cursor itself.
+ */
+static void callchain_cursor__delete(void *vcursor)
+{
+       struct callchain_cursor *cursor = vcursor;
+       struct callchain_cursor_node *node;
+
+       callchain_cursor_reset(cursor);
+
+       node = cursor->first;
+       while (node != NULL) {
+               struct callchain_cursor_node *next = node->next;
+
+               free(node);
+               node = next;
+       }
+       free(cursor);
+}
+
+/*
+ * pthread_once() initializer: create the key that holds each thread's
+ * callchain cursor, registering callchain_cursor__delete() as the destructor
+ * for the stored value. Aborts if the key cannot be created.
+ */
+static void init_callchain_cursor_key(void)
+{
+       if (pthread_key_create(&callchain_cursor, callchain_cursor__delete)) {
+               pr_err("callchain cursor creation failed\n");
+               abort();
+       }
+}
+
+/*
+ * Return the calling thread's callchain cursor, allocating it on first use.
+ *
+ * The cursor is stored under the callchain_cursor pthread key, which is
+ * created once with callchain_cursor__delete() as its destructor.
+ *
+ * Returns NULL if the cursor cannot be allocated or cannot be attached to
+ * the key; callers must be prepared for a NULL cursor.
+ */
+struct callchain_cursor *get_tls_callchain_cursor(void)
+{
+       static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+       struct callchain_cursor *cursor;
+
+       pthread_once(&once_control, init_callchain_cursor_key);
+       cursor = pthread_getspecific(callchain_cursor);
+       if (cursor)
+               return cursor;
+
+       cursor = zalloc(sizeof(*cursor));
+       if (!cursor) {
+               pr_debug3("%s: not enough memory\n", __func__);
+               return NULL;
+       }
+
+       /*
+        * A cursor that is not attached to the key would be re-allocated (and
+        * leaked) on every call, and callers fetching the cursor several times
+        * would not see the same one. Treat the failure like an allocation
+        * failure instead.
+        */
+       if (pthread_setspecific(callchain_cursor, cursor)) {
+               pr_debug3("%s: failed to set thread-specific cursor\n", __func__);
+               free(cursor);
+               return NULL;
+       }
+       return cursor;
+}
+
 int callchain_cursor__copy(struct callchain_cursor *dst,
                           struct callchain_cursor *src)
 {
 
        struct callchain_cursor_node    *curr;
 };
 
-extern __thread struct callchain_cursor callchain_cursor;
-
 static inline void callchain_init(struct callchain_root *root)
 {
        INIT_LIST_HEAD(&root->node.val);
 /* Close a cursor writing session. Initialize for the reader */
 static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
 {
+       /* Nothing to commit when no cursor is available. */
+       if (cursor == NULL)
+               return;
        cursor->curr = cursor->first;
        cursor->pos = 0;
 }
 static inline struct callchain_cursor_node *
 callchain_cursor_current(struct callchain_cursor *cursor)
 {
-       if (cursor->pos == cursor->nr)
+       if (cursor == NULL || cursor->pos == cursor->nr)
                return NULL;
 
        return cursor->curr;
        cursor->pos++;
 }
 
+struct callchain_cursor *get_tls_callchain_cursor(void);
+
 int callchain_cursor__copy(struct callchain_cursor *dst,
                           struct callchain_cursor *src);
 
 
        u64 kernel_start = machine__kernel_start(machine);
        struct call_path *current = &dbe->cpr->call_path;
        enum chain_order saved_order = callchain_param.order;
+       struct callchain_cursor *cursor;
        int err;
 
        if (!symbol_conf.use_callchain || !sample->callchain)
         * the callchain starting with the root node and ending with the leaf.
         */
        callchain_param.order = ORDER_CALLER;
-       err = thread__resolve_callchain(thread, &callchain_cursor, evsel,
+       cursor = get_tls_callchain_cursor();
+       err = thread__resolve_callchain(thread, cursor, evsel,
                                        sample, NULL, NULL, PERF_MAX_STACK_DEPTH);
        if (err) {
                callchain_param.order = saved_order;
                return NULL;
        }
-       callchain_cursor_commit(&callchain_cursor);
+       callchain_cursor_commit(cursor);
 
        while (1) {
                struct callchain_cursor_node *node;
                u64 dso_db_id = 0, sym_db_id = 0, offset = 0;
 
 
-               node = callchain_cursor_current(&callchain_cursor);
+               node = callchain_cursor_current(cursor);
                if (!node)
                        break;
 
                                             al.sym, node->ip,
                                             kernel_start);
 
-               callchain_cursor_advance(&callchain_cursor);
+               callchain_cursor_advance(cursor);
                addr_location__exit(&al);
        }
 
 
        char s = print_oneline ? ' ' : '\t';
        bool first = true;
 
+       if (cursor == NULL)
+               return fprintf(fp, "<not enough memory for the callchain cursor>%s", print_oneline ? "" : "\n");
+
        if (sample->callchain) {
                callchain_cursor_commit(cursor);
 
 
                              struct addr_location *al __maybe_unused)
 {
        struct hist_entry **he_cache;
+       struct callchain_cursor *cursor = get_tls_callchain_cursor();
 
-       callchain_cursor_commit(&callchain_cursor);
+       if (cursor == NULL)
+               return -ENOMEM;
+
+       callchain_cursor_commit(cursor);
 
        /*
         * This is for detecting cycles or recursions so that they're
         * cumulated only one time to prevent entries more than 100%
         * overhead.
         */
-       he_cache = malloc(sizeof(*he_cache) * (callchain_cursor.nr + 1));
+       he_cache = malloc(sizeof(*he_cache) * (cursor->nr + 1));
        if (he_cache == NULL)
                return -ENOMEM;
 
         * We need to re-initialize the cursor since callchain_append()
         * advanced the cursor to the end.
         */
-       callchain_cursor_commit(&callchain_cursor);
+       callchain_cursor_commit(get_tls_callchain_cursor());
 
        hists__inc_nr_samples(hists, he->filtered);
 
 {
        struct callchain_cursor_node *node;
 
-       node = callchain_cursor_current(&callchain_cursor);
+       node = callchain_cursor_current(get_tls_callchain_cursor());
        if (node == NULL)
                return 0;
 
                .raw_size = sample->raw_size,
        };
        int i;
-       struct callchain_cursor cursor;
+       struct callchain_cursor cursor, *tls_cursor = get_tls_callchain_cursor();
        bool fast = hists__has(he_tmp.hists, sym);
 
-       callchain_cursor_snapshot(&cursor, &callchain_cursor);
+       if (tls_cursor == NULL)
+               return -ENOMEM;
+
+       callchain_cursor_snapshot(&cursor, tls_cursor);
 
-       callchain_cursor_advance(&callchain_cursor);
+       callchain_cursor_advance(tls_cursor);
 
        /*
         * Check if there's duplicate entries in the callchain.
        if (al)
                alm = map__get(al->map);
 
-       err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent,
+       err = sample__resolve_callchain(iter->sample, get_tls_callchain_cursor(), &iter->parent,
                                        iter->evsel, al, max_stack_depth);
        if (err) {
                map__put(alm);
 
                if (hist_entry__has_callchains(new_he) &&
                    symbol_conf.use_callchain) {
-                       callchain_cursor_reset(&callchain_cursor);
-                       if (callchain_merge(&callchain_cursor,
+                       struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+                       if (cursor == NULL)
+                               return -1;
+
+                       callchain_cursor_reset(cursor);
+                       if (callchain_merge(cursor,
                                            new_he->callchain,
                                            he->callchain) < 0)
                                ret = -1;
                                he_stat__add_stat(iter->stat_acc, he->stat_acc);
 
                        if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) {
-                               callchain_cursor_reset(&callchain_cursor);
-                               if (callchain_merge(&callchain_cursor,
-                                                   iter->callchain,
-                                                   he->callchain) < 0)
-                                       ret = -1;
+                               struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+                               if (cursor != NULL) {
+                                       callchain_cursor_reset(cursor);
+                                       if (callchain_merge(cursor, iter->callchain, he->callchain) < 0)
+                                               ret = -1;
+                               } else {
+                                       ret = 0;
+                               }
                        }
                        hist_entry__delete(he);
                        return ret;
 
 {
        int ret = 0;
 
+       if (cursor == NULL)
+               return -ENOMEM;
+
        callchain_cursor_reset(cursor);
 
        if (callchain_param.order == ORDER_CALLEE) {
 
                                  struct evsel *evsel,
                                  struct addr_location *al)
 {
+       struct callchain_cursor *cursor;
        AV *list;
 
        list = newAV();
        if (!symbol_conf.use_callchain || !sample->callchain)
                goto exit;
 
-       if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+       cursor = get_tls_callchain_cursor();
+
+       if (thread__resolve_callchain(al->thread, cursor, evsel,
                                      sample, NULL, NULL, scripting_max_stack) != 0) {
                pr_err("Failed to resolve callchain. Skipping\n");
                goto exit;
        }
-       callchain_cursor_commit(&callchain_cursor);
+       callchain_cursor_commit(cursor);
 
 
        while (1) {
                HV *elem;
                struct callchain_cursor_node *node;
-               node = callchain_cursor_current(&callchain_cursor);
+               node = callchain_cursor_current(cursor);
                if (!node)
                        break;
 
                        }
                }
 
-               callchain_cursor_advance(&callchain_cursor);
+               callchain_cursor_advance(cursor);
                av_push(list, newRV_noinc((SV*)elem));
        }
 
 
                                         struct addr_location *al)
 {
        PyObject *pylist;
+       struct callchain_cursor *cursor;
 
        pylist = PyList_New(0);
        if (!pylist)
        if (!symbol_conf.use_callchain || !sample->callchain)
                goto exit;
 
-       if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+       cursor = get_tls_callchain_cursor();
+       if (thread__resolve_callchain(al->thread, cursor, evsel,
                                      sample, NULL, NULL,
                                      scripting_max_stack) != 0) {
                pr_err("Failed to resolve callchain. Skipping\n");
                goto exit;
        }
-       callchain_cursor_commit(&callchain_cursor);
+       callchain_cursor_commit(cursor);
 
 
        while (1) {
                PyObject *pyelem;
                struct callchain_cursor_node *node;
-               node = callchain_cursor_current(&callchain_cursor);
+               node = callchain_cursor_current(cursor);
                if (!node)
                        break;
 
                                        _PyUnicode_FromString(dsoname));
                }
 
-               callchain_cursor_advance(&callchain_cursor);
+               callchain_cursor_advance(cursor);
                PyList_Append(pylist, pyelem);
                Py_DECREF(pyelem);
        }