This paves the way for enabling scalable parallel generation of TCG code.
Instead of tracking TBs with a single binary search tree (BST), use a
BST for each TCG region, protecting it with a lock. This is as scalable
as it gets, since each TCG thread operates on a separate region.
The core of this change is the introduction of struct tcg_region_tree,
which contains a pointer to a GTree and an associated lock to serialize
accesses to it. We then allocate an array of tcg_region_tree's, adding
the appropriate padding to avoid false sharing based on
qemu_dcache_linesize.
Given a tc_ptr, we first find the corresponding region_tree. This
is done by special-casing the first and last regions first, since they
might be of size != region.size; otherwise we just divide the offset
by region.stride. I was worried about this division (several dozen
cycles of latency), but profiling shows that this is not a fast path.
Note that region.stride is not required to be a power of two; it
is only required to be a multiple of the host's page size.
Note that with this design we can also provide consistent snapshots
about all region trees at once; for instance, tcg_tb_foreach
acquires/releases all region_tree locks before/after iterating over them.
For this reason we now drop tb_lock in dump_exec_info().
As an alternative I considered implementing a concurrent BST, but this
can be tricky to get right, offers no consistent snapshots of the BST,
and performance and scalability-wise I don't think it could ever beat
having separate GTrees, given that our workload is insert-mostly (all
concurrent BST designs I've seen focus, understandably, on making
lookups fast, which comes at the expense of convoluted, non-wait-free
insertions/removals).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
tb_lock();
tb_phys_invalidate(tb, -1);
- tb_remove(tb);
+ tcg_tb_remove(tb);
tb_unlock();
}
#endif
}
}
-static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
-
void cpu_gen_init(void)
{
tcg_context_init(&tcg_init_ctx);
if (check_offset < tcg_init_ctx.code_gen_buffer_size) {
tb_lock();
- tb = tb_find_pc(host_pc);
+ tb = tcg_tb_lookup(host_pc);
if (tb) {
cpu_restore_state_from_tb(cpu, tb, host_pc, will_exit);
if (tb->cflags & CF_NOCACHE) {
/* one-shot translation, invalidate it immediately */
tb_phys_invalidate(tb, -1);
- tb_remove(tb);
+ tcg_tb_remove(tb);
}
r = true;
}
}
#endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
-/* compare a pointer @ptr and a tb_tc @s */
-static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
-{
- if (ptr >= s->ptr + s->size) {
- return 1;
- } else if (ptr < s->ptr) {
- return -1;
- }
- return 0;
-}
-
-static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
-{
- const struct tb_tc *a = ap;
- const struct tb_tc *b = bp;
-
- /*
- * When both sizes are set, we know this isn't a lookup.
- * This is the most likely case: every TB must be inserted; lookups
- * are a lot less frequent.
- */
- if (likely(a->size && b->size)) {
- if (a->ptr > b->ptr) {
- return 1;
- } else if (a->ptr < b->ptr) {
- return -1;
- }
- /* a->ptr == b->ptr should happen only on deletions */
- g_assert(a->size == b->size);
- return 0;
- }
- /*
- * All lookups have either .size field set to 0.
- * From the glib sources we see that @ap is always the lookup key. However
- * the docs provide no guarantee, so we just mark this case as likely.
- */
- if (likely(a->size == 0)) {
- return ptr_cmp_tb_tc(a->ptr, b);
- }
- return ptr_cmp_tb_tc(b->ptr, a);
-}
-
static inline void code_gen_alloc(size_t tb_size)
{
tcg_ctx->code_gen_buffer_size = size_code_gen_buffer(tb_size);
fprintf(stderr, "Could not allocate dynamic translator buffer\n");
exit(1);
}
- tb_ctx.tb_tree = g_tree_new(tb_tc_cmp);
qemu_mutex_init(&tb_ctx.tb_lock);
}
return tb;
}
-/* Called with tb_lock held. */
-void tb_remove(TranslationBlock *tb)
-{
- assert_tb_locked();
-
- g_tree_remove(tb_ctx.tb_tree, &tb->tc);
-}
-
static inline void invalidate_page_bitmap(PageDesc *p)
{
#ifdef CONFIG_SOFTMMU
}
if (DEBUG_TB_FLUSH_GATE) {
- size_t nb_tbs = g_tree_nnodes(tb_ctx.tb_tree);
+ size_t nb_tbs = tcg_nb_tbs();
size_t host_size = 0;
- g_tree_foreach(tb_ctx.tb_tree, tb_host_size_iter, &host_size);
+ tcg_tb_foreach(tb_host_size_iter, &host_size);
printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n",
tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0);
}
cpu_tb_jmp_cache_clear(cpu);
}
- /* Increment the refcount first so that destroy acts as a reset */
- g_tree_ref(tb_ctx.tb_tree);
- g_tree_destroy(tb_ctx.tb_tree);
-
qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb();
* through the physical hash table and physical page list.
*/
tb_link_page(tb, phys_pc, phys_page2);
- g_tree_insert(tb_ctx.tb_tree, &tb->tc, tb);
+ tcg_tb_insert(tb);
return tb;
}
current_tb = NULL;
if (cpu->mem_io_pc) {
/* now we have a real cpu fault */
- current_tb = tb_find_pc(cpu->mem_io_pc);
+ current_tb = tcg_tb_lookup(cpu->mem_io_pc);
}
}
if (current_tb == tb &&
tb = p->first_tb;
#ifdef TARGET_HAS_PRECISE_SMC
if (tb && pc != 0) {
- current_tb = tb_find_pc(pc);
+ current_tb = tcg_tb_lookup(pc);
}
if (cpu != NULL) {
env = cpu->env_ptr;
}
#endif
-/*
- * Find the TB 'tb' such that
- * tb->tc.ptr <= tc_ptr < tb->tc.ptr + tb->tc.size
- * Return NULL if not found.
- */
-static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
-{
- struct tb_tc s = { .ptr = (void *)tc_ptr };
-
- return g_tree_lookup(tb_ctx.tb_tree, &s);
-}
-
#if !defined(CONFIG_USER_ONLY)
void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
{
{
TranslationBlock *tb;
- tb = tb_find_pc(cpu->mem_io_pc);
+ tb = tcg_tb_lookup(cpu->mem_io_pc);
if (tb) {
/* We can use retranslation to find the PC. */
cpu_restore_state_from_tb(cpu, tb, cpu->mem_io_pc, true);
uint32_t n;
tb_lock();
- tb = tb_find_pc(retaddr);
+ tb = tcg_tb_lookup(retaddr);
if (!tb) {
cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
(void *)retaddr);
* cpu_exec_nocache() */
tb_phys_invalidate(tb->orig_tb, -1);
}
- tb_remove(tb);
+ tcg_tb_remove(tb);
}
/* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
}
struct tb_tree_stats {
+ size_t nb_tbs;
size_t host_size;
size_t target_size;
size_t max_target_size;
const TranslationBlock *tb = value;
struct tb_tree_stats *tst = data;
+ tst->nb_tbs++;
tst->host_size += tb->tc.size;
tst->target_size += tb->size;
if (tb->size > tst->max_target_size) {
struct qht_stats hst;
size_t nb_tbs;
- tb_lock();
-
- nb_tbs = g_tree_nnodes(tb_ctx.tb_tree);
- g_tree_foreach(tb_ctx.tb_tree, tb_tree_stats_iter, &tst);
+ tcg_tb_foreach(tb_tree_stats_iter, &tst);
+ nb_tbs = tst.nb_tbs;
/* XXX: avoid using doubles ? */
cpu_fprintf(f, "Translation buffer state:\n");
/*
cpu_fprintf(f, "TB invalidate count %d\n", tb_ctx.tb_phys_invalidate_count);
cpu_fprintf(f, "TLB flush count %zu\n", tlb_flush_count());
tcg_dump_info(f, cpu_fprintf);
-
- tb_unlock();
}
void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)
* set the page to PAGE_WRITE and did the TB invalidate for us.
*/
#ifdef TARGET_HAS_PRECISE_SMC
- TranslationBlock *current_tb = tb_find_pc(pc);
+ TranslationBlock *current_tb = tcg_tb_lookup(pc);
if (current_tb) {
current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
}
| (use_icount ? CF_USE_ICOUNT : 0);
}
-void tb_remove(TranslationBlock *tb);
void tb_flush(CPUState *cpu);
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
struct TBContext {
- GTree *tb_tree;
struct qht htable;
/* any access to the tbs or the page table must use this lock */
QemuMutex tb_lock;
static unsigned int n_tcg_ctxs;
TCGv_env cpu_env = 0;
+struct tcg_region_tree {
+ QemuMutex lock;
+ GTree *tree;
+ /* padding to avoid false sharing is computed at run-time */
+};
+
/*
* We divide code_gen_buffer into equally-sized "regions" that TCG threads
* dynamically allocate from as demand dictates. Given appropriate region
};
static struct tcg_region_state region;
+/*
+ * This is an array of struct tcg_region_tree's, with padding.
+ * We use void * to simplify the computation of region_trees[i]; each
+ * struct is found every tree_size bytes.
+ */
+static void *region_trees;
+static size_t tree_size;
static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
static TCGRegSet tcg_target_call_clobber_regs;
#include "tcg-target.inc.c"
+/* compare a pointer @ptr and a tb_tc @s */
+static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
+{
+ if (ptr >= s->ptr + s->size) {
+ return 1;
+ } else if (ptr < s->ptr) {
+ return -1;
+ }
+ return 0;
+}
+
+static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
+{
+ const struct tb_tc *a = ap;
+ const struct tb_tc *b = bp;
+
+ /*
+ * When both sizes are set, we know this isn't a lookup.
+ * This is the most likely case: every TB must be inserted; lookups
+ * are a lot less frequent.
+ */
+ if (likely(a->size && b->size)) {
+ if (a->ptr > b->ptr) {
+ return 1;
+ } else if (a->ptr < b->ptr) {
+ return -1;
+ }
+ /* a->ptr == b->ptr should happen only on deletions */
+ g_assert(a->size == b->size);
+ return 0;
+ }
+ /*
+ * All lookups have either .size field set to 0.
+ * From the glib sources we see that @ap is always the lookup key. However
+ * the docs provide no guarantee, so we just mark this case as likely.
+ */
+ if (likely(a->size == 0)) {
+ return ptr_cmp_tb_tc(a->ptr, b);
+ }
+ return ptr_cmp_tb_tc(b->ptr, a);
+}
+
+static void tcg_region_trees_init(void)
+{
+ size_t i;
+
+ tree_size = ROUND_UP(sizeof(struct tcg_region_tree), qemu_dcache_linesize);
+ region_trees = qemu_memalign(qemu_dcache_linesize, region.n * tree_size);
+ for (i = 0; i < region.n; i++) {
+ struct tcg_region_tree *rt = region_trees + i * tree_size;
+
+ qemu_mutex_init(&rt->lock);
+ rt->tree = g_tree_new(tb_tc_cmp);
+ }
+}
+
+static struct tcg_region_tree *tc_ptr_to_region_tree(void *p)
+{
+ size_t region_idx;
+
+ if (p < region.start_aligned) {
+ region_idx = 0;
+ } else {
+ ptrdiff_t offset = p - region.start_aligned;
+
+ if (offset > region.stride * (region.n - 1)) {
+ region_idx = region.n - 1;
+ } else {
+ region_idx = offset / region.stride;
+ }
+ }
+ return region_trees + region_idx * tree_size;
+}
+
+void tcg_tb_insert(TranslationBlock *tb)
+{
+ struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
+
+ qemu_mutex_lock(&rt->lock);
+ g_tree_insert(rt->tree, &tb->tc, tb);
+ qemu_mutex_unlock(&rt->lock);
+}
+
+void tcg_tb_remove(TranslationBlock *tb)
+{
+ struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
+
+ qemu_mutex_lock(&rt->lock);
+ g_tree_remove(rt->tree, &tb->tc);
+ qemu_mutex_unlock(&rt->lock);
+}
+
+/*
+ * Find the TB 'tb' such that
+ * tb->tc.ptr <= tc_ptr < tb->tc.ptr + tb->tc.size
+ * Return NULL if not found.
+ */
+TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr)
+{
+ struct tcg_region_tree *rt = tc_ptr_to_region_tree((void *)tc_ptr);
+ TranslationBlock *tb;
+ struct tb_tc s = { .ptr = (void *)tc_ptr };
+
+ qemu_mutex_lock(&rt->lock);
+ tb = g_tree_lookup(rt->tree, &s);
+ qemu_mutex_unlock(&rt->lock);
+ return tb;
+}
+
+static void tcg_region_tree_lock_all(void)
+{
+ size_t i;
+
+ for (i = 0; i < region.n; i++) {
+ struct tcg_region_tree *rt = region_trees + i * tree_size;
+
+ qemu_mutex_lock(&rt->lock);
+ }
+}
+
+static void tcg_region_tree_unlock_all(void)
+{
+ size_t i;
+
+ for (i = 0; i < region.n; i++) {
+ struct tcg_region_tree *rt = region_trees + i * tree_size;
+
+ qemu_mutex_unlock(&rt->lock);
+ }
+}
+
+void tcg_tb_foreach(GTraverseFunc func, gpointer user_data)
+{
+ size_t i;
+
+ tcg_region_tree_lock_all();
+ for (i = 0; i < region.n; i++) {
+ struct tcg_region_tree *rt = region_trees + i * tree_size;
+
+ g_tree_foreach(rt->tree, func, user_data);
+ }
+ tcg_region_tree_unlock_all();
+}
+
+size_t tcg_nb_tbs(void)
+{
+ size_t nb_tbs = 0;
+ size_t i;
+
+ tcg_region_tree_lock_all();
+ for (i = 0; i < region.n; i++) {
+ struct tcg_region_tree *rt = region_trees + i * tree_size;
+
+ nb_tbs += g_tree_nnodes(rt->tree);
+ }
+ tcg_region_tree_unlock_all();
+ return nb_tbs;
+}
+
+static void tcg_region_tree_reset_all(void)
+{
+ size_t i;
+
+ tcg_region_tree_lock_all();
+ for (i = 0; i < region.n; i++) {
+ struct tcg_region_tree *rt = region_trees + i * tree_size;
+
+ /* Increment the refcount first so that destroy acts as a reset */
+ g_tree_ref(rt->tree);
+ g_tree_destroy(rt->tree);
+ }
+ tcg_region_tree_unlock_all();
+}
+
static void tcg_region_bounds(size_t curr_region, void **pstart, void **pend)
{
void *start, *end;
g_assert(!err);
}
qemu_mutex_unlock(®ion.lock);
+
+ tcg_region_tree_reset_all();
}
#ifdef CONFIG_USER_ONLY
g_assert(!rc);
}
+ tcg_region_trees_init();
+
/* In user-mode we support only one ctx, so do the initial allocation now */
#ifdef CONFIG_USER_ONLY
{
size_t tcg_code_size(void);
size_t tcg_code_capacity(void);
+void tcg_tb_insert(TranslationBlock *tb);
+void tcg_tb_remove(TranslationBlock *tb);
+TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr);
+void tcg_tb_foreach(GTraverseFunc func, gpointer user_data);
+size_t tcg_nb_tbs(void);
+
/* user-mode: Called with tb_lock held. */
static inline void *tcg_malloc(int size)
{