bcachefs: Improve btree iterator tracepoints
author Kent Overstreet <kent.overstreet@gmail.com>
Fri, 4 Jun 2021 19:18:10 +0000 (15:18 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:05 +0000 (17:09 -0400)
This patch adds new tracepoints to the btree iterator code (iter_traverse
and iter_set_search_pos), and adds new fields to the existing tracepoints -
primarily the btree ID, the iterator position, and the instruction pointer
of the call site that triggered the event.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
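
Not part of the patch, but for context: each converted tracepoint now takes
the instruction pointer that started the transaction, the call site that
triggered the event, and, where an iterator is involved, its btree ID and
search position. A minimal sketch of the calling convention, using the
hypothetical helper example_restart():

/*
 * Sketch only: example_restart() is a made-up helper.  trans->ip is
 * where the transaction was started, _RET_IP_ is the call site that
 * triggered the restart, and btree_id/real_pos identify what the
 * iterator was pointing at when it happened.
 */
static int example_restart(struct btree_trans *trans,
			   struct btree_iter *iter)
{
	trace_trans_restart_relock(trans->ip, _RET_IP_,
				   iter->btree_id, &iter->real_pos);
	return -EINTR;
}

The events are defined in the bcachefs trace system, so they appear under
events/bcachefs/ in tracefs (typically mounted at /sys/kernel/tracing)
alongside the tracepoints they extend.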
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/trace.h

fs/bcachefs/btree_cache.c
index a13e5eef868eea62e7c8b07e37aba3134ed83e5d..5991ebee228c87d9188e602c3fb56ea1a53b670f 100644
@@ -816,7 +816,10 @@ lock_node:
                        if (bch2_btree_node_relock(iter, level + 1))
                                goto retry;
 
-                       trace_trans_restart_btree_node_reused(iter->trans->ip);
+                       trace_trans_restart_btree_node_reused(iter->trans->ip,
+                                                             trace_ip,
+                                                             iter->btree_id,
+                                                             &iter->real_pos);
                        return ERR_PTR(-EINTR);
                }
        }
fs/bcachefs/btree_iter.c
index 4b590b2096a7a8004b4a6d7b7642deb63c828685..78eae21693470116c9f015ba8e8b21d9a307b23e 100644
@@ -178,8 +178,8 @@ success:
        return true;
 }
 
-static inline bool btree_iter_get_locks(struct btree_iter *iter,
-                                       bool upgrade, bool trace)
+static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade,
+                                       unsigned long trace_ip)
 {
        unsigned l = iter->level;
        int fail_idx = -1;
@@ -191,16 +191,17 @@ static inline bool btree_iter_get_locks(struct btree_iter *iter,
                if (!(upgrade
                      ? bch2_btree_node_upgrade(iter, l)
                      : bch2_btree_node_relock(iter, l))) {
-                       if (trace)
-                               (upgrade
-                                ? trace_node_upgrade_fail
-                                : trace_node_relock_fail)(l, iter->l[l].lock_seq,
-                                               is_btree_node(iter, l)
-                                               ? 0
-                                               : (unsigned long) iter->l[l].b,
-                                               is_btree_node(iter, l)
-                                               ? iter->l[l].b->c.lock.state.seq
-                                               : 0);
+                       (upgrade
+                        ? trace_node_upgrade_fail
+                        : trace_node_relock_fail)(iter->trans->ip, trace_ip,
+                                       iter->btree_id, &iter->real_pos,
+                                       l, iter->l[l].lock_seq,
+                                       is_btree_node(iter, l)
+                                       ? 0
+                                       : (unsigned long) iter->l[l].b,
+                                       is_btree_node(iter, l)
+                                       ? iter->l[l].b->c.lock.state.seq
+                                       : 0);
 
                        fail_idx = l;
                        btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
@@ -379,9 +380,9 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
 #endif
 
 __flatten
-bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
+static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
 {
-       return btree_iter_get_locks(iter, false, trace);
+       return btree_iter_get_locks(iter, false, trace_ip);
 }
 
 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
@@ -393,7 +394,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
 
        iter->locks_want = new_locks_want;
 
-       if (btree_iter_get_locks(iter, true, true))
+       if (btree_iter_get_locks(iter, true, _THIS_IP_))
                return true;
 
        /*
@@ -421,7 +422,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
                    linked->btree_id == iter->btree_id &&
                    linked->locks_want < new_locks_want) {
                        linked->locks_want = new_locks_want;
-                       btree_iter_get_locks(linked, true, false);
+                       btree_iter_get_locks(linked, true, _THIS_IP_);
                }
 
        return false;
@@ -467,8 +468,9 @@ bool bch2_trans_relock(struct btree_trans *trans)
        struct btree_iter *iter;
 
        trans_for_each_iter(trans, iter)
-               if (!bch2_btree_iter_relock(iter, true)) {
-                       trace_trans_restart_relock(trans->ip);
+               if (!bch2_btree_iter_relock(iter, _RET_IP_)) {
+                       trace_trans_restart_relock(trans->ip, _RET_IP_,
+                                       iter->btree_id, &iter->real_pos);
                        return false;
                }
        return true;
@@ -1182,7 +1184,8 @@ err:
 
 static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
 
-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
+static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
+                                    unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *iter;
@@ -1199,7 +1202,7 @@ retry_all:
        relock_fail = false;
 
        trans_for_each_iter(trans, iter) {
-               if (!bch2_btree_iter_relock(iter, true))
+               if (!bch2_btree_iter_relock(iter, _THIS_IP_))
                        relock_fail = true;
                sorted[nr_sorted++] = iter->idx;
        }
@@ -1276,13 +1279,13 @@ out:
 
        trans->in_traverse_all = false;
 
-       trace_trans_traverse_all(trans->ip);
+       trace_trans_traverse_all(trans->ip, trace_ip);
        return ret;
 }
 
 int bch2_btree_iter_traverse_all(struct btree_trans *trans)
 {
-       return __btree_iter_traverse_all(trans, 0);
+       return __btree_iter_traverse_all(trans, 0, _RET_IP_);
 }
 
 static inline bool btree_iter_good_node(struct btree_iter *iter,
@@ -1327,6 +1330,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter,
                                   unsigned long trace_ip)
 {
        unsigned depth_want = iter->level;
+       int ret = 0;
 
        /*
         * if we need interior nodes locked, call btree_iter_relock() to make
@@ -1334,16 +1338,18 @@ static int btree_iter_traverse_one(struct btree_iter *iter,
         */
        if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
            iter->locks_want > 1)
-               bch2_btree_iter_relock(iter, false);
+               bch2_btree_iter_relock(iter, _THIS_IP_);
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED)
-               return bch2_btree_iter_traverse_cached(iter);
+       if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
+               ret = bch2_btree_iter_traverse_cached(iter);
+               goto out;
+       }
 
        if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
-               return 0;
+               goto out;
 
        if (unlikely(iter->level >= BTREE_MAX_DEPTH))
-               return 0;
+               goto out;
 
        iter->level = btree_iter_up_until_good_node(iter, 0);
 
@@ -1354,12 +1360,18 @@ static int btree_iter_traverse_one(struct btree_iter *iter,
         * btree_iter_lock_root() comes next and that it can't fail
         */
        while (iter->level > depth_want) {
-               int ret = btree_iter_node(iter, iter->level)
+               ret = btree_iter_node(iter, iter->level)
                        ? btree_iter_down(iter, trace_ip)
                        : btree_iter_lock_root(iter, depth_want, trace_ip);
                if (unlikely(ret)) {
-                       if (ret == 1)
-                               return 0;
+                       if (ret == 1) {
+                               /*
+                                * Got to the end of the btree (in
+                                * BTREE_ITER_NODES mode)
+                                */
+                               ret = 0;
+                               goto out;
+                       }
 
                        iter->level = depth_want;
 
@@ -1371,14 +1383,16 @@ static int btree_iter_traverse_one(struct btree_iter *iter,
                                iter->l[iter->level].b =
                                        BTREE_ITER_NO_NODE_DOWN;
                        }
-                       return ret;
+                       goto out;
                }
        }
 
        iter->uptodate = BTREE_ITER_NEED_PEEK;
-
+out:
+       trace_iter_traverse(iter->trans->ip, trace_ip,
+                           iter->btree_id, &iter->real_pos, ret);
        bch2_btree_iter_verify(iter);
-       return 0;
+       return ret;
 }
 
 static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
@@ -1389,7 +1403,7 @@ static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
        ret =   bch2_trans_cond_resched(trans) ?:
                btree_iter_traverse_one(iter, _RET_IP_);
        if (unlikely(ret))
-               ret = __btree_iter_traverse_all(trans, ret);
+               ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
 
        return ret;
 }
@@ -1505,6 +1519,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
 static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
 {
+       struct bpos old_pos = iter->real_pos;
        int cmp = bpos_cmp(new_pos, iter->real_pos);
        unsigned l = iter->level;
 
@@ -1515,7 +1530,7 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p
 
        if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
                btree_node_unlock(iter, 0);
-               iter->l[0].b = BTREE_ITER_NO_NODE_UP;
+               iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
                btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
                return;
        }
@@ -1544,6 +1559,11 @@ out:
                btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 
        bch2_btree_iter_verify(iter);
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trace_iter_set_search_pos(iter->trans->ip, _RET_IP_,
+                                 iter->btree_id,
+                                 &old_pos, &new_pos, l);
+#endif
 }
 
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
@@ -2062,13 +2082,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
                best = iter;
        }
 
-       trace_trans_get_iter(_RET_IP_, trans->ip,
-                            btree_id,
-                            &real_pos, locks_want,
-                            best ? &best->real_pos : &pos_min,
-                            best ? best->locks_want : 0,
-                            best ? best->uptodate : BTREE_ITER_NEED_TRAVERSE);
-
        if (!best) {
                iter = btree_trans_iter_alloc(trans);
                bch2_btree_iter_init(trans, iter, btree_id);
@@ -2097,7 +2110,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
        locks_want = min(locks_want, BTREE_MAX_DEPTH);
        if (locks_want > iter->locks_want) {
                iter->locks_want = locks_want;
-               btree_iter_get_locks(iter, true, false);
+               btree_iter_get_locks(iter, true, _THIS_IP_);
        }
 
        while (iter->level != depth) {
@@ -2115,6 +2128,13 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
        bch2_btree_iter_set_pos(iter, pos);
        btree_iter_set_search_pos(iter, real_pos);
 
+       trace_trans_get_iter(_RET_IP_, trans->ip,
+                            btree_id,
+                            &real_pos, locks_want, iter->uptodate,
+                            best ? &best->real_pos     : &pos_min,
+                            best ? best->locks_want    : U8_MAX,
+                            best ? best->uptodate      : U8_MAX);
+
        return iter;
 }
 
fs/bcachefs/btree_iter.h
index 2f63adb9e4205b8ee6e2b8ade7194c7504c45569..01b834bf79f7e6ebc5359cea9f6d357f8d58b6ce 100644
@@ -111,7 +111,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
                              struct btree_node_iter *, struct bkey_packed *,
                              unsigned, unsigned);
 
-bool bch2_btree_iter_relock(struct btree_iter *, bool);
 bool bch2_trans_relock(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
 
fs/bcachefs/btree_types.h
index 493d65882222fc5151995a03c1c7073c5b96b7f8..50595f5f158bb465be30b29de70b483eb0506125 100644
@@ -230,6 +230,7 @@ enum btree_iter_uptodate {
 #define BTREE_ITER_NO_NODE_DOWN                ((struct btree *) 5)
 #define BTREE_ITER_NO_NODE_INIT                ((struct btree *) 6)
 #define BTREE_ITER_NO_NODE_ERROR       ((struct btree *) 7)
+#define BTREE_ITER_NO_NODE_CACHED      ((struct btree *) 8)
 
 /*
  * @pos                        - iterator's current position
fs/bcachefs/btree_update_interior.c
index 569db972f3bb6e673aef3608ffcc29459210c695..bb01b036c7a2e19892150fa9ee40d9b519b1530e 100644
@@ -955,7 +955,9 @@ retry:
         * instead of locking/reserving all the way to the root:
         */
        if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
-               trace_trans_restart_iter_upgrade(trans->ip);
+               trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
+                                                iter->btree_id,
+                                                &iter->real_pos);
                return ERR_PTR(-EINTR);
        }
 
@@ -996,7 +998,7 @@ retry:
                 * closure argument
                 */
                if (flags & BTREE_INSERT_NOUNLOCK) {
-                       trace_trans_restart_journal_preres_get(trans->ip);
+                       trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
                        ret = -EINTR;
                        goto err;
                }
@@ -1012,7 +1014,7 @@ retry:
                                BTREE_UPDATE_JOURNAL_RES,
                                journal_flags);
                if (ret) {
-                       trace_trans_restart_journal_preres_get(trans->ip);
+                       trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
                        goto err;
                }
 
fs/bcachefs/btree_update_leaf.c
index 839262c9501ada4cd23371bc06c4bccadfb54a4a..9eb31d31ed42a9bff03df100973cbe3661eedd47 100644
@@ -228,7 +228,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 }
 
 static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
+                                  unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
        int ret;
@@ -241,7 +242,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
                return ret;
 
        if (!bch2_trans_relock(trans)) {
-               trace_trans_restart_journal_preres_get(trans->ip);
+               trace_trans_restart_journal_preres_get(trans->ip, trace_ip);
                return -EINTR;
        }
 
@@ -368,7 +369,8 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
 
 static inline int
 bch2_trans_commit_write_locked(struct btree_trans *trans,
-                              struct btree_insert_entry **stopped_at)
+                              struct btree_insert_entry **stopped_at,
+                              unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
@@ -378,7 +380,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
        int ret;
 
        if (race_fault()) {
-               trace_trans_restart_fault_inject(trans->ip);
+               trace_trans_restart_fault_inject(trans->ip, trace_ip);
                return -EINTR;
        }
 
@@ -525,7 +527,8 @@ static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
 static inline int do_bch2_trans_commit(struct btree_trans *trans,
-                                      struct btree_insert_entry **stopped_at)
+                                      struct btree_insert_entry **stopped_at,
+                                      unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
@@ -559,7 +562,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                         ? JOURNAL_RES_GET_RESERVED : 0));
        if (unlikely(ret == -EAGAIN))
                ret = bch2_trans_journal_preres_get_cold(trans,
-                                               trans->journal_preres_u64s);
+                                               trans->journal_preres_u64s, trace_ip);
        if (unlikely(ret))
                return ret;
 
@@ -578,7 +581,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                if (iter->nodes_locked != iter->nodes_intent_locked) {
                        if (btree_iter_keep(trans, iter)) {
                                if (!bch2_btree_iter_upgrade(iter, 1)) {
-                                       trace_trans_restart_upgrade(trans->ip);
+                                       trace_trans_restart_upgrade(trans->ip, trace_ip,
+                                                                   iter->btree_id,
+                                                                   &iter->real_pos);
                                        return -EINTR;
                                }
                        } else {
@@ -606,7 +611,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                        bch2_btree_node_lock_for_insert(c,
                                        iter_l(i->iter)->b, i->iter);
 
-       ret = bch2_trans_commit_write_locked(trans, stopped_at);
+       ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
 
        trans_for_each_update2(trans, i)
                if (!same_leaf_as_prev(trans, i))
@@ -644,7 +649,7 @@ static int journal_reclaim_wait_done(struct bch_fs *c)
 static noinline
 int bch2_trans_commit_error(struct btree_trans *trans,
                            struct btree_insert_entry *i,
-                           int ret)
+                           int ret, unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
        unsigned flags = trans->flags;
@@ -685,7 +690,9 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (!ret ||
                    ret == -EINTR ||
                    (flags & BTREE_INSERT_NOUNLOCK)) {
-                       trace_trans_restart_btree_node_split(trans->ip);
+                       trace_trans_restart_btree_node_split(trans->ip, trace_ip,
+                                                            i->iter->btree_id,
+                                                            &i->iter->real_pos);
                        ret = -EINTR;
                }
                break;
@@ -703,7 +710,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (bch2_trans_relock(trans))
                        return 0;
 
-               trace_trans_restart_mark_replicas(trans->ip);
+               trace_trans_restart_mark_replicas(trans->ip, trace_ip);
                ret = -EINTR;
                break;
        case BTREE_INSERT_NEED_JOURNAL_RES:
@@ -720,13 +727,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (bch2_trans_relock(trans))
                        return 0;
 
-               trace_trans_restart_journal_res_get(trans->ip);
+               trace_trans_restart_journal_res_get(trans->ip, trace_ip);
                ret = -EINTR;
                break;
        case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
                bch2_trans_unlock(trans);
 
-               trace_trans_blocked_journal_reclaim(trans->ip);
+               trace_trans_blocked_journal_reclaim(trans->ip, trace_ip);
 
                wait_event_freezable(c->journal.reclaim_wait,
                                     (ret = journal_reclaim_wait_done(c)));
@@ -736,7 +743,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (bch2_trans_relock(trans))
                        return 0;
 
-               trace_trans_restart_journal_reclaim(trans->ip);
+               trace_trans_restart_journal_reclaim(trans->ip, trace_ip);
                ret = -EINTR;
                break;
        default:
@@ -950,7 +957,9 @@ int __bch2_trans_commit(struct btree_trans *trans)
                                                             i->trigger_flags);
                                if (unlikely(ret)) {
                                        if (ret == -EINTR)
-                                               trace_trans_restart_mark(trans->ip);
+                                               trace_trans_restart_mark(trans->ip, _RET_IP_,
+                                                                        i->iter->btree_id,
+                                                                        &i->iter->pos);
                                        goto out;
                                }
                        }
@@ -976,12 +985,16 @@ int __bch2_trans_commit(struct btree_trans *trans)
        trans_for_each_update2(trans, i) {
                ret = bch2_btree_iter_traverse(i->iter);
                if (unlikely(ret)) {
-                       trace_trans_restart_traverse(trans->ip);
+                       trace_trans_restart_traverse(trans->ip, _RET_IP_,
+                                                    i->iter->btree_id,
+                                                    &i->iter->pos);
                        goto out;
                }
 
                if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
-                       trace_trans_restart_upgrade(trans->ip);
+                       trace_trans_restart_upgrade(trans->ip, _RET_IP_,
+                                                   i->iter->btree_id,
+                                                   &i->iter->pos);
                        ret = -EINTR;
                        goto out;
                }
@@ -997,7 +1010,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
 retry:
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
-       ret = do_bch2_trans_commit(trans, &i);
+       ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
 
        /* make sure we didn't drop or screw up locks: */
        bch2_btree_trans_verify_locks(trans);
@@ -1023,7 +1036,7 @@ out_reset:
 
        return ret;
 err:
-       ret = bch2_trans_commit_error(trans, i, ret);
+       ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_);
        if (ret)
                goto out;
 
fs/bcachefs/trace.h
index d447b79bd6eec9046c7c683a3da1b76913a1b56d..ae2aee8ddee82f75566ea80b19f35f87d89528f4 100644
@@ -541,59 +541,66 @@ TRACE_EVENT(copygc_wait,
 );
 
 TRACE_EVENT(trans_get_iter,
-       TP_PROTO(unsigned long caller, unsigned long ip,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
                 enum btree_id btree_id,
-                struct bpos *pos_want,
-                unsigned locks_want,
-                struct bpos *pos_found,
-                unsigned locks_found,
-                unsigned uptodate),
-       TP_ARGS(caller, ip, btree_id,
-               pos_want, locks_want,
-               pos_found, locks_found,
-               uptodate),
+                struct bpos *got_pos,
+                unsigned got_locks,
+                unsigned got_uptodate,
+                struct bpos *src_pos,
+                unsigned src_locks,
+                unsigned src_uptodate),
+       TP_ARGS(trans_ip, caller_ip, btree_id,
+               got_pos, got_locks, got_uptodate,
+               src_pos, src_locks, src_uptodate),
 
        TP_STRUCT__entry(
-               __field(unsigned long,  caller                  )
-               __field(unsigned long,  ip                      )
-               __field(u8,             btree_id                )
-               __field(u8,             uptodate                )
-               __field(u8,             locks_want              )
-               __field(u8,             locks_found             )
-               __field(u64,            pos_want_inode          )
-               __field(u64,            pos_want_offset         )
-               __field(u32,            pos_want_snapshot       )
-               __field(u64,            pos_found_inode         )
-               __field(u64,            pos_found_offset        )
-               __field(u32,            pos_found_snapshot      )
+               __field(unsigned long,          trans_ip                )
+               __field(unsigned long,          caller_ip               )
+               __field(u8,                     btree_id                )
+               __field(u64,                    got_pos_inode           )
+               __field(u64,                    got_pos_offset          )
+               __field(u32,                    got_pos_snapshot        )
+               __field(u8,                     got_locks               )
+               __field(u8,                     got_uptodate            )
+               __field(u64,                    src_pos_inode           )
+               __field(u64,                    src_pos_offset          )
+               __field(u32,                    src_pos_snapshot        )
+               __field(u8,                     src_locks               )
+               __field(u8,                     src_uptodate            )
        ),
 
        TP_fast_assign(
-               __entry->caller                 = caller;
-               __entry->ip                     = ip;
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
                __entry->btree_id               = btree_id;
-               __entry->uptodate               = uptodate;
-               __entry->pos_want_inode         = pos_want->inode;
-               __entry->pos_want_offset        = pos_want->offset;
-               __entry->pos_want_snapshot      = pos_want->snapshot;
-               __entry->pos_found_inode        = pos_found->inode;
-               __entry->pos_found_offset       = pos_found->offset;
-               __entry->pos_found_snapshot     = pos_found->snapshot;
-       ),
-
-       TP_printk("%ps %pS btree %u uptodate %u want %llu:%llu:%u locks %u found %llu:%llu:%u locks %u",
-                 (void *) __entry->caller,
-                 (void *) __entry->ip,
+               __entry->got_pos_inode          = got_pos->inode;
+               __entry->got_pos_offset         = got_pos->offset;
+               __entry->got_pos_snapshot       = got_pos->snapshot;
+               __entry->got_locks              = got_locks;
+               __entry->got_uptodate           = got_uptodate;
+               __entry->src_pos_inode          = src_pos->inode;
+               __entry->src_pos_offset         = src_pos->offset;
+               __entry->src_pos_snapshot       = src_pos->snapshot;
+               __entry->src_locks              = src_locks;
+               __entry->src_uptodate           = src_uptodate;
+       ),
+
+       TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u "
+                 "src %llu:%llu:%u l %u u %u",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
                  __entry->btree_id,
-                 __entry->uptodate,
-                 __entry->pos_want_inode,
-                 __entry->pos_want_offset,
-                 __entry->pos_want_snapshot,
-                 __entry->locks_want,
-                 __entry->pos_found_inode,
-                 __entry->pos_found_offset,
-                 __entry->pos_found_snapshot,
-                 __entry->locks_found)
+                 __entry->got_pos_inode,
+                 __entry->got_pos_offset,
+                 __entry->got_pos_snapshot,
+                 __entry->got_locks,
+                 __entry->got_uptodate,
+                 __entry->src_pos_inode,
+                 __entry->src_pos_offset,
+                 __entry->src_pos_snapshot,
+                 __entry->src_locks,
+                 __entry->src_uptodate)
 );
 
 TRACE_EVENT(transaction_restart_ip,
@@ -614,28 +621,241 @@ TRACE_EVENT(transaction_restart_ip,
 );
 
 DECLARE_EVENT_CLASS(transaction_restart,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip),
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip),
 
        TP_STRUCT__entry(
-               __field(unsigned long,          ip      )
+               __field(unsigned long,          trans_ip        )
+               __field(unsigned long,          caller_ip       )
        ),
 
        TP_fast_assign(
-               __entry->ip = ip;
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
        ),
 
-       TP_printk("%pS", (void *) __entry->ip)
+       TP_printk("%pS %pS",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_btree_node_reused,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
+DEFINE_EVENT(transaction_restart,      trans_blocked_journal_reclaim,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_blocked_journal_reclaim,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
+DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      trans_restart_journal_preres_get,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      trans_restart_fault_inject,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      trans_traverse_all,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip),
+       TP_ARGS(trans_ip, caller_ip)
+);
+
+DECLARE_EVENT_CLASS(transaction_restart_iter,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,          trans_ip        )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     btree_id        )
+               __field(u64,                    pos_inode       )
+               __field(u64,                    pos_offset      )
+               __field(u32,                    pos_snapshot    )
+       ),
+
+       TP_fast_assign(
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = btree_id;
+               __entry->pos_inode              = pos->inode;
+               __entry->pos_offset             = pos->offset;
+               __entry->pos_snapshot           = pos->snapshot;
+       ),
+
+       TP_printk("%ps %pS btree %u pos %llu:%llu:%u",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
+                 __entry->btree_id,
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_mark,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+);
+
+TRACE_EVENT(iter_traverse,
+       TP_PROTO(unsigned long  trans_ip,
+                unsigned long  caller_ip,
+                enum btree_id  btree_id,
+                struct bpos    *pos,
+                int ret),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,          trans_ip        )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     btree_id        )
+               __field(u64,                    pos_inode       )
+               __field(u64,                    pos_offset      )
+               __field(u32,                    pos_snapshot    )
+               __field(s32,                    ret             )
+       ),
+
+       TP_fast_assign(
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = btree_id;
+               __entry->pos_inode              = pos->inode;
+               __entry->pos_offset             = pos->offset;
+               __entry->pos_snapshot           = pos->snapshot;
+               __entry->ret                    = ret;
+       ),
+
+       TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
+                 __entry->btree_id,
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
+                 __entry->ret)
+);
+
+TRACE_EVENT(iter_set_search_pos,
+       TP_PROTO(unsigned long  trans_ip,
+                unsigned long  caller_ip,
+                enum btree_id  btree_id,
+                struct bpos    *old_pos,
+                struct bpos    *new_pos,
+                unsigned       good_level),
+       TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,          trans_ip                )
+               __field(unsigned long,          caller_ip               )
+               __field(u8,                     btree_id                )
+               __field(u64,                    old_pos_inode           )
+               __field(u64,                    old_pos_offset          )
+               __field(u32,                    old_pos_snapshot        )
+               __field(u64,                    new_pos_inode           )
+               __field(u64,                    new_pos_offset          )
+               __field(u32,                    new_pos_snapshot        )
+               __field(u8,                     good_level              )
+       ),
+
+       TP_fast_assign(
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = btree_id;
+               __entry->old_pos_inode          = old_pos->inode;
+               __entry->old_pos_offset         = old_pos->offset;
+               __entry->old_pos_snapshot       = old_pos->snapshot;
+               __entry->new_pos_inode          = new_pos->inode;
+               __entry->new_pos_offset         = new_pos->offset;
+               __entry->new_pos_snapshot       = new_pos->snapshot;
+               __entry->good_level             = good_level;
+       ),
+
+       TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
+                 __entry->btree_id,
+                 __entry->old_pos_inode,
+                 __entry->old_pos_offset,
+                 __entry->old_pos_snapshot,
+                 __entry->new_pos_inode,
+                 __entry->new_pos_offset,
+                 __entry->new_pos_snapshot,
+                 __entry->good_level)
 );
 
 TRACE_EVENT(trans_restart_would_deadlock,
@@ -730,97 +950,70 @@ TRACE_EVENT(trans_restart_mem_realloced,
                  __entry->bytes)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_preres_get,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_fault_inject,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_btree_node_split,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_mark,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_upgrade,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_iter_upgrade,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_relock,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_restart_traverse,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
-DEFINE_EVENT(transaction_restart,      trans_traverse_all,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
 DECLARE_EVENT_CLASS(node_lock_fail,
-       TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(level, iter_seq, node, node_seq),
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos,
+                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos,
+               level, iter_seq, node, node_seq),
 
        TP_STRUCT__entry(
-               __field(u32,            level)
-               __field(u32,            iter_seq)
-               __field(u32,            node)
-               __field(u32,            node_seq)
+               __field(unsigned long,          trans_ip        )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     btree_id        )
+               __field(u64,                    pos_inode       )
+               __field(u64,                    pos_offset      )
+               __field(u32,                    pos_snapshot    )
+               __field(u32,                    level           )
+               __field(u32,                    iter_seq        )
+               __field(u32,                    node            )
+               __field(u32,                    node_seq        )
        ),
 
        TP_fast_assign(
-               __entry->level          = level;
-               __entry->iter_seq       = iter_seq;
-               __entry->node           = node;
-               __entry->node_seq       = node_seq;
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = btree_id;
+               __entry->pos_inode              = pos->inode;
+               __entry->pos_offset             = pos->offset;
+               __entry->pos_snapshot           = pos->snapshot;
+               __entry->level                  = level;
+               __entry->iter_seq               = iter_seq;
+               __entry->node                   = node;
+               __entry->node_seq               = node_seq;
        ),
 
-       TP_printk("level %u iter seq %u node %u node seq %u",
+       TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
+                 __entry->btree_id,
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
                  __entry->level, __entry->iter_seq,
                  __entry->node, __entry->node_seq)
 );
 
 DEFINE_EVENT(node_lock_fail, node_upgrade_fail,
-       TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(level, iter_seq, node, node_seq)
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos,
+                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos,
+               level, iter_seq, node, node_seq)
 );
 
 DEFINE_EVENT(node_lock_fail, node_relock_fail,
-       TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(level, iter_seq, node, node_seq)
+       TP_PROTO(unsigned long trans_ip,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos,
+                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
+       TP_ARGS(trans_ip, caller_ip, btree_id, pos,
+               level, iter_seq, node, node_seq)
 );
 
 #endif /* _TRACE_BCACHEFS_H */
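
Not part of the commit: any future restart point that has an iterator in
hand can reuse the transaction_restart_iter event class added above; only
the event name in this sketch is hypothetical.

/* Sketch: a hypothetical new restart event reusing the class from this patch. */
DEFINE_EVENT(transaction_restart_iter,	trans_restart_example,
	TP_PROTO(unsigned long trans_ip,
		 unsigned long caller_ip,
		 enum btree_id btree_id,
		 struct bpos *pos),
	TP_ARGS(trans_ip, caller_ip, btree_id, pos)
);

The generated trace_trans_restart_example() would then be called with
(trans->ip, _RET_IP_, iter->btree_id, &iter->real_pos), matching the
pattern used throughout this patch.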