bcachefs: Don't downgrade locks on transaction restart
author Kent Overstreet <kent.overstreet@linux.dev>
Fri, 27 Oct 2023 19:23:46 +0000 (15:23 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Thu, 2 Nov 2023 01:11:08 +0000 (21:11 -0400)
We should only be downgrading locks on success - otherwise, our
transaction restarts won't be getting the correct locks and we'll
livelock.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_locking.c
fs/bcachefs/btree_locking.h
fs/bcachefs/btree_trans_commit.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/data_update.c
fs/bcachefs/trace.h

index 3b629420655abc4c8c9ecd55346a4cbbcca963e2..0622f729411fd809580757f5bcc91bd68e23d965 100644 (file)
@@ -1523,6 +1523,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
        path->ref               = 0;
        path->intent_ref        = 0;
        path->nodes_locked      = 0;
+       path->alloc_seq++;
 
        btree_path_list_add(trans, pos, path);
        trans->paths_sorted = false;
@@ -1598,7 +1599,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
 
        locks_want = min(locks_want, BTREE_MAX_DEPTH);
        if (locks_want > path->locks_want)
-               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
 
        return path;
 }
index 634ffdcb55f9d0d7252962fa6323cbaac6b78b1b..3304bff7d46420136396bf2330f245be53135863 100644 (file)
@@ -509,7 +509,7 @@ fill:
                 * path->uptodate yet:
                 */
                if (!path->locks_want &&
-                   !__bch2_btree_path_upgrade(trans, path, 1)) {
+                   !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
                        trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
                        ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
                        goto err;
index 40c8ed8f7bf187ddd83974bce2d32236b5942973..bc45cd2a34a41fa66be3b795ce05a26c6397340a 100644 (file)
@@ -431,7 +431,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
 
 static inline bool btree_path_get_locks(struct btree_trans *trans,
                                        struct btree_path *path,
-                                       bool upgrade)
+                                       bool upgrade,
+                                       struct get_locks_fail *f)
 {
        unsigned l = path->level;
        int fail_idx = -1;
@@ -442,8 +443,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
 
                if (!(upgrade
                      ? bch2_btree_node_upgrade(trans, path, l)
-                     : bch2_btree_node_relock(trans, path, l)))
-                       fail_idx = l;
+                     : bch2_btree_node_relock(trans, path, l))) {
+                       fail_idx        = l;
+
+                       if (f) {
+                               f->l    = l;
+                               f->b    = path->l[l].b;
+                       }
+               }
 
                l++;
        } while (l < path->locks_want);
@@ -584,7 +591,9 @@ __flatten
 bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
 {
-       return btree_path_get_locks(trans, path, false);
+       struct get_locks_fail f;
+
+       return btree_path_get_locks(trans, path, false, &f);
 }
 
 int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -600,22 +609,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
 
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
                               struct btree_path *path,
-                              unsigned new_locks_want)
+                              unsigned new_locks_want,
+                              struct get_locks_fail *f)
 {
        EBUG_ON(path->locks_want >= new_locks_want);
 
        path->locks_want = new_locks_want;
 
-       return btree_path_get_locks(trans, path, true);
+       return btree_path_get_locks(trans, path, true, f);
 }
 
 bool __bch2_btree_path_upgrade(struct btree_trans *trans,
                               struct btree_path *path,
-                              unsigned new_locks_want)
+                              unsigned new_locks_want,
+                              struct get_locks_fail *f)
 {
        struct btree_path *linked;
 
-       if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+       if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
                return true;
 
        /*
@@ -644,7 +655,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
                            linked->btree_id == path->btree_id &&
                            linked->locks_want < new_locks_want) {
                                linked->locks_want = new_locks_want;
-                               btree_path_get_locks(trans, linked, true);
+                               btree_path_get_locks(trans, linked, true, NULL);
                        }
 
        return false;
@@ -656,6 +667,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
 {
        unsigned l;
 
+       if (trans->restarted)
+               return;
+
        EBUG_ON(path->locks_want < new_locks_want);
 
        path->locks_want = new_locks_want;
@@ -674,6 +688,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
        }
 
        bch2_btree_path_verify_locks(path);
+
+       path->downgrade_seq++;
+       trace_path_downgrade(trans, _RET_IP_, path);
 }
 
 /* Btree transaction locking: */
@@ -682,6 +699,9 @@ void bch2_trans_downgrade(struct btree_trans *trans)
 {
        struct btree_path *path;
 
+       if (trans->restarted)
+               return;
+
        trans_for_each_path(trans, path)
                bch2_btree_path_downgrade(trans, path);
 }
index 6231e9ffc5d7497b693febe166e64560a6f024c9..11b0a2c8cd691b21afccdcc38486aa060351f62a 100644 (file)
@@ -355,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
 
 /* upgrade */
 
+
+struct get_locks_fail {
+       unsigned        l;
+       struct btree    *b;
+};
+
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
-                              struct btree_path *, unsigned);
+                              struct btree_path *, unsigned,
+                              struct get_locks_fail *);
+
 bool __bch2_btree_path_upgrade(struct btree_trans *,
-                              struct btree_path *, unsigned);
+                              struct btree_path *, unsigned,
+                              struct get_locks_fail *);
 
 static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
                                          struct btree_path *path,
                                          unsigned new_locks_want)
 {
+       struct get_locks_fail f;
        unsigned old_locks_want = path->locks_want;
 
        new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
        if (path->locks_want < new_locks_want
-           ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+           ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
            : path->uptodate == BTREE_ITER_UPTODATE)
                return 0;
 
        trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
-                       old_locks_want, new_locks_want);
+                       old_locks_want, new_locks_want, &f);
        return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
 }
 
index 53ddcaf042a20b255f6e73bb5a72d42d77c03e7a..8140b6e6e9a65245615397dd60489feb57240da1 100644 (file)
@@ -861,12 +861,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
         */
        bch2_journal_res_put(&c->journal, &trans->journal_res);
 
-       if (unlikely(ret))
-               return ret;
-
-       bch2_trans_downgrade(trans);
-
-       return 0;
+       return ret;
 }
 
 static int journal_reclaim_wait_done(struct bch_fs *c)
@@ -1135,6 +1130,8 @@ out:
        if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
                bch2_write_ref_put(c, BCH_WRITE_REF_trans);
 out_reset:
+       if (!ret)
+               bch2_trans_downgrade(trans);
        bch2_trans_reset_updates(trans);
 
        return ret;
index a039ce4a48094b5ed574a654bef7d9416da6d6db..ecbb44b939a05cd3560c68e7745e4493aea66e64 100644 (file)
@@ -228,6 +228,8 @@ struct btree_path {
        u8                      sorted_idx;
        u8                      ref;
        u8                      intent_ref;
+       u32                     alloc_seq;
+       u32                     downgrade_seq;
 
        /* btree_iter_copy starts here: */
        struct bpos             pos;
index 818a83f35d276149ebbcdd2d1e1430f4c281cae1..d029e0348c918a292d596af7ce520588235ccabd 100644 (file)
@@ -1987,7 +1987,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 out:
        if (new_path)
                bch2_path_put(trans, new_path, true);
-       bch2_btree_path_downgrade(trans, iter->path);
+       bch2_trans_downgrade(trans);
        return ret;
 err:
        bch2_btree_node_free_never_used(as, trans, n);
index d116f2f03db24a8949ac9bc728cfea2b80e9ce30..0771a6d880bf5e2e4efcbcc21d91d34b64160dd4 100644 (file)
@@ -162,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
                        if (((1U << i) & m->data_opts.rewrite_ptrs) &&
                            (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
                            !ptr->cached) {
-                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-                               /*
-                                * See comment below:
                                bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
-                               */
                                rewrites_found |= 1U << i;
                        }
                        i++;
@@ -212,14 +208,8 @@ restart_drop_extra_replicas:
                        if (!p.ptr.cached &&
                            durability - ptr_durability >= m->op.opts.data_replicas) {
                                durability -= ptr_durability;
-                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
-                               /*
-                                * Currently, we're dropping unneeded replicas
-                                * instead of marking them as cached, since
-                                * cached data in stripe buckets prevents them
-                                * from being reused:
+
                                bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
-                               */
                                goto restart_drop_extra_replicas;
                        }
                }
index 81f72b2add09a5dc250cfe8646d2b9a37cbaa45b..893304a1f06e6ea03df55020cf7be26f349d8cfe 100644 (file)
@@ -1043,13 +1043,16 @@ DEFINE_EVENT(transaction_restart_iter,  trans_restart_btree_node_split,
        TP_ARGS(trans, caller_ip, path)
 );
 
+struct get_locks_fail;
+
 TRACE_EVENT(trans_restart_upgrade,
        TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
                 struct btree_path *path,
                 unsigned old_locks_want,
-                unsigned new_locks_want),
-       TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
+                unsigned new_locks_want,
+                struct get_locks_fail *f),
+       TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
 
        TP_STRUCT__entry(
                __array(char,                   trans_fn, 32    )
@@ -1057,6 +1060,11 @@ TRACE_EVENT(trans_restart_upgrade,
                __field(u8,                     btree_id        )
                __field(u8,                     old_locks_want  )
                __field(u8,                     new_locks_want  )
+               __field(u8,                     level           )
+               __field(u32,                    path_seq        )
+               __field(u32,                    node_seq        )
+               __field(u32,                    path_alloc_seq  )
+               __field(u32,                    downgrade_seq)
                TRACE_BPOS_entries(pos)
        ),
 
@@ -1066,10 +1074,15 @@ TRACE_EVENT(trans_restart_upgrade,
                __entry->btree_id               = path->btree_id;
                __entry->old_locks_want         = old_locks_want;
                __entry->new_locks_want         = new_locks_want;
+               __entry->level                  = f->l;
+               __entry->path_seq               = path->l[f->l].lock_seq;
+               __entry->node_seq               = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+               __entry->path_alloc_seq         = path->alloc_seq;
+               __entry->downgrade_seq          = path->downgrade_seq;
                TRACE_BPOS_assign(pos, path->pos)
        ),
 
-       TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
                  bch2_btree_id_str(__entry->btree_id),
@@ -1077,7 +1090,12 @@ TRACE_EVENT(trans_restart_upgrade,
                  __entry->pos_offset,
                  __entry->pos_snapshot,
                  __entry->old_locks_want,
-                 __entry->new_locks_want)
+                 __entry->new_locks_want,
+                 __entry->level,
+                 __entry->path_seq,
+                 __entry->node_seq,
+                 __entry->path_alloc_seq,
+                 __entry->downgrade_seq)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
@@ -1238,6 +1256,27 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
                  __entry->new_u64s)
 );
 
+TRACE_EVENT(path_downgrade,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip,
+                struct btree_path *path),
+       TP_ARGS(trans, caller_ip, path),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+       ),
+
+       TP_fast_assign(
+               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+       ),
+
+       TP_printk("%s %pS",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip)
+);
+
 DEFINE_EVENT(transaction_event,        trans_restart_write_buffer_flush,
        TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),