bcachefs: Fix for long running btree transactions & key cache
author Kent Overstreet <kent.overstreet@linux.dev>
Fri, 16 Dec 2022 02:44:32 +0000 (21:44 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:48 +0000 (17:09 -0400)
While a btree transaction is running, we hold a SRCU read lock on the
btree key cache that prevents btree key cache keys from being freed -
this is so that relock() operations won't access freed memory.

The downside of this is that long running btree transactions prevent
memory from being freed from the key cache. This adds a check in
bch2_trans_begin() - if the transaction has been running longer than 1
second, drop and retake the SRCU read lock and invalidate the node pointers
of unlocked key cache paths.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_types.h
fs/bcachefs/errcode.h

index c6ccf3add7338e7247726dece7f15102c0700056..669d2b0b384aeafde884322e128198561afcc245 100644 (file)
@@ -2756,6 +2756,20 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        return p;
 }
 
+static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+{      /* Drop and retake this transaction's SRCU read lock on c->btree_trans_barrier. */
+       struct bch_fs *c = trans->c;
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               if (path->cached && !btree_node_locked(path, 0)) /* cached path with level 0 not locked */
+                       path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); /* replace the pointer with an error marker before the read lock is dropped */
+
+       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); /* end the old read-side section */
+       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); /* start a new one and remember its index */
+       trans->srcu_lock_time   = jiffies; /* timestamp checked against jiffies in bch2_trans_begin() */
+}
+
 /**
  * bch2_trans_begin() - reset a transaction after a interrupted attempt
  * @trans: transaction to reset
@@ -2811,6 +2825,9 @@ u32 bch2_trans_begin(struct btree_trans *trans)
                bch2_trans_relock(trans);
        }
 
+       if (unlikely(time_after(jiffies, trans->srcu_lock_time + HZ)))
+               bch2_trans_reset_srcu_lock(trans);
+
        trans->last_restarted_ip = _RET_IP_;
        if (trans->restarted)
                bch2_btree_path_traverse_all(trans);
@@ -2897,6 +2914,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
                trans->nr_max_paths = s->nr_max_paths;
 
        trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+       trans->srcu_lock_time   = jiffies;
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
                struct btree_trans *pos;
index e47fd252c3fb98b61de1b261fc1642d728bb4404..390cfe63fbe87b1387de86d2ce74a855b2437ac5 100644 (file)
@@ -411,6 +411,7 @@ struct btree_trans {
        enum bch_errcode        restarted:16;
        u32                     restart_count;
        unsigned long           last_restarted_ip;
+       unsigned long           srcu_lock_time;
 
        /*
         * For when bch2_trans_update notices we'll be splitting a compressed
index dc388864be6f074e5746424eaee6e15d67277bc9..5f0f757267842839fcdec8597a5f17ff52f331f0 100644 (file)
@@ -53,6 +53,7 @@
        x(BCH_ERR_no_btree_node,        no_btree_node_down)                     \
        x(BCH_ERR_no_btree_node,        no_btree_node_init)                     \
        x(BCH_ERR_no_btree_node,        no_btree_node_cached)                   \
+       x(BCH_ERR_no_btree_node,        no_btree_node_srcu_reset)               \
        x(0,                            btree_insert_fail)                      \
        x(BCH_ERR_btree_insert_fail,    btree_insert_btree_node_full)           \
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_mark_replicas)        \