bcachefs: More allocator startup improvements
author Kent Overstreet <kent.overstreet@gmail.com>
Sun, 13 Jan 2019 21:02:22 +0000 (16:02 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:14 +0000 (17:08 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_locking.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/buckets.c
fs/bcachefs/util.c

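diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9c9464efd333f7c99033de96749e40dcafe75795..871a41b923da7909b3a624061ddfc2035ead6411 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c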
@@ -347,12 +347,14 @@ err:
        return ret;
 }
 
-int bch2_alloc_write(struct bch_fs *c)
+int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
 {
        struct bch_dev *ca;
        unsigned i;
        int ret = 0;
 
+       *wrote = false;
+
        for_each_rw_member(ca, c, i) {
                struct btree_iter iter;
                struct bucket_array *buckets;
@@ -370,9 +372,14 @@ int bch2_alloc_write(struct bch_fs *c)
                        if (!buckets->b[b].mark.dirty)
                                continue;
 
-                       ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, 0);
+                       ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
+                                                    nowait
+                                                    ? BTREE_INSERT_NOWAIT
+                                                    : 0);
                        if (ret)
                                break;
+
+                       *wrote = true;
                }
                up_read(&ca->bucket_lock);
                bch2_btree_iter_unlock(&iter);
@@ -1270,20 +1277,23 @@ static void flush_held_btree_writes(struct bch_fs *c)
        struct bucket_table *tbl;
        struct rhash_head *pos;
        struct btree *b;
-       bool flush_updates;
-       size_t i, nr_pending_updates;
+       bool nodes_blocked;
+       size_t i;
+       struct closure cl;
+
+       closure_init_stack(&cl);
 
        clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
 again:
        pr_debug("flushing dirty btree nodes");
        cond_resched();
+       closure_wait(&c->btree_interior_update_wait, &cl);
 
-       flush_updates = false;
-       nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+       nodes_blocked = false;
 
        rcu_read_lock();
        for_each_cached_btree(b, c, tbl, i, pos)
-               if (btree_node_dirty(b) && (!b->written || b->level)) {
+               if (btree_node_need_write(b)) {
                        if (btree_node_may_write(b)) {
                                rcu_read_unlock();
                                btree_node_lock_type(c, b, SIX_LOCK_read);
@@ -1291,7 +1301,7 @@ again:
                                six_unlock_read(&b->lock);
                                goto again;
                        } else {
-                               flush_updates = true;
+                               nodes_blocked = true;
                        }
                }
        rcu_read_unlock();
@@ -1299,17 +1309,16 @@ again:
        if (c->btree_roots_dirty)
                bch2_journal_meta(&c->journal);
 
-       /*
-        * This is ugly, but it's needed to flush btree node writes
-        * without spinning...
-        */
-       if (flush_updates) {
-               closure_wait_event(&c->btree_interior_update_wait,
-                                  bch2_btree_interior_updates_nr_pending(c) <
-                                  nr_pending_updates);
+       if (nodes_blocked) {
+               closure_sync(&cl);
                goto again;
        }
 
+       closure_wake_up(&c->btree_interior_update_wait);
+       closure_sync(&cl);
+
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
 }
 
 static void allocator_start_issue_discards(struct bch_fs *c)
@@ -1331,13 +1340,10 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
        unsigned dev_iter;
        u64 journal_seq = 0;
        long bu;
-       bool invalidating_data = false;
        int ret = 0;
 
-       if (test_alloc_startup(c)) {
-               invalidating_data = true;
+       if (test_alloc_startup(c))
                goto not_enough;
-       }
 
        /* Scan for buckets that are already invalidated: */
        for_each_rw_member(ca, c, dev_iter) {
@@ -1384,21 +1390,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 not_enough:
        pr_debug("not enough empty buckets; scanning for reclaimable buckets");
 
-       for_each_rw_member(ca, c, dev_iter) {
-               find_reclaimable_buckets(c, ca);
-
-               while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
-                      (bu = next_alloc_bucket(ca)) >= 0) {
-                       invalidating_data |=
-                               bch2_invalidate_one_bucket(c, ca, bu, &journal_seq);
-
-                       fifo_push(&ca->free[RESERVE_BTREE], bu);
-                       bucket_set_dirty(ca, bu);
-               }
-       }
-
-       pr_debug("done scanning for reclaimable buckets");
-
        /*
         * We're moving buckets to freelists _before_ they've been marked as
         * invalidated on disk - we have to so that we can allocate new btree
@@ -1408,38 +1399,59 @@ not_enough:
         * have cached data in them, which is live until they're marked as
         * invalidated on disk:
         */
-       if (invalidating_data) {
-               pr_debug("invalidating existing data");
-               set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
-       } else {
-               pr_debug("issuing discards");
-               allocator_start_issue_discards(c);
-       }
+       set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
 
-       /*
-        * XXX: it's possible for this to deadlock waiting on journal reclaim,
-        * since we're holding btree writes. What then?
-        */
-       ret = bch2_alloc_write(c);
-       if (ret)
-               return ret;
+       while (1) {
+               bool wrote = false;
 
-       if (invalidating_data) {
-               pr_debug("flushing journal");
+               for_each_rw_member(ca, c, dev_iter) {
+                       find_reclaimable_buckets(c, ca);
 
-               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
-               if (ret)
-                       return ret;
+                       while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
+                              (bu = next_alloc_bucket(ca)) >= 0) {
+                               bch2_invalidate_one_bucket(c, ca, bu,
+                                                          &journal_seq);
+
+                               fifo_push(&ca->free[RESERVE_BTREE], bu);
+                               bucket_set_dirty(ca, bu);
+                       }
+               }
+
+               pr_debug("done scanning for reclaimable buckets");
+
+               /*
+                * XXX: it's possible for this to deadlock waiting on journal reclaim,
+                * since we're holding btree writes. What then?
+                */
+               ret = bch2_alloc_write(c, true, &wrote);
 
-               pr_debug("issuing discards");
-               allocator_start_issue_discards(c);
+               /*
+                * If bch2_alloc_write() did anything, it may have used some
+                * buckets, and we need the RESERVE_BTREE freelist full - so we
+                * need to loop and scan again.
+                * And if it errored, it may have been because there weren't
+                * enough buckets, so just scan and loop again as long as it
+                * made some progress:
+                */
+               if (!wrote && ret)
+                       return ret;
+               if (!wrote && !ret)
+                       break;
        }
 
+       pr_debug("flushing journal");
+
+       ret = bch2_journal_flush(&c->journal);
+       if (ret)
+               return ret;
+
+       pr_debug("issuing discards");
+       allocator_start_issue_discards(c);
+
        set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
 
        /* now flush dirty btree nodes: */
-       if (invalidating_data)
-               flush_held_btree_writes(c);
+       flush_held_btree_writes(c);
 
        return 0;
 }
@@ -1448,6 +1460,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned i;
+       bool wrote;
        int ret;
 
        down_read(&c->gc_lock);
@@ -1465,7 +1478,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
                }
        }
 
-       return bch2_alloc_write(c);
+       return bch2_alloc_write(c, false, &wrote);
 }
 
 void bch2_fs_allocator_background_init(struct bch_fs *c)
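diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 8ced4e845281ad07db3adc9658be9a8f930d6909..ef5ec659b05dc53183f3b6bee28b7962e73e9826 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h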
@@ -55,7 +55,7 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
-int bch2_alloc_write(struct bch_fs *);
+int bch2_alloc_write(struct bch_fs *, bool, bool *);
 int bch2_fs_allocator_start(struct bch_fs *);
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
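diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index b748afc778f4d3589aaab6b01b76239283941c90..65fc82fba0716b6b6d6cfbecfe6c1028c79dc0a0 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c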
@@ -171,6 +171,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
        if (!btree_node_may_write(b))
                goto out_unlock;
 
+       if (btree_node_dirty(b) &&
+           test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+               goto out_unlock;
+
        if (btree_node_dirty(b) ||
            btree_node_write_in_flight(b) ||
            btree_node_read_in_flight(b)) {
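diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index f205bddd814d1cc0db6f7ffb4b05a794c72796ff..6f1b1e4317a0f542330da8c9d4797918f976b8de 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c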
@@ -1330,8 +1330,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (!(old & (1 << BTREE_NODE_dirty)))
                        return;
 
-               if (b->written &&
-                   !btree_node_may_write(b))
+               if (!btree_node_may_write(b))
                        return;
 
                if (old & (1 << BTREE_NODE_write_in_flight)) {
@@ -1347,7 +1346,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
        BUG_ON(btree_node_fake(b));
-       BUG_ON(!list_empty(&b->write_blocked));
        BUG_ON((b->will_make_reachable != 0) != !b->written);
 
        BUG_ON(b->written >= c->opts.btree_node_size);
@@ -1685,15 +1683,13 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
                unsigned long flags = READ_ONCE(b->flags);
                unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
 
-               if (//!(flags & (1 << BTREE_NODE_dirty)) &&
-                   !b->writes[0].wait.list.first &&
-                   !b->writes[1].wait.list.first &&
-                   !(b->will_make_reachable & 1))
+               if (!(flags & (1 << BTREE_NODE_dirty)))
                        continue;
 
-               pr_buf(&out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
+               pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
                       b,
                       (flags & (1 << BTREE_NODE_dirty)) != 0,
+                      (flags & (1 << BTREE_NODE_need_write)) != 0,
                       b->level,
                       b->written,
                       !list_empty_careful(&b->write_blocked),
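diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 9c5a6f9471bd519695060e8478a9b74e2499bc99..c817aeed878adf0c005732b40eeaaa10d5aabe20 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h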
@@ -3,6 +3,7 @@
 #define _BCACHEFS_BTREE_IO_H
 
 #include "bset.h"
+#include "btree_locking.h"
 #include "extents.h"
 #include "io_types.h"
 
@@ -48,7 +49,7 @@ static inline void btree_node_wait_on_io(struct btree *b)
 static inline bool btree_node_may_write(struct btree *b)
 {
        return list_empty_careful(&b->write_blocked) &&
-               !b->will_make_reachable;
+               (!b->written || !b->will_make_reachable);
 }
 
 enum compact_mode {
@@ -100,42 +101,36 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
                          enum six_lock_type);
 
-/*
- * btree_node_dirty() can be cleared with only a read lock,
- * and for bch2_btree_node_write_cond() we want to set need_write iff it's
- * still dirty:
- */
-static inline void set_btree_node_need_write_if_dirty(struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
 {
-       unsigned long old, new, v = READ_ONCE(b->flags);
-
-       do {
-               old = new = v;
-
-               if (!(old & (1 << BTREE_NODE_dirty)))
-                       return;
-
-               new |= (1 << BTREE_NODE_need_write);
-       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+       while (b->written &&
+              btree_node_need_write(b) &&
+              btree_node_may_write(b)) {
+               if (!btree_node_write_in_flight(b)) {
+                       bch2_btree_node_write(c, b, SIX_LOCK_read);
+                       break;
+               }
+
+               six_unlock_read(&b->lock);
+               btree_node_wait_on_io(b);
+               btree_node_lock_type(c, b, SIX_LOCK_read);
+       }
 }
 
 #define bch2_btree_node_write_cond(_c, _b, cond)                       \
 do {                                                                   \
-       while ((_b)->written && btree_node_dirty(_b) && (cond)) {       \
-               if (!btree_node_may_write(_b)) {                        \
-                       set_btree_node_need_write_if_dirty(_b);         \
-                       break;                                          \
-               }                                                       \
+       unsigned long old, new, v = READ_ONCE((_b)->flags);             \
+                                                                       \
+       do {                                                            \
+               old = new = v;                                          \
                                                                        \
-               if (!btree_node_write_in_flight(_b)) {                  \
-                       bch2_btree_node_write(_c, _b, SIX_LOCK_read);   \
+               if (!(old & (1 << BTREE_NODE_dirty)) || !(cond))        \
                        break;                                          \
-               }                                                       \
                                                                        \
-               six_unlock_read(&(_b)->lock);                           \
-               btree_node_wait_on_io(_b);                              \
-               btree_node_lock_type(c, b, SIX_LOCK_read);              \
-       }                                                               \
+               new |= (1 << BTREE_NODE_need_write);                    \
+       } while ((v = cmpxchg(&(_b)->flags, old, new)) != old);         \
+                                                                       \
+       btree_node_write_if_need(_c, _b);                               \
 } while (0)
 
 void bch2_btree_flush_all_reads(struct bch_fs *);
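diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 912292dad6e59c3ffc8af136b0ce2382b4c0dd63..52e0e003153b5cd96e213bc8fa4daf671cce92a3 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h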
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_BTREE_ITER_H
 #define _BCACHEFS_BTREE_ITER_H
 
+#include "bset.h"
 #include "btree_types.h"
 
 static inline void btree_iter_set_dirty(struct btree_iter *iter,
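diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 3871e14e480dbcf3d47af134c0219b7f5bfe4481..48b50e0661869526f046719c0eafe0263479475e 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h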
@@ -11,7 +11,6 @@
  */
 
 #include "btree_iter.h"
-#include "btree_io.h"
 #include "six.h"
 
 /* matches six lock types */
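diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index a314bda544dd295efe7aeee39a7e99c59f4d4452..2efe191cdc30d102cffccdde1c08d0701d87cf57 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c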
@@ -367,6 +367,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 
        set_btree_node_accessed(b);
        set_btree_node_dirty(b);
+       set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
        memset(&b->nr, 0, sizeof(b->nr));
@@ -655,6 +656,12 @@ retry:
                closure_wait(&btree_current_write(b)->wait, cl);
 
                list_del(&as->write_blocked_list);
+
+               /*
+                * for flush_held_btree_writes() waiting on updates to flush or
+                * nodes to be writeable:
+                */
+               closure_wake_up(&c->btree_interior_update_wait);
                mutex_unlock(&c->btree_interior_update_lock);
 
                /*
@@ -958,6 +965,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
                list_del(&p->write_blocked_list);
                btree_update_reparent(as, p);
+
+               /*
+                * for flush_held_btree_writes() waiting on updates to flush or
+                * nodes to be writeable:
+                */
+               closure_wake_up(&c->btree_interior_update_wait);
        }
 
        clear_btree_node_dirty(b);
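diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 6501dcf12d592d423dc641422eca5e301453716a..34e5f81b2b5ed074ee05514618dc49db71369766 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c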
@@ -1038,7 +1038,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
        size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
        size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
-                                     btree_reserve);
+                                     btree_reserve * 2);
        bool resize = ca->buckets[0] != NULL,
             start_copygc = ca->copygc_thread != NULL;
        int ret = -ENOMEM;
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 8931aa6a1e2a9b5760f1015ff214763d8e7cf00b..d998e51dbc308d5aef2f1f7890e5258b9eebd68d 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -25,9 +25,6 @@
 #include "eytzinger.h"
 #include "util.h"
 
-#define simple_strtoint(c, end, base)  simple_strtol(c, end, base)
-#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
-
 static const char si_units[] = "?kMGTPEZY";
 
 static int __bch2_strtoh(const char *cp, u64 *res,