bcachefs: Improved btree write statistics
author Kent Overstreet <kent.overstreet@linux.dev>
Fri, 28 Oct 2022 21:08:41 +0000 (17:08 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:45 +0000 (17:09 -0400)
This replaces sysfs btree_avg_write_size with btree_write_stats, which
now breaks out statistics by the source of the btree write.

Btree writes that are too small are a source of inefficiency and of
excessive btree resort overhead; this will let us see what's causing
them.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/sysfs.c

index 544621dd4af409336a3f7d0f4da87f90d09575a7..18fe09cdae4d09b57809061d8ae2307130cbfadf 100644 (file)
@@ -596,6 +596,23 @@ typedef struct {
 #define BCACHEFS_ROOT_SUBVOL_INUM                                      \
        ((subvol_inum) { BCACHEFS_ROOT_SUBVOL,  BCACHEFS_ROOT_INO })
 
+#define BCH_BTREE_WRITE_TYPES()                                                \
+       x(initial,              0)                                      \
+       x(init_next_bset,       1)                                      \
+       x(cache_reclaim,        2)                                      \
+       x(journal_reclaim,      3)                                      \
+       x(interior,             4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+       BCH_BTREE_WRITE_TYPES()
+#undef x
+       BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK  (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS  ilog2(BTREE_WRITE_TYPE_MASK)
+
 struct bch_fs {
        struct closure          cl;
 
@@ -705,6 +722,13 @@ struct bch_fs {
        struct workqueue_struct *btree_interior_update_worker;
        struct work_struct      btree_interior_update_work;
 
+       /* btree_io.c: */
+       spinlock_t              btree_write_error_lock;
+       struct btree_write_stats {
+               atomic64_t      nr;
+               atomic64_t      bytes;
+       }                       btree_write_stats[BTREE_WRITE_TYPE_NR];
+
        /* btree_iter.c: */
        struct mutex            btree_trans_lock;
        struct list_head        btree_trans_list;
@@ -880,11 +904,6 @@ mempool_t          bio_bounce_pages;
        struct bio_set          dio_write_bioset;
        struct bio_set          dio_read_bioset;
 
-
-       atomic64_t              btree_writes_nr;
-       atomic64_t              btree_writes_sectors;
-       spinlock_t              btree_write_error_lock;
-
        /* ERRORS */
        struct list_head        fsck_errors;
        struct mutex            fsck_error_lock;
index 135c3ea1377d70a9f96f845132562cfa95b155b7..709453a909fc0056822010484d3feb5bb50d34fb 100644 (file)
@@ -241,9 +241,11 @@ wait_on_io:
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
-                       bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
+                       bch2_btree_node_write(c, b, SIX_LOCK_intent,
+                                             BTREE_WRITE_cache_reclaim);
                else
-                       __bch2_btree_node_write(c, b, 0);
+                       __bch2_btree_node_write(c, b,
+                                               BTREE_WRITE_cache_reclaim);
 
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
@@ -347,7 +349,7 @@ restart:
                           six_trylock_read(&b->c.lock)) {
                        list_move(&bc->live, &b->list);
                        mutex_unlock(&bc->lock);
-                       __bch2_btree_node_write(c, b, 0);
+                       __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                        six_unlock_read(&b->c.lock);
                        if (touched >= nr)
                                goto out_nounlock;
@@ -624,6 +626,7 @@ out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
+       b->write_type           = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
@@ -1067,7 +1070,7 @@ wait_on_io:
        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
 
        if (btree_node_dirty(b)) {
-               __bch2_btree_node_write(c, b, 0);
+               __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
index a322a83676881bdd3b2d08e361dd316482757972..56f9637d2ca6a5afa6d3adc582d142e3d0a5909e 100644 (file)
@@ -471,7 +471,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
                };
 
                if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
-                       bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
+                       bch2_btree_node_write(c, b, SIX_LOCK_write,
+                                             BTREE_WRITE_init_next_bset);
                        reinit_iter = true;
                }
        }
@@ -1646,7 +1647,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
        } while ((v = cmpxchg(&b->flags, old, new)) != old);
 
        if (new & (1U << BTREE_NODE_write_in_flight))
-               __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
+               __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type);
        else
                wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
 }
@@ -1795,6 +1796,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
        bool used_mempool;
        unsigned long old, new;
        bool validate_before_checksum = false;
+       enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
        void *data;
        int ret;
 
@@ -1841,6 +1843,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
        if (new & (1U << BTREE_NODE_need_write))
                return;
 do_write:
+       if ((flags & BTREE_WRITE_ONLY_IF_NEED))
+               type = b->write_type;
+       b->write_type = 0;
+
+       BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
        atomic_dec(&c->btree_cache.dirty);
 
        BUG_ON(btree_node_fake(b));
@@ -2015,8 +2023,8 @@ do_write:
                bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
                        cpu_to_le16(b->written);
 
-       atomic64_inc(&c->btree_writes_nr);
-       atomic64_add(sectors_to_write, &c->btree_writes_sectors);
+       atomic64_inc(&c->btree_write_stats[type].nr);
+       atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
 
        INIT_WORK(&wbio->work, btree_write_submit);
        queue_work(c->io_complete_wq, &wbio->work);
@@ -2144,3 +2152,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c)
 {
        return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
 }
+
+/*
+ * Names of the btree write types, indexed by the number given in
+ * BCH_BTREE_WRITE_TYPES(); the entry after the last type is NULL.
+ */
+const char * const bch2_btree_write_types[] = {
+#define x(t, n) [n] = #t,
+       BCH_BTREE_WRITE_TYPES()
+#undef x       /* don't leak the x-macro helper into the rest of the file */
+       NULL
+};
+
+/*
+ * Print a table of btree write statistics to @out: a header row, then one
+ * row per btree write type giving the number of writes and the average write
+ * size (total bytes / number of writes, or 0 when there were no writes).
+ */
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       unsigned type;
+
+       printbuf_tabstop_push(out, 20);
+       printbuf_tabstop_push(out, 10);
+
+       /* Header row: the first column (the type name) is left empty */
+       prt_tab(out);
+       prt_str(out, "nr");
+       prt_tab(out);
+       prt_str(out, "size");
+       prt_newline(out);
+
+       for (type = 0; type < BTREE_WRITE_TYPE_NR; type++) {
+               struct btree_write_stats *stats = &c->btree_write_stats[type];
+               u64 writes      = atomic64_read(&stats->nr);
+               u64 total       = atomic64_read(&stats->bytes);
+               u64 avg         = 0;
+
+               /* Avoid dividing by zero for types that never wrote */
+               if (writes)
+                       avg = div64_u64(total, writes);
+
+               prt_printf(out, "%s:", bch2_btree_write_types[type]);
+               prt_tab(out);
+               prt_u64(out, writes);
+               prt_tab(out);
+               prt_human_readable_u64(out, avg);
+               prt_newline(out);
+       }
+}
index 8af853642123df33276aad4cf1bad547001e7e6a..4b1810ad7d912dd15f28498256b061ed57a86ece 100644 (file)
@@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
 
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
-#define BTREE_WRITE_ONLY_IF_NEED       (1U << 0)
-#define BTREE_WRITE_ALREADY_STARTED    (1U << 1)
+/*
+ * Flag bits for the flags argument of __bch2_btree_node_write(). Bit numbers
+ * start at BTREE_WRITE_TYPE_BITS; the low bits of the same argument hold an
+ * enum btree_write_type, extracted with BTREE_WRITE_TYPE_MASK.
+ */
+enum btree_write_flags {
+       __BTREE_WRITE_ONLY_IF_NEED      = BTREE_WRITE_TYPE_BITS,
+       __BTREE_WRITE_ALREADY_STARTED   = BTREE_WRITE_TYPE_BITS + 1,
+};
+#define BTREE_WRITE_ONLY_IF_NEED       (1U << __BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED    (1U << __BTREE_WRITE_ALREADY_STARTED)
 
 void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
                bn->min_key = bpos_nosnap_successor(bn->min_key);
 }
 
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
 #endif /* _BCACHEFS_BTREE_IO_H */
index ea844dd7a16ba628d2c73dd9deda4b3a3a5167b3..38c4754dbd7e0a2d5dff6439790730016f13dd96 100644 (file)
@@ -77,6 +77,7 @@ struct btree {
        u8                      nsets;
        u8                      nr_key_bits;
        u16                     version_ondisk;
+       u8                      write_type;
 
        struct bkey_format      format;
 
index 0150943074fa9148b4709722390740d7f1043047..e0483abadd72ec654281df661c88c273944fe565 100644 (file)
@@ -1308,6 +1308,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
        bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
        set_btree_node_dirty_acct(c, b);
        set_btree_node_need_write(b);
+       b->write_type = BTREE_WRITE_interior;
 
        printbuf_exit(&buf);
 }
index dabe815965445484d2a24c7ab801d7bf0e19049a..2e6d220c3bcd6e005889b22a60361fb34f32c1aa 100644 (file)
@@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
        struct bkey_packed k;
 
        BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+       EBUG_ON(btree_node_just_written(b));
 
        if (!bkey_pack_pos(&k, pos, b)) {
                struct bkey *u = (void *) &k;
index fc53958e561981f3bcf3ad2d6c836f090d5aff51..8cc271030be62027819d29d1fd185326f30aaf6d 100644 (file)
@@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
                new |= 1 << BTREE_NODE_need_write;
        } while ((v = cmpxchg(&b->flags, old, new)) != old);
 
+       b->write_type = BTREE_WRITE_journal_reclaim;
+
        btree_node_write_if_need(c, b, SIX_LOCK_read);
        six_unlock_read(&b->c.lock);
 
index 76301209898f15294c6c6156c33f02e31646030c..db3d377ba10c4c97b1e8721895ff8e32e5b368d0 100644 (file)
@@ -183,7 +183,7 @@ read_attribute(io_latency_stats_read);
 read_attribute(io_latency_stats_write);
 read_attribute(congested);
 
-read_attribute(btree_avg_write_size);
+read_attribute(btree_write_stats);
 
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
@@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
        return ret;
 }
 
-static size_t bch2_btree_avg_write_size(struct bch_fs *c)
-{
-       u64 nr = atomic64_read(&c->btree_writes_nr);
-       u64 sectors = atomic64_read(&c->btree_writes_sectors);
-
-       return nr ? div64_u64(sectors, nr) : 0;
-}
-
 static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
 {
        long ret = 0;
@@ -396,7 +388,9 @@ SHOW(bch2_fs)
        sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
 
        sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
-       sysfs_hprint(btree_avg_write_size,      bch2_btree_avg_write_size(c));
+
+       if (attr == &sysfs_btree_write_stats)
+               bch2_btree_write_stats_to_text(out, c);
 
        sysfs_printf(btree_gc_periodic, "%u",   (int) c->btree_gc_periodic);
 
@@ -557,7 +551,7 @@ SYSFS_OPS(bch2_fs);
 struct attribute *bch2_fs_files[] = {
        &sysfs_minor,
        &sysfs_btree_cache_size,
-       &sysfs_btree_avg_write_size,
+       &sysfs_btree_write_stats,
 
        &sysfs_promote_whole_extents,