We now record the length of time btree locks are held and expose this in debugfs.
Enabled via CONFIG_BCACHEFS_LOCK_TIME_STATS.
Signed-off-by: Daniel Hill <daniel@gluo.nz>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
        help
        Include some unit and performance tests for the core btree code
 
+config BCACHEFS_LOCK_TIME_STATS
+       bool "bcachefs lock time statistics"
+       depends on BCACHEFS_FS
+       help
+       Expose statistics in debugfs for how long btree node locks are held.
+
 config BCACHEFS_NO_LATENCY_ACCT
        bool "disable latency accounting and time stats"
        depends on BCACHEFS_FS
 
 #undef BCH_DEBUG_PARAM
 #endif
 
+/* number of per-call-site lock hold-time stat slots (see struct lock_held_stats) */
+#define BCH_LOCK_TIME_NR 128
+
 #define BCH_TIME_STATS()                       \
        x(btree_node_mem_alloc)                 \
        x(btree_node_split)                     \
        unsigned                id;
 };
 
+/*
+ * Per-filesystem table of btree lock hold-time statistics, one slot per
+ * distinct transaction-init call site; a trans's slot is picked at init
+ * time and stored in btree_trans.
+ * NOTE(review): names[] is read/written without synchronization in the
+ * trans-init scan — confirm concurrent inits cannot race on slot claiming.
+ */
+struct lock_held_stats {
+       struct bch2_time_stats  times[BCH_LOCK_TIME_NR];        /* hold-time stats per slot */
+       const char              *names[BCH_LOCK_TIME_NR];       /* call-site name owning each slot */
+};
+
 struct bch_fs_pcpu {
        u64                     sectors_available;
 };
        bool                    promote_whole_extents;
 
        struct bch2_time_stats  times[BCH_TIME_STAT_NR];
+
+       struct lock_held_stats  lock_held_stats;
 };
 
 static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 
        if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
            (btree_node_lock_seq_matches(path, b, level) &&
             btree_node_lock_increment(trans, b, level, want))) {
-               mark_btree_node_locked(path, level, want);
+               mark_btree_node_locked(trans, path, level, want);
                return true;
        }
 fail:
 
        return false;
 success:
-       mark_btree_node_intent_locked(path, level);
+       mark_btree_node_intent_locked(trans, path, level);
        return true;
 }
 
                            t != BTREE_NODE_UNLOCKED) {
                                btree_node_unlock(trans, path, b->c.level);
                                six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-                               mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t);
+                               mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
                        }
 
                        btree_path_level_init(trans, path, b);
                        for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
                                path->l[i].b = NULL;
 
-                       mark_btree_node_locked(path, path->level, lock_type);
+                       mark_btree_node_locked(trans, path, path->level, lock_type);
                        btree_path_level_init(trans, path, b);
                        return 0;
                }
        if (unlikely(ret))
                goto err;
 
-       mark_btree_node_locked(path, level, lock_type);
+       mark_btree_node_locked(trans, path, level, lock_type);
        btree_path_level_init(trans, path, b);
 
        if (likely(!trans->journal_replay_not_finished &&
        trans->journal_replay_not_finished =
                !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
 
+       /*
+        * Claim a lock-time stats slot for this transaction's init call
+        * site: find an existing slot already holding @fn, or the first
+        * free (NULL) one.  The scan must be bounded — the original loop
+        * dereferenced names[] before any bounds check, reading past the
+        * end of the array once all BCH_LOCK_TIME_NR slots were in use.
+        */
+       while (trans->lock_name_idx < BCH_LOCK_TIME_NR &&
+              c->lock_held_stats.names[trans->lock_name_idx] != fn &&
+              c->lock_held_stats.names[trans->lock_name_idx] != 0)
+               trans->lock_name_idx++;
+
+       if (trans->lock_name_idx >= BCH_LOCK_TIME_NR)
+               pr_warn_once("lock_times array not big enough!");
+       else
+               c->lock_held_stats.names[trans->lock_name_idx] = fn;
+
        bch2_trans_alloc_paths(trans, c);
 
        if (expected_mem_bytes) {
 
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
                path->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);
                        goto retry;
                }
 
-               mark_btree_node_locked(path, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0, lock_want);
        }
 
        path->l[0].lock_seq     = ck->c.lock.state.seq;
 
        path->nodes_intent_locked &= ~(1 << level);
 }
 
-static inline void mark_btree_node_locked(struct btree_path *path,
+/*
+ * Record in the path's lock bitmask that @level is held with @type.
+ * "noreset": leaves lock_taken_time alone, so an in-progress hold-time
+ * measurement is preserved (contrast mark_btree_node_locked() below).
+ */
+static inline void mark_btree_node_locked_noreset(struct btree_trans *trans,
+                                         struct btree_path *path,
                                          unsigned level,
                                          enum six_lock_type type)
 {
        path->nodes_intent_locked |= type << level;
 }
 
-static inline void mark_btree_node_intent_locked(struct btree_path *path,
+/*
+ * Mark @level locked and, when lock-time stats are enabled, stamp the
+ * acquisition time so btree_node_unlock() can account the hold duration.
+ */
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+                                         struct btree_path *path,
+                                         unsigned level,
+                                         enum six_lock_type type)
+{
+       mark_btree_node_locked_noreset(trans, path, level, type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+       path->l[level].lock_taken_time = ktime_get_ns();
+#endif
+}
+
+/*
+ * Upgrade-to-intent marker: uses the noreset variant, presumably so the
+ * original acquisition timestamp is kept across the upgrade — TODO confirm.
+ */
+static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
+                                                struct btree_path *path,
                                                 unsigned level)
 {
-       mark_btree_node_locked(path, level, SIX_LOCK_intent);
+       mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent);
 }
 
 static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
-       if (lock_type != BTREE_NODE_UNLOCKED)
+       if (lock_type != BTREE_NODE_UNLOCKED) {
                six_unlock_type(&path->l[level].b->c.lock, lock_type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+               /*
+                * Fold the held duration into this trans's per-call-site
+                * stats slot; lock_taken_time is path-local, so reading it
+                * after the six_unlock above is safe.  Skipped when the
+                * slot table overflowed (idx >= BCH_LOCK_TIME_NR).
+                */
+               if (trans->lock_name_idx < BCH_LOCK_TIME_NR) {
+                       struct bch_fs *c = trans->c;
+
+                       __bch2_time_stats_update(&c->lock_held_stats.times[trans->lock_name_idx],
+                                              path->l[level].lock_taken_time,
+                                                ktime_get_ns());
+               }
+#endif
+       }
        mark_btree_node_unlocked(path, level);
 }
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
        EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
 
-       return likely(six_trylock_type(&b->c.lock, type)) ||
+       if (likely(six_trylock_type(&b->c.lock, type)) ||
                btree_node_lock_increment(trans, b, level, type) ||
                __bch2_btree_node_lock(trans, path, b, pos, level, type,
-                                      should_sleep_fn, p, ip);
+                                      should_sleep_fn, p, ip)) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+               /*
+                * NOTE(review): stamps l[b->c.level] while the lock calls
+                * above use @level — confirm the two always match here.
+                */
+               path->l[b->c.level].lock_taken_time = ktime_get_ns();
+#endif
+               return true;
+       } else {
+               return false;
+       }
 }
 
 bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
 }
 
 #endif /* _BCACHEFS_BTREE_LOCKING_H */
-
-
 
                struct btree    *b;
                struct btree_node_iter iter;
                u32             lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+               /* ktime_get_ns() timestamp taken when this level's lock was acquired */
+               u64             lock_taken_time;
+#endif
        }                       l[BTREE_MAX_DEPTH];
 #ifdef CONFIG_BCACHEFS_DEBUG
        unsigned long           ip_allocated;
        unsigned                journal_u64s;
        unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
+       /* slot into c->lock_held_stats; >= BCH_LOCK_TIME_NR means the table overflowed */
+       int                      lock_name_idx;
 };
 
 #define BTREE_FLAGS()                                                  \
 
        .read           = bch2_journal_pins_read,
 };
 
+/*
+ * debugfs open: allocate a dump_iter cursor used to stream the stats
+ * out across successive read() calls.  Returns 0 or -ENOMEM.
+ */
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+       struct bch_fs *c = inode->i_private;
+       struct dump_iter *i;
+
+       /* kernel idiom: sizeof(*i) rather than sizeof(struct dump_iter) */
+       i = kzalloc(sizeof(*i), GFP_KERNEL);
+       if (!i)
+               return -ENOMEM;
+
+       /* i->iter is already zero courtesy of kzalloc */
+       i->c    = c;
+       i->buf  = PRINTBUF;
+       file->private_data = i;
+
+       return 0;
+}
+
+/* debugfs release: tear down the dump_iter allocated at open time */
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+       struct dump_iter *iter = file->private_data;
+
+       /* the printbuf owns a heap buffer; drop it before freeing the iterator */
+       printbuf_exit(&iter->buf);
+       kfree(iter);
+
+       return 0;
+}
+
+/*
+ * debugfs read: pretty-print each claimed lock hold-time slot, resuming
+ * at i->iter across read() calls.  Returns bytes copied, or negative errno.
+ */
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+                                     size_t size, loff_t *ppos)
+{
+       struct dump_iter        *i = file->private_data;
+       struct lock_held_stats *lhs = &i->c->lock_held_stats;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       /*
+        * Bounds check must come first: names[] has BCH_LOCK_TIME_NR
+        * entries, and the original condition order dereferenced
+        * names[i->iter] before checking the index, reading one element
+        * past the end of the array once every slot was in use.
+        */
+       while (i->iter < BCH_LOCK_TIME_NR && lhs->names[i->iter] != 0) {
+               err = flush_buf(i);
+               if (err)
+                       return err;
+
+               if (!i->size)
+                       break;
+
+               prt_printf(&i->buf, "%s:", lhs->names[i->iter]);
+               prt_newline(&i->buf);
+               printbuf_indent_add(&i->buf, 8);
+               bch2_time_stats_to_text(&i->buf, &lhs->times[i->iter]);
+               printbuf_indent_sub(&i->buf, 8);
+               prt_newline(&i->buf);
+               i->iter++;
+       }
+
+       if (i->buf.allocation_failure)
+               return -ENOMEM;
+
+       return i->ret;
+}
+
+/* file_operations for the per-fs "lock_held_stats" debugfs file */
+static const struct file_operations lock_held_stats_op = {
+       .owner = THIS_MODULE,
+       .open = lock_held_stats_open,
+       .release = lock_held_stats_release,
+       .read = lock_held_stats_read,
+};
+
 void bch2_fs_debug_exit(struct bch_fs *c)
 {
        if (!IS_ERR_OR_NULL(c->fs_debug_dir))
        debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
                            c->btree_debug, &journal_pins_ops);
 
+       if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+               debugfs_create_file("lock_held_stats", 0400, c->fs_debug_dir,
+                               c, &lock_held_stats_op);
+       }
+
        c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
        if (IS_ERR_OR_NULL(c->btree_debug_dir))
                return;