return ret;
}
-int bch2_alloc_write(struct bch_fs *c)
+int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
{
struct bch_dev *ca;
unsigned i;
int ret = 0;
+ *wrote = false;
+
for_each_rw_member(ca, c, i) {
struct btree_iter iter;
struct bucket_array *buckets;
if (!buckets->b[b].mark.dirty)
continue;
- ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, 0);
+ ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
+ nowait
+ ? BTREE_INSERT_NOWAIT
+ : 0);
if (ret)
break;
+
+ *wrote = true;
}
up_read(&ca->bucket_lock);
bch2_btree_iter_unlock(&iter);
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
- bool flush_updates;
- size_t i, nr_pending_updates;
+ bool nodes_blocked;
+ size_t i;
+ struct closure cl;
+
+ closure_init_stack(&cl);
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
+ closure_wait(&c->btree_interior_update_wait, &cl);
- flush_updates = false;
- nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+ nodes_blocked = false;
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
- if (btree_node_dirty(b) && (!b->written || b->level)) {
+ if (btree_node_need_write(b)) {
if (btree_node_may_write(b)) {
rcu_read_unlock();
btree_node_lock_type(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
- flush_updates = true;
+ nodes_blocked = true;
}
}
rcu_read_unlock();
if (c->btree_roots_dirty)
bch2_journal_meta(&c->journal);
- /*
- * This is ugly, but it's needed to flush btree node writes
- * without spinning...
- */
- if (flush_updates) {
- closure_wait_event(&c->btree_interior_update_wait,
- bch2_btree_interior_updates_nr_pending(c) <
- nr_pending_updates);
+ if (nodes_blocked) {
+ closure_sync(&cl);
goto again;
}
+ closure_wake_up(&c->btree_interior_update_wait);
+ closure_sync(&cl);
+
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
}
static void allocator_start_issue_discards(struct bch_fs *c)
unsigned dev_iter;
u64 journal_seq = 0;
long bu;
- bool invalidating_data = false;
int ret = 0;
- if (test_alloc_startup(c)) {
- invalidating_data = true;
+ if (test_alloc_startup(c))
goto not_enough;
- }
/* Scan for buckets that are already invalidated: */
for_each_rw_member(ca, c, dev_iter) {
not_enough:
pr_debug("not enough empty buckets; scanning for reclaimable buckets");
- for_each_rw_member(ca, c, dev_iter) {
- find_reclaimable_buckets(c, ca);
-
- while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
- (bu = next_alloc_bucket(ca)) >= 0) {
- invalidating_data |=
- bch2_invalidate_one_bucket(c, ca, bu, &journal_seq);
-
- fifo_push(&ca->free[RESERVE_BTREE], bu);
- bucket_set_dirty(ca, bu);
- }
- }
-
- pr_debug("done scanning for reclaimable buckets");
-
/*
* We're moving buckets to freelists _before_ they've been marked as
* invalidated on disk - we have to so that we can allocate new btree
* have cached data in them, which is live until they're marked as
* invalidated on disk:
*/
- if (invalidating_data) {
- pr_debug("invalidating existing data");
- set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
- } else {
- pr_debug("issuing discards");
- allocator_start_issue_discards(c);
- }
+ set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
- /*
- * XXX: it's possible for this to deadlock waiting on journal reclaim,
- * since we're holding btree writes. What then?
- */
- ret = bch2_alloc_write(c);
- if (ret)
- return ret;
+ while (1) {
+ bool wrote = false;
- if (invalidating_data) {
- pr_debug("flushing journal");
+ for_each_rw_member(ca, c, dev_iter) {
+ find_reclaimable_buckets(c, ca);
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret)
- return ret;
+ while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
+ (bu = next_alloc_bucket(ca)) >= 0) {
+ bch2_invalidate_one_bucket(c, ca, bu,
+ &journal_seq);
+
+ fifo_push(&ca->free[RESERVE_BTREE], bu);
+ bucket_set_dirty(ca, bu);
+ }
+ }
+
+ pr_debug("done scanning for reclaimable buckets");
+
+ /*
+ * XXX: it's possible for this to deadlock waiting on journal reclaim,
+ * since we're holding btree writes. What then?
+ */
+ ret = bch2_alloc_write(c, true, &wrote);
- pr_debug("issuing discards");
- allocator_start_issue_discards(c);
+ /*
+ * If bch2_alloc_write() did anything, it may have used some
+ * buckets, and we need the RESERVE_BTREE freelist full - so we
+ * need to loop and scan again.
+ * And if it errored, it may have been because there weren't
+ * enough buckets, so just scan and loop again as long as it
+ * made some progress:
+ */
+ if (!wrote && ret)
+ return ret;
+ if (!wrote && !ret)
+ break;
}
+ pr_debug("flushing journal");
+
+ ret = bch2_journal_flush(&c->journal);
+ if (ret)
+ return ret;
+
+ pr_debug("issuing discards");
+ allocator_start_issue_discards(c);
+
set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
/* now flush dirty btree nodes: */
- if (invalidating_data)
- flush_held_btree_writes(c);
+ flush_held_btree_writes(c);
return 0;
}
{
struct bch_dev *ca;
unsigned i;
+ bool wrote;
int ret;
down_read(&c->gc_lock);
}
}
- return bch2_alloc_write(c);
+ return bch2_alloc_write(c, false, &wrote);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_alloc_write(struct bch_fs *);
+int bch2_alloc_write(struct bch_fs *, bool, bool *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
if (!btree_node_may_write(b))
goto out_unlock;
+ if (btree_node_dirty(b) &&
+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+ goto out_unlock;
+
if (btree_node_dirty(b) ||
btree_node_write_in_flight(b) ||
btree_node_read_in_flight(b)) {
if (!(old & (1 << BTREE_NODE_dirty)))
return;
- if (b->written &&
- !btree_node_may_write(b))
+ if (!btree_node_may_write(b))
return;
if (old & (1 << BTREE_NODE_write_in_flight)) {
} while (cmpxchg_acquire(&b->flags, old, new) != old);
BUG_ON(btree_node_fake(b));
- BUG_ON(!list_empty(&b->write_blocked));
BUG_ON((b->will_make_reachable != 0) != !b->written);
BUG_ON(b->written >= c->opts.btree_node_size);
unsigned long flags = READ_ONCE(b->flags);
unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
- if (//!(flags & (1 << BTREE_NODE_dirty)) &&
- !b->writes[0].wait.list.first &&
- !b->writes[1].wait.list.first &&
- !(b->will_make_reachable & 1))
+ if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
- pr_buf(&out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
+ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
+ (flags & (1 << BTREE_NODE_need_write)) != 0,
b->level,
b->written,
!list_empty_careful(&b->write_blocked),
#define _BCACHEFS_BTREE_IO_H
#include "bset.h"
+#include "btree_locking.h"
#include "extents.h"
#include "io_types.h"
+/*
+ * A node may be written when its write_blocked list is empty and either
+ * nothing has been written to it yet (b->written == 0) or will_make_reachable
+ * is clear.
+ */
static inline bool btree_node_may_write(struct btree *b)
{
return list_empty_careful(&b->write_blocked) &&
- !b->will_make_reachable;
+ (!b->written || !b->will_make_reachable);
}
enum compact_mode {
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
-/*
- * btree_node_dirty() can be cleared with only a read lock,
- * and for bch2_btree_node_write_cond() we want to set need_write iff it's
- * still dirty:
- */
-static inline void set_btree_node_need_write_if_dirty(struct btree *b)
+/*
+ * Write out @b if it needs it: loops while @b has been written to before
+ * (b->written != 0), has need_write set and may be written. If no write is in
+ * flight the node is written with SIX_LOCK_read and the loop ends; otherwise
+ * the read lock on @b is dropped, we wait for the in-flight I/O, retake the
+ * read lock and check again.
+ *
+ * NOTE(review): presumably the caller must hold a read lock on b->lock, since
+ * it is unlocked and retaken here - confirm against callers.
+ */
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
{
- unsigned long old, new, v = READ_ONCE(b->flags);
-
- do {
- old = new = v;
-
- if (!(old & (1 << BTREE_NODE_dirty)))
- return;
-
- new |= (1 << BTREE_NODE_need_write);
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ while (b->written &&
+ btree_node_need_write(b) &&
+ btree_node_may_write(b)) {
+ if (!btree_node_write_in_flight(b)) {
+ bch2_btree_node_write(c, b, SIX_LOCK_read);
+ break;
+ }
+
+ six_unlock_read(&b->lock);
+ btree_node_wait_on_io(b);
+ btree_node_lock_type(c, b, SIX_LOCK_read);
+ }
}
+/*
+ * bch2_btree_node_write_cond - set need_write on @_b if it is still dirty and
+ * @cond holds, then call btree_node_write_if_need() on it.
+ *
+ * need_write is set with a cmpxchg retry loop, so @cond may be evaluated more
+ * than once. btree_node_write_if_need() is called whether or not the flag was
+ * set by this invocation.
+ *
+ * The macro's locals carry a leading underscore (like its _c/_b parameters) so
+ * that identifiers used in @cond cannot be captured by them.
+ */
#define bch2_btree_node_write_cond(_c, _b, cond) \
do { \
- while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
- if (!btree_node_may_write(_b)) { \
- set_btree_node_need_write_if_dirty(_b); \
- break; \
- } \
+ unsigned long _old, _new, _v = READ_ONCE((_b)->flags); \
+ \
+ do { \
+ _old = _new = _v; \
\
- if (!btree_node_write_in_flight(_b)) { \
- bch2_btree_node_write(_c, _b, SIX_LOCK_read); \
+ if (!(_old & (1 << BTREE_NODE_dirty)) || !(cond)) \
break; \
- } \
\
- six_unlock_read(&(_b)->lock); \
- btree_node_wait_on_io(_b); \
- btree_node_lock_type(c, b, SIX_LOCK_read); \
- } \
+ _new |= (1 << BTREE_NODE_need_write); \
+ } while ((_v = cmpxchg(&(_b)->flags, _old, _new)) != _old); \
+ \
+ btree_node_write_if_need(_c, _b); \
} while (0)
void bch2_btree_flush_all_reads(struct bch_fs *);
#ifndef _BCACHEFS_BTREE_ITER_H
#define _BCACHEFS_BTREE_ITER_H
+#include "bset.h"
#include "btree_types.h"
static inline void btree_iter_set_dirty(struct btree_iter *iter,
*/
#include "btree_iter.h"
-#include "btree_io.h"
#include "six.h"
/* matches six lock types */
set_btree_node_accessed(b);
set_btree_node_dirty(b);
+ set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
memset(&b->nr, 0, sizeof(b->nr));
closure_wait(&btree_current_write(b)->wait, cl);
list_del(&as->write_blocked_list);
+
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
mutex_unlock(&c->btree_interior_update_lock);
/*
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
list_del(&p->write_blocked_list);
btree_update_reparent(as, p);
+
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
}
clear_btree_node_dirty(b);
size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve);
+ btree_reserve * 2);
bool resize = ca->buckets[0] != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
#include "eytzinger.h"
#include "util.h"
-#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
-#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
-
static const char si_units[] = "?kMGTPEZY";
static int __bch2_strtoh(const char *cp, u64 *res,