From 74ed7e560b794369adf87e0d310453bc78f4b273 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Tue, 21 Jul 2020 17:12:39 -0400
Subject: [PATCH] bcachefs: Don't let copygc buckets be stolen by other threads

And assorted other copygc fixes.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/alloc_background.c |  4 ++-
 fs/bcachefs/alloc_foreground.c | 46 ++++++++++++++++++++++------------
 fs/bcachefs/alloc_foreground.h |  7 ------
 fs/bcachefs/alloc_types.h      |  1 +
 fs/bcachefs/btree_gc.c         |  6 ++++-
 fs/bcachefs/move.c             |  1 +
 fs/bcachefs/movinggc.c         | 34 +++++++++++++++++--------
 fs/bcachefs/super.c            |  8 ++++++
 8 files changed, 72 insertions(+), 35 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 54455f77ad2a4..ba7620999a8dd 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -518,7 +518,9 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 				  ca->inc_gen_really_needs_gc);
 
 		if (available > fifo_free(&ca->free_inc) ||
-		    (available && !fifo_full(&ca->free[RESERVE_BTREE])))
+		    (available &&
+		     (!fifo_full(&ca->free[RESERVE_BTREE]) ||
+		      !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
 			break;
 
 		up_read(&c->gc_lock);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 2dc8a8ff0569e..926c67e870435 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -144,12 +144,13 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 }
 
 static void open_bucket_free_unused(struct bch_fs *c,
-				    struct open_bucket *ob,
-				    bool may_realloc)
+				    struct write_point *wp,
+				    struct open_bucket *ob)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+	bool may_realloc = wp->type == BCH_DATA_user;
 
-	BUG_ON(ca->open_buckets_partial_nr >=
+	BUG_ON(ca->open_buckets_partial_nr >
 	       ARRAY_SIZE(ca->open_buckets_partial));
 
 	if (ca->open_buckets_partial_nr <
@@ -228,13 +229,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 
 	spin_lock(&c->freelist_lock);
 
-	if (may_alloc_partial &&
-	    ca->open_buckets_partial_nr) {
-		ob = c->open_buckets +
-			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
-		ob->on_partial_list = false;
-		spin_unlock(&c->freelist_lock);
-		return ob;
+	if (may_alloc_partial) {
+		int i;
+
+		for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+			ob = c->open_buckets + ca->open_buckets_partial[i];
+
+			if (reserve <= ob->alloc_reserve) {
+				array_remove_item(ca->open_buckets_partial,
+						  ca->open_buckets_partial_nr,
+						  i);
+				ob->on_partial_list = false;
+				ob->alloc_reserve = reserve;
+				spin_unlock(&c->freelist_lock);
+				return ob;
+			}
+		}
 	}
 
 	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
@@ -291,6 +301,7 @@ out:
 
 	ob->valid	= true;
 	ob->sectors_free = ca->mi.bucket_size;
+	ob->alloc_reserve = reserve;
 	ob->ptr		= (struct bch_extent_ptr) {
 		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
 		.gen	= buckets->b[bucket].mark.gen,
@@ -835,9 +846,6 @@ retry:
 alloc_done:
 	BUG_ON(!ret && nr_effective < nr_replicas);
 
-	WARN_ON(reserve == RESERVE_MOVINGGC &&
-		ret == FREELIST_EMPTY);
-
 	if (erasure_code && !ec_open_bucket(c, &ptrs))
 		pr_debug("failed to get ec bucket: ret %u", ret);
 
@@ -850,7 +858,7 @@ alloc_done:
 
 	/* Free buckets we didn't use: */
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		open_bucket_free_unused(c, ob, wp->type == BCH_DATA_user);
+		open_bucket_free_unused(c, wp, ob);
 
 	wp->ptrs = ptrs;
 
@@ -869,8 +877,7 @@ err:
 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
 			ob_push(c, &ptrs, ob);
 		else
-			open_bucket_free_unused(c, ob,
-					wp->type == BCH_DATA_user);
+			open_bucket_free_unused(c, wp, ob);
 	wp->ptrs = ptrs;
 
 	mutex_unlock(&wp->lock);
@@ -938,6 +945,13 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
 	bch2_open_buckets_put(c, &ptrs);
 }
 
+static inline void writepoint_init(struct write_point *wp,
+				   enum bch_data_type type)
+{
+	mutex_init(&wp->lock);
+	wp->type = type;
+}
+
 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
 {
 	struct open_bucket *ob;
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index dc8574a1a76a2..c658295cb8e09 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -133,13 +133,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
 	return (struct write_point_specifier) { .v = (unsigned long) wp };
 }
 
-static inline void writepoint_init(struct write_point *wp,
-				   enum bch_data_type type)
-{
-	mutex_init(&wp->lock);
-	wp->type = type;
-}
-
 void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 4f14650779947..20705460bb0aa 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -66,6 +66,7 @@ struct open_bucket {
 	u8			type;
 	unsigned		valid:1;
 	unsigned		on_partial_list:1;
+	int			alloc_reserve:3;
 	unsigned		sectors_free;
 	struct bch_extent_ptr	ptr;
 	struct ec_stripe_new	*ec;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index cebba06f3a96d..4b20817402f6b 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -954,8 +954,10 @@ int bch2_gc_gens(struct bch_fs *c)
 	for (i = 0; i < BTREE_ID_NR; i++)
 		if (btree_node_type_needs_gc(i)) {
 			ret = bch2_gc_btree_gens(c, i);
-			if (ret)
+			if (ret) {
+				bch_err(c, "error recalculating oldest_gen: %i", ret);
 				goto err;
+			}
 		}
 
 	for_each_member_device(ca, c, i) {
@@ -966,6 +968,8 @@ int bch2_gc_gens(struct bch_fs *c)
 			g->oldest_gen = g->gc_gen;
 		up_read(&ca->bucket_lock);
 	}
+
+	c->gc_count++;
 err:
 	up_read(&c->gc_lock);
 	return ret;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 6a43a89e0fdd9..b5970f09609a0 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -248,6 +248,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 
 	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
 		m->op.alloc_reserve = RESERVE_MOVINGGC;
+		m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
 	} else {
 		/* XXX: this should probably be passed in */
 		m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 44360bf03d297..25ae4e195c15b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -12,6 +12,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "disk_groups.h"
+#include "error.h"
 #include "extents.h"
 #include "eytzinger.h"
 #include "io.h"
@@ -104,7 +105,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
 	if (dev_idx < 0)
 		return DATA_SKIP;
 
-	/* XXX: use io_opts for this inode */
 	data_opts->target		= io_opts->background_target;
 	data_opts->btree_insert_flags	= BTREE_INSERT_USE_RESERVE;
 	data_opts->rewrite_dev		= dev_idx;
@@ -123,7 +123,7 @@ static bool have_copygc_reserve(struct bch_dev *ca)
 	return ret;
 }
 
-static void bch2_copygc(struct bch_fs *c)
+static int bch2_copygc(struct bch_fs *c)
 {
 	copygc_heap *h = &c->copygc_heap;
 	struct copygc_heap_entry e, *i;
@@ -153,7 +153,7 @@ static void bch2_copygc(struct bch_fs *c)
 		free_heap(&c->copygc_heap);
 		if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
 			bch_err(c, "error allocating copygc heap");
-			return;
+			return 0;
 		}
 	}
 
@@ -178,6 +178,7 @@ static void bch2_copygc(struct bch_fs *c)
 				continue;
 
 			e = (struct copygc_heap_entry) {
+				.dev		= dev_idx,
 				.gen		= m.gen,
 				.sectors	= bucket_sectors_used(m),
 				.offset		= bucket_to_sector(ca, b),
@@ -187,6 +188,11 @@ static void bch2_copygc(struct bch_fs *c)
 		up_read(&ca->bucket_lock);
 	}
 
+	if (!sectors_reserved) {
+		bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
+		return -1;
+	}
+
 	for (i = h->data; i < h->data + h->used; i++)
 		sectors_to_move += i->sectors;
 
@@ -198,7 +204,7 @@ static void bch2_copygc(struct bch_fs *c)
 	buckets_to_move = h->used;
 
 	if (!buckets_to_move)
-		return;
+		return 0;
 
 	eytzinger0_sort(h->data, h->used,
 			sizeof(h->data[0]),
@@ -214,10 +220,17 @@ static void bch2_copygc(struct bch_fs *c)
 		down_read(&ca->bucket_lock);
 		buckets = bucket_array(ca);
 		for (i = h->data; i < h->data + h->used; i++) {
-			size_t b = sector_to_bucket(ca, i->offset);
-			struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+			struct bucket_mark m;
+			size_t b;
 
-			if (i->gen == m.gen && bucket_sectors_used(m)) {
+			if (i->dev != dev_idx)
+				continue;
+
+			b = sector_to_bucket(ca, i->offset);
+			m = READ_ONCE(buckets->b[b].mark);
+
+			if (i->gen == m.gen &&
+			    bucket_sectors_used(m)) {
 				sectors_not_moved += bucket_sectors_used(m);
 				buckets_not_moved++;
 			}
@@ -237,6 +250,7 @@ static void bch2_copygc(struct bch_fs *c)
 	trace_copygc(c,
 		     atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
 		     buckets_to_move, buckets_not_moved);
+	return 0;
 }
 
 /*
@@ -292,7 +306,8 @@ static int bch2_copygc_thread(void *arg)
 			continue;
 		}
 
-		bch2_copygc(c);
+		if (bch2_copygc(c))
+			break;
 	}
 
 	return 0;
@@ -323,8 +338,7 @@ int bch2_copygc_start(struct bch_fs *c)
 	if (bch2_fs_init_fault("copygc_start"))
 		return -ENOMEM;
 
-	t = kthread_create(bch2_copygc_thread, c,
-			   "bch_copygc[%s]", c->name);
+	t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 6dc899be5bd2f..084976c9ac746 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1308,12 +1308,20 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 {
+	/*
+	 * Device going read only means the copygc reserve gets smaller, so we
+	 * don't want that happening while copygc is in progress:
+	 */
+	bch2_copygc_stop(c);
+
 	/*
 	 * The allocator thread itself allocates btree nodes, so stop it first:
 	 */
 	bch2_dev_allocator_stop(ca);
 	bch2_dev_allocator_remove(c, ca);
 	bch2_dev_journal_stop(&c->journal, ca);
+
+	bch2_copygc_start(c);
 }
 
 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
-- 
2.30.2