bcachefs: gc now operates on second set of bucket marks
author     Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 23 Jul 2018 09:32:01 +0000 (05:32 -0400)
committer  Kent Overstreet <kent.overstreet@linux.dev>
           Sun, 22 Oct 2023 21:08:12 +0000 (17:08 -0400)
This means we can now use gc to verify the allocation information -
important for testing persistent alloc info.
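
For illustration only (nothing below is code from this patch): gc now
accumulates its counts into a second copy of the bucket marks and usage
counters (buckets[1], usage[1]); when it finishes, bch2_gc_done() compares
the two copies field by field, logging and fixing any mismatch (the
copy_bucket_field() macro). A minimal userspace sketch of that
compare-and-fix step, with made-up names (example_mark,
example_gc_verify_bucket):

  #include <stdio.h>
  #include <stddef.h>

  /* Hypothetical stand-in for struct bucket_mark: */
  struct example_mark {
          unsigned        gen;
          unsigned        dirty_sectors;
          unsigned        cached_sectors;
  };

  /*
   * Compare the primary copy of a bucket's mark against the copy gc just
   * recomputed; report and correct any field that differs:
   */
  static void example_gc_verify_bucket(unsigned dev, size_t b,
                                       struct example_mark *primary,
                                       const struct example_mark *gc)
  {
  #define verify_field(_f)                                                \
          if (primary->_f != gc->_f) {                                    \
                  printf("dev %u bucket %zu has wrong " #_f               \
                         ": got %u, should be %u, fixing\n",              \
                         dev, b, primary->_f, gc->_f);                    \
                  primary->_f = gc->_f;                                   \
          }

          verify_field(gen);
          verify_field(dirty_sectors);
          verify_field(cached_sectors);
  #undef verify_field
  }

  int main(void)
  {
          struct example_mark primary = { .gen = 4, .dirty_sectors = 8 };
          struct example_mark gc      = { .gen = 5, .dirty_sectors = 8 };

          /* Reports the gen mismatch and copies gc's value into primary: */
          example_gc_verify_bucket(0, 123, &primary, &gc);
          return 0;
  }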

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_gc.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/buckets.c
fs/bcachefs/buckets.h
fs/bcachefs/buckets_types.h
fs/bcachefs/journal.c
fs/bcachefs/super.c
fs/bcachefs/sysfs.c

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c17fba1eae963d44482cfb9d3a548483727b0e8e..3f0e2dd29fde037d7974993870da3bcbb1372508 100644
@@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
                pr_debug("free_inc now empty");
 
                do {
-                       if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
-                               up_read(&c->gc_lock);
-                               bch_err(ca, "gc failure");
-                               goto stop;
-                       }
-
                        /*
                         * Find some buckets that we can invalidate, either
                         * they're completely unused, or only contain clean data
@@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
        bool invalidating_data = false;
        int ret = 0;
 
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-               return -1;
-
        if (test_alloc_startup(c)) {
                invalidating_data = true;
                goto not_enough;
@@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
                                continue;
 
                        bch2_mark_alloc_bucket(c, ca, bu, true,
-                                       gc_pos_alloc(c, NULL),
-                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                       gc_pos_alloc(c, NULL), 0);
 
                        fifo_push(&ca->free_inc, bu);
 
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index cdea3a1d9176b58f2eb85740df6c265be47b5a2f..eaa2055000b6ccf53f096c6a50b15c027986c344 100644
@@ -347,7 +347,6 @@ enum gc_phase {
 
        GC_PHASE_PENDING_DELETE,
        GC_PHASE_ALLOC,
-       GC_PHASE_DONE
 };
 
 struct gc_pos {
@@ -392,15 +391,14 @@ struct bch_dev {
         * gc_lock, for device resize - holding any is sufficient for access:
         * Or rcu_read_lock(), but only for ptr_stale():
         */
-       struct bucket_array __rcu *buckets;
+       struct bucket_array __rcu *buckets[2];
        unsigned long           *buckets_dirty;
        unsigned long           *buckets_written;
        /* most out of date gen in the btree */
        u8                      *oldest_gens;
        struct rw_semaphore     bucket_lock;
 
-       struct bch_dev_usage __percpu *usage_percpu;
-       struct bch_dev_usage    usage_cached;
+       struct bch_dev_usage __percpu *usage[2];
 
        /* Allocator: */
        struct task_struct __rcu *alloc_thread;
@@ -478,7 +476,6 @@ enum {
 
        /* errors: */
        BCH_FS_ERROR,
-       BCH_FS_GC_FAILURE,
 
        /* misc: */
        BCH_FS_BDEV_MOUNTED,
@@ -614,8 +611,8 @@ struct bch_fs {
 
        atomic64_t              sectors_available;
 
-       struct bch_fs_usage __percpu *usage_percpu;
-       struct bch_fs_usage     usage_cached;
+       struct bch_fs_usage __percpu *usage[2];
+
        struct percpu_rw_semaphore usage_lock;
 
        struct closure_waitlist freelist_wait;
@@ -656,9 +653,6 @@ struct bch_fs {
         *
         * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
         *
-        * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
-        * currently running, and gc marks are currently valid
-        *
         * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
         * can read without a lock.
         */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index e900fd4ffd067f1b4c43cd89a2574f2f705bdc33..6eba65fcb52cb4f79f4dbb11b35c39cb7cb17ba7 100644
@@ -260,8 +260,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 {
        struct gc_pos pos = { 0 };
        unsigned flags =
-               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-               BCH_BUCKET_MARK_GC_LOCK_HELD|
+               BCH_BUCKET_MARK_GC|
                (initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
        int ret = 0;
 
@@ -484,9 +483,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
                                      BCH_DATA_SB, flags);
        }
 
-       if (c)
-               spin_lock(&c->journal.lock);
-
        for (i = 0; i < ca->journal.nr; i++) {
                b = ca->journal.buckets[i];
                bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@@ -495,7 +491,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
        }
 
        if (c) {
-               spin_unlock(&c->journal.lock);
                percpu_up_read(&c->usage_lock);
        } else {
                preempt_enable();
@@ -511,9 +506,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
        gc_pos_set(c, gc_phase(GC_PHASE_SB));
 
        for_each_online_member(ca, c, i)
-               bch2_mark_dev_superblock(c, ca,
-                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
+               bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
        mutex_unlock(&c->sb_lock);
 }
 
@@ -521,7 +514,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
        struct gc_pos pos = { 0 };
-       struct bch_fs_usage stats = { 0 };
        struct btree_update *as;
        struct pending_btree_node_free *d;
 
@@ -533,13 +525,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
                        bch2_mark_key(c, BKEY_TYPE_BTREE,
                                      bkey_i_to_s_c(&d->key),
                                      true, 0,
-                                     pos, &stats, 0,
-                                     BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                     BCH_BUCKET_MARK_GC_LOCK_HELD);
-       /*
-        * Don't apply stats - pending deletes aren't tracked in
-        * bch_alloc_stats:
-        */
+                                     pos, NULL, 0,
+                                     BCH_BUCKET_MARK_GC);
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -560,8 +547,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                fifo_for_each_entry(i, &ca->free_inc, iter)
                        bch2_mark_alloc_bucket(c, ca, i, true,
                                               gc_pos_alloc(c, NULL),
-                                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                              BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                              BCH_BUCKET_MARK_GC);
 
 
 
@@ -569,8 +555,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                        fifo_for_each_entry(i, &ca->free[j], iter)
                                bch2_mark_alloc_bucket(c, ca, i, true,
                                                       gc_pos_alloc(c, NULL),
-                                                      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                                      BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                                      BCH_BUCKET_MARK_GC);
        }
 
        spin_unlock(&c->freelist_lock);
@@ -584,8 +569,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                        ca = bch_dev_bkey_exists(c, ob->ptr.dev);
                        bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
                                               gc_pos_alloc(c, ob),
-                                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                              BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                              BCH_BUCKET_MARK_GC);
                }
                spin_unlock(&ob->lock);
        }
@@ -593,122 +577,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
        percpu_up_read(&c->usage_lock);
 }
 
-static void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_free(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+                       sizeof(struct bucket_array) +
+                       ca->mi.nbuckets * sizeof(struct bucket));
+               ca->buckets[1] = NULL;
+
+               free_percpu(ca->usage[1]);
+               ca->usage[1] = NULL;
+       }
+
+       free_percpu(c->usage[1]);
+       c->usage[1] = NULL;
+}
+
+static void bch2_gc_done_nocheck(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       struct bucket_array *buckets;
-       struct bucket_mark new;
        unsigned i;
-       size_t b;
        int cpu;
 
-       percpu_down_write(&c->usage_lock);
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *src = __bucket_array(ca, 1);
 
-       /*
-        * Indicates to buckets code that gc is now in progress - done under
-        * usage_lock to avoid racing with bch2_mark_key():
-        */
-       __gc_pos_set(c, gc_phase(GC_PHASE_START));
+               memcpy(__bucket_array(ca, 0), src,
+                      sizeof(struct bucket_array) +
+                      sizeof(struct bucket) * src->nbuckets);
+       };
 
-       /* Save a copy of the existing bucket stats while we recompute them: */
        for_each_member_device(ca, c, i) {
-               ca->usage_cached = __bch2_dev_usage_read(ca);
+               struct bch_dev_usage *p;
+
                for_each_possible_cpu(cpu) {
-                       struct bch_dev_usage *p =
-                               per_cpu_ptr(ca->usage_percpu, cpu);
+                       p = per_cpu_ptr(ca->usage[0], cpu);
                        memset(p, 0, sizeof(*p));
                }
+
+               preempt_disable();
+               *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
+               preempt_enable();
        }
 
-       c->usage_cached = __bch2_fs_usage_read(c);
-       for_each_possible_cpu(cpu) {
-               struct bch_fs_usage *p =
-                       per_cpu_ptr(c->usage_percpu, cpu);
+       {
+               struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+               struct bch_fs_usage *p;
 
-               memset(p->replicas, 0, sizeof(p->replicas));
-               memset(p->buckets, 0, sizeof(p->buckets));
+               for_each_possible_cpu(cpu) {
+                       p = per_cpu_ptr(c->usage[0], cpu);
+                       memset(p, 0, offsetof(typeof(*p), online_reserved));
+               }
+
+               preempt_disable();
+               memcpy(this_cpu_ptr(c->usage[0]),
+                      &src,
+                      offsetof(typeof(*p), online_reserved));
+               preempt_enable();
        }
 
+}
+
+static void bch2_gc_done(struct bch_fs *c, bool initial)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int cpu;
+
+#define copy_field(_f, _msg, ...)                                      \
+       if (dst._f != src._f) {                                         \
+               pr_info(_msg ": got %llu, should be %llu, fixing"       \
+                       , ##__VA_ARGS__, dst._f, src._f);               \
+               dst._f = src._f;                                        \
+       }
+#define copy_bucket_field(_f)                                          \
+       if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
+               pr_info("dev %u bucket %zu has wrong " #_f              \
+                       ": got %u, should be %u, fixing",               \
+                       i, b, dst->b[b].mark._f, src->b[b].mark._f);    \
+               dst->b[b]._mark._f = src->b[b].mark._f;                 \
+       }
+#define copy_dev_field(_f, _msg, ...)                                  \
+       copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
+#define copy_fs_field(_f, _msg, ...)                                   \
+       copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+       percpu_down_write(&c->usage_lock);
+
+       if (initial) {
+               bch2_gc_done_nocheck(c);
+               goto out;
+       }
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *dst = __bucket_array(ca, 0);
+               struct bucket_array *src = __bucket_array(ca, 1);
+               size_t b;
+
+               if (initial) {
+                       memcpy(dst, src,
+                              sizeof(struct bucket_array) +
+                              sizeof(struct bucket) * dst->nbuckets);
+               }
+
+               for (b = 0; b < src->nbuckets; b++) {
+                       copy_bucket_field(gen);
+                       copy_bucket_field(data_type);
+                       copy_bucket_field(owned_by_allocator);
+                       copy_bucket_field(stripe);
+                       copy_bucket_field(dirty_sectors);
+                       copy_bucket_field(cached_sectors);
+               }
+       };
+
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
+               struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
+               struct bch_dev_usage *p;
+               unsigned b;
+
+               for (b = 0; b < BCH_DATA_NR; b++)
+                       copy_dev_field(buckets[b],
+                                      "buckets[%s]", bch2_data_types[b]);
+               copy_dev_field(buckets_alloc, "buckets_alloc");
+               copy_dev_field(buckets_ec, "buckets_ec");
+
+               for (b = 0; b < BCH_DATA_NR; b++)
+                       copy_dev_field(sectors[b],
+                                      "sectors[%s]", bch2_data_types[b]);
+               copy_dev_field(sectors_fragmented,
+                              "sectors_fragmented");
+
+               for_each_possible_cpu(cpu) {
+                       p = per_cpu_ptr(ca->usage[0], cpu);
+                       memset(p, 0, sizeof(*p));
+               }
+
+               preempt_disable();
+               p = this_cpu_ptr(ca->usage[0]);
+               *p = dst;
+               preempt_enable();
+       }
+
+       {
+               struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
+               struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+               struct bch_fs_usage *p;
+               unsigned r, b;
+
+               for (r = 0; r < BCH_REPLICAS_MAX; r++) {
+                       for (b = 0; b < BCH_DATA_NR; b++)
+                               copy_fs_field(replicas[r].data[b],
+                                             "replicas[%i].data[%s]",
+                                             r, bch2_data_types[b]);
+                       copy_fs_field(replicas[r].ec_data,
+                                     "replicas[%i].ec_data", r);
+                       copy_fs_field(replicas[r].persistent_reserved,
+                                     "replicas[%i].persistent_reserved", r);
+               }
+
+               for (b = 0; b < BCH_DATA_NR; b++)
+                       copy_fs_field(buckets[b],
+                                     "buckets[%s]", bch2_data_types[b]);
+
+               for_each_possible_cpu(cpu) {
+                       p = per_cpu_ptr(c->usage[0], cpu);
+                       memset(p, 0, offsetof(typeof(*p), online_reserved));
+               }
+
+               preempt_disable();
+               p = this_cpu_ptr(c->usage[0]);
+               memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
+               preempt_enable();
+       }
+out:
        percpu_up_write(&c->usage_lock);
 
-       /* Clear bucket marks: */
+#undef copy_field
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_bucket_field
+}
+
+static int bch2_gc_start(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       BUG_ON(c->usage[1]);
+
+       c->usage[1] = alloc_percpu(struct bch_fs_usage);
+       if (!c->usage[1])
+               return -ENOMEM;
+
        for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
-                       bucket_cmpxchg(buckets->b + b, new, ({
-                               new.owned_by_allocator  = 0;
-                               new.data_type           = 0;
-                               new.cached_sectors      = 0;
-                               new.dirty_sectors       = 0;
-                               new.stripe              = 0;
-                       }));
-                       ca->oldest_gens[b] = new.gen;
+               BUG_ON(ca->buckets[1]);
+               BUG_ON(ca->usage[1]);
+
+               ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
+                               ca->mi.nbuckets * sizeof(struct bucket),
+                               GFP_KERNEL|__GFP_ZERO);
+               if (!ca->buckets[1]) {
+                       percpu_ref_put(&ca->ref);
+                       return -ENOMEM;
+               }
+
+               ca->usage[1] = alloc_percpu(struct bch_dev_usage);
+               if (!ca->usage[1]) {
+                       percpu_ref_put(&ca->ref);
+                       return -ENOMEM;
                }
-               up_read(&ca->bucket_lock);
        }
+
+       percpu_down_write(&c->usage_lock);
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *dst = __bucket_array(ca, 1);
+               struct bucket_array *src = __bucket_array(ca, 0);
+               size_t b;
+
+               dst->first_bucket       = src->first_bucket;
+               dst->nbuckets           = src->nbuckets;
+
+               for (b = 0; b < src->nbuckets; b++)
+                       dst->b[b]._mark.gen = src->b[b].mark.gen;
+       };
+
+       percpu_up_write(&c->usage_lock);
+
+       return 0;
 }
 
 /**
- * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ *  - Concurrent GC relies on the fact that we have a total ordering for
+ *    everything that GC walks - see  gc_will_visit_node(),
+ *    gc_will_visit_root()
+ *
+ *  - also, references move around in the course of index updates and
+ *    various other crap: everything needs to agree on the ordering
+ *    references are allowed to move around in - e.g., we're allowed to
+ *    start with a reference owned by an open_bucket (the allocator) and
+ *    move it to the btree, but not the reverse.
+ *
+ *    This is necessary to ensure that gc doesn't miss references that
+ *    move around - if references move backwards in the ordering GC
+ *    uses, GC could skip past them
  */
-void bch2_gc(struct bch_fs *c)
+int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
 {
        struct bch_dev *ca;
        u64 start_time = local_clock();
-       unsigned i;
+       unsigned i, iter = 0;
        int ret;
 
-       /*
-        * Walk _all_ references to buckets, and recompute them:
-        *
-        * Order matters here:
-        *  - Concurrent GC relies on the fact that we have a total ordering for
-        *    everything that GC walks - see  gc_will_visit_node(),
-        *    gc_will_visit_root()
-        *
-        *  - also, references move around in the course of index updates and
-        *    various other crap: everything needs to agree on the ordering
-        *    references are allowed to move around in - e.g., we're allowed to
-        *    start with a reference owned by an open_bucket (the allocator) and
-        *    move it to the btree, but not the reverse.
-        *
-        *    This is necessary to ensure that gc doesn't miss references that
-        *    move around - if references move backwards in the ordering GC
-        *    uses, GC could skip past them
-        */
        trace_gc_start(c);
 
-       /*
-        * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
-        * gc_lock if sectors_available goes to 0:
-        */
-       bch2_recalc_sectors_available(c);
-
        down_write(&c->gc_lock);
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+again:
+       ret = bch2_gc_start(c);
+       if (ret)
                goto out;
 
-       bch2_gc_start(c);
-
        bch2_mark_superblocks(c);
 
-       ret = bch2_gc_btrees(c, NULL, false);
-       if (ret) {
-               bch_err(c, "btree gc failed: %d", ret);
-               set_bit(BCH_FS_GC_FAILURE, &c->flags);
+       ret = bch2_gc_btrees(c, journal, initial);
+       if (ret)
                goto out;
-       }
 
        bch2_mark_pending_btree_node_frees(c);
        bch2_mark_allocator_buckets(c);
 
-       /* Indicates that gc is no longer in progress: */
-       gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        c->gc_count++;
 out:
+       if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
+               /*
+                * XXX: make sure gens we fixed got saved
+                */
+               if (iter++ <= 2) {
+                       bch_info(c, "Fixed gens, restarting mark and sweep:");
+                       clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       goto again;
+               }
+
+               bch_info(c, "Unable to fix bucket gens, looping");
+               ret = -EINVAL;
+       }
+
+       if (!ret)
+               bch2_gc_done(c, initial);
+
+       /* Indicates that gc is no longer in progress: */
+       __gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+       bch2_gc_free(c);
        up_write(&c->gc_lock);
+
+       if (!ret && initial)
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
        trace_gc_end(c);
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
@@ -724,6 +896,7 @@ out:
         * allocator thread - issue wakeup in case they blocked on gc_lock:
         */
        closure_wake_up(&c->freelist_wait);
+       return ret;
 }
 
 /* Btree coalescing */
@@ -1039,9 +1212,6 @@ void bch2_coalesce(struct bch_fs *c)
 {
        enum btree_id id;
 
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-               return;
-
        down_read(&c->gc_lock);
        trace_gc_coalesce_start(c);
 
@@ -1053,7 +1223,6 @@ void bch2_coalesce(struct bch_fs *c)
                if (ret) {
                        if (ret != -ESHUTDOWN)
                                bch_err(c, "btree coalescing failed: %d", ret);
-                       set_bit(BCH_FS_GC_FAILURE, &c->flags);
                        return;
                }
        }
@@ -1068,6 +1237,7 @@ static int bch2_gc_thread(void *arg)
        struct io_clock *clock = &c->io_clock[WRITE];
        unsigned long last = atomic_long_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);
+       int ret;
 
        set_freezable();
 
@@ -1101,7 +1271,9 @@ static int bch2_gc_thread(void *arg)
                last = atomic_long_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
-               bch2_gc(c);
+               ret = bch2_gc(c, NULL, false);
+               if (ret)
+                       bch_err(c, "btree gc failed: %i", ret);
 
                debug_check_no_locks_held();
        }
@@ -1142,30 +1314,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
 {
-       unsigned iter = 0;
-       int ret = 0;
-
-       down_write(&c->gc_lock);
-again:
-       bch2_gc_start(c);
-
-       bch2_mark_superblocks(c);
-
-       ret = bch2_gc_btrees(c, journal, true);
-       if (ret)
-               goto err;
-
-       if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
-               if (iter++ > 2) {
-                       bch_info(c, "Unable to fix bucket gens, looping");
-                       ret = -EINVAL;
-                       goto err;
-               }
-
-               bch_info(c, "Fixed gens, restarting initial mark and sweep:");
-               clear_bit(BCH_FS_FIXED_GENS, &c->flags);
-               goto again;
-       }
+       int ret = bch2_gc(c, journal, true);
 
        /*
         * Skip past versions that might have possibly been used (as nonces),
@@ -1174,9 +1323,5 @@ again:
        if (c->sb.encryption_type)
                atomic64_add(1 << 16, &c->key_version);
 
-       gc_pos_set(c, gc_phase(GC_PHASE_DONE));
-       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-err:
-       up_write(&c->gc_lock);
        return ret;
 }
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 47a590015325e7e830f9104db1f55e45ee9b6948..bb77564b9463c04efd04e3e540a5d63b5ec68f30 100644
@@ -7,7 +7,7 @@
 enum bkey_type;
 
 void bch2_coalesce(struct bch_fs *);
-void bch2_gc(struct bch_fs *);
+int bch2_gc(struct bch_fs *, struct list_head *, bool);
 void bch2_gc_thread_stop(struct bch_fs *);
 int bch2_gc_thread_start(struct bch_fs *);
 int bch2_initial_gc(struct bch_fs *, struct list_head *);
@@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
        };
 }
 
-static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
 {
        unsigned seq;
        bool ret;
 
        do {
                seq = read_seqcount_begin(&c->gc_pos_lock);
-               ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+               ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
        } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
        return ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index af31819c88c78e9339156e59714e17d61e2dca33..2631b0732d4babb6c7b8d88b830abf236c6136f4 100644
@@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 {
        struct bch_fs *c = as->c;
        struct pending_btree_node_free *d;
-       unsigned replicas;
 
        /*
         * btree_update lock is only needed here to avoid racing with
@@ -178,15 +177,6 @@ found:
        BUG_ON(d->index_update_done);
        d->index_update_done = true;
 
-       /*
-        * Btree nodes are accounted as freed in bch_alloc_stats when they're
-        * freed from the index:
-        */
-       replicas = bch2_extent_nr_dirty_ptrs(k);
-       if (replicas)
-               stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
-                       c->opts.btree_node_size * replicas;
-
        /*
         * We're dropping @k from the btree, but it's still live until the
         * index update is persistent so we need to keep a reference around for
@@ -208,15 +198,16 @@ found:
         * bch2_mark_key() compares the current gc pos to the pos we're
         * moving this reference from, hence one comparison here:
         */
-       if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
-               struct bch_fs_usage tmp = { 0 };
+       if (gc_pos_cmp(c->gc_pos, b
+                      ? gc_pos_btree_node(b)
+                      : gc_pos_btree_root(as->btree_id)) >= 0 &&
+           gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+               struct gc_pos pos = { 0 };
 
                bch2_mark_key(c, BKEY_TYPE_BTREE,
                              bkey_i_to_s_c(&d->key),
-                             false, 0, b
-                             ? gc_pos_btree_node(b)
-                             : gc_pos_btree_root(as->btree_id),
-                             &tmp, 0, 0);
+                             false, 0, pos,
+                             NULL, 0, BCH_BUCKET_MARK_GC);
                /*
                 * Don't apply tmp - pending deletes aren't tracked in
                 * bch_alloc_stats:
@@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
                                        struct pending_btree_node_free *pending)
 {
-       struct bch_fs_usage stats = { 0 };
-
        BUG_ON(!pending->index_update_done);
 
        bch2_mark_key(c, BKEY_TYPE_BTREE,
                      bkey_i_to_s_c(&pending->key),
                      false, 0,
                      gc_phase(GC_PHASE_PENDING_DELETE),
-                     &stats, 0, 0);
-       /*
-        * Don't apply stats - pending deletes aren't tracked in
-        * bch_alloc_stats:
-        */
+                     NULL, 0, 0);
 }
 
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 
        btree_interior_update_add_node_reference(as, b);
 
+       /*
+        * XXX: the rest of the update path treats this like we're actually
+        * inserting a new node and deleting the existing node, so the
+        * reservation needs to include enough space for @b
+        *
+        * that is actually sketch as fuck though and I am surprised the code
+        * seems to work like that, definitely need to go back and rework it
+        * into something saner.
+        *
+        * (I think @b is just getting double counted until the btree update
+        * finishes and "deletes" @b on disk)
+        */
+       ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
+                       c->opts.btree_node_size *
+                       bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)),
+                       BCH_DISK_RESERVATION_NOFAIL|
+                       BCH_DISK_RESERVATION_GC_LOCK_HELD);
+       BUG_ON(ret);
+
        parent = btree_node_parent(iter, b);
        if (parent) {
                if (new_hash) {
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 201798866242b6949398d3193f85ec2e0915becb..2ebe8bad978e78dc7477e663256d2477de2e24b4 100644
@@ -85,8 +85,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
 static void bch2_fs_stats_verify(struct bch_fs *c)
 {
-       struct bch_fs_usage stats =
-               __bch2_fs_usage_read(c);
+       struct bch_fs_usage stats = __bch2_fs_usage_read(c, 0);
        unsigned i, j;
 
        for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@@ -209,43 +208,24 @@ do {                                                                      \
        _acc;                                                           \
 })
 
-#define bch2_usage_read_cached(_c, _cached, _uncached)                 \
-({                                                                     \
-       typeof(_cached) _ret;                                           \
-       unsigned _seq;                                                  \
-                                                                       \
-       do {                                                            \
-               _seq = read_seqcount_begin(&(_c)->gc_pos_lock);         \
-               _ret = (_c)->gc_pos.phase == GC_PHASE_DONE              \
-                       ? bch2_usage_read_raw(_uncached)                        \
-                       : (_cached);                                    \
-       } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));        \
-                                                                       \
-       _ret;                                                           \
-})
-
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
 {
-       return bch2_usage_read_raw(ca->usage_percpu);
+       return bch2_usage_read_raw(ca->usage[gc]);
 }
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 {
-       return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
+       return bch2_usage_read_raw(ca->usage[0]);
 }
 
-struct bch_fs_usage
-__bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
 {
-       return bch2_usage_read_raw(c->usage_percpu);
+       return bch2_usage_read_raw(c->usage[gc]);
 }
 
-struct bch_fs_usage
-bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
 {
-       return bch2_usage_read_cached(c,
-                                    c->usage_cached,
-                                    c->usage_percpu);
+       return bch2_usage_read_raw(c->usage[0]);
 }
 
 struct fs_usage_sum {
@@ -327,13 +307,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
                : m.data_type;
 }
 
-static bool bucket_became_unavailable(struct bch_fs *c,
-                                     struct bucket_mark old,
+static bool bucket_became_unavailable(struct bucket_mark old,
                                      struct bucket_mark new)
 {
        return is_available_bucket(old) &&
-              !is_available_bucket(new) &&
-              (!c || c->gc_pos.phase == GC_PHASE_DONE);
+              !is_available_bucket(new);
 }
 
 void bch2_fs_usage_apply(struct bch_fs *c,
@@ -364,11 +342,13 @@ void bch2_fs_usage_apply(struct bch_fs *c,
        percpu_down_read(&c->usage_lock);
        preempt_disable();
        /* online_reserved not subject to gc: */
-       this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
+       this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved);
        stats->online_reserved = 0;
 
-       if (!gc_will_visit(c, gc_pos))
-               bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
+       bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
+
+       if (gc_visited(c, gc_pos))
+               bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
 
        bch2_fs_stats_verify(c);
        preempt_enable();
@@ -378,8 +358,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-                                 struct bch_fs_usage *stats,
-                                 struct bucket_mark old, struct bucket_mark new)
+                                 struct bch_fs_usage *fs_usage,
+                                 struct bucket_mark old, struct bucket_mark new,
+                                 bool gc)
 {
        struct bch_dev_usage *dev_usage;
 
@@ -391,14 +372,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                bch2_data_types[old.data_type],
                bch2_data_types[new.data_type]);
 
-       stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
-       stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
-
        preempt_disable();
-       dev_usage = this_cpu_ptr(ca->usage_percpu);
+       dev_usage = this_cpu_ptr(ca->usage[gc]);
 
-       dev_usage->buckets[bucket_type(old)]--;
-       dev_usage->buckets[bucket_type(new)]++;
+       if (bucket_type(old) != bucket_type(new)) {
+               if (bucket_type(old)) {
+                       fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+                       dev_usage->buckets[bucket_type(old)]--;
+               } else {
+                       fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
+                       dev_usage->buckets[bucket_type(new)]++;
+               }
+       }
 
        dev_usage->buckets_alloc +=
                (int) new.owned_by_allocator - (int) old.owned_by_allocator;
@@ -425,21 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 ({                                                             \
        struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
                                                                \
-       bch2_dev_usage_update(c, ca, stats, _old, new);         \
+       bch2_dev_usage_update(c, ca, stats, _old, new, gc);     \
        _old;                                                   \
 })
 
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, struct bucket_mark *old)
+static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, struct bucket_mark *old,
+                                    bool gc)
 {
-       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
-       struct bucket *g;
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+       struct bucket *g = __bucket(ca, b, gc);
        struct bucket_mark new;
 
-       percpu_rwsem_assert_held(&c->usage_lock);
-
-       g = bucket(ca, b);
-
        *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                BUG_ON(!is_available_bucket(new));
 
@@ -450,38 +432,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.gen++;
        }));
 
-       /*
-        * This isn't actually correct yet, since fs usage is still
-        * uncompressed sectors:
-        */
        stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
+}
+
+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+                           size_t b, struct bucket_mark *old)
+{
+       percpu_rwsem_assert_held(&c->usage_lock);
+
+       __bch2_invalidate_bucket(c, ca, b, old, false);
 
        if (!old->owned_by_allocator && old->cached_sectors)
                trace_invalidate(ca, bucket_to_sector(ca, b),
                                 old->cached_sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, bool owned_by_allocator,
-                           struct gc_pos pos, unsigned flags)
+static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, bool owned_by_allocator,
+                                    bool gc)
 {
-       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
-       struct bucket *g;
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+       struct bucket *g = __bucket(ca, b, gc);
        struct bucket_mark old, new;
 
-       percpu_rwsem_assert_held(&c->usage_lock);
-       g = bucket(ca, b);
-
-       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-           gc_will_visit(c, pos))
-               return;
-
        old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                new.owned_by_allocator  = owned_by_allocator;
        }));
 
-       BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
-              c->gc_pos.phase == GC_PHASE_DONE);
+       BUG_ON(!gc &&
+              !owned_by_allocator && !old.owned_by_allocator);
+}
+
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                           size_t b, bool owned_by_allocator,
+                           struct gc_pos pos, unsigned flags)
+{
+       percpu_rwsem_assert_held(&c->usage_lock);
+
+       if (!(flags & BCH_BUCKET_MARK_GC))
+               __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+
+       if ((flags & BCH_BUCKET_MARK_GC) ||
+           gc_visited(c, pos))
+               __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
 }
 
 #define checked_add(a, b)                                      \
@@ -491,37 +484,49 @@ do {                                                              \
        BUG_ON((a) != _res);                                    \
 } while (0)
 
+static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                       size_t b, enum bch_data_type type,
+                                       unsigned sectors, bool gc)
+{
+       struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
+       struct bucket *g = __bucket(ca, b, gc);
+       struct bucket_mark old, new;
+
+       BUG_ON(type != BCH_DATA_SB &&
+              type != BCH_DATA_JOURNAL);
+
+       old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+               new.data_type   = type;
+               checked_add(new.dirty_sectors, sectors);
+       }));
+
+       fs_usage->replicas[0].data[type] += sectors;
+}
+
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                               size_t b, enum bch_data_type type,
                               unsigned sectors, struct gc_pos pos,
                               unsigned flags)
 {
-       struct bch_fs_usage *stats;
-       struct bucket *g;
-       struct bucket_mark old, new;
-
        BUG_ON(type != BCH_DATA_SB &&
               type != BCH_DATA_JOURNAL);
 
+       preempt_disable();
+
        if (likely(c)) {
                percpu_rwsem_assert_held(&c->usage_lock);
 
-               if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-                   gc_will_visit(c, pos))
-                       return;
-
-               preempt_disable();
-               stats = this_cpu_ptr(c->usage_percpu);
-
-               g = bucket(ca, b);
-               old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
-                       new.data_type = type;
-                       checked_add(new.dirty_sectors, sectors);
-               }));
-
-               stats->replicas[0].data[type] += sectors;
-               preempt_enable();
+               if (!(flags & BCH_BUCKET_MARK_GC))
+                       __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+                                                   false);
+               if ((flags & BCH_BUCKET_MARK_GC) ||
+                   gc_visited(c, pos))
+                       __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+                                                   true);
        } else {
+               struct bucket *g;
+               struct bucket_mark old, new;
+
                rcu_read_lock();
 
                g = bucket(ca, b);
@@ -533,8 +538,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                rcu_read_unlock();
        }
 
-       BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
-              bucket_became_unavailable(c, old, new));
+       preempt_enable();
 }
 
 static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@@ -579,23 +583,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              struct extent_ptr_decoded p,
                              s64 sectors, enum bch_data_type data_type,
                              struct bch_fs_usage *fs_usage,
-                             u64 journal_seq, unsigned flags)
+                             u64 journal_seq, unsigned flags,
+                             bool gc)
 {
        struct bucket_mark old, new;
        struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-       struct bucket *g = PTR_BUCKET(ca, &p.ptr);
+       size_t b = PTR_BUCKET_NR(ca, &p.ptr);
+       struct bucket *g = __bucket(ca, b, gc);
        u64 v;
 
-       if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
-               if (journal_seq)
-                       bucket_cmpxchg(g, new, ({
-                               new.journal_seq_valid   = 1;
-                               new.journal_seq         = journal_seq;
-                       }));
-
-               return;
-       }
-
        v = atomic64_read(&g->_mark.v);
        do {
                new.v.counter = old.v.counter = v;
@@ -637,10 +633,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
 
-       BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
-              bucket_became_unavailable(c, old, new));
+       BUG_ON(!gc && bucket_became_unavailable(old, new));
 }
 
 static void bch2_mark_stripe_ptr(struct bch_fs *c,
@@ -688,9 +683,9 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c,
 
 static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
                             s64 sectors, enum bch_data_type data_type,
-                            struct gc_pos pos,
                             struct bch_fs_usage *stats,
-                            u64 journal_seq, unsigned flags)
+                            u64 journal_seq, unsigned flags,
+                            bool gc)
 {
        BUG_ON(!sectors);
 
@@ -712,7 +707,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
                        s64 adjusted_disk_sectors = disk_sectors;
 
                        bch2_mark_pointer(c, e, p, disk_sectors, data_type,
-                                         stats, journal_seq, flags);
+                                         stats, journal_seq, flags, gc);
 
                        if (!p.ptr.cached)
                                for (i = 0; i < p.ec_nr; i++)
@@ -758,21 +753,20 @@ static void bucket_set_stripe(struct bch_fs *c,
                              const struct bch_stripe *v,
                              bool enabled,
                              struct bch_fs_usage *fs_usage,
-                             u64 journal_seq)
+                             u64 journal_seq,
+                             bool gc)
 {
        unsigned i;
 
        for (i = 0; i < v->nr_blocks; i++) {
                const struct bch_extent_ptr *ptr = v->ptrs + i;
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               struct bucket *g;
+               size_t b = PTR_BUCKET_NR(ca, ptr);
+               struct bucket *g = __bucket(ca, b, gc);
                struct bucket_mark new, old;
 
                BUG_ON(ptr_stale(ca, ptr));
 
-               rcu_read_lock();
-               g = PTR_BUCKET(ca, ptr);
-
                old = bucket_cmpxchg(g, new, ({
                        new.stripe                      = enabled;
                        if (journal_seq) {
@@ -780,18 +774,18 @@ static void bucket_set_stripe(struct bch_fs *c,
                                new.journal_seq         = journal_seq;
                        }
                }));
-               rcu_read_unlock();
 
                BUG_ON(old.stripe == enabled);
 
-               bch2_dev_usage_update(c, ca, fs_usage, old, new);
+               bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
        }
 }
 
 static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
-                            bool inserting, struct gc_pos pos,
+                            bool inserting,
                             struct bch_fs_usage *fs_usage,
-                            u64 journal_seq, unsigned flags)
+                            u64 journal_seq, unsigned flags,
+                            bool gc)
 {
        switch (k.k->type) {
        case BCH_STRIPE: {
@@ -820,74 +814,64 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
                else
                        bch2_stripes_heap_del(c, m, idx);
 
-               bucket_set_stripe(c, s.v, inserting, fs_usage, 0);
+               bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
                break;
        }
        }
 }
 
-void bch2_mark_key(struct bch_fs *c,
-                  enum bkey_type type, struct bkey_s_c k,
-                  bool inserting, s64 sectors,
-                  struct gc_pos pos,
-                  struct bch_fs_usage *stats,
-                  u64 journal_seq, unsigned flags)
+static void __bch2_mark_key(struct bch_fs *c,
+                           enum bkey_type type, struct bkey_s_c k,
+                           bool inserting, s64 sectors,
+                           struct bch_fs_usage *stats,
+                           u64 journal_seq, unsigned flags,
+                           bool gc)
 {
-       /*
-        * synchronization w.r.t. GC:
-        *
-        * Normally, bucket sector counts/marks are updated on the fly, as
-        * references are added/removed from the btree, the lists of buckets the
-        * allocator owns, other metadata buckets, etc.
-        *
-        * When GC is in progress and going to mark this reference, we do _not_
-        * mark this reference here, to avoid double counting - GC will count it
-        * when it gets to it.
-        *
-        * To know whether we should mark a given reference (GC either isn't
-        * running, or has already marked references at this position) we
-        * construct a total order for everything GC walks. Then, we can simply
-        * compare the position of the reference we're marking - @pos - with
-        * GC's current position. If GC is going to mark this reference, GC's
-        * current position will be less than @pos; if GC's current position is
-        * greater than @pos GC has either already walked this position, or
-        * isn't running.
-        *
-        * To avoid racing with GC's position changing, we have to deal with
-        *  - GC's position being set to GC_POS_MIN when GC starts:
-        *    usage_lock guards against this
-        *  - GC's position overtaking @pos: we guard against this with
-        *    whatever lock protects the data structure the reference lives in
-        *    (e.g. the btree node lock, or the relevant allocator lock).
-        */
-
-       percpu_down_read(&c->usage_lock);
-       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-           gc_will_visit(c, pos))
-               flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
-
-       if (!stats)
-               stats = this_cpu_ptr(c->usage_percpu);
-
        switch (type) {
        case BKEY_TYPE_BTREE:
                bch2_mark_extent(c, k, inserting
                                 ?  c->opts.btree_node_size
                                 : -c->opts.btree_node_size,
                                 BCH_DATA_BTREE,
-                                pos, stats, journal_seq, flags);
+                                stats, journal_seq, flags, gc);
                break;
        case BKEY_TYPE_EXTENTS:
                bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
-                                pos, stats, journal_seq, flags);
+                                stats, journal_seq, flags, gc);
                break;
        case BKEY_TYPE_EC:
                bch2_mark_stripe(c, k, inserting,
-                                pos, stats, journal_seq, flags);
+                                stats, journal_seq, flags, gc);
                break;
        default:
                break;
        }
+}
+
+void bch2_mark_key(struct bch_fs *c,
+                  enum bkey_type type, struct bkey_s_c k,
+                  bool inserting, s64 sectors,
+                  struct gc_pos pos,
+                  struct bch_fs_usage *stats,
+                  u64 journal_seq, unsigned flags)
+{
+       percpu_down_read(&c->usage_lock);
+
+       if (!(flags & BCH_BUCKET_MARK_GC)) {
+               if (!stats)
+                       stats = this_cpu_ptr(c->usage[0]);
+
+               __bch2_mark_key(c, type, k, inserting, sectors,
+                               stats, journal_seq, flags, false);
+       }
+
+       if ((flags & BCH_BUCKET_MARK_GC) ||
+           gc_visited(c, pos)) {
+               __bch2_mark_key(c, type, k, inserting, sectors,
+                               this_cpu_ptr(c->usage[1]),
+                               journal_seq, flags, true);
+       }
+
        percpu_up_read(&c->usage_lock);
 }
 
@@ -963,28 +947,20 @@ void bch2_mark_update(struct btree_insert *trans,
 
 /* Disk reservations: */
 
-static u64 __recalc_sectors_available(struct bch_fs *c)
+static u64 bch2_recalc_sectors_available(struct bch_fs *c)
 {
        int cpu;
 
        for_each_possible_cpu(cpu)
-               per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
+               per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
 
        return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
 }
 
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
-{
-       percpu_down_write(&c->usage_lock);
-       atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
-       percpu_up_write(&c->usage_lock);
-}
-
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
        percpu_down_read(&c->usage_lock);
-       this_cpu_sub(c->usage_percpu->online_reserved,
+       this_cpu_sub(c->usage[0]->online_reserved,
                     res->sectors);
 
        bch2_fs_stats_verify(c);
@@ -1005,7 +981,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
        percpu_down_read(&c->usage_lock);
        preempt_disable();
-       stats = this_cpu_ptr(c->usage_percpu);
+       stats = this_cpu_ptr(c->usage[0]);
 
        if (sectors <= stats->available_cache)
                goto out;
@@ -1055,7 +1031,7 @@ recalculate:
        }
 
        percpu_down_write(&c->usage_lock);
-       sectors_available = __recalc_sectors_available(c);
+       sectors_available = bch2_recalc_sectors_available(c);
 
        if (sectors <= sectors_available ||
            (flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -1110,7 +1086,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
        size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
                                      btree_reserve);
-       bool resize = ca->buckets != NULL,
+       bool resize = ca->buckets[0] != NULL,
             start_copygc = ca->copygc_thread != NULL;
        int ret = -ENOMEM;
        unsigned i;
@@ -1170,7 +1146,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                       BITS_TO_LONGS(n) * sizeof(unsigned long));
        }
 
-       rcu_assign_pointer(ca->buckets, buckets);
+       rcu_assign_pointer(ca->buckets[0], buckets);
        buckets = old_buckets;
 
        swap(ca->oldest_gens, oldest_gens);
@@ -1239,16 +1215,16 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
        kvpfree(ca->buckets_dirty,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
-       kvpfree(rcu_dereference_protected(ca->buckets, 1),
+       kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
 
-       free_percpu(ca->usage_percpu);
+       free_percpu(ca->usage[0]);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
+       if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
                return -ENOMEM;
 
        return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index b48960fa5ce787322b65f69c4f70af4b434120d7..813e0c44e1076a802a07a0f2b4c4b37f426edb35 100644
        _old;                                                   \
 })
 
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
+                                                 bool gc)
 {
-       return rcu_dereference_check(ca->buckets,
+       return rcu_dereference_check(ca->buckets[gc],
                                     !ca->fs ||
                                     percpu_rwsem_is_held(&ca->fs->usage_lock) ||
                                     lockdep_is_held(&ca->fs->gc_lock) ||
                                     lockdep_is_held(&ca->bucket_lock));
 }
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+       return __bucket_array(ca, false);
+}
+
+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
 {
-       struct bucket_array *buckets = bucket_array(ca);
+       struct bucket_array *buckets = __bucket_array(ca, gc);
 
        BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
        return buckets->b + b;
 }
 
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+       return __bucket(ca, b, false);
+}
+
 static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
                                         size_t b, int rw)
 {
@@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
 
 /* Device usage: */
 
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
 
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
@@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
 struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
 void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
                         struct disk_reservation *, struct gc_pos);
@@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
                               struct gc_pos, unsigned);
 
 #define BCH_BUCKET_MARK_NOATOMIC               (1 << 0)
-#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE   (1 << 1)
-#define BCH_BUCKET_MARK_GC_WILL_VISIT          (1 << 2)
-#define BCH_BUCKET_MARK_GC_LOCK_HELD           (1 << 3)
+#define BCH_BUCKET_MARK_GC                     (1 << 1)
 
 void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
                   bool, s64, struct gc_pos,
                   struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
 
-void bch2_recalc_sectors_available(struct bch_fs *);
-
 void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 9ec96dbab0e8104ea6525edd07c8f7000e00a471..0187f465d23f52d24718f6eef8645ae855e581d8 100644
@@ -64,8 +64,6 @@ struct bch_dev_usage {
 
 struct bch_fs_usage {
        /* all fields are in units of 512 byte sectors: */
-       u64                     online_reserved;
-       u64                     available_cache;
 
        struct {
                u64             data[BCH_DATA_NR];
@@ -74,6 +72,10 @@ struct bch_fs_usage {
        }                       replicas[BCH_REPLICAS_MAX];
 
        u64                     buckets[BCH_DATA_NR];
+
+       /* fields starting here aren't touched by gc: */
+       u64                     online_reserved;
+       u64                     available_cache;
 };
 
 /*
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 939caa3b8183e1bcfd3b1d564c02339154172e3d..4045c0e684627f0ac4482cfc1074952f3036d9d7 100644
@@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
                                ca->mi.bucket_size,
                                gc_phase(GC_PHASE_SB),
-                               new_fs
-                               ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
-                               : 0);
+                               0);
 
                if (c) {
                        spin_unlock(&c->journal.lock);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 931e50e8ad578c2a7f74f6c30a46408c17fa8efc..59f2aa7e047c3da7756adb18b775c2651f123f39 100644
@@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        percpu_free_rwsem(&c->usage_lock);
-       free_percpu(c->usage_percpu);
+       free_percpu(c->usage[0]);
        mempool_exit(&c->btree_iters_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
@@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        max(offsetof(struct btree_read_bio, bio),
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
-           !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
+           !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
            percpu_init_rwsem(&c->usage_lock) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
@@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
                return ret;
 
        mutex_lock(&c->sb_lock);
-       bch2_mark_dev_superblock(ca->fs, ca,
-                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       bch2_mark_dev_superblock(ca->fs, ca, 0);
        mutex_unlock(&c->sb_lock);
 
        bch2_dev_sysfs_online(c, ca);
@@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca)
 
        for_each_possible_cpu(cpu) {
                struct bch_dev_usage *p =
-                       per_cpu_ptr(ca->usage_percpu, cpu);
+                       per_cpu_ptr(ca->usage[0], cpu);
                memset(p, 0, sizeof(*p));
        }
 
@@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
         * allocate the journal, reset all the marks, then remark after we
         * attach...
         */
-       bch2_mark_dev_superblock(ca->fs, ca,
-                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       bch2_mark_dev_superblock(ca->fs, ca, 0);
 
        err = "journal alloc failed";
        ret = bch2_dev_journal_alloc(ca);
@@ -1435,8 +1433,7 @@ have_slot:
        ca->disk_sb.sb->dev_idx = dev_idx;
        bch2_dev_attach(c, ca, dev_idx);
 
-       bch2_mark_dev_superblock(c, ca,
-                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       bch2_mark_dev_superblock(c, ca, 0);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 188e19572d91b49fd8ab417302f6b06e0bfbbc6a..8eacc0d2550b1f1c56704cd2a8f350f831d07494 100644
@@ -478,7 +478,7 @@ STORE(__bch2_fs)
                bch2_coalesce(c);
 
        if (attr == &sysfs_trigger_gc)
-               bch2_gc(c);
+               bch2_gc(c, NULL, false);
 
        if (attr == &sysfs_prune_cache) {
                struct shrink_control sc;