And assorted other copygc fixes.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
                                  ca->inc_gen_really_needs_gc);
 
                if (available > fifo_free(&ca->free_inc) ||
-                   (available && !fifo_full(&ca->free[RESERVE_BTREE])))
+                   (available &&
+                    (!fifo_full(&ca->free[RESERVE_BTREE]) ||
+                     !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
                        break;
 
                up_read(&c->gc_lock);
 
 }
 
 static void open_bucket_free_unused(struct bch_fs *c,
-                                   struct open_bucket *ob,
-                                   bool may_realloc)
+                                   struct write_point *wp,
+                                   struct open_bucket *ob)
 {
        struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+       bool may_realloc = wp->type == BCH_DATA_user;
 
-       BUG_ON(ca->open_buckets_partial_nr >=
+       BUG_ON(ca->open_buckets_partial_nr >
               ARRAY_SIZE(ca->open_buckets_partial));
 
        if (ca->open_buckets_partial_nr <
 
        spin_lock(&c->freelist_lock);
 
-       if (may_alloc_partial &&
-           ca->open_buckets_partial_nr) {
-               ob = c->open_buckets +
-                       ca->open_buckets_partial[--ca->open_buckets_partial_nr];
-               ob->on_partial_list = false;
-               spin_unlock(&c->freelist_lock);
-               return ob;
+       if (may_alloc_partial) {
+               int i;
+
+               for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+                       ob = c->open_buckets + ca->open_buckets_partial[i];
+
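+                       /*
+                        * Don't let an allocation from a less strict reserve
+                        * steal a bucket that was allocated from a stricter
+                        * one - e.g. normal user writes shouldn't be able to
+                        * use up buckets from the copygc reserve:
+                        */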
+                       if (reserve <= ob->alloc_reserve) {
+                               array_remove_item(ca->open_buckets_partial,
+                                                 ca->open_buckets_partial_nr,
+                                                 i);
+                               ob->on_partial_list = false;
+                               ob->alloc_reserve = reserve;
+                               spin_unlock(&c->freelist_lock);
+                               return ob;
+                       }
+               }
        }
 
        if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
 
        ob->valid       = true;
        ob->sectors_free = ca->mi.bucket_size;
+       ob->alloc_reserve = reserve;
        ob->ptr         = (struct bch_extent_ptr) {
                .type   = 1 << BCH_EXTENT_ENTRY_ptr,
                .gen    = buckets->b[bucket].mark.gen,
 alloc_done:
        BUG_ON(!ret && nr_effective < nr_replicas);
 
-       WARN_ON(reserve == RESERVE_MOVINGGC &&
-               ret == FREELIST_EMPTY);
-
        if (erasure_code && !ec_open_bucket(c, &ptrs))
                pr_debug("failed to get ec bucket: ret %u", ret);
 
 
        /* Free buckets we didn't use: */
        open_bucket_for_each(c, &wp->ptrs, ob, i)
-               open_bucket_free_unused(c, ob, wp->type == BCH_DATA_user);
+               open_bucket_free_unused(c, wp, ob);
 
        wp->ptrs = ptrs;
 
                if (ptrs.nr < ARRAY_SIZE(ptrs.v))
                        ob_push(c, &ptrs, ob);
                else
-                       open_bucket_free_unused(c, ob,
-                                       wp->type == BCH_DATA_user);
+                       open_bucket_free_unused(c, wp, ob);
        wp->ptrs = ptrs;
 
        mutex_unlock(&wp->lock);
        bch2_open_buckets_put(c, &ptrs);
 }
 
+static inline void writepoint_init(struct write_point *wp,
+                                  enum bch_data_type type)
+{
+       mutex_init(&wp->lock);
+       wp->type = type;
+}
+
 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
 {
        struct open_bucket *ob;
 
        return (struct write_point_specifier) { .v = (unsigned long) wp };
 }
 
-static inline void writepoint_init(struct write_point *wp,
-                                  enum bch_data_type type)
-{
-       mutex_init(&wp->lock);
-       wp->type = type;
-}
-
 void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
 
        u8                      type;
        unsigned                valid:1;
        unsigned                on_partial_list:1;
+       int                     alloc_reserve:3;
        unsigned                sectors_free;
        struct bch_extent_ptr   ptr;
        struct ec_stripe_new    *ec;
 
        for (i = 0; i < BTREE_ID_NR; i++)
                if (btree_node_type_needs_gc(i)) {
                        ret = bch2_gc_btree_gens(c, i);
-                       if (ret)
+                       if (ret) {
+                               bch_err(c, "error recalculating oldest_gen: %i", ret);
                                goto err;
+                       }
                }
 
        for_each_member_device(ca, c, i) {
                        g->oldest_gen = g->gc_gen;
                up_read(&ca->bucket_lock);
        }
+
+       c->gc_count++;
 err:
        up_read(&c->gc_lock);
        return ret;
 
 
        if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
                m->op.alloc_reserve = RESERVE_MOVINGGC;
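+               /*
+                * Copygc is how space gets freed up, so its writes must not
+                * block waiting on more free space:
+                */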
+               m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
        } else {
                /* XXX: this should probably be passed in */
                m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
 
 #include "buckets.h"
 #include "clock.h"
 #include "disk_groups.h"
+#include "error.h"
 #include "extents.h"
 #include "eytzinger.h"
 #include "io.h"
        if (dev_idx < 0)
                return DATA_SKIP;
 
-       /* XXX: use io_opts for this inode */
        data_opts->target               = io_opts->background_target;
        data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
        data_opts->rewrite_dev          = dev_idx;
        return ret;
 }
 
-static void bch2_copygc(struct bch_fs *c)
+static int bch2_copygc(struct bch_fs *c)
 {
        copygc_heap *h = &c->copygc_heap;
        struct copygc_heap_entry e, *i;
                free_heap(&c->copygc_heap);
                if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
                        bch_err(c, "error allocating copygc heap");
-                       return;
+                       return 0;
                }
        }
 
                                continue;
 
                        e = (struct copygc_heap_entry) {
+                               .dev            = dev_idx,
                                .gen            = m.gen,
                                .sectors        = bucket_sectors_used(m),
                                .offset         = bucket_to_sector(ca, b),
                up_read(&ca->bucket_lock);
        }
 
+       if (!sectors_reserved) {
+               bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
+               return -1;
+       }
+
        for (i = h->data; i < h->data + h->used; i++)
                sectors_to_move += i->sectors;
 
        buckets_to_move = h->used;
 
        if (!buckets_to_move)
-               return;
+               return 0;
 
        eytzinger0_sort(h->data, h->used,
                        sizeof(h->data[0]),
                down_read(&ca->bucket_lock);
                buckets = bucket_array(ca);
                for (i = h->data; i < h->data + h->used; i++) {
-                       size_t b = sector_to_bucket(ca, i->offset);
-                       struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+                       struct bucket_mark m;
+                       size_t b;
 
-                       if (i->gen == m.gen && bucket_sectors_used(m)) {
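+                       /*
+                        * The heap now has entries from every device; only
+                        * count buckets belonging to the device we're
+                        * currently scanning:
+                        */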
+                       if (i->dev != dev_idx)
+                               continue;
+
+                       b = sector_to_bucket(ca, i->offset);
+                       m = READ_ONCE(buckets->b[b].mark);
+
+                       if (i->gen == m.gen &&
+                           bucket_sectors_used(m)) {
                                sectors_not_moved += bucket_sectors_used(m);
                                buckets_not_moved++;
                        }
        trace_copygc(c,
                     atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
                     buckets_to_move, buckets_not_moved);
+       return 0;
 }
 
 /*
                        continue;
                }
 
-               bch2_copygc(c);
+               if (bch2_copygc(c))
+                       break;
        }
 
        return 0;
        if (bch2_fs_init_fault("copygc_start"))
                return -ENOMEM;
 
-       t = kthread_create(bch2_copygc_thread, c,
-                          "bch_copygc[%s]", c->name);
+       t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
        if (IS_ERR(t))
                return PTR_ERR(t);
 
 
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 {
+       /*
+        * A device going read-only means the copygc reserve gets smaller, so
+        * we don't want that happening while copygc is in progress:
+        */
+       bch2_copygc_stop(c);
+
        /*
         * The allocator thread itself allocates btree nodes, so stop it first:
         */
        bch2_dev_allocator_stop(ca);
        bch2_dev_allocator_remove(c, ca);
        bch2_dev_journal_stop(&c->journal, ca);
+
+       bch2_copygc_start(c);
 }
 
 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)