struct bch_fs_pcpu __percpu     *pcpu;
 
-       struct bch_fs_usage __percpu    *usage[2];
-
        struct percpu_rw_semaphore      mark_lock;
 
+       struct bch_fs_usage __percpu    *usage[2];
+       struct bch_fs_usage __percpu    *usage_scratch;
+
        /*
         * When we invalidate buckets, we use both the priority and the amount
         * of good data to determine which buckets to reuse first - to weight
 
                ca->usage[1] = NULL;
        }
 
+       percpu_down_write(&c->mark_lock);
+
        free_percpu(c->usage[1]);
        c->usage[1] = NULL;
-}
-
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when access
- * against any percpu counter is guarded against
- */
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-       u64 *ret;
-       int cpu;
-
-       preempt_disable();
-       ret = this_cpu_ptr(p);
-       preempt_enable();
-
-       for_each_possible_cpu(cpu) {
-               u64 *i = per_cpu_ptr(p, cpu);
 
-               if (i != ret) {
-                       acc_u64s(ret, i, nr);
-                       memset(i, 0, nr * sizeof(u64));
-               }
-       }
-
-       return ret;
+       percpu_up_write(&c->mark_lock);
 }
 
 static void bch2_gc_done_nocheck(struct bch_fs *c)
        for_each_member_device(ca, c, i) {
                unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
                struct bch_dev_usage *dst = (void *)
-                       acc_percpu_u64s((void *) ca->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
                struct bch_dev_usage *src = (void *)
-                       acc_percpu_u64s((void *) ca->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 
                *dst = *src;
        }
 
        {
-               unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64);
+               unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
+                       c->replicas.nr;
                struct bch_fs_usage *dst = (void *)
-                       acc_percpu_u64s((void *) c->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[0], nr);
                struct bch_fs_usage *src = (void *)
-                       acc_percpu_u64s((void *) c->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[1], nr);
                unsigned offset = offsetof(typeof(*dst), s.gc_start);
 
                memcpy((void *) dst + offset,
                       (void *) src + offset,
-                      sizeof(*dst) - offset);
+                      nr * sizeof(u64) - offset);
        }
 }
 
        for_each_member_device(ca, c, i) {
                unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
                struct bch_dev_usage *dst = (void *)
-                       acc_percpu_u64s((void *) ca->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
                struct bch_dev_usage *src = (void *)
-                       acc_percpu_u64s((void *) ca->usage[1], nr);
+                       bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
                unsigned b;
 
                for (b = 0; b < BCH_DATA_NR; b++)
        }
 
        {
-               unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64);
+               unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
+                       c->replicas.nr;
                struct bch_fs_usage *dst = (void *)
-                       acc_percpu_u64s((void *) c->usage[0], nr);
+                       bch2_acc_percpu_u64s((void *) c->usage[0], nr);
                struct bch_fs_usage *src = (void *)
-                       acc_percpu_u64s((void *) c->usage[1], nr);
-               unsigned r, b;
+                       bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
                copy_fs_field(s.hidden,         "hidden");
                copy_fs_field(s.data,           "data");
                copy_fs_field(s.reserved,       "reserved");
                copy_fs_field(s.nr_inodes,      "nr_inodes");
 
-               for (r = 0; r < BCH_REPLICAS_MAX; r++) {
-                       for (b = 0; b < BCH_DATA_NR; b++)
-                               copy_fs_field(replicas[r].data[b],
-                                             "replicas[%i].data[%s]",
-                                             r, bch2_data_types[b]);
-                       copy_fs_field(replicas[r].ec_data,
-                                     "replicas[%i].ec_data", r);
-                       copy_fs_field(replicas[r].persistent_reserved,
-                                     "replicas[%i].persistent_reserved", r);
-               }
+               for (i = 0; i < BCH_REPLICAS_MAX; i++)
+                       copy_fs_field(persistent_reserved[i],
+                                     "persistent_reserved[%i]", i);
 
-               for (b = 0; b < BCH_DATA_NR; b++)
-                       copy_fs_field(buckets[b],
-                                     "buckets[%s]", bch2_data_types[b]);
+               for (i = 0; i < c->replicas.nr; i++) {
+                       /*
+                        * XXX: print out replicas entry
+                        */
+                       copy_fs_field(data[i], "data[%i]", i);
+               }
        }
 out:
        percpu_up_write(&c->mark_lock);
         */
        gc_pos_set(c, gc_phase(GC_PHASE_START));
 
+       percpu_down_write(&c->mark_lock);
        BUG_ON(c->usage[1]);
 
-       c->usage[1] = alloc_percpu(struct bch_fs_usage);
+       c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
+                                        sizeof(u64) * c->replicas.nr,
+                                        sizeof(u64),
+                                        GFP_KERNEL);
+       percpu_up_write(&c->mark_lock);
+
        if (!c->usage[1])
                return -ENOMEM;
 
 
 {
        struct bch_fs *c = as->c;
        struct btree *old = btree_node_root(c, b);
-       struct bch_fs_usage stats = { 0 };
+       struct bch_fs_usage *fs_usage;
 
        __bch2_btree_set_root_inmem(c, b);
 
        mutex_lock(&c->btree_interior_update_lock);
        percpu_down_read(&c->mark_lock);
+       preempt_disable();
+       fs_usage = bch2_fs_usage_get_scratch(c);
 
        bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
                      true, 0,
                      gc_pos_btree_root(b->btree_id),
-                     &stats, 0, 0);
+                     fs_usage, 0, 0);
 
        if (old && !btree_node_fake(old))
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&old->key),
-                                          &stats);
-       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                          fs_usage);
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                            gc_pos_btree_root(b->btree_id));
 
+       preempt_enable();
        percpu_up_read(&c->mark_lock);
        mutex_unlock(&c->btree_interior_update_lock);
 }
                                        struct btree_node_iter *node_iter)
 {
        struct bch_fs *c = as->c;
-       struct bch_fs_usage stats = { 0 };
+       struct bch_fs_usage *fs_usage;
        struct bkey_packed *k;
        struct bkey tmp;
 
 
        mutex_lock(&c->btree_interior_update_lock);
        percpu_down_read(&c->mark_lock);
+       preempt_disable();
+       fs_usage = bch2_fs_usage_get_scratch(c);
 
        bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
                             true, 0,
-                            gc_pos_btree_node(b), &stats, 0, 0);
+                            gc_pos_btree_node(b), fs_usage, 0, 0);
 
        while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
               bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
        if (k && !bkey_cmp_packed(b, k, &insert->k))
                bch2_btree_node_free_index(as, b,
                                           bkey_disassemble(b, k, &tmp),
-                                          &stats);
+                                          fs_usage);
 
-       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                            gc_pos_btree_node(b));
 
+       preempt_enable();
        percpu_up_read(&c->mark_lock);
                        bkey_copy(&b->key, &new_key->k_i);
                }
        } else {
-               struct bch_fs_usage stats = { 0 };
+               struct bch_fs_usage *fs_usage;
 
                BUG_ON(btree_node_root(c, b) != b);
 
 
                mutex_lock(&c->btree_interior_update_lock);
                percpu_down_read(&c->mark_lock);
+               preempt_disable();
+               fs_usage = bch2_fs_usage_get_scratch(c);
 
                bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
                              true, 0,
                              gc_pos_btree_root(b->btree_id),
-                             &stats, 0, 0);
+                             fs_usage, 0, 0);
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&b->key),
-                                          &stats);
-               bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                          fs_usage);
+               bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                                    gc_pos_btree_root(b->btree_id));
 
+               preempt_enable();
                percpu_up_read(&c->mark_lock);
 
 #include "ec.h"
 #include "error.h"
 #include "movinggc.h"
+#include "replicas.h"
 #include "trace.h"
 
 #include <linux/preempt.h>
 
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-
 /*
  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
  * wraparound:
        return bch2_usage_read_raw(ca->usage[0]);
 }
 
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
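+/*
+ * Returns a heap allocated summary of current filesystem usage; on success
+ * mark_lock is left held for read and the caller must drop it and kfree() the
+ * result.
+ */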
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
 {
-       return bch2_usage_read_raw(c->usage[0]);
+       struct bch_fs_usage *ret;
+       unsigned nr = READ_ONCE(c->replicas.nr);
+retry:
+       ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+       if (unlikely(!ret))
+               return NULL;
+
+       percpu_down_read(&c->mark_lock);
+
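+       /* retry if the replicas table grew since we sized the allocation: */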
+       if (unlikely(nr < c->replicas.nr)) {
+               nr = c->replicas.nr;
+               percpu_up_read(&c->mark_lock);
+               kfree(ret);
+               goto retry;
+       }
+
+       acc_u64s_percpu((u64 *) ret,
+                       (u64 __percpu *) c->usage[0],
+                       sizeof(*ret) / sizeof(u64) + nr);
+
+       return ret;
 }
 
 #define RESERVE_FACTOR 6
        return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
 }
 
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
-{
-       return fs_usage.s.hidden +
-               fs_usage.s.data +
-               reserve_factor(fs_usage.s.reserved +
-                              fs_usage.s.online_reserved);
-}
-
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
 {
-       return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage));
+       return min(fs_usage.s.hidden +
+                  fs_usage.s.data +
+                  reserve_factor(fs_usage.s.reserved +
+                                 fs_usage.s.online_reserved),
+                  c->capacity);
 }
 
 struct bch_fs_usage_short
               !is_available_bucket(new);
 }
 
-void bch2_fs_usage_apply(struct bch_fs *c,
-                        struct bch_fs_usage *fs_usage,
-                        struct disk_reservation *disk_res,
-                        struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+                       struct bch_fs_usage *fs_usage,
+                       struct disk_reservation *disk_res,
+                       struct gc_pos gc_pos)
 {
        s64 added = fs_usage->s.data + fs_usage->s.reserved;
        s64 should_not_have_added;
+       int ret = 0;
 
        percpu_rwsem_assert_held(&c->mark_lock);
 
                      "disk usage increased without a reservation")) {
                atomic64_sub(should_not_have_added, &c->sectors_available);
                added -= should_not_have_added;
+               ret = -1;
        }
 
        if (added > 0) {
        preempt_disable();
        acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
                 (u64 *) fs_usage,
-                sizeof(*fs_usage) / sizeof(u64));
+                sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
 
        if (gc_visited(c, gc_pos)) {
                BUG_ON(!c->usage[1]);
                acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
                         (u64 *) fs_usage,
-                        sizeof(*fs_usage) / sizeof(u64));
+                        sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
        }
        preempt_enable();
 
-       memset(fs_usage, 0, sizeof(*fs_usage));
+       return ret;
 }
 
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
        if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
                fs_usage->s.hidden      += size;
 
-       fs_usage->buckets[type]         += size;
        dev_usage->buckets[type]        += nr;
 }
 
        _old;                                                   \
 })
 
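+/*
+ * Account sectors to the counter for replicas entry @r in fs_usage->data[],
+ * and to the cached/data summary totals; @r must already be in c->replicas.
+ */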
+static inline void update_replicas(struct bch_fs *c,
+                                  struct bch_fs_usage *fs_usage,
+                                  struct bch_replicas_entry *r,
+                                  s64 sectors)
+{
+       int idx = bch2_replicas_entry_idx(c, r);
+
+       BUG_ON(idx < 0);
+       BUG_ON(!sectors);
+
+       if (r->data_type == BCH_DATA_CACHED)
+               fs_usage->s.cached      += sectors;
+       else
+               fs_usage->s.data        += sectors;
+       fs_usage->data[idx]             += sectors;
+}
+
+static inline void update_cached_sectors(struct bch_fs *c,
+                                        struct bch_fs_usage *fs_usage,
+                                        unsigned dev, s64 sectors)
+{
+       struct bch_replicas_padded r;
+
+       bch2_replicas_entry_cached(&r.e, dev);
+
+       update_replicas(c, fs_usage, &r.e, sectors);
+}
+
 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                                     size_t b, struct bucket_mark *old,
                                     bool gc)
                new.gen++;
        }));
 
-       fs_usage->replicas[0].data[BCH_DATA_CACHED]     -= old->cached_sectors;
-       fs_usage->s.cached                              -= old->cached_sectors;
+       if (old->cached_sectors)
+               update_cached_sectors(c, fs_usage, ca->dev_idx,
+                                     -old->cached_sectors);
 }
 
 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.data_type   = type;
                checked_add(new.dirty_sectors, sectors);
        }));
-
-       if (type == BCH_DATA_BTREE ||
-           type == BCH_DATA_USER)
-               fs_usage->s.data                += sectors;
-       fs_usage->replicas[0].data[type]        += sectors;
 }
 
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
 static int bch2_mark_stripe_ptr(struct bch_fs *c,
                                struct bch_extent_stripe_ptr p,
+                               enum bch_data_type data_type,
+                               struct bch_fs_usage *fs_usage,
                                s64 sectors, unsigned flags,
-                               s64 *adjusted_disk_sectors,
-                               unsigned *redundancy,
                                bool gc)
 {
        struct stripe *m;
                return -1;
        }
 
+       BUG_ON(m->r.e.data_type != data_type);
+
        nr_data = m->nr_blocks - m->nr_redundant;
 
        parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
 
        if (sectors < 0)
                parity_sectors = -parity_sectors;
-
-       *adjusted_disk_sectors += parity_sectors;
-
-       *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
+       sectors += parity_sectors;
 
        new = atomic_add_return(sectors, &m->block_sectors[p.block]);
        old = new - sectors;
        if (!gc)
                bch2_stripes_heap_update(c, m, p.idx);
 
+       update_replicas(c, fs_usage, &m->r.e, sectors);
+
        return 0;
 }
 
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       s64 cached_sectors      = 0;
-       s64 dirty_sectors       = 0;
-       s64 ec_sectors          = 0;
-       unsigned replicas       = 0;
-       unsigned ec_redundancy  = 0;
+       struct bch_replicas_padded r;
+       s64 dirty_sectors = 0;
        unsigned i;
        int ret;
 
+       r.e.data_type   = data_type;
+       r.e.nr_devs     = 0;
+       r.e.nr_required = 1;
+
        BUG_ON(!sectors);
 
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = data_type == BCH_DATA_BTREE
                        ? sectors
                        : ptr_disk_sectors_delta(p, sectors);
-               s64 adjusted_disk_sectors = disk_sectors;
 
                bch2_mark_pointer(c, p, disk_sectors, data_type,
                                  fs_usage, journal_seq, flags, gc);
 
-               if (!p.ptr.cached)
+               if (p.ptr.cached) {
+                       update_cached_sectors(c, fs_usage, p.ptr.dev,
+                                             disk_sectors);
+               } else if (!p.ec_nr) {
+                       dirty_sectors          += disk_sectors;
+                       r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+               } else {
                        for (i = 0; i < p.ec_nr; i++) {
                                ret = bch2_mark_stripe_ptr(c, p.ec[i],
-                                               disk_sectors, flags,
-                                               &adjusted_disk_sectors,
-                                               &ec_redundancy, gc);
+                                               data_type, fs_usage,
+                                               disk_sectors, flags, gc);
                                if (ret)
                                        return ret;
                        }
-               if (!p.ptr.cached)
-                       replicas++;
 
-               if (p.ptr.cached)
-                       cached_sectors  += adjusted_disk_sectors;
-               else if (!p.ec_nr)
-                       dirty_sectors   += adjusted_disk_sectors;
-               else
-                       ec_sectors      += adjusted_disk_sectors;
+                       r.e.nr_required = 0;
+               }
        }
 
-       replicas        = clamp_t(unsigned,     replicas,
-                                 1, ARRAY_SIZE(fs_usage->replicas));
-       ec_redundancy   = clamp_t(unsigned,     ec_redundancy,
-                                 1, ARRAY_SIZE(fs_usage->replicas));
-
-       fs_usage->s.cached                                      += cached_sectors;
-       fs_usage->replicas[0].data[BCH_DATA_CACHED]             += cached_sectors;
-
-       fs_usage->s.data                                        += dirty_sectors;
-       fs_usage->replicas[replicas - 1].data[data_type]        += dirty_sectors;
-
-       fs_usage->s.data                                        += ec_sectors;
-       fs_usage->replicas[ec_redundancy - 1].ec_data           += ec_sectors;
+       if (dirty_sectors)
+               update_replicas(c, fs_usage, &r.e, dirty_sectors);
 
        return 0;
 }
                m->algorithm    = s.v->algorithm;
                m->nr_blocks    = s.v->nr_blocks;
                m->nr_redundant = s.v->nr_redundant;
+
+               memset(&m->r, 0, sizeof(m->r));
+
+               m->r.e.data_type        = BCH_DATA_USER;
+               m->r.e.nr_devs          = s.v->nr_blocks;
+               m->r.e.nr_required      = s.v->nr_blocks - s.v->nr_redundant;
+
+               for (i = 0; i < s.v->nr_blocks; i++)
+                       m->r.e.devs[i] = s.v->ptrs[i].dev;
        }
 
+       /*
+        * XXX: account for stripes somehow here
+        */
+#if 0
+       update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
+#endif
+
        if (!gc) {
                if (inserting)
                        bch2_stripes_heap_insert(c, m, idx);
                unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
 
                sectors *= replicas;
-               replicas = clamp_t(unsigned, replicas,
-                                  1, ARRAY_SIZE(fs_usage->replicas));
+               replicas = clamp_t(unsigned, replicas, 1,
+                                  ARRAY_SIZE(fs_usage->persistent_reserved));
 
-               fs_usage->s.reserved                                    += sectors;
-               fs_usage->replicas[replicas - 1].persistent_reserved    += sectors;
+               fs_usage->s.reserved                            += sectors;
+               fs_usage->persistent_reserved[replicas - 1]     += sectors;
                break;
        }
        default:
        struct btree_iter       *iter = insert->iter;
        struct btree            *b = iter->l[0].b;
        struct btree_node_iter  node_iter = iter->l[0].iter;
-       struct bch_fs_usage     fs_usage = { 0 };
+       struct bch_fs_usage     *fs_usage;
        struct gc_pos           pos = gc_pos_btree_node(b);
        struct bkey_packed      *_k;
+       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+       static int warned_disk_usage;
 
        if (!btree_node_type_needs_gc(iter->btree_id))
                return;
 
        percpu_down_read(&c->mark_lock);
+       preempt_disable();
+       fs_usage = bch2_fs_usage_get_scratch(c);
 
        if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
                bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
                        bpos_min(insert->k->k.p, b->key.k.p).offset -
                        bkey_start_offset(&insert->k->k),
-                       pos, &fs_usage, trans->journal_res.seq, 0);
+                       pos, fs_usage, trans->journal_res.seq, 0);
 
        while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
                                                      KEY_TYPE_discard))) {
                                BUG_ON(sectors <= 0);
 
                                bch2_mark_key_locked(c, k, true, sectors,
-                                       pos, &fs_usage, trans->journal_res.seq, 0);
+                                       pos, fs_usage, trans->journal_res.seq, 0);
 
                                sectors = bkey_start_offset(&insert->k->k) -
                                        k.k->p.offset;
                }
 
                bch2_mark_key_locked(c, k, false, sectors,
-                       pos, &fs_usage, trans->journal_res.seq, 0);
+                       pos, fs_usage, trans->journal_res.seq, 0);
 
                bch2_btree_node_iter_advance(&node_iter, b);
        }
 
-       bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos);
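+       /*
+        * if the insert used more sectors than were reserved, warn once and
+        * dump the key being inserted and the existing keys it overlaps:
+        */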
+       if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+           !warned_disk_usage &&
+           !xchg(&warned_disk_usage, 1)) {
+               char buf[200];
+
+               pr_err("disk usage increased by more than %llu sectors reserved", disk_res_sectors);
+
+               pr_err("while inserting");
+               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+               pr_err("%s", buf);
+               pr_err("overlapping with");
+
+               node_iter = iter->l[0].iter;
+               while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+                                                             KEY_TYPE_discard))) {
+                       struct bkey             unpacked;
+                       struct bkey_s_c         k;
+
+                       k = bkey_disassemble(b, _k, &unpacked);
 
+                       if (btree_node_is_extents(b)
+                           ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+                           : bkey_cmp(insert->k->k.p, k.k->p))
+                               break;
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       pr_err("%s", buf);
+
+                       bch2_btree_node_iter_advance(&node_iter, b);
+               }
+       }
+
+       preempt_enable();
        percpu_up_read(&c->mark_lock);
 }
 
 
 
 /* Filesystem usage: */
 
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
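+/*
+ * Zero and return this cpu's scratch bch_fs_usage: the caller must hold
+ * mark_lock and keep preemption disabled while the scratch buffer is in use.
+ */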
+static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+{
+       struct bch_fs_usage *ret;
+
+       ret = this_cpu_ptr(c->usage_scratch);
+
+       memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
+
+       return ret;
+}
+
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
 
 u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
                  bool, s64, struct gc_pos,
                  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-                        struct disk_reservation *, struct gc_pos);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+                       struct disk_reservation *, struct gc_pos);
 
 /* disk reservations: */
 
 
                u64             cached;
                u64             reserved;
                u64             nr_inodes;
+
+               /* XXX: add stats for compression ratio */
+#if 0
+               u64             uncompressed;
+               u64             compressed;
+#endif
        } s;
 
        /* broken out: */
-       struct {
-               u64             data[BCH_DATA_NR];
-               u64             ec_data;
-               u64             persistent_reserved;
-       }                       replicas[BCH_REPLICAS_MAX];
 
-       u64                     buckets[BCH_DATA_NR];
+       u64                     persistent_reserved[BCH_REPLICAS_MAX];
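+
+       /* one counter per entry in c->replicas, in the same order: */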
+       u64                     data[];
 };
 
 struct bch_fs_usage_short {
 
        }
 
        {
-               struct bch_fs_usage src = bch2_fs_usage_read(c);
+               struct bch_fs_usage *src;
                struct bch_ioctl_fs_usage dst = {
                        .capacity               = c->capacity,
-                       .used                   = bch2_fs_sectors_used(c, src),
-                       .online_reserved        = src.s.online_reserved,
                };
 
+               src = bch2_fs_usage_read(c);
+               if (!src)
+                       return -ENOMEM;
+
+               percpu_up_read(&c->mark_lock);
+
+               dst.used                = bch2_fs_sectors_used(c, *src);
+               dst.online_reserved     = src->s.online_reserved;
+
                for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                        dst.persistent_reserved[i] =
-                               src.replicas[i].persistent_reserved;
-
+                               src->persistent_reserved[i];
+#if 0
                        for (j = 0; j < BCH_DATA_NR; j++)
                                dst.sectors[j][i] = src.replicas[i].data[j];
+#endif
                }
 
+               kfree(src);
+
                ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
                if (ret)
                        return ret;
 
 
 #define EC_STRIPE_MAX  16
 
+struct bch_replicas_padded {
+       struct bch_replicas_entry       e;
+       u8                              pad[EC_STRIPE_MAX];
+};
+
 struct stripe {
        size_t                  heap_idx;
 
        u8                      alive;
        atomic_t                blocks_nonempty;
        atomic_t                block_sectors[EC_STRIPE_MAX];
+
+       struct bch_replicas_padded r;
 };
 
 struct ec_stripe_heap_entry {
 
        return ret == BCH_MERGE_MERGE;
 }
 
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+                              unsigned nr_replicas)
 {
        struct btree_iter iter;
        struct bpos end = pos;
        struct bkey_s_c k;
-       int ret = 0;
+       bool ret = true;
 
        end.offset += size;
 
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
-               if (!bch2_extent_is_fully_allocated(k)) {
-                       ret = -ENOSPC;
+               if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+                       ret = false;
                        break;
                }
        }
        return ret;
 }
 
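+/*
+ * Count the pointers backed by fully allocated (dirty, uncompressed) space;
+ * for a reservation, return its nr_replicas:
+ */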
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+       unsigned ret = 0;
+
+       switch (k.k->type) {
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+
+               extent_for_each_ptr_decode(e, p, entry)
+                       ret += !p.ptr.cached &&
+                               p.crc.compression_type == BCH_COMPRESSION_NONE;
+               break;
+       }
+       case KEY_TYPE_reservation:
+               ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+               break;
+       }
+
+       return ret;
+}
+
 /* KEY_TYPE_reservation: */
 
 const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
 
                BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 
 #endif /* _BCACHEFS_EXTENTS_H */
 
        }
 }
 
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
-                                    eytzinger_cmp_fn cmp, const void *search)
-{
-       size_t i = 0;
-       int res;
-
-       while (i < nr &&
-              (res = cmp(search, base + i * size, size)))
-               i = eytzinger0_child(i, res > 0);
-
-       return i;
-}
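+/*
+ * now a macro so the comparison argument can itself be a macro (see entry_cmp
+ * in replicas.c) and be inlined:
+ */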
+#define eytzinger0_find(base, nr, size, _cmp, search)                  \
+({                                                                     \
+       void *_base     = (base);                                       \
+       void *_search   = (search);                                     \
+       size_t _nr      = (nr);                                         \
+       size_t _size    = (size);                                       \
+       size_t _i       = 0;                                            \
+       int _res;                                                       \
+                                                                       \
+       while (_i < _nr &&                                              \
+              (_res = _cmp(_search, _base + _i * _size, _size)))       \
+               _i = eytzinger0_child(_i, _res > 0);                    \
+       _i;                                                             \
+})
 
 void eytzinger0_sort(void *, size_t, size_t,
                    int (*cmp_func)(const void *, const void *, size_t),
 
                BUG_ON(btree_iter_err(old));
 
                if (allocating &&
-                   !bch2_extent_is_fully_allocated(old))
+                   !*allocating &&
+                   bch2_bkey_nr_ptrs_allocated(old) <
+                   bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
                        *allocating = true;
 
                delta += (min(new->k.p.offset,
 {
        struct bvec_iter iter;
        struct bio_vec bv;
-       unsigned nr_ptrs = !bch2_extent_is_compressed(k)
-               ? bch2_bkey_nr_dirty_ptrs(k)
-               : 0;
+       unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
 
        bio_for_each_segment(bv, bio, iter) {
                /* brand new pages, don't need to be locked: */
        if (unlikely(ret))
                goto err;
 
+       dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas;
+
        ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
                                        dio->iop.op.opts.data_replicas, 0);
        if (unlikely(ret)) {
-               if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
-                                                     req->ki_pos >> 9),
-                                              iter->count >> 9))
+               if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
+                                                      req->ki_pos >> 9),
+                                               iter->count >> 9,
+                                               dio->iop.op.opts.data_replicas))
                        goto err;
 
                dio->iop.unalloc = true;
        }
 
-       dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
-
        return bch2_dio_write_loop(dio);
 err:
        bch2_disk_reservation_put(c, &dio->iop.op.res);
 
        }
 
        list_for_each_entry(i, list, list) {
+               struct bch_replicas_padded replicas;
+               char buf[80];
+
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
 
                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                    fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
-                                                      i->devs, false), c,
-                                "superblock not marked as containing replicas (type %u)",
-                                BCH_DATA_JOURNAL))) {
-                       ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+                                "superblock not marked as containing replicas %s",
+                                (bch2_replicas_entry_to_text(&PBUF(buf),
+                                                             &replicas.e), buf)))) {
+                       ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                return ret;
                }
        struct journal_buf *w = journal_prev_buf(j);
        struct bch_devs_list devs =
                bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+       struct bch_replicas_padded replicas;
        u64 seq = le64_to_cpu(w->data->seq);
        u64 last_seq = le64_to_cpu(w->data->last_seq);
 
                goto err;
        }
 
-       if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+       bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+       if (bch2_mark_replicas(c, &replicas.e))
                goto err;
 
        spin_lock(&j->lock);
 
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
-       struct bch_devs_list devs;
        u64 iter, seq = 0;
        int ret = 0;
 
 
        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
+               struct bch_replicas_padded replicas;
+
                seq = max(seq, journal_last_seq(j));
-               devs = journal_seq_pin(j, seq)->devs;
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+                                        journal_seq_pin(j, seq)->devs);
                seq++;
 
                spin_unlock(&j->lock);
-               ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+               ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);
 
 
 #include "bcachefs.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "extents.h"
 #include "io.h"
                bch2_btree_iter_unlock(&iter);
        }
 
+       /* flush relevant btree updates */
+       while (1) {
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_nr_pending(c) ||
+                                  c->btree_roots_dirty);
+               if (!bch2_btree_interior_updates_nr_pending(c))
+                       break;
+               bch2_journal_meta(&c->journal);
+       }
+
        ret = 0;
 out:
        ret = bch2_replicas_gc_end(c, ret);
 
 #include "alloc_foreground.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "disk_groups.h"
 #include "inode.h"
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
                ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
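+               /* flush relevant btree updates */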
+               while (1) {
+                       closure_wait_event(&c->btree_interior_update_wait,
+                                          !bch2_btree_interior_updates_nr_pending(c) ||
+                                          c->btree_roots_dirty);
+                       if (!bch2_btree_interior_updates_nr_pending(c))
+                               break;
+                       bch2_journal_meta(&c->journal);
+               }
+
                ret = bch2_gc_btree_replicas(c) ?: ret;
 
                ret = bch2_move_data(c, NULL,
 
 #include "replicas.h"
 #include "super-io.h"
 
-struct bch_replicas_padded {
-       struct bch_replicas_entry       e;
-       u8                              pad[BCH_SB_MEMBERS_MAX];
-};
-
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);
 
        return (l > r) - (l < r);
 }
 
+static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       unsigned i;
+
+       for (i = 0; i + 1 < e->nr_devs; i++)
+               BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
 static void replicas_entry_sort(struct bch_replicas_entry *e)
 {
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
             (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
             _i = (void *) (_i) + (_r)->entry_size)
 
-static inline struct bch_replicas_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-       return (void *) r->entries + r->entry_size * i;
-}
-
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
 }
 
-static void replicas_entry_to_text(struct printbuf *out,
-                                 struct bch_replicas_entry *e)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+                                struct bch_replicas_entry *e)
 {
        unsigned i;
 
                        pr_buf(out, " ");
                first = false;
 
-               replicas_entry_to_text(out, e);
+               bch2_replicas_entry_to_text(out, e);
        }
 }
 
                r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(struct bkey_s_c k,
-                            struct bch_replicas_entry *e)
+static void bkey_to_replicas(struct bch_replicas_entry *e,
+                            struct bkey_s_c k)
 {
        e->nr_devs = 0;
 
        replicas_entry_sort(e);
 }
 
-static inline void devlist_to_replicas(struct bch_devs_list devs,
-                                      enum bch_data_type data_type,
-                                      struct bch_replicas_entry *e)
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+                             enum bch_data_type data_type,
+                             struct bch_devs_list devs)
 {
        unsigned i;
 
                                        replicas_entry_bytes(new_entry)),
        };
 
+       BUG_ON(!new_entry->data_type);
+       verify_replicas_entry_sorted(new_entry);
+
        new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
        if (!new.entries)
                return new;
        return new;
 }
 
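+/*
+ * Returns the index of @search in the eytzinger ordered replicas table, or -1
+ * if it isn't present; @search must already be sorted.
+ */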
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+                                      struct bch_replicas_entry *search)
+{
+       int idx, entry_size = replicas_entry_bytes(search);
+
+       if (unlikely(entry_size > r->entry_size))
+               return -1;
+
+       verify_replicas_entry_sorted(search);
+
+#define entry_cmp(_l, _r, size)        memcmp(_l, _r, entry_size)
+       idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+                             entry_cmp, search);
+#undef entry_cmp
+
+       return idx < r->nr ? idx : -1;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *c,
+                           struct bch_replicas_entry *search)
+{
+       replicas_entry_sort(search);
+
+       return __replicas_entry_idx(&c->replicas, search);
+}
+
 static bool __replicas_has_entry(struct bch_replicas_cpu *r,
                                 struct bch_replicas_entry *search)
 {
-       return replicas_entry_bytes(search) <= r->entry_size &&
-               eytzinger0_find(r->entries, r->nr,
-                               r->entry_size,
-                               memcmp, search) < r->nr;
+       return __replicas_entry_idx(r, search) >= 0;
 }
 
-static bool replicas_has_entry(struct bch_fs *c,
-                              struct bch_replicas_entry *search,
-                              bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+                         struct bch_replicas_entry *search,
+                         bool check_gc_replicas)
 {
        bool marked;
 
+       if (!search->nr_devs)
+               return true;
+
+       verify_replicas_entry_sorted(search);
+
        percpu_down_read(&c->mark_lock);
        marked = __replicas_has_entry(&c->replicas, search) &&
                (!check_gc_replicas ||
        return marked;
 }
 
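+/*
+ * Accumulate the percpu counters in @src_p onto one cpu, then copy the fixed
+ * counters and each nonzero per replicas entry counter across to the new
+ * layout in @dst_p:
+ */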
+static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
+                                   struct bch_replicas_cpu *dst_r,
+                                   struct bch_fs_usage __percpu *src_p,
+                                   struct bch_replicas_cpu *src_r)
+{
+       unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+       struct bch_fs_usage *dst, *src = (void *)
+               bch2_acc_percpu_u64s((void *) src_p, src_nr);
+       int src_idx, dst_idx;
+
+       preempt_disable();
+       dst = this_cpu_ptr(dst_p);
+       preempt_enable();
+
+       *dst = *src;
+
+       for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+               if (!src->data[src_idx])
+                       continue;
+
+               dst_idx = __replicas_entry_idx(dst_r,
+                               cpu_replicas_entry(src_r, src_idx));
+               BUG_ON(dst_idx < 0);
+
+               dst->data[dst_idx] = src->data[src_idx];
+       }
+}
+
+/*
+ * Resize filesystem accounting: reallocate the percpu usage arrays and the
+ * scratch buffer for the new replicas table, copying existing counters over
+ */
+static int replicas_table_update(struct bch_fs *c,
+                                struct bch_replicas_cpu *new_r)
+{
+       struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
+       unsigned bytes = sizeof(struct bch_fs_usage) +
+               sizeof(u64) * new_r->nr;
+       unsigned i;
+       int ret = -ENOMEM;
+
+       for (i = 0; i < 3; i++) {
+               if (i < 2 && !c->usage[i])
+                       continue;
+
+               new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
+                                                 GFP_NOIO);
+               if (!new_usage[i])
+                       goto err;
+       }
+
+       for (i = 0; i < 2; i++) {
+               if (!c->usage[i])
+                       continue;
+
+               __replicas_table_update(new_usage[i],   new_r,
+                                       c->usage[i],    &c->replicas);
+
+               swap(c->usage[i], new_usage[i]);
+       }
+
+       swap(c->usage_scratch, new_usage[2]);
+
+       swap(c->replicas, *new_r);
+       ret = 0;
+err:
+       for (i = 0; i < 3; i++)
+               free_percpu(new_usage[i]);
+       return ret;
+}
+
 noinline
 static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                                struct bch_replicas_entry *new_entry)
        /* don't update in memory replicas until changes are persistent */
        percpu_down_write(&c->mark_lock);
        if (new_r.entries)
-               swap(new_r, c->replicas);
+               ret = replicas_table_update(c, &new_r);
        if (new_gc.entries)
                swap(new_gc, c->replicas_gc);
        percpu_up_write(&c->mark_lock);
        return ret;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-                               struct bch_replicas_entry *devs)
+int bch2_mark_replicas(struct bch_fs *c,
+                      struct bch_replicas_entry *r)
 {
-       return likely(replicas_has_entry(c, devs, true))
+       return likely(bch2_replicas_marked(c, r, true))
                ? 0
-               : bch2_mark_replicas_slowpath(c, devs);
+               : bch2_mark_replicas_slowpath(c, r);
 }
 
-int bch2_mark_replicas(struct bch_fs *c,
-                      enum bch_data_type data_type,
-                      struct bch_devs_list devs)
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+                              struct bkey_s_c k,
+                              bool check_gc_replicas)
 {
        struct bch_replicas_padded search;
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
 
-       if (!devs.nr)
-               return 0;
-
-       memset(&search, 0, sizeof(search));
+       for (i = 0; i < cached.nr; i++) {
+               bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-       BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+               if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
+                       return false;
+       }
 
-       devlist_to_replicas(devs, data_type, &search.e);
+       bkey_to_replicas(&search.e, k);
 
-       return __bch2_mark_replicas(c, &search.e);
+       return bch2_replicas_marked(c, &search.e, check_gc_replicas);
 }
 
 int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
        unsigned i;
        int ret;
 
-       memset(&search, 0, sizeof(search));
+       for (i = 0; i < cached.nr; i++) {
+               bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-       for (i = 0; i < cached.nr; i++)
-               if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-                                             bch2_dev_list_single(cached.devs[i]))))
+               ret = bch2_mark_replicas(c, &search.e);
+               if (ret)
                        return ret;
+       }
 
-       bkey_to_replicas(k, &search.e);
+       bkey_to_replicas(&search.e, k);
 
-       return search.e.nr_devs
-               ? __bch2_mark_replicas(c, &search.e)
-               : 0;
+       return bch2_mark_replicas(c, &search.e);
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
+       unsigned i;
+
        lockdep_assert_held(&c->replicas_gc_lock);
 
        mutex_lock(&c->sb_lock);
        if (ret)
                goto err;
 
+       /*
+        * this is kind of crappy; the replicas gc mechanism needs to be ripped
+        * out
+        */
+
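+       /*
+        * preserve entries that gc didn't see but that still have sectors
+        * accounted to them - otherwise replicas_table_update() below would
+        * have no slot for those counters:
+        */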
+       for (i = 0; i < c->replicas.nr; i++) {
+               struct bch_replicas_entry *e =
+                       cpu_replicas_entry(&c->replicas, i);
+               struct bch_replicas_cpu n;
+               u64 v = 0;
+               int cpu;
+
+               if (__replicas_has_entry(&c->replicas_gc, e))
+                       continue;
+
+               for_each_possible_cpu(cpu)
+                       v += *per_cpu_ptr(&c->usage[0]->data[i], cpu);
+               if (!v)
+                       continue;
+
+               n = cpu_replicas_add_entry(&c->replicas_gc, e);
+               if (!n.entries) {
+                       ret = -ENOSPC;
+                       goto err;
+               }
+
+               percpu_down_write(&c->mark_lock);
+               swap(n, c->replicas_gc);
+               percpu_up_write(&c->mark_lock);
+
+               kfree(n.entries);
+       }
+
        if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
                ret = -ENOSPC;
                goto err;
 err:
        percpu_down_write(&c->mark_lock);
        if (!ret)
-               swap(c->replicas, c->replicas_gc);
+               ret = replicas_table_update(c, &c->replicas_gc);
 
        kfree(c->replicas_gc.entries);
        c->replicas_gc.entries = NULL;
        bch2_cpu_replicas_sort(&new_r);
 
        percpu_down_write(&c->mark_lock);
-       swap(c->replicas, new_r);
+       ret = replicas_table_update(c, &new_r);
        percpu_up_write(&c->mark_lock);
 
        kfree(new_r.entries);
                        pr_buf(out, " ");
                first = false;
 
-               replicas_entry_to_text(out, e);
+               bch2_replicas_entry_to_text(out, e);
        }
 }
 
 
 /* Query replicas: */
 
-bool bch2_replicas_marked(struct bch_fs *c,
-                         enum bch_data_type data_type,
-                         struct bch_devs_list devs,
-                         bool check_gc_replicas)
-{
-       struct bch_replicas_padded search;
-
-       if (!devs.nr)
-               return true;
-
-       memset(&search, 0, sizeof(search));
-
-       devlist_to_replicas(devs, data_type, &search.e);
-
-       return replicas_has_entry(c, &search.e, check_gc_replicas);
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-                              struct bkey_s_c k,
-                              bool check_gc_replicas)
-{
-       struct bch_replicas_padded search;
-       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-       unsigned i;
-
-       memset(&search, 0, sizeof(search));
-
-       for (i = 0; i < cached.nr; i++)
-               if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-                                         bch2_dev_list_single(cached.devs[i]),
-                                         check_gc_replicas))
-                       return false;
-
-       bkey_to_replicas(k, &search.e);
-
-       return search.e.nr_devs
-               ? replicas_has_entry(c, &search.e, check_gc_replicas)
-               : true;
-}
-
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
 {
 
 #ifndef _BCACHEFS_REPLICAS_H
 #define _BCACHEFS_REPLICAS_H
 
+#include "eytzinger.h"
 #include "replicas_types.h"
 
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
-                         struct bch_devs_list, bool);
+void bch2_replicas_entry_to_text(struct printbuf *,
+                                struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+       return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+                           struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+                             enum bch_data_type,
+                             struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+                         struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+                      struct bch_replicas_entry *);
+
 bool bch2_bkey_replicas_marked(struct bch_fs *,
                               struct bkey_s_c, bool);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
-                      struct bch_devs_list);
 int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+                                             unsigned dev)
+{
+       e->data_type    = BCH_DATA_CACHED;
+       e->nr_devs      = 1;
+       e->nr_required  = 1;
+       e->devs[0]      = dev;
+}
 
 struct replicas_status {
        struct {
 
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        percpu_free_rwsem(&c->mark_lock);
+       free_percpu(c->usage_scratch);
        free_percpu(c->usage[0]);
        free_percpu(c->pcpu);
        mempool_exit(&c->btree_iters_pool);
 {
        struct bch_sb_field_members *mi;
        struct bch_fs *c;
-       unsigned i, iter_size;
+       unsigned i, iter_size, fs_usage_size;
        const char *err;
 
        pr_verbose_init(opts, "");
                (btree_blocks(c) + 1) * 2 *
                sizeof(struct btree_node_iter_set);
 
+       fs_usage_size = sizeof(struct bch_fs_usage) +
+               sizeof(u64) * c->replicas.nr;
+
        if (!(c->wq = alloc_workqueue("bcachefs",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcache_copygc",
                        max(offsetof(struct btree_read_bio, bio),
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
-           !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
+           !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
+           !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
 
 static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
        struct printbuf out = _PBUF(buf, PAGE_SIZE);
-       struct bch_fs_usage stats = bch2_fs_usage_read(c);
-       unsigned replicas, type;
+       struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+       unsigned i;
 
-       pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
+       if (!fs_usage)
+               return -ENOMEM;
 
-       for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) {
-               pr_buf(&out, "%u replicas:\n", replicas + 1);
+       pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
 
+       for (i = 0;
+            i < ARRAY_SIZE(fs_usage->persistent_reserved);
+            i++) {
+               pr_buf(&out, "%u replicas:\n", i + 1);
+#if 0
                for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
                        pr_buf(&out, "\t%s:\t\t%llu\n",
                               bch2_data_types[type],
                               stats.replicas[replicas].data[type]);
                pr_buf(&out, "\terasure coded:\t%llu\n",
                       stats.replicas[replicas].ec_data);
+#endif
                pr_buf(&out, "\treserved:\t%llu\n",
-                      stats.replicas[replicas].persistent_reserved);
+                      fs_usage->persistent_reserved[i]);
        }
 
-       pr_buf(&out, "bucket usage\n");
+       pr_buf(&out, "online reserved:\t%llu\n",
+              fs_usage->s.online_reserved);
 
-       for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
-               pr_buf(&out, "\t%s:\t\t%llu\n",
-                      bch2_data_types[type],
-                      stats.buckets[type]);
+       for (i = 0; i < c->replicas.nr; i++) {
+               struct bch_replicas_entry *e =
+                       cpu_replicas_entry(&c->replicas, i);
 
-       pr_buf(&out, "online reserved:\t%llu\n",
-              stats.s.online_reserved);
+               pr_buf(&out, "\t");
+               bch2_replicas_entry_to_text(&out, e);
+               pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+       }
+
+       percpu_up_read(&c->mark_lock);
+
+       kfree(fs_usage);
 
        return out.pos - buf;
 }
 
        kfree(test_array);
 }
 #endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
+ * access to the percpu counters is otherwise guarded against
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+       u64 *ret;
+       int cpu;
+
+       preempt_disable();
+       ret = this_cpu_ptr(p);
+       preempt_enable();
+
+       for_each_possible_cpu(cpu) {
+               u64 *i = per_cpu_ptr(p, cpu);
+
+               if (i != ret) {
+                       acc_u64s(ret, i, nr);
+                       memset(i, 0, nr * sizeof(u64));
+               }
+       }
+
+       return ret;
+}
 
                acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
 }
 
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
 #endif /* _BCACHEFS_UTIL_H */