return ret;
}
- percpu_down_write(&c->mark_lock);
- bch2_dev_usage_from_buckets(c);
- percpu_up_write(&c->mark_lock);
-
return 0;
}
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage __percpu *usage[2];
+ struct bch_dev_usage *usage_base;
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
struct journal_entry_res replicas_journal_res;
+ struct journal_entry_res dev_usage_journal_res;
+
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
x(blacklist_v2, 4) \
x(usage, 5) \
x(data_usage, 6) \
- x(clock, 7)
+ x(clock, 7) \
+ x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
__le64 time;
} __attribute__((packed));
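+/*
+ * Per-device usage, persisted in the journal so it can be loaded at mount
+ * time instead of being recomputed from the buckets array:
+ */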
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 buckets_ec;
+ __le64 buckets_unavailable;
+
+ struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
- free_percpu(ca->usage[1]);
- ca->usage[1] = NULL;
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
struct bch_dev *ca;
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
- unsigned i;
+ unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
}
}
- for_each_member_device(ca, c, i) {
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
- };
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
- bch2_dev_usage_from_buckets(c);
+ {
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
+
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ }
+ }
+ };
{
unsigned nr = fs_usage_u64s(c);
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
- BUG_ON(ca->usage[1]);
+ BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
return -ENOMEM;
}
- ca->usage[1] = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage[1]) {
- bch_err(c, "error allocating ca->usage[gc]");
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage_gc) {
+ bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
+ struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
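+ /* Superblock and journal buckets count towards hidden space: */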
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
+ ca->mi.bucket_size;
+ }
+
percpu_up_write(&c->mark_lock);
}
return ret;
}
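+/*
+ * Per-device counterpart of fs_usage_ptr(): returns the gc counters when gc
+ * is running, otherwise the percpu counters for the journal buffer this
+ * update will be written to:
+ */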
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+ unsigned journal_seq,
+ bool gc)
+{
+ return this_cpu_ptr(gc
+ ? ca->usage_gc
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
+ struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
+ unsigned seq, i, u64s = dev_usage_u64s();
- memset(&ret, 0, sizeof(ret));
- acc_u64s_percpu((u64 *) &ret,
- (u64 __percpu *) ca->usage[0],
- sizeof(ret) / sizeof(u64));
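+ /*
+ * Sum usage_base with the per-journal-buffer deltas not yet folded in,
+ * retrying if we race with bch2_fs_usage_acc_to_base():
+ */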
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- unsigned u64s = fs_usage_u64s(c);
+ struct bch_dev *ca;
+ unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ u64s = dev_usage_u64s();
+
+ acc_u64s_percpu((u64 *) ca->usage_base,
+ (u64 __percpu *) ca->usage[idx], u64s);
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+ }
+ rcu_read_unlock();
+
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
- bool gc)
+ u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
- u = this_cpu_ptr(ca->usage[gc]);
+ u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
bch2_wake_allocator(ca);
}
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct bucket_mark old = { .v.counter = 0 };
- struct bucket_array *buckets;
- struct bucket *g;
- unsigned i;
- int cpu;
-
- c->usage_base->hidden = 0;
-
- for_each_member_device(ca, c, i) {
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(ca->usage[0], cpu), 0,
- sizeof(*ca->usage[0]));
-
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- bch2_dev_usage_update(c, ca, c->usage_base,
- old, g->mark, false);
- }
-}
-
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
new.owned_by_allocator = owned_by_allocator;
}));
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ /*
+ * XXX: this is wrong: we're updating the percpu buckets_alloc counter
+ * without an open journal buffer, so we can race with the machinery
+ * that accumulates those counters into ca->usage_base
+ */
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
}
}));
- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
- old, new, gc);
+ old, new, 0, gc);
return 0;
}
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- free_percpu(ca->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ free_percpu(ca->usage[i]);
+ kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+ unsigned i;
+
+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+ if (!ca->usage_base)
return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[i])
+ return -ENOMEM;
+ }
+
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
READ_ONCE(c->replicas.nr);
}
+static inline unsigned dev_usage_u64s(void)
+{
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *);
return ret;
}
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+ unsigned dev;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, "invalid journal entry dev usage: bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, "invalid journal entry dev usage: bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, "invalid journal entry dev usage: bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
+
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
+ case BCH_JSET_ENTRY_dev_usage: {
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+ unsigned i;
+
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
+ ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
+
+ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+
+ break;
+ }
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
struct jset_entry **end,
u64 journal_seq)
{
- unsigned i;
+ struct bch_dev *ca;
+ unsigned i, dev;
percpu_down_read(&c->mark_lock);
"embedded variable length struct");
}
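+ /* Add a dev_usage entry per member device, from ca->usage_base: */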
+ for_each_member_device(ca, c, dev) {
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+ struct jset_entry_dev_usage *u =
+ container_of(jset_entry_init(end, b),
+ struct jset_entry_dev_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
+ u->dev = cpu_to_le32(dev);
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+ }
+ }
+
percpu_up_read(&c->mark_lock);
for (i = 0; i < 2; i++) {
return c;
}
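+/*
+ * Reserve space in every journal write for one dev_usage entry per member
+ * device; called whenever the set of member devices changes:
+ */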
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, u64s =
+ (sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) /
+ sizeof(u64);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ nr++;
+ rcu_read_unlock();
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->dev_usage_journal_res, u64s * nr);
+}
+
/* Filesystem RO/RW: */
/*
bch2_fs_fsio_init(c))
goto err;
+ bch2_dev_usage_journal_reserve(c);
+
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
+
+ bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
return ret;
}
-static void dev_usage_clear(struct bch_dev *ca)
-{
- struct bucket_array *buckets;
-
- percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
- up_read(&ca->bucket_lock);
-}
-
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
if (ret)
goto err;
- dev_usage_clear(ca);
-
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ bch2_dev_usage_journal_reserve(c);
+
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)