bcachefs: Persist 64 bit io clocks
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 21 Jan 2021 20:28:59 +0000 (15:28 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:52 +0000 (17:08 -0400)
Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket
was read from. But this required periodically rescaling the counters on
every bucket to avoid wraparound. That wasn't an issue in bcache, where
we'd periodically rewrite the per-bucket metadata all at once, but in
bcachefs we're trying to avoid having to walk every single bucket.
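
To make the wraparound problem concrete, here is a minimal userspace
sketch of the old u16 scheme - illustrative only, not the kernel code:
last-IO times are offsets from a 16 bit clock hand, so before the hand
can lap a stale bucket every bucket has to be walked and the distances
halved, mirroring the bch2_rescale_bucket_io_times() path removed below.

  /* Userspace sketch (not kernel code) of the old u16 io time scheme. */
  #include <stdint.h>
  #include <stdio.h>

  #define NBUCKETS 4

  static uint16_t hand = 1;
  static uint16_t io_time[NBUCKETS];

  static uint16_t bucket_last_io(unsigned b)
  {
  	return hand - io_time[b];	/* wraps once the hand laps a stale bucket */
  }

  static void rescale_bucket_io_times(void)
  {
  	/* walk every bucket and halve its distance from the hand */
  	for (unsigned b = 0; b < NBUCKETS; b++)
  		io_time[b] = hand - bucket_last_io(b) / 2;
  }

  int main(void)
  {
  	io_time[0] = hand;		/* bucket 0 was just read */

  	for (unsigned i = 0; i < 70000; i++) {	/* more ticks than a u16 can hold */
  		uint16_t max_last_io = 0;

  		for (unsigned b = 0; b < NBUCKETS; b++)
  			if (bucket_last_io(b) > max_last_io)
  				max_last_io = bucket_last_io(b);

  		if (max_last_io >= UINT16_MAX - 2)
  			rescale_bucket_io_times();
  		hand++;
  	}

  	printf("last_io of bucket 0 after 70000 ticks: %u\n", bucket_last_io(0));
  	return 0;
  }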

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.
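
For contrast, a minimal userspace sketch of the new approach: each IO
clock is a plain 64 bit counter that never needs rescaling, and its value
is snapshotted into a journal entry (mirroring the jset_entry_clock added
below) and restored at replay. The helpers clocks_to_journal() and
clocks_from_journal() are invented for illustration and do not exist in
the patch.

  /* Userspace sketch (not kernel code) of persisting 64 bit io clocks. */
  #include <stdint.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <string.h>

  struct io_clock {
  	atomic_uint_fast64_t	now;	/* advances by sectors read/written */
  };

  struct jset_entry_clock {
  	uint8_t		rw;		/* 0 == READ clock, 1 == WRITE clock */
  	uint8_t		pad[7];
  	uint64_t	time;		/* __le64 on disk; plain u64 here */
  };

  static struct io_clock io_clock[2];

  /* journal write path: snapshot both clocks into journal entries */
  static void clocks_to_journal(struct jset_entry_clock e[2])
  {
  	for (int i = 0; i < 2; i++) {
  		memset(&e[i], 0, sizeof(e[i]));
  		e[i].rw   = i;
  		e[i].time = atomic_load(&io_clock[i].now);
  	}
  }

  /* journal replay path: restore the clocks from the entries */
  static void clocks_from_journal(const struct jset_entry_clock e[2])
  {
  	for (int i = 0; i < 2; i++)
  		atomic_store(&io_clock[e[i].rw].now, e[i].time);
  }

  int main(void)
  {
  	struct jset_entry_clock e[2];

  	atomic_fetch_add(&io_clock[0].now, 1 << 20);	/* "read" some sectors */
  	atomic_fetch_add(&io_clock[1].now, 1 << 16);	/* "write" some sectors */

  	clocks_to_journal(e);

  	/* simulate a fresh mount */
  	atomic_store(&io_clock[0].now, 0);
  	atomic_store(&io_clock[1].now, 0);
  	clocks_from_journal(e);

  	printf("read clock %llu, write clock %llu\n",
  	       (unsigned long long) atomic_load(&io_clock[0].now),
  	       (unsigned long long) atomic_load(&io_clock[1].now));
  	return 0;
  }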

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
19 files changed:
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_types.h
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_gc.c
fs/bcachefs/buckets.h
fs/bcachefs/buckets_types.h
fs/bcachefs/clock.c
fs/bcachefs/clock_types.h
fs/bcachefs/journal.c
fs/bcachefs/journal_io.c
fs/bcachefs/movinggc.c
fs/bcachefs/rebalance.c
fs/bcachefs/rebalance_types.h
fs/bcachefs/recovery.c
fs/bcachefs/super-io.c
fs/bcachefs/super-io.h
fs/bcachefs/super.c
fs/bcachefs/sysfs.c

index 9a670bb2ccfbbebe7887c7bf17d600e73807b26c..bba83011b18b8a5005bf052c50217c3a68f7b2df 100644 (file)
@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #undef x
 };
 
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+       int ret;
 
        down_read(&c->gc_lock);
        ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
        bch2_dev_usage_from_buckets(c);
        percpu_up_write(&c->mark_lock);
 
-       mutex_lock(&c->bucket_clock[READ].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, READ);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[READ].lock);
-
-       mutex_lock(&c->bucket_clock[WRITE].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, WRITE);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[WRITE].lock);
-
        return 0;
 }
 
@@ -460,114 +440,6 @@ err:
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket *g;
-       u16 max_last_io = 0;
-       unsigned i;
-
-       lockdep_assert_held(&c->bucket_clock[rw].lock);
-
-       /* Recalculate max_last_io for this device: */
-       for_each_bucket(g, buckets)
-               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
-       ca->max_last_bucket_io[rw] = max_last_io;
-
-       /* Recalculate global max_last_io: */
-       max_last_io = 0;
-
-       for_each_member_device(ca, c, i)
-               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
-       clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets;
-       struct bch_dev *ca;
-       struct bucket *g;
-       unsigned i;
-
-       trace_rescale_prios(c);
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       g->io_time[rw] = clock->hand -
-                       bucket_last_io(c, g, rw) / 2;
-
-               bch2_recalc_oldest_io(c, ca, rw);
-
-               up_read(&ca->bucket_lock);
-       }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
-       return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
-       struct bucket_clock *clock = container_of(timer,
-                                               struct bucket_clock, rescale);
-       struct bch_fs *c = container_of(clock,
-                                       struct bch_fs, bucket_clock[clock->rw]);
-       struct bch_dev *ca;
-       u64 capacity;
-       unsigned i;
-
-       mutex_lock(&clock->lock);
-
-       /* if clock cannot be advanced more, rescale prio */
-       if (clock->max_last_io >= U16_MAX - 2)
-               bch2_rescale_bucket_io_times(c, clock->rw);
-
-       BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
-       for_each_member_device(ca, c, i)
-               ca->max_last_bucket_io[clock->rw]++;
-       clock->max_last_io++;
-       clock->hand++;
-
-       mutex_unlock(&clock->lock);
-
-       capacity = READ_ONCE(c->capacity);
-
-       if (!capacity)
-               return;
-
-       /*
-        * we only increment when 0.1% of the filesystem capacity has been read
-        * or written too, this determines if it's time
-        *
-        * XXX: we shouldn't really be going off of the capacity of devices in
-        * RW mode (that will be 0 when we're RO, yet we can still service
-        * reads)
-        */
-       timer->expire += bucket_clock_freq(capacity);
-
-       bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-
-       clock->hand             = 1;
-       clock->rw               = rw;
-       clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = bucket_clock_freq(c->capacity);
-       mutex_init(&clock->lock);
-}
-
 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
 {
@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        struct bucket *g;
        struct bkey_alloc_buf *a;
        struct bkey_alloc_unpacked u;
-       u64 *time;
+       u64 *time, now;
        int ret = 0;
 
        iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        percpu_up_read(&c->mark_lock);
 
        time = rw == READ ? &u.read_time : &u.write_time;
-       if (*time == c->bucket_clock[rw].hand)
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (*time == now)
                goto out;
 
-       *time = c->bucket_clock[rw].hand;
+       *time = now;
 
        bch2_alloc_pack(c, a, u);
        ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
-                                      size_t bucket,
-                                      struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+                                      struct bucket_mark m)
 {
        u8 gc_gen;
 
-       if (!is_available_bucket(mark))
+       if (!is_available_bucket(m))
                return false;
 
-       if (mark.owned_by_allocator)
+       if (m.owned_by_allocator)
                return false;
 
        if (ca->buckets_nouse &&
-           test_bit(bucket, ca->buckets_nouse))
+           test_bit(b, ca->buckets_nouse))
                return false;
 
-       gc_gen = bucket_gc_gen(ca, bucket);
+       gc_gen = bucket_gc_gen(bucket(ca, b));
 
        if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
                ca->inc_gen_needs_gc++;
@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
 /*
  * Determines what order we're going to reuse buckets, smallest bucket_key()
  * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- *   indication of how hot the data is -- we scale the prio so that the prio
- *   farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- *   indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
- *   number repeatedly forces us to run mark and sweep gc to avoid generation
- *   number wraparound.
  */
 
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+                               u64 now, u64 last_seq_ondisk)
 {
-       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
-       unsigned max_last_io = ca->max_last_bucket_io[READ];
-
-       /*
-        * Time since last read, scaled to [0, 8) where larger value indicates
-        * more recently read data:
-        */
-       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
-       /* How much we want to keep the data in this bucket: */
-       unsigned long data_wantness =
-               (hotness + 1) * bucket_sectors_used(m);
+       unsigned used = bucket_sectors_used(m);
 
-       unsigned long needs_journal_commit =
-               bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+       if (used) {
+               /*
+                * Prefer to keep buckets that have been read more recently, and
+                * buckets that have more data in them:
+                */
+               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+               u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
 
-       return  (data_wantness << 9) |
-               (needs_journal_commit << 8) |
-               (bucket_gc_gen(ca, b) / 16);
+               return -last_read_scaled;
+       } else {
+               /*
+                * Prefer to use buckets with smaller gc_gen so that we don't
+                * have to walk the btree and recalculate oldest_gen - but shift
+                * off the low bits so that buckets will still have equal sort
+                * keys when there's only a small difference, so that we can
+                * keep sequential buckets together:
+                */
+               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+                       (bucket_gc_gen(g) >> 4);
+       }
 }
 
 static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        struct alloc_heap_entry e = { 0 };
+       u64 now, last_seq_ondisk;
        size_t b, i, nr = 0;
 
-       ca->alloc_heap.used = 0;
-
-       mutex_lock(&c->bucket_clock[READ].lock);
        down_read(&ca->bucket_lock);
 
        buckets = bucket_array(ca);
-
-       bch2_recalc_oldest_io(c, ca, READ);
+       ca->alloc_heap.used = 0;
+       now = atomic64_read(&c->io_clock[READ].now);
+       last_seq_ondisk = c->journal.last_seq_ondisk;
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
         * all buckets have been visited.
         */
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
-               unsigned long key = bucket_sort_key(c, ca, b, m);
+               struct bucket *g = &buckets->b[b];
+               struct bucket_mark m = READ_ONCE(g->mark);
+               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
 
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        }
 
        up_read(&ca->bucket_lock);
-       mutex_unlock(&c->bucket_clock[READ].lock);
 }
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -1031,8 +892,8 @@ retry:
        u.data_type     = 0;
        u.dirty_sectors = 0;
        u.cached_sectors = 0;
-       u.read_time     = c->bucket_clock[READ].hand;
-       u.write_time    = c->bucket_clock[WRITE].hand;
+       u.read_time     = atomic64_read(&c->io_clock[READ].now);
+       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
 
        bch2_alloc_pack(c, &a, u);
        bch2_trans_update(trans, iter, &a.k,
@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
-       bch2_bucket_clock_init(c, READ);
-       bch2_bucket_clock_init(c, WRITE);
 
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
index 1abfff5290bc52e5ef263558a3dfca7287751a42..be164d6108bbcdbb3f2ca48bf8e6dc0618d4b6a8 100644 (file)
 
 struct ec_bucket_buf;
 
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
-       /*
-        * "now" in (read/write) IO time - incremented whenever we do X amount
-        * of reads or writes.
-        *
-        * Goes with the bucket read/write prios: when we read or write to a
-        * bucket we reset the bucket's prio to the current hand; thus hand -
-        * prio = time since bucket was last read/written.
-        *
-        * The units are some amount (bytes/sectors) of data read/written, and
-        * the units can change on the fly if we need to rescale to fit
-        * everything in a u16 - your only guarantee is that the units are
-        * consistent.
-        */
-       u16                     hand;
-       u16                     max_last_io;
-
-       int                     rw;
-
-       struct io_timer         rescale;
-       struct mutex            lock;
-};
-
 enum alloc_reserve {
        RESERVE_BTREE_MOVINGGC  = -2,
        RESERVE_BTREE           = -1,
index bd675b88b35412db31ac864f1004382b29b62bd7..763cac0efa0cca87f5b1a3db42734f488c9d9ee7 100644 (file)
@@ -451,9 +451,6 @@ struct bch_dev {
 
        size_t                  fifo_last_bucket;
 
-       /* last calculated minimum prio */
-       u16                     max_last_bucket_io[2];
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
 
@@ -693,14 +690,6 @@ struct bch_fs {
        struct mutex            usage_scratch_lock;
        struct bch_fs_usage_online *usage_scratch;
 
-       /*
-        * When we invalidate buckets, we use both the priority and the amount
-        * of good data to determine which buckets to reuse first - to weight
-        * those together consistently we keep track of the smallest nonzero
-        * priority of any bucket.
-        */
-       struct bucket_clock     bucket_clock[2];
-
        struct io_clock         io_clock[2];
 
        /* JOURNAL SEQ BLACKLIST */
index b6c7e57b6bcda328f673226df0a92f24443ca18f..5dab5bfd228a9538de940c9092cbc90db5a9855d 100644 (file)
@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean {
        struct bch_sb_field     field;
 
        __le32                  flags;
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
        __le64                  journal_seq;
 
        union {
@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(blacklist,            3)              \
        x(blacklist_v2,         4)              \
        x(usage,                5)              \
-       x(data_usage,           6)
+       x(data_usage,           6)              \
+       x(clock,                7)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage {
        struct bch_replicas_entry r;
 } __attribute__((packed));
 
+struct jset_entry_clock {
+       struct jset_entry       entry;
+       __u8                    rw;
+       __u8                    pad[7];
+       __le64                  time;
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
@@ -1581,8 +1589,8 @@ struct jset {
 
        __u8                    encrypted_start[0];
 
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
 
        /* Sequence number of oldest dirty journal entry */
        __le64                  last_seq;
index 9e123736a125ddf7e645ec966684def18655069d..5ea9bae09d5905c38efbc555b499309ddf7f7af2 100644 (file)
@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last = atomic_long_read(&clock->now);
+       unsigned long last = atomic64_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);
        int ret;
 
@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg)
                        if (c->btree_gc_periodic) {
                                unsigned long next = last + c->capacity / 16;
 
-                               if (atomic_long_read(&clock->now) >= next)
+                               if (atomic64_read(&clock->now) >= next)
                                        break;
 
                                bch2_io_clock_schedule_timeout(clock, next);
@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg)
                }
                __set_current_state(TASK_RUNNING);
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
                /*
index 4103ea7e769a4c10823639ee32cc826d72c75383..50989d2861908ce5789d77687df8f6aafc8f7601 100644 (file)
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
        return __bucket(ca, b, false);
 }
 
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
-       return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree.
  */
 
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
 {
-       struct bucket *g = bucket(ca, b);
-
        return g->mark.gen - g->oldest_gen;
 }
 
index 99ab9f48ba9d48df7e324890f8437f1c5be47cf1..b6ea67506cc28821bf388212d775decf3029c790 100644 (file)
@@ -37,7 +37,7 @@ struct bucket {
                const struct bucket_mark mark;
        };
 
-       u16                             io_time[2];
+       u64                             io_time[2];
        u8                              oldest_gen;
        u8                              gc_gen;
        unsigned                        gen_valid:1;
index 869ba188775775c07aeedc70d86530781a0ce7f7..da91c95e3ffc213ce3a70525006b53fc86e98222 100644 (file)
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
 
        spin_lock(&clock->timer_lock);
 
-       if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+       if (time_after_eq((unsigned long) atomic64_read(&clock->now),
                          timer->expire)) {
                spin_unlock(&clock->timer_lock);
                timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
 void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
 {
        struct io_timer *timer;
-       unsigned long now = atomic_long_add_return(sectors, &clock->now);
+       unsigned long now = atomic64_add_return(sectors, &clock->now);
 
        while ((timer = get_expired_timer(clock, now)))
                timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
        unsigned i;
 
        spin_lock(&clock->timer_lock);
-       now = atomic_long_read(&clock->now);
+       now = atomic64_read(&clock->now);
 
        for (i = 0; i < clock->timers.used; i++)
                pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
 
 int bch2_io_clock_init(struct io_clock *clock)
 {
-       atomic_long_set(&clock->now, 0);
+       atomic64_set(&clock->now, 0);
        spin_lock_init(&clock->timer_lock);
 
        clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
index 92c740a475656da2093528b802320ad73eabbe46..5fae0012d808f7a1b5f4e5334804eee50c31d577 100644 (file)
@@ -26,7 +26,7 @@ struct io_timer {
 typedef HEAP(struct io_timer *)        io_timer_heap;
 
 struct io_clock {
-       atomic_long_t           now;
+       atomic64_t              now;
        u16 __percpu            *pcpu_buf;
        unsigned                max_slop;
 
index ba37c78c01dbdb76f62f9feffece8a27c08650db..379b9ad2c0f9317058774feaecd6206e9a6c4813 100644 (file)
@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j)
        j->entry_u64s_reserved +=
                BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
 
+       j->entry_u64s_reserved +=
+               2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
+
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
index 7e726db7788115cb1bbf907180db99fb02e98d37..a82548983dbd28a5d9aff9879c4cafa25d27078c 100644 (file)
@@ -426,6 +426,32 @@ fsck_err:
        return ret;
 }
 
+static int journal_entry_validate_clock(struct bch_fs *c,
+                                       struct jset *jset,
+                                       struct jset_entry *entry,
+                                       int write)
+{
+       struct jset_entry_clock *clock =
+               container_of(entry, struct jset_entry_clock, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       int ret = 0;
+
+       if (journal_entry_err_on(bytes != sizeof(*clock),
+                                c, "invalid journal entry clock: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       if (journal_entry_err_on(clock->rw > 1,
+                                c, "invalid journal entry clock: bad rw")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+fsck_err:
+       return ret;
+}
+
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, int);
@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl)
 
        end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
 
-       end     = bch2_journal_super_entries_add_common(c, end,
-                                               le64_to_cpu(jset->seq));
+       bch2_journal_super_entries_add_common(c, &end,
+                               le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl)
 
        journal_write_compact(jset);
 
-       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
-       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
        jset->magic             = cpu_to_le64(jset_magic(c));
-
        jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);
index 8e6e4cd7388694a31d4289b95b4db91ea2f11f80..e2472c19beafa44199813c94f98718714da26a72 100644 (file)
@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last, wait;
+       u64 last, wait;
 
        set_freezable();
 
@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
                if (kthread_wait_freezable(c->copy_gc_enabled))
                        break;
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                wait = bch2_copygc_wait_amount(c);
 
                if (wait > clock->max_slop) {
index f9a12dd797a5f2c4cbda6bd515da5d488d5c8803..2263ee41c44478c1671344b21f73f0bee3f7d126 100644 (file)
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
-       unsigned long io_start;
+       u64 io_start;
        long throttle;
 
        set_freezable();
 
-       io_start        = atomic_long_read(&clock->now);
+       io_start        = atomic64_read(&clock->now);
        p               = rebalance_work(c);
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
                                        (20 - w.dev_most_full_percent),
                                        50);
 
-                       if (atomic_long_read(&clock->now) + clock->max_slop <
+                       if (atomic64_read(&clock->now) + clock->max_slop <
                            r->throttled_until_iotime) {
                                r->throttled_until_cputime = start + throttle;
                                r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
                              max(p.dev_most_full_percent, 1U) /
                              max(w.dev_most_full_percent, 1U));
 
-               io_start        = atomic_long_read(&clock->now);
+               io_start        = atomic64_read(&clock->now);
                p               = w;
                prev_start      = start;
                prev_cputime    = cputime;
@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
        case REBALANCE_THROTTLED:
                bch2_hprint(&PBUF(h1),
                            (r->throttled_until_iotime -
-                            atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+                            atomic64_read(&c->io_clock[WRITE].now)) << 9);
                pr_buf(out, "throttled for %lu sec or %s io\n",
                       (r->throttled_until_cputime - jiffies) / HZ,
                       h1);
index 192c6be20cedd841311518fbee9028f07f09b23b..2f62a643c39fbb0c08f024fbf58a7f3325755875 100644 (file)
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
        atomic64_t              work_unknown_dev;
 
        enum rebalance_state    state;
-       unsigned long           throttled_until_iotime;
+       u64                     throttled_until_iotime;
        unsigned long           throttled_until_cputime;
        struct bch_move_stats   move_stats;
 
index f470e0e233ce949c46480cb57242fae6d34074d3..55f7771e11c8b38d52b8228ff3b90378d64f9c4d 100644 (file)
@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
                                le64_to_cpu(bl_entry->end) + 1);
                break;
        }
+       case BCH_JSET_ENTRY_clock: {
+               struct jset_entry_clock *clock =
+                       container_of(entry, struct jset_entry_clock, entry);
+
+               atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+       }
        }
 
        return ret;
@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c,
        int ret;
 
        if (clean) {
-               c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
                for (entry = clean->start;
                     entry != vstruct_end(&clean->field);
                     entry = vstruct_next(entry)) {
@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c,
                        if (i->ignore)
                                continue;
 
-                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c,
                return 0;
        }
 
-       mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-                       "superblock read clock %u doesn't match journal %u after clean shutdown",
-                       clean->read_clock, j->read_clock);
-       mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-                       "superblock write clock %u doesn't match journal %u after clean shutdown",
-                       clean->write_clock, j->write_clock);
-
        for (i = 0; i < BTREE_ID_NR; i++) {
                char buf1[200], buf2[200];
                struct bkey_i *k1, *k2;
index 61b947313c88df43ba128e0a5a98ef4143218033..3b082da934fbff5d6bb30412959763673f3418d6 100644 (file)
@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
        return ret;
 }
 
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
 {
-       memset(entry, 0, u64s * sizeof(u64));
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
 
+       memset(entry, 0, u64s * sizeof(u64));
        /*
         * The u64s field counts from the start of data, ignoring the shared
         * fields.
         */
        entry->u64s = u64s - 1;
-}
 
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-       entry_init_u64s(entry, u64s);
+       *end = vstruct_next(*end);
+       return entry;
 }
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                     struct jset_entry *entry,
-                                     u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+                                          struct jset_entry **end,
+                                          u64 journal_seq)
 {
        unsigned i;
 
@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_INODES;
                u->v            = cpu_to_le64(c->usage_base->nr_inodes);
-
-               entry = vstruct_next(entry);
        }
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_KEY_VERSION;
                u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_RESERVED;
                u->entry.level  = i;
                u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct jset_entry_data_usage *u =
-                       container_of(entry, struct jset_entry_data_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+                                    struct jset_entry_data_usage, entry);
 
-               entry_init_size(entry, sizeof(*u) + e->nr_devs);
                u->entry.type   = BCH_JSET_ENTRY_data_usage;
                u->v            = cpu_to_le64(c->usage_base->replicas[i]);
                unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
                              "embedded variable length struct");
-
-               entry = vstruct_next(entry);
        }
 
        percpu_up_read(&c->mark_lock);
 
-       return entry;
+       for (i = 0; i < 2; i++) {
+               struct jset_entry_clock *clock =
+                       container_of(jset_entry_init(end, sizeof(*clock)),
+                                    struct jset_entry_clock, entry);
+
+               clock->entry.type = BCH_JSET_ENTRY_clock;
+               clock->rw       = i;
+               clock->time     = atomic64_read(&c->io_clock[i].now);
+       }
 }
 
 void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        }
 
        sb_clean->flags         = 0;
-       sb_clean->read_clock    = cpu_to_le16(c->bucket_clock[READ].hand);
-       sb_clean->write_clock   = cpu_to_le16(c->bucket_clock[WRITE].hand);
        sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
 
        /* Trying to catch outstanding bug: */
        BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
 
        entry = sb_clean->start;
-       entry = bch2_journal_super_entries_add_common(c, entry, 0);
+       bch2_journal_super_entries_add_common(c, &entry, 0);
        entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
        BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
index 402ae563b3c70a1f7dff18988479a216d12b185a..dd8d4ba911f0393ff18a2d4789de313df3a6e6e5 100644 (file)
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_clean: */
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
-                                     struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+                                          struct jset_entry **, u64);
 
 void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
 
index d451a29b517b5eb8420eab2b85b4d12195a5b475..5f5893ab9edfec89ce45ecb59260947049cda669 100644 (file)
@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        /*
         * Flush journal before stopping allocators, because flushing journal
         * blacklist entries involves allocating new btree nodes:
@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        for_each_rw_member(ca, c, i) {
                ret = bch2_dev_allocator_start(ca);
                if (ret) {
index 521b6d8d518f46c0bd23c136f818d45f12f99044..8fdbeaf9df321229e0ef0579e872924b3505907a 100644 (file)
@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
 {
        int rw = (private ? 1 : 0);
 
-       return bucket_last_io(c, bucket(ca, b), rw);
+       return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
 }
 
 static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
 static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
                                     size_t b, void *private)
 {
-       return bucket_gc_gen(ca, b);
+       return bucket_gc_gen(bucket(ca, b));
 }
 
 static int unsigned_cmp(const void *_l, const void *_r)