bcachefs: Persist 64 bit io clocks
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 21 Jan 2021 20:28:59 +0000 (15:28 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:52 +0000 (17:08 -0400)
Originally, bcachefs - going back to bcache - stored, for each bucket, a
16 bit counter corresponding to how long it had been since the bucket
was read from. But this required periodically rescaling the counters on
every bucket to avoid wraparound. That wasn't an issue in bcache, where
we'd periodically rewrite the per-bucket metadata all at once, but in
bcachefs we're trying to avoid having to walk every single bucket.
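
To make the wraparound problem concrete, here is a minimal userspace
sketch of the old u16 scheme - illustrative only, not the kernel code:
last-IO times are offsets from a 16 bit clock hand, so before the hand
can lap a stale bucket every bucket has to be walked and the distances
halved, mirroring the bch2_rescale_bucket_io_times() path removed below.

  /* Userspace sketch (not kernel code) of the old u16 io time scheme. */
  #include <stdint.h>
  #include <stdio.h>

  #define NBUCKETS 4

  static uint16_t hand = 1;
  static uint16_t io_time[NBUCKETS];

  static uint16_t bucket_last_io(unsigned b)
  {
  	return hand - io_time[b];	/* wraps once the hand laps a stale bucket */
  }

  static void rescale_bucket_io_times(void)
  {
  	/* walk every bucket and halve its distance from the hand */
  	for (unsigned b = 0; b < NBUCKETS; b++)
  		io_time[b] = hand - bucket_last_io(b) / 2;
  }

  int main(void)
  {
  	io_time[0] = hand;		/* bucket 0 was just read */

  	for (unsigned i = 0; i < 70000; i++) {	/* more ticks than a u16 can hold */
  		uint16_t max_last_io = 0;

  		for (unsigned b = 0; b < NBUCKETS; b++)
  			if (bucket_last_io(b) > max_last_io)
  				max_last_io = bucket_last_io(b);

  		if (max_last_io >= UINT16_MAX - 2)
  			rescale_bucket_io_times();
  		hand++;
  	}

  	printf("last_io of bucket 0 after 70000 ticks: %u\n", bucket_last_io(0));
  	return 0;
  }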

This patch switches to persisting 64 bit io clocks, corresponding to the
64 bit bucket timestamps introduced in the previous patch with
KEY_TYPE_alloc_v2.
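
For contrast, a minimal userspace sketch of the new approach: each IO
clock is a plain 64 bit counter that never needs rescaling, and its value
is snapshotted into a journal entry (mirroring the jset_entry_clock added
below) and restored at replay. The helpers clocks_to_journal() and
clocks_from_journal() are invented for illustration and do not exist in
the patch.

  /* Userspace sketch (not kernel code) of persisting 64 bit io clocks. */
  #include <stdint.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <string.h>

  struct io_clock {
  	atomic_uint_fast64_t	now;	/* advances by sectors read/written */
  };

  struct jset_entry_clock {
  	uint8_t		rw;		/* 0 == READ clock, 1 == WRITE clock */
  	uint8_t		pad[7];
  	uint64_t	time;		/* __le64 on disk; plain u64 here */
  };

  static struct io_clock io_clock[2];

  /* journal write path: snapshot both clocks into journal entries */
  static void clocks_to_journal(struct jset_entry_clock e[2])
  {
  	for (int i = 0; i < 2; i++) {
  		memset(&e[i], 0, sizeof(e[i]));
  		e[i].rw   = i;
  		e[i].time = atomic_load(&io_clock[i].now);
  	}
  }

  /* journal replay path: restore the clocks from the entries */
  static void clocks_from_journal(const struct jset_entry_clock e[2])
  {
  	for (int i = 0; i < 2; i++)
  		atomic_store(&io_clock[e[i].rw].now, e[i].time);
  }

  int main(void)
  {
  	struct jset_entry_clock e[2];

  	atomic_fetch_add(&io_clock[0].now, 1 << 20);	/* "read" some sectors */
  	atomic_fetch_add(&io_clock[1].now, 1 << 16);	/* "write" some sectors */

  	clocks_to_journal(e);

  	/* simulate a fresh mount */
  	atomic_store(&io_clock[0].now, 0);
  	atomic_store(&io_clock[1].now, 0);
  	clocks_from_journal(e);

  	printf("read clock %llu, write clock %llu\n",
  	       (unsigned long long) atomic_load(&io_clock[0].now),
  	       (unsigned long long) atomic_load(&io_clock[1].now));
  	return 0;
  }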

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
19 files changed:
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_types.h
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_gc.c
fs/bcachefs/buckets.h
fs/bcachefs/buckets_types.h
fs/bcachefs/clock.c
fs/bcachefs/clock_types.h
fs/bcachefs/journal.c
fs/bcachefs/journal_io.c
fs/bcachefs/movinggc.c
fs/bcachefs/rebalance.c
fs/bcachefs/rebalance_types.h
fs/bcachefs/recovery.c
fs/bcachefs/super-io.c
fs/bcachefs/super-io.h
fs/bcachefs/super.c
fs/bcachefs/sysfs.c

index 9a670bb2ccfbbebe7887c7bf17d600e73807b26c..bba83011b18b8a5005bf052c50217c3a68f7b2df 100644 (file)
@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #undef x
 };
 
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+       int ret;
 
        down_read(&c->gc_lock);
        ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
        bch2_dev_usage_from_buckets(c);
        percpu_up_write(&c->mark_lock);
 
-       mutex_lock(&c->bucket_clock[READ].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, READ);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[READ].lock);
-
-       mutex_lock(&c->bucket_clock[WRITE].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, WRITE);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[WRITE].lock);
-
        return 0;
 }
 
@@ -460,114 +440,6 @@ err:
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket *g;
-       u16 max_last_io = 0;
-       unsigned i;
-
-       lockdep_assert_held(&c->bucket_clock[rw].lock);
-
-       /* Recalculate max_last_io for this device: */
-       for_each_bucket(g, buckets)
-               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
-       ca->max_last_bucket_io[rw] = max_last_io;
-
-       /* Recalculate global max_last_io: */
-       max_last_io = 0;
-
-       for_each_member_device(ca, c, i)
-               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
-       clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets;
-       struct bch_dev *ca;
-       struct bucket *g;
-       unsigned i;
-
-       trace_rescale_prios(c);
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       g->io_time[rw] = clock->hand -
-                       bucket_last_io(c, g, rw) / 2;
-
-               bch2_recalc_oldest_io(c, ca, rw);
-
-               up_read(&ca->bucket_lock);
-       }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
-       return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
-       struct bucket_clock *clock = container_of(timer,
-                                               struct bucket_clock, rescale);
-       struct bch_fs *c = container_of(clock,
-                                       struct bch_fs, bucket_clock[clock->rw]);
-       struct bch_dev *ca;
-       u64 capacity;
-       unsigned i;
-
-       mutex_lock(&clock->lock);
-
-       /* if clock cannot be advanced more, rescale prio */
-       if (clock->max_last_io >= U16_MAX - 2)
-               bch2_rescale_bucket_io_times(c, clock->rw);
-
-       BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
-       for_each_member_device(ca, c, i)
-               ca->max_last_bucket_io[clock->rw]++;
-       clock->max_last_io++;
-       clock->hand++;
-
-       mutex_unlock(&clock->lock);
-
-       capacity = READ_ONCE(c->capacity);
-
-       if (!capacity)
-               return;
-
-       /*
-        * we only increment when 0.1% of the filesystem capacity has been read
-        * or written too, this determines if it's time
-        *
-        * XXX: we shouldn't really be going off of the capacity of devices in
-        * RW mode (that will be 0 when we're RO, yet we can still service
-        * reads)
-        */
-       timer->expire += bucket_clock_freq(capacity);
-
-       bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-
-       clock->hand             = 1;
-       clock->rw               = rw;
-       clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = bucket_clock_freq(c->capacity);
-       mutex_init(&clock->lock);
-}
-
 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
 {
@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        struct bucket *g;
        struct bkey_alloc_buf *a;
        struct bkey_alloc_unpacked u;
-       u64 *time;
+       u64 *time, now;
        int ret = 0;
 
        iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        percpu_up_read(&c->mark_lock);
 
        time = rw == READ ? &u.read_time : &u.write_time;
-       if (*time == c->bucket_clock[rw].hand)
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (*time == now)
                goto out;
 
-       *time = c->bucket_clock[rw].hand;
+       *time = now;
 
        bch2_alloc_pack(c, a, u);
        ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
-                                      size_t bucket,
-                                      struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+                                      struct bucket_mark m)
 {
        u8 gc_gen;
 
-       if (!is_available_bucket(mark))
+       if (!is_available_bucket(m))
                return false;
 
-       if (mark.owned_by_allocator)
+       if (m.owned_by_allocator)
                return false;
 
        if (ca->buckets_nouse &&
-           test_bit(bucket, ca->buckets_nouse))
+           test_bit(b, ca->buckets_nouse))
                return false;
 
-       gc_gen = bucket_gc_gen(ca, bucket);
+       gc_gen = bucket_gc_gen(bucket(ca, b));
 
        if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
                ca->inc_gen_needs_gc++;
@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
 /*
  * Determines what order we're going to reuse buckets, smallest bucket_key()
  * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- *   indication of how hot the data is -- we scale the prio so that the prio
- *   farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- *   indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
- *   number repeatedly forces us to run mark and sweep gc to avoid generation
- *   number wraparound.
  */
 
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+                               u64 now, u64 last_seq_ondisk)
 {
-       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
-       unsigned max_last_io = ca->max_last_bucket_io[READ];
-
-       /*
-        * Time since last read, scaled to [0, 8) where larger value indicates
-        * more recently read data:
-        */
-       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
-       /* How much we want to keep the data in this bucket: */
-       unsigned long data_wantness =
-               (hotness + 1) * bucket_sectors_used(m);
+       unsigned used = bucket_sectors_used(m);
 
-       unsigned long needs_journal_commit =
-               bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+       if (used) {
+               /*
+                * Prefer to keep buckets that have been read more recently, and
+                * buckets that have more data in them:
+                */
+               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+               u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
 
-       return  (data_wantness << 9) |
-               (needs_journal_commit << 8) |
-               (bucket_gc_gen(ca, b) / 16);
+               return -last_read_scaled;
+       } else {
+               /*
+                * Prefer to use buckets with smaller gc_gen so that we don't
+                * have to walk the btree and recalculate oldest_gen - but shift
+                * off the low bits so that buckets will still have equal sort
+                * keys when there's only a small difference, so that we can
+                * keep sequential buckets together:
+                */
+               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+                       (bucket_gc_gen(g) >> 4);
+       }
 }
 
 static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        struct alloc_heap_entry e = { 0 };
+       u64 now, last_seq_ondisk;
        size_t b, i, nr = 0;
 
-       ca->alloc_heap.used = 0;
-
-       mutex_lock(&c->bucket_clock[READ].lock);
        down_read(&ca->bucket_lock);
 
        buckets = bucket_array(ca);
-
-       bch2_recalc_oldest_io(c, ca, READ);
+       ca->alloc_heap.used = 0;
+       now = atomic64_read(&c->io_clock[READ].now);
+       last_seq_ondisk = c->journal.last_seq_ondisk;
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
         * all buckets have been visited.
         */
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
-               unsigned long key = bucket_sort_key(c, ca, b, m);
+               struct bucket *g = &buckets->b[b];
+               struct bucket_mark m = READ_ONCE(g->mark);
+               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
 
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        }
 
        up_read(&ca->bucket_lock);
-       mutex_unlock(&c->bucket_clock[READ].lock);
 }
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -1031,8 +892,8 @@ retry:
        u.data_type     = 0;
        u.dirty_sectors = 0;
        u.cached_sectors = 0;
-       u.read_time     = c->bucket_clock[READ].hand;
-       u.write_time    = c->bucket_clock[WRITE].hand;
+       u.read_time     = atomic64_read(&c->io_clock[READ].now);
+       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
 
        bch2_alloc_pack(c, &a, u);
        bch2_trans_update(trans, iter, &a.k,
@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
-       bch2_bucket_clock_init(c, READ);
-       bch2_bucket_clock_init(c, WRITE);
 
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
index 1abfff5290bc52e5ef263558a3dfca7287751a42..be164d6108bbcdbb3f2ca48bf8e6dc0618d4b6a8 100644 (file)
 
 struct ec_bucket_buf;
 
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
-       /*
-        * "now" in (read/write) IO time - incremented whenever we do X amount
-        * of reads or writes.
-        *
-        * Goes with the bucket read/write prios: when we read or write to a
-        * bucket we reset the bucket's prio to the current hand; thus hand -
-        * prio = time since bucket was last read/written.
-        *
-        * The units are some amount (bytes/sectors) of data read/written, and
-        * the units can change on the fly if we need to rescale to fit
-        * everything in a u16 - your only guarantee is that the units are
-        * consistent.
-        */
-       u16                     hand;
-       u16                     max_last_io;
-
-       int                     rw;
-
-       struct io_timer         rescale;
-       struct mutex            lock;
-};
-
 enum alloc_reserve {
        RESERVE_BTREE_MOVINGGC  = -2,
        RESERVE_BTREE           = -1,
index bd675b88b35412db31ac864f1004382b29b62bd7..763cac0efa0cca87f5b1a3db42734f488c9d9ee7 100644 (file)
@@ -451,9 +451,6 @@ struct bch_dev {
 
        size_t                  fifo_last_bucket;
 
-       /* last calculated minimum prio */
-       u16                     max_last_bucket_io[2];
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
 
@@ -693,14 +690,6 @@ struct bch_fs {
        struct mutex            usage_scratch_lock;
        struct bch_fs_usage_online *usage_scratch;
 
-       /*
-        * When we invalidate buckets, we use both the priority and the amount
-        * of good data to determine which buckets to reuse first - to weight
-        * those together consistently we keep track of the smallest nonzero
-        * priority of any bucket.
-        */
-       struct bucket_clock     bucket_clock[2];
-
        struct io_clock         io_clock[2];
 
        /* JOURNAL SEQ BLACKLIST */
index b6c7e57b6bcda328f673226df0a92f24443ca18f..5dab5bfd228a9538de940c9092cbc90db5a9855d 100644 (file)
@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean {
        struct bch_sb_field     field;
 
        __le32                  flags;
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
        __le64                  journal_seq;
 
        union {
@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(blacklist,            3)              \
        x(blacklist_v2,         4)              \
        x(usage,                5)              \
-       x(data_usage,           6)
+       x(data_usage,           6)              \
+       x(clock,                7)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage {
        struct bch_replicas_entry r;
 } __attribute__((packed));
 
+struct jset_entry_clock {
+       struct jset_entry       entry;
+       __u8                    rw;
+       __u8                    pad[7];
+       __le64                  time;
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
@@ -1581,8 +1589,8 @@ struct jset {
 
        __u8                    encrypted_start[0];
 
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
 
        /* Sequence number of oldest dirty journal entry */
        __le64                  last_seq;
index 9e123736a125ddf7e645ec966684def18655069d..5ea9bae09d5905c38efbc555b499309ddf7f7af2 100644 (file)
@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last = atomic_long_read(&clock->now);
+       unsigned long last = atomic64_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);
        int ret;
 
@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg)
                        if (c->btree_gc_periodic) {
                                unsigned long next = last + c->capacity / 16;
 
-                               if (atomic_long_read(&clock->now) >= next)
+                               if (atomic64_read(&clock->now) >= next)
                                        break;
 
                                bch2_io_clock_schedule_timeout(clock, next);
@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg)
                }
                __set_current_state(TASK_RUNNING);
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
                /*
index 4103ea7e769a4c10823639ee32cc826d72c75383..50989d2861908ce5789d77687df8f6aafc8f7601 100644 (file)
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
        return __bucket(ca, b, false);
 }
 
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
-       return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree.
  */
 
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
 {
-       struct bucket *g = bucket(ca, b);
-
        return g->mark.gen - g->oldest_gen;
 }
 
index 99ab9f48ba9d48df7e324890f8437f1c5be47cf1..b6ea67506cc28821bf388212d775decf3029c790 100644 (file)
@@ -37,7 +37,7 @@ struct bucket {
                const struct bucket_mark mark;
        };
 
-       u16                             io_time[2];
+       u64                             io_time[2];
        u8                              oldest_gen;
        u8                              gc_gen;
        unsigned                        gen_valid:1;
index 869ba188775775c07aeedc70d86530781a0ce7f7..da91c95e3ffc213ce3a70525006b53fc86e98222 100644 (file)
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
 
        spin_lock(&clock->timer_lock);
 
-       if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+       if (time_after_eq((unsigned long) atomic64_read(&clock->now),
                          timer->expire)) {
                spin_unlock(&clock->timer_lock);
                timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
 void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
 {
        struct io_timer *timer;
-       unsigned long now = atomic_long_add_return(sectors, &clock->now);
+       unsigned long now = atomic64_add_return(sectors, &clock->now);
 
        while ((timer = get_expired_timer(clock, now)))
                timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
        unsigned i;
 
        spin_lock(&clock->timer_lock);
-       now = atomic_long_read(&clock->now);
+       now = atomic64_read(&clock->now);
 
        for (i = 0; i < clock->timers.used; i++)
                pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
 
 int bch2_io_clock_init(struct io_clock *clock)
 {
-       atomic_long_set(&clock->now, 0);
+       atomic64_set(&clock->now, 0);
        spin_lock_init(&clock->timer_lock);
 
        clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
index 92c740a475656da2093528b802320ad73eabbe46..5fae0012d808f7a1b5f4e5334804eee50c31d577 100644 (file)
@@ -26,7 +26,7 @@ struct io_timer {
 typedef HEAP(struct io_timer *)        io_timer_heap;
 
 struct io_clock {
-       atomic_long_t           now;
+       atomic64_t              now;
        u16 __percpu            *pcpu_buf;
        unsigned                max_slop;
 
index ba37c78c01dbdb76f62f9feffece8a27c08650db..379b9ad2c0f9317058774feaecd6206e9a6c4813 100644 (file)
@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j)
        j->entry_u64s_reserved +=
                BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
 
+       j->entry_u64s_reserved +=
+               2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
+
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
index 7e726db7788115cb1bbf907180db99fb02e98d37..a82548983dbd28a5d9aff9879c4cafa25d27078c 100644 (file)
@@ -426,6 +426,32 @@ fsck_err:
        return ret;
 }
 
+static int journal_entry_validate_clock(struct bch_fs *c,
+                                       struct jset *jset,
+                                       struct jset_entry *entry,
+                                       int write)
+{
+       struct jset_entry_clock *clock =
+               container_of(entry, struct jset_entry_clock, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       int ret = 0;
+
+       if (journal_entry_err_on(bytes != sizeof(*clock),
+                                c, "invalid journal entry clock: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       if (journal_entry_err_on(clock->rw > 1,
+                                c, "invalid journal entry clock: bad rw")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+fsck_err:
+       return ret;
+}
+
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, int);
@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl)
 
        end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
 
-       end     = bch2_journal_super_entries_add_common(c, end,
-                                               le64_to_cpu(jset->seq));
+       bch2_journal_super_entries_add_common(c, &end,
+                               le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl)
 
        journal_write_compact(jset);
 
-       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
-       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
        jset->magic             = cpu_to_le64(jset_magic(c));
-
        jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);
index 8e6e4cd7388694a31d4289b95b4db91ea2f11f80..e2472c19beafa44199813c94f98718714da26a72 100644 (file)
@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last, wait;
+       u64 last, wait;
 
        set_freezable();
 
@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
                if (kthread_wait_freezable(c->copy_gc_enabled))
                        break;
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                wait = bch2_copygc_wait_amount(c);
 
                if (wait > clock->max_slop) {
index f9a12dd797a5f2c4cbda6bd515da5d488d5c8803..2263ee41c44478c1671344b21f73f0bee3f7d126 100644 (file)
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
-       unsigned long io_start;
+       u64 io_start;
        long throttle;
 
        set_freezable();
 
-       io_start        = atomic_long_read(&clock->now);
+       io_start        = atomic64_read(&clock->now);
        p               = rebalance_work(c);
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
                                        (20 - w.dev_most_full_percent),
                                        50);
 
-                       if (atomic_long_read(&clock->now) + clock->max_slop <
+                       if (atomic64_read(&clock->now) + clock->max_slop <
                            r->throttled_until_iotime) {
                                r->throttled_until_cputime = start + throttle;
                                r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
                              max(p.dev_most_full_percent, 1U) /
                              max(w.dev_most_full_percent, 1U));
 
-               io_start        = atomic_long_read(&clock->now);
+               io_start        = atomic64_read(&clock->now);
                p               = w;
                prev_start      = start;
                prev_cputime    = cputime;
@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
        case REBALANCE_THROTTLED:
                bch2_hprint(&PBUF(h1),
                            (r->throttled_until_iotime -
-                            atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+                            atomic64_read(&c->io_clock[WRITE].now)) << 9);
                pr_buf(out, "throttled for %lu sec or %s io\n",
                       (r->throttled_until_cputime - jiffies) / HZ,
                       h1);
index 192c6be20cedd841311518fbee9028f07f09b23b..2f62a643c39fbb0c08f024fbf58a7f3325755875 100644 (file)
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
        atomic64_t              work_unknown_dev;
 
        enum rebalance_state    state;
-       unsigned long           throttled_until_iotime;
+       u64                     throttled_until_iotime;
        unsigned long           throttled_until_cputime;
        struct bch_move_stats   move_stats;
 
index f470e0e233ce949c46480cb57242fae6d34074d3..55f7771e11c8b38d52b8228ff3b90378d64f9c4d 100644 (file)
@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
                                le64_to_cpu(bl_entry->end) + 1);
                break;
        }
+       case BCH_JSET_ENTRY_clock: {
+               struct jset_entry_clock *clock =
+                       container_of(entry, struct jset_entry_clock, entry);
+
+               atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+       }
        }
 
        return ret;
@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c,
        int ret;
 
        if (clean) {
-               c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
                for (entry = clean->start;
                     entry != vstruct_end(&clean->field);
                     entry = vstruct_next(entry)) {
@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c,
                        if (i->ignore)
                                continue;
 
-                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c,
                return 0;
        }
 
-       mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-                       "superblock read clock %u doesn't match journal %u after clean shutdown",
-                       clean->read_clock, j->read_clock);
-       mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-                       "superblock write clock %u doesn't match journal %u after clean shutdown",
-                       clean->write_clock, j->write_clock);
-
        for (i = 0; i < BTREE_ID_NR; i++) {
                char buf1[200], buf2[200];
                struct bkey_i *k1, *k2;
index 61b947313c88df43ba128e0a5a98ef4143218033..3b082da934fbff5d6bb30412959763673f3418d6 100644 (file)
@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
        return ret;
 }
 
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
 {
-       memset(entry, 0, u64s * sizeof(u64));
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
 
+       memset(entry, 0, u64s * sizeof(u64));
        /*
         * The u64s field counts from the start of data, ignoring the shared
         * fields.
         */
        entry->u64s = u64s - 1;
-}
 
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-       entry_init_u64s(entry, u64s);
+       *end = vstruct_next(*end);
+       return entry;
 }
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                     struct jset_entry *entry,
-                                     u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+                                          struct jset_entry **end,
+                                          u64 journal_seq)
 {
        unsigned i;
 
@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_INODES;
                u->v            = cpu_to_le64(c->usage_base->nr_inodes);
-
-               entry = vstruct_next(entry);
        }
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_KEY_VERSION;
                u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_RESERVED;
                u->entry.level  = i;
                u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct jset_entry_data_usage *u =
-                       container_of(entry, struct jset_entry_data_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+                                    struct jset_entry_data_usage, entry);
 
-               entry_init_size(entry, sizeof(*u) + e->nr_devs);
                u->entry.type   = BCH_JSET_ENTRY_data_usage;
                u->v            = cpu_to_le64(c->usage_base->replicas[i]);
                unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
                              "embedded variable length struct");
-
-               entry = vstruct_next(entry);
        }
 
        percpu_up_read(&c->mark_lock);
 
-       return entry;
+       for (i = 0; i < 2; i++) {
+               struct jset_entry_clock *clock =
+                       container_of(jset_entry_init(end, sizeof(*clock)),
+                                    struct jset_entry_clock, entry);
+
+               clock->entry.type = BCH_JSET_ENTRY_clock;
+               clock->rw       = i;
+               clock->time     = atomic64_read(&c->io_clock[i].now);
+       }
 }
 
 void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        }
 
        sb_clean->flags         = 0;
-       sb_clean->read_clock    = cpu_to_le16(c->bucket_clock[READ].hand);
-       sb_clean->write_clock   = cpu_to_le16(c->bucket_clock[WRITE].hand);
        sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
 
        /* Trying to catch outstanding bug: */
        BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
 
        entry = sb_clean->start;
-       entry = bch2_journal_super_entries_add_common(c, entry, 0);
+       bch2_journal_super_entries_add_common(c, &entry, 0);
        entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
        BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
index 402ae563b3c70a1f7dff18988479a216d12b185a..dd8d4ba911f0393ff18a2d4789de313df3a6e6e5 100644 (file)
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_clean: */
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
-                                     struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+                                          struct jset_entry **, u64);
 
 void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
 
index d451a29b517b5eb8420eab2b85b4d12195a5b475..5f5893ab9edfec89ce45ecb59260947049cda669 100644 (file)
@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        /*
         * Flush journal before stopping allocators, because flushing journal
         * blacklist entries involves allocating new btree nodes:
@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        for_each_rw_member(ca, c, i) {
                ret = bch2_dev_allocator_start(ca);
                if (ret) {
index 521b6d8d518f46c0bd23c136f818d45f12f99044..8fdbeaf9df321229e0ef0579e872924b3505907a 100644 (file)
@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
 {
        int rw = (private ? 1 : 0);
 
-       return bucket_last_io(c, bucket(ca, b), rw);
+       return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
 }
 
 static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
 static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
                                     size_t b, void *private)
 {
-       return bucket_gc_gen(ca, b);
+       return bucket_gc_gen(bucket(ca, b));
 }
 
 static int unsigned_cmp(const void *_l, const void *_r)