bcachefs: Journal updates to interior nodes

author Kent Overstreet <kent.overstreet@gmail.com>

Sun, 9 Feb 2020 00:06:31 +0000 (19:06 -0500)

committer Kent Overstreet <kent.overstreet@linux.dev>

Sun, 22 Oct 2023 21:08:37 +0000 (17:08 -0400)
author Kent Overstreet <kent.overstreet@gmail.com>
Sun, 9 Feb 2020 00:06:31 +0000 (19:06 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:37 +0000 (17:08 -0400)
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h

index 1ad5ff449a5be7733a533ccd0643deac6a9fda57..6f74fda1f21d1b30bbb5fd4c51f1bc9397b2b649 100644 (file)
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1316,7 +1316,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3],  0, 16);
         x(new_extent_overwrite,         9)      \
         x(incompressible,               10)     \
         x(btree_ptr_v2,                 11)     \
-       x(extents_above_btree_updates,  12)
+       x(extents_above_btree_updates,  12)     \
+       x(btree_updates_journalled,     13)
  
  #define BCH_SB_FEATURES_ALL                            \
         ((1ULL << BCH_FEATURE_new_siphash)|             \
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c

index d0b76141790330e4425408cba43a6df58503dc95..e43d1b2ce5c7ad9d2770d2204a6e0d3e858452ed 100644 (file)
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1260,7 +1260,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
                 closure_put(&((struct btree_update *) new)->cl);
  
         bch2_journal_pin_drop(&c->journal, &w->journal);
-       closure_wake_up(&w->wait);
  }
  
  static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
         wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
         wbio->wbio.bio.bi_private       = b;
  
-       if (b->c.level || !b->written)
-               wbio->wbio.bio.bi_opf |= REQ_FUA;
-
         bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
  
         /*
@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
         rcu_read_lock();
         for_each_cached_btree(b, c, tbl, i, pos) {
                 unsigned long flags = READ_ONCE(b->flags);
-               unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
  
                 if (!(flags & (1 << BTREE_NODE_dirty)))
                         continue;
  
-               pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+               pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
                        b,
                        (flags & (1 << BTREE_NODE_dirty)) != 0,
                        (flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
                        b->written,
                        !list_empty_careful(&b->write_blocked),
                        b->will_make_reachable != 0,
-                      b->will_make_reachable & 1,
-                      b->writes[ idx].wait.list.first != NULL,
-                      b->writes[!idx].wait.list.first != NULL);
+                      b->will_make_reachable & 1);
         }
         rcu_read_unlock();
  
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h

index 43fa8a6dbee57ca9800f138c6d1ebfcf87579a16..a02e261c2eb21ba6fe668de5e230b001ffcc76cb 100644 (file)
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
  void bch2_btree_node_write(struct bch_fs *, struct btree *,
                           enum six_lock_type);
  
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+                                           enum six_lock_type lock_held)
  {
         while (b->written &&
                btree_node_need_write(b) &&
                btree_node_may_write(b)) {
                 if (!btree_node_write_in_flight(b)) {
-                       bch2_btree_node_write(c, b, SIX_LOCK_read);
+                       bch2_btree_node_write(c, b, lock_held);
                         break;
                 }
  
                 six_unlock_read(&b->c.lock);
                 btree_node_wait_on_io(b);
-               btree_node_lock_type(c, b, SIX_LOCK_read);
+               btree_node_lock_type(c, b, lock_held);
         }
  }
  
@@ -131,7 +132,7 @@ do {                                                                        \
                 new |= (1 << BTREE_NODE_need_write);                    \
         } while ((v = cmpxchg(&(_b)->flags, old, new)) != old);         \
                                                                         \
-       btree_node_write_if_need(_c, _b);                               \
+       btree_node_write_if_need(_c, _b, SIX_LOCK_read);                \
  } while (0)
  
  void bch2_btree_flush_all_reads(struct bch_fs *);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h

index 885cc9500f36b85f1dd0bc8753e9c7853d9805ce..a794f9fe4fce05b298ce27c80aa7c0f7a1567642 100644 (file)
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -53,7 +53,6 @@ struct bset_tree {
  
  struct btree_write {
         struct journal_entry_pin        journal;
-       struct closure_waitlist         wait;
  };
  
  struct btree_alloc {
@@ -547,8 +546,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
  struct btree_root {
         struct btree            *b;
  
-       struct btree_update     *as;
-
         /* On disk root - see async splits: */
         __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
         u8                      level;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h

index 9f58d47ef5d6abbecfaa703a9fc4e351ea298a86..11f7d02de622b0d8ebeeed37056b983b2ac241fd 100644 (file)
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
                                      struct btree_iter *);
  bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
                                 struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
  
  enum btree_insert_flags {
         __BTREE_INSERT_NOUNLOCK,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c

index fa9c7f5e0bb92bd77dde05163f89768116f2d3fb..68deb4eb31a69bb51ce1777aa825e7677d5eced8 100644 (file)
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -24,7 +24,6 @@
  static void btree_node_will_make_reachable(struct btree_update *,
                                            struct btree *);
  static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
  
  /* Debug code: */
  
@@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
  }
  
  static void bch2_btree_node_free_ondisk(struct bch_fs *c,
-                                       struct pending_btree_node_free *pending)
+                       struct pending_btree_node_free *pending,
+                       u64 journal_seq)
  {
         BUG_ON(!pending->index_update_done);
  
         bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                     0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
+                     0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
  
         if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
                 bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                             0, 0, NULL, 0,
+                             0, 0, NULL, journal_seq,
                               BTREE_TRIGGER_OVERWRITE|
                               BTREE_TRIGGER_GC);
  }
@@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
  {
         struct bch_fs *c = as->c;
  
+       bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+       bch2_journal_pin_drop(&c->journal, &as->journal);
         bch2_journal_pin_flush(&c->journal, &as->journal);
  
-       BUG_ON(as->nr_new_nodes);
-       BUG_ON(as->nr_pending);
+       BUG_ON((as->nr_new_nodes || as->nr_pending) &&
+              !bch2_journal_error(&c->journal));;
  
         if (as->reserve)
                 bch2_btree_reserve_put(c, as->reserve);
@@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
         mutex_unlock(&c->btree_interior_update_lock);
  }
  
-static void btree_update_nodes_reachable(struct closure *cl)
+static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
  {
-       struct btree_update *as = container_of(cl, struct btree_update, cl);
         struct bch_fs *c = as->c;
  
-       bch2_journal_pin_drop(&c->journal, &as->journal);
-
         mutex_lock(&c->btree_interior_update_lock);
  
         while (as->nr_new_nodes) {
@@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
         }
  
         while (as->nr_pending)
-               bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+               bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
+                                           seq);
  
         mutex_unlock(&c->btree_interior_update_lock);
-
-       closure_wake_up(&as->wait);
-
-       bch2_btree_update_free(as);
-}
-
-static void btree_update_wait_on_journal(struct closure *cl)
-{
-       struct btree_update *as = container_of(cl, struct btree_update, cl);
-       struct bch_fs *c = as->c;
-       int ret;
-
-       ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-       if (ret == -EAGAIN) {
-               continue_at(cl, btree_update_wait_on_journal, system_wq);
-               return;
-       }
-       if (ret < 0)
-               goto err;
-
-       bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
-       continue_at(cl, btree_update_nodes_reachable, system_wq);
  }
  
  static void btree_update_nodes_written(struct closure *cl)
  {
         struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct journal_res res = { 0 };
         struct bch_fs *c = as->c;
         struct btree *b;
+       struct bset *i;
+       struct bkey_i *k;
+       unsigned journal_u64s = 0;
+       int ret;
  
         /*
          * We did an update to a parent node where the pointers we added pointed
@@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
          */
         mutex_lock(&c->btree_interior_update_lock);
         as->nodes_written = true;
-retry:
+again:
         as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
                                       struct btree_update, unwritten_list);
         if (!as || !as->nodes_written) {
@@ -679,31 +662,53 @@ retry:
                 return;
         }
  
+       b = as->b;
+       if (b && !six_trylock_intent(&b->c.lock)) {
+               mutex_unlock(&c->btree_interior_update_lock);
+               btree_node_lock_type(c, b, SIX_LOCK_intent);
+               six_unlock_intent(&b->c.lock);
+               goto out;
+       }
+
+       journal_u64s = 0;
+
+       if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+               for_each_keylist_key(&as->parent_keys, k)
+                       journal_u64s += jset_u64s(k->k.u64s);
+
+       ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
+                                  JOURNAL_RES_GET_RESERVED);
+       if (ret) {
+               BUG_ON(!bch2_journal_error(&c->journal));
+               /* can't unblock btree writes */
+               goto free_update;
+       }
+
+       if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+               for_each_keylist_key(&as->parent_keys, k)
+                       bch2_journal_add_entry(&c->journal, &res,
+                                              BCH_JSET_ENTRY_btree_keys,
+                                              as->btree_id,
+                                              as->level,
+                                              k, k->k.u64s);
+
         switch (as->mode) {
         case BTREE_INTERIOR_NO_UPDATE:
                 BUG();
         case BTREE_INTERIOR_UPDATING_NODE:
-               /* The usual case: */
-               b = READ_ONCE(as->b);
-
-               if (!six_trylock_read(&b->c.lock)) {
-                       mutex_unlock(&c->btree_interior_update_lock);
-                       btree_node_lock_type(c, b, SIX_LOCK_read);
-                       six_unlock_read(&b->c.lock);
-                       mutex_lock(&c->btree_interior_update_lock);
-                       goto retry;
-               }
-
-               BUG_ON(!btree_node_dirty(b));
-               closure_wait(&btree_current_write(b)->wait, &as->cl);
+               /* @b is the node we did the final insert into: */
+               BUG_ON(!res.ref);
  
+               six_lock_write(&b->c.lock, NULL, NULL);
                 list_del(&as->write_blocked_list);
  
-               /*
-                * for flush_held_btree_writes() waiting on updates to flush or
-                * nodes to be writeable:
-                */
-               closure_wake_up(&c->btree_interior_update_wait);
+               i = btree_bset_last(b);
+               i->journal_seq = cpu_to_le64(
+                       max(res.seq,
+                           le64_to_cpu(i->journal_seq)));
+
+               bch2_btree_add_journal_pin(c, b, res.seq);
+               six_unlock_write(&b->c.lock);
  
                 list_del(&as->unwritten_list);
                 mutex_unlock(&c->btree_interior_update_lock);
@@ -712,82 +717,51 @@ retry:
                  * b->write_blocked prevented it from being written, so
                  * write it now if it needs to be written:
                  */
-               bch2_btree_node_write_cond(c, b, true);
-               six_unlock_read(&b->c.lock);
-               continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
+               btree_node_write_if_need(c, b, SIX_LOCK_intent);
+               six_unlock_intent(&b->c.lock);
                 break;
  
         case BTREE_INTERIOR_UPDATING_AS:
-               /*
-                * The btree node we originally updated has been freed and is
-                * being rewritten - so we need to write anything here, we just
-                * need to signal to that btree_update that it's ok to make the
-                * new replacement node visible:
-                */
-               closure_put(&as->parent_as->cl);
-
-               /*
-                * and then we have to wait on that btree_update to finish:
-                */
-               closure_wait(&as->parent_as->wait, &as->cl);
+               BUG_ON(b);
  
                 list_del(&as->unwritten_list);
                 mutex_unlock(&c->btree_interior_update_lock);
-
-               continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
                 break;
  
-       case BTREE_INTERIOR_UPDATING_ROOT:
-               /* b is the new btree root: */
-               b = READ_ONCE(as->b);
-
-               if (!six_trylock_read(&b->c.lock)) {
-                       mutex_unlock(&c->btree_interior_update_lock);
-                       btree_node_lock_type(c, b, SIX_LOCK_read);
-                       six_unlock_read(&b->c.lock);
-                       mutex_lock(&c->btree_interior_update_lock);
-                       goto retry;
-               }
-
-               BUG_ON(c->btree_roots[b->c.btree_id].as != as);
-               c->btree_roots[b->c.btree_id].as = NULL;
+       case BTREE_INTERIOR_UPDATING_ROOT: {
+               struct btree_root *r = &c->btree_roots[as->btree_id];
  
-               bch2_btree_set_root_ondisk(c, b, WRITE);
+               BUG_ON(b);
  
-               /*
-                * We don't have to wait anything anything here (before
-                * btree_update_nodes_reachable frees the old nodes
-                * ondisk) - we've ensured that the very next journal write will
-                * have the pointer to the new root, and before the allocator
-                * can reuse the old nodes it'll have to do a journal commit:
-                */
-               six_unlock_read(&b->c.lock);
+               mutex_lock(&c->btree_root_lock);
+               bkey_copy(&r->key, as->parent_keys.keys);
+               r->level = as->level;
+               r->alive = true;
+               c->btree_roots_dirty = true;
+               mutex_unlock(&c->btree_root_lock);
  
                 list_del(&as->unwritten_list);
                 mutex_unlock(&c->btree_interior_update_lock);
-
-               /*
-                * Bit of funny circularity going on here we have to break:
-                *
-                * We have to drop our journal pin before writing the journal
-                * entry that points to the new btree root: else, we could
-                * deadlock if the journal currently happens to be full.
-                *
-                * This mean we're dropping the journal pin _before_ the new
-                * nodes are technically reachable - but this is safe, because
-                * after the bch2_btree_set_root_ondisk() call above they will
-                * be reachable as of the very next journal write:
-                */
-               bch2_journal_pin_drop(&c->journal, &as->journal);
-
-               as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
-
-               btree_update_wait_on_journal(&as->cl);
                 break;
         }
+       }
  
+       bch2_journal_pin_drop(&c->journal, &as->journal);
+
+       bch2_journal_res_put(&c->journal, &res);
+       bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+       btree_update_nodes_reachable(as, res.seq);
+free_update:
+       bch2_btree_update_free(as);
+       /*
+        * for flush_held_btree_writes() waiting on updates to flush or
+        * nodes to be writeable:
+        */
+       closure_wake_up(&c->btree_interior_update_wait);
+out:
         mutex_lock(&c->btree_interior_update_lock);
-       goto retry;
+       goto again;
  }
  
  /*
@@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
         BUG_ON(!btree_node_dirty(b));
  
-       as->mode = BTREE_INTERIOR_UPDATING_NODE;
-       as->b = b;
+       as->mode        = BTREE_INTERIOR_UPDATING_NODE;
+       as->b           = b;
+       as->level       = b->c.level;
         list_add(&as->write_blocked_list, &b->write_blocked);
  
         mutex_unlock(&c->btree_interior_update_lock);
-
-       /*
-        * In general, when you're staging things in a journal that will later
-        * be written elsewhere, and you also want to guarantee ordering: that
-        * is, if you have updates a, b, c, after a crash you should never see c
-        * and not a or b - there's a problem:
-        *
-        * If the final destination of the update(s) (i.e. btree node) can be
-        * written/flushed _before_ the relevant journal entry - oops, that
-        * breaks ordering, since the various leaf nodes can be written in any
-        * order.
-        *
-        * Normally we use bset->journal_seq to deal with this - if during
-        * recovery we find a btree node write that's newer than the newest
-        * journal entry, we just ignore it - we don't need it, anything we're
-        * supposed to have (that we reported as completed via fsync()) will
-        * still be in the journal, and as far as the state of the journal is
-        * concerned that btree node write never happened.
-        *
-        * That breaks when we're rewriting/splitting/merging nodes, since we're
-        * mixing btree node writes that haven't happened yet with previously
-        * written data that has been reported as completed to the journal.
-        *
-        * Thus, before making the new nodes reachable, we have to wait the
-        * newest journal sequence number we have data for to be written (if it
-        * hasn't been yet).
-        */
-       bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-}
-
-static void interior_update_flush(struct journal *j,
-                       struct journal_entry_pin *pin, u64 seq)
-{
-       struct btree_update *as =
-               container_of(pin, struct btree_update, journal);
-
-       bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
  }
  
  static void btree_update_reparent(struct btree_update *as,
@@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
  {
         struct bch_fs *c = as->c;
  
+       lockdep_assert_held(&c->btree_interior_update_lock);
+
         child->b = NULL;
         child->mode = BTREE_INTERIOR_UPDATING_AS;
-       child->parent_as = as;
-       closure_get(&as->cl);
  
         /*
          * When we write a new btree root, we have to drop our journal pin
@@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
          * just transfer the journal pin to the new interior update so
          * btree_update_nodes_written() can drop it.
          */
-       bch2_journal_pin_copy(&c->journal, &as->journal,
-                             &child->journal, interior_update_flush);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
         bch2_journal_pin_drop(&c->journal, &child->journal);
-
-       as->journal_seq = max(as->journal_seq, child->journal_seq);
  }
  
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
  {
         struct bch_fs *c = as->c;
-       struct btree_root *r = &c->btree_roots[as->btree_id];
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
  
         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+       BUG_ON(!bch2_keylist_empty(&as->parent_keys));
  
-       /*
-        * Old root might not be persistent yet - if so, redirect its
-        * btree_update operation to point to us:
-        */
-       if (r->as)
-               btree_update_reparent(as, r->as);
-
-       as->mode = BTREE_INTERIOR_UPDATING_ROOT;
-       as->b = r->b;
-       r->as = as;
+       mutex_lock(&c->btree_interior_update_lock);
+       list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
  
+       as->mode        = BTREE_INTERIOR_UPDATING_ROOT;
+       as->level       = b->c.level;
+       bch2_keylist_add(&as->parent_keys, &b->key);
         mutex_unlock(&c->btree_interior_update_lock);
-
-       /*
-        * When we're rewriting nodes and updating interior nodes, there's an
-        * issue with updates that haven't been written in the journal getting
-        * mixed together with older data - see btree_update_updated_node()
-        * for the explanation.
-        *
-        * However, this doesn't affect us when we're writing a new btree root -
-        * because to make that new root reachable we have to write out a new
-        * journal entry, which must necessarily be newer than as->journal_seq.
-        */
  }
  
  static void btree_node_will_make_reachable(struct btree_update *as,
@@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                                                struct btree *b)
  {
         struct bch_fs *c = as->c;
-       struct closure *cl, *cl_n;
         struct btree_update *p, *n;
         struct btree_write *w;
-       struct bset_tree *t;
  
         set_btree_node_dying(b);
  
@@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
  
         btree_interior_update_add_node_reference(as, b);
  
-       /*
-        * Does this node have data that hasn't been written in the journal?
-        *
-        * If so, we have to wait for the corresponding journal entry to be
-        * written before making the new nodes reachable - we can't just carry
-        * over the bset->journal_seq tracking, since we'll be mixing those keys
-        * in with keys that aren't in the journal anymore:
-        */
-       for_each_bset(b, t)
-               as->journal_seq = max(as->journal_seq,
-                                     le64_to_cpu(bset(b, t)->journal_seq));
-
         mutex_lock(&c->btree_interior_update_lock);
  
         /*
@@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
  
         clear_btree_node_dirty(b);
         clear_btree_node_need_write(b);
-       w = btree_current_write(b);
-
-       /*
-        * Does this node have any btree_update operations waiting on this node
-        * to be written?
-        *
-        * If so, wake them up when this btree_update operation is reachable:
-        */
-       llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
-               llist_add(&cl->list, &as->wait.list);
  
         /*
          * Does this node have unwritten data that has a pin on the journal?
@@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
          * oldest pin of any of the nodes we're freeing. We'll release the pin
          * when the new nodes are persistent and reachable on disk:
          */
-       bch2_journal_pin_copy(&c->journal, &as->journal,
-                             &w->journal, interior_update_flush);
+       w = btree_current_write(b);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
         bch2_journal_pin_drop(&c->journal, &w->journal);
  
         w = btree_prev_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal,
-                             &w->journal, interior_update_flush);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
         bch2_journal_pin_drop(&c->journal, &w->journal);
  
         mutex_unlock(&c->btree_interior_update_lock);
@@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
  {
         struct btree_reserve *reserve;
         struct btree_update *as;
+       int ret;
  
         reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
         if (IS_ERR(reserve))
@@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
  
         bch2_keylist_init(&as->parent_keys, as->inline_keys);
  
+       ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+                                jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
+       if (ret) {
+               bch2_btree_reserve_put(c, reserve);
+               closure_debug_destroy(&as->cl);
+               mempool_free(as, &c->btree_interior_update_pool);
+               return ERR_PTR(ret);
+       }
+
         mutex_lock(&c->btree_interior_update_lock);
         list_add_tail(&as->list, &c->btree_interior_update_list);
         mutex_unlock(&c->btree_interior_update_lock);
@@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
         mutex_unlock(&c->btree_interior_update_lock);
  }
  
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
-{
-       struct btree_root *r = &c->btree_roots[b->c.btree_id];
-
-       mutex_lock(&c->btree_root_lock);
-
-       BUG_ON(b != r->b);
-       bkey_copy(&r->key, &b->key);
-       r->level = b->c.level;
-       r->alive = true;
-       if (rw == WRITE)
-               c->btree_roots_dirty = true;
-
-       mutex_unlock(&c->btree_root_lock);
-}
-
  /**
   * bch_btree_set_root - update the root in memory and on disk
   *
@@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
  
         bch2_btree_set_root_inmem(as, b);
  
-       btree_update_updated_root(as);
+       btree_update_updated_root(as, b);
  
         /*
          * Unlock old root after new root is visible:
@@ -1471,7 +1356,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
                 bch2_btree_build_aux_trees(n1);
                 six_unlock_write(&n1->c.lock);
  
-               bch2_keylist_add(&as->parent_keys, &n1->key);
+               if (parent)
+                       bch2_keylist_add(&as->parent_keys, &n1->key);
         }
  
         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
@@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
                (bkey_cmp_packed(b, k, &insert->k) >= 0))
                 ;
  
-       while (!bch2_keylist_empty(keys)) {
-               insert = bch2_keylist_front(keys);
-
+       for_each_keylist_key(keys, insert)
                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
-               bch2_keylist_pop_front(keys);
-       }
  
         btree_update_updated_node(as, b);
  
@@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                         bkey_copy(&b->key, new_key);
                 }
  
-               btree_update_updated_root(as);
+               btree_update_updated_root(as, b);
                 bch2_btree_node_unlock_write(b, iter);
         }
  
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h

index f6aceed894272341a8f82d3fbefdc84496e961e1..4a2ea69f6a2c682e2b1339760711a5af5ab88e89 100644 (file)
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -69,8 +69,10 @@ struct btree_update {
         unsigned                        nodes_written:1;
  
         enum btree_id                   btree_id;
+       u8                              level;
  
         struct btree_reserve            *reserve;
+       struct journal_preres           journal_preres;
  
         /*
          * BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +85,6 @@ struct btree_update {
         struct btree                    *b;
         struct list_head                write_blocked_list;
  
-       /*
-        * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
-        * we're now blocking another btree_update
-        * @parent_as - btree_update that's waiting on our nodes to finish
-        * writing, before it can make new nodes visible on disk
-        * @wait - list of child btree_updates that are waiting on this
-        * btree_update to make all the new nodes visible before they can free
-        * their old btree nodes
-        */
-       struct btree_update             *parent_as;
-       struct closure_waitlist         wait;
-
         /*
          * We may be freeing nodes that were dirty, and thus had journal entries
          * pinned: we need to transfer the oldest of those pins to the
@@ -103,8 +93,6 @@ struct btree_update {
          */
         struct journal_entry_pin        journal;
  
-       u64                             journal_seq;
-
         /*
          * Nodes being freed:
          * Protected by c->btree_node_pending_free_lock
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c

index a8487f8275b682524689b839abc7ace0c73eb933..06e735fc69ecdd64553ba73622d3b47011f19f33 100644 (file)
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
         return __btree_node_flush(j, pin, 1, seq);
  }
  
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+                                      struct btree *b, u64 seq)
+{
+       struct btree_write *w = btree_current_write(b);
+
+       bch2_journal_pin_add(&c->journal, seq, &w->journal,
+                            btree_node_write_idx(b) == 0
+                            ? btree_node_flush0
+                            : btree_node_flush1);
+}
+
  static inline void __btree_journal_key(struct btree_trans *trans,
                                        enum btree_id btree_id,
                                        struct bkey_i *insert)
@@ -173,10 +184,6 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
         struct bch_fs *c = trans->c;
         struct journal *j = &c->journal;
         struct btree *b = iter_l(iter)->b;
-       struct btree_write *w = btree_current_write(b);
-       u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-               ? trans->journal_res.seq
-               : j->replay_journal_seq;
  
         EBUG_ON(trans->journal_res.ref !=
                 !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
@@ -187,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
                         cpu_to_le64(trans->journal_res.seq);
         }
  
-       bch2_journal_pin_add(j, seq, &w->journal,
-                            btree_node_write_idx(b) == 0
-                            ? btree_node_flush0
-                            : btree_node_flush1);
+       bch2_btree_add_journal_pin(c, b,
+               likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+                       ? trans->journal_res.seq
+                       : j->replay_journal_seq);
  
         if (unlikely(!btree_node_dirty(b)))
                 set_btree_node_dirty(b);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c

index b50f85d1b0577e7c2b9d2328bfe272595f21cbf1..c9d2a01fec299dc7fa75f2f459c956532235fa9f 100644 (file)
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
         c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
         c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
         c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
         ret = bch2_write_super(c);
         mutex_unlock(&c->sb_lock);
  
@@ -1090,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
         c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
         c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
         c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+       c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
  
         u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
author	Kent Overstreet <kent.overstreet@gmail.com>
	Sun, 9 Feb 2020 00:06:31 +0000 (19:06 -0500)
committer	Kent Overstreet <kent.overstreet@linux.dev>
	Sun, 22 Oct 2023 21:08:37 +0000 (17:08 -0400)
fs/bcachefs/bcachefs_format.h		patch | blob | history
fs/bcachefs/btree_io.c		patch | blob | history
fs/bcachefs/btree_io.h		patch | blob | history
fs/bcachefs/btree_types.h		patch | blob | history
fs/bcachefs/btree_update.h		patch | blob | history
fs/bcachefs/btree_update_interior.c		patch | blob | history
fs/bcachefs/btree_update_interior.h		patch | blob | history
fs/bcachefs/btree_update_leaf.c		patch | blob | history
fs/bcachefs/super-io.c		patch | blob | history