bcachefs: Update btree ptrs after every write
author    Kent Overstreet <kent.overstreet@gmail.com>
          Sat, 10 Jul 2021 17:44:42 +0000 (13:44 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
          Sun, 22 Oct 2023 21:09:08 +0000 (17:09 -0400)
This closes a significant hole (and the last known hole) in our ability
to verify metadata. Previously, since btree nodes are log structured, we
couldn't detect lost btree writes that weren't the first write to a
given node. Additionally, this seems to have led to some significant
metadata corruption on multi-device filesystems with metadata
replication: since a write may have made it to one device and not
another, if we read that btree node back from the replica that did have
that write and started appending after that point, the other replica
would have a gap in the bset entries and reading from that replica
wouldn't find the rest of the bsets.

But since updates to interior btree nodes are now journalled, we can
close this hole by updating pointers to btree nodes after every write
with the currently written number of sectors, without negatively
affecting performance. This means we will always detect lost or corrupt
metadata - it also means that our btree is now a curious hybrid of COW
and non-COW btrees, with all the benefits of both (excluding
complexity).
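
To make the new invariant concrete, here is a minimal sketch (simplified
stand-in types, not the actual bcachefs structures or API): the parent's
pointer records how many sectors had been written when it was last
updated, and the read path refuses any node that comes up short.

/* sketch only - stand-ins for the real bkey/bset machinery: */
#include <stdbool.h>
#include <stdint.h>

struct btree_ptr_v2_sketch {
	/* rewritten in the parent after every btree node write */
	uint16_t	sectors_written;
};

/*
 * Read side: previously a node was accepted as long as each bset's
 * checksum validated, so a lost append was invisible; now the pointer
 * says exactly how much data must be present:
 */
static bool btree_node_complete(const struct btree_ptr_v2_sketch *ptr,
				unsigned sectors_found)
{
	if (!ptr->sectors_written)
		return true;	/* pointer predates this feature */

	return sectors_found >= ptr->sectors_written;
}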

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
13 files changed:
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_iter.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/io_types.h
fs/bcachefs/migrate.c
fs/bcachefs/recovery.c
fs/bcachefs/super.c

diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index bed2e76e6dc8e38b25980fc18c1e2fb0a0af2997..6a289b6f1fb4d491bfc16419295ffe22ca46651f 100644
@@ -676,7 +676,7 @@ struct bch_fs {
        struct btree_key_cache  btree_key_cache;
 
        struct workqueue_struct *btree_update_wq;
-       struct workqueue_struct *btree_error_wq;
+       struct workqueue_struct *btree_io_complete_wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
 
@@ -827,8 +827,6 @@ mempool_t           bio_bounce_pages;
 
        atomic64_t              btree_writes_nr;
        atomic64_t              btree_writes_sectors;
-       struct bio_list         btree_write_error_list;
-       struct work_struct      btree_write_error_work;
        spinlock_t              btree_write_error_lock;
 
        /* ERRORS */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index e6be594fd0be8100653199d8f15cd60ffe0ffbfd..659bcfe09fb4b16d49459eb8a8db9d37f0fe557e 100644
@@ -1214,7 +1214,8 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_inode_btree_change    = 11,
        bcachefs_metadata_version_snapshot              = 12,
        bcachefs_metadata_version_inode_backpointers    = 13,
-       bcachefs_metadata_version_max                   = 14,
+       bcachefs_metadata_version_btree_ptr_sectors_written = 14,
+       bcachefs_metadata_version_max                   = 15,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1d4b5fcd1e3910570349356c1f4d353a455b6c6c..b99e4198bdbeaea0e98df17f105cc2c71d71cbfc 100644
@@ -26,6 +26,7 @@ void bch2_btree_node_io_unlock(struct btree *b)
 {
        EBUG_ON(!btree_node_write_in_flight(b));
 
+       clear_btree_node_write_in_flight_inner(b);
        clear_btree_node_write_in_flight(b);
        wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
 }
@@ -865,7 +866,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
                BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        unsigned u64s;
-       unsigned nonblacklisted_written = 0;
+       unsigned blacklisted_written, nonblacklisted_written = 0;
+       unsigned ptr_written = btree_ptr_sectors_written(&b->key);
        int ret, retry_read = 0, write = READ;
 
        b->version_ondisk = U16_MAX;
@@ -896,7 +898,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                             b->data->keys.seq, bp->seq);
        }
 
-       while (b->written < c->opts.btree_node_size) {
+       while (b->written < (ptr_written ?: c->opts.btree_node_size)) {
                unsigned sectors, whiteout_u64s = 0;
                struct nonce nonce;
                struct bch_csum csum;
@@ -976,6 +978,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                btree_err_on(blacklisted && first,
                             BTREE_ERR_FIXABLE, c, ca, b, i,
                             "first btree node bset has blacklisted journal seq");
+
+               btree_err_on(blacklisted && ptr_written,
+                            BTREE_ERR_FIXABLE, c, ca, b, i,
+                            "found blacklisted bset in btree node with sectors_written");
                if (blacklisted && !first)
                        continue;
 
@@ -989,26 +995,34 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                nonblacklisted_written = b->written;
        }
 
-       for (bne = write_block(b);
-            bset_byte_offset(b, bne) < btree_bytes(c);
-            bne = (void *) bne + block_bytes(c))
-               btree_err_on(bne->keys.seq == b->data->keys.seq &&
-                            !bch2_journal_seq_is_blacklisted(c,
-                                       le64_to_cpu(bne->keys.journal_seq),
-                                       true),
+       if (ptr_written) {
+               btree_err_on(b->written < ptr_written,
                             BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
-                            "found bset signature after last bset");
+                            "btree node data missing: expected %u sectors, found %u",
+                            ptr_written, b->written);
+       } else {
+               for (bne = write_block(b);
+                    bset_byte_offset(b, bne) < btree_bytes(c);
+                    bne = (void *) bne + block_bytes(c))
+                       btree_err_on(bne->keys.seq == b->data->keys.seq &&
+                                    !bch2_journal_seq_is_blacklisted(c,
+                                                                     le64_to_cpu(bne->keys.journal_seq),
+                                                                     true),
+                                    BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+                                    "found bset signature after last bset");
 
-       /*
-        * Blacklisted bsets are those that were written after the most recent
-        * (flush) journal write. Since there wasn't a flush, they may not have
-        * made it to all devices - which means we shouldn't write new bsets
-        * after them, as that could leave a gap and then reads from that device
-        * wouldn't find all the bsets in that btree node - which means it's
-        * important that we start writing new bsets after the most recent _non_
-        * blacklisted bset:
-        */
-       b->written = nonblacklisted_written;
+               /*
+                * Blacklisted bsets are those that were written after the most recent
+                * (flush) journal write. Since there wasn't a flush, they may not have
+                * made it to all devices - which means we shouldn't write new bsets
+                * after them, as that could leave a gap and then reads from that device
+                * wouldn't find all the bsets in that btree node - which means it's
+                * important that we start writing new bsets after the most recent _non_
+                * blacklisted bset:
+                */
+               blacklisted_written = b->written;
+               b->written = nonblacklisted_written;
+       }
 
        sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
        sorted->keys.u64s = 0;
@@ -1076,6 +1090,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                if (ca->mi.state != BCH_MEMBER_STATE_rw)
                        set_btree_node_need_rewrite(b);
        }
+
+       if (!ptr_written)
+               set_btree_node_need_rewrite(b);
 out:
        mempool_free(iter, &c->fill_iter);
        return retry_read;
@@ -1574,6 +1591,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
                        goto do_write;
 
                new &= ~(1U << BTREE_NODE_write_in_flight);
+               new &= ~(1U << BTREE_NODE_write_in_flight_inner);
        } while ((v = cmpxchg(&b->flags, old, new)) != old);
 
        wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
@@ -1592,10 +1610,12 @@ do_write:
                        new &= ~(1U << BTREE_NODE_dirty);
                        new &= ~(1U << BTREE_NODE_need_write);
                        new |=  (1U << BTREE_NODE_write_in_flight);
+                       new |=  (1U << BTREE_NODE_write_in_flight_inner);
                        new |=  (1U << BTREE_NODE_just_written);
                        new ^=  (1U << BTREE_NODE_write_idx);
                } else {
                        new &= ~(1U << BTREE_NODE_write_in_flight);
+                       new &= ~(1U << BTREE_NODE_write_in_flight_inner);
                }
        } while ((v = cmpxchg(&b->flags, old, new)) != old);
 
@@ -1605,52 +1625,38 @@ do_write:
        six_unlock_read(&b->c.lock);
 }
 
-static void bch2_btree_node_write_error(struct bch_fs *c,
-                                       struct btree_write_bio *wbio)
+static void btree_node_write_work(struct work_struct *work)
 {
+       struct btree_write_bio *wbio =
+               container_of(work, struct btree_write_bio, work);
+       struct bch_fs *c        = wbio->wbio.c;
        struct btree *b         = wbio->wbio.bio.bi_private;
-       struct bkey_buf k;
        struct bch_extent_ptr *ptr;
-       struct btree_trans trans;
-       struct btree_iter *iter;
        int ret;
 
-       bch2_bkey_buf_init(&k);
-       bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
-                                       BTREE_MAX_DEPTH, b->c.level, 0);
-retry:
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               goto err;
-
-       /* has node been freed? */
-       if (iter->l[b->c.level].b != b) {
-               /* node has been freed: */
-               BUG_ON(!btree_node_dying(b));
-               goto out;
-       }
-
-       BUG_ON(!btree_node_hashed(b));
-
-       bch2_bkey_buf_copy(&k, c, &b->key);
+       btree_bounce_free(c,
+               wbio->data_bytes,
+               wbio->wbio.used_mempool,
+               wbio->data);
 
-       bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
+       bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
                bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
-       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
+       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
                goto err;
 
-       ret = bch2_btree_node_update_key(&trans, iter, b, k.k);
-       if (ret == -EINTR)
-               goto retry;
-       if (ret)
-               goto err;
+       if (wbio->wbio.first_btree_write) {
+               if (wbio->wbio.failed.nr) {
+
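+                       /* XXX: failed first btree write: not handled here yet */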
+               }
+       } else {
+               ret = bch2_trans_do(c, NULL, NULL, 0,
+                       bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+                                                           !wbio->wbio.failed.nr));
+               if (ret)
+                       goto err;
+       }
 out:
-       bch2_trans_iter_put(&trans, iter);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&k, c);
        bio_put(&wbio->wbio.bio);
        btree_node_write_done(c, b);
        return;
@@ -1660,58 +1666,14 @@ err:
        goto out;
 }
 
-void bch2_btree_write_error_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs,
-                                       btree_write_error_work);
-       struct bio *bio;
-
-       while (1) {
-               spin_lock_irq(&c->btree_write_error_lock);
-               bio = bio_list_pop(&c->btree_write_error_list);
-               spin_unlock_irq(&c->btree_write_error_lock);
-
-               if (!bio)
-                       break;
-
-               bch2_btree_node_write_error(c,
-                       container_of(bio, struct btree_write_bio, wbio.bio));
-       }
-}
-
-static void btree_node_write_work(struct work_struct *work)
-{
-       struct btree_write_bio *wbio =
-               container_of(work, struct btree_write_bio, work);
-       struct bch_fs *c        = wbio->wbio.c;
-       struct btree *b         = wbio->wbio.bio.bi_private;
-
-       btree_bounce_free(c,
-               wbio->bytes,
-               wbio->wbio.used_mempool,
-               wbio->data);
-
-       if (wbio->wbio.failed.nr) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&c->btree_write_error_lock, flags);
-               bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
-               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
-
-               queue_work(c->btree_error_wq, &c->btree_write_error_work);
-               return;
-       }
-
-       bio_put(&wbio->wbio.bio);
-       btree_node_write_done(c, b);
-}
-
 static void btree_node_write_endio(struct bio *bio)
 {
        struct bch_write_bio *wbio      = to_wbio(bio);
        struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
        struct bch_write_bio *orig      = parent ?: wbio;
+       struct btree_write_bio *wb      = container_of(orig, struct btree_write_bio, wbio);
        struct bch_fs *c                = wbio->c;
+       struct btree *b                 = wbio->bio.bi_private;
        struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
        unsigned long flags;
 
@@ -1732,13 +1694,13 @@ static void btree_node_write_endio(struct bio *bio)
        if (parent) {
                bio_put(bio);
                bio_endio(&parent->bio);
-       } else {
-               struct btree_write_bio *wb =
-                       container_of(orig, struct btree_write_bio, wbio);
-
-               INIT_WORK(&wb->work, btree_node_write_work);
-               queue_work(c->io_complete_wq, &wb->work);
+               return;
        }
+
+       clear_btree_node_write_in_flight_inner(b);
+       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+       INIT_WORK(&wb->work, btree_node_write_work);
+       queue_work(c->btree_io_complete_wq, &wb->work);
 }
 
 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1763,8 +1725,15 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 static void btree_write_submit(struct work_struct *work)
 {
        struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+       struct bch_extent_ptr *ptr;
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+
+       bkey_copy(&tmp.k, &wbio->key);
+
+       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+               ptr->offset += wbio->sector_offset;
 
-       bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
+       bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
 }
 
 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
@@ -1774,7 +1743,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
        struct bset *i;
        struct btree_node *bn = NULL;
        struct btree_node_entry *bne = NULL;
-       struct bch_extent_ptr *ptr;
        struct sort_iter sort_iter;
        struct nonce nonce;
        unsigned bytes_to_write, sectors_to_write, bytes, u64s;
@@ -1814,6 +1782,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
                new &= ~(1 << BTREE_NODE_dirty);
                new &= ~(1 << BTREE_NODE_need_write);
                new |=  (1 << BTREE_NODE_write_in_flight);
+               new |=  (1 << BTREE_NODE_write_in_flight_inner);
                new |=  (1 << BTREE_NODE_just_written);
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
@@ -1967,36 +1936,29 @@ do_write:
                            struct btree_write_bio, wbio.bio);
        wbio_init(&wbio->wbio.bio);
        wbio->data                      = data;
-       wbio->bytes                     = bytes;
+       wbio->data_bytes                = bytes;
+       wbio->sector_offset             = b->written;
        wbio->wbio.c                    = c;
        wbio->wbio.used_mempool         = used_mempool;
+       wbio->wbio.first_btree_write    = !b->written;
        wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
        wbio->wbio.bio.bi_private       = b;
 
        bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
 
-       /*
-        * If we're appending to a leaf node, we don't technically need FUA -
-        * this write just needs to be persisted before the next journal write,
-        * which will be marked FLUSH|FUA.
-        *
-        * Similarly if we're writing a new btree root - the pointer is going to
-        * be in the next journal entry.
-        *
-        * But if we're writing a new btree node (that isn't a root) or
-        * appending to a non leaf btree node, we need either FUA or a flush
-        * when we write the parent with the new pointer. FUA is cheaper than a
-        * flush, and writes appending to leaf nodes aren't blocking anything so
-        * just make all btree node writes FUA to keep things sane.
-        */
-
        bkey_copy(&wbio->key, &b->key);
 
-       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
-               ptr->offset += b->written;
-
        b->written += sectors_to_write;
 
+       if (wbio->wbio.first_btree_write &&
+           b->key.k.type == KEY_TYPE_btree_ptr_v2)
+               bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+                       cpu_to_le16(b->written);
+
+       if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+               bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+                       cpu_to_le16(b->written);
+
        atomic64_inc(&c->btree_writes_nr);
        atomic64_add(sectors_to_write, &c->btree_writes_sectors);
 
@@ -2005,6 +1967,10 @@ do_write:
        return;
 err:
        set_btree_node_noevict(b);
+       if (!b->written &&
+           b->key.k.type == KEY_TYPE_btree_ptr_v2)
+               bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+                       cpu_to_le16(sectors_to_write);
        b->written += sectors_to_write;
 nowrite:
        btree_bounce_free(c, bytes, used_mempool, data);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 3732d135de8dd44ca6134fd55a1dde7732632c55..7fdcf879c7d468ae796c4079a791a9c7570b648a 100644
@@ -32,6 +32,13 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
                atomic_dec(&c->btree_cache.dirty);
 }
 
+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+{
+       return k->k.type == KEY_TYPE_btree_ptr_v2
+               ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+               : 0;
+}
+
 struct btree_read_bio {
        struct bch_fs           *c;
        struct btree            *b;
@@ -48,7 +55,8 @@ struct btree_write_bio {
        struct work_struct      work;
        __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
        void                    *data;
-       unsigned                bytes;
+       unsigned                data_bytes;
+       unsigned                sector_offset;
        struct bch_write_bio    wbio;
 };
 
@@ -137,7 +145,6 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 
 void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
-void bch2_btree_write_error_work(struct work_struct *);
 
 void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 58f15b716d49128a15c4a9c24c030edda709c1b7..7385cca43f8b0eb30b84b784d24ebb186e017384 100644
@@ -132,7 +132,7 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
 
 static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
 {
-       unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
+       unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT);
 
        if (iter->locks_want > new_locks_want)
                __bch2_btree_iter_downgrade(iter, new_locks_want);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 134d221d150e128dac6a496af8208dfe4c02d3f4..78b312e5bcf35476948abf346a778f58505fd360 100644
@@ -435,6 +435,7 @@ enum btree_flags {
        BTREE_NODE_write_idx,
        BTREE_NODE_accessed,
        BTREE_NODE_write_in_flight,
+       BTREE_NODE_write_in_flight_inner,
        BTREE_NODE_just_written,
        BTREE_NODE_dying,
        BTREE_NODE_fake,
@@ -449,6 +450,7 @@ BTREE_FLAG(noevict);
 BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
 BTREE_FLAG(write_in_flight);
+BTREE_FLAG(write_in_flight_inner);
 BTREE_FLAG(just_written);
 BTREE_FLAG(dying);
 BTREE_FLAG(fake);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 12065bba82dd4201e374a063d1d0bcc0aeea1bbd..bab135fae0b0bbd9225c308eb5b9fda8868a70c4 100644
@@ -74,7 +74,9 @@ int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
                            __le64, unsigned);
 void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
 int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
-                              struct btree *, struct bkey_i *);
+                              struct btree *, struct bkey_i *, bool);
+int bch2_btree_node_update_key_get_iter(struct btree_trans *,
+                               struct btree *, struct bkey_i *, bool);
 
 int bch2_trans_update(struct btree_trans *, struct btree_iter *,
                      struct bkey_i *, enum btree_update_flags);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 0b78fb9d356109b7efa1e694f89eb105d9d5fd96..e9b7af4c357432e3eb83b35c8ae07e6fcb7bbcac 100644
@@ -246,11 +246,7 @@ retry:
                goto retry;
        }
 
-       if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
-               bkey_btree_ptr_v2_init(&tmp.k);
-       else
-               bkey_btree_ptr_init(&tmp.k);
-
+       bkey_btree_ptr_v2_init(&tmp.k);
        bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
 
        bch2_open_bucket_get(c, wp, &ob);
@@ -567,7 +563,8 @@ static void btree_update_nodes_written(struct btree_update *as)
                six_unlock_read(&old->c.lock);
 
                if (seq == as->old_nodes_seq[i])
-                       bch2_btree_node_wait_on_write(old);
+                       wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
+                                      TASK_UNINTERRUPTIBLE);
        }
 
        /*
@@ -1153,6 +1150,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        struct bkey_packed *k;
        const char *invalid;
 
+       BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
+              !btree_ptr_sectors_written(insert));
+
        invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
                bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
        if (invalid) {
@@ -1395,6 +1395,7 @@ static void btree_split(struct btree_update *as,
                six_unlock_write(&n2->c.lock);
                six_unlock_write(&n1->c.lock);
 
+               bch2_btree_node_write(c, n1, SIX_LOCK_intent);
                bch2_btree_node_write(c, n2, SIX_LOCK_intent);
 
                /*
@@ -1422,12 +1423,12 @@ static void btree_split(struct btree_update *as,
                bch2_btree_build_aux_trees(n1);
                six_unlock_write(&n1->c.lock);
 
+               bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+
                if (parent)
                        bch2_keylist_add(&as->parent_keys, &n1->key);
        }
 
-       bch2_btree_node_write(c, n1, SIX_LOCK_intent);
-
        /* New nodes all written, now make them visible: */
 
        if (parent) {
@@ -1703,13 +1704,13 @@ retry:
        bch2_btree_build_aux_trees(n);
        six_unlock_write(&n->c.lock);
 
+       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+
        bkey_init(&delete.k);
        delete.k.p = prev->key.k.p;
        bch2_keylist_add(&as->parent_keys, &delete);
        bch2_keylist_add(&as->parent_keys, &n->key);
 
-       bch2_btree_node_write(c, n, SIX_LOCK_intent);
-
        bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
 
        bch2_btree_update_get_open_buckets(as, n);
@@ -1883,74 +1884,109 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
        queue_work(c->btree_interior_update_worker, &a->work);
 }
 
-static void __bch2_btree_node_update_key(struct btree_update *as,
-                                        struct btree_trans *trans,
-                                        struct btree_iter *iter,
-                                        struct btree *b, struct btree *new_hash,
-                                        struct bkey_i *new_key)
+static int __bch2_btree_node_update_key(struct btree_trans *trans,
+                                       struct btree_iter *iter,
+                                       struct btree *b, struct btree *new_hash,
+                                       struct bkey_i *new_key,
+                                       bool skip_triggers)
 {
-       struct bch_fs *c = as->c;
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter2 = NULL;
        struct btree *parent;
+       u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
        int ret;
 
-       btree_update_will_delete_key(as, &b->key);
-       btree_update_will_add_key(as, new_key);
+       if (!skip_triggers) {
+               ret = bch2_trans_mark_key(trans,
+                                         bkey_s_c_null,
+                                         bkey_i_to_s_c(new_key),
+                                         BTREE_TRIGGER_INSERT);
+               if (ret)
+                       return ret;
+
+               ret = bch2_trans_mark_key(trans,
+                                         bkey_i_to_s_c(&b->key),
+                                         bkey_s_c_null,
+                                         BTREE_TRIGGER_OVERWRITE);
+               if (ret)
+                       return ret;
+       }
+
+       if (new_hash) {
+               bkey_copy(&new_hash->key, new_key);
+               ret = bch2_btree_node_hash_insert(&c->btree_cache,
+                               new_hash, b->c.level, b->c.btree_id);
+               BUG_ON(ret);
+       }
 
        parent = btree_node_parent(iter, b);
        if (parent) {
-               if (new_hash) {
-                       bkey_copy(&new_hash->key, new_key);
-                       ret = bch2_btree_node_hash_insert(&c->btree_cache,
-                                       new_hash, b->c.level, b->c.btree_id);
-                       BUG_ON(ret);
-               }
+               iter2 = bch2_trans_copy_iter(trans, iter);
 
-               bch2_keylist_add(&as->parent_keys, new_key);
-               bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, 0);
+               BUG_ON(iter2->level != b->c.level);
+               BUG_ON(bpos_cmp(iter2->pos, new_key->k.p));
 
-               if (new_hash) {
-                       mutex_lock(&c->btree_cache.lock);
-                       bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+               btree_node_unlock(iter2, iter2->level);
+               iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP;
+               iter2->level++;
 
-                       bch2_btree_node_hash_remove(&c->btree_cache, b);
-
-                       bkey_copy(&b->key, new_key);
-                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-                       BUG_ON(ret);
-                       mutex_unlock(&c->btree_cache.lock);
-               } else {
-                       bkey_copy(&b->key, new_key);
-               }
+               ret   = bch2_btree_iter_traverse(iter2) ?:
+                       bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN);
+               if (ret)
+                       goto err;
        } else {
                BUG_ON(btree_node_root(c, b) != b);
 
-               bch2_btree_node_lock_write(b, iter);
-               bkey_copy(&b->key, new_key);
+               trans->extra_journal_entries = (void *) &journal_entries[0];
+               trans->extra_journal_entry_u64s =
+                       journal_entry_set((void *) &journal_entries[0],
+                                         BCH_JSET_ENTRY_btree_root,
+                                         b->c.btree_id, b->c.level,
+                                         new_key, new_key->k.u64s);
+       }
 
-               if (btree_ptr_hash_val(&b->key) != b->hash_val) {
-                       mutex_lock(&c->btree_cache.lock);
-                       bch2_btree_node_hash_remove(&c->btree_cache, b);
+       ret = bch2_trans_commit(trans, NULL, NULL,
+                               BTREE_INSERT_NOFAIL|
+                               BTREE_INSERT_NOCHECK_RW|
+                               BTREE_INSERT_JOURNAL_RECLAIM|
+                               BTREE_INSERT_JOURNAL_RESERVED|
+                               BTREE_INSERT_NOUNLOCK);
+       if (ret)
+               goto err;
 
-                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
-                       BUG_ON(ret);
-                       mutex_unlock(&c->btree_cache.lock);
-               }
+       bch2_btree_node_lock_write(b, iter);
 
-               btree_update_updated_root(as, b);
-               bch2_btree_node_unlock_write(b, iter);
+       if (new_hash) {
+               mutex_lock(&c->btree_cache.lock);
+               bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+               bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+               bkey_copy(&b->key, new_key);
+               ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+               BUG_ON(ret);
+               mutex_unlock(&c->btree_cache.lock);
+       } else {
+               bkey_copy(&b->key, new_key);
        }
 
-       bch2_btree_update_done(as);
+       bch2_btree_node_unlock_write(b, iter);
+out:
+       bch2_trans_iter_put(trans, iter2);
+       return ret;
+err:
+       if (new_hash) {
+               mutex_lock(&c->btree_cache.lock);
+               bch2_btree_node_hash_remove(&c->btree_cache, b);
+               mutex_unlock(&c->btree_cache.lock);
+       }
+       goto out;
 }
 
-int bch2_btree_node_update_key(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct btree *b,
-                              struct bkey_i *new_key)
+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
+                              struct btree *b, struct bkey_i *new_key,
+                              bool skip_triggers)
 {
        struct bch_fs *c = trans->c;
-       struct btree *parent = btree_node_parent(iter, b);
-       struct btree_update *as = NULL;
        struct btree *new_hash = NULL;
        struct closure cl;
        int ret = 0;
@@ -1964,27 +2000,18 @@ int bch2_btree_node_update_key(struct btree_trans *trans,
        if (btree_ptr_hash_val(new_key) != b->hash_val) {
                ret = bch2_btree_cache_cannibalize_lock(c, &cl);
                if (ret) {
-                       bch2_trans_unlock(iter->trans);
+                       bch2_trans_unlock(trans);
                        closure_sync(&cl);
-                       if (!bch2_trans_relock(iter->trans))
+                       if (!bch2_trans_relock(trans))
                                return -EINTR;
                }
 
                new_hash = bch2_btree_node_mem_alloc(c);
        }
 
-       as = bch2_btree_update_start(iter, b->c.level,
-               parent ? btree_update_reserve_required(c, parent) : 0,
-               BTREE_INSERT_NOFAIL);
-       if (IS_ERR(as)) {
-               ret = PTR_ERR(as);
-               goto err;
-       }
-
-       __bch2_btree_node_update_key(as, trans, iter, b, new_hash, new_key);
+       ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
+                                          new_key, skip_triggers);
 
-       bch2_btree_iter_downgrade(iter);
-err:
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
                list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1998,6 +2025,35 @@ err:
        return ret;
 }
 
+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
+                                       struct btree *b, struct bkey_i *new_key,
+                                       bool skip_triggers)
+{
+       struct btree_iter *iter;
+       int ret;
+
+       iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p,
+                                       BTREE_MAX_DEPTH, b->c.level,
+                                       BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               goto out;
+
+       /* has node been freed? */
+       if (iter->l[b->c.level].b != b) {
+               /* node has been freed: */
+               BUG_ON(!btree_node_dying(b));
+               goto out;
+       }
+
+       BUG_ON(!btree_node_hashed(b));
+
+       ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers);
+out:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
+}
+
 /* Init code: */
 
 /*
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index c593c8d712c57a3a8351b32db5ce4ac28cfdeecb..c9de49286fb7ab46c4c04a9ffbd733544fa75594 100644
@@ -910,7 +910,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
        unsigned u64s, reset_flags = 0;
        int ret = 0;
 
-       if (!trans->nr_updates)
+       if (!trans->nr_updates &&
+           !trans->extra_journal_entry_u64s)
                goto out_reset;
 
        if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 148808bdea508950349009782cd4d0909130ebc9..50361f2fb8f1d1c45ea13936dbfeae9d490ba388 100644
@@ -95,7 +95,8 @@ struct bch_write_bio {
                                bounce:1,
                                put_bio:1,
                                have_ioref:1,
-                               used_mempool:1;
+                               used_mempool:1,
+                               first_btree_write:1;
        );
 
        struct bio              bio;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index aacd6385db1f2e511a0ead236ab6f04bc2052b0d..1f65eca48c6ef48d20c033d119a04f0215f7a607 100644
@@ -139,7 +139,7 @@ retry:
                                break;
                        }
 
-                       ret = bch2_btree_node_update_key(&trans, iter, b, k.k);
+                       ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false);
                        if (ret == -EINTR) {
                                b = bch2_btree_iter_peek_node(iter);
                                ret = 0;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index c6fa4ca31ae9163581713af436f68a20da74259f..84e224fb0d019c2e2eb02ded749279b4c53860cc 100644
@@ -1005,6 +1005,11 @@ int bch2_fs_recovery(struct bch_fs *c)
                c->opts.fix_errors      = FSCK_OPT_YES;
        }
 
+       if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
+               bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
+               c->opts.version_upgrade = true;
+       }
+
        ret = bch2_blacklist_table_initialize(c);
        if (ret) {
                bch_err(c, "error initializing blacklist table");
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 619cfdcd293441d5536aa20aa5905fc1796cc6e9..11557a863d3dbbb473d9639ef3e9027eecbcdf9e 100644
@@ -514,8 +514,8 @@ static void __bch2_fs_free(struct bch_fs *c)
                destroy_workqueue(c->io_complete_wq );
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
-       if (c->btree_error_wq)
-               destroy_workqueue(c->btree_error_wq);
+       if (c->btree_io_complete_wq)
+               destroy_workqueue(c->btree_io_complete_wq);
        if (c->btree_update_wq)
                destroy_workqueue(c->btree_update_wq);
 
@@ -567,7 +567,6 @@ void __bch2_fs_stop(struct bch_fs *c)
        for_each_member_device(ca, c, i)
                cancel_work_sync(&ca->io_error_work);
 
-       cancel_work_sync(&c->btree_write_error_work);
        cancel_work_sync(&c->read_only_work);
 }
 
@@ -696,9 +695,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        mutex_init(&c->bio_bounce_pages_lock);
 
-       bio_list_init(&c->btree_write_error_list);
        spin_lock_init(&c->btree_write_error_lock);
-       INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
 
        INIT_WORK(&c->journal_seq_blacklist_gc_work,
                  bch2_blacklist_entries_gc);
@@ -768,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
-           !(c->btree_error_wq = alloc_workqueue("bcachefs_error",
+           !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||