bcachefs: Check for stale dirty pointer before reads
author Kent Overstreet <kent.overstreet@gmail.com>
Tue, 15 Feb 2022 05:06:59 +0000 (00:06 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:24 +0000 (17:09 -0400)
We retry reads when we discover we read from a pointer that went stale.
If a dirty pointer is erroneously stale, this would cause us to loop
retrying that read forever - unless we check before issuing the read,
while the btree is still locked, since a dirty pointer should never be
stale.

This patch adds that check, along with printing some helpful debug info.
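
For context, the staleness test the new check relies on boils down to a
generation comparison: each bucket carries a small gen that is bumped when
the bucket is reused, and each extent pointer records the gen it was
written against. A minimal illustrative sketch (hypothetical helper name;
the real ptr_stale() may differ in detail):

    #include <linux/types.h>	/* u8, s8 */

    /*
     * Wrapping 8-bit gen comparison: true if the bucket has been reused
     * since this pointer was written, i.e. the pointer is stale.
     */
    static inline bool ptr_gen_is_stale(u8 bucket_gen, u8 ptr_gen)
    {
    	return (s8) (bucket_gen - ptr_gen) > 0;
    }

Cached pointers can legitimately go stale; a dirty pointer should never be
stale while the btree is still locked, which is what the new check
enforces.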

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/fs-io.c
fs/bcachefs/io.c
fs/bcachefs/move.c

diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 5fce958bafc90233224bec2c1a6629bdad297b3f..9161125aec177eec4848bb827159bfe0edbb660c 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -1043,8 +1043,6 @@ retry:
 
                sectors = min(sectors, k.k->size - offset_into_extent);
 
-               bch2_trans_unlock(trans);
-
                if (readpages_iter)
                        readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
                                            extent_partial_reads_expensive(k));
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 218934b4e19b32cf038712cd5b2595d5933da7d5..914e22c5c24789421357d0606305b7500e8d6a50 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -2032,6 +2032,33 @@ err:
        return ret;
 }
 
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+                                                  struct bkey_s_c k,
+                                                  struct bch_extent_ptr ptr)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+       struct btree_iter iter;
+       char buf[200];
+       int ret;
+
+       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+       bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf);
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+                            BTREE_ITER_CACHED);
+
+       ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+       if (ret)
+               return;
+
+       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+       bch_err(c, "%s", buf);
+       bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+       bch2_trans_iter_exit(trans, &iter);
+}
+
 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                       struct bvec_iter iter, struct bpos read_pos,
                       enum btree_id data_btree, struct bkey_s_c k,
@@ -2041,7 +2068,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
        struct bch_fs *c = trans->c;
        struct extent_ptr_decoded pick;
        struct bch_read_bio *rbio = NULL;
-       struct bch_dev *ca;
+       struct bch_dev *ca = NULL;
        struct promote_op *promote = NULL;
        bool bounce = false, read_full = false, narrow_crcs = false;
        struct bpos data_pos = bkey_start_pos(k.k);
@@ -2058,7 +2085,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                zero_fill_bio_iter(&orig->bio, iter);
                goto out_read_done;
        }
-
+retry_pick:
        pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
        /* hole or reservation - just zero fill: */
@@ -2071,8 +2098,27 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                goto err;
        }
 
-       if (pick_ret > 0)
-               ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+       /*
+        * Stale dirty pointers are treated as IO errors, but @failed isn't
+        * allocated unless we're in the retry path - so if we're not in the
+        * retry path, don't check here, it'll be caught in bch2_read_endio()
+        * and we'll end up in the retry path:
+        */
+       if ((flags & BCH_READ_IN_RETRY) &&
+           !pick.ptr.cached &&
+           unlikely(ptr_stale(ca, &pick.ptr))) {
+               read_from_stale_dirty_pointer(trans, k, pick.ptr);
+               bch2_mark_io_failure(failed, &pick);
+               goto retry_pick;
+       }
+
+       /*
+        * Unlock the iterator while the btree node's lock is still in
+        * cache, before doing the IO:
+        */
+       bch2_trans_unlock(trans);
 
        if (flags & BCH_READ_NODECODE) {
                /*
@@ -2367,12 +2413,6 @@ retry:
                 */
                sectors = min(sectors, k.k->size - offset_into_extent);
 
-               /*
-                * Unlock the iterator while the btree node's lock is still in
-                * cache, before doing the IO:
-                */
-               bch2_trans_unlock(&trans);
-
                bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
                swap(bvec_iter.bi_size, bytes);
 
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 04971bf847bf811e622552fcae023ca71388cca7..4751d79219cb247cb361511404eb296c7ccc16b6 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -752,10 +752,12 @@ static int __bch2_move_data(struct bch_fs *c,
                        BUG();
                }
 
-               /* unlock before doing IO: */
+               /*
+                * The iterator gets unlocked by __bch2_read_extent - need to
+                * save a copy of @k elsewhere:
+                 */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
-               bch2_trans_unlock(&trans);
 
                ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
                                        data_cmd, data_opts);