struct btrfs_path *p,
                                                        int write_lock_level)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int root_lock = 0;
        int level = 0;
 
        if (p->search_commit_root) {
-               /*
-                * The commit roots are read only so we always do read locks,
-                * and we always must hold the commit_root_sem when doing
-                * searches on them, the only exception is send where we don't
-                * want to block transaction commits for a long time, so
-                * we need to clone the commit root in order to avoid races
-                * with transaction commits that create a snapshot of one of
-                * the roots used by a send operation.
-                */
-               if (p->need_commit_sem) {
-                       down_read(&fs_info->commit_root_sem);
-                       b = btrfs_clone_extent_buffer(root->commit_root);
-                       up_read(&fs_info->commit_root_sem);
-                       if (!b)
-                               return ERR_PTR(-ENOMEM);
-
-               } else {
-                       b = root->commit_root;
-                       atomic_inc(&b->refs);
-               }
+               b = root->commit_root;
+               atomic_inc(&b->refs);
                level = btrfs_header_level(b);
                /*
                 * Ensure that all callers have set skip_locking when
        return b;
 }
 
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+       const int i = path->lowest_level;
+       const int slot = path->slots[i];
+       struct extent_buffer *lowest = path->nodes[i];
+       struct extent_buffer *clone;
+
+       ASSERT(path->need_commit_sem);
+
+       if (!lowest)
+               return 0;
+
+       lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+       clone = btrfs_clone_extent_buffer(lowest);
+       if (!clone)
+               return -ENOMEM;
+
+       btrfs_release_path(path);
+       path->nodes[i] = clone;
+       path->slots[i] = slot;
+
+       return 0;
+}
 
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
                      const struct btrfs_key *key, struct btrfs_path *p,
                      int ins_len, int cow)
 {
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int slot;
        int ret;
 
        min_write_lock_level = write_lock_level;
 
+       if (p->need_commit_sem) {
+               ASSERT(p->search_commit_root);
+               down_read(&fs_info->commit_root_sem);
+       }
+
 again:
        prev_cmp = -1;
        b = btrfs_search_slot_get_root(root, p, write_lock_level);
 done:
        if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);
+
+       if (p->need_commit_sem) {
+               int ret2;
+
+               ret2 = finish_need_commit_sem_search(p);
+               up_read(&fs_info->commit_root_sem);
+               if (ret2)
+                       ret = ret2;
+       }
+
        return ret;
 }
 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
        int level;
        struct extent_buffer *c;
        struct extent_buffer *next;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
+       bool need_commit_sem = false;
        u32 nritems;
        int ret;
        int i;
 
        path->keep_locks = 1;
 
-       if (time_seq)
+       if (time_seq) {
                ret = btrfs_search_old_slot(root, &key, path, time_seq);
-       else
+       } else {
+               if (path->need_commit_sem) {
+                       path->need_commit_sem = 0;
+                       need_commit_sem = true;
+                       down_read(&fs_info->commit_root_sem);
+               }
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       }
        path->keep_locks = 0;
 
        if (ret < 0)
-               return ret;
+               goto done;
 
        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
        ret = 0;
 done:
        unlock_up(path, 0, 1, 0, NULL);
+       if (need_commit_sem) {
+               int ret2;
+
+               path->need_commit_sem = 1;
+               ret2 = finish_need_commit_sem_search(path);
+               up_read(&fs_info->commit_root_sem);
+               if (ret2)
+                       ret = ret2;
+       }
 
        return ret;
 }
 
 #include "transaction.h"
 #include "compression.h"
 #include "xattr.h"
+#include "print-tree.h"
 
 /*
  * Maximum number of references an extent can have in order for us to attempt to
        struct btrfs_path *right_path;
        struct btrfs_key *cmp_key;
 
+       /*
+        * Keep track of the generation of the last transaction that was used
+        * for relocating a block group. This is periodically checked in order
+        * to detect if a relocation happened since the last check, so that we
+        * don't operate on stale extent buffers for nodes (level >= 1) or on
+        * stale disk_bytenr values of file extent items.
+        */
+       u64 last_reloc_trans;
+
        /*
         * infos of the currently processed inode. In case of deleted inodes,
         * these are the values from the deleted inode.
        if (ret < 0)
                goto out;
 
+       down_read(&fs_info->commit_root_sem);
+       if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+               /*
+                * A transaction commit for a transaction in which block group
+                * relocation was done just happened.
+                * The disk_bytenr of the file extent item we processed is
+                * possibly stale, referring to the extent's location before
+                * relocation. So act as if we haven't found any clone sources
+                * and fallback to write commands, which will read the correct
+                * data from the new extent location. Otherwise we will fail
+                * below because we haven't found our own back reference or we
+                * could be getting incorrect sources in case the old extent
+                * was already reallocated after the relocation.
+                */
+               up_read(&fs_info->commit_root_sem);
+               ret = -ENOENT;
+               goto out;
+       }
+       up_read(&fs_info->commit_root_sem);
+
        if (!backref_ctx.found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
 {
        int ret = 0;
 
+       /*
+        * We can not hold the commit root semaphore here. This is because in
+        * the case of sending and receiving to the same filesystem, using a
+        * pipe, could result in a deadlock:
+        *
+        * 1) The task running send blocks on the pipe because it's full;
+        *
+        * 2) The task running receive, which is the only consumer of the pipe,
+        *    is waiting for a transaction commit (for example due to a space
+        *    reservation when doing a write or triggering a transaction commit
+        *    when creating a subvolume);
+        *
+        * 3) The transaction is waiting to write lock the commit root semaphore,
+        *    but can not acquire it since it's being held at 1).
+        *
+        * Down this call chain we write to the pipe through kernel_write().
+        * The same type of problem can also happen when sending to a file that
+        * is stored in the same filesystem - when reserving space for a write
+        * into the file, we can trigger a transaction commit.
+        *
+        * Our caller has supplied us with clones of leaves from the send and
+        * parent roots, so we're safe here from a concurrent relocation and
+        * further reallocation of metadata extents while we are here. Below we
+        * also assert that the leaves are clones.
+        */
+       lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+       /*
+        * We always have a send root, so left_path is never NULL. We will not
+        * have a leaf when we have reached the end of the send root but have
+        * not yet reached the end of the parent root.
+        */
+       if (left_path->nodes[0])
+               ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+                               &left_path->nodes[0]->bflags));
+       /*
+        * When doing a full send we don't have a parent root, so right_path is
+        * NULL. When doing an incremental send, we may have reached the end of
+        * the parent root already, so we don't have a leaf at right_path.
+        */
+       if (right_path && right_path->nodes[0])
+               ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+                               &right_path->nodes[0]->bflags));
+
        if (result == BTRFS_COMPARE_TREE_SAME) {
                if (key->type == BTRFS_INODE_REF_KEY ||
                    key->type == BTRFS_INODE_EXTREF_KEY) {
        return ret;
 }
 
+static int search_key_again(const struct send_ctx *sctx,
+                           struct btrfs_root *root,
+                           struct btrfs_path *path,
+                           const struct btrfs_key *key)
+{
+       int ret;
+
+       if (!path->need_commit_sem)
+               lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+       /*
+        * Roots used for send operations are readonly and no one can add,
+        * update or remove keys from them, so we should be able to find our
+        * key again. The only exception is deduplication, which can operate on
+        * readonly roots and add, update or remove keys to/from them - but at
+        * the moment we don't allow it to run in parallel with send.
+        */
+       ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+       ASSERT(ret <= 0);
+       if (ret > 0) {
+               btrfs_print_tree(path->nodes[path->lowest_level], false);
+               btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+                         key->objectid, key->type, key->offset,
+                         (root == sctx->parent_root ? "parent" : "send"),
+                         root->root_key.objectid, path->lowest_level,
+                         path->slots[path->lowest_level]);
+               return -EUCLEAN;
+       }
+
+       return ret;
+}
+
 static int full_send_tree(struct send_ctx *sctx)
 {
        int ret;
        struct btrfs_root *send_root = sctx->send_root;
        struct btrfs_key key;
+       struct btrfs_fs_info *fs_info = send_root->fs_info;
        struct btrfs_path *path;
-       struct extent_buffer *eb;
-       int slot;
 
        path = alloc_path_for_send();
        if (!path)
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
 
+       down_read(&fs_info->commit_root_sem);
+       sctx->last_reloc_trans = fs_info->last_reloc_trans;
+       up_read(&fs_info->commit_root_sem);
+
        ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
        if (ret < 0)
                goto out;
                goto out_finish;
 
        while (1) {
-               eb = path->nodes[0];
-               slot = path->slots[0];
-               btrfs_item_key_to_cpu(eb, &key, slot);
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 
                ret = changed_cb(path, NULL, &key,
                                 BTRFS_COMPARE_TREE_NEW, sctx);
                if (ret < 0)
                        goto out;
 
+               down_read(&fs_info->commit_root_sem);
+               if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+                       sctx->last_reloc_trans = fs_info->last_reloc_trans;
+                       up_read(&fs_info->commit_root_sem);
+                       /*
+                        * A transaction used for relocating a block group was
+                        * committed or is about to finish its commit. Release
+                        * our path (leaf) and restart the search, so that we
+                        * avoid operating on any file extent items that are
+                        * stale, with a disk_bytenr that reflects a pre
+                        * relocation value. This way we avoid as much as
+                        * possible to fallback to regular writes when checking
+                        * if we can clone file ranges.
+                        */
+                       btrfs_release_path(path);
+                       ret = search_key_again(sctx, send_root, path, &key);
+                       if (ret < 0)
+                               goto out;
+               } else {
+                       up_read(&fs_info->commit_root_sem);
+               }
+
                ret = btrfs_next_item(send_root, path);
                if (ret < 0)
                        goto out;
        return ret;
 }
 
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+       struct extent_buffer *clone;
+
+       clone = btrfs_clone_extent_buffer(path->nodes[level]);
+       if (!clone)
+               return -ENOMEM;
+
+       free_extent_buffer(path->nodes[level]);
+       path->nodes[level] = clone;
+
+       return 0;
+}
+
 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
 {
        struct extent_buffer *eb;
        u64 reada_max;
        u64 reada_done = 0;
 
+       lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+
        BUG_ON(*level == 0);
        eb = btrfs_read_node_slot(parent, slot);
        if (IS_ERR(eb))
        path->nodes[*level - 1] = eb;
        path->slots[*level - 1] = 0;
        (*level)--;
+
+       if (*level == 0)
+               return replace_node_with_clone(path, 0);
+
        return 0;
 }
 
        path->slots[*level]++;
 
        while (path->slots[*level] >= nritems) {
-               if (*level == root_level)
+               if (*level == root_level) {
+                       path->slots[*level] = nritems - 1;
                        return -1;
+               }
 
                /* move upnext */
                path->slots[*level] = 0;
        } else {
                ret = tree_move_down(path, level, reada_min_gen);
        }
-       if (ret >= 0) {
-               if (*level == 0)
-                       btrfs_item_key_to_cpu(path->nodes[*level], key,
-                                       path->slots[*level]);
-               else
-                       btrfs_node_key_to_cpu(path->nodes[*level], key,
-                                       path->slots[*level]);
-       }
+
+       /*
+        * Even if we have reached the end of a tree, ret is -1, update the key
+        * anyway, so that in case we need to restart due to a block group
+        * relocation, we can assert that the last key of the root node still
+        * exists in the tree.
+        */
+       if (*level == 0)
+               btrfs_item_key_to_cpu(path->nodes[*level], key,
+                                     path->slots[*level]);
+       else
+               btrfs_node_key_to_cpu(path->nodes[*level], key,
+                                     path->slots[*level]);
+
        return ret;
 }
 
        return 0;
 }
 
+/*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ *    any locks on them, which does not prevent them from having been relocated
+ *    and reallocated after the last time we released the commit root semaphore.
+ *    The exception are the root nodes, for which we always have a clone, see
+ *    the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ *    we are safe from the concurrent relocation and reallocation. However they
+ *    can have file extent items with a pre relocation disk_bytenr value, so we
+ *    restart the start from the current commit roots and clone the new leaves so
+ *    that we get the post relocation disk_bytenr values. Not doing so, could
+ *    make us clone the wrong data in case there are new extents using the old
+ *    disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+                                   struct btrfs_path *right_path,
+                                   const struct btrfs_key *left_key,
+                                   const struct btrfs_key *right_key,
+                                   int left_level,
+                                   int right_level,
+                                   const struct send_ctx *sctx)
+{
+       int root_level;
+       int ret;
+
+       lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+       btrfs_release_path(left_path);
+       btrfs_release_path(right_path);
+
+       /*
+        * Since keys can not be added or removed to/from our roots because they
+        * are readonly and we do not allow deduplication to run in parallel
+        * (which can add, remove or change keys), the layout of the trees should
+        * not change.
+        */
+       left_path->lowest_level = left_level;
+       ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+       if (ret < 0)
+               return ret;
+
+       right_path->lowest_level = right_level;
+       ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If the lowest level nodes are leaves, clone them so that they can be
+        * safely used by changed_cb() while not under the protection of the
+        * commit root semaphore, even if relocation and reallocation happens in
+        * parallel.
+        */
+       if (left_level == 0) {
+               ret = replace_node_with_clone(left_path, 0);
+               if (ret < 0)
+                       return ret;
+       }
+
+       if (right_level == 0) {
+               ret = replace_node_with_clone(right_path, 0);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /*
+        * Now clone the root nodes (unless they happen to be the leaves we have
+        * already cloned). This is to protect against concurrent snapshotting of
+        * the send and parent roots (see the comment at btrfs_compare_trees()).
+        */
+       root_level = btrfs_header_level(sctx->send_root->commit_root);
+       if (root_level > 0) {
+               ret = replace_node_with_clone(left_path, root_level);
+               if (ret < 0)
+                       return ret;
+       }
+
+       root_level = btrfs_header_level(sctx->parent_root->commit_root);
+       if (root_level > 0) {
+               ret = replace_node_with_clone(right_path, root_level);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
 /*
  * This function compares two trees and calls the provided callback for
  * every changed/new/deleted item it finds.
        int right_root_level;
        int left_level;
        int right_level;
-       int left_end_reached;
-       int right_end_reached;
-       int advance_left;
-       int advance_right;
+       int left_end_reached = 0;
+       int right_end_reached = 0;
+       int advance_left = 0;
+       int advance_right = 0;
        u64 left_blockptr;
        u64 right_blockptr;
        u64 left_gen;
        down_read(&fs_info->commit_root_sem);
        left_level = btrfs_header_level(left_root->commit_root);
        left_root_level = left_level;
+       /*
+        * We clone the root node of the send and parent roots to prevent races
+        * with snapshot creation of these roots. Snapshot creation COWs the
+        * root node of a tree, so after the transaction is committed the old
+        * extent can be reallocated while this send operation is still ongoing.
+        * So we clone them, under the commit root semaphore, to be race free.
+        */
        left_path->nodes[left_level] =
                        btrfs_clone_extent_buffer(left_root->commit_root);
        if (!left_path->nodes[left_level]) {
-               up_read(&fs_info->commit_root_sem);
                ret = -ENOMEM;
-               goto out;
+               goto out_unlock;
        }
 
        right_level = btrfs_header_level(right_root->commit_root);
        right_path->nodes[right_level] =
                        btrfs_clone_extent_buffer(right_root->commit_root);
        if (!right_path->nodes[right_level]) {
-               up_read(&fs_info->commit_root_sem);
                ret = -ENOMEM;
-               goto out;
+               goto out_unlock;
        }
        /*
         * Our right root is the parent root, while the left root is the "send"
         * will need to read them at some point.
         */
        reada_min_gen = btrfs_header_generation(right_root->commit_root);
-       up_read(&fs_info->commit_root_sem);
 
        if (left_level == 0)
                btrfs_item_key_to_cpu(left_path->nodes[left_level],
                btrfs_node_key_to_cpu(right_path->nodes[right_level],
                                &right_key, right_path->slots[right_level]);
 
-       left_end_reached = right_end_reached = 0;
-       advance_left = advance_right = 0;
+       sctx->last_reloc_trans = fs_info->last_reloc_trans;
 
        while (1) {
-               cond_resched();
+               if (need_resched() ||
+                   rwsem_is_contended(&fs_info->commit_root_sem)) {
+                       up_read(&fs_info->commit_root_sem);
+                       cond_resched();
+                       down_read(&fs_info->commit_root_sem);
+               }
+
+               if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+                       ret = restart_after_relocation(left_path, right_path,
+                                                      &left_key, &right_key,
+                                                      left_level, right_level,
+                                                      sctx);
+                       if (ret < 0)
+                               goto out_unlock;
+                       sctx->last_reloc_trans = fs_info->last_reloc_trans;
+               }
+
                if (advance_left && !left_end_reached) {
                        ret = tree_advance(left_path, &left_level,
                                        left_root_level,
                        if (ret == -1)
                                left_end_reached = ADVANCE;
                        else if (ret < 0)
-                               goto out;
+                               goto out_unlock;
                        advance_left = 0;
                }
                if (advance_right && !right_end_reached) {
                        if (ret == -1)
                                right_end_reached = ADVANCE;
                        else if (ret < 0)
-                               goto out;
+                               goto out_unlock;
                        advance_right = 0;
                }
 
                if (left_end_reached && right_end_reached) {
                        ret = 0;
-                       goto out;
+                       goto out_unlock;
                } else if (left_end_reached) {
                        if (right_level == 0) {
+                               up_read(&fs_info->commit_root_sem);
                                ret = changed_cb(left_path, right_path,
                                                &right_key,
                                                BTRFS_COMPARE_TREE_DELETED,
                                                sctx);
                                if (ret < 0)
                                        goto out;
+                               down_read(&fs_info->commit_root_sem);
                        }
                        advance_right = ADVANCE;
                        continue;
                } else if (right_end_reached) {
                        if (left_level == 0) {
+                               up_read(&fs_info->commit_root_sem);
                                ret = changed_cb(left_path, right_path,
                                                &left_key,
                                                BTRFS_COMPARE_TREE_NEW,
                                                sctx);
                                if (ret < 0)
                                        goto out;
+                               down_read(&fs_info->commit_root_sem);
                        }
                        advance_left = ADVANCE;
                        continue;
                }
 
                if (left_level == 0 && right_level == 0) {
+                       up_read(&fs_info->commit_root_sem);
                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
                        if (cmp < 0) {
                                ret = changed_cb(left_path, right_path,
                                                &left_key,
                                                BTRFS_COMPARE_TREE_NEW,
                                                sctx);
-                               if (ret < 0)
-                                       goto out;
                                advance_left = ADVANCE;
                        } else if (cmp > 0) {
                                ret = changed_cb(left_path, right_path,
                                                &right_key,
                                                BTRFS_COMPARE_TREE_DELETED,
                                                sctx);
-                               if (ret < 0)
-                                       goto out;
                                advance_right = ADVANCE;
                        } else {
                                enum btrfs_compare_tree_result result;
                                        result = BTRFS_COMPARE_TREE_SAME;
                                ret = changed_cb(left_path, right_path,
                                                 &left_key, result, sctx);
-                               if (ret < 0)
-                                       goto out;
                                advance_left = ADVANCE;
                                advance_right = ADVANCE;
                        }
+
+                       if (ret < 0)
+                               goto out;
+                       down_read(&fs_info->commit_root_sem);
                } else if (left_level == right_level) {
                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
                        if (cmp < 0) {
                }
        }
 
+out_unlock:
+       up_read(&fs_info->commit_root_sem);
 out:
        btrfs_free_path(left_path);
        btrfs_free_path(right_path);
        if (ret)
                goto out;
 
-       spin_lock(&fs_info->send_reloc_lock);
-       if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
-               spin_unlock(&fs_info->send_reloc_lock);
-               btrfs_warn_rl(fs_info,
-               "cannot run send because a relocation operation is in progress");
-               ret = -EAGAIN;
-               goto out;
-       }
-       fs_info->send_in_progress++;
-       spin_unlock(&fs_info->send_reloc_lock);
-
        ret = send_subvol(sctx);
-       spin_lock(&fs_info->send_reloc_lock);
-       fs_info->send_in_progress--;
-       spin_unlock(&fs_info->send_reloc_lock);
        if (ret < 0)
                goto out;