}
 
 /*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
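+ * Remap the extent described by dmap into the file at dmap->br_startoff.
+ * Whatever is currently mapped there (a real extent or a delalloc
+ * reservation) is removed first.  If dmap is a written extent, its refcount
+ * is increased and it is mapped into the file; otherwise nothing is mapped in
+ * its place.  The dmap blockcount will be set to the number of blocks that
+ * were actually remapped.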
  */
 STATIC int
 xfs_reflink_remap_extent(
        struct xfs_inode        *ip,
-       struct xfs_bmbt_irec    *irec,
-       xfs_fileoff_t           destoff,
+       struct xfs_bmbt_irec    *dmap,
        xfs_off_t               new_isize)
 {
+       struct xfs_bmbt_irec    smap;
        struct xfs_mount        *mp = ip->i_mount;
-       bool                    real_extent = xfs_bmap_is_written_extent(irec);
        struct xfs_trans        *tp;
-       unsigned int            resblks;
-       struct xfs_bmbt_irec    uirec;
-       xfs_filblks_t           rlen;
-       xfs_filblks_t           unmap_len;
        xfs_off_t               newlen;
-       int64_t                 qres;
+       int64_t                 qres, qdelta;
+       unsigned int            resblks;
+       bool                    smap_real;
+       bool                    dmap_written = xfs_bmap_is_written_extent(dmap);
+       int                     nimaps;
        int                     error;
 
-       unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
-       trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
-       /* No reflinking if we're low on space */
-       if (real_extent) {
-               error = xfs_reflink_ag_has_free_space(mp,
-                               XFS_FSB_TO_AGNO(mp, irec->br_startblock));
-               if (error)
-                       goto out;
-       }
-
        /* Start a rolling transaction to switch the mappings */
-       resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+       resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
        if (error)
                goto out;
        xfs_trans_ijoin(tp, ip, 0);
 
        /*
-        * Reserve quota for this operation.  We don't know if the first unmap
-        * in the dest file will cause a bmap btree split, so we always reserve
-        * at least enough blocks for that split.  If the extent being mapped
-        * in is written, we need to reserve quota for that too.
+        * Read what's currently mapped in the destination file into smap.
+        * If smap isn't a hole, we will have to remove it before we can add
+        * dmap to the destination file.
         */
-       qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-       if (real_extent)
-               qres += irec->br_blockcount;
-       error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
-                       XFS_QMOPT_RES_REGBLKS);
+       nimaps = 1;
+       error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+                       &smap, &nimaps, 0);
        if (error)
                goto out_cancel;
+       ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+       smap_real = xfs_bmap_is_real_extent(&smap);
 
-       trace_xfs_reflink_remap(ip, irec->br_startoff,
-                               irec->br_blockcount, irec->br_startblock);
+       /*
+        * We can only remap as many blocks as the smaller of the two extent
+        * maps, because we can only remap one extent at a time.
+        */
+       dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+       ASSERT(dmap->br_blockcount == smap.br_blockcount);
 
-       /* Unmap the old blocks in the data fork. */
-       rlen = unmap_len;
-       while (rlen) {
-               ASSERT(tp->t_firstblock == NULLFSBLOCK);
-               error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+       trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+       /* No reflinking if the AG holding dmap's blocks is low on space. */
+       if (dmap_written) {
+               error = xfs_reflink_ag_has_free_space(mp,
+                               XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
                if (error)
                        goto out_cancel;
+       }
+
+       /*
+        * Reserve quota up front to cover the worst-case increase in this
+        * file's quota block count.
+        *
+        * We start by reserving enough blocks to handle a bmbt split.
+        *
+        * If we are mapping a written extent into the file, we need to have
+        * enough quota block count reservation to handle the blocks in that
+        * extent.
+        *
+        * Note that if we're replacing a delalloc reservation with a written
+        * extent, we have to take the full quota reservation because removing
+        * the delalloc reservation gives the block count back to the quota
+        * count.  This is suboptimal, but the VFS flushed the dest range
+        * before we started.  That should have removed all the delalloc
+        * reservations, but we code defensively.
+        */
+       qdelta = 0;
+       qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+       if (dmap_written)
+               qres += dmap->br_blockcount;
+       error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+                       XFS_QMOPT_RES_REGBLKS);
+       if (error)
+               goto out_cancel;
 
+       if (smap_real) {
                /*
-                * Trim the extent to whatever got unmapped.
-                * Remember, bunmapi works backwards.
+                * If the extent we're unmapping is backed by storage (written
+                * or not), unmap the extent and drop its refcount.
                 */
-               uirec.br_startblock = irec->br_startblock + rlen;
-               uirec.br_startoff = irec->br_startoff + rlen;
-               uirec.br_blockcount = unmap_len - rlen;
-               uirec.br_state = irec->br_state;
-               unmap_len = rlen;
-
-               /* If this isn't a real mapping, we're done. */
-               if (!real_extent || uirec.br_blockcount == 0)
-                       goto next_extent;
-
-               trace_xfs_reflink_remap(ip, uirec.br_startoff,
-                               uirec.br_blockcount, uirec.br_startblock);
+               xfs_bmap_unmap_extent(tp, ip, &smap);
+               xfs_refcount_decrease_extent(tp, &smap);
+               qdelta -= smap.br_blockcount;
+       } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+               xfs_filblks_t   len = smap.br_blockcount;
 
-               /* Update the refcount tree */
-               xfs_refcount_increase_extent(tp, &uirec);
-
-               /* Map the new blocks into the data fork. */
-               xfs_bmap_map_extent(tp, ip, &uirec);
+               /*
+                * If the extent we're unmapping is a delalloc reservation,
+                * we can use the regular bunmapi function to release the
+                * incore state.  Dropping the delalloc reservation takes care
+                * of the quota reservation for us.
+                */
+               error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+               if (error)
+                       goto out_cancel;
+               ASSERT(len == 0);
+       }
 
-               /* Update quota accounting. */
-               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-                               uirec.br_blockcount);
+       /*
+        * If the extent we're sharing is backed by written storage, increase
+        * its refcount and map it into the file.
+        */
+       if (dmap_written) {
+               xfs_refcount_increase_extent(tp, dmap);
+               xfs_bmap_map_extent(tp, ip, dmap);
+               qdelta += dmap->br_blockcount;
+       }
 
-               /* Update dest isize if needed. */
-               newlen = XFS_FSB_TO_B(mp,
-                               uirec.br_startoff + uirec.br_blockcount);
-               newlen = min_t(xfs_off_t, newlen, new_isize);
-               if (newlen > i_size_read(VFS_I(ip))) {
-                       trace_xfs_reflink_update_inode_size(ip, newlen);
-                       i_size_write(VFS_I(ip), newlen);
-                       ip->i_d.di_size = newlen;
-                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               }
+       xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
 
-next_extent:
-               /* Process all the deferred stuff. */
-               error = xfs_defer_finish(&tp);
-               if (error)
-                       goto out_cancel;
+       /* Update dest isize if needed. */
+       newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+       newlen = min_t(xfs_off_t, newlen, new_isize);
+       if (newlen > i_size_read(VFS_I(ip))) {
+               trace_xfs_reflink_update_inode_size(ip, newlen);
+               i_size_write(VFS_I(ip), newlen);
+               ip->i_d.di_size = newlen;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        }
 
+       /* Commit everything and unlock. */
        error = xfs_trans_commit(tp);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               goto out;
-       return 0;
+       goto out_unlock;
 
 out_cancel:
        xfs_trans_cancel(tp);
+out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
-       trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+       if (error)
+               trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
        return error;
 }
 
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/*
+ * Remap a range of the source file into the destination file, one source
+ * mapping at a time.
+ */
 int
 xfs_reflink_remap_blocks(
        struct xfs_inode        *src,
        loff_t                  *remapped)
 {
        struct xfs_bmbt_irec    imap;
-       xfs_fileoff_t           srcoff;
-       xfs_fileoff_t           destoff;
+       struct xfs_mount        *mp = src->i_mount;
+       xfs_fileoff_t           srcoff = XFS_B_TO_FSBT(mp, pos_in);
+       xfs_fileoff_t           destoff = XFS_B_TO_FSBT(mp, pos_out);
        xfs_filblks_t           len;
-       xfs_filblks_t           range_len;
        xfs_filblks_t           remapped_len = 0;
        xfs_off_t               new_isize = pos_out + remap_len;
        int                     nimaps;
        int                     error = 0;
 
-       destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
-       srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
-       len = XFS_B_TO_FSB(src->i_mount, remap_len);
+       len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+                       XFS_MAX_FILEOFF);
 
-       /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
-       while (len) {
-               uint            lock_mode;
+       trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
 
-               trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
-                               dest, destoff);
+       while (len > 0) {
+               unsigned int    lock_mode;
 
                /* Read extent from the source file */
                nimaps = 1;
                xfs_iunlock(src, lock_mode);
                if (error)
                        break;
-               ASSERT(nimaps == 1);
-
-               trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
-                               &imap);
+               /*
+                * The caller supposedly flushed all dirty pages in the source
+                * file range, which means that writeback should have allocated
+                * or deleted all delalloc reservations in that range.  If we
+                * find one, that's a good sign that something is seriously
+                * wrong here.
+                */
+               ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+               if (imap.br_startblock == DELAYSTARTBLOCK) {
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       error = -EFSCORRUPTED;
+                       break;
+               }
 
-               /* Translate imap into the destination file. */
-               range_len = imap.br_startoff + imap.br_blockcount - srcoff;
-               imap.br_startoff += destoff - srcoff;
+               trace_xfs_reflink_remap_extent_src(src, &imap);
 
-               /* Clear dest from destoff to the end of imap and map it in. */
-               error = xfs_reflink_remap_extent(dest, &imap, destoff,
-                               new_isize);
+               /* Remap into the destination file at the given offset. */
+               imap.br_startoff = destoff;
+               error = xfs_reflink_remap_extent(dest, &imap, new_isize);
                if (error)
                        break;
 
                }
 
-               /* Advance drange/srange */
+               /* Advance past the blocks we just remapped. */
-               srcoff += range_len;
-               destoff += range_len;
-               len -= range_len;
-               remapped_len += range_len;
+               srcoff += imap.br_blockcount;
+               destoff += imap.br_blockcount;
+               len -= imap.br_blockcount;
+               remapped_len += imap.br_blockcount;
        }
 
        if (error)
 
 DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
 DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
 DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
-TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+TRACE_EVENT(xfs_reflink_remap_blocks,
        TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
                 xfs_filblks_t len, struct xfs_inode *dest,
                 xfs_fileoff_t doffset),
                  __entry->dest_ino,
                  __entry->dest_lblk)
 );
-TRACE_EVENT(xfs_reflink_punch_range,
-       TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
-                xfs_extlen_t len),
-       TP_ARGS(ip, lblk, len),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fileoff_t, lblk)
-               __field(xfs_extlen_t, len)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->lblk = lblk;
-               __entry->len = len;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->lblk,
-                 __entry->len)
-);
-TRACE_EVENT(xfs_reflink_remap,
-       TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
-                xfs_extlen_t len, xfs_fsblock_t new_pblk),
-       TP_ARGS(ip, lblk, len, new_pblk),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fileoff_t, lblk)
-               __field(xfs_extlen_t, len)
-               __field(xfs_fsblock_t, new_pblk)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->lblk = lblk;
-               __entry->len = len;
-               __entry->new_pblk = new_pblk;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->lblk,
-                 __entry->len,
-                 __entry->new_pblk)
-);
 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
 
 /* dedupe tracepoints */
 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);