* out of the pt_bo.
         */
        struct drm_suballoc_manager vm_update_sa;
+       /** @min_chunk_size: For dgfx, minimum chunk size for VRAM blits */
+       u64 min_chunk_size;
 };
 
 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
        if (err)
                return ERR_PTR(err);
 
+       if (IS_DGFX(xe)) {
+               if (xe_device_has_flat_ccs(xe))
+                       /* Minimum chunk size corresponds to 4K of CCS metadata */
+                       m->min_chunk_size = SZ_4K * SZ_64K /
+                               xe_device_ccs_bytes(xe, SZ_64K);
+               else
+                       /* Somewhat arbitrary, to avoid a huge number of blits */
+                       m->min_chunk_size = SZ_64K;
+               m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
+               drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
+                       (unsigned long long)m->min_chunk_size);
+       }
+
        return m;
 }
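A worked example of the flat-CCS branch above, as an illustrative standalone sketch rather than driver code: assuming xe_device_ccs_bytes(xe, SZ_64K) reports 256 bytes (a 1:256 metadata ratio; the real value is platform dependent), 4K of CCS metadata covers 1 MiB of main memory, which is already a power of two.

#include <stdio.h>

#define SZ_4K	0x1000ULL
#define SZ_64K	0x10000ULL

int main(void)
{
	/* Assumed 1:256 compression ratio, i.e. 256 CCS bytes per 64K. */
	unsigned long long ccs_bytes_per_64k = 256;
	/* Same formula as above: main memory covered by 4K of CCS metadata. */
	unsigned long long min_chunk = SZ_4K * SZ_64K / ccs_bytes_per_64k;

	/* Prints 0x100000 (1 MiB); roundup_pow_of_two() then leaves it as-is. */
	printf("min_chunk_size = 0x%llx\n", min_chunk);
	return 0;
}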
 
        return MAX_PREEMPTDISABLE_TRANSFER;
 }
 
-static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur)
+static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
 {
-       /*
-        * For VRAM we use identity mapped pages so we are limited to current
-        * cursor size. For system we program the pages ourselves so we have no
-        * such limitation.
-        */
-       return min_t(u64, max_mem_transfer_per_pass(xe),
-                    mem_type_is_vram(cur->mem_type) ? cur->size :
-                    cur->remaining);
+       struct xe_device *xe = tile_to_xe(m->tile);
+       u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
+
+       if (mem_type_is_vram(cur->mem_type)) {
+               /*
+                * For VRAM we want to blit in chunks whose sizes are aligned to
+                * min_chunk_size, so that the offset to the CCS metadata stays
+                * page-aligned. Only the last chunk may be smaller.
+                *
+                * Another constraint is that we need to limit the blit to
+                * the VRAM block size, unless size is smaller than
+                * min_chunk_size.
+                */
+               u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
+
+               size = min_t(u64, size, chunk);
+               if (size > m->min_chunk_size)
+                       size = round_down(size, m->min_chunk_size);
+       }
+
+       return size;
+}
+
+static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
+{
+       /* If the chunk is not fragmented, allow identity map. */
+       return cur->size >= size;
 }
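The interplay of the two helpers above can be exercised in isolation. A minimal sketch, not driver code, assuming min_chunk_size is 1M and the per-pass cap is the 8M MAX_PREEMPTDISABLE_TRANSFER; chunk_size() mirrors xe_migrate_res_sizes() for the VRAM case and allow_identity() mirrors xe_migrate_allow_identity(), with struct cursor standing in for struct xe_res_cursor:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define SZ_1M 0x100000ULL
#define SZ_8M 0x800000ULL		/* MAX_PREEMPTDISABLE_TRANSFER */

struct cursor {
	unsigned long long size;	/* contiguous bytes in the current block */
	unsigned long long remaining;	/* total bytes left to transfer */
};

static unsigned long long chunk_size(const struct cursor *cur,
				     unsigned long long min_chunk)
{
	unsigned long long size = cur->remaining < SZ_8M ? cur->remaining : SZ_8M;
	unsigned long long chunk = cur->size > min_chunk ? cur->size : min_chunk;

	if (chunk < size)
		size = chunk;
	if (size > min_chunk)
		size -= size % min_chunk;	/* round_down(size, min_chunk) */
	return size;
}

static bool allow_identity(unsigned long long size, const struct cursor *cur)
{
	return cur->size >= size;	/* chunk is not fragmented */
}

int main(void)
{
	/* A 1.5M contiguous block is trimmed to a 1M-aligned chunk ... */
	struct cursor a = { .size = 0x180000, .remaining = SZ_8M };
	/* ... while a 12K tail below min_chunk_size passes through as-is. */
	struct cursor b = { .size = 0x3000, .remaining = 0x3000 };

	assert(chunk_size(&a, SZ_1M) == SZ_1M && allow_identity(SZ_1M, &a));
	assert(chunk_size(&b, SZ_1M) == 0x3000 && allow_identity(0x3000, &b));
	printf("ok\n");
	return 0;
}

In both cases the chunk stays within the current block, so the identity path is taken; a block smaller than the rounded chunk would instead fall back to emitting PTEs.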
 
 static u32 pte_update_size(struct xe_migrate *m,
        u32 cmds = 0;
 
        *L0_pt = pt_ofs;
-       if (!is_vram) {
+       if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
+               /* Offset into identity map. */
+               *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
+                                             cur->start + vram_region_gpu_offset(res));
+               cmds += cmd_size;
+       } else {
                /* Clip L0 to available size */
                u64 size = min(*L0, (u64)avail_pts * SZ_2M);
                u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
 
                /* Each chunk has a single blit command */
                cmds += cmd_size;
-       } else {
-               /* Offset into identity map. */
-               *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
-                                             cur->start + vram_region_gpu_offset(res));
-               cmds += cmd_size;
        }
 
        return cmds;
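For context, the point of the reordered branches above is that an identity-mapped chunk costs no PTE writes at all. A rough, illustrative sketch of that accounting, not driver code: it ignores the MI_STORE_DATA_IMM header dwords the real pte_update_size() also counts, and the cmd_size of 10 dwords is an arbitrary example value.

#include <stdio.h>
#include <stdint.h>

#define XE_PAGE_SIZE	0x1000ULL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Identity chunks need only the blit; others also need 2 dwords per 4K PTE. */
static uint32_t pte_update_dwords(uint64_t chunk, int use_identity,
				  uint32_t cmd_size)
{
	uint32_t cmds = 0;

	if (!use_identity)
		cmds += 2 * DIV_ROUND_UP(chunk, XE_PAGE_SIZE);

	return cmds + cmd_size;	/* each chunk has a single blit command */
}

int main(void)
{
	printf("identity 1M chunk:   %u dwords\n", pte_update_dwords(0x100000, 1, 10));
	printf("fragmented 1M chunk: %u dwords\n", pte_update_dwords(0x100000, 0, 10));
	return 0;
}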
                     struct xe_bb *bb, u32 at_pt,
                     bool is_vram, bool is_comp_pte,
                     struct xe_res_cursor *cur,
-                    u32 size, struct xe_bo *bo)
+                    u32 size, struct ttm_resource *res)
 {
        struct xe_device *xe = tile_to_xe(m->tile);
-
+       struct xe_vm *vm = m->q->vm;
        u16 pat_index;
        u32 ptes;
        u64 ofs = at_pt * XE_PAGE_SIZE;
        else
                pat_index = xe->pat.idx[XE_CACHE_WB];
 
-       /*
-        * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
-        * we're only emitting VRAM PTEs during sanity tests, so when
-        * that's moved to a Kunit test, we should condition VRAM PTEs
-        * on running tests.
-        */
-
        ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
 
        while (ptes) {
 
                        addr = xe_res_dma(cur) & PAGE_MASK;
                        if (is_vram) {
-                               /* Is this a 64K PTE entry? */
-                               if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
-                                   !(cur_ofs & (16 * 8 - 1))) {
-                                       xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
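+                               /* VA and DMA address must match modulo 64K */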
+                               if (vm->flags & XE_VM_FLAG_64K) {
+                                       u64 va = cur_ofs * XE_PAGE_SIZE / 8;
+
+                                       xe_assert(xe, (va & (SZ_64K - 1)) ==
+                                                 (addr & (SZ_64K - 1)));
+
                                        flags |= XE_PTE_PS64;
                                }
 
-                               addr += vram_region_gpu_offset(bo->ttm.resource);
+                               addr += vram_region_gpu_offset(res);
                                devmem = true;
                        }
 
-                       addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
-                                                                addr, pat_index,
-                                                                0, devmem, flags);
+                       addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
+                                                          addr, pat_index,
+                                                          0, devmem, flags);
                        bb->cs[bb->len++] = lower_32_bits(addr);
                        bb->cs[bb->len++] = upper_32_bits(addr);
 
                bool usm = xe->info.has_usm;
                u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
 
-               src_L0 = xe_migrate_res_sizes(xe, &src_it);
-               dst_L0 = xe_migrate_res_sizes(xe, &dst_it);
+               src_L0 = xe_migrate_res_sizes(m, &src_it);
+               dst_L0 = xe_migrate_res_sizes(m, &dst_it);
 
                drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
                        pass++, src_L0, dst_L0);
                                                      &ccs_ofs, &ccs_pt, 0,
                                                      2 * avail_pts,
                                                      avail_pts);
+                       xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
                }
 
                /* Add copy commands size here */
                        goto err_sync;
                }
 
-               if (!src_is_vram)
-                       emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
-                                src_bo);
-               else
+               if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
                        xe_res_next(&src_it, src_L0);
-
-               if (!dst_is_vram)
-                       emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
-                                dst_bo);
                else
+                       emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
+                                src);
+
+               if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
                        xe_res_next(&dst_it, src_L0);
+               else
+                       emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
+                                dst);
 
                if (copy_system_ccs)
-                       emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo);
+                       emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
 
                bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
                update_idx = bb->len;
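The xe_assert() added on ccs_it.start above follows from the min_chunk_size choice. A small arithmetic sketch, again assuming a 1:256 CCS metadata ratio (illustrative only): every pass except the last advances the main-memory offset by a multiple of min_chunk_size, so the corresponding CCS offset advances in multiples of 4K and stays page-aligned.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_4K	0x1000ULL

int main(void)
{
	uint64_t ccs_ratio = 256;			/* assumed for the example */
	uint64_t min_chunk = SZ_4K * ccs_ratio;		/* 1M, as computed at init */
	uint64_t main_ofs = 0;
	int pass;

	for (pass = 0; pass < 16; pass++) {
		uint64_t ccs_ofs = main_ofs / ccs_ratio;

		assert((ccs_ofs & (SZ_4K - 1)) == 0);	/* page-aligned every pass */
		main_ofs += min_chunk;
	}
	printf("ok\n");
	return 0;
}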
                bool usm = xe->info.has_usm;
                u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
 
-               clear_L0 = xe_migrate_res_sizes(xe, &src_it);
+               clear_L0 = xe_migrate_res_sizes(m, &src_it);
 
                drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
 
 
                size -= clear_L0;
                /* Preemption is enabled again by the ring ops. */
-               if (!clear_vram) {
-                       emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
-                                bo);
-               } else {
+               if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
                        xe_res_next(&src_it, clear_L0);
-               }
+               else
+                       emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
+                                dst);
+
                bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
                update_idx = bb->len;