Btrfs: implement full reflink support for inline extents

author Filipe Manana <fdmanana@suse.com>

Fri, 28 Feb 2020 13:04:19 +0000 (13:04 +0000)

committer David Sterba <dsterba@suse.com>

Mon, 23 Mar 2020 16:01:54 +0000 (17:01 +0100)
author Filipe Manana <fdmanana@suse.com>
Fri, 28 Feb 2020 13:04:19 +0000 (13:04 +0000)
committer David Sterba <dsterba@suse.com>
Mon, 23 Mar 2020 16:01:54 +0000 (17:01 +0100)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c

index 829adbb508372440ea2b665694eab0032ec7f661..d1973141d3bb819ed90d7aaa43f67f470ef40e1e 100644 (file)
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -1,7 +1,10 @@
  // SPDX-License-Identifier: GPL-2.0
  
+#include <linux/blkdev.h>
  #include <linux/iversion.h>
+#include "compression.h"
  #include "ctree.h"
+#include "delalloc-space.h"
  #include "reflink.h"
  #include "transaction.h"
  
@@ -42,49 +45,131 @@ out:
         return ret;
  }
  
+static int copy_inline_to_page(struct inode *inode,
+                              const u64 file_offset,
+                              char *inline_data,
+                              const u64 size,
+                              const u64 datal,
+                              const u8 comp_type)
+{
+       const u64 block_size = btrfs_inode_sectorsize(inode);
+       const u64 range_end = file_offset + block_size - 1;
+       const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+       char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+       struct extent_changeset *data_reserved = NULL;
+       struct page *page = NULL;
+       int ret;
+
+       ASSERT(IS_ALIGNED(file_offset, block_size));
+
+       /*
+        * We have flushed and locked the ranges of the source and destination
+        * inodes, we also have locked the inodes, so we are safe to do a
+        * reservation here. Also we must not do the reservation while holding
+        * a transaction open, otherwise we would deadlock.
+        */
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+                                          block_size);
+       if (ret)
+               goto out;
+
+       page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
+                                  btrfs_alloc_write_mask(inode->i_mapping));
+       if (!page) {
+               ret = -ENOMEM;
+               goto out_unlock;
+       }
+
+       set_page_extent_mapped(page);
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+                        EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+                        0, 0, NULL);
+       ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+       if (ret)
+               goto out_unlock;
+
+       if (comp_type == BTRFS_COMPRESS_NONE) {
+               char *map;
+
+               map = kmap(page);
+               memcpy(map, data_start, datal);
+               flush_dcache_page(page);
+               kunmap(page);
+       } else {
+               ret = btrfs_decompress(comp_type, data_start, page, 0,
+                                      inline_size, datal);
+               if (ret)
+                       goto out_unlock;
+               flush_dcache_page(page);
+       }
+
+       /*
+        * If our inline data is smaller then the block/page size, then the
+        * remaining of the block/page is equivalent to zeroes. We had something
+        * like the following done:
+        *
+        * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+        * $ sync  # (or fsync)
+        * $ xfs_io -c "falloc 0 4K" file
+        * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+        *
+        * So what's in the range [500, 4095] corresponds to zeroes.
+        */
+       if (datal < block_size) {
+               char *map;
+
+               map = kmap(page);
+               memset(map + datal, 0, block_size - datal);
+               flush_dcache_page(page);
+               kunmap(page);
+       }
+
+       SetPageUptodate(page);
+       ClearPageChecked(page);
+       set_page_dirty(page);
+out_unlock:
+       if (page) {
+               unlock_page(page);
+               put_page(page);
+       }
+       if (ret)
+               btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+                                            block_size, true);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+out:
+       extent_changeset_free(data_reserved);
+
+       return ret;
+}
+
  /*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need turn the inline extent into a non-inline one (even
- * with NO_HOLES enabled). If we find our destination inode only has one inline
- * extent, just overwrite it with the source inline extent if its size is less
- * than the source extent's size, or we could copy the source inline extent's
- * data into the destination inode's inline extent if the later is greater then
- * the former.
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
   */
  static int clone_copy_inline_extent(struct inode *dst,
-                                   struct btrfs_trans_handle *trans,
                                     struct btrfs_path *path,
                                     struct btrfs_key *new_key,
                                     const u64 drop_start,
                                     const u64 datal,
                                     const u64 size,
-                                   const char *inline_data)
+                                   const u8 comp_type,
+                                   char *inline_data,
+                                   struct btrfs_trans_handle **trans_out)
  {
         struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
         struct btrfs_root *root = BTRFS_I(dst)->root;
         const u64 aligned_end = ALIGN(new_key->offset + datal,
                                       fs_info->sectorsize);
+       struct btrfs_trans_handle *trans = NULL;
         int ret;
         struct btrfs_key key;
  
-       if (new_key->offset > 0)
-               return -EOPNOTSUPP;
+       if (new_key->offset > 0) {
+               ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+                                         size, datal, comp_type);
+               goto out;
+       }
  
         key.objectid = btrfs_ino(BTRFS_I(dst));
         key.type = BTRFS_EXTENT_DATA_KEY;
@@ -103,72 +188,75 @@ static int clone_copy_inline_extent(struct inode *dst,
                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                 if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
                     key.type == BTRFS_EXTENT_DATA_KEY) {
+                       /*
+                        * There's an implicit hole at file offset 0, copy the
+                        * inline extent's data to the page.
+                        */
                         ASSERT(key.offset > 0);
-                       return -EOPNOTSUPP;
+                       ret = copy_inline_to_page(dst, new_key->offset,
+                                                 inline_data, size, datal,
+                                                 comp_type);
+                       goto out;
                 }
         } else if (i_size_read(dst) <= datal) {
                 struct btrfs_file_extent_item *ei;
-               u64 ext_len;
  
-               /*
-                * If the file size is <= datal, make sure there are no other
-                * extents following (can happen do to an fallocate call with
-                * the flag FALLOC_FL_KEEP_SIZE).
-                */
                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_file_extent_item);
                 /*
-                * If it's an inline extent, it can not have other extents
-                * following it.
+                * If it's an inline extent replace it with the source inline
+                * extent, otherwise copy the source inline extent data into
+                * the respective page at the destination inode.
                  */
                 if (btrfs_file_extent_type(path->nodes[0], ei) ==
                     BTRFS_FILE_EXTENT_INLINE)
                         goto copy_inline_extent;
  
-               ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
-               if (ext_len > aligned_end)
-                       return -EOPNOTSUPP;
-
-               ret = btrfs_next_item(root, path);
-               if (ret < 0) {
-                       return ret;
-               } else if (ret == 0) {
-                       btrfs_item_key_to_cpu(path->nodes[0], &key,
-                                             path->slots[0]);
-                       if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
-                           key.type == BTRFS_EXTENT_DATA_KEY)
-                               return -EOPNOTSUPP;
-               }
+               ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+                                         size, datal, comp_type);
+               goto out;
         }
  
  copy_inline_extent:
+       ret = 0;
         /*
          * We have no extent items, or we have an extent at offset 0 which may
          * or may not be inlined. All these cases are dealt the same way.
          */
         if (i_size_read(dst) > datal) {
                 /*
-                * If the destination inode has an inline extent.
-                * This would require copying the data from the source inline
-                * extent into the beginning of the destination's inline extent.
-                * But this is really complex, both extents can be compressed
-                * or just one of them, which would require decompressing and
-                * re-compressing data (which could increase the new compressed
-                * size, not allowing the compressed data to fit anymore in an
-                * inline extent).
-                * So just don't support this case for now (it should be rare,
-                * we are not really saving space when cloning inline extents).
+                * At the destination offset 0 we have either a hole, a regular
+                * extent or an inline extent larger then the one we want to
+                * clone. Deal with all these cases by copying the inline extent
+                * data into the respective page at the destination inode.
                  */
-               return -EOPNOTSUPP;
+               ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+                                          size, datal, comp_type);
+               goto out;
         }
  
         btrfs_release_path(path);
+       /*
+        * If we end up here it means were copy the inline extent into a leaf
+        * of the destination inode. We know we will drop or adjust at most one
+        * extent item in the destination root.
+        *
+        * 1 unit - adjusting old extent (we may have to split it)
+        * 1 unit - add new extent
+        * 1 unit - inode update
+        */
+       trans = btrfs_start_transaction(root, 3);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               trans = NULL;
+               goto out;
+       }
         ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
         if (ret)
-               return ret;
+               goto out;
         ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
         if (ret)
-               return ret;
+               goto out;
  
         write_extent_buffer(path->nodes[0], inline_data,
                             btrfs_item_ptr_offset(path->nodes[0],
@@ -176,8 +264,28 @@ copy_inline_extent:
                             size);
         inode_add_bytes(dst, datal);
         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+out:
+       if (!ret && !trans) {
+               /*
+                * No transaction here means we copied the inline extent into a
+                * page of the destination inode.
+                *
+                * 1 unit to update inode item
+                */
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       trans = NULL;
+               }
+       }
+       if (ret && trans) {
+               btrfs_abort_transaction(trans, ret);
+               btrfs_end_transaction(trans);
+       }
+       if (!ret)
+               *trans_out = trans;
  
-       return 0;
+       return ret;
  }
  
  /**
@@ -196,7 +304,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
                        const u64 destoff, int no_time_update)
  {
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_path *path = NULL;
         struct extent_buffer *leaf;
         struct btrfs_trans_handle *trans;
@@ -233,6 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
                 struct btrfs_key new_key;
                 u64 disko = 0, diskl = 0;
                 u64 datao = 0, datal = 0;
+               u8 comp;
                 u64 drop_start;
  
                 /* Note the key will change type as we walk through the tree */
@@ -275,6 +383,7 @@ process_slot:
  
                 extent = btrfs_item_ptr(leaf, slot,
                                         struct btrfs_file_extent_item);
+               comp = btrfs_file_extent_compression(leaf, extent);
                 type = btrfs_file_extent_type(leaf, extent);
                 if (type == BTRFS_FILE_EXTENT_REG ||
                     type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -369,29 +478,11 @@ process_slot:
                         if (key.offset != 0 || datal > fs_info->sectorsize)
                                 return -EUCLEAN;
  
-                       /*
-                        * If our extent is inline, we know we will drop or
-                        * adjust at most 1 extent item in the destination root.
-                        *
-                        * 1 - adjusting old extent (we may have to split it)
-                        * 1 - add new extent
-                        * 1 - inode update
-                        */
-                       trans = btrfs_start_transaction(root, 3);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                               goto out;
-                       }
-
-                       ret = clone_copy_inline_extent(inode, trans, path,
-                                                      &new_key, drop_start,
-                                                      datal, size, buf);
-                       if (ret) {
-                               if (ret != -EOPNOTSUPP)
-                                       btrfs_abort_transaction(trans, ret);
-                               btrfs_end_transaction(trans);
+                       ret = clone_copy_inline_extent(inode, path, &new_key,
+                                                      drop_start, datal, size,
+                                                      comp, buf, &trans);
+                       if (ret)
                                 goto out;
-                       }
                 }
  
                 btrfs_release_path(path);
@@ -526,6 +617,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
         struct inode *src = file_inode(file_src);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         int ret;
+       int wb_ret;
         u64 len = olen;
         u64 bs = fs_info->sb->s_blocksize;
  
@@ -566,6 +658,14 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
         btrfs_double_extent_lock(src, off, inode, destoff, len);
         ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
         btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+       /*
+        * We may have copied an inline extent into a page of the destination
+        * range, so wait for writeback to complete before truncating pages
+        * from the page cache. This is a rare case.
+        */
+       wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+       ret = ret ? ret : wb_ret;
         /*
          * Truncate page cache pages so that future reads will see the cloned
          * data immediately and not the previous data.
author	Filipe Manana <fdmanana@suse.com>
	Fri, 28 Feb 2020 13:04:19 +0000 (13:04 +0000)
committer	David Sterba <dsterba@suse.com>
	Mon, 23 Mar 2020 16:01:54 +0000 (17:01 +0100)