From 42d237320e9817a94f3a0a2de28156523596b086 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Tue, 16 Mar 2021 23:28:43 -0400
Subject: [PATCH] bcachefs: Snapshot creation, deletion

This is the final patch in the patch series implementing snapshots.
This patch implements two new ioctls that work like creation and
deletion of directories, but fancier.

 - BCH_IOCTL_SUBVOLUME_CREATE, for creating new subvolumes and snapshots
 - BCH_IOCTL_SUBVOLUME_DESTROY, for deleting subvolumes and snapshots

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 fs/bcachefs/dirent.c    |   8 --
 fs/bcachefs/dirent.h    |   4 -
 fs/bcachefs/fs-common.c | 182 ++++++++++++++++++++++++++++++++--------
 fs/bcachefs/fs-common.h |   7 +-
 fs/bcachefs/fs-ioctl.c  | 168 +++++++++++++++++++++++++++++++++++++
 fs/bcachefs/fs.c        |  29 ++++---
 fs/bcachefs/fs.h        |   3 +-
 fs/bcachefs/fsck.c      |   7 +-
 fs/bcachefs/recovery.c  |   2 +-
 fs/bcachefs/str_hash.h  |   7 +-
 10 files changed, 348 insertions(+), 69 deletions(-)

diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index f290580594ce3..8653a106809df 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -383,14 +383,6 @@ out:
 	return ret;
 }
 
-int bch2_dirent_delete_at(struct btree_trans *trans,
-			  const struct bch_hash_info *hash_info,
-			  struct btree_iter *iter)
-{
-	return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-				   hash_info, iter);
-}
-
 int __bch2_dirent_lookup_trans(struct btree_trans *trans,
 			       struct btree_iter *iter,
 			       subvol_inum dir,
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 88b784a99cb5d..e7f65fbd8e65f 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -33,10 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum,
 		       const struct bch_hash_info *, u8,
 		       const struct qstr *, u64, u64 *, int);
 
-int bch2_dirent_delete_at(struct btree_trans *,
-			  const struct bch_hash_info *,
-			  struct btree_iter *);
-
 int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent,
 			      u32 *, u32 *, u64 *, bool);
 
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 02bf32cc7659a..3e8e3c5bf8703 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -11,6 +11,11 @@
 
 #include <linux/posix_acl.h>
 
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+	return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
 int bch2_create_trans(struct btree_trans *trans,
 		      subvol_inum dir,
 		      struct bch_inode_unpacked *dir_u,
@@ -19,6 +24,7 @@ int bch2_create_trans(struct btree_trans *trans,
 		      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
 		      struct posix_acl *default_acl,
 		      struct posix_acl *acl,
+		      subvol_inum snapshot_src,
 		      unsigned flags)
 {
 	struct bch_fs *c = trans->c;
@@ -27,10 +33,9 @@ int bch2_create_trans(struct btree_trans *trans,
 	subvol_inum new_inum = dir;
 	u64 now = bch2_current_time(c);
 	u64 cpu = raw_smp_processor_id();
-	u64 dir_offset = 0;
 	u64 dir_target;
 	u32 snapshot;
-	unsigned dir_type;
+	unsigned dir_type = mode_to_type(mode);
 	int ret;
 
 	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
@@ -41,37 +46,122 @@ int bch2_create_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+	if (!(flags & BCH_CREATE_SNAPSHOT)) {
+		/* Normal create path - allocate a new inode: */
+		bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
 
-	if (!name)
-		new_inode->bi_flags |= BCH_INODE_UNLINKED;
+		if (flags & BCH_CREATE_TMPFILE)
+			new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-	ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
-	if (ret)
-		goto err;
+		ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+		if (ret)
+			goto err;
+
+		snapshot_src = (subvol_inum) { 0 };
+	} else {
+		/*
+		 * Creating a snapshot - we're not allocating a new inode, but
+		 * we do have to lookup the root inode of the subvolume we're
+		 * snapshotting and update it (in the new snapshot):
+		 */
+
+		if (!snapshot_src.inum) {
+			/* Inode wasn't specified, just snapshot: */
+			struct btree_iter subvol_iter;
+			struct bkey_s_c k;
+
+			bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes,
+					     POS(0, snapshot_src.subvol), 0);
+			k = bch2_btree_iter_peek_slot(&subvol_iter);
+
+			ret = bkey_err(k);
+			if (!ret && k.k->type != KEY_TYPE_subvolume) {
+				bch_err(c, "subvolume %u not found",
+					snapshot_src.subvol);
+				ret = -ENOENT;
+			}
+
+			if (!ret)
+				snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
+			bch2_trans_iter_exit(trans, &subvol_iter);
+
+			if (ret)
+				goto err;
+		}
+
+		ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+				      BTREE_ITER_INTENT);
+		if (ret)
+			goto err;
+
+		if (new_inode->bi_subvol != snapshot_src.subvol) {
+			/* Not a subvolume root: */
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/*
+		 * If we're not root, we have to own the subvolume being
+		 * snapshotted:
+		 */
+		if (uid && new_inode->bi_uid != uid) {
+			ret = -EPERM;
+			goto err;
+		}
+
+		flags |= BCH_CREATE_SUBVOL;
+	}
 
 	new_inum.inum	= new_inode->bi_inum;
 	dir_target	= new_inode->bi_inum;
-	dir_type	= mode_to_type(new_inode->bi_mode);
 
-	if (default_acl) {
-		ret = bch2_set_acl_trans(trans, new_inum, new_inode,
-					 default_acl, ACL_TYPE_DEFAULT);
+	if (flags & BCH_CREATE_SUBVOL) {
+		u32 new_subvol, dir_snapshot;
+
+		ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+					    snapshot_src.subvol,
+					    &new_subvol, &snapshot,
+					    (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
 		if (ret)
 			goto err;
-	}
 
-	if (acl) {
-		ret = bch2_set_acl_trans(trans, new_inum, new_inode,
-					 acl, ACL_TYPE_ACCESS);
+		new_inode->bi_parent_subvol	= dir.subvol;
+		new_inode->bi_subvol		= new_subvol;
+		new_inum.subvol			= new_subvol;
+		dir_target			= new_subvol;
+		dir_type			= DT_SUBVOL;
+
+		ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+		if (ret)
+			goto err;
+
+		bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+		ret = bch2_btree_iter_traverse(&dir_iter);
 		if (ret)
 			goto err;
 	}
 
-	if (name) {
+	if (!(flags & BCH_CREATE_SNAPSHOT)) {
+		if (default_acl) {
+			ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+						 default_acl, ACL_TYPE_DEFAULT);
+			if (ret)
+				goto err;
+		}
+
+		if (acl) {
+			ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+						 acl, ACL_TYPE_ACCESS);
+			if (ret)
+				goto err;
+		}
+	}
+
+	if (!(flags & BCH_CREATE_TMPFILE)) {
 		struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+		u64 dir_offset;
 
-		if (S_ISDIR(new_inode->bi_mode))
+		if (is_subdir_for_nlink(new_inode))
 			dir_u->bi_nlink++;
 		dir_u->bi_mtime = dir_u->bi_ctime = now;
 
@@ -87,11 +177,11 @@ int bch2_create_trans(struct btree_trans *trans,
 					 BCH_HASH_SET_MUST_CREATE);
 		if (ret)
 			goto err;
-	}
 
-	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
-		new_inode->bi_dir		= dir_u->bi_inum;
-		new_inode->bi_dir_offset	= dir_offset;
+		if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+			new_inode->bi_dir		= dir_u->bi_inum;
+			new_inode->bi_dir_offset	= dir_offset;
+		}
 	}
 
 	inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
@@ -160,7 +250,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
 		      subvol_inum dir,
 		      struct bch_inode_unpacked *dir_u,
 		      struct bch_inode_unpacked *inode_u,
-		      const struct qstr *name)
+		      const struct qstr *name,
+		      int deleting_snapshot)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter dir_iter = { NULL };
@@ -169,6 +260,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
 	struct bch_hash_info dir_hash;
 	subvol_inum inum;
 	u64 now = bch2_current_time(c);
+	struct bkey_s_c k;
 	int ret;
 
 	ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
@@ -187,29 +279,51 @@ int bch2_unlink_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (inode_u->bi_dir		== dirent_iter.pos.inode &&
-	    inode_u->bi_dir_offset	== dirent_iter.pos.offset) {
-		inode_u->bi_dir		= 0;
-		inode_u->bi_dir_offset	= 0;
+	if (deleting_snapshot == 1 && !inode_u->bi_subvol) {
+		ret = -ENOENT;
+		goto err;
 	}
 
-	if (S_ISDIR(inode_u->bi_mode)) {
+	if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) {
 		ret = bch2_empty_dir_trans(trans, inum);
 		if (ret)
 			goto err;
 	}
 
-	if (dir.subvol != inum.subvol) {
-		ret = bch2_subvolume_delete(trans, inum.subvol, false);
+	if (inode_u->bi_subvol) {
+		ret = bch2_subvolume_delete(trans, inode_u->bi_subvol,
+					    deleting_snapshot);
+		if (ret)
+			goto err;
+
+		k = bch2_btree_iter_peek_slot(&dirent_iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		/*
+		 * If we're deleting a subvolume, we need to really delete the
+		 * dirent, not just emit a whiteout in the current snapshot:
+		 */
+		bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+		ret = bch2_btree_iter_traverse(&dirent_iter);
 		if (ret)
 			goto err;
 	}
 
+	if (inode_u->bi_dir		== dirent_iter.pos.inode &&
+	    inode_u->bi_dir_offset	== dirent_iter.pos.offset) {
+		inode_u->bi_dir		= 0;
+		inode_u->bi_dir_offset	= 0;
+	}
+
 	dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
-	dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
+	dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
 	bch2_inode_nlink_dec(inode_u);
 
-	ret =   bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?:
+	ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				    &dir_hash, &dirent_iter,
+				    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
 		bch2_inode_write(trans, &dir_iter, dir_u) ?:
 		bch2_inode_write(trans, &inode_iter, inode_u);
 err:
@@ -348,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans,
 		goto err;
 	}
 
-	if (S_ISDIR(src_inode_u->bi_mode)) {
+	if (is_subdir_for_nlink(src_inode_u)) {
 		src_dir_u->bi_nlink--;
 		dst_dir_u->bi_nlink++;
 	}
 
-	if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) {
+	if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
 		dst_dir_u->bi_nlink--;
 		src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
 	}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index 1bb2ac4dc13af..9bb0a96761472 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -5,6 +5,9 @@
 struct posix_acl;
 
 #define BCH_CREATE_TMPFILE		(1U << 0)
+#define BCH_CREATE_SUBVOL		(1U << 1)
+#define BCH_CREATE_SNAPSHOT		(1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO		(1U << 3)
 
 int bch2_create_trans(struct btree_trans *, subvol_inum,
 		      struct bch_inode_unpacked *,
@@ -13,7 +16,7 @@ int bch2_create_trans(struct btree_trans *, subvol_inum,
 		      uid_t, gid_t, umode_t, dev_t,
 		      struct posix_acl *,
 		      struct posix_acl *,
-		      unsigned);
+		      subvol_inum, unsigned);
 
 int bch2_link_trans(struct btree_trans *,
 		    subvol_inum, struct bch_inode_unpacked *,
@@ -23,7 +26,7 @@ int bch2_link_trans(struct btree_trans *,
 int bch2_unlink_trans(struct btree_trans *, subvol_inum,
 		      struct bch_inode_unpacked *,
 		      struct bch_inode_unpacked *,
-		      const struct qstr *);
+		      const struct qstr *, int);
 
 int bch2_rename_trans(struct btree_trans *,
 		      subvol_inum, struct bch_inode_unpacked *,
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 91f52ab9b4e21..ae402d350d4c8 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -10,7 +10,11 @@
 #include "quota.h"
 
 #include <linux/compat.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
 
 #define FS_IOC_GOINGDOWN	     _IOR('X', 125, __u32)
 #define FSOP_GOING_FLAGS_DEFAULT	0x0	/* going down */
@@ -292,6 +296,154 @@ err:
 	return ret;
 }
 
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+				struct bch_ioctl_subvolume arg)
+{
+	struct inode *dir;
+	struct bch_inode_info *inode;
+	struct user_namespace *s_user_ns;
+	struct dentry *dst_dentry;
+	struct path src_path, dst_path;
+	int how = LOOKUP_FOLLOW;
+	int error;
+	subvol_inum snapshot_src = { 0 };
+	unsigned lookup_flags = 0;
+	unsigned create_flags = BCH_CREATE_SUBVOL;
+
+	if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+			  BCH_SUBVOL_SNAPSHOT_RO))
+		return -EINVAL;
+
+	if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+	    (arg.src_ptr ||
+	     (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+		return -EINVAL;
+
+	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+		create_flags |= BCH_CREATE_SNAPSHOT;
+
+	if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+		create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+	/* why do we need this lock? */
+	down_read(&c->vfs_sb->s_umount);
+
+	if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+		sync_inodes_sb(c->vfs_sb);
+retry:
+	if (arg.src_ptr) {
+		error = user_path_at(arg.dirfd,
+				(const char __user *)(unsigned long)arg.src_ptr,
+				how, &src_path);
+		if (error)
+			goto err1;
+
+		if (src_path.dentry->d_sb->s_fs_info != c) {
+			path_put(&src_path);
+			error = -EXDEV;
+			goto err1;
+		}
+
+		snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+	}
+
+	dst_dentry = user_path_create(arg.dirfd,
+			(const char __user *)(unsigned long)arg.dst_ptr,
+			&dst_path, lookup_flags);
+	error = PTR_ERR_OR_ZERO(dst_dentry);
+	if (error)
+		goto err2;
+
+	if (dst_dentry->d_sb->s_fs_info != c) {
+		error = -EXDEV;
+		goto err3;
+	}
+
+	if (dst_dentry->d_inode) {
+		error = -EEXIST;
+		goto err3;
+	}
+
+	dir = dst_path.dentry->d_inode;
+	if (IS_DEADDIR(dir)) {
+		error = -ENOENT;
+		goto err3;
+	}
+
+	s_user_ns = dir->i_sb->s_user_ns;
+	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+	    !kgid_has_mapping(s_user_ns, current_fsgid())) {
+		error = -EOVERFLOW;
+		goto err3;
+	}
+
+	error = inode_permission(file_mnt_idmap(filp),
+				 dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto err3;
+
+	if (!IS_POSIXACL(dir))
+		arg.mode &= ~current_umask();
+
+	error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+	if (error)
+		goto err3;
+
+	if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+	    !arg.src_ptr)
+		snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+	inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
+			      dst_dentry, arg.mode|S_IFDIR,
+			      0, snapshot_src, create_flags);
+	error = PTR_ERR_OR_ZERO(inode);
+	if (error)
+		goto err3;
+
+	d_instantiate(dst_dentry, &inode->v);
+	fsnotify_mkdir(dir, dst_dentry);
+err3:
+	done_path_create(&dst_path, dst_dentry);
+err2:
+	if (arg.src_ptr)
+		path_put(&src_path);
+
+	if (retry_estale(error, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+err1:
+	up_read(&c->vfs_sb->s_umount);
+
+	return error;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+				struct bch_ioctl_subvolume arg)
+{
+	struct path path;
+	int ret = 0;
+
+	if (arg.flags)
+		return -EINVAL;
+
+	ret = user_path_at(arg.dirfd,
+			(const char __user *)(unsigned long)arg.dst_ptr,
+			LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ret;
+
+	if (path.dentry->d_sb->s_fs_info != c) {
+		path_put(&path);
+		return -EXDEV;
+	}
+
+	ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1);
+	path_put(&path);
+
+	return ret;
+}
+
 long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	struct bch_inode_info *inode = file_bch_inode(file);
@@ -322,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case FS_IOC_GOINGDOWN:
 		return bch2_ioc_goingdown(c, (u32 __user *) arg);
 
+	case BCH_IOCTL_SUBVOLUME_CREATE: {
+		struct bch_ioctl_subvolume i;
+
+		if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+			return -EFAULT;
+		return bch2_ioctl_subvolume_create(c, file, i);
+	}
+
+	case BCH_IOCTL_SUBVOLUME_DESTROY: {
+		struct bch_ioctl_subvolume i;
+
+		if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+			return -EFAULT;
+		return bch2_ioctl_subvolume_destroy(c, file, i);
+	}
+
 	default:
 		return bch2_fs_ioctl(c, cmd, (void __user *) arg);
 	}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 0d47d9d5737b7..7475830bb33f3 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -240,12 +240,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 	struct bch_inode_info *inode;
 	int ret;
 
-	/*
-	 * debug assert, to be removed when we start creating
-	 * subvolumes/snapshots:
-	 */
-	BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL);
-
 	inode = to_bch_ei(iget5_locked(c->vfs_sb,
 				       bch2_inode_hash(inum),
 				       bch2_iget5_test,
@@ -274,7 +268,8 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 struct bch_inode_info *
 __bch2_create(struct mnt_idmap *idmap,
 	      struct bch_inode_info *dir, struct dentry *dentry,
-	      umode_t mode, dev_t rdev, unsigned flags)
+	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+	      unsigned flags)
 {
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
 	struct btree_trans trans;
@@ -319,7 +314,7 @@ retry:
 				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
 				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
 				  mode, rdev,
-				  default_acl, acl, flags) ?:
+				  default_acl, acl, snapshot_src, flags) ?:
 		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
 				KEY_TYPE_QUOTA_PREALLOC);
 	if (unlikely(ret))
@@ -426,7 +421,8 @@ static int bch2_mknod(struct mnt_idmap *idmap,
 		      umode_t mode, dev_t rdev)
 {
 	struct bch_inode_info *inode =
-		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 0);
+		__bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
+			      (subvol_inum) { 0 }, 0);
 
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -493,7 +489,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 	return 0;
 }
 
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+		  int deleting_snapshot)
 {
 	struct bch_fs *c = vdir->i_sb->s_fs_info;
 	struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -509,7 +506,8 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 			      BTREE_INSERT_NOFAIL,
 			bch2_unlink_trans(&trans,
 					  inode_inum(dir), &dir_u,
-					  &inode_u, &dentry->d_name));
+					  &inode_u, &dentry->d_name,
+					  deleting_snapshot));
 
 	if (likely(!ret)) {
 		BUG_ON(inode_u.bi_inum != inode->v.i_ino);
@@ -527,6 +525,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 	return ret;
 }
 
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+	return __bch2_unlink(vdir, dentry, -1);
+}
+
 static int bch2_symlink(struct mnt_idmap *idmap,
 			struct inode *vdir, struct dentry *dentry,
 			const char *symname)
@@ -536,7 +539,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
 	int ret;
 
 	inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
-			      BCH_CREATE_TMPFILE);
+			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 	if (unlikely(IS_ERR(inode)))
 		return PTR_ERR(inode);
 
@@ -855,7 +858,7 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
 	struct bch_inode_info *inode =
 		__bch2_create(idmap, to_bch_ei(vdir),
 			      file->f_path.dentry, mode, 0,
-			      BCH_CREATE_TMPFILE);
+			      (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index aa755987b36c9..40898c4d197b7 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -146,7 +146,7 @@ struct bch_inode_unpacked;
 
 struct bch_inode_info *
 __bch2_create(struct mnt_idmap *, struct bch_inode_info *,
-	      struct dentry *, umode_t, dev_t, unsigned);
+	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
 
 int bch2_fs_quota_transfer(struct bch_fs *,
 			   struct bch_inode_info *,
@@ -183,6 +183,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 int bch2_setattr_nonsize(struct mnt_idmap *,
 			 struct bch_inode_info *,
 			 struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, int);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index f9a6a0b3ce7a5..16a1eae9b374b 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -307,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
 
 	ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-				  &dir_hash_info, &iter);
+				  &dir_hash_info, &iter, 0);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
@@ -386,7 +386,8 @@ create_lostfound:
 				      BTREE_INSERT_LAZY_RW,
 			bch2_create_trans(trans, root_inum, &root,
 					  lostfound, &lostfound_str,
-					  0, 0, S_IFDIR|0700, 0, NULL, NULL, 0));
+					  0, 0, S_IFDIR|0700, 0, NULL, NULL,
+					  (subvol_inum) { }, 0));
 		if (ret)
 			bch_err(c, "error creating lost+found: %i", ret);
 	}
@@ -759,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans,
 {
 	int ret;
 retry:
-	ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
+	ret   = bch2_hash_delete_at(trans, desc, info, iter, 0) ?:
 		bch2_trans_commit(trans, NULL, NULL,
 				  BTREE_INSERT_NOFAIL|
 				  BTREE_INSERT_LAZY_RW);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 47c8fecc68395..64e0b542e7791 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1485,7 +1485,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 				  &root_inode, &lostfound_inode,
 				  &lostfound,
 				  0, 0, S_IFDIR|0700, 0,
-				  NULL, NULL, 0));
+				  NULL, NULL, (subvol_inum) { 0 }, 0));
 	if (ret) {
 		bch_err(c, "error creating lost+found");
 		goto err;
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 6418089531ad6..6486e709b700d 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -307,7 +307,8 @@ static __always_inline
 int bch2_hash_delete_at(struct btree_trans *trans,
 			const struct bch_hash_desc desc,
 			const struct bch_hash_info *info,
-			struct btree_iter *iter)
+			struct btree_iter *iter,
+			unsigned update_flags)
 {
 	struct bkey_i *delete;
 	int ret;
@@ -325,7 +326,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
 	delete->k.p = iter->pos;
 	delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
 
-	return bch2_trans_update(trans, iter, delete, 0);
+	return bch2_trans_update(trans, iter, delete, update_flags);
 }
 
 static __always_inline
@@ -342,7 +343,7 @@ int bch2_hash_delete(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	ret = bch2_hash_delete_at(trans, desc, info, &iter);
+	ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
-- 
2.30.2