#include "quota_types.h"
 #include "rebalance_types.h"
 #include "replicas_types.h"
+#include "subvolume_types.h"
 #include "super_types.h"
 
 /* Number of nodes btree coalesce will try to coalesce at once */
        struct bch_snapshot_table __rcu *snapshot_table;
        struct mutex            snapshot_table_lock;
        struct work_struct      snapshot_delete_work;
+       struct work_struct      snapshot_wait_for_pagecache_and_delete_work;
+       struct snapshot_id_list snapshots_unlinked;
+       struct mutex            snapshots_unlinked_lock;
 
        /* BTREE CACHE */
        struct bio_set          btree_bio;
 
  * can delete it (or whether it should just be rm -rf'd)
  */
 LE32_BITMASK(BCH_SUBVOLUME_SNAP,       struct bch_subvolume, flags,  1,  2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,   struct bch_subvolume, flags,  2,  3)
 
 /* Snapshots */
 
 
                      struct bch_inode_unpacked *dir_u,
                      struct bch_inode_unpacked *inode_u,
                      const struct qstr *name,
-                     int deleting_snapshot)
+                     bool deleting_snapshot)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter dir_iter = { NULL };
        if (ret)
                goto err;
 
-       if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) {
+       if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
                ret = bch2_empty_dir_trans(trans, inum);
                if (ret)
                        goto err;
        }
 
-       if (deleting_snapshot < 0 &&
-           inode_u->bi_subvol) {
-               struct bch_subvolume s;
-
-               ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true,
-                                        BTREE_ITER_CACHED|
-                                        BTREE_ITER_WITH_UPDATES,
-                                        &s);
-               if (ret)
-                       goto err;
-
-               if (BCH_SUBVOLUME_SNAP(&s))
-                       deleting_snapshot = 1;
+       if (deleting_snapshot && !inode_u->bi_subvol) {
+               ret = -ENOENT;
+               goto err;
        }
 
-       if (deleting_snapshot == 1) {
-               if (!inode_u->bi_subvol) {
-                       ret = -ENOENT;
-                       goto err;
-               }
-
-               ret = bch2_subvolume_delete(trans, inode_u->bi_subvol,
-                                           deleting_snapshot);
+       if (deleting_snapshot || inode_u->bi_subvol) {
+               ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
                if (ret)
                        goto err;
 
 
 int bch2_unlink_trans(struct btree_trans *, subvol_inum,
                      struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
-                     const struct qstr *, int);
+                     const struct qstr *, bool);
 
 int bch2_rename_trans(struct btree_trans *,
                      subvol_inum, struct bch_inode_unpacked *,
 
 
        dir = path.dentry->d_parent->d_inode;
 
-       ret = __bch2_unlink(dir, path.dentry, 1);
+       ret = __bch2_unlink(dir, path.dentry, true);
        if (!ret) {
                fsnotify_rmdir(dir, path.dentry);
                d_delete(path.dentry);
 
 }
 
 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
-                 int deleting_snapshot)
+                 bool deleting_snapshot)
 {
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
 
+/*
+ * Remove @dentry from directory @vdir by calling __bch2_unlink() with
+ * deleting_snapshot = false.
+ */
 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 {
-       return __bch2_unlink(vdir, dentry, -1);
+       return __bch2_unlink(vdir, dentry, false);
 }
 
 static int bch2_symlink(struct mnt_idmap *idmap,
        return ret;
 }
 
+/* Thin wrapper: the decision is left entirely to generic_drop_inode(). */
+static int bch2_drop_inode(struct inode *vinode)
+{
+       return generic_drop_inode(vinode);
+}
+
 static void bch2_evict_inode(struct inode *vinode)
 {
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        .alloc_inode    = bch2_alloc_inode,
        .destroy_inode  = bch2_destroy_inode,
        .write_inode    = bch2_vfs_write_inode,
+       .drop_inode     = bch2_drop_inode,
        .evict_inode    = bch2_evict_inode,
        .sync_fs        = bch2_sync_fs,
        .statfs         = bch2_statfs,
 
 int bch2_setattr_nonsize(struct mnt_idmap *,
                         struct bch_inode_info *,
                         struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, int);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
 
 
        /* Subvolume root? */
        if (inode_u.bi_subvol) {
-               ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1);
+               ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
                if (ret)
                        goto err;
        }
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bkey_s_c_subvolume subvol;
        int ret;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
                           0, k, ret) {
+               if (k.k->type != KEY_TYPE_subvolume)
+                       continue;
+
+               subvol = bkey_s_c_to_subvolume(k);
+
+               if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+                       ret = __bch2_trans_do(&trans,  NULL, NULL,
+                                             BTREE_INSERT_LAZY_RW,
+                                       bch2_subvolume_delete(&trans, iter.pos.offset));
+                       if (ret) {
+                               bch_err(c, "error deleting subvolume %llu: %i",
+                                       iter.pos.offset, ret);
+                               break;
+                       }
+               }
        }
        bch2_trans_iter_exit(&trans, &iter);
 
 
        bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
 
        /* Subvolume root? */
-       if (inode_u.bi_subvol) {
-               ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1);
-               if (ret)
-                       goto err;
-       }
+       BUG_ON(inode_u.bi_subvol);
 
        bkey_inode_generation_init(&delete.k_i);
        delete.k.p = iter.pos;
 
 #include "btree_key_cache.h"
 #include "btree_update.h"
 #include "error.h"
+#include "fs.h"
 #include "subvolume.h"
 
 /* Snapshot tree: */
        return ret;
 }
 
-/* List of snapshot IDs that are being deleted: */
-struct snapshot_id_list {
-       u32             nr;
-       u32             size;
-       u32             *d;
-};
-
 static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
 {
        unsigned i;
        return ret;
 }
 
-/* XXX: mark snapshot id for deletion, walk btree and delete: */
-int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
-                         int deleting_snapshot)
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
        subvol = bkey_s_c_to_subvolume(k);
        snapid = le32_to_cpu(subvol.v->snapshot);
 
-       if (deleting_snapshot >= 0 &&
-           deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) {
-               ret = -ENOENT;
-               goto err;
-       }
-
        delete = bch2_trans_kmalloc(trans, sizeof(*delete));
        ret = PTR_ERR_OR_ZERO(delete);
        if (ret)
        return ret;
 }
 
+/*
+ * Push the cached VFS inodes that belong to the subvolumes listed in @s out
+ * of the inode cache, and do not return until a complete pass over the
+ * superblock's inode list finds none left (inodes already flagged I_FREEING
+ * are ignored).
+ *
+ * @c: filesystem; its VFS superblock (c->vfs_sb) supplies the inode list
+ * @s: ids matched against each inode's ei_subvol - note that the
+ *     snapshot_id_list is being used to hold subvolume ids here
+ */
+static void bch2_evict_subvolume_inodes(struct bch_fs *c,
+                                struct snapshot_id_list *s)
+{
+       struct super_block *sb = c->vfs_sb;
+       struct inode *inode;
+
+       /*
+        * First pass: flag every matching inode as don't-cache and prune its
+        * dentry aliases.
+        */
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+                   (inode->i_state & I_FREEING))
+                       continue;
+
+               d_mark_dontcache(inode);
+               d_prune_aliases(inode);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+       /*
+        * Second pass, restarted from the head of the list after every wait:
+        * finishes only when a whole walk finds no matching inode.
+        */
+again:
+       cond_resched();
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+                   (inode->i_state & I_FREEING))
+                       continue;
+
+               /* Not flagged yet, e.g. it was not on the list during the first pass */
+               if (!(inode->i_state & I_DONTCACHE)) {
+                       d_mark_dontcache(inode);
+                       d_prune_aliases(inode);
+               }
+
+               /* Repeat the test now that i_lock is held */
+               spin_lock(&inode->i_lock);
+               if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
+                   !(inode->i_state & I_FREEING)) {
+                       /*
+                        * Sleep on the inode's __I_NEW bit waitqueue.  Both
+                        * locks are released before schedule(), so the list
+                        * walk cannot be resumed and is restarted instead.
+                        *
+                        * NOTE(review): this relies on __I_NEW waiters being
+                        * woken when the inode goes away - presumably by the
+                        * VFS eviction path; confirm.
+                        */
+                       wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
+                       DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+                       prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+                       spin_unlock(&inode->i_lock);
+                       spin_unlock(&sb->s_inode_list_lock);
+                       schedule();
+                       finish_wait(wq, &wait.wq_entry);
+                       goto again;
+               }
+
+               spin_unlock(&inode->i_lock);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+}
+
+/*
+ * Work item (c->snapshot_wait_for_pagecache_and_delete_work): delete the
+ * subvolumes whose ids are queued on c->snapshots_unlinked.
+ *
+ * Each iteration takes over the whole pending list under
+ * snapshots_unlinked_lock, leaving an empty list behind, evicts the VFS
+ * inodes of those subvolumes, and then deletes each subvolume via
+ * bch2_trans_do().  The loop repeats so that ids queued while a batch was
+ * being processed are picked up as well; it stops once the list is empty or
+ * a deletion fails.
+ *
+ * Drops the c->writes reference that the commit hook took when it queued
+ * this work.
+ *
+ * File-local: only referenced through INIT_WORK() in this file and not
+ * declared in subvolume.h, hence static.
+ */
+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs,
+                               snapshot_wait_for_pagecache_and_delete_work);
+       struct snapshot_id_list s;
+       u32 *id;
+       int ret = 0;
+
+       while (!ret) {
+               /* Steal the pending list; new ids accumulate on a fresh one */
+               mutex_lock(&c->snapshots_unlinked_lock);
+               s = c->snapshots_unlinked;
+               memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+               mutex_unlock(&c->snapshots_unlinked_lock);
+
+               if (!s.nr)
+                       break;
+
+               bch2_evict_subvolume_inodes(c, &s);
+
+               for (id = s.d; id < s.d + s.nr; id++) {
+                       ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+                                     bch2_subvolume_delete(&trans, *id));
+                       if (ret) {
+                               /*
+                                * The ids after this one are not attempted;
+                                * they are discarded with the array below.
+                                */
+                               bch_err(c, "error %i deleting subvolume %u", ret, *id);
+                               break;
+                       }
+               }
+
+               kfree(s.d);
+       }
+
+       percpu_ref_put(&c->writes);
+}
+
+/*
+ * A transaction commit hook bundled with the id of the subvolume being
+ * unlinked; the hook callback gets back to this wrapper from the embedded
+ * @h with container_of().
+ */
+struct subvolume_unlink_hook {
+       struct btree_trans_commit_hook  h;
+       u32                             subvol;
+};
+
+/*
+ * Transaction commit hook installed by bch2_subvolume_unlink().
+ *
+ * Records the hook's subvolume id on c->snapshots_unlinked (at most once)
+ * and queues the deletion work on system_long_wq.  A c->writes reference is
+ * taken on behalf of the work item; when queue_work() reports that the work
+ * was already queued, the reference taken here is dropped again.
+ *
+ * Returns 0 on success, the snapshot_id_add() error if the id could not be
+ * recorded, or -EROFS if no c->writes reference could be obtained.
+ *
+ * File-local: only referenced from bch2_subvolume_unlink() in this file and
+ * not declared in subvolume.h, hence static.
+ */
+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+                                                     struct btree_trans_commit_hook *_h)
+{
+       struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+       struct bch_fs *c = trans->c;
+       int ret = 0;
+
+       mutex_lock(&c->snapshots_unlinked_lock);
+       if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+               ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+       mutex_unlock(&c->snapshots_unlinked_lock);
+
+       if (ret)
+               return ret;
+
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return -EROFS;
+
+       /* Already queued: that queueing owns a reference, so give ours back */
+       if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+               percpu_ref_put(&c->writes);
+       return 0;
+}
+
+/*
+ * Flag subvolume @subvolid as unlinked instead of deleting it directly.
+ *
+ * Queues an update in @trans that sets BCH_SUBVOLUME_UNLINKED on the
+ * subvolume's key, and registers a commit hook
+ * (bch2_subvolume_wait_for_pagecache_and_delete_hook) carrying @subvolid.
+ *
+ * Returns 0 on success, -EIO if there is no subvolume key at @subvolid, or
+ * the error from the lookup, an allocation, or the update.
+ */
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+       struct subvolume_unlink_hook *hook;
+       struct bkey_i_subvolume *updated;
+       struct btree_iter subvol_iter;
+       struct bkey_s_c old;
+       int ret;
+
+       bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes,
+                            POS(0, subvolid),
+                            BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+       old = bch2_btree_iter_peek_slot(&subvol_iter);
+       ret = bkey_err(old);
+       if (ret)
+               goto out;
+
+       if (old.k->type != KEY_TYPE_subvolume) {
+               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+               ret = -EIO;
+               goto out;
+       }
+
+       /* Work on a copy of the existing key with the flag set */
+       updated = bch2_trans_kmalloc(trans, sizeof(*updated));
+       ret = PTR_ERR_OR_ZERO(updated);
+       if (ret)
+               goto out;
+
+       bkey_reassemble(&updated->k_i, old);
+       SET_BCH_SUBVOLUME_UNLINKED(&updated->v, true);
+
+       ret = bch2_trans_update(trans, &subvol_iter, &updated->k_i, 0);
+       if (ret)
+               goto out;
+
+       /* Register the commit hook last, once the update has been queued */
+       hook = bch2_trans_kmalloc(trans, sizeof(*hook));
+       ret = PTR_ERR_OR_ZERO(hook);
+       if (ret)
+               goto out;
+
+       hook->subvol    = subvolid;
+       hook->h.fn      = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+       bch2_trans_commit_hook(trans, &hook->h);
+out:
+       bch2_trans_iter_exit(trans, &subvol_iter);
+       return ret;
+}
+
 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
                          u32 src_subvolid,
                          u32 *new_subvolid,
+/*
+ * Initialise the two snapshot/subvolume deletion work items on @c and the
+ * mutex that guards c->snapshots_unlinked.  Always returns 0.
+ */
 int bch2_fs_subvolumes_init(struct bch_fs *c)
 {
         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+       INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+                 bch2_subvolume_wait_for_pagecache_and_delete);
+       mutex_init(&c->snapshots_unlinked_lock);
         return 0;
 }
 
 #ifndef _BCACHEFS_SUBVOLUME_H
 #define _BCACHEFS_SUBVOLUME_H
 
+#include "subvolume_types.h"
+
 void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
 
                       bool, int, struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
-int bch2_subvolume_delete(struct btree_trans *, u32, int);
+int bch2_subvolume_delete(struct btree_trans *, u32);
+int bch2_subvolume_unlink(struct btree_trans *, u32);
 int bch2_subvolume_create(struct btree_trans *, u64, u32,
                          u32 *, u32 *, bool);
 
 
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+/*
+ * Heap-allocated array of u32 ids: @d points at the entries and @nr is the
+ * number in use.  @size is presumably the allocated capacity of @d - confirm
+ * against snapshot_id_add().  Despite the name, struct bch_fs also uses this
+ * type for its list of unlinked subvolume ids.
+ */
+struct snapshot_id_list {
+       u32             nr;
+       u32             size;
+       u32             *d;
+};
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */