xfs: rework xfs_iflush_cluster() dirty inode iteration

author Dave Chinner <dchinner@redhat.com>

Mon, 29 Jun 2020 21:49:20 +0000 (14:49 -0700)

committer Darrick J. Wong <darrick.wong@oracle.com>

Tue, 7 Jul 2020 14:15:09 +0000 (07:15 -0700)
author Dave Chinner <dchinner@redhat.com>
Mon, 29 Jun 2020 21:49:20 +0000 (14:49 -0700)
committer Darrick J. Wong <darrick.wong@oracle.com>
Tue, 7 Jul 2020 14:15:09 +0000 (07:15 -0700)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 31e105f95739a376b4aceacec0102c07659537a0..ece3622f6d28f5661ffb97d52ebb816218dcea77 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3603,141 +3603,120 @@ flush_out:
   * locked. The function will walk across all the inodes on the cluster buffer it
   * can find and lock without blocking, and flush them to the cluster buffer.
   *
- * On success, the caller must write out the buffer returned in *bp and
- * release it. On failure, the filesystem will be shut down, the buffer will
- * have been unlocked and released, and EFSCORRUPTED will be returned.
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
+ * will be returned.
   */
  int
  xfs_iflush_cluster(
-       struct xfs_inode        *ip,
         struct xfs_buf          *bp)
  {
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_perag        *pag;
-       unsigned long           first_index, mask;
-       int                     cilist_size;
-       struct xfs_inode        **cilist;
-       struct xfs_inode        *cip;
-       struct xfs_ino_geometry *igeo = M_IGEO(mp);
-       int                     error = 0;
-       int                     nr_found;
+       struct xfs_mount        *mp = bp->b_mount;
+       struct xfs_log_item     *lip, *n;
+       struct xfs_inode        *ip;
+       struct xfs_inode_log_item *iip;
         int                     clcount = 0;
-       int                     i;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-       cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
-       cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
-       if (!cilist)
-               goto out_put;
-
-       mask = ~(igeo->inodes_per_cluster - 1);
-       first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-       rcu_read_lock();
-       /* really need a gang lookup range call here */
-       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
-                                       first_index, igeo->inodes_per_cluster);
-       if (nr_found == 0)
-               goto out_free;
+       int                     error = 0;
  
-       for (i = 0; i < nr_found; i++) {
-               cip = cilist[i];
+       /*
+        * We must use the safe variant here as on shutdown xfs_iflush_abort()
+        * can remove itself from the list.
+        */
+       list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+               iip = (struct xfs_inode_log_item *)lip;
+               ip = iip->ili_inode;
  
                 /*
-                * because this is an RCU protected lookup, we could find a
-                * recently freed or even reallocated inode during the lookup.
-                * We need to check under the i_flags_lock for a valid inode
-                * here. Skip it if it is not valid or the wrong inode.
+                * Quick and dirty check to avoid locks if possible.
                  */
-               spin_lock(&cip->i_flags_lock);
-               if (!cip->i_ino ||
-                   __xfs_iflags_test(cip, XFS_ISTALE)) {
-                       spin_unlock(&cip->i_flags_lock);
+               if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+                       continue;
+               if (xfs_ipincount(ip))
                         continue;
-               }
  
                 /*
-                * Once we fall off the end of the cluster, no point checking
-                * any more inodes in the list because they will also all be
-                * outside the cluster.
+                * The inode is still attached to the buffer, which means it is
+                * dirty but reclaim might try to grab it. Check carefully for
+                * that, and grab the ilock while still holding the i_flags_lock
+                * to guarantee reclaim will not be able to reclaim this inode
+                * once we drop the i_flags_lock.
                  */
-               if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
-                       spin_unlock(&cip->i_flags_lock);
-                       break;
+               spin_lock(&ip->i_flags_lock);
+               ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+               if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+                       spin_unlock(&ip->i_flags_lock);
+                       continue;
                 }
-               spin_unlock(&cip->i_flags_lock);
  
                 /*
-                * Do an un-protected check to see if the inode is dirty and
-                * is a candidate for flushing.  These checks will be repeated
-                * later after the appropriate locks are acquired.
+                * ILOCK will pin the inode against reclaim and prevent
+                * concurrent transactions modifying the inode while we are
+                * flushing the inode.
                  */
-               if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
+               if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+                       spin_unlock(&ip->i_flags_lock);
                         continue;
+               }
+               spin_unlock(&ip->i_flags_lock);
  
                 /*
-                * Try to get locks.  If any are unavailable or it is pinned,
-                * then this inode cannot be flushed and is skipped.
+                * Skip inodes that are already flush locked as they have
+                * already been written to the buffer.
                  */
-
-               if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
-                       continue;
-               if (!xfs_iflock_nowait(cip)) {
-                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
+               if (!xfs_iflock_nowait(ip)) {
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
                         continue;
                 }
-               if (xfs_ipincount(cip)) {
-                       xfs_ifunlock(cip);
-                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
-                       continue;
-               }
-
  
                 /*
-                * Check the inode number again, just to be certain we are not
-                * racing with freeing in xfs_reclaim_inode(). See the comments
-                * in that function for more information as to why the initial
-                * check is not sufficient.
+                * Abort flushing this inode if we are shut down because the
+                * inode may not currently be in the AIL. This can occur when
+                * log I/O failure unpins the inode without inserting into the
+                * AIL, leaving a dirty/unpinned inode attached to the buffer
+                * that otherwise looks like it should be flushed.
                  */
-               if (!cip->i_ino) {
-                       xfs_ifunlock(cip);
-                       xfs_iunlock(cip, XFS_ILOCK_SHARED);
+               if (XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_iunpin_wait(ip);
+                       /* xfs_iflush_abort() drops the flush lock */
+                       xfs_iflush_abort(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+                       error = -EIO;
                         continue;
                 }
  
-               /*
-                * arriving here means that this inode can be flushed.  First
-                * re-check that it's dirty before flushing.
-                */
-               if (!xfs_inode_clean(cip)) {
-                       error = xfs_iflush(cip, bp);
-                       if (error) {
-                               xfs_iunlock(cip, XFS_ILOCK_SHARED);
-                               goto out_free;
-                       }
-                       clcount++;
-               } else {
-                       xfs_ifunlock(cip);
+               /* don't block waiting on a log force to unpin dirty inodes */
+               if (xfs_ipincount(ip)) {
+                       xfs_ifunlock(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+                       continue;
                 }
-               xfs_iunlock(cip, XFS_ILOCK_SHARED);
-       }
  
-       if (clcount) {
-               XFS_STATS_INC(mp, xs_icluster_flushcnt);
-               XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+               if (!xfs_inode_clean(ip))
+                       error = xfs_iflush(ip, bp);
+               else
+                       xfs_ifunlock(ip);
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+               if (error)
+                       break;
+               clcount++;
         }
  
-out_free:
-       rcu_read_unlock();
-       kmem_free(cilist);
-out_put:
-       xfs_perag_put(pag);
         if (error) {
                 bp->b_flags |= XBF_ASYNC;
                 xfs_buf_ioend_fail(bp);
                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               return error;
         }
-       return error;
+
+       if (!clcount)
+               return -EAGAIN;
+
+       XFS_STATS_INC(mp, xs_icluster_flushcnt);
+       XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
+       return 0;
+
  }
  
  /* Release an inode. */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index c482e7306fe00cf0e139b944eb813663b25b6f13..e9a8bb184d1f94738d52d23770504931a8d1354f 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -426,7 +426,7 @@ int         xfs_log_force_inode(struct xfs_inode *ip);
  void           xfs_iunpin_wait(xfs_inode_t *);
  #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
  
-int            xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *);
+int            xfs_iflush_cluster(struct xfs_buf *);
  void           xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
                                 struct xfs_inode *ip1, uint ip1_mode);
  
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c

index e8eda2ac25fb41831e0ab82243a7b8c829fdf7e4..4e7fce8d4f7cb05b67b96f4a9e8fe58caca5988c 100644 (file)
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -507,12 +507,18 @@ xfs_inode_item_push(
          * reference for IO until we queue the buffer for delwri submission.
          */
         xfs_buf_hold(bp);
-       error = xfs_iflush_cluster(ip, bp);
+       error = xfs_iflush_cluster(bp);
         if (!error) {
                 if (!xfs_buf_delwri_queue(bp, buffer_list))
                         rval = XFS_ITEM_FLUSHING;
                 xfs_buf_relse(bp);
         } else {
+               /*
+                * Release the buffer if we were unable to flush anything. On
+                * any other error, the buffer has already been released.
+                */
+               if (error == -EAGAIN)
+                       xfs_buf_relse(bp);
                 rval = XFS_ITEM_LOCKED;
         }
author	Dave Chinner <dchinner@redhat.com>
	Mon, 29 Jun 2020 21:49:20 +0000 (14:49 -0700)
committer	Darrick J. Wong <darrick.wong@oracle.com>
	Tue, 7 Jul 2020 14:15:09 +0000 (07:15 -0700)
fs/xfs/xfs_inode.c		patch \| blob \| history
fs/xfs/xfs_inode.h		patch \| blob \| history
fs/xfs/xfs_inode_item.c		patch \| blob \| history