* threads are at work on the same unwritten block, they must be synchronized
  * or one thread will zero the other's data, causing corruption.
  */
-static int
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+static bool
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
 {
        struct super_block *sb = inode->i_sb;
-       int blockmask = sb->s_blocksize - 1;
-
-       if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
-               return 0;
+       unsigned long blockmask = sb->s_blocksize - 1;
 
        if ((pos | iov_iter_alignment(from)) & blockmask)
-               return 1;
+               return true;
 
-       return 0;
+       return false;
+}
+
+static bool
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
+{
+       if (offset + len > i_size_read(inode) ||
+           offset + len > EXT4_I(inode)->i_disksize)
+               return true;
+       return false;
 }
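
For illustration, the (pos | iov_iter_alignment(from)) & blockmask test above flags an IO as unaligned when either the file position or the iovec's length/address alignment is not a multiple of the block size. Below is a minimal userspace sketch of the same idea, assuming a 4096-byte block size and using a plain length where the kernel uses iov_iter_alignment(); io_is_unaligned() and the sample values are illustrative only, not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative stand-in for ext4_unaligned_io(): an IO is "unaligned" if
 * either its file offset or its length is not a multiple of the block
 * size. In the kernel, iov_iter_alignment() additionally folds in the
 * user buffer address of every iovec segment.
 */
static bool io_is_unaligned(unsigned long long pos, unsigned long len,
                            unsigned long blocksize)
{
        unsigned long blockmask = blocksize - 1;

        return ((pos | len) & blockmask) != 0;
}

int main(void)
{
        /* 8192 + 4096 sits exactly on 4K block boundaries: aligned. */
        printf("%d\n", io_is_unaligned(8192, 4096, 4096));     /* prints 0 */
        /* 4100 + 512 straddles a block: unaligned, needs exclusive lock. */
        printf("%d\n", io_is_unaligned(4100, 512, 4096));      /* prints 1 */
        return 0;
}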
 
 /* Is IO overwriting allocated and initialized blocks? */
        return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
 }
 
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
+                                        struct iov_iter *from)
 {
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;
                iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
        }
 
+       return iov_iter_count(from);
+}
+
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+       ssize_t ret, count;
+
+       count = ext4_generic_write_checks(iocb, from);
+       if (count <= 0)
+               return count;
+
        ret = file_modified(iocb->ki_filp);
        if (ret)
                return ret;
-
-       return iov_iter_count(from);
+       return count;
 }
 
 static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
        .end_io = ext4_dio_write_end_io,
 };
 
+/*
+ * The intention here is to start with the shared lock acquired, then see if
+ * any condition requires an exclusive inode lock. If so, we restart the
+ * whole operation by releasing the shared lock and acquiring the exclusive
+ * lock.
+ *
+ * - For unaligned_io we never take the shared lock, as it may cause data
+ *   corruption when two unaligned IOs try to modify the same block, e.g.
+ *   while zeroing.
+ *
+ * - For extending writes we don't take the shared lock, since they require
+ *   updating i_disksize and/or orphan handling under the exclusive lock.
+ *
+ * - Shared locking is mostly only possible for overwrites in dioread_nolock
+ *   mode; otherwise we switch to the exclusive i_rwsem lock.
+ */
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
+                                    bool *ilock_shared, bool *extend)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       loff_t offset;
+       size_t count;
+       ssize_t ret;
+
+restart:
+       ret = ext4_generic_write_checks(iocb, from);
+       if (ret <= 0)
+               goto out;
+
+       offset = iocb->ki_pos;
+       count = ret;
+       if (ext4_extending_io(inode, offset, count))
+               *extend = true;
+       /*
+        * Determine whether the IO operation will overwrite allocated
+        * and initialized blocks. If so, check to see whether it is
+        * possible to take the dioread_nolock path.
+        *
+        * We need exclusive i_rwsem for changing security info
+        * in file_modified().
+        */
+       if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
+            !ext4_should_dioread_nolock(inode) ||
+            !ext4_overwrite_io(inode, offset, count))) {
+               inode_unlock_shared(inode);
+               *ilock_shared = false;
+               inode_lock(inode);
+               goto restart;
+       }
+
+       ret = file_modified(file);
+       if (ret < 0)
+               goto out;
+
+       return count;
+out:
+       if (*ilock_shared)
+               inode_unlock_shared(inode);
+       else
+               inode_unlock(inode);
+       return ret;
+}
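
The comment above ext4_dio_write_checks() boils down to a classic rwlock pattern: a shared lock cannot be upgraded in place, so when an exclusive lock turns out to be needed the code drops the shared lock, takes the exclusive one, and redoes all checks because the inode may have changed in between. A rough userspace analogue with POSIX rwlocks is sketched below; recheck_state(), needs_exclusive() and do_write() are placeholders, not ext4 code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

/* Trivial placeholders standing in for ext4_generic_write_checks(), the
 * exclusive-lock conditions, and the actual write. */
static void recheck_state(void) { }
static bool needs_exclusive(void) { return true; }
static void do_write(void) { puts("write done"); }

static void write_path(void)
{
        bool shared = true;

        pthread_rwlock_rdlock(&lock);
restart:
        recheck_state();

        if (shared && needs_exclusive()) {
                /*
                 * rwlocks cannot be upgraded atomically: drop the shared
                 * lock, take the exclusive one, then redo the checks, since
                 * the protected state may have changed in between.
                 */
                pthread_rwlock_unlock(&lock);
                shared = false;
                pthread_rwlock_wrlock(&lock);
                goto restart;
        }

        do_write();
        pthread_rwlock_unlock(&lock);
}

int main(void)
{
        write_path();   /* compile with -pthread */
        return 0;
}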
+
 static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        ssize_t ret;
-       size_t count;
-       loff_t offset;
        handle_t *handle;
        struct inode *inode = file_inode(iocb->ki_filp);
-       bool extend = false, overwrite = false, unaligned_aio = false;
+       loff_t offset = iocb->ki_pos;
+       size_t count = iov_iter_count(from);
+       bool extend = false, unaligned_io = false;
+       bool ilock_shared = true;
+
+       /*
+        * We initially start with the shared inode lock, unless it is
+        * unaligned IO which needs the exclusive lock anyway.
+        */
+       if (ext4_unaligned_io(inode, from, offset)) {
+               unaligned_io = true;
+               ilock_shared = false;
+       }
+       /*
+        * Quick check here without any i_rwsem lock to see if this is an
+        * extending IO. A more reliable check is done in
+        * ext4_dio_write_checks() with proper locking in place.
+        */
+       if (offset + count > i_size_read(inode))
+               ilock_shared = false;
 
        if (iocb->ki_flags & IOCB_NOWAIT) {
-               if (!inode_trylock(inode))
-                       return -EAGAIN;
+               if (ilock_shared) {
+                       if (!inode_trylock_shared(inode))
+                               return -EAGAIN;
+               } else {
+                       if (!inode_trylock(inode))
+                               return -EAGAIN;
+               }
        } else {
-               inode_lock(inode);
+               if (ilock_shared)
+                       inode_lock_shared(inode);
+               else
+                       inode_lock(inode);
        }
 
+       /* Fall back to buffered I/O if the inode does not support direct I/O. */
        if (!ext4_dio_supported(inode)) {
-               inode_unlock(inode);
-               /*
-                * Fallback to buffered I/O if the inode does not support
-                * direct I/O.
-                */
+               if (ilock_shared)
+                       inode_unlock_shared(inode);
+               else
+                       inode_unlock(inode);
                return ext4_buffered_write_iter(iocb, from);
        }
 
-       ret = ext4_write_checks(iocb, from);
-       if (ret <= 0) {
-               inode_unlock(inode);
+       ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+       if (ret <= 0)
                return ret;
-       }
 
-       /*
-        * Unaligned asynchronous direct I/O must be serialized among each
-        * other as the zeroing of partial blocks of two competing unaligned
-        * asynchronous direct I/O writes can result in data corruption.
-        */
        offset = iocb->ki_pos;
-       count = iov_iter_count(from);
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-           !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
-               unaligned_aio = true;
-               inode_dio_wait(inode);
-       }
+       count = ret;
 
        /*
-        * Determine whether the I/O will overwrite allocated and initialized
-        * blocks. If so, check to see whether it is possible to take the
-        * dioread_nolock path.
+        * Unaligned direct IOs must be serialized with respect to each other,
+        * as the zeroing of partial blocks by two competing unaligned IOs can
+        * result in data corruption.
+        *
+        * So we make sure no unaligned IO is in flight. For IOs where we need
+        * not wait (like unaligned non-AIO DIO), the inode_dio_wait() below
+        * may anyway become a no-op, since we start with the exclusive lock.
         */
-       if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
-           ext4_should_dioread_nolock(inode)) {
-               overwrite = true;
-               downgrade_write(&inode->i_rwsem);
-       }
+       if (unaligned_io)
+               inode_dio_wait(inode);
 
-       if (offset + count > EXT4_I(inode)->i_disksize) {
+       if (extend) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }
 
-               extend = true;
                ext4_journal_stop(handle);
        }
 
        ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
-                          is_sync_kiocb(iocb) || unaligned_aio || extend);
+                          is_sync_kiocb(iocb) || unaligned_io || extend);
 
        if (extend)
                ret = ext4_handle_inode_extension(inode, offset, ret, count);
 
 out:
-       if (overwrite)
+       if (ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);