Merge tag 'linux-kselftest-kunit-5.15-rc1' of git://git.kernel.org/pub/scm/linux...

[linux-2.6-microblaze.git] / fs / xfs / xfs_inode.c
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index a835ceb..a4f6f03 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared(
  
  /*
   * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
   * various combinations of the locks to be obtained.
   *
   * The 3 locks should always be ordered so that the IO lock is obtained first,
@@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared(
   *
   * Basic locking order:
   *
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
   *
   * mmap_lock locking order:
   *
   * i_rwsem -> page lock -> mmap_lock
- * mmap_lock -> i_mmap_lock -> page_lock
+ * mmap_lock -> invalidate_lock -> page_lock
   *
   * The difference in mmap_lock locking order mean that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_lock.
+ * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
   *
   * Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
   * free manner (e.g. truncate, hole punch and other extent manipulation
   * functions).
   */
@@ -188,10 +188,13 @@ xfs_ilock(
                                  XFS_IOLOCK_DEP(lock_flags));
         }
  
-       if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
-       else if (lock_flags & XFS_MMAPLOCK_SHARED)
-               mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+       if (lock_flags & XFS_MMAPLOCK_EXCL) {
+               down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+                                 XFS_MMAPLOCK_DEP(lock_flags));
+       } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+               down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+                                XFS_MMAPLOCK_DEP(lock_flags));
+       }
  
         if (lock_flags & XFS_ILOCK_EXCL)
                 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
         }
  
         if (lock_flags & XFS_MMAPLOCK_EXCL) {
-               if (!mrtryupdate(&ip->i_mmaplock))
+               if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
                         goto out_undo_iolock;
         } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
-               if (!mrtryaccess(&ip->i_mmaplock))
+               if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
                         goto out_undo_iolock;
         }
  
@@ -258,9 +261,9 @@ xfs_ilock_nowait(
  
  out_undo_mmaplock:
         if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrunlock_excl(&ip->i_mmaplock);
+               up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
         else if (lock_flags & XFS_MMAPLOCK_SHARED)
-               mrunlock_shared(&ip->i_mmaplock);
+               up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
  out_undo_iolock:
         if (lock_flags & XFS_IOLOCK_EXCL)
                 up_write(&VFS_I(ip)->i_rwsem);
@@ -307,9 +310,9 @@ xfs_iunlock(
                 up_read(&VFS_I(ip)->i_rwsem);
  
         if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrunlock_excl(&ip->i_mmaplock);
+               up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
         else if (lock_flags & XFS_MMAPLOCK_SHARED)
-               mrunlock_shared(&ip->i_mmaplock);
+               up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
  
         if (lock_flags & XFS_ILOCK_EXCL)
                 mrunlock_excl(&ip->i_lock);
@@ -335,7 +338,7 @@ xfs_ilock_demote(
         if (lock_flags & XFS_ILOCK_EXCL)
                 mrdemote(&ip->i_lock);
         if (lock_flags & XFS_MMAPLOCK_EXCL)
-               mrdemote(&ip->i_mmaplock);
+               downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
         if (lock_flags & XFS_IOLOCK_EXCL)
                 downgrade_write(&VFS_I(ip)->i_rwsem);
  
@@ -343,9 +346,29 @@ xfs_ilock_demote(
  }
  
  #if defined(DEBUG) || defined(XFS_WARN)
-int
+static inline bool
+__xfs_rwsem_islocked(
+       struct rw_semaphore     *rwsem,
+       bool                    shared)
+{
+       if (!debug_locks)
+               return rwsem_is_locked(rwsem);
+
+       if (!shared)
+               return lockdep_is_held_type(rwsem, 0);
+
+       /*
+        * We are checking that the lock is held at least in shared
+        * mode but don't care that it might be held exclusively
+        * (i.e. shared | excl). Hence we check if the lock is held
+        * in any mode rather than an explicit shared mode.
+        */
+       return lockdep_is_held_type(rwsem, -1);
+}
+
+bool
  xfs_isilocked(
-       xfs_inode_t             *ip,
+       struct xfs_inode        *ip,
         uint                    lock_flags)
  {
         if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
@@ -355,20 +378,17 @@ xfs_isilocked(
         }
  
         if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
-               if (!(lock_flags & XFS_MMAPLOCK_SHARED))
-                       return !!ip->i_mmaplock.mr_writer;
-               return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+               return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+                               (lock_flags & XFS_IOLOCK_SHARED));
         }
  
-       if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-               if (!(lock_flags & XFS_IOLOCK_SHARED))
-                       return !debug_locks ||
-                               lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
-               return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
+       if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+               return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+                               (lock_flags & XFS_IOLOCK_SHARED));
         }
  
         ASSERT(0);
-       return 0;
+       return false;
  }
  #endif
  
@@ -532,12 +552,10 @@ again:
  }
  
  /*
- * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
- * the mmaplock or the ilock, but not more than one type at a time. If we lock
- * more than one at a time, lockdep will report false positives saying we have
- * violated locking orders.  The iolock must be double-locked separately since
- * we use i_rwsem for that.  We now support taking one lock EXCL and the other
- * SHARED.
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
   */
  void
  xfs_lock_two_inodes(
@@ -555,15 +573,8 @@ xfs_lock_two_inodes(
         ASSERT(hweight32(ip1_mode) == 1);
         ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
         ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
-       ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-       ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-       ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-       ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
-              !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-
+       ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+       ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
         ASSERT(ip0->i_ino != ip1->i_ino);
  
         if (ip0->i_ino > ip1->i_ino) {
@@ -663,7 +674,7 @@ xfs_lookup(
  
         trace_xfs_lookup(dp, name);
  
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+       if (xfs_is_shutdown(dp->i_mount))
                 return -EIO;
  
         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -705,7 +716,7 @@ xfs_inode_inherit_flags(
                         di_flags |= XFS_DIFLAG_PROJINHERIT;
         } else if (S_ISREG(mode)) {
                 if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
-                   xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
+                   xfs_has_realtime(ip->i_mount))
                         di_flags |= XFS_DIFLAG_REALTIME;
                 if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
                         di_flags |= XFS_DIFLAG_EXTSIZE;
@@ -826,8 +837,7 @@ xfs_init_new_inode(
         inode->i_rdev = rdev;
         ip->i_projid = prid;
  
-       if (dir && !(dir->i_mode & S_ISGID) &&
-           (mp->m_flags & XFS_MOUNT_GRPID)) {
+       if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
                 inode_fsuid_set(inode, mnt_userns);
                 inode->i_gid = dir->i_gid;
                 inode->i_mode = mode;
@@ -857,7 +867,7 @@ xfs_init_new_inode(
         ip->i_extsize = 0;
         ip->i_diflags = 0;
  
-       if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
+       if (xfs_has_v3inodes(mp)) {
                 inode_set_iversion(inode, 1);
                 ip->i_cowextsize = 0;
                 ip->i_crtime = tv;
@@ -897,7 +907,7 @@ xfs_init_new_inode(
          * this saves us from needing to run a separate transaction to set the
          * fork offset in the immediate future.
          */
-       if (init_xattrs && xfs_sb_version_hasattr(&mp->m_sb)) {
+       if (init_xattrs && xfs_has_attr(mp)) {
                 ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
                 ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
         }
@@ -976,7 +986,7 @@ xfs_create(
  
         trace_xfs_create(dp, name);
  
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (xfs_is_shutdown(mp))
                 return -EIO;
  
         prid = xfs_get_initial_prid(dp);
@@ -1068,7 +1078,7 @@ xfs_create(
          * create transaction goes to disk before returning to
          * the user.
          */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+       if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
                 xfs_trans_set_sync(tp);
  
         /*
@@ -1130,7 +1140,7 @@ xfs_create_tmpfile(
         uint                    resblks;
         xfs_ino_t               ino;
  
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (xfs_is_shutdown(mp))
                 return -EIO;
  
         prid = xfs_get_initial_prid(dp);
@@ -1160,7 +1170,7 @@ xfs_create_tmpfile(
         if (error)
                 goto out_trans_cancel;
  
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
+       if (xfs_has_wsync(mp))
                 xfs_trans_set_sync(tp);
  
         /*
@@ -1220,7 +1230,7 @@ xfs_link(
  
         ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
  
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (xfs_is_shutdown(mp))
                 return -EIO;
  
         error = xfs_qm_dqattach(sip);
@@ -1294,7 +1304,7 @@ xfs_link(
          * link transaction goes to disk before returning to
          * the user.
          */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+       if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
                 xfs_trans_set_sync(tp);
  
         return xfs_trans_commit(tp);
@@ -1435,10 +1445,10 @@ xfs_release(
                 return 0;
  
         /* If this is a read-only mount, don't do this (would generate I/O) */
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
+       if (xfs_is_readonly(mp))
                 return 0;
  
-       if (!XFS_FORCED_SHUTDOWN(mp)) {
+       if (!xfs_is_shutdown(mp)) {
                 int truncated;
  
                 /*
@@ -1521,7 +1531,7 @@ xfs_inactive_truncate(
  
         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
         if (error) {
-               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               ASSERT(xfs_is_shutdown(mp));
                 return error;
         }
         xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1592,7 +1602,7 @@ xfs_inactive_ifree(
                         "Failed to remove inode(s) from unlinked list. "
                         "Please free space, unmount and run xfs_repair.");
                 } else {
-                       ASSERT(XFS_FORCED_SHUTDOWN(mp));
+                       ASSERT(xfs_is_shutdown(mp));
                 }
                 return error;
         }
@@ -1628,7 +1638,7 @@ xfs_inactive_ifree(
                  * might do that, we need to make sure.  Otherwise the
                  * inode might be lost for a long time or forever.
                  */
-               if (!XFS_FORCED_SHUTDOWN(mp)) {
+               if (!xfs_is_shutdown(mp)) {
                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
                                 __func__, error);
                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
@@ -1654,6 +1664,59 @@ xfs_inactive_ifree(
         return 0;
  }
  
+/*
+ * Returns true if we need to update the on-disk metadata before we can free
+ * the memory used by this inode.  Updates include freeing post-eof
+ * preallocations; freeing COW staging extents; and marking the inode free in
+ * the inobt if it is on the unlinked list.
+ */
+bool
+xfs_inode_needs_inactive(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+
+       /*
+        * If the inode is already free, then there can be nothing
+        * to clean up here.
+        */
+       if (VFS_I(ip)->i_mode == 0)
+               return false;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (xfs_is_readonly(mp))
+               return false;
+
+       /* If the log isn't running, push inodes straight to reclaim. */
+       if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
+               return false;
+
+       /* Metadata inodes require explicit resource cleanup. */
+       if (xfs_is_metadata_inode(ip))
+               return false;
+
+       /* Want to clean out the cow blocks if there are any. */
+       if (cow_ifp && cow_ifp->if_bytes > 0)
+               return true;
+
+       /* Unlinked files must be freed. */
+       if (VFS_I(ip)->i_nlink == 0)
+               return true;
+
+       /*
+        * This file isn't being freed, so check if there are post-eof blocks
+        * to free.  @force is true because we are evicting an inode from the
+        * cache.  Post-eof blocks must be freed, lest we end up with broken
+        * free space accounting.
+        *
+        * Note: don't bother with iolock here since lockdep complains about
+        * acquiring it in reclaim context. We have the only reference to the
+        * inode at this point anyways.
+        */
+       return xfs_can_free_eofblocks(ip, true);
+}
+
  /*
   * xfs_inactive
   *
@@ -1683,7 +1746,7 @@ xfs_inactive(
         ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
  
         /* If this is a read-only mount, don't do this (would generate I/O) */
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
+       if (xfs_is_readonly(mp))
                 goto out;
  
         /* Metadata inodes require explicit resource cleanup. */
@@ -1958,7 +2021,7 @@ xfs_iunlink_destroy(
         rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
                         xfs_iunlink_free_item, &freed_anything);
  
-       ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+       ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
  }
  
  /*
@@ -2703,7 +2766,7 @@ xfs_remove(
  
         trace_xfs_remove(dp, name);
  
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (xfs_is_shutdown(mp))
                 return -EIO;
  
         error = xfs_qm_dqattach(dp);
@@ -2763,6 +2826,19 @@ xfs_remove(
                 error = xfs_droplink(tp, ip);
                 if (error)
                         goto out_trans_cancel;
+
+               /*
+                * Point the unlinked child directory's ".." entry to the root
+                * directory to eliminate back-references to inodes that may
+                * get freed before the child directory is closed.  If the fs
+                * gets shrunk, this can lead to dirent inode validation errors.
+                */
+               if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+                       error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+                                       tp->t_mountp->m_sb.sb_rootino, 0);
+                       if (error)
+                               return error;
+               }
         } else {
                 /*
                  * When removing a non-directory we need to log the parent
@@ -2789,7 +2865,7 @@ xfs_remove(
          * remove transaction goes to disk before returning to
          * the user.
          */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+       if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
                 xfs_trans_set_sync(tp);
  
         error = xfs_trans_commit(tp);
@@ -2866,7 +2942,7 @@ xfs_finish_rename(
          * If this is a synchronous mount, make sure that the rename transaction
          * goes to disk before returning to the user.
          */
-       if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+       if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
                 xfs_trans_set_sync(tp);
  
         return xfs_trans_commit(tp);
@@ -3449,7 +3525,7 @@ xfs_iflush(
          * happen but we need to still do it to ensure backwards compatibility
          * with old kernels that predate logging all inode changes.
          */
-       if (!xfs_sb_version_has_v3inode(&mp->m_sb))
+       if (!xfs_has_v3inodes(mp))
                 ip->i_flushiter++;
  
         /*
@@ -3471,7 +3547,7 @@ xfs_iflush(
         xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
  
         /* Wrap, we never let the log put out DI_MAX_FLUSH */
-       if (!xfs_sb_version_has_v3inode(&mp->m_sb)) {
+       if (!xfs_has_v3inodes(mp)) {
                 if (ip->i_flushiter == DI_MAX_FLUSH)
                         ip->i_flushiter = 0;
         }
@@ -3590,7 +3666,7 @@ xfs_iflush_cluster(
                  * AIL, leaving a dirty/unpinned inode attached to the buffer
                  * that otherwise looks like it should be flushed.
                  */
-               if (XFS_FORCED_SHUTDOWN(mp)) {
+               if (xfs_is_shutdown(mp)) {
                         xfs_iunpin_wait(ip);
                         xfs_iflush_abort(ip);
                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -3728,11 +3804,8 @@ xfs_ilock2_io_mmap(
         ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
         if (ret)
                 return ret;
-       if (ip1 == ip2)
-               xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
-       else
-               xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
-                                   ip2, XFS_MMAPLOCK_EXCL);
+       filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+                                   VFS_I(ip2)->i_mapping);
         return 0;
  }
  
@@ -3742,12 +3815,9 @@ xfs_iunlock2_io_mmap(
         struct xfs_inode        *ip1,
         struct xfs_inode        *ip2)
  {
-       bool                    same_inode = (ip1 == ip2);
-
-       xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
-       if (!same_inode)
-               xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+       filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+                                     VFS_I(ip2)->i_mapping);
         inode_unlock(VFS_I(ip2));
-       if (!same_inode)
+       if (ip1 != ip2)
                 inode_unlock(VFS_I(ip1));
  }