xfs: attach inodes to the cluster buffer when dirtied
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4c91fb2..c4586ac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1740,10 +1740,31 @@ xfs_inactive_ifree(
                return error;
        }
 
+       /*
+        * We do not hold the inode locked across the entire rolling transaction
+        * here. We only need to hold it for the first transaction that
+        * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+        * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+        * here breaks the relationship between cluster buffer invalidation and
+        * stale inode invalidation on cluster buffer item journal commit
+        * completion, and can result in leaving dirty stale inodes hanging
+        * around in memory.
+        *
+        * We have no need for serialising this inode operation against other
+        * operations - we freed the inode and hence reallocation is required
+        * and that will serialise on reallocating the space the deferops need
+        * to free. Hence we can unlock the inode on the first commit of
+        * the transaction rather than roll it right through the deferops. This
+        * avoids relogging the XFS_ISTALE inode.
+        *
+        * We check that xfs_ifree() hasn't grown an internal transaction roll
+        * by asserting that the inode is still locked when it returns.
+        */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, 0);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
        error = xfs_ifree(tp, ip);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        if (error) {
                /*
                 * If we fail to free the inode, shut down.  The cancel
@@ -1756,7 +1777,6 @@ xfs_inactive_ifree(
                        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                }
                xfs_trans_cancel(tp);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
                return error;
        }
 
@@ -1774,7 +1794,6 @@ xfs_inactive_ifree(
                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
                        __func__, error);
 
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return 0;
 }
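
Note: the hunk above relies on the transaction taking over the inode lock.
A minimal sketch of that handoff idiom, outside the patch, with
example_free_and_unlock_early() as a hypothetical caller:

	/*
	 * Because XFS_ILOCK_EXCL is passed to xfs_trans_ijoin(), the
	 * transaction owns the unlock: both xfs_trans_commit() and
	 * xfs_trans_cancel() drop the ILOCK for us, so neither path needs
	 * an explicit xfs_iunlock(). This is why the patch can delete both
	 * xfs_iunlock() calls above.
	 */
	static int
	example_free_and_unlock_early(
		struct xfs_trans	*tp,
		struct xfs_inode	*ip)
	{
		int			error;

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

		error = xfs_ifree(tp, ip);
		if (error) {
			xfs_trans_cancel(tp);		/* drops ILOCK */
			return error;
		}
		return xfs_trans_commit(tp);		/* drops ILOCK */
	}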
 
@@ -2498,17 +2517,19 @@ out:
 }
 
 /*
- * Look up the inode number specified and mark it stale if it is found. If it is
- * dirty, return the inode so it can be attached to the cluster buffer so it can
- * be processed appropriately when the cluster free transaction completes.
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
  */
-static struct xfs_inode *
-xfs_ifree_get_one_inode(
-       struct xfs_perag        *pag,
+static void
+xfs_ifree_mark_inode_stale(
+       struct xfs_buf          *bp,
        struct xfs_inode        *free_ip,
        xfs_ino_t               inum)
 {
-       struct xfs_mount        *mp = pag->pag_mount;
+       struct xfs_mount        *mp = bp->b_mount;
+       struct xfs_perag        *pag = bp->b_pag;
+       struct xfs_inode_log_item *iip;
        struct xfs_inode        *ip;
 
 retry:
@@ -2516,8 +2537,10 @@ retry:
        ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
 
        /* Inode not in memory, nothing to do */
-       if (!ip)
-               goto out_rcu_unlock;
+       if (!ip) {
+               rcu_read_unlock();
+               return;
+       }
 
        /*
         * because this is an RCU protected lookup, we could find a recently
@@ -2528,9 +2551,9 @@ retry:
        spin_lock(&ip->i_flags_lock);
        if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
                spin_unlock(&ip->i_flags_lock);
-               goto out_rcu_unlock;
+               rcu_read_unlock();
+               return;
        }
-       spin_unlock(&ip->i_flags_lock);
 
        /*
         * Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2540,43 +2563,50 @@ retry:
         */
        if (ip != free_ip) {
                if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                       spin_unlock(&ip->i_flags_lock);
                        rcu_read_unlock();
                        delay(1);
                        goto retry;
                }
-
-               /*
-                * Check the inode number again in case we're racing with
-                * freeing in xfs_reclaim_inode().  See the comments in that
-                * function for more information as to why the initial check is
-                * not sufficient.
-                */
-               if (ip->i_ino != inum) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       goto out_rcu_unlock;
-               }
        }
+       ip->i_flags |= XFS_ISTALE;
+       spin_unlock(&ip->i_flags_lock);
        rcu_read_unlock();
 
-       xfs_iflock(ip);
-       xfs_iflags_set(ip, XFS_ISTALE);
+       /*
+        * If we can't get the flush lock, the inode is already attached.  All
+        * we need to do here is mark the inode stale so buffer IO completion
+        * will remove it from the AIL.
+        */
+       iip = ip->i_itemp;
+       if (!xfs_iflock_nowait(ip)) {
+               ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+               ASSERT(iip->ili_last_fields);
+               goto out_iunlock;
+       }
 
        /*
-        * We don't need to attach clean inodes or those only with unlogged
-        * changes (which we throw away, anyway).
+        * Inodes not attached to the buffer can be released immediately.
+        * Everything else has to go through xfs_iflush_abort() on journal
+        * commit as the flock synchronises removal of the inode from the
+        * cluster buffer against inode reclaim.
         */
-       if (!ip->i_itemp || xfs_inode_clean(ip)) {
-               ASSERT(ip != free_ip);
+       if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
                xfs_ifunlock(ip);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               goto out_no_inode;
+               goto out_iunlock;
        }
-       return ip;
 
-out_rcu_unlock:
-       rcu_read_unlock();
-out_no_inode:
-       return NULL;
+       /* We have a dirty inode in memory that has not yet been flushed. */
+       spin_lock(&iip->ili_lock);
+       iip->ili_last_fields = iip->ili_fields;
+       iip->ili_fields = 0;
+       iip->ili_fsync_fields = 0;
+       spin_unlock(&iip->ili_lock);
+       ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+       if (ip != free_ip)
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
 }
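
Note: a sketch of the lookup revalidation pattern used above. An RCU radix
tree walk can return an inode that reclaim has already recycled, so the
identity check must be repeated under ip->i_flags_lock before the inode is
acted on. example_ino_is_cached() is a hypothetical helper, not part of
the patch:

	static bool
	example_ino_is_cached(
		struct xfs_perag	*pag,
		struct xfs_mount	*mp,
		xfs_ino_t		ino)
	{
		struct xfs_inode	*ip;
		bool			cached = false;

		rcu_read_lock();
		ip = radix_tree_lookup(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(mp, ino));
		if (ip) {
			spin_lock(&ip->i_flags_lock);
			/* i_ino changes if reclaim recycled the inode */
			cached = (ip->i_ino == ino);
			spin_unlock(&ip->i_flags_lock);
		}
		rcu_read_unlock();
		return cached;
	}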
 
 /*
@@ -2586,26 +2616,20 @@ out_no_inode:
  */
 STATIC int
 xfs_ifree_cluster(
-       xfs_inode_t             *free_ip,
-       xfs_trans_t             *tp,
+       struct xfs_inode        *free_ip,
+       struct xfs_trans        *tp,
        struct xfs_icluster     *xic)
 {
-       xfs_mount_t             *mp = free_ip->i_mount;
+       struct xfs_mount        *mp = free_ip->i_mount;
+       struct xfs_ino_geometry *igeo = M_IGEO(mp);
+       struct xfs_buf          *bp;
+       xfs_daddr_t             blkno;
+       xfs_ino_t               inum = xic->first_ino;
        int                     nbufs;
        int                     i, j;
        int                     ioffset;
-       xfs_daddr_t             blkno;
-       xfs_buf_t               *bp;
-       xfs_inode_t             *ip;
-       struct xfs_inode_log_item *iip;
-       struct xfs_log_item     *lip;
-       struct xfs_perag        *pag;
-       struct xfs_ino_geometry *igeo = M_IGEO(mp);
-       xfs_ino_t               inum;
        int                     error;
 
-       inum = xic->first_ino;
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
        nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
 
        for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2649,60 +2673,16 @@ xfs_ifree_cluster(
                bp->b_ops = &xfs_inode_buf_ops;
 
                /*
-                * Walk the inodes already attached to the buffer and mark them
-                * stale. These will all have the flush locks held, so an
-                * in-memory inode walk can't lock them. By marking them all
-                * stale first, we will not attempt to lock them in the loop
-                * below as the XFS_ISTALE flag will be set.
+                * Now we need to set all the cached clean inodes as XFS_ISTALE,
+                * too. This requires lookups, and will skip inodes that we've
+                * already marked XFS_ISTALE.
                 */
-               list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-                       if (lip->li_type == XFS_LI_INODE) {
-                               iip = (struct xfs_inode_log_item *)lip;
-                               ASSERT(iip->ili_logged == 1);
-                               lip->li_cb = xfs_istale_done;
-                               xfs_trans_ail_copy_lsn(mp->m_ail,
-                                                       &iip->ili_flush_lsn,
-                                                       &iip->ili_item.li_lsn);
-                               xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-                       }
-               }
-
-
-               /*
-                * For each inode in memory attempt to add it to the inode
-                * buffer and set it up for being staled on buffer IO
-                * completion.  This is safe as we've locked out tail pushing
-                * and flushing by locking the buffer.
-                *
-                * We have already marked every inode that was part of a
-                * transaction stale above, which means there is no point in
-                * even trying to lock them.
-                */
-               for (i = 0; i < igeo->inodes_per_cluster; i++) {
-                       ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
-                       if (!ip)
-                               continue;
-
-                       iip = ip->i_itemp;
-                       iip->ili_last_fields = iip->ili_fields;
-                       iip->ili_fields = 0;
-                       iip->ili_fsync_fields = 0;
-                       iip->ili_logged = 1;
-                       xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-                                               &iip->ili_item.li_lsn);
-
-                       xfs_buf_attach_iodone(bp, xfs_istale_done,
-                                                 &iip->ili_item);
-
-                       if (ip != free_ip)
-                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
+               for (i = 0; i < igeo->inodes_per_cluster; i++)
+                       xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
 
                xfs_trans_stale_inode_buf(tp, bp);
                xfs_trans_binval(tp, bp);
        }
-
-       xfs_perag_put(pag);
        return 0;
 }
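
Note: to make the loop bounds concrete, here is the cluster walk geometry
with hypothetical numbers (ialloc_blks = 8, blocks_per_cluster = 4,
inodes_per_cluster = 32): nbufs = 8 / 4 = 2, so the loop visits two
cluster buffers covering inodes [first_ino, first_ino + 31] and
[first_ino + 32, first_ino + 63]. A sketch:

	static void
	example_cluster_ranges(
		struct xfs_ino_geometry	*igeo,
		xfs_ino_t		first_ino)
	{
		int		nbufs = igeo->ialloc_blks /
					igeo->blocks_per_cluster;
		xfs_ino_t	inum = first_ino;
		int		j;

		for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
			/*
			 * One iteration per cluster buffer; it covers inodes
			 * [inum, inum + igeo->inodes_per_cluster - 1].
			 */
		}
	}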
 
@@ -2723,6 +2703,7 @@ xfs_ifree(
 {
        int                     error;
        struct xfs_icluster     xic = { 0 };
+       struct xfs_inode_log_item *iip = ip->i_itemp;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2760,7 +2741,9 @@ xfs_ifree(
        ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
 
        /* Don't attempt to replay owner changes for a deleted inode */
-       ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+       spin_lock(&iip->ili_lock);
+       iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+       spin_unlock(&iip->ili_lock);
 
        /*
         * Bump the generation count so no one will be confused
@@ -3816,39 +3799,24 @@ xfs_iflush_int(
         * know that the information those bits represent is permanently on
         * disk.  As long as the flush completes before the inode is logged
         * again, then both ili_fields and ili_last_fields will be cleared.
-        *
-        * We can play with the ili_fields bits here, because the inode lock
-        * must be held exclusively in order to set bits there and the flush
-        * lock protects the ili_last_fields bits.  Set ili_logged so the flush
-        * done routine can tell whether or not to look in the AIL.  Also, store
-        * the current LSN of the inode so that we can tell whether the item has
-        * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
-        * need the AIL lock, because it is a 64 bit value that cannot be read
-        * atomically.
         */
        error = 0;
 flush_out:
+       spin_lock(&iip->ili_lock);
        iip->ili_last_fields = iip->ili_fields;
        iip->ili_fields = 0;
        iip->ili_fsync_fields = 0;
-       iip->ili_logged = 1;
-
-       xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-                               &iip->ili_item.li_lsn);
+       spin_unlock(&iip->ili_lock);
 
        /*
-        * Attach the inode item callback to the buffer whether the flush
-        * succeeded or not. If not, the caller will shut down and fail I/O
-        * completion on the buffer to remove the inode from the AIL and release
-        * the flush lock.
+        * Store the current LSN of the inode so that we can tell whether the
+        * item has moved in the AIL from xfs_iflush_done().
         */
-       xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+       xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+                               &iip->ili_item.li_lsn);
 
        /* generate the checksum. */
        xfs_dinode_calc_crc(mp, dip);
-
-       ASSERT(!list_empty(&bp->b_li_list));
-       ASSERT(bp->b_iodone != NULL);
        return error;
 }
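
Note: the field handover that xfs_iflush_int() now performs under
ili_lock, shown in isolation (example_iflush_field_handover() is a
hypothetical helper): moving ili_fields to ili_last_fields lets the inode
be relogged, re-dirtying ili_fields, while the flush is still in flight,
without losing track of which fields this flush covers.

	static void
	example_iflush_field_handover(
		struct xfs_inode_log_item	*iip)
	{
		spin_lock(&iip->ili_lock);
		iip->ili_last_fields = iip->ili_fields;	/* what this flush writes */
		iip->ili_fields = 0;		/* relogging starts from clean */
		iip->ili_fsync_fields = 0;
		spin_unlock(&iip->ili_lock);
	}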
 
@@ -3879,3 +3847,96 @@ xfs_log_force_inode(
                return 0;
        return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
 }
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding.  The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+       struct inode            *src,
+       struct inode            *dest)
+{
+       int                     error;
+
+       if (src > dest)
+               swap(src, dest);
+
+retry:
+       /* Wait to break both inodes' layouts before we start locking. */
+       error = break_layout(src, true);
+       if (error)
+               return error;
+       if (src != dest) {
+               error = break_layout(dest, true);
+               if (error)
+                       return error;
+       }
+
+       /* Lock one inode and make sure nobody got in and leased it. */
+       inode_lock(src);
+       error = break_layout(src, false);
+       if (error) {
+               inode_unlock(src);
+               if (error == -EWOULDBLOCK)
+                       goto retry;
+               return error;
+       }
+
+       if (src == dest)
+               return 0;
+
+       /* Lock the other inode and make sure nobody got in and leased it. */
+       inode_lock_nested(dest, I_MUTEX_NONDIR2);
+       error = break_layout(dest, false);
+       if (error) {
+               inode_unlock(src);
+               inode_unlock(dest);
+               if (error == -EWOULDBLOCK)
+                       goto retry;
+               return error;
+       }
+
+       return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+       struct xfs_inode        *ip1,
+       struct xfs_inode        *ip2)
+{
+       int                     ret;
+
+       ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+       if (ret)
+               return ret;
+       if (ip1 == ip2)
+               xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+       else
+               xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
+                                   ip2, XFS_MMAPLOCK_EXCL);
+       return 0;
+}
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+       struct xfs_inode        *ip1,
+       struct xfs_inode        *ip2)
+{
+       bool                    same_inode = (ip1 == ip2);
+
+       xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+       if (!same_inode)
+               xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+       inode_unlock(VFS_I(ip2));
+       if (!same_inode)
+               inode_unlock(VFS_I(ip1));
+}
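
Note: a sketch of how a caller would pair the two helpers added above.
example_do_remap() stands in for hypothetical work (such as a reflink
remap) that must exclude both syscall IO and mmap faults on both inodes;
it is not part of the patch:

	static int
	example_remap_range(
		struct xfs_inode	*src,
		struct xfs_inode	*dest)
	{
		int			error;

		error = xfs_ilock2_io_mmap(src, dest);
		if (error)
			return error;

		error = example_do_remap(src, dest);	/* hypothetical */

		xfs_iunlock2_io_mmap(src, dest);
		return error;
	}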