fs/xfs/xfs_inode.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include <linux/iversion.h>
   7
   8 #include "xfs.h"
   9 #include "xfs_fs.h"
  10 #include "xfs_shared.h"
  11 #include "xfs_format.h"
  12 #include "xfs_log_format.h"
  13 #include "xfs_trans_resv.h"
  14 #include "xfs_mount.h"
  15 #include "xfs_defer.h"
  16 #include "xfs_inode.h"
  17 #include "xfs_dir2.h"
  18 #include "xfs_attr.h"
  19 #include "xfs_trans_space.h"
  20 #include "xfs_trans.h"
  21 #include "xfs_buf_item.h"
  22 #include "xfs_inode_item.h"
  23 #include "xfs_iunlink_item.h"
  24 #include "xfs_ialloc.h"
  25 #include "xfs_bmap.h"
  26 #include "xfs_bmap_util.h"
  27 #include "xfs_errortag.h"
  28 #include "xfs_error.h"
  29 #include "xfs_quota.h"
  30 #include "xfs_filestream.h"
  31 #include "xfs_trace.h"
  32 #include "xfs_icache.h"
  33 #include "xfs_symlink.h"
  34 #include "xfs_trans_priv.h"
  35 #include "xfs_log.h"
  36 #include "xfs_bmap_btree.h"
  37 #include "xfs_reflink.h"
  38 #include "xfs_ag.h"
  39 #include "xfs_log_priv.h"
  40 #include "xfs_health.h"
  41
  42 struct kmem_cache *xfs_inode_cache;
  43
  44 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
  45 STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
  46         struct xfs_inode *);
  47
  48 /*
  49  * helper function to extract extent size hint from inode
  50  */
  51 xfs_extlen_t
  52 xfs_get_extsz_hint(
  53         struct xfs_inode        *ip)
  54 {
  55         /*
  56          * No point in aligning allocations if we need to COW to actually
  57          * write to them.
  58          */
  59         if (xfs_is_always_cow_inode(ip))
  60                 return 0;
  61         if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
  62                 return ip->i_extsize;
  63         if (XFS_IS_REALTIME_INODE(ip))
  64                 return ip->i_mount->m_sb.sb_rextsize;
  65         return 0;
  66 }
  67
  68 /*
  69  * Helper function to extract CoW extent size hint from inode.
  70  * Between the extent size hint and the CoW extent size hint, we
  71  * return the greater of the two.  If the value is zero (automatic),
  72  * use the default size.
  73  */
  74 xfs_extlen_t
  75 xfs_get_cowextsz_hint(
  76         struct xfs_inode        *ip)
  77 {
  78         xfs_extlen_t            a, b;
  79
  80         a = 0;
  81         if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
  82                 a = ip->i_cowextsize;
  83         b = xfs_get_extsz_hint(ip);
  84
  85         a = max(a, b);
  86         if (a == 0)
  87                 return XFS_DEFAULT_COWEXTSZ_HINT;
  88         return a;
  89 }
  90
  91 /*
  92  * These two are wrapper routines around the xfs_ilock() routine used to
  93  * centralize some grungy code.  They are used in places that wish to lock the
  94  * inode solely for reading the extents.  The reason these places can't just
  95  * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
  96  * bringing in of the extents from disk for a file in b-tree format.  If the
  97  * inode is in b-tree format, then we need to lock the inode exclusively until
  98  * the extents are read in.  Locking it exclusively all the time would limit
  99  * our parallelism unnecessarily, though.  What we do instead is check to see
 100  * if the extents have been read in yet, and only lock the inode exclusively
 101  * if they have not.
 102  *
 103  * The functions return a value which should be given to the corresponding
 104  * xfs_iunlock() call.
 105  */
 106 uint
 107 xfs_ilock_data_map_shared(
 108         struct xfs_inode        *ip)
 109 {
 110         uint                    lock_mode = XFS_ILOCK_SHARED;
 111
 112         if (xfs_need_iread_extents(&ip->i_df))
 113                 lock_mode = XFS_ILOCK_EXCL;
 114         xfs_ilock(ip, lock_mode);
 115         return lock_mode;
 116 }
 117
 118 uint
 119 xfs_ilock_attr_map_shared(
 120         struct xfs_inode        *ip)
 121 {
 122         uint                    lock_mode = XFS_ILOCK_SHARED;
 123
 124         if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
 125                 lock_mode = XFS_ILOCK_EXCL;
 126         xfs_ilock(ip, lock_mode);
 127         return lock_mode;
 128 }
 129
 130 /*
 131  * Sanity-check a lock_flags value.  SHARED and EXCL may not both be set for
 132  * the same lock; only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
 133  * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values to set
 134  * (bits in XFS_LOCK_SUBCLASS_MASK are also accepted); zero is not allowed.
 135  */
 136 static inline void
 137 xfs_lock_flags_assert(
 138         uint            lock_flags)
 139 {
 140         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 141                 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 142         ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
 143                 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 144         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 145                 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 146         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 147         ASSERT(lock_flags != 0);
 148 }
 149
 150 /*
 151  * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 152  * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
 153  * various combinations of the locks to be obtained.
 154  *
 155  * The 3 locks should always be ordered so that the IO lock is obtained first,
 156  * the mmap lock second and the ilock last in order to prevent deadlock.
 157  *
 158  * Basic locking order:
 159  *
 160  * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
 161  *
 162  * mmap_lock locking order:
 163  *
 164  * i_rwsem -> page lock -> mmap_lock
 165  * mmap_lock -> invalidate_lock -> page_lock
 166  *
 167  * The difference in mmap_lock locking order means that we cannot hold the
 168  * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
 169  * can fault in pages during copy in/out (for buffered IO) or require the
 170  * mmap_lock in get_user_pages() to map the user pages into the kernel address
 171  * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
 172  * fault because page faults already hold the mmap_lock.
 173  *
 174  * Hence to serialise fully against both syscall and mmap based IO, we need to
 175  * take both the i_rwsem and the invalidate_lock. These locks should *only* be
 176  * both taken in places where we need to invalidate the page cache in a race
 177  * free manner (e.g. truncate, hole punch and other extent manipulation
 178  * functions).
 179  */
 180 void
 181 xfs_ilock(
 182         xfs_inode_t             *ip,
 183         uint                    lock_flags)
 184 {
 185         trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 186
 187         xfs_lock_flags_assert(lock_flags);
 188
 189         if (lock_flags & XFS_IOLOCK_EXCL) {
 190                 down_write_nested(&VFS_I(ip)->i_rwsem,
 191                                   XFS_IOLOCK_DEP(lock_flags));
 192         } else if (lock_flags & XFS_IOLOCK_SHARED) {
 193                 down_read_nested(&VFS_I(ip)->i_rwsem,
 194                                  XFS_IOLOCK_DEP(lock_flags));
 195         }
 196
 197         if (lock_flags & XFS_MMAPLOCK_EXCL) {
 198                 down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
 199                                   XFS_MMAPLOCK_DEP(lock_flags));
 200         } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
 201                 down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
 202                                  XFS_MMAPLOCK_DEP(lock_flags));
 203         }
 204
 205         if (lock_flags & XFS_ILOCK_EXCL)
 206                 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 207         else if (lock_flags & XFS_ILOCK_SHARED)
 208                 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 209 }
 210
 211 /*
 212  * This is just like xfs_ilock(), except that the caller
 213  * is guaranteed not to sleep.  It returns 1 if it gets
 214  * the requested locks and 0 otherwise.  If the IO lock is
 215  * obtained but the inode lock cannot be, then the IO lock
 216  * is dropped before returning.
 217  *
 218  * ip -- the inode being locked
 219  * lock_flags -- this parameter indicates the inode's locks to be
 220  *       to be locked.  See the comment for xfs_ilock() for a list
 221  *       of valid values.
 222  */
 223 int
 224 xfs_ilock_nowait(
 225         xfs_inode_t             *ip,
 226         uint                    lock_flags)
 227 {
 228         trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 229
 230         xfs_lock_flags_assert(lock_flags);
 231
 232         if (lock_flags & XFS_IOLOCK_EXCL) {
 233                 if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
 234                         goto out;
 235         } else if (lock_flags & XFS_IOLOCK_SHARED) {
 236                 if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
 237                         goto out;
 238         }
 239
 240         if (lock_flags & XFS_MMAPLOCK_EXCL) {
 241                 if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
 242                         goto out_undo_iolock;
 243         } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
 244                 if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
 245                         goto out_undo_iolock;
 246         }
 247
 248         if (lock_flags & XFS_ILOCK_EXCL) {
 249                 if (!mrtryupdate(&ip->i_lock))
 250                         goto out_undo_mmaplock;
 251         } else if (lock_flags & XFS_ILOCK_SHARED) {
 252                 if (!mrtryaccess(&ip->i_lock))
 253                         goto out_undo_mmaplock;
 254         }
 255         return 1;
 256
 257 out_undo_mmaplock:
 258         if (lock_flags & XFS_MMAPLOCK_EXCL)
 259                 up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 260         else if (lock_flags & XFS_MMAPLOCK_SHARED)
 261                 up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 262 out_undo_iolock:
 263         if (lock_flags & XFS_IOLOCK_EXCL)
 264                 up_write(&VFS_I(ip)->i_rwsem);
 265         else if (lock_flags & XFS_IOLOCK_SHARED)
 266                 up_read(&VFS_I(ip)->i_rwsem);
 267 out:
 268         return 0;
 269 }
 270
 271 /*
 272  * xfs_iunlock() is used to drop the inode locks acquired with
 273  * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 274  * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 275  * that we know which locks to drop.
 276  *
 277  * ip -- the inode being unlocked
 278  * lock_flags -- this parameter indicates the inode's locks to be
 279  *       to be unlocked.  See the comment for xfs_ilock() for a list
 280  *       of valid values for this parameter.
 281  *
 282  */
 283 void
 284 xfs_iunlock(
 285         xfs_inode_t             *ip,
 286         uint                    lock_flags)
 287 {
 288         xfs_lock_flags_assert(lock_flags);
 289
 290         if (lock_flags & XFS_IOLOCK_EXCL)
 291                 up_write(&VFS_I(ip)->i_rwsem);
 292         else if (lock_flags & XFS_IOLOCK_SHARED)
 293                 up_read(&VFS_I(ip)->i_rwsem);
 294
 295         if (lock_flags & XFS_MMAPLOCK_EXCL)
 296                 up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 297         else if (lock_flags & XFS_MMAPLOCK_SHARED)
 298                 up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 299
 300         if (lock_flags & XFS_ILOCK_EXCL)
 301                 mrunlock_excl(&ip->i_lock);
 302         else if (lock_flags & XFS_ILOCK_SHARED)
 303                 mrunlock_shared(&ip->i_lock);
 304
 305         trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 306 }
 307
 308 /*
 309  * give up write locks.  the i/o lock cannot be held nested
 310  * if it is being demoted.
 311  */
 312 void
 313 xfs_ilock_demote(
 314         xfs_inode_t             *ip,
 315         uint                    lock_flags)
 316 {
 317         ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
 318         ASSERT((lock_flags &
 319                 ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 320
 321         if (lock_flags & XFS_ILOCK_EXCL)
 322                 mrdemote(&ip->i_lock);
 323         if (lock_flags & XFS_MMAPLOCK_EXCL)
 324                 downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 325         if (lock_flags & XFS_IOLOCK_EXCL)
 326                 downgrade_write(&VFS_I(ip)->i_rwsem);
 327
 328         trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 329 }
 330
 331 void
 332 xfs_assert_ilocked(
 333         struct xfs_inode        *ip,
 334         uint                    lock_flags)
 335 {
 336         if (lock_flags & XFS_ILOCK_SHARED)
 337                 rwsem_assert_held(&ip->i_lock.mr_lock);
 338         else if (lock_flags & XFS_ILOCK_EXCL)
 339                 ASSERT(ip->i_lock.mr_writer);
 340
 341         if (lock_flags & XFS_MMAPLOCK_SHARED)
 342                 rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
 343         else if (lock_flags & XFS_MMAPLOCK_EXCL)
 344                 rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 345
 346         if (lock_flags & XFS_IOLOCK_SHARED)
 347                 rwsem_assert_held(&VFS_I(ip)->i_rwsem);
 348         else if (lock_flags & XFS_IOLOCK_EXCL)
 349                 rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
 350 }
 351
 352 /*
 353  * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 354  * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 355  * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 356  * errors and warnings.
 357  */
 358 #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
 359 static bool
 360 xfs_lockdep_subclass_ok(
 361         int subclass)
 362 {
 363         return subclass < MAX_LOCKDEP_SUBCLASSES;
 364 }
 365 #else
 366 #define xfs_lockdep_subclass_ok(subclass)       (true)
 367 #endif
 368
 369 /*
 370  * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 371  * value. This can be called for any type of inode lock combination, including
 372  * parent locking. Care must be taken to ensure we don't overrun the subclass
 373  * storage fields in the class mask we build.
 374  */
 375 static inline uint
 376 xfs_lock_inumorder(
 377         uint    lock_mode,
 378         uint    subclass)
 379 {
 380         uint    class = 0;
 381
 382         ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
 383                               XFS_ILOCK_RTSUM)));
 384         ASSERT(xfs_lockdep_subclass_ok(subclass));
 385
 386         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
 387                 ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
 388                 class += subclass << XFS_IOLOCK_SHIFT;
 389         }
 390
 391         if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
 392                 ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
 393                 class += subclass << XFS_MMAPLOCK_SHIFT;
 394         }
 395
 396         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
 397                 ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
 398                 class += subclass << XFS_ILOCK_SHIFT;
 399         }
 400
 401         return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
 402 }
 403
 404 /*
 405  * The following routine will lock n inodes in exclusive mode.  We assume the
 406  * caller calls us with the inodes in i_ino order.
 407  *
 408  * We need to detect deadlock where an inode that we lock is in the AIL and we
 409  * start waiting for another inode that is locked by a thread in a long running
 410  * transaction (such as truncate). This can result in deadlock since the long
 411  * running trans might need to wait for the inode we just locked in order to
 412  * push the tail and free space in the log.
 413  *
 414  * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 415  * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 416  * lock more than one at a time, lockdep will report false positives saying we
 417  * have violated locking orders.
 418  */
 419 static void
 420 xfs_lock_inodes(
 421         struct xfs_inode        **ips,
 422         int                     inodes,
 423         uint                    lock_mode)
 424 {
 425         int                     attempts = 0;
 426         uint                    i;
 427         int                     j;
 428         bool                    try_lock;
 429         struct xfs_log_item     *lp;
 430
 431         /*
 432          * Currently supports between 2 and 5 inodes with exclusive locking.  We
 433          * support an arbitrary depth of locking here, but absolute limits on
 434          * inodes depend on the type of locking and the limits placed by
 435          * lockdep annotations in xfs_lock_inumorder.  These are all checked by
 436          * the asserts.
 437          */
 438         ASSERT(ips && inodes >= 2 && inodes <= 5);
 439         ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
 440                             XFS_ILOCK_EXCL));
 441         ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
 442                               XFS_ILOCK_SHARED)));
 443         ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
 444                 inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
 445         ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
 446                 inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
 447
 448         if (lock_mode & XFS_IOLOCK_EXCL) {
 449                 ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
 450         } else if (lock_mode & XFS_MMAPLOCK_EXCL)
 451                 ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
 452
 453 again:
 454         try_lock = false;
 455         i = 0;
 456         for (; i < inodes; i++) {
 457                 ASSERT(ips[i]);
 458
 459                 if (i && (ips[i] == ips[i - 1]))        /* Already locked */
 460                         continue;
 461
 462                 /*
 463                  * If try_lock is not set yet, make sure all locked inodes are
 464                  * not in the AIL.  If any are, set try_lock to be used later.
 465                  */
 466                 if (!try_lock) {
 467                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
 468                                 lp = &ips[j]->i_itemp->ili_item;
 469                                 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
 470                                         try_lock = true;
 471                         }
 472                 }
 473
 474                 /*
 475                  * If any of the previous locks we have locked is in the AIL,
 476                  * we must TRY to get the second and subsequent locks. If
 477                  * we can't get any, we must release all we have
 478                  * and try again.
 479                  */
 480                 if (!try_lock) {
 481                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
 482                         continue;
 483                 }
 484
 485                 /* try_lock means we have an inode locked that is in the AIL. */
 486                 ASSERT(i != 0);
 487                 if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
 488                         continue;
 489
 490                 /*
 491                  * Unlock all previous guys and try again.  xfs_iunlock will try
 492                  * to push the tail if the inode is in the AIL.
 493                  */
 494                 attempts++;
 495                 for (j = i - 1; j >= 0; j--) {
 496                         /*
 497                          * Check to see if we've already unlocked this one.  Not
 498                          * the first one going back, and the inode ptr is the
 499                          * same.
 500                          */
 501                         if (j != (i - 1) && ips[j] == ips[j + 1])
 502                                 continue;
 503
 504                         xfs_iunlock(ips[j], lock_mode);
 505                 }
 506
 507                 if ((attempts % 5) == 0) {
 508                         delay(1); /* Don't just spin the CPU */
 509                 }
 510                 goto again;
 511         }
 512 }
 513
 514 /*
 515  * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
 516  * mmaplock must be double-locked separately since we use i_rwsem and
 517  * invalidate_lock for that. We now support taking one lock EXCL and the
 518  * other SHARED.
 519  */
 520 void
 521 xfs_lock_two_inodes(
 522         struct xfs_inode        *ip0,
 523         uint                    ip0_mode,
 524         struct xfs_inode        *ip1,
 525         uint                    ip1_mode)
 526 {
 527         int                     attempts = 0;
 528         struct xfs_log_item     *lp;
 529
 530         ASSERT(hweight32(ip0_mode) == 1);
 531         ASSERT(hweight32(ip1_mode) == 1);
 532         ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
 533         ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
 534         ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
 535         ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
 536         ASSERT(ip0->i_ino != ip1->i_ino);
 537
 538         if (ip0->i_ino > ip1->i_ino) {
 539                 swap(ip0, ip1);
 540                 swap(ip0_mode, ip1_mode);
 541         }
 542
 543  again:
 544         xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));
 545
 546         /*
 547          * If the first lock we have locked is in the AIL, we must TRY to get
 548          * the second lock. If we can't get it, we must release the first one
 549          * and try again.
 550          */
 551         lp = &ip0->i_itemp->ili_item;
 552         if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
 553                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
 554                         xfs_iunlock(ip0, ip0_mode);
 555                         if ((++attempts % 5) == 0)
 556                                 delay(1); /* Don't just spin the CPU */
 557                         goto again;
 558                 }
 559         } else {
 560                 xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
 561         }
 562 }
 563
 564 uint
 565 xfs_ip2xflags(
 566         struct xfs_inode        *ip)
 567 {
 568         uint                    flags = 0;
 569
 570         if (ip->i_diflags & XFS_DIFLAG_ANY) {
 571                 if (ip->i_diflags & XFS_DIFLAG_REALTIME)
 572                         flags |= FS_XFLAG_REALTIME;
 573                 if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
 574                         flags |= FS_XFLAG_PREALLOC;
 575                 if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
 576                         flags |= FS_XFLAG_IMMUTABLE;
 577                 if (ip->i_diflags & XFS_DIFLAG_APPEND)
 578                         flags |= FS_XFLAG_APPEND;
 579                 if (ip->i_diflags & XFS_DIFLAG_SYNC)
 580                         flags |= FS_XFLAG_SYNC;
 581                 if (ip->i_diflags & XFS_DIFLAG_NOATIME)
 582                         flags |= FS_XFLAG_NOATIME;
 583                 if (ip->i_diflags & XFS_DIFLAG_NODUMP)
 584                         flags |= FS_XFLAG_NODUMP;
 585                 if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
 586                         flags |= FS_XFLAG_RTINHERIT;
 587                 if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
 588                         flags |= FS_XFLAG_PROJINHERIT;
 589                 if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
 590                         flags |= FS_XFLAG_NOSYMLINKS;
 591                 if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
 592                         flags |= FS_XFLAG_EXTSIZE;
 593                 if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
 594                         flags |= FS_XFLAG_EXTSZINHERIT;
 595                 if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
 596                         flags |= FS_XFLAG_NODEFRAG;
 597                 if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
 598                         flags |= FS_XFLAG_FILESTREAM;
 599         }
 600
 601         if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
 602                 if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
 603                         flags |= FS_XFLAG_DAX;
 604                 if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 605                         flags |= FS_XFLAG_COWEXTSIZE;
 606         }
 607
 608         if (xfs_inode_has_attr_fork(ip))
 609                 flags |= FS_XFLAG_HASATTR;
 610         return flags;
 611 }
 612
 613 /*
 614  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
 615  * is allowed, otherwise it has to be an exact match. If a CI match is found,
 616  * ci_name->name will point to a the actual name (caller must free) or
 617  * will be set to NULL if an exact match is found.
 618  */
 619 int
 620 xfs_lookup(
 621         struct xfs_inode        *dp,
 622         const struct xfs_name   *name,
 623         struct xfs_inode        **ipp,
 624         struct xfs_name         *ci_name)
 625 {
 626         xfs_ino_t               inum;
 627         int                     error;
 628
 629         trace_xfs_lookup(dp, name);
 630
 631         if (xfs_is_shutdown(dp->i_mount))
 632                 return -EIO;
 633         if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
 634                 return -EIO;
 635
 636         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 637         if (error)
 638                 goto out_unlock;
 639
 640         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 641         if (error)
 642                 goto out_free_name;
 643
 644         return 0;
 645
 646 out_free_name:
 647         if (ci_name)
 648                 kfree(ci_name->name);
 649 out_unlock:
 650         *ipp = NULL;
 651         return error;
 652 }
 653
 654 /* Propagate di_flags from a parent inode to a child inode. */
 655 static void
 656 xfs_inode_inherit_flags(
 657         struct xfs_inode        *ip,
 658         const struct xfs_inode  *pip)
 659 {
 660         unsigned int            di_flags = 0;
 661         xfs_failaddr_t          failaddr;
 662         umode_t                 mode = VFS_I(ip)->i_mode;
 663
 664         if (S_ISDIR(mode)) {
 665                 if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
 666                         di_flags |= XFS_DIFLAG_RTINHERIT;
 667                 if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
 668                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 669                         ip->i_extsize = pip->i_extsize;
 670                 }
 671                 if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
 672                         di_flags |= XFS_DIFLAG_PROJINHERIT;
 673         } else if (S_ISREG(mode)) {
 674                 if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
 675                     xfs_has_realtime(ip->i_mount))
 676                         di_flags |= XFS_DIFLAG_REALTIME;
 677                 if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
 678                         di_flags |= XFS_DIFLAG_EXTSIZE;
 679                         ip->i_extsize = pip->i_extsize;
 680                 }
 681         }
 682         if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
 683             xfs_inherit_noatime)
 684                 di_flags |= XFS_DIFLAG_NOATIME;
 685         if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
 686             xfs_inherit_nodump)
 687                 di_flags |= XFS_DIFLAG_NODUMP;
 688         if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
 689             xfs_inherit_sync)
 690                 di_flags |= XFS_DIFLAG_SYNC;
 691         if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
 692             xfs_inherit_nosymlinks)
 693                 di_flags |= XFS_DIFLAG_NOSYMLINKS;
 694         if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
 695             xfs_inherit_nodefrag)
 696                 di_flags |= XFS_DIFLAG_NODEFRAG;
 697         if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
 698                 di_flags |= XFS_DIFLAG_FILESTREAM;
 699
 700         ip->i_diflags |= di_flags;
 701
 702         /*
 703          * Inode verifiers on older kernels only check that the extent size
 704          * hint is an integer multiple of the rt extent size on realtime files.
 705          * They did not check the hint alignment on a directory with both
 706          * rtinherit and extszinherit flags set.  If the misaligned hint is
 707          * propagated from a directory into a new realtime file, new file
 708          * allocations will fail due to math errors in the rt allocator and/or
 709          * trip the verifiers.  Validate the hint settings in the new file so
 710          * that we don't let broken hints propagate.
 711          */
 712         failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
 713                         VFS_I(ip)->i_mode, ip->i_diflags);
 714         if (failaddr) {
 715                 ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
 716                                    XFS_DIFLAG_EXTSZINHERIT);
 717                 ip->i_extsize = 0;
 718         }
 719 }
 720
 721 /* Propagate di_flags2 from a parent inode to a child inode. */
 722 static void
 723 xfs_inode_inherit_flags2(
 724         struct xfs_inode        *ip,
 725         const struct xfs_inode  *pip)
 726 {
 727         xfs_failaddr_t          failaddr;
 728
 729         if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
 730                 ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
 731                 ip->i_cowextsize = pip->i_cowextsize;
 732         }
 733         if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
 734                 ip->i_diflags2 |= XFS_DIFLAG2_DAX;
 735
 736         /* Don't let invalid cowextsize hints propagate. */
 737         failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
 738                         VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
 739         if (failaddr) {
 740                 ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
 741                 ip->i_cowextsize = 0;
 742         }
 743 }
 744
 745 /*
 746  * Initialise a newly allocated inode and return the in-core inode to the
 747  * caller locked exclusively.
 748  */
 749 int
 750 xfs_init_new_inode(
 751         struct mnt_idmap        *idmap,
 752         struct xfs_trans        *tp,
 753         struct xfs_inode        *pip,
 754         xfs_ino_t               ino,
 755         umode_t                 mode,
 756         xfs_nlink_t             nlink,
 757         dev_t                   rdev,
 758         prid_t                  prid,
 759         bool                    init_xattrs,
 760         struct xfs_inode        **ipp)
 761 {
 762         struct inode            *dir = pip ? VFS_I(pip) : NULL;
 763         struct xfs_mount        *mp = tp->t_mountp;
 764         struct xfs_inode        *ip;
 765         unsigned int            flags;
 766         int                     error;
 767         struct timespec64       tv;
 768         struct inode            *inode;
 769
 770         /*
 771          * Protect against obviously corrupt allocation btree records. Later
 772          * xfs_iget checks will catch re-allocation of other active in-memory
 773          * and on-disk inodes. If we don't catch reallocating the parent inode
 774          * here we will deadlock in xfs_iget() so we have to do these checks
 775          * first.
 776          */
 777         if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
 778                 xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
 779                 return -EFSCORRUPTED;
 780         }
 781
 782         /*
 783          * Get the in-core inode with the lock held exclusively to prevent
 784          * others from looking at until we're done.
 785          */
 786         error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
 787         if (error)
 788                 return error;
 789
 790         ASSERT(ip != NULL);
 791         inode = VFS_I(ip);
 792         set_nlink(inode, nlink);
 793         inode->i_rdev = rdev;
 794         ip->i_projid = prid;
 795
 796         if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
 797                 inode_fsuid_set(inode, idmap);
 798                 inode->i_gid = dir->i_gid;
 799                 inode->i_mode = mode;
 800         } else {
 801                 inode_init_owner(idmap, inode, dir, mode);
 802         }
 803
 804         /*
 805          * If the group ID of the new file does not match the effective group
 806          * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
 807          * (and only if the irix_sgid_inherit compatibility variable is set).
 808          */
 809         if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
 810             !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
 811                 inode->i_mode &= ~S_ISGID;
 812
 813         ip->i_disk_size = 0;
 814         ip->i_df.if_nextents = 0;
 815         ASSERT(ip->i_nblocks == 0);
 816
 817         tv = inode_set_ctime_current(inode);
 818         inode_set_mtime_to_ts(inode, tv);
 819         inode_set_atime_to_ts(inode, tv);
 820
 821         ip->i_extsize = 0;
 822         ip->i_diflags = 0;
 823
 824         if (xfs_has_v3inodes(mp)) {
 825                 inode_set_iversion(inode, 1);
 826                 ip->i_cowextsize = 0;
 827                 ip->i_crtime = tv;
 828         }
 829
 830         flags = XFS_ILOG_CORE;
 831         switch (mode & S_IFMT) {
 832         case S_IFIFO:
 833         case S_IFCHR:
 834         case S_IFBLK:
 835         case S_IFSOCK:
 836                 ip->i_df.if_format = XFS_DINODE_FMT_DEV;
 837                 flags |= XFS_ILOG_DEV;
 838                 break;
 839         case S_IFREG:
 840         case S_IFDIR:
 841                 if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
 842                         xfs_inode_inherit_flags(ip, pip);
 843                 if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
 844                         xfs_inode_inherit_flags2(ip, pip);
 845                 fallthrough;
 846         case S_IFLNK:
 847                 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
 848                 ip->i_df.if_bytes = 0;
 849                 ip->i_df.if_data = NULL;
 850                 break;
 851         default:
 852                 ASSERT(0);
 853         }
 854
 855         /*
 856          * If we need to create attributes immediately after allocating the
 857          * inode, initialise an empty attribute fork right now. We use the
 858          * default fork offset for attributes here as we don't know exactly what
 859          * size or how many attributes we might be adding. We can do this
 860          * safely here because we know the data fork is completely empty and
 861          * this saves us from needing to run a separate transaction to set the
 862          * fork offset in the immediate future.
 863          */
 864         if (init_xattrs && xfs_has_attr(mp)) {
 865                 ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
 866                 xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
 867         }
 868
 869         /*
 870          * Log the new values stuffed into the inode.
 871          */
 872         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 873         xfs_trans_log_inode(tp, ip, flags);
 874
 875         /* now that we have an i_mode we can setup the inode structure */
 876         xfs_setup_inode(ip);
 877
 878         *ipp = ip;
 879         return 0;
 880 }
 881
 882 /*
 883  * Decrement the link count on an inode & log the change.  If this causes the
 884  * link count to go to zero, move the inode to AGI unlinked list so that it can
 885  * be freed when the last active reference goes away via xfs_inactive().
 886  */
 887 static int                      /* error */
 888 xfs_droplink(
 889         xfs_trans_t *tp,
 890         xfs_inode_t *ip)
 891 {
 892         if (VFS_I(ip)->i_nlink == 0) {
 893                 xfs_alert(ip->i_mount,
 894                           "%s: Attempt to drop inode (%llu) with nlink zero.",
 895                           __func__, ip->i_ino);
 896                 return -EFSCORRUPTED;
 897         }
 898
 899         xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 900
 901         drop_nlink(VFS_I(ip));
 902         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 903
 904         if (VFS_I(ip)->i_nlink)
 905                 return 0;
 906
 907         return xfs_iunlink(tp, ip);
 908 }
 909
 910 /*
 911  * Increment the link count on an inode & log the change.
 912  */
 913 static void
 914 xfs_bumplink(
 915         xfs_trans_t *tp,
 916         xfs_inode_t *ip)
 917 {
 918         xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
 919
 920         inc_nlink(VFS_I(ip));
 921         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 922 }
 923
 924 int
 925 xfs_create(
 926         struct mnt_idmap        *idmap,
 927         xfs_inode_t             *dp,
 928         struct xfs_name         *name,
 929         umode_t                 mode,
 930         dev_t                   rdev,
 931         bool                    init_xattrs,
 932         xfs_inode_t             **ipp)
 933 {
 934         int                     is_dir = S_ISDIR(mode);
 935         struct xfs_mount        *mp = dp->i_mount;
 936         struct xfs_inode        *ip = NULL;
 937         struct xfs_trans        *tp = NULL;
 938         int                     error;
 939         bool                    unlock_dp_on_error = false;
 940         prid_t                  prid;
 941         struct xfs_dquot        *udqp = NULL;
 942         struct xfs_dquot        *gdqp = NULL;
 943         struct xfs_dquot        *pdqp = NULL;
 944         struct xfs_trans_res    *tres;
 945         uint                    resblks;
 946         xfs_ino_t               ino;
 947
 948         trace_xfs_create(dp, name);
 949
 950         if (xfs_is_shutdown(mp))
 951                 return -EIO;
 952         if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
 953                 return -EIO;
 954
 955         prid = xfs_get_initial_prid(dp);
 956
 957         /*
 958          * Make sure that we have allocated dquot(s) on disk.
 959          */
 960         error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
 961                         mapped_fsgid(idmap, &init_user_ns), prid,
 962                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 963                         &udqp, &gdqp, &pdqp);
 964         if (error)
 965                 return error;
 966
 967         if (is_dir) {
 968                 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
 969                 tres = &M_RES(mp)->tr_mkdir;
 970         } else {
 971                 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
 972                 tres = &M_RES(mp)->tr_create;
 973         }
 974
 975         /*
 976          * Initially assume that the file does not exist and
 977          * reserve the resources for that case.  If that is not
 978          * the case we'll drop the one we have and get a more
 979          * appropriate transaction later.
 980          */
 981         error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
 982                         &tp);
 983         if (error == -ENOSPC) {
 984                 /* flush outstanding delalloc blocks and retry */
 985                 xfs_flush_inodes(mp);
 986                 error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
 987                                 resblks, &tp);
 988         }
 989         if (error)
 990                 goto out_release_dquots;
 991
 992         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 993         unlock_dp_on_error = true;
 994
 995         /*
 996          * A newly created regular or special file just has one directory
 997          * entry pointing to them, but a directory also the "." entry
 998          * pointing to itself.
 999          */
1000         error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1001         if (!error)
1002                 error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1003                                 is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
1004         if (error)
1005                 goto out_trans_cancel;
1006
1007         /*
1008          * Now we join the directory inode to the transaction.  We do not do it
1009          * earlier because xfs_dialloc might commit the previous transaction
1010          * (and release all the locks).  An error from here on will result in
1011          * the transaction cancel unlocking dp so don't do it explicitly in the
1012          * error path.
1013          */
1014         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1015         unlock_dp_on_error = false;
1016
1017         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1018                                         resblks - XFS_IALLOC_SPACE_RES(mp));
1019         if (error) {
1020                 ASSERT(error != -ENOSPC);
1021                 goto out_trans_cancel;
1022         }
1023         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1024         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1025
1026         if (is_dir) {
1027                 error = xfs_dir_init(tp, ip, dp);
1028                 if (error)
1029                         goto out_trans_cancel;
1030
1031                 xfs_bumplink(tp, dp);
1032         }
1033
1034         /*
1035          * If this is a synchronous mount, make sure that the
1036          * create transaction goes to disk before returning to
1037          * the user.
1038          */
1039         if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1040                 xfs_trans_set_sync(tp);
1041
1042         /*
1043          * Attach the dquot(s) to the inodes and modify them incore.
1044          * These ids of the inode couldn't have changed since the new
1045          * inode has been locked ever since it was created.
1046          */
1047         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1048
1049         error = xfs_trans_commit(tp);
1050         if (error)
1051                 goto out_release_inode;
1052
1053         xfs_qm_dqrele(udqp);
1054         xfs_qm_dqrele(gdqp);
1055         xfs_qm_dqrele(pdqp);
1056
1057         *ipp = ip;
1058         return 0;
1059
1060  out_trans_cancel:
1061         xfs_trans_cancel(tp);
1062  out_release_inode:
1063         /*
1064          * Wait until after the current transaction is aborted to finish the
1065          * setup of the inode and release the inode.  This prevents recursive
1066          * transactions and deadlocks from xfs_inactive.
1067          */
1068         if (ip) {
1069                 xfs_finish_inode_setup(ip);
1070                 xfs_irele(ip);
1071         }
1072  out_release_dquots:
1073         xfs_qm_dqrele(udqp);
1074         xfs_qm_dqrele(gdqp);
1075         xfs_qm_dqrele(pdqp);
1076
1077         if (unlock_dp_on_error)
1078                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1079         return error;
1080 }
1081
1082 int
1083 xfs_create_tmpfile(
1084         struct mnt_idmap        *idmap,
1085         struct xfs_inode        *dp,
1086         umode_t                 mode,
1087         struct xfs_inode        **ipp)
1088 {
1089         struct xfs_mount        *mp = dp->i_mount;
1090         struct xfs_inode        *ip = NULL;
1091         struct xfs_trans        *tp = NULL;
1092         int                     error;
1093         prid_t                  prid;
1094         struct xfs_dquot        *udqp = NULL;
1095         struct xfs_dquot        *gdqp = NULL;
1096         struct xfs_dquot        *pdqp = NULL;
1097         struct xfs_trans_res    *tres;
1098         uint                    resblks;
1099         xfs_ino_t               ino;
1100
1101         if (xfs_is_shutdown(mp))
1102                 return -EIO;
1103
1104         prid = xfs_get_initial_prid(dp);
1105
1106         /*
1107          * Make sure that we have allocated dquot(s) on disk.
1108          */
1109         error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
1110                         mapped_fsgid(idmap, &init_user_ns), prid,
1111                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1112                         &udqp, &gdqp, &pdqp);
1113         if (error)
1114                 return error;
1115
1116         resblks = XFS_IALLOC_SPACE_RES(mp);
1117         tres = &M_RES(mp)->tr_create_tmpfile;
1118
1119         error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
1120                         &tp);
1121         if (error)
1122                 goto out_release_dquots;
1123
1124         error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1125         if (!error)
1126                 error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1127                                 0, 0, prid, false, &ip);
1128         if (error)
1129                 goto out_trans_cancel;
1130
1131         if (xfs_has_wsync(mp))
1132                 xfs_trans_set_sync(tp);
1133
1134         /*
1135          * Attach the dquot(s) to the inodes and modify them incore.
1136          * These ids of the inode couldn't have changed since the new
1137          * inode has been locked ever since it was created.
1138          */
1139         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1140
1141         error = xfs_iunlink(tp, ip);
1142         if (error)
1143                 goto out_trans_cancel;
1144
1145         error = xfs_trans_commit(tp);
1146         if (error)
1147                 goto out_release_inode;
1148
1149         xfs_qm_dqrele(udqp);
1150         xfs_qm_dqrele(gdqp);
1151         xfs_qm_dqrele(pdqp);
1152
1153         *ipp = ip;
1154         return 0;
1155
1156  out_trans_cancel:
1157         xfs_trans_cancel(tp);
1158  out_release_inode:
1159         /*
1160          * Wait until after the current transaction is aborted to finish the
1161          * setup of the inode and release the inode.  This prevents recursive
1162          * transactions and deadlocks from xfs_inactive.
1163          */
1164         if (ip) {
1165                 xfs_finish_inode_setup(ip);
1166                 xfs_irele(ip);
1167         }
1168  out_release_dquots:
1169         xfs_qm_dqrele(udqp);
1170         xfs_qm_dqrele(gdqp);
1171         xfs_qm_dqrele(pdqp);
1172
1173         return error;
1174 }
1175
1176 int
1177 xfs_link(
1178         xfs_inode_t             *tdp,
1179         xfs_inode_t             *sip,
1180         struct xfs_name         *target_name)
1181 {
1182         xfs_mount_t             *mp = tdp->i_mount;
1183         xfs_trans_t             *tp;
1184         int                     error, nospace_error = 0;
1185         int                     resblks;
1186
1187         trace_xfs_link(tdp, target_name);
1188
1189         ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1190
1191         if (xfs_is_shutdown(mp))
1192                 return -EIO;
1193         if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
1194                 return -EIO;
1195
1196         error = xfs_qm_dqattach(sip);
1197         if (error)
1198                 goto std_return;
1199
1200         error = xfs_qm_dqattach(tdp);
1201         if (error)
1202                 goto std_return;
1203
1204         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1205         error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
1206                         &tp, &nospace_error);
1207         if (error)
1208                 goto std_return;
1209
1210         /*
1211          * If we are using project inheritance, we only allow hard link
1212          * creation in our tree when the project IDs are the same; else
1213          * the tree quota mechanism could be circumvented.
1214          */
1215         if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
1216                      tdp->i_projid != sip->i_projid)) {
1217                 error = -EXDEV;
1218                 goto error_return;
1219         }
1220
1221         if (!resblks) {
1222                 error = xfs_dir_canenter(tp, tdp, target_name);
1223                 if (error)
1224                         goto error_return;
1225         }
1226
1227         /*
1228          * Handle initial link state of O_TMPFILE inode
1229          */
1230         if (VFS_I(sip)->i_nlink == 0) {
1231                 struct xfs_perag        *pag;
1232
1233                 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
1234                 error = xfs_iunlink_remove(tp, pag, sip);
1235                 xfs_perag_put(pag);
1236                 if (error)
1237                         goto error_return;
1238         }
1239
1240         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1241                                    resblks);
1242         if (error)
1243                 goto error_return;
1244         xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1245         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1246
1247         xfs_bumplink(tp, sip);
1248
1249         /*
1250          * If this is a synchronous mount, make sure that the
1251          * link transaction goes to disk before returning to
1252          * the user.
1253          */
1254         if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
1255                 xfs_trans_set_sync(tp);
1256
1257         return xfs_trans_commit(tp);
1258
1259  error_return:
1260         xfs_trans_cancel(tp);
1261  std_return:
1262         if (error == -ENOSPC && nospace_error)
1263                 error = nospace_error;
1264         return error;
1265 }
1266
1267 /* Clear the reflink flag and the cowblocks tag if possible. */
1268 static void
1269 xfs_itruncate_clear_reflink_flags(
1270         struct xfs_inode        *ip)
1271 {
1272         struct xfs_ifork        *dfork;
1273         struct xfs_ifork        *cfork;
1274
1275         if (!xfs_is_reflink_inode(ip))
1276                 return;
1277         dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1278         cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
1279         if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
1280                 ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1281         if (cfork->if_bytes == 0)
1282                 xfs_inode_clear_cowblocks_tag(ip);
1283 }
1284
1285 /*
1286  * Free up the underlying blocks past new_size.  The new size must be smaller
1287  * than the current size.  This routine can be used both for the attribute and
1288  * data fork, and does not modify the inode size, which is left to the caller.
1289  *
1290  * The transaction passed to this routine must have made a permanent log
1291  * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1292  * given transaction and start new ones, so make sure everything involved in
1293  * the transaction is tidy before calling here.  Some transaction will be
1294  * returned to the caller to be committed.  The incoming transaction must
1295  * already include the inode, and both inode locks must be held exclusively.
1296  * The inode must also be "held" within the transaction.  On return the inode
1297  * will be "held" within the returned transaction.  This routine does NOT
1298  * require any disk space to be reserved for it within the transaction.
1299  *
1300  * If we get an error, we must return with the inode locked and linked into the
1301  * current transaction. This keeps things simple for the higher level code,
1302  * because it always knows that the inode is locked and held in the transaction
1303  * that returns to it whether errors occur or not.  We don't mark the inode
1304  * dirty on error so that transactions can be easily aborted if possible.
1305  */
1306 int
1307 xfs_itruncate_extents_flags(
1308         struct xfs_trans        **tpp,
1309         struct xfs_inode        *ip,
1310         int                     whichfork,
1311         xfs_fsize_t             new_size,
1312         int                     flags)
1313 {
1314         struct xfs_mount        *mp = ip->i_mount;
1315         struct xfs_trans        *tp = *tpp;
1316         xfs_fileoff_t           first_unmap_block;
1317         int                     error = 0;
1318
1319         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
1320         if (atomic_read(&VFS_I(ip)->i_count))
1321                 xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
1322         ASSERT(new_size <= XFS_ISIZE(ip));
1323         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1324         ASSERT(ip->i_itemp != NULL);
1325         ASSERT(ip->i_itemp->ili_lock_flags == 0);
1326         ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1327
1328         trace_xfs_itruncate_extents_start(ip, new_size);
1329
1330         flags |= xfs_bmapi_aflag(whichfork);
1331
1332         /*
1333          * Since it is possible for space to become allocated beyond
1334          * the end of the file (in a crash where the space is allocated
1335          * but the inode size is not yet updated), simply remove any
1336          * blocks which show up between the new EOF and the maximum
1337          * possible file size.
1338          *
1339          * We have to free all the blocks to the bmbt maximum offset, even if
1340          * the page cache can't scale that far.
1341          */
1342         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1343         if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1344                 WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1345                 return 0;
1346         }
1347
1348         error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
1349                         XFS_MAX_FILEOFF);
1350         if (error)
1351                 goto out;
1352
1353         if (whichfork == XFS_DATA_FORK) {
1354                 /* Remove all pending CoW reservations. */
1355                 error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1356                                 first_unmap_block, XFS_MAX_FILEOFF, true);
1357                 if (error)
1358                         goto out;
1359
1360                 xfs_itruncate_clear_reflink_flags(ip);
1361         }
1362
1363         /*
1364          * Always re-log the inode so that our permanent transaction can keep
1365          * on rolling it forward in the log.
1366          */
1367         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1368
1369         trace_xfs_itruncate_extents_end(ip, new_size);
1370
1371 out:
1372         *tpp = tp;
1373         return error;
1374 }
1375
1376 int
1377 xfs_release(
1378         xfs_inode_t     *ip)
1379 {
1380         xfs_mount_t     *mp = ip->i_mount;
1381         int             error = 0;
1382
1383         if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
1384                 return 0;
1385
1386         /* If this is a read-only mount, don't do this (would generate I/O) */
1387         if (xfs_is_readonly(mp))
1388                 return 0;
1389
1390         if (!xfs_is_shutdown(mp)) {
1391                 int truncated;
1392
1393                 /*
1394                  * If we previously truncated this file and removed old data
1395                  * in the process, we want to initiate "early" writeout on
1396                  * the last close.  This is an attempt to combat the notorious
1397                  * NULL files problem which is particularly noticeable from a
1398                  * truncate down, buffered (re-)write (delalloc), followed by
1399                  * a crash.  What we are effectively doing here is
1400                  * significantly reducing the time window where we'd otherwise
1401                  * be exposed to that problem.
1402                  */
1403                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1404                 if (truncated) {
1405                         xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1406                         if (ip->i_delayed_blks > 0) {
1407                                 error = filemap_flush(VFS_I(ip)->i_mapping);
1408                                 if (error)
1409                                         return error;
1410                         }
1411                 }
1412         }
1413
1414         if (VFS_I(ip)->i_nlink == 0)
1415                 return 0;
1416
1417         /*
1418          * If we can't get the iolock just skip truncating the blocks past EOF
1419          * because we could deadlock with the mmap_lock otherwise. We'll get
1420          * another chance to drop them once the last reference to the inode is
1421          * dropped, so we'll never leak blocks permanently.
1422          */
1423         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1424                 return 0;
1425
1426         if (xfs_can_free_eofblocks(ip, false)) {
1427                 /*
1428                  * Check if the inode is being opened, written and closed
1429                  * frequently and we have delayed allocation blocks outstanding
1430                  * (e.g. streaming writes from the NFS server), truncating the
1431                  * blocks past EOF will cause fragmentation to occur.
1432                  *
1433                  * In this case don't do the truncation, but we have to be
1434                  * careful how we detect this case. Blocks beyond EOF show up as
1435                  * i_delayed_blks even when the inode is clean, so we need to
1436                  * truncate them away first before checking for a dirty release.
1437                  * Hence on the first dirty close we will still remove the
1438                  * speculative allocation, but after that we will leave it in
1439                  * place.
1440                  */
1441                 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1442                         goto out_unlock;
1443
1444                 error = xfs_free_eofblocks(ip);
1445                 if (error)
1446                         goto out_unlock;
1447
1448                 /* delalloc blocks after truncation means it really is dirty */
1449                 if (ip->i_delayed_blks)
1450                         xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1451         }
1452
1453 out_unlock:
1454         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1455         return error;
1456 }
1457
1458 /*
1459  * xfs_inactive_truncate
1460  *
1461  * Called to perform a truncate when an inode becomes unlinked.
1462  */
1463 STATIC int
1464 xfs_inactive_truncate(
1465         struct xfs_inode *ip)
1466 {
1467         struct xfs_mount        *mp = ip->i_mount;
1468         struct xfs_trans        *tp;
1469         int                     error;
1470
1471         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
1472         if (error) {
1473                 ASSERT(xfs_is_shutdown(mp));
1474                 return error;
1475         }
1476         xfs_ilock(ip, XFS_ILOCK_EXCL);
1477         xfs_trans_ijoin(tp, ip, 0);
1478
1479         /*
1480          * Log the inode size first to prevent stale data exposure in the event
1481          * of a system crash before the truncate completes. See the related
1482          * comment in xfs_vn_setattr_size() for details.
1483          */
1484         ip->i_disk_size = 0;
1485         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1486
1487         error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1488         if (error)
1489                 goto error_trans_cancel;
1490
1491         ASSERT(ip->i_df.if_nextents == 0);
1492
1493         error = xfs_trans_commit(tp);
1494         if (error)
1495                 goto error_unlock;
1496
1497         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1498         return 0;
1499
1500 error_trans_cancel:
1501         xfs_trans_cancel(tp);
1502 error_unlock:
1503         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1504         return error;
1505 }
1506
1507 /*
1508  * xfs_inactive_ifree()
1509  *
1510  * Perform the inode free when an inode is unlinked.
1511  */
1512 STATIC int
1513 xfs_inactive_ifree(
1514         struct xfs_inode *ip)
1515 {
1516         struct xfs_mount        *mp = ip->i_mount;
1517         struct xfs_trans        *tp;
1518         int                     error;
1519
1520         /*
1521          * We try to use a per-AG reservation for any block needed by the finobt
1522          * tree, but as the finobt feature predates the per-AG reservation
1523          * support a degraded file system might not have enough space for the
1524          * reservation at mount time.  In that case try to dip into the reserved
1525          * pool and pray.
1526          *
1527          * Send a warning if the reservation does happen to fail, as the inode
1528          * now remains allocated and sits on the unlinked list until the fs is
1529          * repaired.
1530          */
1531         if (unlikely(mp->m_finobt_nores)) {
1532                 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1533                                 XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1534                                 &tp);
1535         } else {
1536                 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1537         }
1538         if (error) {
1539                 if (error == -ENOSPC) {
1540                         xfs_warn_ratelimited(mp,
1541                         "Failed to remove inode(s) from unlinked list. "
1542                         "Please free space, unmount and run xfs_repair.");
1543                 } else {
1544                         ASSERT(xfs_is_shutdown(mp));
1545                 }
1546                 return error;
1547         }
1548
1549         /*
1550          * We do not hold the inode locked across the entire rolling transaction
1551          * here. We only need to hold it for the first transaction that
1552          * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1553          * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1554          * here breaks the relationship between cluster buffer invalidation and
1555          * stale inode invalidation on cluster buffer item journal commit
1556          * completion, and can result in leaving dirty stale inodes hanging
1557          * around in memory.
1558          *
1559          * We have no need for serialising this inode operation against other
1560          * operations - we freed the inode and hence reallocation is required
1561          * and that will serialise on reallocating the space the deferops need
1562          * to free. Hence we can unlock the inode on the first commit of
1563          * the transaction rather than roll it right through the deferops. This
1564          * avoids relogging the XFS_ISTALE inode.
1565          *
1566          * We check that xfs_ifree() hasn't grown an internal transaction roll
1567          * by asserting that the inode is still locked when it returns.
1568          */
1569         xfs_ilock(ip, XFS_ILOCK_EXCL);
1570         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1571
1572         error = xfs_ifree(tp, ip);
1573         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
1574         if (error) {
1575                 /*
1576                  * If we fail to free the inode, shut down.  The cancel
1577                  * might do that, we need to make sure.  Otherwise the
1578                  * inode might be lost for a long time or forever.
1579                  */
1580                 if (!xfs_is_shutdown(mp)) {
1581                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
1582                                 __func__, error);
1583                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1584                 }
1585                 xfs_trans_cancel(tp);
1586                 return error;
1587         }
1588
1589         /*
1590          * Credit the quota account(s). The inode is gone.
1591          */
1592         xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1593
1594         return xfs_trans_commit(tp);
1595 }
1596
1597 /*
1598  * Returns true if we need to update the on-disk metadata before we can free
1599  * the memory used by this inode.  Updates include freeing post-eof
1600  * preallocations; freeing COW staging extents; and marking the inode free in
1601  * the inobt if it is on the unlinked list.
1602  */
1603 bool
1604 xfs_inode_needs_inactive(
1605         struct xfs_inode        *ip)
1606 {
1607         struct xfs_mount        *mp = ip->i_mount;
1608         struct xfs_ifork        *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
1609
1610         /*
1611          * If the inode is already free, then there can be nothing
1612          * to clean up here.
1613          */
1614         if (VFS_I(ip)->i_mode == 0)
1615                 return false;
1616
1617         /*
1618          * If this is a read-only mount, don't do this (would generate I/O)
1619          * unless we're in log recovery and cleaning the iunlinked list.
1620          */
1621         if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
1622                 return false;
1623
1624         /* If the log isn't running, push inodes straight to reclaim. */
1625         if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
1626                 return false;
1627
1628         /* Metadata inodes require explicit resource cleanup. */
1629         if (xfs_is_metadata_inode(ip))
1630                 return false;
1631
1632         /* Want to clean out the cow blocks if there are any. */
1633         if (cow_ifp && cow_ifp->if_bytes > 0)
1634                 return true;
1635
1636         /* Unlinked files must be freed. */
1637         if (VFS_I(ip)->i_nlink == 0)
1638                 return true;
1639
1640         /*
1641          * This file isn't being freed, so check if there are post-eof blocks
1642          * to free.  @force is true because we are evicting an inode from the
1643          * cache.  Post-eof blocks must be freed, lest we end up with broken
1644          * free space accounting.
1645          *
1646          * Note: don't bother with iolock here since lockdep complains about
1647          * acquiring it in reclaim context. We have the only reference to the
1648          * inode at this point anyways.
1649          */
1650         return xfs_can_free_eofblocks(ip, true);
1651 }
1652
/*
 * xfs_inactive
 *
 * Perform the metadata updates required before an inode that is being evicted
 * from the cache can be torn down.
 *
 * An inode that still has links only has its COW fork data cancelled and its
 * post-eof blocks freed.  An unlinked inode is additionally passed to
 * xfs_inactive_symlink() or xfs_inactive_truncate() as appropriate, has its
 * attribute fork removed by xfs_attr_inactive(), and is finally freed by
 * xfs_inactive_ifree().
 *
 * Returns 0 on success, or the error from the first step that fails.  The
 * return value of xfs_reflink_cancel_cow_range() is not checked.  Dquots are
 * detached from the inode on every return path.
 */
int
xfs_inactive(
        xfs_inode_t     *ip)
{
        struct xfs_mount        *mp;
        int                     error = 0;
        int                     truncate = 0;

        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
        if (VFS_I(ip)->i_mode == 0) {
                ASSERT(ip->i_df.if_broot_bytes == 0);
                goto out;
        }

        mp = ip->i_mount;
        ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

        /*
         * If this is a read-only mount, don't do this (would generate I/O)
         * unless we're in log recovery and cleaning the iunlinked list.
         */
        if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
                goto out;

        /* Metadata inodes require explicit resource cleanup. */
        if (xfs_is_metadata_inode(ip))
                goto out;

        /*
         * Try to clean out the cow blocks if there are any.  This is best
         * effort: the return value is deliberately not examined.
         */
        if (xfs_inode_has_cow_data(ip))
                xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);

        if (VFS_I(ip)->i_nlink != 0) {
                /*
                 * force is true because we are evicting an inode from the
                 * cache. Post-eof blocks must be freed, lest we end up with
                 * broken free space accounting.
                 *
                 * Note: don't bother with iolock here since lockdep complains
                 * about acquiring it in reclaim context. We have the only
                 * reference to the inode at this point anyways.
                 */
                if (xfs_can_free_eofblocks(ip, true))
                        error = xfs_free_eofblocks(ip);

                goto out;
        }

        /*
         * From here on the inode is unlinked and will be freed.  A regular
         * file needs truncating if it has any size, extents or delalloc
         * blocks left.
         */
        if (S_ISREG(VFS_I(ip)->i_mode) &&
            (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
             ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
                truncate = 1;

        if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
                /*
                 * If this inode is being inactivated during a quotacheck and
                 * has not yet been scanned by quotacheck, we /must/ remove
                 * the dquots from the inode before inactivation changes the
                 * block and inode counts.  Most probably this is a result of
                 * reloading the incore iunlinked list to purge unrecovered
                 * unlinked inodes.
                 */
                xfs_qm_dqdetach(ip);
        } else {
                error = xfs_qm_dqattach(ip);
                if (error)
                        goto out;
        }

        /* Symlinks take their own teardown path; other types only truncate. */
        if (S_ISLNK(VFS_I(ip)->i_mode))
                error = xfs_inactive_symlink(ip);
        else if (truncate)
                error = xfs_inactive_truncate(ip);
        if (error)
                goto out;

        /*
         * If there are attributes associated with the file then blow them away
         * now.  The code calls a routine that recursively deconstructs the
         * attribute fork.  It also blows away the in-core attribute fork.
         */
        if (xfs_inode_has_attr_fork(ip)) {
                error = xfs_attr_inactive(ip);
                if (error)
                        goto out;
        }

        ASSERT(ip->i_forkoff == 0);

        /*
         * Free the inode.
         */
        error = xfs_inactive_ifree(ip);

out:
        /*
         * We're done making metadata updates for this inode, so we can release
         * the attached dquots.
         */
        xfs_qm_dqdetach(ip);
        return error;
}
1766
1767 /*
1768  * In-Core Unlinked List Lookups
1769  * =============================
1770  *
1771  * Every inode is supposed to be reachable from some other piece of metadata
1772  * with the exception of the root directory.  Inodes with a connection to a
1773  * file descriptor but not linked from anywhere in the on-disk directory tree
1774  * are collectively known as unlinked inodes, though the filesystem itself
1775  * maintains links to these inodes so that on-disk metadata are consistent.
1776  *
1777  * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
1778  * header contains a number of buckets that point to an inode, and each inode
1779  * record has a pointer to the next inode in the hash chain.  This
1780  * singly-linked list causes scaling problems in the iunlink remove function
1781  * because we must walk that list to find the inode that points to the inode
1782  * being removed from the unlinked hash bucket list.
1783  *
1784  * Hence we keep an in-memory double linked list to link each inode on an
1785  * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
1786  * based lists would require having 64 list heads in the perag, one for each
1787  * list. This is expensive in terms of memory (think millions of AGs) and cache
1788  * misses on lookups. Instead, use the fact that inodes on the unlinked list
1789  * must be referenced at the VFS level to keep them on the list and hence we
1790  * have an existence guarantee for inodes on the unlinked list.
1791  *
1792  * Given we have an existence guarantee, we can use lockless inode cache lookups
1793  * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
1794  * for the double linked unlinked list, and we don't need any extra locking to
1795  * keep the list safe as all manipulations are done under the AGI buffer lock.
1796  * Keeping the list up to date does not require memory allocation, just finding
1797  * the XFS inode and updating the next/prev unlinked list aginos.
1798  */
1799
1800 /*
1801  * Find an inode on the unlinked list. This does not take references to the
1802  * inode as we have existence guarantees by holding the AGI buffer lock and that
1803  * only unlinked, referenced inodes can be on the unlinked inode list.  If we
1804  * don't find the inode in cache, then let the caller handle the situation.
1805  */
1806 static struct xfs_inode *
1807 xfs_iunlink_lookup(
1808         struct xfs_perag        *pag,
1809         xfs_agino_t             agino)
1810 {
1811         struct xfs_inode        *ip;
1812
1813         rcu_read_lock();
1814         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1815         if (!ip) {
1816                 /* Caller can handle inode not being in memory. */
1817                 rcu_read_unlock();
1818                 return NULL;
1819         }
1820
1821         /*
1822          * Inode in RCU freeing limbo should not happen.  Warn about this and
1823          * let the caller handle the failure.
1824          */
1825         if (WARN_ON_ONCE(!ip->i_ino)) {
1826                 rcu_read_unlock();
1827                 return NULL;
1828         }
1829         ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
1830         rcu_read_unlock();
1831         return ip;
1832 }
1833
1834 /*
1835  * Update the prev pointer of the next agino.  Returns -ENOLINK if the inode
1836  * is not in cache.
1837  */
1838 static int
1839 xfs_iunlink_update_backref(
1840         struct xfs_perag        *pag,
1841         xfs_agino_t             prev_agino,
1842         xfs_agino_t             next_agino)
1843 {
1844         struct xfs_inode        *ip;
1845
1846         /* No update necessary if we are at the end of the list. */
1847         if (next_agino == NULLAGINO)
1848                 return 0;
1849
1850         ip = xfs_iunlink_lookup(pag, next_agino);
1851         if (!ip)
1852                 return -ENOLINK;
1853
1854         ip->i_prev_unlinked = prev_agino;
1855         return 0;
1856 }
1857
1858 /*
1859  * Point the AGI unlinked bucket at an inode and log the results.  The caller
1860  * is responsible for validating the old value.
1861  */
1862 STATIC int
1863 xfs_iunlink_update_bucket(
1864         struct xfs_trans        *tp,
1865         struct xfs_perag        *pag,
1866         struct xfs_buf          *agibp,
1867         unsigned int            bucket_index,
1868         xfs_agino_t             new_agino)
1869 {
1870         struct xfs_agi          *agi = agibp->b_addr;
1871         xfs_agino_t             old_value;
1872         int                     offset;
1873
1874         ASSERT(xfs_verify_agino_or_null(pag, new_agino));
1875
1876         old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1877         trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
1878                         old_value, new_agino);
1879
1880         /*
1881          * We should never find the head of the list already set to the value
1882          * passed in because either we're adding or removing ourselves from the
1883          * head of the list.
1884          */
1885         if (old_value == new_agino) {
1886                 xfs_buf_mark_corrupt(agibp);
1887                 return -EFSCORRUPTED;
1888         }
1889
1890         agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
1891         offset = offsetof(struct xfs_agi, agi_unlinked) +
1892                         (sizeof(xfs_agino_t) * bucket_index);
1893         xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
1894         return 0;
1895 }
1896
1897 /*
1898  * Load the inode @next_agino into the cache and set its prev_unlinked pointer
1899  * to @prev_agino.  Caller must hold the AGI to synchronize with other changes
1900  * to the unlinked list.
1901  */
1902 STATIC int
1903 xfs_iunlink_reload_next(
1904         struct xfs_trans        *tp,
1905         struct xfs_buf          *agibp,
1906         xfs_agino_t             prev_agino,
1907         xfs_agino_t             next_agino)
1908 {
1909         struct xfs_perag        *pag = agibp->b_pag;
1910         struct xfs_mount        *mp = pag->pag_mount;
1911         struct xfs_inode        *next_ip = NULL;
1912         xfs_ino_t               ino;
1913         int                     error;
1914
1915         ASSERT(next_agino != NULLAGINO);
1916
1917 #ifdef DEBUG
1918         rcu_read_lock();
1919         next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
1920         ASSERT(next_ip == NULL);
1921         rcu_read_unlock();
1922 #endif
1923
1924         xfs_info_ratelimited(mp,
1925  "Found unrecovered unlinked inode 0x%x in AG 0x%x.  Initiating recovery.",
1926                         next_agino, pag->pag_agno);
1927
1928         /*
1929          * Use an untrusted lookup just to be cautious in case the AGI has been
1930          * corrupted and now points at a free inode.  That shouldn't happen,
1931          * but we'd rather shut down now since we're already running in a weird
1932          * situation.
1933          */
1934         ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
1935         error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
1936         if (error)
1937                 return error;
1938
1939         /* If this is not an unlinked inode, something is very wrong. */
1940         if (VFS_I(next_ip)->i_nlink != 0) {
1941                 error = -EFSCORRUPTED;
1942                 goto rele;
1943         }
1944
1945         next_ip->i_prev_unlinked = prev_agino;
1946         trace_xfs_iunlink_reload_next(next_ip);
1947 rele:
1948         ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
1949         if (xfs_is_quotacheck_running(mp) && next_ip)
1950                 xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
1951         xfs_irele(next_ip);
1952         return error;
1953 }
1954
1955 static int
1956 xfs_iunlink_insert_inode(
1957         struct xfs_trans        *tp,
1958         struct xfs_perag        *pag,
1959         struct xfs_buf          *agibp,
1960         struct xfs_inode        *ip)
1961 {
1962         struct xfs_mount        *mp = tp->t_mountp;
1963         struct xfs_agi          *agi = agibp->b_addr;
1964         xfs_agino_t             next_agino;
1965         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1966         short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1967         int                     error;
1968
1969         /*
1970          * Get the index into the agi hash table for the list this inode will
1971          * go on.  Make sure the pointer isn't garbage and that this inode
1972          * isn't already on the list.
1973          */
1974         next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1975         if (next_agino == agino ||
1976             !xfs_verify_agino_or_null(pag, next_agino)) {
1977                 xfs_buf_mark_corrupt(agibp);
1978                 return -EFSCORRUPTED;
1979         }
1980
1981         /*
1982          * Update the prev pointer in the next inode to point back to this
1983          * inode.
1984          */
1985         error = xfs_iunlink_update_backref(pag, agino, next_agino);
1986         if (error == -ENOLINK)
1987                 error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
1988         if (error)
1989                 return error;
1990
1991         if (next_agino != NULLAGINO) {
1992                 /*
1993                  * There is already another inode in the bucket, so point this
1994                  * inode to the current head of the list.
1995                  */
1996                 error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
1997                 if (error)
1998                         return error;
1999                 ip->i_next_unlinked = next_agino;
2000         }
2001
2002         /* Point the head of the list to point to this inode. */
2003         ip->i_prev_unlinked = NULLAGINO;
2004         return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
2005 }
2006
2007 /*
2008  * This is called when the inode's link count has gone to 0 or we are creating
2009  * a tmpfile via O_TMPFILE.  The inode @ip must have nlink == 0.
2010  *
2011  * We place the on-disk inode on a list in the AGI.  It will be pulled from this
2012  * list when the inode is freed.
2013  */
2014 STATIC int
2015 xfs_iunlink(
2016         struct xfs_trans        *tp,
2017         struct xfs_inode        *ip)
2018 {
2019         struct xfs_mount        *mp = tp->t_mountp;
2020         struct xfs_perag        *pag;
2021         struct xfs_buf          *agibp;
2022         int                     error;
2023
2024         ASSERT(VFS_I(ip)->i_nlink == 0);
2025         ASSERT(VFS_I(ip)->i_mode != 0);
2026         trace_xfs_iunlink(ip);
2027
2028         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2029
2030         /* Get the agi buffer first.  It ensures lock ordering on the list. */
2031         error = xfs_read_agi(pag, tp, &agibp);
2032         if (error)
2033                 goto out;
2034
2035         error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
2036 out:
2037         xfs_perag_put(pag);
2038         return error;
2039 }
2040
/*
 * Remove @ip from the AGI unlinked bucket list it is on, using the AGI buffer
 * @agibp supplied by the caller.
 *
 * Returns 0 on success, -EFSCORRUPTED if the bucket head is not a valid agino
 * or the previous inode in the list is not in the cache, or the error from
 * any of the helpers called.
 */
static int
xfs_iunlink_remove_inode(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        struct xfs_buf          *agibp,
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_agi          *agi = agibp->b_addr;
        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        xfs_agino_t             head_agino;
        short                   bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
        int                     error;

        trace_xfs_iunlink_remove(ip);

        /*
         * Get the index into the agi hash table for the list this inode is
         * on.  Make sure the head pointer isn't garbage.  A null head is
         * rejected too, because the list must contain at least this inode.
         */
        head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
        if (!xfs_verify_agino(pag, head_agino)) {
                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
                                agi, sizeof(*agi));
                return -EFSCORRUPTED;
        }

        /*
         * Log our inode's next_unlinked pointer as NULL.  The incore
         * i_next_unlinked value is left alone for now because it is still
         * needed below to splice the neighbours together.
         */
        error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
        if (error)
                return error;

        /*
         * Update the prev pointer in the next inode to point back to previous
         * inode in the chain.  If the next inode is not in the cache, load it
         * and set the pointer that way.
         */
        error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
                        ip->i_next_unlinked);
        if (error == -ENOLINK)
                error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
                                ip->i_next_unlinked);
        if (error)
                return error;

        if (head_agino != agino) {
                struct xfs_inode        *prev_ip;

                /* Not the list head: the previous inode must skip over us. */
                prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
                if (!prev_ip)
                        return -EFSCORRUPTED;

                /*
                 * NOTE(review): the incore next pointer of prev_ip is updated
                 * even if logging it failed; the error is still returned
                 * below.
                 */
                error = xfs_iunlink_log_inode(tp, prev_ip, pag,
                                ip->i_next_unlinked);
                prev_ip->i_next_unlinked = ip->i_next_unlinked;
        } else {
                /* Point the head of the list to the next unlinked inode. */
                error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
                                ip->i_next_unlinked);
        }

        /*
         * Reset this inode's incore list pointers.  This happens whether or
         * not the final update above succeeded.
         */
        ip->i_next_unlinked = NULLAGINO;
        ip->i_prev_unlinked = 0;
        return error;
}
2109
2110 /*
2111  * Pull the on-disk inode from the AGI unlinked list.
2112  */
2113 STATIC int
2114 xfs_iunlink_remove(
2115         struct xfs_trans        *tp,
2116         struct xfs_perag        *pag,
2117         struct xfs_inode        *ip)
2118 {
2119         struct xfs_buf          *agibp;
2120         int                     error;
2121
2122         trace_xfs_iunlink_remove(ip);
2123
2124         /* Get the agi buffer first.  It ensures lock ordering on the list. */
2125         error = xfs_read_agi(pag, tp, &agibp);
2126         if (error)
2127                 return error;
2128
2129         return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
2130 }
2131
/*
 * Look up the inode number specified and if it is not already marked XFS_ISTALE
 * mark it stale. We should only find clean inodes in this lookup that aren't
 * already stale.
 *
 * @pag:     perag whose inode cache radix tree is searched
 * @free_ip: the inode currently being freed; its ILOCK is neither taken nor
 *           dropped here (presumably the caller already holds it - confirm
 *           against the caller)
 * @inum:    inode number to look up and mark stale
 *
 * Nothing is returned.  If another inode's ILOCK cannot be taken without
 * blocking, all locks are dropped and the lookup is retried after a delay.
 */
static void
xfs_ifree_mark_inode_stale(
        struct xfs_perag        *pag,
        struct xfs_inode        *free_ip,
        xfs_ino_t               inum)
{
        struct xfs_mount        *mp = pag->pag_mount;
        struct xfs_inode_log_item *iip;
        struct xfs_inode        *ip;

retry:
        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));

        /* Inode not in memory, nothing to do */
        if (!ip) {
                rcu_read_unlock();
                return;
        }

        /*
         * Because this is an RCU protected lookup, we could find a recently
         * freed or even reallocated inode during the lookup. We need to check
         * under the i_flags_lock for a valid inode here. Skip it if it is not
         * valid, the wrong inode or stale.
         */
        spin_lock(&ip->i_flags_lock);
        if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
                goto out_iflags_unlock;

        /*
         * Don't try to lock/unlock the current inode, but we _cannot_ skip the
         * other inodes that we did not find in the list attached to the buffer
         * and are not already marked stale. If we can't lock it, back off and
         * retry.
         */
        if (ip != free_ip) {
                if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
                        /* Drop everything we hold before sleeping. */
                        spin_unlock(&ip->i_flags_lock);
                        rcu_read_unlock();
                        delay(1);
                        goto retry;
                }
        }
        ip->i_flags |= XFS_ISTALE;

        /*
         * If the inode is flushing, it is already attached to the buffer.  All
         * we needed to do here is mark the inode stale so buffer IO completion
         * will remove it from the AIL.
         */
        iip = ip->i_itemp;
        if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
                ASSERT(!list_empty(&iip->ili_item.li_bio_list));
                ASSERT(iip->ili_last_fields);
                goto out_iunlock;
        }

        /*
         * Inodes not attached to the buffer can be released immediately.
         * Everything else has to go through xfs_iflush_abort() on journal
         * commit as the flock synchronises removal of the inode from the
         * cluster buffer against inode reclaim.
         */
        if (!iip || list_empty(&iip->ili_item.li_bio_list))
                goto out_iunlock;

        /*
         * Mark the inode as flushing while still under i_flags_lock, then
         * drop that lock and the RCU read lock before taking ili_lock below.
         */
        __xfs_iflags_set(ip, XFS_IFLUSHING);
        spin_unlock(&ip->i_flags_lock);
        rcu_read_unlock();

        /* we have a dirty inode in memory that has not yet been flushed. */
        spin_lock(&iip->ili_lock);
        iip->ili_last_fields = iip->ili_fields;
        iip->ili_fields = 0;
        iip->ili_fsync_fields = 0;
        spin_unlock(&iip->ili_lock);
        ASSERT(iip->ili_last_fields);

        /* i_flags_lock and RCU are already released on this path. */
        if (ip != free_ip)
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return;

out_iunlock:
        /* Only release the ILOCK if it was taken above. */
        if (ip != free_ip)
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_iflags_unlock:
        spin_unlock(&ip->i_flags_lock);
        rcu_read_unlock();
}
2227
2228 /*
2229  * A big issue when freeing the inode cluster is that we _cannot_ skip any
2230  * inodes that are in memory - they all must be marked stale and attached to
2231  * the cluster buffer.
2232  */
2233 static int
2234 xfs_ifree_cluster(
2235         struct xfs_trans        *tp,
2236         struct xfs_perag        *pag,
2237         struct xfs_inode        *free_ip,
2238         struct xfs_icluster     *xic)
2239 {
2240         struct xfs_mount        *mp = free_ip->i_mount;
2241         struct xfs_ino_geometry *igeo = M_IGEO(mp);
2242         struct xfs_buf          *bp;
2243         xfs_daddr_t             blkno;
2244         xfs_ino_t               inum = xic->first_ino;
2245         int                     nbufs;
2246         int                     i, j;
2247         int                     ioffset;
2248         int                     error;
2249
2250         nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2251
2252         for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2253                 /*
2254                  * The allocation bitmap tells us which inodes of the chunk were
2255                  * physically allocated. Skip the cluster if an inode falls into
2256                  * a sparse region.
2257                  */
2258                 ioffset = inum - xic->first_ino;
2259                 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2260                         ASSERT(ioffset % igeo->inodes_per_cluster == 0);
2261                         continue;
2262                 }
2263
2264                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2265                                          XFS_INO_TO_AGBNO(mp, inum));
2266
2267                 /*
2268                  * We obtain and lock the backing buffer first in the process
2269                  * here to ensure dirty inodes attached to the buffer remain in
2270                  * the flushing state while we mark them stale.
2271                  *
2272                  * If we scan the in-memory inodes first, then buffer IO can
2273                  * complete before we get a lock on it, and hence we may fail
2274                  * to mark all the active inodes on the buffer stale.
2275                  */
2276                 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2277                                 mp->m_bsize * igeo->blocks_per_cluster,
2278                                 XBF_UNMAPPED, &bp);
2279                 if (error)
2280                         return error;
2281
2282                 /*
2283                  * This buffer may not have been correctly initialised as we
2284                  * didn't read it from disk. That's not important because we are
2285                  * only using to mark the buffer as stale in the log, and to
2286                  * attach stale cached inodes on it. That means it will never be
2287                  * dispatched for IO. If it is, we want to know about it, and we
2288                  * want it to fail. We can acheive this by adding a write
2289                  * verifier to the buffer.
2290                  */
2291                 bp->b_ops = &xfs_inode_buf_ops;
2292
2293                 /*
2294                  * Now we need to set all the cached clean inodes as XFS_ISTALE,
2295                  * too. This requires lookups, and will skip inodes that we've
2296                  * already marked XFS_ISTALE.
2297                  */
2298                 for (i = 0; i < igeo->inodes_per_cluster; i++)
2299                         xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
2300
2301                 xfs_trans_stale_inode_buf(tp, bp);
2302                 xfs_trans_binval(tp, bp);
2303         }
2304         return 0;
2305 }
2306
/*
 * This is called to return an inode to the inode free list.  The inode should
 * already be truncated to 0 length and have no pages associated with it.  This
 * routine also assumes that the inode is already a part of the transaction.
 *
 * The on-disk copy of the inode will have been added to the list of unlinked
 * inodes in the AGI. We need to remove the inode from that list atomically with
 * respect to freeing it here.
 *
 * The caller must hold the inode's ILOCK exclusively.  Returns 0 on success,
 * or the error from xfs_difree(), xfs_iunlink_remove() or xfs_ifree_cluster().
 */
int
xfs_ifree(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_perag        *pag;
        struct xfs_icluster     xic = { 0 };
        struct xfs_inode_log_item *iip = ip->i_itemp;
        int                     error;

        xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
        ASSERT(VFS_I(ip)->i_nlink == 0);
        ASSERT(ip->i_df.if_nextents == 0);
        ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
        ASSERT(ip->i_nblocks == 0);

        /* Reference dropped at the out label on every path. */
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));

        /*
         * Free the inode first so that we guarantee that the AGI lock is going
         * to be taken before we remove the inode from the unlinked list. This
         * makes the AGI lock -> unlinked list modification order the same as
         * used in O_TMPFILE creation.
         */
        error = xfs_difree(tp, pag, ip->i_ino, &xic);
        if (error)
                goto out;

        error = xfs_iunlink_remove(tp, pag, ip);
        if (error)
                goto out;

        /*
         * Free any local-format data sitting around before we reset the
         * data fork to extents format.  Note that the attr fork data has
         * already been freed by xfs_attr_inactive.
         */
        if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
                kfree(ip->i_df.if_data);
                ip->i_df.if_data = NULL;
                ip->i_df.if_bytes = 0;
        }

        VFS_I(ip)->i_mode = 0;          /* mark incore inode as free */
        ip->i_diflags = 0;
        ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
        ip->i_forkoff = 0;              /* mark the attr fork not in use */
        ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
        if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
                xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);

        /* Don't attempt to replay owner changes for a deleted inode */
        spin_lock(&iip->ili_lock);
        iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
        spin_unlock(&iip->ili_lock);

        /*
         * Bump the generation count so no one will be confused
         * by reincarnations of this inode.
         */
        VFS_I(ip)->i_generation++;
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        /*
         * xfs_difree() sets xic.deleted when there are inode cluster buffers
         * that must now be invalidated.
         */
        if (xic.deleted)
                error = xfs_ifree_cluster(tp, pag, ip, &xic);
out:
        xfs_perag_put(pag);
        return error;
}
2386
2387 /*
2388  * This is called to unpin an inode.  The caller must have the inode locked
2389  * in at least shared mode so that the buffer cannot be subsequently pinned
2390  * once someone is waiting for it to be unpinned.
2391  */
2392 static void
2393 xfs_iunpin(
2394         struct xfs_inode        *ip)
2395 {
2396         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
2397
2398         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2399
2400         /* Give the log a push to start the unpinning I/O */
2401         xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
2402
2403 }
2404
2405 static void
2406 __xfs_iunpin_wait(
2407         struct xfs_inode        *ip)
2408 {
2409         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2410         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2411
2412         xfs_iunpin(ip);
2413
2414         do {
2415                 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2416                 if (xfs_ipincount(ip))
2417                         io_schedule();
2418         } while (xfs_ipincount(ip));
2419         finish_wait(wq, &wait.wq_entry);
2420 }
2421
/* Wait for the inode to be unpinned; returns at once if it is not pinned. */
void
xfs_iunpin_wait(
        struct xfs_inode        *ip)
{
        if (!xfs_ipincount(ip))
                return;

        __xfs_iunpin_wait(ip);
}
2429
2430 /*
2431  * Removing an inode from the namespace involves removing the directory entry
2432  * and dropping the link count on the inode. Removing the directory entry can
2433  * result in locking an AGF (directory blocks were freed) and removing a link
2434  * count can result in placing the inode on an unlinked list which results in
2435  * locking an AGI.
2436  *
2437  * The big problem here is that we have an ordering constraint on AGF and AGI
2438  * locking - inode allocation locks the AGI, then can allocate a new extent for
2439  * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2440  * removes the inode from the unlinked list, requiring that we lock the AGI
2441  * first, and then freeing the inode can result in an inode chunk being freed
2442  * and hence freeing disk space requiring that we lock an AGF.
2443  *
2444  * Hence the ordering that is imposed by other parts of the code is AGI before
2445  * AGF. This means we cannot remove the directory entry before we drop the inode
2446  * reference count and put it on the unlinked list as this results in a lock
2447  * order of AGF then AGI, and this can deadlock against inode allocation and
2448  * freeing. Therefore we must drop the link counts before we remove the
2449  * directory entry.
2450  *
2451  * This is still safe from a transactional point of view - it is not until we
2452  * get to xfs_defer_finish() that we have the possibility of multiple
2453  * transactions in this operation. Hence as long as we remove the directory
2454  * entry and drop the link count in the first transaction of the remove
2455  * operation, there are no transactional constraints on the ordering here.
2456  */
2457 int
2458 xfs_remove(
2459         xfs_inode_t             *dp,
2460         struct xfs_name         *name,
2461         xfs_inode_t             *ip)
2462 {
2463         xfs_mount_t             *mp = dp->i_mount;
2464         xfs_trans_t             *tp = NULL;
2465         int                     is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2466         int                     dontcare;
2467         int                     error = 0;
2468         uint                    resblks;
2469
2470         trace_xfs_remove(dp, name);
2471
2472         if (xfs_is_shutdown(mp))
2473                 return -EIO;
2474         if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
2475                 return -EIO;
2476
2477         error = xfs_qm_dqattach(dp);
2478         if (error)
2479                 goto std_return;
2480
2481         error = xfs_qm_dqattach(ip);
2482         if (error)
2483                 goto std_return;
2484
2485         /*
2486          * We try to get the real space reservation first, allowing for
2487          * directory btree deletion(s) implying possible bmap insert(s).  If we
2488          * can't get the space reservation then we use 0 instead, and avoid the
2489          * bmap btree insert(s) in the directory code by, if the bmap insert
2490          * tries to happen, instead trimming the LAST block from the directory.
2491          *
2492          * Ignore EDQUOT and ENOSPC being returned via nospace_error because
2493          * the directory code can handle a reservationless update and we don't
2494          * want to prevent a user from trying to free space by deleting things.
2495          */
2496         resblks = XFS_REMOVE_SPACE_RES(mp);
2497         error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
2498                         &tp, &dontcare);
2499         if (error) {
2500                 ASSERT(error != -ENOSPC);
2501                 goto std_return;
2502         }
2503
2504         /*
2505          * If we're removing a directory perform some additional validation.
2506          */
2507         if (is_dir) {
2508                 ASSERT(VFS_I(ip)->i_nlink >= 2);
2509                 if (VFS_I(ip)->i_nlink != 2) {
2510                         error = -ENOTEMPTY;
2511                         goto out_trans_cancel;
2512                 }
2513                 if (!xfs_dir_isempty(ip)) {
2514                         error = -ENOTEMPTY;
2515                         goto out_trans_cancel;
2516                 }
2517
2518                 /* Drop the link from ip's "..".  */
2519                 error = xfs_droplink(tp, dp);
2520                 if (error)
2521                         goto out_trans_cancel;
2522
2523                 /* Drop the "." link from ip to self.  */
2524                 error = xfs_droplink(tp, ip);
2525                 if (error)
2526                         goto out_trans_cancel;
2527
2528                 /*
2529                  * Point the unlinked child directory's ".." entry to the root
2530                  * directory to eliminate back-references to inodes that may
2531                  * get freed before the child directory is closed.  If the fs
2532                  * gets shrunk, this can lead to dirent inode validation errors.
2533                  */
2534                 if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
2535                         error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
2536                                         tp->t_mountp->m_sb.sb_rootino, 0);
2537                         if (error)
2538                                 goto out_trans_cancel;
2539                 }
2540         } else {
2541                 /*
2542                  * When removing a non-directory we need to log the parent
2543                  * inode here.  For a directory this is done implicitly
2544                  * by the xfs_droplink call for the ".." entry.
2545                  */
2546                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2547         }
2548         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2549
2550         /* Drop the link from dp to ip. */
2551         error = xfs_droplink(tp, ip);
2552         if (error)
2553                 goto out_trans_cancel;
2554
2555         error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2556         if (error) {
2557                 ASSERT(error != -ENOENT);
2558                 goto out_trans_cancel;
2559         }
2560
2561         /*
2562          * If this is a synchronous mount, make sure that the
2563          * remove transaction goes to disk before returning to
2564          * the user.
2565          */
2566         if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
2567                 xfs_trans_set_sync(tp);
2568
2569         error = xfs_trans_commit(tp);
2570         if (error)
2571                 goto std_return;
2572
2573         if (is_dir && xfs_inode_is_filestream(ip))
2574                 xfs_filestream_deassociate(ip);
2575
2576         return 0;
2577
2578  out_trans_cancel:
2579         xfs_trans_cancel(tp);
2580  std_return:
2581         return error;
2582 }
2583
2584 /*
2585  * Enter all inodes for a rename transaction into a sorted array.
2586  */
2587 #define __XFS_SORT_INODES       5
2588 STATIC void
2589 xfs_sort_for_rename(
2590         struct xfs_inode        *dp1,   /* in: old (source) directory inode */
2591         struct xfs_inode        *dp2,   /* in: new (target) directory inode */
2592         struct xfs_inode        *ip1,   /* in: inode of old entry */
2593         struct xfs_inode        *ip2,   /* in: inode of new entry */
2594         struct xfs_inode        *wip,   /* in: whiteout inode */
2595         struct xfs_inode        **i_tab,/* out: sorted array of inodes */
2596         int                     *num_inodes)  /* in/out: inodes in array */
2597 {
2598         int                     i, j;
2599
2600         ASSERT(*num_inodes == __XFS_SORT_INODES);
2601         memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2602
2603         /*
2604          * i_tab contains a list of pointers to inodes.  We initialize
2605          * the table here & we'll sort it.  We will then use it to
2606          * order the acquisition of the inode locks.
2607          *
2608          * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2609          */
2610         i = 0;
2611         i_tab[i++] = dp1;
2612         i_tab[i++] = dp2;
2613         i_tab[i++] = ip1;
2614         if (ip2)
2615                 i_tab[i++] = ip2;
2616         if (wip)
2617                 i_tab[i++] = wip;
2618         *num_inodes = i;
2619
2620         /*
2621          * Sort the elements via bubble sort.  (Remember, there are at
2622          * most 5 elements to sort, so this is adequate.)
2623          */
2624         for (i = 0; i < *num_inodes; i++) {
2625                 for (j = 1; j < *num_inodes; j++) {
2626                         if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2627                                 struct xfs_inode *temp = i_tab[j];
2628                                 i_tab[j] = i_tab[j-1];
2629                                 i_tab[j-1] = temp;
2630                         }
2631                 }
2632         }
2633 }
2634
2635 static int
2636 xfs_finish_rename(
2637         struct xfs_trans        *tp)
2638 {
2639         /*
2640          * If this is a synchronous mount, make sure that the rename transaction
2641          * goes to disk before returning to the user.
2642          */
2643         if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
2644                 xfs_trans_set_sync(tp);
2645
2646         return xfs_trans_commit(tp);
2647 }
2648
2649 /*
2650  * xfs_cross_rename()
2651  *
2652  * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
2653  */
2654 STATIC int
2655 xfs_cross_rename(
2656         struct xfs_trans        *tp,
2657         struct xfs_inode        *dp1,
2658         struct xfs_name         *name1,
2659         struct xfs_inode        *ip1,
2660         struct xfs_inode        *dp2,
2661         struct xfs_name         *name2,
2662         struct xfs_inode        *ip2,
2663         int                     spaceres)
2664 {
2665         int             error = 0;
2666         int             ip1_flags = 0;
2667         int             ip2_flags = 0;
2668         int             dp2_flags = 0;
2669
2670         /* Swap inode number for dirent in first parent */
2671         error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
2672         if (error)
2673                 goto out_trans_abort;
2674
2675         /* Swap inode number for dirent in second parent */
2676         error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
2677         if (error)
2678                 goto out_trans_abort;
2679
2680         /*
2681          * If we're renaming one or more directories across different parents,
2682          * update the respective ".." entries (and link counts) to match the new
2683          * parents.
2684          */
2685         if (dp1 != dp2) {
2686                 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2687
2688                 if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2689                         error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2690                                                 dp1->i_ino, spaceres);
2691                         if (error)
2692                                 goto out_trans_abort;
2693
2694                         /* transfer ip2 ".." reference to dp1 */
2695                         if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2696                                 error = xfs_droplink(tp, dp2);
2697                                 if (error)
2698                                         goto out_trans_abort;
2699                                 xfs_bumplink(tp, dp1);
2700                         }
2701
2702                         /*
2703                          * Although ip1 isn't changed here, userspace needs
2704                          * to be warned about the change, so that applications
2705                          * relying on it (like backup ones), will properly
2706                          * notify the change
2707                          */
2708                         ip1_flags |= XFS_ICHGTIME_CHG;
2709                         ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2710                 }
2711
2712                 if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2713                         error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2714                                                 dp2->i_ino, spaceres);
2715                         if (error)
2716                                 goto out_trans_abort;
2717
2718                         /* transfer ip1 ".." reference to dp2 */
2719                         if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2720                                 error = xfs_droplink(tp, dp1);
2721                                 if (error)
2722                                         goto out_trans_abort;
2723                                 xfs_bumplink(tp, dp2);
2724                         }
2725
2726                         /*
2727                          * Although ip2 isn't changed here, userspace needs
2728                          * to be warned about the change, so that applications
2729                          * relying on it (like backup ones), will properly
2730                          * notify the change
2731                          */
2732                         ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2733                         ip2_flags |= XFS_ICHGTIME_CHG;
2734                 }
2735         }
2736
2737         if (ip1_flags) {
2738                 xfs_trans_ichgtime(tp, ip1, ip1_flags);
2739                 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2740         }
2741         if (ip2_flags) {
2742                 xfs_trans_ichgtime(tp, ip2, ip2_flags);
2743                 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2744         }
2745         if (dp2_flags) {
2746                 xfs_trans_ichgtime(tp, dp2, dp2_flags);
2747                 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2748         }
2749         xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2750         xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2751         return xfs_finish_rename(tp);
2752
2753 out_trans_abort:
2754         xfs_trans_cancel(tp);
2755         return error;
2756 }
2757
2758 /*
2759  * xfs_rename_alloc_whiteout()
2760  *
2761  * Return a referenced, unlinked, unlocked inode that can be used as a
2762  * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2763  * crash between allocating the inode and linking it into the rename transaction
2764  * recovery will free the inode and we won't leak it.
2765  */
2766 static int
2767 xfs_rename_alloc_whiteout(
2768         struct mnt_idmap        *idmap,
2769         struct xfs_name         *src_name,
2770         struct xfs_inode        *dp,
2771         struct xfs_inode        **wip)
2772 {
2773         struct xfs_inode        *tmpfile;
2774         struct qstr             name;
2775         int                     error;
2776
2777         error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
2778                                    &tmpfile);
2779         if (error)
2780                 return error;
2781
2782         name.name = src_name->name;
2783         name.len = src_name->len;
2784         error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
2785         if (error) {
2786                 xfs_finish_inode_setup(tmpfile);
2787                 xfs_irele(tmpfile);
2788                 return error;
2789         }
2790
2791         /*
2792          * Prepare the tmpfile inode as if it were created through the VFS.
2793          * Complete the inode setup and flag it as linkable.  nlink is already
2794          * zero, so we can skip the drop_nlink.
2795          */
2796         xfs_setup_iops(tmpfile);
2797         xfs_finish_inode_setup(tmpfile);
2798         VFS_I(tmpfile)->i_state |= I_LINKABLE;
2799
2800         *wip = tmpfile;
2801         return 0;
2802 }
2803
2804 /*
2805  * xfs_rename
2806  */
2807 int
2808 xfs_rename(
2809         struct mnt_idmap        *idmap,
2810         struct xfs_inode        *src_dp,
2811         struct xfs_name         *src_name,
2812         struct xfs_inode        *src_ip,
2813         struct xfs_inode        *target_dp,
2814         struct xfs_name         *target_name,
2815         struct xfs_inode        *target_ip,
2816         unsigned int            flags)
2817 {
2818         struct xfs_mount        *mp = src_dp->i_mount;
2819         struct xfs_trans        *tp;
2820         struct xfs_inode        *wip = NULL;            /* whiteout inode */
2821         struct xfs_inode        *inodes[__XFS_SORT_INODES];
2822         int                     i;
2823         int                     num_inodes = __XFS_SORT_INODES;
2824         bool                    new_parent = (src_dp != target_dp);
2825         bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2826         int                     spaceres;
2827         bool                    retried = false;
2828         int                     error, nospace_error = 0;
2829
2830         trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2831
2832         if ((flags & RENAME_EXCHANGE) && !target_ip)
2833                 return -EINVAL;
2834
2835         /*
2836          * If we are doing a whiteout operation, allocate the whiteout inode
2837          * we will be placing at the target and ensure the type is set
2838          * appropriately.
2839          */
2840         if (flags & RENAME_WHITEOUT) {
2841                 error = xfs_rename_alloc_whiteout(idmap, src_name,
2842                                                   target_dp, &wip);
2843                 if (error)
2844                         return error;
2845
2846                 /* setup target dirent info as whiteout */
2847                 src_name->type = XFS_DIR3_FT_CHRDEV;
2848         }
2849
2850         xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2851                                 inodes, &num_inodes);
2852
2853 retry:
2854         nospace_error = 0;
2855         spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2856         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
2857         if (error == -ENOSPC) {
2858                 nospace_error = error;
2859                 spaceres = 0;
2860                 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
2861                                 &tp);
2862         }
2863         if (error)
2864                 goto out_release_wip;
2865
2866         /*
2867          * Attach the dquots to the inodes
2868          */
2869         error = xfs_qm_vop_rename_dqattach(inodes);
2870         if (error)
2871                 goto out_trans_cancel;
2872
2873         /*
2874          * Lock all the participating inodes. Depending upon whether
2875          * the target_name exists in the target directory, and
2876          * whether the target directory is the same as the source
2877          * directory, we can lock from 2 to 5 inodes.
2878          */
2879         xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2880
2881         /*
2882          * Join all the inodes to the transaction. From this point on,
2883          * we can rely on either trans_commit or trans_cancel to unlock
2884          * them.
2885          */
2886         xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2887         if (new_parent)
2888                 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2889         xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2890         if (target_ip)
2891                 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2892         if (wip)
2893                 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2894
2895         /*
2896          * If we are using project inheritance, we only allow renames
2897          * into our tree when the project IDs are the same; else the
2898          * tree quota mechanism would be circumvented.
2899          */
2900         if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
2901                      target_dp->i_projid != src_ip->i_projid)) {
2902                 error = -EXDEV;
2903                 goto out_trans_cancel;
2904         }
2905
2906         /* RENAME_EXCHANGE is unique from here on. */
2907         if (flags & RENAME_EXCHANGE)
2908                 return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2909                                         target_dp, target_name, target_ip,
2910                                         spaceres);
2911
2912         /*
2913          * Try to reserve quota to handle an expansion of the target directory.
2914          * We'll allow the rename to continue in reservationless mode if we hit
2915          * a space usage constraint.  If we trigger reservationless mode, save
2916          * the errno if there isn't any free space in the target directory.
2917          */
2918         if (spaceres != 0) {
2919                 error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
2920                                 0, false);
2921                 if (error == -EDQUOT || error == -ENOSPC) {
2922                         if (!retried) {
2923                                 xfs_trans_cancel(tp);
2924                                 xfs_blockgc_free_quota(target_dp, 0);
2925                                 retried = true;
2926                                 goto retry;
2927                         }
2928
2929                         nospace_error = error;
2930                         spaceres = 0;
2931                         error = 0;
2932                 }
2933                 if (error)
2934                         goto out_trans_cancel;
2935         }
2936
2937         /*
2938          * Check for expected errors before we dirty the transaction
2939          * so we can return an error without a transaction abort.
2940          */
2941         if (target_ip == NULL) {
2942                 /*
2943                  * If there's no space reservation, check the entry will
2944                  * fit before actually inserting it.
2945                  */
2946                 if (!spaceres) {
2947                         error = xfs_dir_canenter(tp, target_dp, target_name);
2948                         if (error)
2949                                 goto out_trans_cancel;
2950                 }
2951         } else {
2952                 /*
2953                  * If target exists and it's a directory, check that whether
2954                  * it can be destroyed.
2955                  */
2956                 if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
2957                     (!xfs_dir_isempty(target_ip) ||
2958                      (VFS_I(target_ip)->i_nlink > 2))) {
2959                         error = -EEXIST;
2960                         goto out_trans_cancel;
2961                 }
2962         }
2963
2964         /*
2965          * Lock the AGI buffers we need to handle bumping the nlink of the
2966          * whiteout inode off the unlinked list and to handle dropping the
2967          * nlink of the target inode.  Per locking order rules, do this in
2968          * increasing AG order and before directory block allocation tries to
2969          * grab AGFs because we grab AGIs before AGFs.
2970          *
2971          * The (vfs) caller must ensure that if src is a directory then
2972          * target_ip is either null or an empty directory.
2973          */
2974         for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
2975                 if (inodes[i] == wip ||
2976                     (inodes[i] == target_ip &&
2977                      (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
2978                         struct xfs_perag        *pag;
2979                         struct xfs_buf          *bp;
2980
2981                         pag = xfs_perag_get(mp,
2982                                         XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
2983                         error = xfs_read_agi(pag, tp, &bp);
2984                         xfs_perag_put(pag);
2985                         if (error)
2986                                 goto out_trans_cancel;
2987                 }
2988         }
2989
2990         /*
2991          * Directory entry creation below may acquire the AGF. Remove
2992          * the whiteout from the unlinked list first to preserve correct
2993          * AGI/AGF locking order. This dirties the transaction so failures
2994          * after this point will abort and log recovery will clean up the
2995          * mess.
2996          *
2997          * For whiteouts, we need to bump the link count on the whiteout
2998          * inode. After this point, we have a real link, clear the tmpfile
2999          * state flag from the inode so it doesn't accidentally get misused
3000          * in future.
3001          */
3002         if (wip) {
3003                 struct xfs_perag        *pag;
3004
3005                 ASSERT(VFS_I(wip)->i_nlink == 0);
3006
3007                 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
3008                 error = xfs_iunlink_remove(tp, pag, wip);
3009                 xfs_perag_put(pag);
3010                 if (error)
3011                         goto out_trans_cancel;
3012
3013                 xfs_bumplink(tp, wip);
3014                 VFS_I(wip)->i_state &= ~I_LINKABLE;
3015         }
3016
3017         /*
3018          * Set up the target.
3019          */
3020         if (target_ip == NULL) {
3021                 /*
3022                  * If target does not exist and the rename crosses
3023                  * directories, adjust the target directory link count
3024                  * to account for the ".." reference from the new entry.
3025                  */
3026                 error = xfs_dir_createname(tp, target_dp, target_name,
3027                                            src_ip->i_ino, spaceres);
3028                 if (error)
3029                         goto out_trans_cancel;
3030
3031                 xfs_trans_ichgtime(tp, target_dp,
3032                                         XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3033
3034                 if (new_parent && src_is_directory) {
3035                         xfs_bumplink(tp, target_dp);
3036                 }
3037         } else { /* target_ip != NULL */
3038                 /*
3039                  * Link the source inode under the target name.
3040                  * If the source inode is a directory and we are moving
3041                  * it across directories, its ".." entry will be
3042                  * inconsistent until we replace that down below.
3043                  *
3044                  * In case there is already an entry with the same
3045                  * name at the destination directory, remove it first.
3046                  */
3047                 error = xfs_dir_replace(tp, target_dp, target_name,
3048                                         src_ip->i_ino, spaceres);
3049                 if (error)
3050                         goto out_trans_cancel;
3051
3052                 xfs_trans_ichgtime(tp, target_dp,
3053                                         XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3054
3055                 /*
3056                  * Decrement the link count on the target since the target
3057                  * dir no longer points to it.
3058                  */
3059                 error = xfs_droplink(tp, target_ip);
3060                 if (error)
3061                         goto out_trans_cancel;
3062
3063                 if (src_is_directory) {
3064                         /*
3065                          * Drop the link from the old "." entry.
3066                          */
3067                         error = xfs_droplink(tp, target_ip);
3068                         if (error)
3069                                 goto out_trans_cancel;
3070                 }
3071         } /* target_ip != NULL */
3072
3073         /*
3074          * Remove the source.
3075          */
3076         if (new_parent && src_is_directory) {
3077                 /*
3078                  * Rewrite the ".." entry to point to the new
3079                  * directory.
3080                  */
3081                 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3082                                         target_dp->i_ino, spaceres);
3083                 ASSERT(error != -EEXIST);
3084                 if (error)
3085                         goto out_trans_cancel;
3086         }
3087
3088         /*
3089          * We always want to hit the ctime on the source inode.
3090          *
3091          * This isn't strictly required by the standards since the source
3092          * inode isn't really being changed, but old unix file systems did
3093          * it and some incremental backup programs won't work without it.
3094          */
3095         xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3096         xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3097
3098         /*
3099          * Adjust the link count on src_dp.  This is necessary when
3100          * renaming a directory, either within one parent when
3101          * the target existed, or across two parent directories.
3102          */
3103         if (src_is_directory && (new_parent || target_ip != NULL)) {
3104
3105                 /*
3106                  * Decrement link count on src_directory since the
3107                  * entry that's moved no longer points to it.
3108                  */
3109                 error = xfs_droplink(tp, src_dp);
3110                 if (error)
3111                         goto out_trans_cancel;
3112         }
3113
3114         /*
3115          * For whiteouts, we only need to update the source dirent with the
3116          * inode number of the whiteout inode rather than removing it
3117          * altogether.
3118          */
3119         if (wip)
3120                 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3121                                         spaceres);
3122         else
3123                 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3124                                            spaceres);
3125
3126         if (error)
3127                 goto out_trans_cancel;
3128
3129         xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3130         xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3131         if (new_parent)
3132                 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3133
3134         error = xfs_finish_rename(tp);
3135         if (wip)
3136                 xfs_irele(wip);
3137         return error;
3138
3139 out_trans_cancel:
3140         xfs_trans_cancel(tp);
3141 out_release_wip:
3142         if (wip)
3143                 xfs_irele(wip);
3144         if (error == -ENOSPC && nospace_error)
3145                 error = nospace_error;
3146         return error;
3147 }
3148
3149 static int
3150 xfs_iflush(
3151         struct xfs_inode        *ip,
3152         struct xfs_buf          *bp)
3153 {
3154         struct xfs_inode_log_item *iip = ip->i_itemp;
3155         struct xfs_dinode       *dip;
3156         struct xfs_mount        *mp = ip->i_mount;
3157         int                     error;
3158
3159         xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
3160         ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3161         ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3162                ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3163         ASSERT(iip->ili_item.li_buf == bp);
3164
3165         dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3166
3167         /*
3168          * We don't flush the inode if any of the following checks fail, but we
3169          * do still update the log item and attach to the backing buffer as if
3170          * the flush happened. This is a formality to facilitate predictable
3171          * error handling as the caller will shutdown and fail the buffer.
3172          */
3173         error = -EFSCORRUPTED;
3174         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3175                                mp, XFS_ERRTAG_IFLUSH_1)) {
3176                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3177                         "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
3178                         __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3179                 goto flush_out;
3180         }
3181         if (S_ISREG(VFS_I(ip)->i_mode)) {
3182                 if (XFS_TEST_ERROR(
3183                     ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3184                     ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3185                     mp, XFS_ERRTAG_IFLUSH_3)) {
3186                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3187                                 "%s: Bad regular inode %llu, ptr "PTR_FMT,
3188                                 __func__, ip->i_ino, ip);
3189                         goto flush_out;
3190                 }
3191         } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3192                 if (XFS_TEST_ERROR(
3193                     ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3194                     ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3195                     ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3196                     mp, XFS_ERRTAG_IFLUSH_4)) {
3197                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3198                                 "%s: Bad directory inode %llu, ptr "PTR_FMT,
3199                                 __func__, ip->i_ino, ip);
3200                         goto flush_out;
3201                 }
3202         }
3203         if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
3204                                 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3205                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3206                         "%s: detected corrupt incore inode %llu, "
3207                         "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
3208                         __func__, ip->i_ino,
3209                         ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
3210                         ip->i_nblocks, ip);
3211                 goto flush_out;
3212         }
3213         if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
3214                                 mp, XFS_ERRTAG_IFLUSH_6)) {
3215                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3216                         "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
3217                         __func__, ip->i_ino, ip->i_forkoff, ip);
3218                 goto flush_out;
3219         }
3220
3221         /*
3222          * Inode item log recovery for v2 inodes are dependent on the flushiter
3223          * count for correct sequencing.  We bump the flush iteration count so
3224          * we can detect flushes which postdate a log record during recovery.
3225          * This is redundant as we now log every change and hence this can't
3226          * happen but we need to still do it to ensure backwards compatibility
3227          * with old kernels that predate logging all inode changes.
3228          */
3229         if (!xfs_has_v3inodes(mp))
3230                 ip->i_flushiter++;
3231
3232         /*
3233          * If there are inline format data / attr forks attached to this inode,
3234          * make sure they are not corrupt.
3235          */
3236         if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3237             xfs_ifork_verify_local_data(ip))
3238                 goto flush_out;
3239         if (xfs_inode_has_attr_fork(ip) &&
3240             ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
3241             xfs_ifork_verify_local_attr(ip))
3242                 goto flush_out;
3243
3244         /*
3245          * Copy the dirty parts of the inode into the on-disk inode.  We always
3246          * copy out the core of the inode, because if the inode is dirty at all
3247          * the core must be.
3248          */
3249         xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3250
3251         /* Wrap, we never let the log put out DI_MAX_FLUSH */
3252         if (!xfs_has_v3inodes(mp)) {
3253                 if (ip->i_flushiter == DI_MAX_FLUSH)
3254                         ip->i_flushiter = 0;
3255         }
3256
3257         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3258         if (xfs_inode_has_attr_fork(ip))
3259                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3260
3261         /*
3262          * We've recorded everything logged in the inode, so we'd like to clear
3263          * the ili_fields bits so we don't log and flush things unnecessarily.
3264          * However, we can't stop logging all this information until the data
3265          * we've copied into the disk buffer is written to disk.  If we did we
3266          * might overwrite the copy of the inode in the log with all the data
3267          * after re-logging only part of it, and in the face of a crash we
3268          * wouldn't have all the data we need to recover.
3269          *
3270          * What we do is move the bits to the ili_last_fields field.  When
3271          * logging the inode, these bits are moved back to the ili_fields field.
3272          * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3273          * we know that the information those bits represent is permanently on
3274          * disk.  As long as the flush completes before the inode is logged
3275          * again, then both ili_fields and ili_last_fields will be cleared.
3276          */
3277         error = 0;
3278 flush_out:
3279         spin_lock(&iip->ili_lock);
3280         iip->ili_last_fields = iip->ili_fields;
3281         iip->ili_fields = 0;
3282         iip->ili_fsync_fields = 0;
3283         spin_unlock(&iip->ili_lock);
3284
3285         /*
3286          * Store the current LSN of the inode so that we can tell whether the
3287          * item has moved in the AIL from xfs_buf_inode_iodone().
3288          */
3289         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3290                                 &iip->ili_item.li_lsn);
3291
3292         /* generate the checksum. */
3293         xfs_dinode_calc_crc(mp, dip);
3294         return error;
3295 }
3296
3297 /*
3298  * Non-blocking flush of dirty inode metadata into the backing buffer.
3299  *
3300  * The caller must have a reference to the inode and hold the cluster buffer
3301  * locked. The function will walk across all the inodes on the cluster buffer it
3302  * can find and lock without blocking, and flush them to the cluster buffer.
3303  *
3304  * On successful flushing of at least one inode, the caller must write out the
3305  * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3306  * the caller needs to release the buffer. On failure, the filesystem will be
3307  * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3308  * will be returned.
3309  */
3310 int
3311 xfs_iflush_cluster(
3312         struct xfs_buf          *bp)
3313 {
3314         struct xfs_mount        *mp = bp->b_mount;
3315         struct xfs_log_item     *lip, *n;
3316         struct xfs_inode        *ip;
3317         struct xfs_inode_log_item *iip;
3318         int                     clcount = 0;
3319         int                     error = 0;
3320
3321         /*
3322          * We must use the safe variant here as on shutdown xfs_iflush_abort()
3323          * will remove itself from the list.
3324          */
3325         list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3326                 iip = (struct xfs_inode_log_item *)lip;
3327                 ip = iip->ili_inode;
3328
3329                 /*
3330                  * Quick and dirty check to avoid locks if possible.
3331                  */
3332                 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3333                         continue;
3334                 if (xfs_ipincount(ip))
3335                         continue;
3336
3337                 /*
3338                  * The inode is still attached to the buffer, which means it is
3339                  * dirty but reclaim might try to grab it. Check carefully for
3340                  * that, and grab the ilock while still holding the i_flags_lock
3341                  * to guarantee reclaim will not be able to reclaim this inode
3342                  * once we drop the i_flags_lock.
3343                  */
3344                 spin_lock(&ip->i_flags_lock);
3345                 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3346                 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3347                         spin_unlock(&ip->i_flags_lock);
3348                         continue;
3349                 }
3350
3351                 /*
3352                  * ILOCK will pin the inode against reclaim and prevent
3353                  * concurrent transactions modifying the inode while we are
3354                  * flushing the inode. If we get the lock, set the flushing
3355                  * state before we drop the i_flags_lock.
3356                  */
3357                 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3358                         spin_unlock(&ip->i_flags_lock);
3359                         continue;
3360                 }
3361                 __xfs_iflags_set(ip, XFS_IFLUSHING);
3362                 spin_unlock(&ip->i_flags_lock);
3363
3364                 /*
3365                  * Abort flushing this inode if we are shut down because the
3366                  * inode may not currently be in the AIL. This can occur when
3367                  * log I/O failure unpins the inode without inserting into the
3368                  * AIL, leaving a dirty/unpinned inode attached to the buffer
3369                  * that otherwise looks like it should be flushed.
3370                  */
3371                 if (xlog_is_shutdown(mp->m_log)) {
3372                         xfs_iunpin_wait(ip);
3373                         xfs_iflush_abort(ip);
3374                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3375                         error = -EIO;
3376                         continue;
3377                 }
3378
3379                 /* don't block waiting on a log force to unpin dirty inodes */
3380                 if (xfs_ipincount(ip)) {
3381                         xfs_iflags_clear(ip, XFS_IFLUSHING);
3382                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3383                         continue;
3384                 }
3385
3386                 if (!xfs_inode_clean(ip))
3387                         error = xfs_iflush(ip, bp);
3388                 else
3389                         xfs_iflags_clear(ip, XFS_IFLUSHING);
3390                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3391                 if (error)
3392                         break;
3393                 clcount++;
3394         }
3395
3396         if (error) {
3397                 /*
3398                  * Shutdown first so we kill the log before we release this
3399                  * buffer. If it is an INODE_ALLOC buffer and pins the tail
3400                  * of the log, failing it before the _log_ is shut down can
3401                  * result in the log tail being moved forward in the journal
3402                  * on disk because log writes can still be taking place. Hence
3403                  * unpinning the tail will allow the ICREATE intent to be
3404                  * removed from the log an recovery will fail with uninitialised
3405                  * inode cluster buffers.
3406                  */
3407                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3408                 bp->b_flags |= XBF_ASYNC;
3409                 xfs_buf_ioend_fail(bp);
3410                 return error;
3411         }
3412
3413         if (!clcount)
3414                 return -EAGAIN;
3415
3416         XFS_STATS_INC(mp, xs_icluster_flushcnt);
3417         XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3418         return 0;
3419
3420 }
3421
3422 /* Release an inode. */
3423 void
3424 xfs_irele(
3425         struct xfs_inode        *ip)
3426 {
3427         trace_xfs_irele(ip, _RET_IP_);
3428         iput(VFS_I(ip));
3429 }
3430
3431 /*
3432  * Ensure all commited transactions touching the inode are written to the log.
3433  */
3434 int
3435 xfs_log_force_inode(
3436         struct xfs_inode        *ip)
3437 {
3438         xfs_csn_t               seq = 0;
3439
3440         xfs_ilock(ip, XFS_ILOCK_SHARED);
3441         if (xfs_ipincount(ip))
3442                 seq = ip->i_itemp->ili_commit_seq;
3443         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3444
3445         if (!seq)
3446                 return 0;
3447         return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3448 }
3449
3450 /*
3451  * Grab the exclusive iolock for a data copy from src to dest, making sure to
3452  * abide vfs locking order (lowest pointer value goes first) and breaking the
3453  * layout leases before proceeding.  The loop is needed because we cannot call
3454  * the blocking break_layout() with the iolocks held, and therefore have to
3455  * back out both locks.
3456  */
3457 static int
3458 xfs_iolock_two_inodes_and_break_layout(
3459         struct inode            *src,
3460         struct inode            *dest)
3461 {
3462         int                     error;
3463
3464         if (src > dest)
3465                 swap(src, dest);
3466
3467 retry:
3468         /* Wait to break both inodes' layouts before we start locking. */
3469         error = break_layout(src, true);
3470         if (error)
3471                 return error;
3472         if (src != dest) {
3473                 error = break_layout(dest, true);
3474                 if (error)
3475                         return error;
3476         }
3477
3478         /* Lock one inode and make sure nobody got in and leased it. */
3479         inode_lock(src);
3480         error = break_layout(src, false);
3481         if (error) {
3482                 inode_unlock(src);
3483                 if (error == -EWOULDBLOCK)
3484                         goto retry;
3485                 return error;
3486         }
3487
3488         if (src == dest)
3489                 return 0;
3490
3491         /* Lock the other inode and make sure nobody got in and leased it. */
3492         inode_lock_nested(dest, I_MUTEX_NONDIR2);
3493         error = break_layout(dest, false);
3494         if (error) {
3495                 inode_unlock(src);
3496                 inode_unlock(dest);
3497                 if (error == -EWOULDBLOCK)
3498                         goto retry;
3499                 return error;
3500         }
3501
3502         return 0;
3503 }
3504
3505 static int
3506 xfs_mmaplock_two_inodes_and_break_dax_layout(
3507         struct xfs_inode        *ip1,
3508         struct xfs_inode        *ip2)
3509 {
3510         int                     error;
3511         bool                    retry;
3512         struct page             *page;
3513
3514         if (ip1->i_ino > ip2->i_ino)
3515                 swap(ip1, ip2);
3516
3517 again:
3518         retry = false;
3519         /* Lock the first inode */
3520         xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3521         error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
3522         if (error || retry) {
3523                 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3524                 if (error == 0 && retry)
3525                         goto again;
3526                 return error;
3527         }
3528
3529         if (ip1 == ip2)
3530                 return 0;
3531
3532         /* Nested lock the second inode */
3533         xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
3534         /*
3535          * We cannot use xfs_break_dax_layouts() directly here because it may
3536          * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
3537          * for this nested lock case.
3538          */
3539         page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
3540         if (page && page_ref_count(page) != 1) {
3541                 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3542                 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3543                 goto again;
3544         }
3545
3546         return 0;
3547 }
3548
3549 /*
3550  * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3551  * mmap activity.
3552  */
3553 int
3554 xfs_ilock2_io_mmap(
3555         struct xfs_inode        *ip1,
3556         struct xfs_inode        *ip2)
3557 {
3558         int                     ret;
3559
3560         ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3561         if (ret)
3562                 return ret;
3563
3564         if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3565                 ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
3566                 if (ret) {
3567                         inode_unlock(VFS_I(ip2));
3568                         if (ip1 != ip2)
3569                                 inode_unlock(VFS_I(ip1));
3570                         return ret;
3571                 }
3572         } else
3573                 filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
3574                                             VFS_I(ip2)->i_mapping);
3575
3576         return 0;
3577 }
3578
3579 /* Unlock both inodes to allow IO and mmap activity. */
3580 void
3581 xfs_iunlock2_io_mmap(
3582         struct xfs_inode        *ip1,
3583         struct xfs_inode        *ip2)
3584 {
3585         if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3586                 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3587                 if (ip1 != ip2)
3588                         xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3589         } else
3590                 filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
3591                                               VFS_I(ip2)->i_mapping);
3592
3593         inode_unlock(VFS_I(ip2));
3594         if (ip1 != ip2)
3595                 inode_unlock(VFS_I(ip1));
3596 }
3597
3598 /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
3599 void
3600 xfs_iunlock2_remapping(
3601         struct xfs_inode        *ip1,
3602         struct xfs_inode        *ip2)
3603 {
3604         xfs_iflags_clear(ip1, XFS_IREMAPPING);
3605
3606         if (ip1 != ip2)
3607                 xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
3608         xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3609
3610         if (ip1 != ip2)
3611                 inode_unlock_shared(VFS_I(ip1));
3612         inode_unlock(VFS_I(ip2));
3613 }
3614
3615 /*
3616  * Reload the incore inode list for this inode.  Caller should ensure that
3617  * the link count cannot change, either by taking ILOCK_SHARED or otherwise
3618  * preventing other threads from executing.
3619  */
3620 int
3621 xfs_inode_reload_unlinked_bucket(
3622         struct xfs_trans        *tp,
3623         struct xfs_inode        *ip)
3624 {
3625         struct xfs_mount        *mp = tp->t_mountp;
3626         struct xfs_buf          *agibp;
3627         struct xfs_agi          *agi;
3628         struct xfs_perag        *pag;
3629         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
3630         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
3631         xfs_agino_t             prev_agino, next_agino;
3632         unsigned int            bucket;
3633         bool                    foundit = false;
3634         int                     error;
3635
3636         /* Grab the first inode in the list */
3637         pag = xfs_perag_get(mp, agno);
3638         error = xfs_ialloc_read_agi(pag, tp, &agibp);
3639         xfs_perag_put(pag);
3640         if (error)
3641                 return error;
3642
3643         /*
3644          * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
3645          * incore unlinked list pointers for this inode.  Check once more to
3646          * see if we raced with anyone else to reload the unlinked list.
3647          */
3648         if (!xfs_inode_unlinked_incomplete(ip)) {
3649                 foundit = true;
3650                 goto out_agibp;
3651         }
3652
3653         bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
3654         agi = agibp->b_addr;
3655
3656         trace_xfs_inode_reload_unlinked_bucket(ip);
3657
3658         xfs_info_ratelimited(mp,
3659  "Found unrecovered unlinked inode 0x%x in AG 0x%x.  Initiating list recovery.",
3660                         agino, agno);
3661
3662         prev_agino = NULLAGINO;
3663         next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3664         while (next_agino != NULLAGINO) {
3665                 struct xfs_inode        *next_ip = NULL;
3666
3667                 /* Found this caller's inode, set its backlink. */
3668                 if (next_agino == agino) {
3669                         next_ip = ip;
3670                         next_ip->i_prev_unlinked = prev_agino;
3671                         foundit = true;
3672                         goto next_inode;
3673                 }
3674
3675                 /* Try in-memory lookup first. */
3676                 next_ip = xfs_iunlink_lookup(pag, next_agino);
3677                 if (next_ip)
3678                         goto next_inode;
3679
3680                 /* Inode not in memory, try reloading it. */
3681                 error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
3682                                 next_agino);
3683                 if (error)
3684                         break;
3685
3686                 /* Grab the reloaded inode. */
3687                 next_ip = xfs_iunlink_lookup(pag, next_agino);
3688                 if (!next_ip) {
3689                         /* No incore inode at all?  We reloaded it... */
3690                         ASSERT(next_ip != NULL);
3691                         error = -EFSCORRUPTED;
3692                         break;
3693                 }
3694
3695 next_inode:
3696                 prev_agino = next_agino;
3697                 next_agino = next_ip->i_next_unlinked;
3698         }
3699
3700 out_agibp:
3701         xfs_trans_brelse(tp, agibp);
3702         /* Should have found this inode somewhere in the iunlinked bucket. */
3703         if (!error && !foundit)
3704                 error = -EFSCORRUPTED;
3705         return error;
3706 }
3707
3708 /* Decide if this inode is missing its unlinked list and reload it. */
3709 int
3710 xfs_inode_reload_unlinked(
3711         struct xfs_inode        *ip)
3712 {
3713         struct xfs_trans        *tp;
3714         int                     error;
3715
3716         error = xfs_trans_alloc_empty(ip->i_mount, &tp);
3717         if (error)
3718                 return error;
3719
3720         xfs_ilock(ip, XFS_ILOCK_SHARED);
3721         if (xfs_inode_unlinked_incomplete(ip))
3722                 error = xfs_inode_reload_unlinked_bucket(tp, ip);
3723         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3724         xfs_trans_cancel(tp);
3725
3726         return error;
3727 }
3728
3729 /* Has this inode fork been zapped by repair? */
3730 bool
3731 xfs_ifork_zapped(
3732         const struct xfs_inode  *ip,
3733         int                     whichfork)
3734 {
3735         unsigned int            datamask = 0;
3736
3737         switch (whichfork) {
3738         case XFS_DATA_FORK:
3739                 switch (ip->i_vnode.i_mode & S_IFMT) {
3740                 case S_IFDIR:
3741                         datamask = XFS_SICK_INO_DIR_ZAPPED;
3742                         break;
3743                 case S_IFLNK:
3744                         datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
3745                         break;
3746                 }
3747                 return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
3748         case XFS_ATTR_FORK:
3749                 return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
3750         default:
3751                 return false;
3752         }
3753 }