fs/xfs/xfs_icache.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_sb.h"
  13 #include "xfs_mount.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_trans.h"
  16 #include "xfs_trans_priv.h"
  17 #include "xfs_inode_item.h"
  18 #include "xfs_quota.h"
  19 #include "xfs_trace.h"
  20 #include "xfs_icache.h"
  21 #include "xfs_bmap_util.h"
  22 #include "xfs_dquot_item.h"
  23 #include "xfs_dquot.h"
  24 #include "xfs_reflink.h"
  25 #include "xfs_ialloc.h"
  26
  27 #include <linux/iversion.h>
  28
  29 /*
  30  * Allocate and initialise an xfs_inode.
  31  */
  32 struct xfs_inode *
  33 xfs_inode_alloc(
  34         struct xfs_mount        *mp,
  35         xfs_ino_t               ino)
  36 {
  37         struct xfs_inode        *ip;
  38
  39         /*
  40          * if this didn't occur in transactions, we could use
  41          * KM_MAYFAIL and return NULL here on ENOMEM. Set the
  42          * code up to do this anyway.
  43          */
  44         ip = kmem_zone_alloc(xfs_inode_zone, 0);
  45         if (!ip)
  46                 return NULL;
  47         if (inode_init_always(mp->m_super, VFS_I(ip))) {
  48                 kmem_cache_free(xfs_inode_zone, ip);
  49                 return NULL;
  50         }
  51
  52         /* VFS doesn't initialise i_mode! */
  53         VFS_I(ip)->i_mode = 0;
  54
  55         XFS_STATS_INC(mp, vn_active);
  56         ASSERT(atomic_read(&ip->i_pincount) == 0);
  57         ASSERT(!xfs_isiflocked(ip));
  58         ASSERT(ip->i_ino == 0);
  59
  60         /* initialise the xfs inode */
  61         ip->i_ino = ino;
  62         ip->i_mount = mp;
  63         memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
  64         ip->i_afp = NULL;
  65         ip->i_cowfp = NULL;
  66         memset(&ip->i_df, 0, sizeof(ip->i_df));
  67         ip->i_flags = 0;
  68         ip->i_delayed_blks = 0;
  69         memset(&ip->i_d, 0, sizeof(ip->i_d));
  70         ip->i_sick = 0;
  71         ip->i_checked = 0;
  72         INIT_WORK(&ip->i_ioend_work, xfs_end_io);
  73         INIT_LIST_HEAD(&ip->i_ioend_list);
  74         spin_lock_init(&ip->i_ioend_lock);
  75
  76         return ip;
  77 }
  78
  79 STATIC void
  80 xfs_inode_free_callback(
  81         struct rcu_head         *head)
  82 {
  83         struct inode            *inode = container_of(head, struct inode, i_rcu);
  84         struct xfs_inode        *ip = XFS_I(inode);
  85
  86         switch (VFS_I(ip)->i_mode & S_IFMT) {
  87         case S_IFREG:
  88         case S_IFDIR:
  89         case S_IFLNK:
  90                 xfs_idestroy_fork(&ip->i_df);
  91                 break;
  92         }
  93
  94         if (ip->i_afp) {
  95                 xfs_idestroy_fork(ip->i_afp);
  96                 kmem_cache_free(xfs_ifork_zone, ip->i_afp);
  97         }
  98         if (ip->i_cowfp) {
  99                 xfs_idestroy_fork(ip->i_cowfp);
 100                 kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
 101         }
 102         if (ip->i_itemp) {
 103                 ASSERT(!test_bit(XFS_LI_IN_AIL,
 104                                  &ip->i_itemp->ili_item.li_flags));
 105                 xfs_inode_item_destroy(ip);
 106                 ip->i_itemp = NULL;
 107         }
 108
 109         kmem_cache_free(xfs_inode_zone, ip);
 110 }
 111
 112 static void
 113 __xfs_inode_free(
 114         struct xfs_inode        *ip)
 115 {
 116         /* asserts to verify all state is correct here */
 117         ASSERT(atomic_read(&ip->i_pincount) == 0);
 118         ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
 119         XFS_STATS_DEC(ip->i_mount, vn_active);
 120
 121         call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
 122 }
 123
 124 void
 125 xfs_inode_free(
 126         struct xfs_inode        *ip)
 127 {
 128         ASSERT(!xfs_isiflocked(ip));
 129
 130         /*
 131          * Because we use RCU freeing we need to ensure the inode always
 132          * appears to be reclaimed with an invalid inode number when in the
 133          * free state. The ip->i_flags_lock provides the barrier against lookup
 134          * races.
 135          */
 136         spin_lock(&ip->i_flags_lock);
 137         ip->i_flags = XFS_IRECLAIM;
 138         ip->i_ino = 0;
 139         spin_unlock(&ip->i_flags_lock);
 140
 141         __xfs_inode_free(ip);
 142 }
 143
 144 /*
 145  * Queue background inode reclaim work if there are reclaimable inodes and there
 146  * isn't reclaim work already scheduled or in progress.
 147  */
 148 static void
 149 xfs_reclaim_work_queue(
 150         struct xfs_mount        *mp)
 151 {
 152
 153         rcu_read_lock();
 154         if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
 155                 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 156                         msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 157         }
 158         rcu_read_unlock();
 159 }
 160
 161 static void
 162 xfs_perag_set_reclaim_tag(
 163         struct xfs_perag        *pag)
 164 {
 165         struct xfs_mount        *mp = pag->pag_mount;
 166
 167         lockdep_assert_held(&pag->pag_ici_lock);
 168         if (pag->pag_ici_reclaimable++)
 169                 return;
 170
 171         /* propagate the reclaim tag up into the perag radix tree */
 172         spin_lock(&mp->m_perag_lock);
 173         radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
 174                            XFS_ICI_RECLAIM_TAG);
 175         spin_unlock(&mp->m_perag_lock);
 176
 177         /* schedule periodic background inode reclaim */
 178         xfs_reclaim_work_queue(mp);
 179
 180         trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
 181 }
 182
 183 static void
 184 xfs_perag_clear_reclaim_tag(
 185         struct xfs_perag        *pag)
 186 {
 187         struct xfs_mount        *mp = pag->pag_mount;
 188
 189         lockdep_assert_held(&pag->pag_ici_lock);
 190         if (--pag->pag_ici_reclaimable)
 191                 return;
 192
 193         /* clear the reclaim tag from the perag radix tree */
 194         spin_lock(&mp->m_perag_lock);
 195         radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
 196                              XFS_ICI_RECLAIM_TAG);
 197         spin_unlock(&mp->m_perag_lock);
 198         trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
 199 }
 200
 201
 202 /*
 203  * We set the inode flag atomically with the radix tree tag.
 204  * Once we get tag lookups on the radix tree, this inode flag
 205  * can go away.
 206  */
 207 void
 208 xfs_inode_set_reclaim_tag(
 209         struct xfs_inode        *ip)
 210 {
 211         struct xfs_mount        *mp = ip->i_mount;
 212         struct xfs_perag        *pag;
 213
 214         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 215         spin_lock(&pag->pag_ici_lock);
 216         spin_lock(&ip->i_flags_lock);
 217
 218         radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
 219                            XFS_ICI_RECLAIM_TAG);
 220         xfs_perag_set_reclaim_tag(pag);
 221         __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 222
 223         spin_unlock(&ip->i_flags_lock);
 224         spin_unlock(&pag->pag_ici_lock);
 225         xfs_perag_put(pag);
 226 }
 227
 228 STATIC void
 229 xfs_inode_clear_reclaim_tag(
 230         struct xfs_perag        *pag,
 231         xfs_ino_t               ino)
 232 {
 233         radix_tree_tag_clear(&pag->pag_ici_root,
 234                              XFS_INO_TO_AGINO(pag->pag_mount, ino),
 235                              XFS_ICI_RECLAIM_TAG);
 236         xfs_perag_clear_reclaim_tag(pag);
 237 }
 238
 239 static void
 240 xfs_inew_wait(
 241         struct xfs_inode        *ip)
 242 {
 243         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
 244         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
 245
 246         do {
 247                 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 248                 if (!xfs_iflags_test(ip, XFS_INEW))
 249                         break;
 250                 schedule();
 251         } while (true);
 252         finish_wait(wq, &wait.wq_entry);
 253 }
 254
 255 /*
 256  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 257  * part of the structure. This is made more complex by the fact we store
 258  * information about the on-disk values in the VFS inode and so we can't just
 259  * overwrite the values unconditionally. Hence we save the parameters we
 260  * need to retain across reinitialisation, and rewrite them into the VFS inode
 261  * after reinitialisation even if it fails.
 262  */
 263 static int
 264 xfs_reinit_inode(
 265         struct xfs_mount        *mp,
 266         struct inode            *inode)
 267 {
 268         int             error;
 269         uint32_t        nlink = inode->i_nlink;
 270         uint32_t        generation = inode->i_generation;
 271         uint64_t        version = inode_peek_iversion(inode);
 272         umode_t         mode = inode->i_mode;
 273         dev_t           dev = inode->i_rdev;
 274         kuid_t          uid = inode->i_uid;
 275         kgid_t          gid = inode->i_gid;
 276
 277         error = inode_init_always(mp->m_super, inode);
 278
 279         set_nlink(inode, nlink);
 280         inode->i_generation = generation;
 281         inode_set_iversion_queried(inode, version);
 282         inode->i_mode = mode;
 283         inode->i_rdev = dev;
 284         inode->i_uid = uid;
 285         inode->i_gid = gid;
 286         return error;
 287 }
 288
 289 /*
 290  * If we are allocating a new inode, then check what was returned is
 291  * actually a free, empty inode. If we are not allocating an inode,
 292  * then check we didn't find a free inode.
 293  *
 294  * Returns:
 295  *      0               if the inode free state matches the lookup context
 296  *      -ENOENT         if the inode is free and we are not allocating
 297  *      -EFSCORRUPTED   if there is any state mismatch at all
 298  */
 299 static int
 300 xfs_iget_check_free_state(
 301         struct xfs_inode        *ip,
 302         int                     flags)
 303 {
 304         if (flags & XFS_IGET_CREATE) {
 305                 /* should be a free inode */
 306                 if (VFS_I(ip)->i_mode != 0) {
 307                         xfs_warn(ip->i_mount,
 308 "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
 309                                 ip->i_ino, VFS_I(ip)->i_mode);
 310                         return -EFSCORRUPTED;
 311                 }
 312
 313                 if (ip->i_d.di_nblocks != 0) {
 314                         xfs_warn(ip->i_mount,
 315 "Corruption detected! Free inode 0x%llx has blocks allocated!",
 316                                 ip->i_ino);
 317                         return -EFSCORRUPTED;
 318                 }
 319                 return 0;
 320         }
 321
 322         /* should be an allocated inode */
 323         if (VFS_I(ip)->i_mode == 0)
 324                 return -ENOENT;
 325
 326         return 0;
 327 }
 328
 329 /*
 330  * Check the validity of the inode we just found it the cache
 331  */
 332 static int
 333 xfs_iget_cache_hit(
 334         struct xfs_perag        *pag,
 335         struct xfs_inode        *ip,
 336         xfs_ino_t               ino,
 337         int                     flags,
 338         int                     lock_flags) __releases(RCU)
 339 {
 340         struct inode            *inode = VFS_I(ip);
 341         struct xfs_mount        *mp = ip->i_mount;
 342         int                     error;
 343
 344         /*
 345          * check for re-use of an inode within an RCU grace period due to the
 346          * radix tree nodes not being updated yet. We monitor for this by
 347          * setting the inode number to zero before freeing the inode structure.
 348          * If the inode has been reallocated and set up, then the inode number
 349          * will not match, so check for that, too.
 350          */
 351         spin_lock(&ip->i_flags_lock);
 352         if (ip->i_ino != ino) {
 353                 trace_xfs_iget_skip(ip);
 354                 XFS_STATS_INC(mp, xs_ig_frecycle);
 355                 error = -EAGAIN;
 356                 goto out_error;
 357         }
 358
 359
 360         /*
 361          * If we are racing with another cache hit that is currently
 362          * instantiating this inode or currently recycling it out of
 363          * reclaimabe state, wait for the initialisation to complete
 364          * before continuing.
 365          *
 366          * XXX(hch): eventually we should do something equivalent to
 367          *           wait_on_inode to wait for these flags to be cleared
 368          *           instead of polling for it.
 369          */
 370         if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 371                 trace_xfs_iget_skip(ip);
 372                 XFS_STATS_INC(mp, xs_ig_frecycle);
 373                 error = -EAGAIN;
 374                 goto out_error;
 375         }
 376
 377         /*
 378          * Check the inode free state is valid. This also detects lookup
 379          * racing with unlinks.
 380          */
 381         error = xfs_iget_check_free_state(ip, flags);
 382         if (error)
 383                 goto out_error;
 384
 385         /*
 386          * If IRECLAIMABLE is set, we've torn down the VFS inode already.
 387          * Need to carefully get it back into useable state.
 388          */
 389         if (ip->i_flags & XFS_IRECLAIMABLE) {
 390                 trace_xfs_iget_reclaim(ip);
 391
 392                 if (flags & XFS_IGET_INCORE) {
 393                         error = -EAGAIN;
 394                         goto out_error;
 395                 }
 396
 397                 /*
 398                  * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
 399                  * from stomping over us while we recycle the inode.  We can't
 400                  * clear the radix tree reclaimable tag yet as it requires
 401                  * pag_ici_lock to be held exclusive.
 402                  */
 403                 ip->i_flags |= XFS_IRECLAIM;
 404
 405                 spin_unlock(&ip->i_flags_lock);
 406                 rcu_read_unlock();
 407
 408                 ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 409                 error = xfs_reinit_inode(mp, inode);
 410                 if (error) {
 411                         bool wake;
 412                         /*
 413                          * Re-initializing the inode failed, and we are in deep
 414                          * trouble.  Try to re-add it to the reclaim list.
 415                          */
 416                         rcu_read_lock();
 417                         spin_lock(&ip->i_flags_lock);
 418                         wake = !!__xfs_iflags_test(ip, XFS_INEW);
 419                         ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
 420                         if (wake)
 421                                 wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
 422                         ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 423                         trace_xfs_iget_reclaim_fail(ip);
 424                         goto out_error;
 425                 }
 426
 427                 spin_lock(&pag->pag_ici_lock);
 428                 spin_lock(&ip->i_flags_lock);
 429
 430                 /*
 431                  * Clear the per-lifetime state in the inode as we are now
 432                  * effectively a new inode and need to return to the initial
 433                  * state before reuse occurs.
 434                  */
 435                 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
 436                 ip->i_flags |= XFS_INEW;
 437                 xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 438                 inode->i_state = I_NEW;
 439                 ip->i_sick = 0;
 440                 ip->i_checked = 0;
 441
 442                 spin_unlock(&ip->i_flags_lock);
 443                 spin_unlock(&pag->pag_ici_lock);
 444         } else {
 445                 /* If the VFS inode is being torn down, pause and try again. */
 446                 if (!igrab(inode)) {
 447                         trace_xfs_iget_skip(ip);
 448                         error = -EAGAIN;
 449                         goto out_error;
 450                 }
 451
 452                 /* We've got a live one. */
 453                 spin_unlock(&ip->i_flags_lock);
 454                 rcu_read_unlock();
 455                 trace_xfs_iget_hit(ip);
 456         }
 457
 458         if (lock_flags != 0)
 459                 xfs_ilock(ip, lock_flags);
 460
 461         if (!(flags & XFS_IGET_INCORE))
 462                 xfs_iflags_clear(ip, XFS_ISTALE);
 463         XFS_STATS_INC(mp, xs_ig_found);
 464
 465         return 0;
 466
 467 out_error:
 468         spin_unlock(&ip->i_flags_lock);
 469         rcu_read_unlock();
 470         return error;
 471 }
 472
 473
 474 static int
 475 xfs_iget_cache_miss(
 476         struct xfs_mount        *mp,
 477         struct xfs_perag        *pag,
 478         xfs_trans_t             *tp,
 479         xfs_ino_t               ino,
 480         struct xfs_inode        **ipp,
 481         int                     flags,
 482         int                     lock_flags)
 483 {
 484         struct xfs_inode        *ip;
 485         int                     error;
 486         xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 487         int                     iflags;
 488
 489         ip = xfs_inode_alloc(mp, ino);
 490         if (!ip)
 491                 return -ENOMEM;
 492
 493         error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
 494         if (error)
 495                 goto out_destroy;
 496
 497         /*
 498          * For version 5 superblocks, if we are initialising a new inode and we
 499          * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
 500          * simply build the new inode core with a random generation number.
 501          *
 502          * For version 4 (and older) superblocks, log recovery is dependent on
 503          * the di_flushiter field being initialised from the current on-disk
 504          * value and hence we must also read the inode off disk even when
 505          * initializing new inodes.
 506          */
 507         if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
 508             (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 509                 VFS_I(ip)->i_generation = prandom_u32();
 510         } else {
 511                 struct xfs_dinode       *dip;
 512                 struct xfs_buf          *bp;
 513
 514                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
 515                 if (error)
 516                         goto out_destroy;
 517
 518                 error = xfs_inode_from_disk(ip, dip);
 519                 if (!error)
 520                         xfs_buf_set_ref(bp, XFS_INO_REF);
 521                 xfs_trans_brelse(tp, bp);
 522
 523                 if (error)
 524                         goto out_destroy;
 525         }
 526
 527         trace_xfs_iget_miss(ip);
 528
 529         /*
 530          * Check the inode free state is valid. This also detects lookup
 531          * racing with unlinks.
 532          */
 533         error = xfs_iget_check_free_state(ip, flags);
 534         if (error)
 535                 goto out_destroy;
 536
 537         /*
 538          * Preload the radix tree so we can insert safely under the
 539          * write spinlock. Note that we cannot sleep inside the preload
 540          * region. Since we can be called from transaction context, don't
 541          * recurse into the file system.
 542          */
 543         if (radix_tree_preload(GFP_NOFS)) {
 544                 error = -EAGAIN;
 545                 goto out_destroy;
 546         }
 547
 548         /*
 549          * Because the inode hasn't been added to the radix-tree yet it can't
 550          * be found by another thread, so we can do the non-sleeping lock here.
 551          */
 552         if (lock_flags) {
 553                 if (!xfs_ilock_nowait(ip, lock_flags))
 554                         BUG();
 555         }
 556
 557         /*
 558          * These values must be set before inserting the inode into the radix
 559          * tree as the moment it is inserted a concurrent lookup (allowed by the
 560          * RCU locking mechanism) can find it and that lookup must see that this
 561          * is an inode currently under construction (i.e. that XFS_INEW is set).
 562          * The ip->i_flags_lock that protects the XFS_INEW flag forms the
 563          * memory barrier that ensures this detection works correctly at lookup
 564          * time.
 565          */
 566         iflags = XFS_INEW;
 567         if (flags & XFS_IGET_DONTCACHE)
 568                 d_mark_dontcache(VFS_I(ip));
 569         ip->i_udquot = NULL;
 570         ip->i_gdquot = NULL;
 571         ip->i_pdquot = NULL;
 572         xfs_iflags_set(ip, iflags);
 573
 574         /* insert the new inode */
 575         spin_lock(&pag->pag_ici_lock);
 576         error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 577         if (unlikely(error)) {
 578                 WARN_ON(error != -EEXIST);
 579                 XFS_STATS_INC(mp, xs_ig_dup);
 580                 error = -EAGAIN;
 581                 goto out_preload_end;
 582         }
 583         spin_unlock(&pag->pag_ici_lock);
 584         radix_tree_preload_end();
 585
 586         *ipp = ip;
 587         return 0;
 588
 589 out_preload_end:
 590         spin_unlock(&pag->pag_ici_lock);
 591         radix_tree_preload_end();
 592         if (lock_flags)
 593                 xfs_iunlock(ip, lock_flags);
 594 out_destroy:
 595         __destroy_inode(VFS_I(ip));
 596         xfs_inode_free(ip);
 597         return error;
 598 }
 599
 600 /*
 601  * Look up an inode by number in the given file system.  The inode is looked up
 602  * in the cache held in each AG.  If the inode is found in the cache, initialise
 603  * the vfs inode if necessary.
 604  *
 605  * If it is not in core, read it in from the file system's device, add it to the
 606  * cache and initialise the vfs inode.
 607  *
 608  * The inode is locked according to the value of the lock_flags parameter.
 609  * Inode lookup is only done during metadata operations and not as part of the
 610  * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 611  */
 612 int
 613 xfs_iget(
 614         struct xfs_mount        *mp,
 615         struct xfs_trans        *tp,
 616         xfs_ino_t               ino,
 617         uint                    flags,
 618         uint                    lock_flags,
 619         struct xfs_inode        **ipp)
 620 {
 621         struct xfs_inode        *ip;
 622         struct xfs_perag        *pag;
 623         xfs_agino_t             agino;
 624         int                     error;
 625
 626         ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
 627
 628         /* reject inode numbers outside existing AGs */
 629         if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 630                 return -EINVAL;
 631
 632         XFS_STATS_INC(mp, xs_ig_attempts);
 633
 634         /* get the perag structure and ensure that it's inode capable */
 635         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 636         agino = XFS_INO_TO_AGINO(mp, ino);
 637
 638 again:
 639         error = 0;
 640         rcu_read_lock();
 641         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 642
 643         if (ip) {
 644                 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 645                 if (error)
 646                         goto out_error_or_again;
 647         } else {
 648                 rcu_read_unlock();
 649                 if (flags & XFS_IGET_INCORE) {
 650                         error = -ENODATA;
 651                         goto out_error_or_again;
 652                 }
 653                 XFS_STATS_INC(mp, xs_ig_missed);
 654
 655                 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 656                                                         flags, lock_flags);
 657                 if (error)
 658                         goto out_error_or_again;
 659         }
 660         xfs_perag_put(pag);
 661
 662         *ipp = ip;
 663
 664         /*
 665          * If we have a real type for an on-disk inode, we can setup the inode
 666          * now.  If it's a new inode being created, xfs_ialloc will handle it.
 667          */
 668         if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
 669                 xfs_setup_existing_inode(ip);
 670         return 0;
 671
 672 out_error_or_again:
 673         if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
 674                 delay(1);
 675                 goto again;
 676         }
 677         xfs_perag_put(pag);
 678         return error;
 679 }
 680
 681 /*
 682  * "Is this a cached inode that's also allocated?"
 683  *
 684  * Look up an inode by number in the given file system.  If the inode is
 685  * in cache and isn't in purgatory, return 1 if the inode is allocated
 686  * and 0 if it is not.  For all other cases (not in cache, being torn
 687  * down, etc.), return a negative error code.
 688  *
 689  * The caller has to prevent inode allocation and freeing activity,
 690  * presumably by locking the AGI buffer.   This is to ensure that an
 691  * inode cannot transition from allocated to freed until the caller is
 692  * ready to allow that.  If the inode is in an intermediate state (new,
 693  * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 694  * inode is not in the cache, -ENOENT will be returned.  The caller must
 695  * deal with these scenarios appropriately.
 696  *
 697  * This is a specialized use case for the online scrubber; if you're
 698  * reading this, you probably want xfs_iget.
 699  */
 700 int
 701 xfs_icache_inode_is_allocated(
 702         struct xfs_mount        *mp,
 703         struct xfs_trans        *tp,
 704         xfs_ino_t               ino,
 705         bool                    *inuse)
 706 {
 707         struct xfs_inode        *ip;
 708         int                     error;
 709
 710         error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
 711         if (error)
 712                 return error;
 713
 714         *inuse = !!(VFS_I(ip)->i_mode);
 715         xfs_irele(ip);
 716         return 0;
 717 }
 718
 719 /*
 720  * The inode lookup is done in batches to keep the amount of lock traffic and
 721  * radix tree lookups to a minimum. The batch size is a trade off between
 722  * lookup reduction and stack usage. This is in the reclaim path, so we can't
 723  * be too greedy.
 724  */
 725 #define XFS_LOOKUP_BATCH        32
 726
 727 /*
 728  * Decide if the given @ip is eligible to be a part of the inode walk, and
 729  * grab it if so.  Returns true if it's ready to go or false if we should just
 730  * ignore it.
 731  */
 732 STATIC bool
 733 xfs_inode_walk_ag_grab(
 734         struct xfs_inode        *ip,
 735         int                     flags)
 736 {
 737         struct inode            *inode = VFS_I(ip);
 738         bool                    newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);
 739
 740         ASSERT(rcu_read_lock_held());
 741
 742         /* Check for stale RCU freed inode */
 743         spin_lock(&ip->i_flags_lock);
 744         if (!ip->i_ino)
 745                 goto out_unlock_noent;
 746
 747         /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
 748         if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
 749             __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
 750                 goto out_unlock_noent;
 751         spin_unlock(&ip->i_flags_lock);
 752
 753         /* nothing to sync during shutdown */
 754         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 755                 return false;
 756
 757         /* If we can't grab the inode, it must on it's way to reclaim. */
 758         if (!igrab(inode))
 759                 return false;
 760
 761         /* inode is valid */
 762         return true;
 763
 764 out_unlock_noent:
 765         spin_unlock(&ip->i_flags_lock);
 766         return false;
 767 }
 768
 769 /*
 770  * For a given per-AG structure @pag, grab, @execute, and rele all incore
 771  * inodes with the given radix tree @tag.
 772  */
 773 STATIC int
 774 xfs_inode_walk_ag(
 775         struct xfs_perag        *pag,
 776         int                     iter_flags,
 777         int                     (*execute)(struct xfs_inode *ip, void *args),
 778         void                    *args,
 779         int                     tag)
 780 {
 781         struct xfs_mount        *mp = pag->pag_mount;
 782         uint32_t                first_index;
 783         int                     last_error = 0;
 784         int                     skipped;
 785         bool                    done;
 786         int                     nr_found;
 787
 788 restart:
 789         done = false;
 790         skipped = 0;
 791         first_index = 0;
 792         nr_found = 0;
 793         do {
 794                 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 795                 int             error = 0;
 796                 int             i;
 797
 798                 rcu_read_lock();
 799
 800                 if (tag == XFS_ICI_NO_TAG)
 801                         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 802                                         (void **)batch, first_index,
 803                                         XFS_LOOKUP_BATCH);
 804                 else
 805                         nr_found = radix_tree_gang_lookup_tag(
 806                                         &pag->pag_ici_root,
 807                                         (void **) batch, first_index,
 808                                         XFS_LOOKUP_BATCH, tag);
 809
 810                 if (!nr_found) {
 811                         rcu_read_unlock();
 812                         break;
 813                 }
 814
 815                 /*
 816                  * Grab the inodes before we drop the lock. if we found
 817                  * nothing, nr == 0 and the loop will be skipped.
 818                  */
 819                 for (i = 0; i < nr_found; i++) {
 820                         struct xfs_inode *ip = batch[i];
 821
 822                         if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
 823                                 batch[i] = NULL;
 824
 825                         /*
 826                          * Update the index for the next lookup. Catch
 827                          * overflows into the next AG range which can occur if
 828                          * we have inodes in the last block of the AG and we
 829                          * are currently pointing to the last inode.
 830                          *
 831                          * Because we may see inodes that are from the wrong AG
 832                          * due to RCU freeing and reallocation, only update the
 833                          * index if it lies in this AG. It was a race that lead
 834                          * us to see this inode, so another lookup from the
 835                          * same index will not find it again.
 836                          */
 837                         if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
 838                                 continue;
 839                         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 840                         if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 841                                 done = true;
 842                 }
 843
 844                 /* unlock now we've grabbed the inodes. */
 845                 rcu_read_unlock();
 846
 847                 for (i = 0; i < nr_found; i++) {
 848                         if (!batch[i])
 849                                 continue;
 850                         if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
 851                             xfs_iflags_test(batch[i], XFS_INEW))
 852                                 xfs_inew_wait(batch[i]);
 853                         error = execute(batch[i], args);
 854                         xfs_irele(batch[i]);
 855                         if (error == -EAGAIN) {
 856                                 skipped++;
 857                                 continue;
 858                         }
 859                         if (error && last_error != -EFSCORRUPTED)
 860                                 last_error = error;
 861                 }
 862
 863                 /* bail out if the filesystem is corrupted.  */
 864                 if (error == -EFSCORRUPTED)
 865                         break;
 866
 867                 cond_resched();
 868
 869         } while (nr_found && !done);
 870
 871         if (skipped) {
 872                 delay(1);
 873                 goto restart;
 874         }
 875         return last_error;
 876 }
 877
 878 /* Fetch the next (possibly tagged) per-AG structure. */
 879 static inline struct xfs_perag *
 880 xfs_inode_walk_get_perag(
 881         struct xfs_mount        *mp,
 882         xfs_agnumber_t          agno,
 883         int                     tag)
 884 {
 885         if (tag == XFS_ICI_NO_TAG)
 886                 return xfs_perag_get(mp, agno);
 887         return xfs_perag_get_tag(mp, agno, tag);
 888 }
 889
 890 /*
 891  * Call the @execute function on all incore inodes matching the radix tree
 892  * @tag.
 893  */
 894 int
 895 xfs_inode_walk(
 896         struct xfs_mount        *mp,
 897         int                     iter_flags,
 898         int                     (*execute)(struct xfs_inode *ip, void *args),
 899         void                    *args,
 900         int                     tag)
 901 {
 902         struct xfs_perag        *pag;
 903         int                     error = 0;
 904         int                     last_error = 0;
 905         xfs_agnumber_t          ag;
 906
 907         ag = 0;
 908         while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
 909                 ag = pag->pag_agno + 1;
 910                 error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
 911                 xfs_perag_put(pag);
 912                 if (error) {
 913                         last_error = error;
 914                         if (error == -EFSCORRUPTED)
 915                                 break;
 916                 }
 917         }
 918         return last_error;
 919 }
 920
 921 /*
 922  * Background scanning to trim post-EOF preallocated space. This is queued
 923  * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 924  */
 925 void
 926 xfs_queue_eofblocks(
 927         struct xfs_mount *mp)
 928 {
 929         rcu_read_lock();
 930         if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
 931                 queue_delayed_work(mp->m_eofblocks_workqueue,
 932                                    &mp->m_eofblocks_work,
 933                                    msecs_to_jiffies(xfs_eofb_secs * 1000));
 934         rcu_read_unlock();
 935 }
 936
 937 void
 938 xfs_eofblocks_worker(
 939         struct work_struct *work)
 940 {
 941         struct xfs_mount *mp = container_of(to_delayed_work(work),
 942                                 struct xfs_mount, m_eofblocks_work);
 943
 944         if (!sb_start_write_trylock(mp->m_super))
 945                 return;
 946         xfs_icache_free_eofblocks(mp, NULL);
 947         sb_end_write(mp->m_super);
 948
 949         xfs_queue_eofblocks(mp);
 950 }
 951
 952 /*
 953  * Background scanning to trim preallocated CoW space. This is queued
 954  * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
 955  * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 956  */
 957 void
 958 xfs_queue_cowblocks(
 959         struct xfs_mount *mp)
 960 {
 961         rcu_read_lock();
 962         if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
 963                 queue_delayed_work(mp->m_eofblocks_workqueue,
 964                                    &mp->m_cowblocks_work,
 965                                    msecs_to_jiffies(xfs_cowb_secs * 1000));
 966         rcu_read_unlock();
 967 }
 968
 969 void
 970 xfs_cowblocks_worker(
 971         struct work_struct *work)
 972 {
 973         struct xfs_mount *mp = container_of(to_delayed_work(work),
 974                                 struct xfs_mount, m_cowblocks_work);
 975
 976         if (!sb_start_write_trylock(mp->m_super))
 977                 return;
 978         xfs_icache_free_cowblocks(mp, NULL);
 979         sb_end_write(mp->m_super);
 980
 981         xfs_queue_cowblocks(mp);
 982 }
 983
 984 /*
 985  * Grab the inode for reclaim exclusively.
 986  *
 987  * We have found this inode via a lookup under RCU, so the inode may have
 988  * already been freed, or it may be in the process of being recycled by
 989  * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 990  * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 991  * will not be set. Hence we need to check for both these flag conditions to
 992  * avoid inodes that are no longer reclaim candidates.
 993  *
 994  * Note: checking for other state flags here, under the i_flags_lock or not, is
 995  * racy and should be avoided. Those races should be resolved only after we have
 996  * ensured that we are able to reclaim this inode and the world can see that we
 997  * are going to reclaim it.
 998  *
 999  * Return true if we grabbed it, false otherwise.
1000  */
1001 static bool
1002 xfs_reclaim_inode_grab(
1003         struct xfs_inode        *ip)
1004 {
1005         ASSERT(rcu_read_lock_held());
1006
1007         spin_lock(&ip->i_flags_lock);
1008         if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
1009             __xfs_iflags_test(ip, XFS_IRECLAIM)) {
1010                 /* not a reclaim candidate. */
1011                 spin_unlock(&ip->i_flags_lock);
1012                 return false;
1013         }
1014         __xfs_iflags_set(ip, XFS_IRECLAIM);
1015         spin_unlock(&ip->i_flags_lock);
1016         return true;
1017 }
1018
1019 /*
1020  * Inode reclaim is non-blocking, so the default action if progress cannot be
1021  * made is to "requeue" the inode for reclaim by unlocking it and clearing the
1022  * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
1023  * blocking anymore and hence we can wait for the inode to be able to reclaim
1024  * it.
1025  *
1026  * We do no IO here - if callers require inodes to be cleaned they must push the
1027  * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
1028  * done in the background in a non-blocking manner, and enables memory reclaim
1029  * to make progress without blocking.
1030  */
1031 static void
1032 xfs_reclaim_inode(
1033         struct xfs_inode        *ip,
1034         struct xfs_perag        *pag)
1035 {
1036         xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
1037
1038         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
1039                 goto out;
1040         if (!xfs_iflock_nowait(ip))
1041                 goto out_iunlock;
1042
1043         if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1044                 xfs_iunpin_wait(ip);
1045                 /* xfs_iflush_abort() drops the flush lock */
1046                 xfs_iflush_abort(ip);
1047                 goto reclaim;
1048         }
1049         if (xfs_ipincount(ip))
1050                 goto out_ifunlock;
1051         if (!xfs_inode_clean(ip))
1052                 goto out_ifunlock;
1053
1054         xfs_ifunlock(ip);
1055 reclaim:
1056         ASSERT(!xfs_isiflocked(ip));
1057
1058         /*
1059          * Because we use RCU freeing we need to ensure the inode always appears
1060          * to be reclaimed with an invalid inode number when in the free state.
1061          * We do this as early as possible under the ILOCK so that
1062          * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
1063          * detect races with us here. By doing this, we guarantee that once
1064          * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
1065          * it will see either a valid inode that will serialise correctly, or it
1066          * will see an invalid inode that it can skip.
1067          */
1068         spin_lock(&ip->i_flags_lock);
1069         ip->i_flags = XFS_IRECLAIM;
1070         ip->i_ino = 0;
1071         spin_unlock(&ip->i_flags_lock);
1072
1073         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1074
1075         XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
1076         /*
1077          * Remove the inode from the per-AG radix tree.
1078          *
1079          * Because radix_tree_delete won't complain even if the item was never
1080          * added to the tree assert that it's been there before to catch
1081          * problems with the inode life time early on.
1082          */
1083         spin_lock(&pag->pag_ici_lock);
1084         if (!radix_tree_delete(&pag->pag_ici_root,
1085                                 XFS_INO_TO_AGINO(ip->i_mount, ino)))
1086                 ASSERT(0);
1087         xfs_perag_clear_reclaim_tag(pag);
1088         spin_unlock(&pag->pag_ici_lock);
1089
1090         /*
1091          * Here we do an (almost) spurious inode lock in order to coordinate
1092          * with inode cache radix tree lookups.  This is because the lookup
1093          * can reference the inodes in the cache without taking references.
1094          *
1095          * We make that OK here by ensuring that we wait until the inode is
1096          * unlocked after the lookup before we go ahead and free it.
1097          */
1098         xfs_ilock(ip, XFS_ILOCK_EXCL);
1099         xfs_qm_dqdetach(ip);
1100         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1101         ASSERT(xfs_inode_clean(ip));
1102
1103         __xfs_inode_free(ip);
1104         return;
1105
1106 out_ifunlock:
1107         xfs_ifunlock(ip);
1108 out_iunlock:
1109         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1110 out:
1111         xfs_iflags_clear(ip, XFS_IRECLAIM);
1112 }
1113
1114 /*
1115  * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
1116  * corrupted, we still want to try to reclaim all the inodes. If we don't,
1117  * then a shut down during filesystem unmount reclaim walk leak all the
1118  * unreclaimed inodes.
1119  *
1120  * Returns non-zero if any AGs or inodes were skipped in the reclaim pass
1121  * so that callers that want to block until all dirty inodes are written back
1122  * and reclaimed can sanely loop.
1123  */
1124 static void
1125 xfs_reclaim_inodes_ag(
1126         struct xfs_mount        *mp,
1127         int                     *nr_to_scan)
1128 {
1129         struct xfs_perag        *pag;
1130         xfs_agnumber_t          ag = 0;
1131
1132         while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1133                 unsigned long   first_index = 0;
1134                 int             done = 0;
1135                 int             nr_found = 0;
1136
1137                 ag = pag->pag_agno + 1;
1138
1139                 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1140                 do {
1141                         struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1142                         int     i;
1143
1144                         rcu_read_lock();
1145                         nr_found = radix_tree_gang_lookup_tag(
1146                                         &pag->pag_ici_root,
1147                                         (void **)batch, first_index,
1148                                         XFS_LOOKUP_BATCH,
1149                                         XFS_ICI_RECLAIM_TAG);
1150                         if (!nr_found) {
1151                                 done = 1;
1152                                 rcu_read_unlock();
1153                                 break;
1154                         }
1155
1156                         /*
1157                          * Grab the inodes before we drop the lock. if we found
1158                          * nothing, nr == 0 and the loop will be skipped.
1159                          */
1160                         for (i = 0; i < nr_found; i++) {
1161                                 struct xfs_inode *ip = batch[i];
1162
1163                                 if (done || !xfs_reclaim_inode_grab(ip))
1164                                         batch[i] = NULL;
1165
1166                                 /*
1167                                  * Update the index for the next lookup. Catch
1168                                  * overflows into the next AG range which can
1169                                  * occur if we have inodes in the last block of
1170                                  * the AG and we are currently pointing to the
1171                                  * last inode.
1172                                  *
1173                                  * Because we may see inodes that are from the
1174                                  * wrong AG due to RCU freeing and
1175                                  * reallocation, only update the index if it
1176                                  * lies in this AG. It was a race that lead us
1177                                  * to see this inode, so another lookup from
1178                                  * the same index will not find it again.
1179                                  */
1180                                 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
1181                                                                 pag->pag_agno)
1182                                         continue;
1183                                 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1184                                 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1185                                         done = 1;
1186                         }
1187
1188                         /* unlock now we've grabbed the inodes. */
1189                         rcu_read_unlock();
1190
1191                         for (i = 0; i < nr_found; i++) {
1192                                 if (batch[i])
1193                                         xfs_reclaim_inode(batch[i], pag);
1194                         }
1195
1196                         *nr_to_scan -= XFS_LOOKUP_BATCH;
1197                         cond_resched();
1198                 } while (nr_found && !done && *nr_to_scan > 0);
1199
1200                 if (done)
1201                         first_index = 0;
1202                 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1203                 xfs_perag_put(pag);
1204         }
1205 }
1206
1207 void
1208 xfs_reclaim_inodes(
1209         struct xfs_mount        *mp)
1210 {
1211         int             nr_to_scan = INT_MAX;
1212
1213         while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
1214                 xfs_ail_push_all_sync(mp->m_ail);
1215                 xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1216         };
1217 }
1218
1219 /*
1220  * The shrinker infrastructure determines how many inodes we should scan for
1221  * reclaim. We want as many clean inodes ready to reclaim as possible, so we
1222  * push the AIL here. We also want to proactively free up memory if we can to
1223  * minimise the amount of work memory reclaim has to do so we kick the
1224  * background reclaim if it isn't already scheduled.
1225  */
1226 long
1227 xfs_reclaim_inodes_nr(
1228         struct xfs_mount        *mp,
1229         int                     nr_to_scan)
1230 {
1231         /* kick background reclaimer and push the AIL */
1232         xfs_reclaim_work_queue(mp);
1233         xfs_ail_push_all(mp->m_ail);
1234
1235         xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1236         return 0;
1237 }
1238
1239 /*
1240  * Return the number of reclaimable inodes in the filesystem for
1241  * the shrinker to determine how much to reclaim.
1242  */
1243 int
1244 xfs_reclaim_inodes_count(
1245         struct xfs_mount        *mp)
1246 {
1247         struct xfs_perag        *pag;
1248         xfs_agnumber_t          ag = 0;
1249         int                     reclaimable = 0;
1250
1251         while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1252                 ag = pag->pag_agno + 1;
1253                 reclaimable += pag->pag_ici_reclaimable;
1254                 xfs_perag_put(pag);
1255         }
1256         return reclaimable;
1257 }
1258
1259 STATIC bool
1260 xfs_inode_match_id(
1261         struct xfs_inode        *ip,
1262         struct xfs_eofblocks    *eofb)
1263 {
1264         if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1265             !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1266                 return false;
1267
1268         if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1269             !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1270                 return false;
1271
1272         if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1273             ip->i_d.di_projid != eofb->eof_prid)
1274                 return false;
1275
1276         return true;
1277 }
1278
1279 /*
1280  * A union-based inode filtering algorithm. Process the inode if any of the
1281  * criteria match. This is for global/internal scans only.
1282  */
1283 STATIC bool
1284 xfs_inode_match_id_union(
1285         struct xfs_inode        *ip,
1286         struct xfs_eofblocks    *eofb)
1287 {
1288         if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1289             uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1290                 return true;
1291
1292         if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1293             gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1294                 return true;
1295
1296         if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1297             ip->i_d.di_projid == eofb->eof_prid)
1298                 return true;
1299
1300         return false;
1301 }
1302
1303 /*
1304  * Is this inode @ip eligible for eof/cow block reclamation, given some
1305  * filtering parameters @eofb?  The inode is eligible if @eofb is null or
1306  * if the predicate functions match.
1307  */
1308 static bool
1309 xfs_inode_matches_eofb(
1310         struct xfs_inode        *ip,
1311         struct xfs_eofblocks    *eofb)
1312 {
1313         bool                    match;
1314
1315         if (!eofb)
1316                 return true;
1317
1318         if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1319                 match = xfs_inode_match_id_union(ip, eofb);
1320         else
1321                 match = xfs_inode_match_id(ip, eofb);
1322         if (!match)
1323                 return false;
1324
1325         /* skip the inode if the file size is too small */
1326         if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
1327             XFS_ISIZE(ip) < eofb->eof_min_file_size)
1328                 return false;
1329
1330         return true;
1331 }
1332
1333 /*
1334  * This is a fast pass over the inode cache to try to get reclaim moving on as
1335  * many inodes as possible in a short period of time. It kicks itself every few
1336  * seconds, as well as being kicked by the inode cache shrinker when memory
1337  * goes low.
1338  */
1339 void
1340 xfs_reclaim_worker(
1341         struct work_struct *work)
1342 {
1343         struct xfs_mount *mp = container_of(to_delayed_work(work),
1344                                         struct xfs_mount, m_reclaim_work);
1345         int             nr_to_scan = INT_MAX;
1346
1347         xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1348         xfs_reclaim_work_queue(mp);
1349 }
1350
1351 STATIC int
1352 xfs_inode_free_eofblocks(
1353         struct xfs_inode        *ip,
1354         void                    *args)
1355 {
1356         struct xfs_eofblocks    *eofb = args;
1357         bool                    wait;
1358         int                     ret;
1359
1360         wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
1361
1362         if (!xfs_can_free_eofblocks(ip, false)) {
1363                 /* inode could be preallocated or append-only */
1364                 trace_xfs_inode_free_eofblocks_invalid(ip);
1365                 xfs_inode_clear_eofblocks_tag(ip);
1366                 return 0;
1367         }
1368
1369         /*
1370          * If the mapping is dirty the operation can block and wait for some
1371          * time. Unless we are waiting, skip it.
1372          */
1373         if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1374                 return 0;
1375
1376         if (!xfs_inode_matches_eofb(ip, eofb))
1377                 return 0;
1378
1379         /*
1380          * If the caller is waiting, return -EAGAIN to keep the background
1381          * scanner moving and revisit the inode in a subsequent pass.
1382          */
1383         if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1384                 if (wait)
1385                         return -EAGAIN;
1386                 return 0;
1387         }
1388
1389         ret = xfs_free_eofblocks(ip);
1390         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1391
1392         return ret;
1393 }
1394
1395 int
1396 xfs_icache_free_eofblocks(
1397         struct xfs_mount        *mp,
1398         struct xfs_eofblocks    *eofb)
1399 {
1400         return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
1401                         XFS_ICI_EOFBLOCKS_TAG);
1402 }
1403
1404 /*
1405  * Run eofblocks scans on the quotas applicable to the inode. For inodes with
1406  * multiple quotas, we don't know exactly which quota caused an allocation
1407  * failure. We make a best effort by including each quota under low free space
1408  * conditions (less than 1% free space) in the scan.
1409  */
1410 static int
1411 __xfs_inode_free_quota_eofblocks(
1412         struct xfs_inode        *ip,
1413         int                     (*execute)(struct xfs_mount *mp,
1414                                            struct xfs_eofblocks *eofb))
1415 {
1416         int scan = 0;
1417         struct xfs_eofblocks eofb = {0};
1418         struct xfs_dquot *dq;
1419
1420         /*
1421          * Run a sync scan to increase effectiveness and use the union filter to
1422          * cover all applicable quotas in a single scan.
1423          */
1424         eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1425
1426         if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
1427                 dq = xfs_inode_dquot(ip, XFS_DQ_USER);
1428                 if (dq && xfs_dquot_lowsp(dq)) {
1429                         eofb.eof_uid = VFS_I(ip)->i_uid;
1430                         eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1431                         scan = 1;
1432                 }
1433         }
1434
1435         if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
1436                 dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
1437                 if (dq && xfs_dquot_lowsp(dq)) {
1438                         eofb.eof_gid = VFS_I(ip)->i_gid;
1439                         eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1440                         scan = 1;
1441                 }
1442         }
1443
1444         if (scan)
1445                 execute(ip->i_mount, &eofb);
1446
1447         return scan;
1448 }
1449
1450 int
1451 xfs_inode_free_quota_eofblocks(
1452         struct xfs_inode *ip)
1453 {
1454         return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
1455 }
1456
1457 static inline unsigned long
1458 xfs_iflag_for_tag(
1459         int             tag)
1460 {
1461         switch (tag) {
1462         case XFS_ICI_EOFBLOCKS_TAG:
1463                 return XFS_IEOFBLOCKS;
1464         case XFS_ICI_COWBLOCKS_TAG:
1465                 return XFS_ICOWBLOCKS;
1466         default:
1467                 ASSERT(0);
1468                 return 0;
1469         }
1470 }
1471
1472 static void
1473 __xfs_inode_set_blocks_tag(
1474         xfs_inode_t     *ip,
1475         void            (*execute)(struct xfs_mount *mp),
1476         void            (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
1477                                   int error, unsigned long caller_ip),
1478         int             tag)
1479 {
1480         struct xfs_mount *mp = ip->i_mount;
1481         struct xfs_perag *pag;
1482         int tagged;
1483
1484         /*
1485          * Don't bother locking the AG and looking up in the radix trees
1486          * if we already know that we have the tag set.
1487          */
1488         if (ip->i_flags & xfs_iflag_for_tag(tag))
1489                 return;
1490         spin_lock(&ip->i_flags_lock);
1491         ip->i_flags |= xfs_iflag_for_tag(tag);
1492         spin_unlock(&ip->i_flags_lock);
1493
1494         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1495         spin_lock(&pag->pag_ici_lock);
1496
1497         tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
1498         radix_tree_tag_set(&pag->pag_ici_root,
1499                            XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
1500         if (!tagged) {
1501                 /* propagate the eofblocks tag up into the perag radix tree */
1502                 spin_lock(&ip->i_mount->m_perag_lock);
1503                 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1504                                    XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1505                                    tag);
1506                 spin_unlock(&ip->i_mount->m_perag_lock);
1507
1508                 /* kick off background trimming */
1509                 execute(ip->i_mount);
1510
1511                 set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
1512         }
1513
1514         spin_unlock(&pag->pag_ici_lock);
1515         xfs_perag_put(pag);
1516 }
1517
1518 void
1519 xfs_inode_set_eofblocks_tag(
1520         xfs_inode_t     *ip)
1521 {
1522         trace_xfs_inode_set_eofblocks_tag(ip);
1523         return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
1524                         trace_xfs_perag_set_eofblocks,
1525                         XFS_ICI_EOFBLOCKS_TAG);
1526 }
1527
1528 static void
1529 __xfs_inode_clear_blocks_tag(
1530         xfs_inode_t     *ip,
1531         void            (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
1532                                     int error, unsigned long caller_ip),
1533         int             tag)
1534 {
1535         struct xfs_mount *mp = ip->i_mount;
1536         struct xfs_perag *pag;
1537
1538         spin_lock(&ip->i_flags_lock);
1539         ip->i_flags &= ~xfs_iflag_for_tag(tag);
1540         spin_unlock(&ip->i_flags_lock);
1541
1542         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1543         spin_lock(&pag->pag_ici_lock);
1544
1545         radix_tree_tag_clear(&pag->pag_ici_root,
1546                              XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
1547         if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
1548                 /* clear the eofblocks tag from the perag radix tree */
1549                 spin_lock(&ip->i_mount->m_perag_lock);
1550                 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1551                                      XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1552                                      tag);
1553                 spin_unlock(&ip->i_mount->m_perag_lock);
1554                 clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
1555         }
1556
1557         spin_unlock(&pag->pag_ici_lock);
1558         xfs_perag_put(pag);
1559 }
1560
1561 void
1562 xfs_inode_clear_eofblocks_tag(
1563         xfs_inode_t     *ip)
1564 {
1565         trace_xfs_inode_clear_eofblocks_tag(ip);
1566         return __xfs_inode_clear_blocks_tag(ip,
1567                         trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
1568 }
1569
1570 /*
1571  * Set ourselves up to free CoW blocks from this file.  If it's already clean
1572  * then we can bail out quickly, but otherwise we must back off if the file
1573  * is undergoing some kind of write.
1574  */
1575 static bool
1576 xfs_prep_free_cowblocks(
1577         struct xfs_inode        *ip)
1578 {
1579         /*
1580          * Just clear the tag if we have an empty cow fork or none at all. It's
1581          * possible the inode was fully unshared since it was originally tagged.
1582          */
1583         if (!xfs_inode_has_cow_data(ip)) {
1584                 trace_xfs_inode_free_cowblocks_invalid(ip);
1585                 xfs_inode_clear_cowblocks_tag(ip);
1586                 return false;
1587         }
1588
1589         /*
1590          * If the mapping is dirty or under writeback we cannot touch the
1591          * CoW fork.  Leave it alone if we're in the midst of a directio.
1592          */
1593         if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
1594             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1595             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
1596             atomic_read(&VFS_I(ip)->i_dio_count))
1597                 return false;
1598
1599         return true;
1600 }
1601
1602 /*
1603  * Automatic CoW Reservation Freeing
1604  *
1605  * These functions automatically garbage collect leftover CoW reservations
1606  * that were made on behalf of a cowextsize hint when we start to run out
1607  * of quota or when the reservations sit around for too long.  If the file
1608  * has dirty pages or is undergoing writeback, its CoW reservations will
1609  * be retained.
1610  *
1611  * The actual garbage collection piggybacks off the same code that runs
1612  * the speculative EOF preallocation garbage collector.
1613  */
1614 STATIC int
1615 xfs_inode_free_cowblocks(
1616         struct xfs_inode        *ip,
1617         void                    *args)
1618 {
1619         struct xfs_eofblocks    *eofb = args;
1620         int                     ret = 0;
1621
1622         if (!xfs_prep_free_cowblocks(ip))
1623                 return 0;
1624
1625         if (!xfs_inode_matches_eofb(ip, eofb))
1626                 return 0;
1627
1628         /* Free the CoW blocks */
1629         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1630         xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1631
1632         /*
1633          * Check again, nobody else should be able to dirty blocks or change
1634          * the reflink iflag now that we have the first two locks held.
1635          */
1636         if (xfs_prep_free_cowblocks(ip))
1637                 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1638
1639         xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
1640         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1641
1642         return ret;
1643 }
1644
1645 int
1646 xfs_icache_free_cowblocks(
1647         struct xfs_mount        *mp,
1648         struct xfs_eofblocks    *eofb)
1649 {
1650         return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
1651                         XFS_ICI_COWBLOCKS_TAG);
1652 }
1653
1654 int
1655 xfs_inode_free_quota_cowblocks(
1656         struct xfs_inode *ip)
1657 {
1658         return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
1659 }
1660
1661 void
1662 xfs_inode_set_cowblocks_tag(
1663         xfs_inode_t     *ip)
1664 {
1665         trace_xfs_inode_set_cowblocks_tag(ip);
1666         return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
1667                         trace_xfs_perag_set_cowblocks,
1668                         XFS_ICI_COWBLOCKS_TAG);
1669 }
1670
1671 void
1672 xfs_inode_clear_cowblocks_tag(
1673         xfs_inode_t     *ip)
1674 {
1675         trace_xfs_inode_clear_cowblocks_tag(ip);
1676         return __xfs_inode_clear_blocks_tag(ip,
1677                         trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
1678 }
1679
1680 /* Disable post-EOF and CoW block auto-reclamation. */
1681 void
1682 xfs_stop_block_reaping(
1683         struct xfs_mount        *mp)
1684 {
1685         cancel_delayed_work_sync(&mp->m_eofblocks_work);
1686         cancel_delayed_work_sync(&mp->m_cowblocks_work);
1687 }
1688
1689 /* Enable post-EOF and CoW block auto-reclamation. */
1690 void
1691 xfs_start_block_reaping(
1692         struct xfs_mount        *mp)
1693 {
1694         xfs_queue_eofblocks(mp);
1695         xfs_queue_cowblocks(mp);
1696 }