diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0522136..e4f17c5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
 {
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
+       WARN_ON_ONCE(inode->i_state & I_FREEING);
 
        list_move(&inode->i_io_list, head);
 
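The new assertion codifies an invariant rather than new behaviour: once I_FREEING is set, nothing may re-link the inode onto a wb writeback list. A minimal sketch of the caller pattern this enforces (illustrative only; the real handling is the redirty_tail_locked() change further down):

static void example_requeue(struct inode *inode, struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);

        if (inode->i_state & I_FREEING) {
                /* drop the inode from the dirty lists, never re-queue it */
                list_del_init(&inode->i_io_list);
                return;
        }
        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}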
@@ -134,10 +135,35 @@ static bool inode_io_list_move_locked(struct inode *inode,
 
 static void wb_wakeup(struct bdi_writeback *wb)
 {
-       spin_lock_bh(&wb->work_lock);
+       spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
-       spin_unlock_bh(&wb->work_lock);
+       spin_unlock_irq(&wb->work_lock);
+}
+
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled to run
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+       unsigned long timeout;
+
+       timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+       spin_lock_irq(&wb->work_lock);
+       if (test_bit(WB_registered, &wb->state))
+               queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+       spin_unlock_irq(&wb->work_lock);
 }
 
 static void finish_writeback_work(struct bdi_writeback *wb,
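The wb->work_lock conversion from the _bh to the _irq spinlock variants applies uniformly to wb_wakeup(), wb_queue_work() and get_next_work_item() below. wb_wakeup_delayed(), added to this file above, also deliberately differs from wb_wakeup() in its workqueue call; a sketch of that distinction (illustrative, not part of this diff):

static void example_wakeups(struct bdi_writeback *wb)
{
        /* wb_wakeup(): force an immediate run, re-arming the timer even if
         * the delayed work is already pending for a later time */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);

        /* wb_wakeup_delayed(): arm the periodic kick, but never postpone a
         * flush that is already scheduled to run earlier */
        queue_delayed_work(bdi_wq, &wb->dwork,
                           msecs_to_jiffies(dirty_writeback_interval * 10));
}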
@@ -164,7 +190,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
        if (work->done)
                atomic_inc(&work->done->cnt);
 
-       spin_lock_bh(&wb->work_lock);
+       spin_lock_irq(&wb->work_lock);
 
        if (test_bit(WB_registered, &wb->state)) {
                list_add_tail(&work->list, &wb->work_list);
@@ -172,7 +198,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
        } else
                finish_writeback_work(wb, work);
 
-       spin_unlock_bh(&wb->work_lock);
+       spin_unlock_irq(&wb->work_lock);
 }
 
 /**
@@ -236,7 +262,7 @@ void wb_wait_for_completion(struct wb_completion *done)
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
 
-void __inode_attach_wb(struct inode *inode, struct page *page)
+void __inode_attach_wb(struct inode *inode, struct folio *folio)
 {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;
@@ -244,8 +270,8 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
        if (inode_cgwb_enabled(inode)) {
                struct cgroup_subsys_state *memcg_css;
 
-               if (page) {
-                       memcg_css = mem_cgroup_css_from_page(page);
+               if (folio) {
+                       memcg_css = mem_cgroup_css_from_folio(folio);
                        wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
                } else {
                        /* must pin memcg_css, see wb_get_create() */
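__inode_attach_wb() now takes a folio; a caller that still holds a struct page can bridge with page_folio(). A hypothetical call site (sketch only; note page_folio() must not be handed a NULL page):

static void example_attach_from_page(struct inode *inode, struct page *page)
{
        /* a NULL folio attaches the wb of the current task's memcg
         * (or the root wb), per the else branch above */
        __inode_attach_wb(inode, page ? page_folio(page) : NULL);
}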
@@ -280,6 +306,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
 {
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
+       WARN_ON_ONCE(inode->i_state & I_FREEING);
 
        inode->i_state &= ~I_SYNC_QUEUED;
        if (wb != &wb->bdi->wb)
@@ -611,6 +638,24 @@ out_free:
        kfree(isw);
 }
 
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+                                  struct list_head *list, int *nr)
+{
+       struct inode *inode;
+
+       list_for_each_entry(inode, list, i_io_list) {
+               if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+                       continue;
+
+               isw->inodes[*nr] = inode;
+               (*nr)++;
+
+               if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+                       return true;
+       }
+       return false;
+}
+
 /**
  * cleanup_offline_cgwb - detach associated inodes
  * @wb: target wb
@@ -623,7 +668,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 {
        struct cgroup_subsys_state *memcg_css;
        struct inode_switch_wbs_context *isw;
-       struct inode *inode;
        int nr;
        bool restart = false;
 
@@ -645,17 +689,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 
        nr = 0;
        spin_lock(&wb->list_lock);
-       list_for_each_entry(inode, &wb->b_attached, i_io_list) {
-               if (!inode_prepare_wbs_switch(inode, isw->new_wb))
-                       continue;
-
-               isw->inodes[nr++] = inode;
-
-               if (nr >= WB_MAX_INODES_PER_ISW - 1) {
-                       restart = true;
-                       break;
-               }
-       }
+       /*
+        * In addition to the inodes that have completed writeback, also switch
+        * cgwbs for those inodes with only dirty timestamps. Otherwise, those
+        * inodes won't be written back for a long time when lazytime is
+        * enabled, thus pinning the dying cgwbs. It won't break the
+        * bandwidth restrictions, as writeback of inode metadata is not
+        * accounted for.
+        */
+       restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+       if (!restart)
+               restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
        spin_unlock(&wb->list_lock);
 
        /* no attached inodes? bail out */
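The helper keeps the existing restart protocol: once the isw->inodes array is nearly full it returns true, and the caller is expected to come back for the remaining inodes. A sketch of that caller pattern (assumed, simplified; the real loop lives in the cgwb cleanup worker in mm/backing-dev.c):

static void example_cleanup(struct bdi_writeback *wb)
{
        /* keep detaching inodes from the dying wb until one pass manages
         * to drain both b_attached and b_dirty_time */
        while (cleanup_offline_cgwb(wb))
                cond_resched();
}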
@@ -827,7 +871,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 */
-               if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+               if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
                        inode_switch_wbs(inode, max_id);
        }
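hweight16() matches the real width of the foreign-node history: i_wb_frn_history is a 16-slot bitmap (WB_FRN_HIST_SLOTS), so counting bits in a u16 is sufficient and hweight32() was merely wider than needed. A worked example under that assumption (WB_FRN_HIST_THR_SLOTS being half the slots):

u16 history = 0x00f3;   /* 6 of the last 16 slots were foreign-dominated */

/* 6 > 8 is false, so the inode stays on its current wb */
if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
        inode_switch_wbs(inode, max_id);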
 
@@ -857,6 +901,7 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
 void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes)
 {
+       struct folio *folio;
        struct cgroup_subsys_state *css;
        int id;
 
@@ -869,7 +914,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
        if (!wbc->wb || wbc->no_cgroup_owner)
                return;
 
-       css = mem_cgroup_css_from_page(page);
+       folio = page_folio(page);
+       css = mem_cgroup_css_from_folio(folio);
        /* dead cgroups shouldn't contribute to inode ownership arbitration */
        if (!(css->flags & CSS_ONLINE))
                return;
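The exported signature still takes a struct page; only the internal lookup is folio-based now. A hypothetical call site is therefore unchanged (sketch):

/* in a filesystem's writeback path: charge each submitted page to the cgroup
 * that dirtied it so foreign-inode detection keeps working */
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);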
@@ -974,6 +1020,16 @@ restart:
                        continue;
                }
 
+               /*
+                * If wb_tryget fails, the wb has been shut down; skip it.
+                *
+                * Pin @wb so that it stays on @bdi->wb_list.  This allows
+                * continuing iteration from @wb after dropping and
+                * regrabbing rcu read lock.
+                */
+               if (!wb_tryget(wb))
+                       continue;
+
                /* alloc failed, execute synchronously using on-stack fallback */
                work = &fallback_work;
                *work = *base_work;
@@ -982,13 +1038,6 @@ restart:
                work->done = &fallback_work_done;
 
                wb_queue_work(wb, work);
-
-               /*
-                * Pin @wb so that it stays on @bdi->wb_list.  This allows
-                * continuing iteration from @wb after dropping and
-                * regrabbing rcu read lock.
-                */
-               wb_get(wb);
                last_wb = wb;
 
                rcu_read_unlock();
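Replacing the unconditional wb_get() with an up-front wb_tryget() closes a race with wb shutdown: a wb whose reference count has already dropped to zero is skipped instead of being revived. The pin/unpin pairing then looks roughly like this (simplified from the surrounding loop):

if (!wb_tryget(wb))             /* wb already shut down, skip it */
        continue;
wb_queue_work(wb, work);
last_wb = wb;                   /* released with wb_put() once iteration moves past this wb */

rcu_read_unlock();
wb_wait_for_completion(&fallback_work_done);
goto restart;                   /* re-takes the rcu read lock */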
@@ -1129,6 +1178,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
 {
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
+       WARN_ON_ONCE(inode->i_state & I_FREEING);
 
        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
@@ -1294,6 +1344,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
 {
        assert_spin_locked(&inode->i_lock);
 
+       inode->i_state &= ~I_SYNC_QUEUED;
+       /*
+        * When the inode is being freed just don't bother with dirty list
+        * tracking. The flush worker will ignore this inode anyway, and moving
+        * it would trigger assertions in inode_io_list_move_locked().
+        */
+       if (inode->i_state & I_FREEING) {
+               list_del_init(&inode->i_io_list);
+               wb_io_lists_depopulated(wb);
+               return;
+       }
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
 
@@ -1302,7 +1363,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
                        inode->dirtied_when = jiffies;
        }
        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
-       inode->i_state &= ~I_SYNC_QUEUED;
 }
 
 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
@@ -1345,8 +1405,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
        return ret;
 }
 
-#define EXPIRE_DIRTY_ATIME 0x0001
-
 /*
  * Move expired (dirtied before dirtied_before) dirty inodes from
  * @delaying_queue to @dispatch_queue.
@@ -1519,10 +1577,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 
        if (wbc->pages_skipped) {
                /*
-                * writeback is not making progress due to locked
-                * buffers. Skip this inode for now.
+                * Writeback is not making progress due to locked buffers.
+                * Skip this inode for now. Although having skipped pages
+                * is odd for clean inodes, it can happen for some
+                * filesystems so handle that gracefully.
                 */
-               redirty_tail_locked(inode, wb);
+               if (inode->i_state & I_DIRTY_ALL)
+                       redirty_tail_locked(inode, wb);
+               else
+                       inode_cgwb_move_to_attached(inode, wb);
                return;
        }
 
@@ -1637,11 +1700,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;
-       else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+       else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
                if (!(inode->i_state & I_DIRTY_PAGES)) {
-                       inode->i_state &= ~I_PINNING_FSCACHE_WB;
-                       wbc->unpinned_fscache_wb = true;
-                       dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+                       inode->i_state &= ~I_PINNING_NETFS_WB;
+                       wbc->unpinned_netfs_wb = true;
+                       dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
                }
        }
 
@@ -1653,7 +1716,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                if (ret == 0)
                        ret = err;
        }
-       wbc->unpinned_fscache_wb = false;
+       wbc->unpinned_netfs_wb = false;
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
 }
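The I_PINNING_FSCACHE_WB bits were renamed for the generalised netfs layer; the contract is unchanged: once the last dirty pages are gone, writeback clears the pin and reports it to ->write_inode() through wbc->unpinned_netfs_wb. A hypothetical consumer (sketch; real network filesystems go through the netfs helpers for this):

static int example_write_inode(struct inode *inode,
                               struct writeback_control *wbc)
{
        if (wbc->unpinned_netfs_wb) {
                /* writeback no longer needs the cache held open: drop the
                 * reference that was pinned when the first page was dirtied */
        }
        return 0;
}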
@@ -1712,15 +1775,28 @@ static int writeback_single_inode(struct inode *inode,
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
-        * If the inode is now fully clean, then it can be safely removed from
-        * its writeback list (if any).  Otherwise the flusher threads are
-        * responsible for the writeback lists.
+        * If the inode is freeing, its i_io_list shouldn't be updated
+        * as it can be finally deleted at this moment.
         */
-       if (!(inode->i_state & I_DIRTY_ALL))
-               inode_cgwb_move_to_attached(inode, wb);
-       else if (!(inode->i_state & I_SYNC_QUEUED) &&
-                (inode->i_state & I_DIRTY))
-               redirty_tail_locked(inode, wb);
+       if (!(inode->i_state & I_FREEING)) {
+               /*
+                * If the inode is now fully clean, then it can be safely
+                * removed from its writeback list (if any). Otherwise the
+                * flusher threads are responsible for the writeback lists.
+                */
+               if (!(inode->i_state & I_DIRTY_ALL))
+                       inode_cgwb_move_to_attached(inode, wb);
+               else if (!(inode->i_state & I_SYNC_QUEUED)) {
+                       if ((inode->i_state & I_DIRTY))
+                               redirty_tail_locked(inode, wb);
+                       else if (inode->i_state & I_DIRTY_TIME) {
+                               inode->dirtied_when = jiffies;
+                               inode_io_list_move_locked(inode,
+                                                         wb,
+                                                         &wb->b_dirty_time);
+                       }
+               }
+       }
 
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
@@ -1924,9 +2000,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;
 
-               if (!trylock_super(sb)) {
+               if (!super_trylock_shared(sb)) {
                        /*
-                        * trylock_super() may fail consistently due to
+                        * super_trylock_shared() may fail consistently due to
                         * s_umount being grabbed by someone else. Don't use
                         * requeue_io() to avoid busy retrying the inode/sb.
                         */
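super_trylock_shared() is the renamed trylock_super(); it still takes sb->s_umount shared and checks that the superblock is alive. The pairing in the flusher loop, condensed from the surrounding code:

if (super_trylock_shared(sb)) {
        wrote += writeback_sb_inodes(sb, wb, work);
        up_read(&sb->s_umount);
} else {
        redirty_tail(inode, wb);        /* don't busy-retry this inode/sb */
}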
@@ -1995,7 +2071,6 @@ static long wb_writeback(struct bdi_writeback *wb,
        struct blk_plug plug;
 
        blk_start_plug(&plug);
-       spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
@@ -2020,6 +2095,9 @@ static long wb_writeback(struct bdi_writeback *wb,
                if (work->for_background && !wb_over_bg_thresh(wb))
                        break;
 
+
+               spin_lock(&wb->list_lock);
+
                /*
                 * Kupdate and background works are special and we want to
                 * include all inodes that need writing. Livelock avoidance is
@@ -2049,13 +2127,19 @@ static long wb_writeback(struct bdi_writeback *wb,
                 * mean the overall work is done. So we keep looping as long
                 * as we made some progress on cleaning pages or inodes.
                 */
-               if (progress)
+               if (progress) {
+                       spin_unlock(&wb->list_lock);
                        continue;
+               }
+
                /*
                 * No more inodes for IO, bail
                 */
-               if (list_empty(&wb->b_more_io))
+               if (list_empty(&wb->b_more_io)) {
+                       spin_unlock(&wb->list_lock);
                        break;
+               }
+
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
@@ -2067,9 +2151,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                spin_unlock(&wb->list_lock);
                /* This function drops i_lock... */
                inode_sleep_on_writeback(inode);
-               spin_lock(&wb->list_lock);
        }
-       spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);
 
        return nr_pages - work->nr_pages;
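With list_lock now taken inside the loop, every way out of an iteration (made progress, nothing left on b_more_io, or about to sleep) drops the lock explicitly rather than relying on one lock held across the whole loop. The resulting shape, condensed from the code above:

for (;;) {
        /* the nr_pages / work_list / background-threshold checks run unlocked */

        spin_lock(&wb->list_lock);
        progress = __writeback_inodes_wb(wb, work);     /* or writeback_sb_inodes() */
        if (progress) {
                spin_unlock(&wb->list_lock);
                continue;
        }
        if (list_empty(&wb->b_more_io)) {
                spin_unlock(&wb->list_lock);
                break;
        }
        /* nothing written: pick a busy inode and wait for it */
        inode = wb_inode(wb->b_more_io.prev);
        spin_lock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
        inode_sleep_on_writeback(inode);        /* drops i_lock */
}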
@@ -2082,13 +2164,13 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 {
        struct wb_writeback_work *work = NULL;
 
-       spin_lock_bh(&wb->work_lock);
+       spin_lock_irq(&wb->work_lock);
        if (!list_empty(&wb->work_list)) {
                work = list_entry(wb->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&work->list);
        }
-       spin_unlock_bh(&wb->work_lock);
+       spin_unlock_irq(&wb->work_lock);
        return work;
 }
 
@@ -2369,6 +2451,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
        trace_writeback_mark_inode_dirty(inode, flags);
 
        if (flags & I_DIRTY_INODE) {
+               /*
+                * Inode timestamp update will piggyback on this dirtying.
+                * We tell ->dirty_inode callback that timestamps need to
+                * be updated by setting I_DIRTY_TIME in flags.
+                */
+               if (inode->i_state & I_DIRTY_TIME) {
+                       spin_lock(&inode->i_lock);
+                       if (inode->i_state & I_DIRTY_TIME) {
+                               inode->i_state &= ~I_DIRTY_TIME;
+                               flags |= I_DIRTY_TIME;
+                       }
+                       spin_unlock(&inode->i_lock);
+               }
+
                /*
                 * Notify the filesystem about the inode being dirtied, so that
                 * (if needed) it can update on-disk fields and journal the
@@ -2378,7 +2474,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 */
                trace_writeback_dirty_inode_start(inode, flags);
                if (sb->s_op->dirty_inode)
-                       sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
+                       sb->s_op->dirty_inode(inode,
+                               flags & (I_DIRTY_INODE | I_DIRTY_TIME));
                trace_writeback_dirty_inode(inode, flags);
 
                /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
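With I_DIRTY_TIME now passed through, a filesystem's ->dirty_inode() can tell that lazily dirtied timestamps have to be persisted together with this inode update. A hypothetical implementation (sketch; a journalling filesystem would map this onto its transaction machinery):

static void example_dirty_inode(struct inode *inode, int flags)
{
        if (flags & I_DIRTY_TIME) {
                /* timestamps were only lazily dirty until now; fold them into
                 * the same on-disk update as the rest of the inode */
        }
        if (flags & I_DIRTY_SYNC) {
                /* regular metadata dirtying: journal the inode as usual */
        }
}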
@@ -2399,21 +2496,15 @@ void __mark_inode_dirty(struct inode *inode, int flags)
         */
        smp_mb();
 
-       if (((inode->i_state & flags) == flags) ||
-           (dirtytime && (inode->i_state & I_DIRTY_INODE)))
+       if ((inode->i_state & flags) == flags)
                return;
 
        spin_lock(&inode->i_lock);
-       if (dirtytime && (inode->i_state & I_DIRTY_INODE))
-               goto out_unlock_inode;
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;
 
                inode_attach_wb(inode, NULL);
 
-               /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
-               if (flags & I_DIRTY_INODE)
-                       inode->i_state &= ~I_DIRTY_TIME;
                inode->i_state |= flags;
 
                /*
@@ -2486,7 +2577,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 out_unlock:
        if (wb)
                spin_unlock(&wb->list_lock);
-out_unlock_inode:
        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);