diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0522136..e4f17c5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
 {
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
+       WARN_ON_ONCE(inode->i_state & I_FREEING);
 
        list_move(&inode->i_io_list, head);
 
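The new assertion codifies an invariant rather than new behaviour: once I_FREEING is set, nothing may re-link the inode onto a wb writeback list. A minimal sketch of the caller pattern this enforces (illustrative only; the real handling is the redirty_tail_locked() change further down):

static void example_requeue(struct inode *inode, struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);

        if (inode->i_state & I_FREEING) {
                /* drop the inode from the dirty lists, never re-queue it */
                list_del_init(&inode->i_io_list);
                return;
        }
        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}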
@@ -134,10 +135,35 @@ static bool inode_io_list_move_locked(struct inode *inode,
 
 static void wb_wakeup(struct bdi_writeback *wb)
 {
-       spin_lock_bh(&wb->work_lock);
+       spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
-       spin_unlock_bh(&wb->work_lock);
+       spin_unlock_irq(&wb->work_lock);
+}
+
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled to run
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+       unsigned long timeout;
+
+       timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+       spin_lock_irq(&wb->work_lock);
+       if (test_bit(WB_registered, &wb->state))
+               queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+       spin_unlock_irq(&wb->work_lock);
 }
 
 static void finish_writeback_work(struct bdi_writeback *wb,
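The wb->work_lock conversion from the _bh to the _irq spinlock variants applies uniformly to wb_wakeup(), wb_queue_work() and get_next_work_item() below. wb_wakeup_delayed(), added to this file above, also deliberately differs from wb_wakeup() in its workqueue call; a sketch of that distinction (illustrative, not part of this diff):

static void example_wakeups(struct bdi_writeback *wb)
{
        /* wb_wakeup(): force an immediate run, re-arming the timer even if
         * the delayed work is already pending for a later time */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);

        /* wb_wakeup_delayed(): arm the periodic kick, but never postpone a
         * flush that is already scheduled to run earlier */
        queue_delayed_work(bdi_wq, &wb->dwork,
                           msecs_to_jiffies(dirty_writeback_interval * 10));
}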
@@ -164,7 +190,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
        if (work->done)
                atomic_inc(&work->done->cnt);
 
-       spin_lock_bh(&wb->work_lock);
+       spin_lock_irq(&wb->work_lock);
 
        if (test_bit(WB_registered, &wb->state)) {
                list_add_tail(&work->list, &wb->work_list);
@@ -172,7 +198,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
        } else
                finish_writeback_work(wb, work);
 
-       spin_unlock_bh(&wb->work_lock);
+       spin_unlock_irq(&wb->work_lock);
 }
 
 /**
@@ -236,7 +262,7 @@ void wb_wait_for_completion(struct wb_completion *done)
 static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
 static struct workqueue_struct *isw_wq;
 
-void __inode_attach_wb(struct inode *inode, struct page *page)
+void __inode_attach_wb(struct inode *inode, struct folio *folio)
 {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;
@@ -244,8 +270,8 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
        if (inode_cgwb_enabled(inode)) {
                struct cgroup_subsys_state *memcg_css;
 
-               if (page) {
-                       memcg_css = mem_cgroup_css_from_page(page);
+               if (folio) {
+                       memcg_css = mem_cgroup_css_from_folio(folio);
                        wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
                } else {
                        /* must pin memcg_css, see wb_get_create() */
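__inode_attach_wb() now takes a folio; a caller that still holds a struct page can bridge with page_folio(). A hypothetical call site (sketch only; note page_folio() must not be handed a NULL page):

static void example_attach_from_page(struct inode *inode, struct page *page)
{
        /* a NULL folio attaches the wb of the current task's memcg
         * (or the root wb), per the else branch above */
        __inode_attach_wb(inode, page ? page_folio(page) : NULL);
}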
@@ -280,6 +306,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
 {
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
+       WARN_ON_ONCE(inode->i_state & I_FREEING);
 
        inode->i_state &= ~I_SYNC_QUEUED;
        if (wb != &wb->bdi->wb)
@@ -611,6 +638,24 @@ out_free:
        kfree(isw);
 }
 
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+                                  struct list_head *list, int *nr)
+{
+       struct inode *inode;
+
+       list_for_each_entry(inode, list, i_io_list) {
+               if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+                       continue;
+
+               isw->inodes[*nr] = inode;
+               (*nr)++;
+
+               if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+                       return true;
+       }
+       return false;
+}
+
 /**
  * cleanup_offline_cgwb - detach associated inodes
  * @wb: target wb
@@ -623,7 +668,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 {
        struct cgroup_subsys_state *memcg_css;
        struct inode_switch_wbs_context *isw;
-       struct inode *inode;
        int nr;
        bool restart = false;
 
@@ -645,17 +689,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 
        nr = 0;
        spin_lock(&wb->list_lock);
-       list_for_each_entry(inode, &wb->b_attached, i_io_list) {
-               if (!inode_prepare_wbs_switch(inode, isw->new_wb))
-                       continue;
-
-               isw->inodes[nr++] = inode;
-
-               if (nr >= WB_MAX_INODES_PER_ISW - 1) {
-                       restart = true;
-                       break;
-               }
-       }
+       /*
+        * In addition to the inodes that have completed writeback, also switch
+        * cgwbs for those inodes with only dirty timestamps. Otherwise, those
+        * inodes won't be written back for a long time when lazytime is
+        * enabled, thus pinning the dying cgwbs. It won't break the
+        * bandwidth restrictions, as writeback of inode metadata is not
+        * accounted for.
+        */
+       restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+       if (!restart)
+               restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
        spin_unlock(&wb->list_lock);
 
        /* no attached inodes? bail out */
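The helper keeps the existing restart protocol: once the isw->inodes array is nearly full it returns true, and the caller is expected to come back for the remaining inodes. A sketch of that caller pattern (assumed, simplified; the real loop lives in the cgwb cleanup worker in mm/backing-dev.c):

static void example_cleanup(struct bdi_writeback *wb)
{
        /* keep detaching inodes from the dying wb until one pass manages
         * to drain both b_attached and b_dirty_time */
        while (cleanup_offline_cgwb(wb))
                cond_resched();
}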
@@ -827,7 +871,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 */
-               if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+               if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
                        inode_switch_wbs(inode, max_id);
        }
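hweight16() matches the real width of the foreign-node history: i_wb_frn_history is a 16-slot bitmap (WB_FRN_HIST_SLOTS), so counting bits in a u16 is sufficient and hweight32() was merely wider than needed. A worked example under that assumption (WB_FRN_HIST_THR_SLOTS being half the slots):

u16 history = 0x00f3;   /* 6 of the last 16 slots were foreign-dominated */

/* 6 > 8 is false, so the inode stays on its current wb */
if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
        inode_switch_wbs(inode, max_id);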
 
@@ -857,6 +901,7 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
 void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes)
 {
+       struct folio *folio;
        struct cgroup_subsys_state *css;
        int id;
 
@@ -869,7 +914,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
        if (!wbc->wb || wbc->no_cgroup_owner)
                return;
 
-       css = mem_cgroup_css_from_page(page);
+       folio = page_folio(page);
+       css = mem_cgroup_css_from_folio(folio);
        /* dead cgroups shouldn't contribute to inode ownership arbitration */
        if (!(css->flags & CSS_ONLINE))
                return;
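The exported signature still takes a struct page; only the internal lookup is folio-based now. A hypothetical call site is therefore unchanged (sketch):

/* in a filesystem's writeback path: charge each submitted page to the cgroup
 * that dirtied it so foreign-inode detection keeps working */
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);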
@@ -974,6 +1020,16 @@ restart:
                        continue;
                }
 
+               /*
+                * If wb_tryget fails, the wb has been shut down; skip it.
+                *
+                * Pin @wb so that it stays on @bdi->wb_list.  This allows
+                * continuing iteration from @wb after dropping and
+                * regrabbing rcu read lock.
+                */
+               if (!wb_tryget(wb))
+                       continue;
+
                /* alloc failed, execute synchronously using on-stack fallback */
                work = &fallback_work;
                *work = *base_work;
@@ -982,13 +1038,6 @@ restart:
                work->done = &fallback_work_done;
 
                wb_queue_work(wb, work);
-
-               /*
-                * Pin @wb so that it stays on @bdi->wb_list.  This allows
-                * continuing iteration from @wb after dropping and
-                * regrabbing rcu read lock.
-                */
-               wb_get(wb);
                last_wb = wb;
 
                rcu_read_unlock();
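Replacing the unconditional wb_get() with an up-front wb_tryget() closes a race with wb shutdown: a wb whose reference count has already dropped to zero is skipped instead of being revived. The pin/unpin pairing then looks roughly like this (simplified from the surrounding loop):

if (!wb_tryget(wb))             /* wb already shut down, skip it */
        continue;
wb_queue_work(wb, work);
last_wb = wb;                   /* released with wb_put() once iteration moves past this wb */

rcu_read_unlock();
wb_wait_for_completion(&fallback_work_done);
goto restart;                   /* re-takes the rcu read lock */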
@@ -1129,6 +1178,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
 {
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
+       WARN_ON_ONCE(inode->i_state & I_FREEING);
 
        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
@@ -1294,6 +1344,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
 {
        assert_spin_locked(&inode->i_lock);
 
+       inode->i_state &= ~I_SYNC_QUEUED;
+       /*
+        * When the inode is being freed just don't bother with dirty list
+        * tracking. The flush worker will ignore this inode anyway, and moving
+        * it would trigger assertions in inode_io_list_move_locked().
+        */
+       if (inode->i_state & I_FREEING) {
+               list_del_init(&inode->i_io_list);
+               wb_io_lists_depopulated(wb);
+               return;
+       }
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
 
@@ -1302,7 +1363,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
                        inode->dirtied_when = jiffies;
        }
        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
-       inode->i_state &= ~I_SYNC_QUEUED;
 }
 
 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
@@ -1345,8 +1405,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
        return ret;
 }
 
-#define EXPIRE_DIRTY_ATIME 0x0001
-
 /*
  * Move expired (dirtied before dirtied_before) dirty inodes from
  * @delaying_queue to @dispatch_queue.
@@ -1519,10 +1577,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 
        if (wbc->pages_skipped) {
                /*
-                * writeback is not making progress due to locked
-                * buffers. Skip this inode for now.
+                * Writeback is not making progress due to locked buffers.
+                * Skip this inode for now. Although having skipped pages
+                * is odd for clean inodes, it can happen for some
+                * filesystems so handle that gracefully.
                 */
-               redirty_tail_locked(inode, wb);
+               if (inode->i_state & I_DIRTY_ALL)
+                       redirty_tail_locked(inode, wb);
+               else
+                       inode_cgwb_move_to_attached(inode, wb);
                return;
        }
 
@@ -1637,11 +1700,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;
-       else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+       else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
                if (!(inode->i_state & I_DIRTY_PAGES)) {
-                       inode->i_state &= ~I_PINNING_FSCACHE_WB;
-                       wbc->unpinned_fscache_wb = true;
-                       dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+                       inode->i_state &= ~I_PINNING_NETFS_WB;
+                       wbc->unpinned_netfs_wb = true;
+                       dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
                }
        }
 
@@ -1653,7 +1716,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                if (ret == 0)
                        ret = err;
        }
-       wbc->unpinned_fscache_wb = false;
+       wbc->unpinned_netfs_wb = false;
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
 }
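The I_PINNING_FSCACHE_WB bits were renamed for the generalised netfs layer; the contract is unchanged: once the last dirty pages are gone, writeback clears the pin and reports it to ->write_inode() through wbc->unpinned_netfs_wb. A hypothetical consumer (sketch; real network filesystems go through the netfs helpers for this):

static int example_write_inode(struct inode *inode,
                               struct writeback_control *wbc)
{
        if (wbc->unpinned_netfs_wb) {
                /* writeback no longer needs the cache held open: drop the
                 * reference that was pinned when the first page was dirtied */
        }
        return 0;
}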
@@ -1712,15 +1775,28 @@ static int writeback_single_inode(struct inode *inode,
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
-        * If the inode is now fully clean, then it can be safely removed from
-        * its writeback list (if any).  Otherwise the flusher threads are
-        * responsible for the writeback lists.
+        * If the inode is freeing, its i_io_list shouldn't be updated
+        * as it can be finally deleted at this moment.
         */
-       if (!(inode->i_state & I_DIRTY_ALL))
-               inode_cgwb_move_to_attached(inode, wb);
-       else if (!(inode->i_state & I_SYNC_QUEUED) &&
-                (inode->i_state & I_DIRTY))
-               redirty_tail_locked(inode, wb);
+       if (!(inode->i_state & I_FREEING)) {
+               /*
+                * If the inode is now fully clean, then it can be safely
+                * removed from its writeback list (if any). Otherwise the
+                * flusher threads are responsible for the writeback lists.
+                */
+               if (!(inode->i_state & I_DIRTY_ALL))
+                       inode_cgwb_move_to_attached(inode, wb);
+               else if (!(inode->i_state & I_SYNC_QUEUED)) {
+                       if ((inode->i_state & I_DIRTY))
+                               redirty_tail_locked(inode, wb);
+                       else if (inode->i_state & I_DIRTY_TIME) {
+                               inode->dirtied_when = jiffies;
+                               inode_io_list_move_locked(inode,
+                                                         wb,
+                                                         &wb->b_dirty_time);
+                       }
+               }
+       }
 
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
@@ -1924,9 +2000,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;
 
-               if (!trylock_super(sb)) {
+               if (!super_trylock_shared(sb)) {
                        /*
-                        * trylock_super() may fail consistently due to
+                        * super_trylock_shared() may fail consistently due to
                         * s_umount being grabbed by someone else. Don't use
                         * requeue_io() to avoid busy retrying the inode/sb.
                         */
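super_trylock_shared() is the renamed trylock_super(); it still takes sb->s_umount shared and checks that the superblock is alive. The pairing in the flusher loop, condensed from the surrounding code:

if (super_trylock_shared(sb)) {
        wrote += writeback_sb_inodes(sb, wb, work);
        up_read(&sb->s_umount);
} else {
        redirty_tail(inode, wb);        /* don't busy-retry this inode/sb */
}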
@@ -1995,7 +2071,6 @@ static long wb_writeback(struct bdi_writeback *wb,
        struct blk_plug plug;
 
        blk_start_plug(&plug);
-       spin_lock(&wb->list_lock);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
@@ -2020,6 +2095,9 @@ static long wb_writeback(struct bdi_writeback *wb,
                if (work->for_background && !wb_over_bg_thresh(wb))
                        break;
 
+
+               spin_lock(&wb->list_lock);
+
                /*
                 * Kupdate and background works are special and we want to
                 * include all inodes that need writing. Livelock avoidance is
@@ -2049,13 +2127,19 @@ static long wb_writeback(struct bdi_writeback *wb,
                 * mean the overall work is done. So we keep looping as long
                 * as we made some progress on cleaning pages or inodes.
                 */
-               if (progress)
+               if (progress) {
+                       spin_unlock(&wb->list_lock);
                        continue;
+               }
+
                /*
                 * No more inodes for IO, bail
                 */
-               if (list_empty(&wb->b_more_io))
+               if (list_empty(&wb->b_more_io)) {
+                       spin_unlock(&wb->list_lock);
                        break;
+               }
+
                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
@@ -2067,9 +2151,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                spin_unlock(&wb->list_lock);
                /* This function drops i_lock... */
                inode_sleep_on_writeback(inode);
-               spin_lock(&wb->list_lock);
        }
-       spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);
 
        return nr_pages - work->nr_pages;
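With list_lock now taken inside the loop, every way out of an iteration (made progress, nothing left on b_more_io, or about to sleep) drops the lock explicitly rather than relying on one lock held across the whole loop. The resulting shape, condensed from the code above:

for (;;) {
        /* the nr_pages / work_list / background-threshold checks run unlocked */

        spin_lock(&wb->list_lock);
        progress = __writeback_inodes_wb(wb, work);     /* or writeback_sb_inodes() */
        if (progress) {
                spin_unlock(&wb->list_lock);
                continue;
        }
        if (list_empty(&wb->b_more_io)) {
                spin_unlock(&wb->list_lock);
                break;
        }
        /* nothing written: pick a busy inode and wait for it */
        inode = wb_inode(wb->b_more_io.prev);
        spin_lock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
        inode_sleep_on_writeback(inode);        /* drops i_lock */
}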
@@ -2082,13 +2164,13 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 {
        struct wb_writeback_work *work = NULL;
 
-       spin_lock_bh(&wb->work_lock);
+       spin_lock_irq(&wb->work_lock);
        if (!list_empty(&wb->work_list)) {
                work = list_entry(wb->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&work->list);
        }
-       spin_unlock_bh(&wb->work_lock);
+       spin_unlock_irq(&wb->work_lock);
        return work;
 }
 
@@ -2369,6 +2451,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
        trace_writeback_mark_inode_dirty(inode, flags);
 
        if (flags & I_DIRTY_INODE) {
+               /*
+                * Inode timestamp update will piggyback on this dirtying.
+                * We tell ->dirty_inode callback that timestamps need to
+                * be updated by setting I_DIRTY_TIME in flags.
+                */
+               if (inode->i_state & I_DIRTY_TIME) {
+                       spin_lock(&inode->i_lock);
+                       if (inode->i_state & I_DIRTY_TIME) {
+                               inode->i_state &= ~I_DIRTY_TIME;
+                               flags |= I_DIRTY_TIME;
+                       }
+                       spin_unlock(&inode->i_lock);
+               }
+
                /*
                 * Notify the filesystem about the inode being dirtied, so that
                 * (if needed) it can update on-disk fields and journal the
@@ -2378,7 +2474,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 */
                trace_writeback_dirty_inode_start(inode, flags);
                if (sb->s_op->dirty_inode)
-                       sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
+                       sb->s_op->dirty_inode(inode,
+                               flags & (I_DIRTY_INODE | I_DIRTY_TIME));
                trace_writeback_dirty_inode(inode, flags);
 
                /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
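With I_DIRTY_TIME now passed through, a filesystem's ->dirty_inode() can tell that lazily dirtied timestamps have to be persisted together with this inode update. A hypothetical implementation (sketch; a journalling filesystem would map this onto its transaction machinery):

static void example_dirty_inode(struct inode *inode, int flags)
{
        if (flags & I_DIRTY_TIME) {
                /* timestamps were only lazily dirty until now; fold them into
                 * the same on-disk update as the rest of the inode */
        }
        if (flags & I_DIRTY_SYNC) {
                /* regular metadata dirtying: journal the inode as usual */
        }
}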
@@ -2399,21 +2496,15 @@ void __mark_inode_dirty(struct inode *inode, int flags)
         */
        smp_mb();
 
-       if (((inode->i_state & flags) == flags) ||
-           (dirtytime && (inode->i_state & I_DIRTY_INODE)))
+       if ((inode->i_state & flags) == flags)
                return;
 
        spin_lock(&inode->i_lock);
-       if (dirtytime && (inode->i_state & I_DIRTY_INODE))
-               goto out_unlock_inode;
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;
 
                inode_attach_wb(inode, NULL);
 
-               /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
-               if (flags & I_DIRTY_INODE)
-                       inode->i_state &= ~I_DIRTY_TIME;
                inode->i_state |= flags;
 
                /*
@@ -2486,7 +2577,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 out_unlock:
        if (wb)
                spin_unlock(&wb->list_lock);
-out_unlock_inode:
        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);