writeback: fix bandwidth estimate for spiky workload

author Jan Kara <jack@suse.cz>
Thu, 2 Sep 2021 21:53:09 +0000 (14:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Sep 2021 16:58:10 +0000 (09:58 -0700)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h

index 06fb8e1..3320700 100644 (file)
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -143,6 +143,7 @@ struct bdi_writeback {
         spinlock_t work_lock;           /* protects work_list & dwork scheduling */
         struct list_head work_list;
         struct delayed_work dwork;      /* work item used for writeback */
+       struct delayed_work bw_dwork;   /* work item used for bandwidth estimate */
  
         unsigned long dirty_sleep;      /* last wait */
  
diff --git a/include/linux/writeback.h b/include/linux/writeback.h

index 2480322..cbaef09 100644 (file)
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
  unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
  
+void wb_update_bandwidth(struct bdi_writeback *wb);
  void balance_dirty_pages_ratelimited(struct address_space *mapping);
  bool wb_over_bg_thresh(struct bdi_writeback *wb);
  
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index b4c707d..6122c78 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
         spin_unlock_bh(&wb->work_lock);
  }
  
+static void wb_update_bandwidth_workfn(struct work_struct *work)
+{
+       struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                               struct bdi_writeback, bw_dwork);
+
+       wb_update_bandwidth(wb);
+}
+
  /*
   * Initial write bandwidth: 100 MB/s
   */
@@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
         spin_lock_init(&wb->work_lock);
         INIT_LIST_HEAD(&wb->work_list);
         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+       INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
         wb->dirty_sleep = jiffies;
  
         err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
         mod_delayed_work(bdi_wq, &wb->dwork, 0);
         flush_delayed_work(&wb->dwork);
         WARN_ON(!list_empty(&wb->work_list));
+       flush_delayed_work(&wb->bw_dwork);
  }
  
  static void wb_exit(struct bdi_writeback *wb)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index e4a381b..156f588 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1336,18 +1336,19 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
  {
         struct bdi_writeback *wb = gdtc->wb;
         unsigned long now = jiffies;
-       unsigned long elapsed = now - wb->bw_time_stamp;
+       unsigned long elapsed;
         unsigned long dirtied;
         unsigned long written;
  
-       lockdep_assert_held(&wb->list_lock);
+       spin_lock(&wb->list_lock);
  
         /*
-        * rate-limit, only update once every 200ms.
+        * Lockless checks for elapsed time are racy and delayed update after
+        * IO completion doesn't do it at all (to make sure written pages are
+        * accounted reasonably quickly). Make sure elapsed >= 1 to avoid
+        * division errors.
          */
-       if (elapsed < BANDWIDTH_INTERVAL)
-               return;
-
+       elapsed = max(now - wb->bw_time_stamp, 1UL);
         dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
         written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
  
@@ -1369,15 +1370,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
         wb->dirtied_stamp = dirtied;
         wb->written_stamp = written;
         wb->bw_time_stamp = now;
+       spin_unlock(&wb->list_lock);
  }
  
-static void wb_update_bandwidth(struct bdi_writeback *wb)
+void wb_update_bandwidth(struct bdi_writeback *wb)
  {
         struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
  
-       spin_lock(&wb->list_lock);
         __wb_update_bandwidth(&gdtc, NULL, false);
-       spin_unlock(&wb->list_lock);
  }
  
  /* Interval after which we consider wb idle and don't estimate bandwidth */
@@ -1722,11 +1722,8 @@ free_running:
                         wb->dirty_exceeded = 1;
  
                 if (time_is_before_jiffies(wb->bw_time_stamp +
-                                          BANDWIDTH_INTERVAL)) {
-                       spin_lock(&wb->list_lock);
+                                          BANDWIDTH_INTERVAL))
                         __wb_update_bandwidth(gdtc, mdtc, true);
-                       spin_unlock(&wb->list_lock);
-               }
  
                 /* throttle according to the chosen dtc */
                 dirty_ratelimit = wb->dirty_ratelimit;
@@ -2374,7 +2371,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
                 cond_resched();
                 congestion_wait(BLK_RW_ASYNC, HZ/50);
         }
-       wb_update_bandwidth(wb);
+       /*
+        * Usually few pages are written by now from those we've just submitted
+        * but if there's constant writeback being submitted, this makes sure
+        * writeback bandwidth is updated once in a while.
+        */
+       if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
+               wb_update_bandwidth(wb);
         return ret;
  }
  
@@ -2754,6 +2757,14 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb)
  static void wb_inode_writeback_end(struct bdi_writeback *wb)
  {
         atomic_dec(&wb->writeback_inodes);
+       /*
+        * Make sure estimate of writeback throughput gets updated after
+        * writeback completed. We delay the update by BANDWIDTH_INTERVAL
+        * (which is the interval other bandwidth updates use for batching) so
+        * that if multiple inodes end writeback at a similar time, they get
+        * batched into one bandwidth update.
+        */
+       queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
  }
  
  int test_clear_page_writeback(struct page *page)
author	Jan Kara <jack@suse.cz>
	Thu, 2 Sep 2021 21:53:09 +0000 (14:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 3 Sep 2021 16:58:10 +0000 (09:58 -0700)
include/linux/backing-dev-defs.h		patch | blob | history
include/linux/writeback.h		patch | blob | history
mm/backing-dev.c		patch | blob | history
mm/page-writeback.c		patch | blob | history