Merge branch 'akpm' (patches from Andrew)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c12f67c..4812a17 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
 static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
 {
-       unsigned long this_bw = wb->avg_write_bandwidth;
+       unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
        unsigned long long min = wb->bdi->min_ratio;
        unsigned long long max = wb->bdi->max_ratio;
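
This and the later hunks convert plain accesses of wb->avg_write_bandwidth (and, further down, wb->dirty_ratelimit and wb->bw_time_stamp) into READ_ONCE()/WRITE_ONCE() pairs: these fields are now read without wb->list_lock held, and the accessors guarantee a single untorn load or store while marking the data race as intentional. As a reminder, the accessors boil down to roughly this (simplified from include/asm-generic/rwonce.h, which layers extra type checking on top):

	#define READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))
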
@@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
 static void wb_position_ratio(struct dirty_throttle_control *dtc)
 {
        struct bdi_writeback *wb = dtc->wb;
-       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long wb_thresh = dtc->wb_thresh;
@@ -1115,7 +1115,7 @@ out:
                                        &wb->bdi->tot_write_bandwidth) <= 0);
        }
        wb->write_bandwidth = bw;
-       wb->avg_write_bandwidth = avg;
+       WRITE_ONCE(wb->avg_write_bandwidth, avg);
 }
 
 static void update_dirty_limit(struct dirty_throttle_control *dtc)
@@ -1147,8 +1147,8 @@ update:
        dom->dirty_limit = limit;
 }
 
-static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
-                                   unsigned long now)
+static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
+                                     unsigned long now)
 {
        struct wb_domain *dom = dtc_dom(dtc);
 
@@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
        else
                dirty_ratelimit -= step;
 
-       wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+       WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
        trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
@@ -1332,35 +1332,28 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
 
 static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                                  struct dirty_throttle_control *mdtc,
-                                 unsigned long start_time,
                                  bool update_ratelimit)
 {
        struct bdi_writeback *wb = gdtc->wb;
        unsigned long now = jiffies;
-       unsigned long elapsed = now - wb->bw_time_stamp;
+       unsigned long elapsed;
        unsigned long dirtied;
        unsigned long written;
 
-       lockdep_assert_held(&wb->list_lock);
+       spin_lock(&wb->list_lock);
 
        /*
-        * rate-limit, only update once every 200ms.
+        * Lockless checks for elapsed time are racy, and the delayed update
+        * after IO completion doesn't check elapsed time at all (so that
+        * written pages are accounted reasonably quickly). Make sure that
+        * elapsed >= 1 to avoid division errors.
         */
-       if (elapsed < BANDWIDTH_INTERVAL)
-               return;
-
+       elapsed = max(now - wb->bw_time_stamp, 1UL);
        dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
        written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
 
-       /*
-        * Skip quiet periods when disk bandwidth is under-utilized.
-        * (at least 1s idle time between two flusher runs)
-        */
-       if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
-               goto snapshot;
-
        if (update_ratelimit) {
-               domain_update_bandwidth(gdtc, now);
+               domain_update_dirty_limit(gdtc, now);
                wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
 
                /*
@@ -1368,23 +1361,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                 * compiler has no way to figure that out.  Help it.
                 */
                if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
-                       domain_update_bandwidth(mdtc, now);
+                       domain_update_dirty_limit(mdtc, now);
                        wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
                }
        }
        wb_update_write_bandwidth(wb, elapsed, written);
 
-snapshot:
        wb->dirtied_stamp = dirtied;
        wb->written_stamp = written;
-       wb->bw_time_stamp = now;
+       WRITE_ONCE(wb->bw_time_stamp, now);
+       spin_unlock(&wb->list_lock);
 }
 
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb)
 {
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
 
-       __wb_update_bandwidth(&gdtc, NULL, start_time, false);
+       __wb_update_bandwidth(&gdtc, NULL, false);
+}
+
+/* Interval after which we consider wb idle and don't estimate bandwidth */
+#define WB_BANDWIDTH_IDLE_JIF (HZ)
+
+static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
+{
+       unsigned long now = jiffies;
+       unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
+
+       if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
+           !atomic_read(&wb->writeback_inodes)) {
+               spin_lock(&wb->list_lock);
+               wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
+               wb->written_stamp = wb_stat(wb, WB_WRITTEN);
+               WRITE_ONCE(wb->bw_time_stamp, now);
+               spin_unlock(&wb->list_lock);
+       }
 }
 
 /*
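
__wb_update_bandwidth() now takes wb->list_lock itself rather than asserting it, and both the 200ms rate limit and the quiet-period snapshot move out to the callers. Because callers no longer guarantee that BANDWIDTH_INTERVAL has passed, elapsed is clamped to at least one jiffy: both estimators use it as a divisor. Condensed roughly from wb_update_dirty_ratelimit() (existing context, not part of this patch):

	/* two updates landing within the same jiffy would otherwise divide by zero */
	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
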
@@ -1407,7 +1418,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                  unsigned long wb_dirty)
 {
-       unsigned long bw = wb->avg_write_bandwidth;
+       unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long t;
 
        /*
@@ -1429,8 +1440,8 @@ static long wb_min_pause(struct bdi_writeback *wb,
                         unsigned long dirty_ratelimit,
                         int *nr_dirtied_pause)
 {
-       long hi = ilog2(wb->avg_write_bandwidth);
-       long lo = ilog2(wb->dirty_ratelimit);
+       long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
+       long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
        long t;         /* target pause */
        long pause;     /* estimated next pause */
        int pages;      /* target nr_dirtied_pause */
@@ -1710,15 +1721,12 @@ free_running:
                if (dirty_exceeded && !wb->dirty_exceeded)
                        wb->dirty_exceeded = 1;
 
-               if (time_is_before_jiffies(wb->bw_time_stamp +
-                                          BANDWIDTH_INTERVAL)) {
-                       spin_lock(&wb->list_lock);
-                       __wb_update_bandwidth(gdtc, mdtc, start_time, true);
-                       spin_unlock(&wb->list_lock);
-               }
+               if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+                                          BANDWIDTH_INTERVAL))
+                       __wb_update_bandwidth(gdtc, mdtc, true);
 
                /* throttle according to the chosen dtc */
-               dirty_ratelimit = wb->dirty_ratelimit;
+               dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
                task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
                max_pause = wb_max_pause(wb, sdtc->wb_dirty);
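
Note the shape of the check that remains here: a racy READ_ONCE() of bw_time_stamp gates the call, and __wb_update_bandwidth() recomputes elapsed from the authoritative timestamp under wb->list_lock, so losing the race costs at most one redundant update. In outline:

	if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
				   BANDWIDTH_INTERVAL)) {	/* lockless, racy gate */
		spin_lock(&wb->list_lock);
		elapsed = max(jiffies - wb->bw_time_stamp, 1UL);	/* authoritative */
		/* ... update the estimates ... */
		spin_unlock(&wb->list_lock);
	}
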
@@ -2345,9 +2353,12 @@ EXPORT_SYMBOL(generic_writepages);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        int ret;
+       struct bdi_writeback *wb;
 
        if (wbc->nr_to_write <= 0)
                return 0;
+       wb = inode_to_wb_wbc(mapping->host, wbc);
+       wb_bandwidth_estimate_start(wb);
        while (1) {
                if (mapping->a_ops->writepages)
                        ret = mapping->a_ops->writepages(mapping, wbc);
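
do_writepages() now starts by resyncing the stamps via wb_bandwidth_estimate_start(), so a long idle period can't dilute the next bandwidth sample; the following hunk adds the matching refresh at the tail. inode_to_wb_wbc() is a small helper added elsewhere in this series; under CONFIG_CGROUP_WRITEBACK it is presumably along these lines (a sketch, not part of this hunk):

	static inline struct bdi_writeback *
	inode_to_wb_wbc(struct inode *inode, struct writeback_control *wbc)
	{
		/*
		 * If wbc has no wb attached (cgroup writeback was disabled
		 * when the wbc was set up), fall back to the bdi's root wb.
		 */
		return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb;
	}
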
@@ -2358,6 +2369,14 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
                cond_resched();
                congestion_wait(BLK_RW_ASYNC, HZ/50);
        }
+       /*
+        * Usually only a few of the pages we just submitted are written back
+        * by now, but if writeback is being submitted constantly, this makes
+        * sure the writeback bandwidth estimate is refreshed once in a while.
+        */
+       if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+                                  BANDWIDTH_INTERVAL))
+               wb_update_bandwidth(wb);
        return ret;
 }
 
@@ -2729,6 +2748,24 @@ int clear_page_dirty_for_io(struct page *page)
 }
 EXPORT_SYMBOL(clear_page_dirty_for_io);
 
+static void wb_inode_writeback_start(struct bdi_writeback *wb)
+{
+       atomic_inc(&wb->writeback_inodes);
+}
+
+static void wb_inode_writeback_end(struct bdi_writeback *wb)
+{
+       atomic_dec(&wb->writeback_inodes);
+       /*
+        * Make sure the estimate of writeback throughput gets updated after
+        * writeback completes. We delay the update by BANDWIDTH_INTERVAL
+        * (the interval other bandwidth updates use for batching) so that if
+        * multiple inodes end writeback at a similar time, they get batched
+        * into a single bandwidth update.
+        */
+       queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+}
+
 int test_clear_page_writeback(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
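
wb_inode_writeback_start()/end() maintain wb->writeback_inodes, the counter that wb_bandwidth_estimate_start() uses as its idleness test. The bw_dwork queued on the end side needs a handler that folds the completed IO into the estimate; the series adds one in mm/backing-dev.c, roughly like this (reconstructed sketch, not part of this hunk):

	static void wb_update_bandwidth_workfn(struct work_struct *work)
	{
		struct bdi_writeback *wb = container_of(to_delayed_work(work),
							struct bdi_writeback, bw_dwork);

		wb_update_bandwidth(wb);
	}
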
@@ -2750,6 +2787,9 @@ int test_clear_page_writeback(struct page *page)
 
                                dec_wb_stat(wb, WB_WRITEBACK);
                                __wb_writeout_inc(wb);
+                               if (!mapping_tagged(mapping,
+                                                   PAGECACHE_TAG_WRITEBACK))
+                                       wb_inode_writeback_end(wb);
                        }
                }
 
@@ -2792,8 +2832,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
                                                   PAGECACHE_TAG_WRITEBACK);
 
                        xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
-                       if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
-                               inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+                       if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+                               struct bdi_writeback *wb = inode_to_wb(inode);
+
+                               inc_wb_stat(wb, WB_WRITEBACK);
+                               if (!on_wblist)
+                                       wb_inode_writeback_start(wb);
+                       }
 
                        /*
                         * We can come through here when swapping anonymous
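
Taken together, the last two hunks keep the counter exact per inode: on the set side, on_wblist caches mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) before the new tag is written, so only the first page entering writeback increments it; on the clear side, the counter drops only once the tag disappears from the whole mapping. In outline:

	/*
	 * first page of an inode tagged PAGECACHE_TAG_WRITEBACK
	 *	-> wb_inode_writeback_start()	(atomic_inc)
	 * last page of the inode losing the tag
	 *	-> wb_inode_writeback_end()	(atomic_dec, plus a delayed
	 *	   wb_update_bandwidth() via bw_dwork)
	 */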