mm/backing-dev.c
1 // SPDX-License-Identifier: GPL-2.0-only
2
3 #include <linux/wait.h>
4 #include <linux/rbtree.h>
5 #include <linux/backing-dev.h>
6 #include <linux/kthread.h>
7 #include <linux/freezer.h>
8 #include <linux/fs.h>
9 #include <linux/pagemap.h>
10 #include <linux/mm.h>
11 #include <linux/sched.h>
12 #include <linux/module.h>
13 #include <linux/writeback.h>
14 #include <linux/device.h>
15 #include <trace/events/writeback.h>
16
17 struct backing_dev_info noop_backing_dev_info;
18 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
19
20 static struct class *bdi_class;
21 static const char *bdi_unknown_name = "(unknown)";
22
23 /*
24  * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
25  * reader side locking.
26  */
27 DEFINE_SPINLOCK(bdi_lock);
28 static u64 bdi_id_cursor;
29 static struct rb_root bdi_tree = RB_ROOT;
30 LIST_HEAD(bdi_list);
31
32 /* bdi_wq serves all asynchronous writeback tasks */
33 struct workqueue_struct *bdi_wq;
34
35 #define K(x) ((x) << (PAGE_SHIFT - 10))
36
37 #ifdef CONFIG_DEBUG_FS
38 #include <linux/debugfs.h>
39 #include <linux/seq_file.h>
40
41 static struct dentry *bdi_debug_root;
42
43 static void bdi_debug_init(void)
44 {
45         bdi_debug_root = debugfs_create_dir("bdi", NULL);
46 }
47
48 static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 {
50         struct backing_dev_info *bdi = m->private;
51         struct bdi_writeback *wb = &bdi->wb;
52         unsigned long background_thresh;
53         unsigned long dirty_thresh;
54         unsigned long wb_thresh;
55         unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
56         struct inode *inode;
57
58         nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
59         spin_lock(&wb->list_lock);
60         list_for_each_entry(inode, &wb->b_dirty, i_io_list)
61                 nr_dirty++;
62         list_for_each_entry(inode, &wb->b_io, i_io_list)
63                 nr_io++;
64         list_for_each_entry(inode, &wb->b_more_io, i_io_list)
65                 nr_more_io++;
66         list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
67                 if (inode->i_state & I_DIRTY_TIME)
68                         nr_dirty_time++;
69         spin_unlock(&wb->list_lock);
70
71         global_dirty_limits(&background_thresh, &dirty_thresh);
72         wb_thresh = wb_calc_thresh(wb, dirty_thresh);
73
74         seq_printf(m,
75                    "BdiWriteback:       %10lu kB\n"
76                    "BdiReclaimable:     %10lu kB\n"
77                    "BdiDirtyThresh:     %10lu kB\n"
78                    "DirtyThresh:        %10lu kB\n"
79                    "BackgroundThresh:   %10lu kB\n"
80                    "BdiDirtied:         %10lu kB\n"
81                    "BdiWritten:         %10lu kB\n"
82                    "BdiWriteBandwidth:  %10lu kBps\n"
83                    "b_dirty:            %10lu\n"
84                    "b_io:               %10lu\n"
85                    "b_more_io:          %10lu\n"
86                    "b_dirty_time:       %10lu\n"
87                    "bdi_list:           %10u\n"
88                    "state:              %10lx\n",
89                    (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
90                    (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
91                    K(wb_thresh),
92                    K(dirty_thresh),
93                    K(background_thresh),
94                    (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
95                    (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
96                    (unsigned long) K(wb->write_bandwidth),
97                    nr_dirty,
98                    nr_io,
99                    nr_more_io,
100                    nr_dirty_time,
101                    !list_empty(&bdi->bdi_list), bdi->wb.state);
102
103         return 0;
104 }
105 DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
106
107 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
108 {
109         bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
110
111         debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
112                             &bdi_debug_stats_fops);
113 }
114
115 static void bdi_debug_unregister(struct backing_dev_info *bdi)
116 {
117         debugfs_remove_recursive(bdi->debug_dir);
118 }
119 #else
120 static inline void bdi_debug_init(void)
121 {
122 }
123 static inline void bdi_debug_register(struct backing_dev_info *bdi,
124                                       const char *name)
125 {
126 }
127 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
128 {
129 }
130 #endif
131
132 static ssize_t read_ahead_kb_store(struct device *dev,
133                                   struct device_attribute *attr,
134                                   const char *buf, size_t count)
135 {
136         struct backing_dev_info *bdi = dev_get_drvdata(dev);
137         unsigned long read_ahead_kb;
138         ssize_t ret;
139
140         ret = kstrtoul(buf, 10, &read_ahead_kb);
141         if (ret < 0)
142                 return ret;
143
144         bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
145
146         return count;
147 }
148
149 #define BDI_SHOW(name, expr)                                            \
150 static ssize_t name##_show(struct device *dev,                          \
151                            struct device_attribute *attr, char *buf)    \
152 {                                                                       \
153         struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
154                                                                         \
155         return sysfs_emit(buf, "%lld\n", (long long)expr);              \
156 }                                                                       \
157 static DEVICE_ATTR_RW(name);
158
159 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
160
161 static ssize_t min_ratio_store(struct device *dev,
162                 struct device_attribute *attr, const char *buf, size_t count)
163 {
164         struct backing_dev_info *bdi = dev_get_drvdata(dev);
165         unsigned int ratio;
166         ssize_t ret;
167
168         ret = kstrtouint(buf, 10, &ratio);
169         if (ret < 0)
170                 return ret;
171
172         ret = bdi_set_min_ratio(bdi, ratio);
173         if (!ret)
174                 ret = count;
175
176         return ret;
177 }
178 BDI_SHOW(min_ratio, bdi->min_ratio)
179
180 static ssize_t max_ratio_store(struct device *dev,
181                 struct device_attribute *attr, const char *buf, size_t count)
182 {
183         struct backing_dev_info *bdi = dev_get_drvdata(dev);
184         unsigned int ratio;
185         ssize_t ret;
186
187         ret = kstrtouint(buf, 10, &ratio);
188         if (ret < 0)
189                 return ret;
190
191         ret = bdi_set_max_ratio(bdi, ratio);
192         if (!ret)
193                 ret = count;
194
195         return ret;
196 }
197 BDI_SHOW(max_ratio, bdi->max_ratio)
198
199 static ssize_t stable_pages_required_show(struct device *dev,
200                                           struct device_attribute *attr,
201                                           char *buf)
202 {
203         dev_warn_once(dev,
204                 "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
205         return sysfs_emit(buf, "%d\n", 0);
206 }
207 static DEVICE_ATTR_RO(stable_pages_required);
208
209 static struct attribute *bdi_dev_attrs[] = {
210         &dev_attr_read_ahead_kb.attr,
211         &dev_attr_min_ratio.attr,
212         &dev_attr_max_ratio.attr,
213         &dev_attr_stable_pages_required.attr,
214         NULL,
215 };
216 ATTRIBUTE_GROUPS(bdi_dev);
217
218 static __init int bdi_class_init(void)
219 {
220         bdi_class = class_create(THIS_MODULE, "bdi");
221         if (IS_ERR(bdi_class))
222                 return PTR_ERR(bdi_class);
223
224         bdi_class->dev_groups = bdi_dev_groups;
225         bdi_debug_init();
226
227         return 0;
228 }
229 postcore_initcall(bdi_class_init);
230
231 static int bdi_init(struct backing_dev_info *bdi);
232
233 static int __init default_bdi_init(void)
234 {
235         int err;
236
237         bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
238                                  WQ_SYSFS, 0);
239         if (!bdi_wq)
240                 return -ENOMEM;
241
242         err = bdi_init(&noop_backing_dev_info);
243
244         return err;
245 }
246 subsys_initcall(default_bdi_init);
247
248 /*
249  * This function is used when the first inode for this wb is marked dirty. It
250  * wakes up the corresponding bdi thread which should then take care of the
251  * periodic background write-out of dirty inodes. Since the write-out would
252  * start only 'dirty_writeback_interval' centisecs from now anyway, we just
253  * set up a timer which wakes the bdi thread up later.
254  *
255  * Note, we wouldn't bother setting up the timer, but this function is on the
256  * fast-path (used by '__mark_inode_dirty()'), so we save a few context switches
257  * by delaying the wake-up.
258  *
259  * We have to be careful not to postpone flush work if it is scheduled for
260  * earlier. Thus we use queue_delayed_work().
261  */
262 void wb_wakeup_delayed(struct bdi_writeback *wb)
263 {
264         unsigned long timeout;
265
266         timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
267         spin_lock_bh(&wb->work_lock);
268         if (test_bit(WB_registered, &wb->state))
269                 queue_delayed_work(bdi_wq, &wb->dwork, timeout);
270         spin_unlock_bh(&wb->work_lock);
271 }
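/*
 * Illustrative sketch, not part of this file: the typical caller is
 * __mark_inode_dirty() in fs/fs-writeback.c, which arms the delayed wakeup
 * once it has queued the first dirty inode on a wb.  Only the wakeup
 * decision is shown; the example_* name and wakeup_bdi flag handling are
 * assumptions made for the sketch.
 *
 *	static void example_first_inode_dirtied(struct bdi_writeback *wb,
 *						bool wakeup_bdi)
 *	{
 *		if (wakeup_bdi &&
 *		    (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
 *			wb_wakeup_delayed(wb);
 *	}
 */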
272
273 /*
274  * Initial write bandwidth: 100 MB/s
275  */
276 #define INIT_BW         (100 << (20 - PAGE_SHIFT))
277
278 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
279                    gfp_t gfp)
280 {
281         int i, err;
282
283         memset(wb, 0, sizeof(*wb));
284
285         if (wb != &bdi->wb)
286                 bdi_get(bdi);
287         wb->bdi = bdi;
288         wb->last_old_flush = jiffies;
289         INIT_LIST_HEAD(&wb->b_dirty);
290         INIT_LIST_HEAD(&wb->b_io);
291         INIT_LIST_HEAD(&wb->b_more_io);
292         INIT_LIST_HEAD(&wb->b_dirty_time);
293         spin_lock_init(&wb->list_lock);
294
295         wb->bw_time_stamp = jiffies;
296         wb->balanced_dirty_ratelimit = INIT_BW;
297         wb->dirty_ratelimit = INIT_BW;
298         wb->write_bandwidth = INIT_BW;
299         wb->avg_write_bandwidth = INIT_BW;
300
301         spin_lock_init(&wb->work_lock);
302         INIT_LIST_HEAD(&wb->work_list);
303         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
304         wb->dirty_sleep = jiffies;
305
306         err = fprop_local_init_percpu(&wb->completions, gfp);
307         if (err)
308                 goto out_put_bdi;
309
310         for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
311                 err = percpu_counter_init(&wb->stat[i], 0, gfp);
312                 if (err)
313                         goto out_destroy_stat;
314         }
315
316         return 0;
317
318 out_destroy_stat:
319         while (i--)
320                 percpu_counter_destroy(&wb->stat[i]);
321         fprop_local_destroy_percpu(&wb->completions);
322 out_put_bdi:
323         if (wb != &bdi->wb)
324                 bdi_put(bdi);
325         return err;
326 }
327
328 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
329
330 /*
331  * Remove bdi from the global list and shut down any threads we have running
332  */
333 static void wb_shutdown(struct bdi_writeback *wb)
334 {
335         /* Make sure nobody queues further work */
336         spin_lock_bh(&wb->work_lock);
337         if (!test_and_clear_bit(WB_registered, &wb->state)) {
338                 spin_unlock_bh(&wb->work_lock);
339                 return;
340         }
341         spin_unlock_bh(&wb->work_lock);
342
343         cgwb_remove_from_bdi_list(wb);
344         /*
345          * Drain work list and shutdown the delayed_work.  !WB_registered
346          * tells wb_workfn() that @wb is dying and its work_list needs to
347          * be drained no matter what.
348          */
349         mod_delayed_work(bdi_wq, &wb->dwork, 0);
350         flush_delayed_work(&wb->dwork);
351         WARN_ON(!list_empty(&wb->work_list));
352 }
353
354 static void wb_exit(struct bdi_writeback *wb)
355 {
356         int i;
357
358         WARN_ON(delayed_work_pending(&wb->dwork));
359
360         for (i = 0; i < NR_WB_STAT_ITEMS; i++)
361                 percpu_counter_destroy(&wb->stat[i]);
362
363         fprop_local_destroy_percpu(&wb->completions);
364         if (wb != &wb->bdi->wb)
365                 bdi_put(wb->bdi);
366 }
367
368 #ifdef CONFIG_CGROUP_WRITEBACK
369
370 #include <linux/memcontrol.h>
371
372 /*
373  * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
374  * bdi->cgwb_tree is also RCU protected.
375  */
376 static DEFINE_SPINLOCK(cgwb_lock);
377 static struct workqueue_struct *cgwb_release_wq;
378
379 static void cgwb_release_workfn(struct work_struct *work)
380 {
381         struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
382                                                 release_work);
383         struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
384
385         mutex_lock(&wb->bdi->cgwb_release_mutex);
386         wb_shutdown(wb);
387
388         css_put(wb->memcg_css);
389         css_put(wb->blkcg_css);
390         mutex_unlock(&wb->bdi->cgwb_release_mutex);
391
392         /* triggers blkg destruction if no online users left */
393         blkcg_unpin_online(blkcg);
394
395         fprop_local_destroy_percpu(&wb->memcg_completions);
396         percpu_ref_exit(&wb->refcnt);
397         wb_exit(wb);
398         kfree_rcu(wb, rcu);
399 }
400
401 static void cgwb_release(struct percpu_ref *refcnt)
402 {
403         struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
404                                                 refcnt);
405         queue_work(cgwb_release_wq, &wb->release_work);
406 }
407
408 static void cgwb_kill(struct bdi_writeback *wb)
409 {
410         lockdep_assert_held(&cgwb_lock);
411
412         WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
413         list_del(&wb->memcg_node);
414         list_del(&wb->blkcg_node);
415         percpu_ref_kill(&wb->refcnt);
416 }
417
418 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
419 {
420         spin_lock_irq(&cgwb_lock);
421         list_del_rcu(&wb->bdi_node);
422         spin_unlock_irq(&cgwb_lock);
423 }
424
425 static int cgwb_create(struct backing_dev_info *bdi,
426                        struct cgroup_subsys_state *memcg_css, gfp_t gfp)
427 {
428         struct mem_cgroup *memcg;
429         struct cgroup_subsys_state *blkcg_css;
430         struct blkcg *blkcg;
431         struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
432         struct bdi_writeback *wb;
433         unsigned long flags;
434         int ret = 0;
435
436         memcg = mem_cgroup_from_css(memcg_css);
437         blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
438         blkcg = css_to_blkcg(blkcg_css);
439         memcg_cgwb_list = &memcg->cgwb_list;
440         blkcg_cgwb_list = &blkcg->cgwb_list;
441
442         /* look up again under lock and discard on blkcg mismatch */
443         spin_lock_irqsave(&cgwb_lock, flags);
444         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
445         if (wb && wb->blkcg_css != blkcg_css) {
446                 cgwb_kill(wb);
447                 wb = NULL;
448         }
449         spin_unlock_irqrestore(&cgwb_lock, flags);
450         if (wb)
451                 goto out_put;
452
453         /* need to create a new one */
454         wb = kmalloc(sizeof(*wb), gfp);
455         if (!wb) {
456                 ret = -ENOMEM;
457                 goto out_put;
458         }
459
460         ret = wb_init(wb, bdi, gfp);
461         if (ret)
462                 goto err_free;
463
464         ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
465         if (ret)
466                 goto err_wb_exit;
467
468         ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
469         if (ret)
470                 goto err_ref_exit;
471
472         wb->memcg_css = memcg_css;
473         wb->blkcg_css = blkcg_css;
474         INIT_WORK(&wb->release_work, cgwb_release_workfn);
475         set_bit(WB_registered, &wb->state);
476
477         /*
478          * The root wb determines the registered state of the whole bdi and
479          * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
480          * whether they're still online.  Don't link @wb if any is dead.
481          * See wb_memcg_offline() and wb_blkcg_offline().
482          */
483         ret = -ENODEV;
484         spin_lock_irqsave(&cgwb_lock, flags);
485         if (test_bit(WB_registered, &bdi->wb.state) &&
486             blkcg_cgwb_list->next && memcg_cgwb_list->next) {
487                 /* we might have raced another instance of this function */
488                 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
489                 if (!ret) {
490                         list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
491                         list_add(&wb->memcg_node, memcg_cgwb_list);
492                         list_add(&wb->blkcg_node, blkcg_cgwb_list);
493                         blkcg_pin_online(blkcg);
494                         css_get(memcg_css);
495                         css_get(blkcg_css);
496                 }
497         }
498         spin_unlock_irqrestore(&cgwb_lock, flags);
499         if (ret) {
500                 if (ret == -EEXIST)
501                         ret = 0;
502                 goto err_fprop_exit;
503         }
504         goto out_put;
505
506 err_fprop_exit:
507         fprop_local_destroy_percpu(&wb->memcg_completions);
508 err_ref_exit:
509         percpu_ref_exit(&wb->refcnt);
510 err_wb_exit:
511         wb_exit(wb);
512 err_free:
513         kfree(wb);
514 out_put:
515         css_put(blkcg_css);
516         return ret;
517 }
518
519 /**
520  * wb_get_lookup - get wb for a given memcg
521  * @bdi: target bdi
522  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
523  *
524  * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
525  * refcount incremented.
526  *
527  * This function uses css_get() on @memcg_css and thus expects its refcnt
528  * to be positive on invocation.  IOW, rcu_read_lock() protection on
529  * @memcg_css isn't enough.  try_get it before calling this function.
530  *
531  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
532  * memcg on the default hierarchy, memcg association is guaranteed to be
533  * more specific (equal to or a descendant of the associated blkcg) and thus can
534  * identify both the memcg and blkcg associations.
535  *
536  * Because the blkcg associated with a memcg may change as blkcg is enabled
537  * and disabled closer to root in the hierarchy, each wb keeps track of
538  * both the memcg and blkcg associated with it and verifies the blkcg on
539  * each lookup.  On mismatch, the existing wb is discarded and a new one is
540  * created.
541  */
542 struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
543                                     struct cgroup_subsys_state *memcg_css)
544 {
545         struct bdi_writeback *wb;
546
547         if (!memcg_css->parent)
548                 return &bdi->wb;
549
550         rcu_read_lock();
551         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
552         if (wb) {
553                 struct cgroup_subsys_state *blkcg_css;
554
555                 /* see whether the blkcg association has changed */
556                 blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
557                 if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
558                         wb = NULL;
559                 css_put(blkcg_css);
560         }
561         rcu_read_unlock();
562
563         return wb;
564 }
565
566 /**
567  * wb_get_create - get wb for a given memcg, create if necessary
568  * @bdi: target bdi
569  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
570  * @gfp: allocation mask to use
571  *
572  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
573  * create one.  See wb_get_lookup() for more details.
574  */
575 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
576                                     struct cgroup_subsys_state *memcg_css,
577                                     gfp_t gfp)
578 {
579         struct bdi_writeback *wb;
580
581         might_sleep_if(gfpflags_allow_blocking(gfp));
582
583         if (!memcg_css->parent)
584                 return &bdi->wb;
585
586         do {
587                 wb = wb_get_lookup(bdi, memcg_css);
588         } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
589
590         return wb;
591 }
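/*
 * Minimal usage sketch (illustrative only): look up or create the wb for a
 * memcg/bdi pair and drop the reference when done.  It assumes the caller
 * already holds a reference on the memcg css, as wb_get_lookup() requires;
 * the example_* identifier is invented for the sketch.
 *
 *	static void example_attach_wb(struct backing_dev_info *bdi,
 *				      struct cgroup_subsys_state *memcg_css)
 *	{
 *		struct bdi_writeback *wb;
 *
 *		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 *		if (!wb)
 *			wb = &bdi->wb;
 *
 *		... use wb for per-cgroup writeback accounting ...
 *
 *		if (wb != &bdi->wb)
 *			wb_put(wb);
 *	}
 */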
592
593 static int cgwb_bdi_init(struct backing_dev_info *bdi)
594 {
595         int ret;
596
597         INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
598         mutex_init(&bdi->cgwb_release_mutex);
599         init_rwsem(&bdi->wb_switch_rwsem);
600
601         ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
602         if (!ret) {
603                 bdi->wb.memcg_css = &root_mem_cgroup->css;
604                 bdi->wb.blkcg_css = blkcg_root_css;
605         }
606         return ret;
607 }
608
609 static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
610 {
611         struct radix_tree_iter iter;
612         void **slot;
613         struct bdi_writeback *wb;
614
615         WARN_ON(test_bit(WB_registered, &bdi->wb.state));
616
617         spin_lock_irq(&cgwb_lock);
618         radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
619                 cgwb_kill(*slot);
620         spin_unlock_irq(&cgwb_lock);
621
622         mutex_lock(&bdi->cgwb_release_mutex);
623         spin_lock_irq(&cgwb_lock);
624         while (!list_empty(&bdi->wb_list)) {
625                 wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
626                                       bdi_node);
627                 spin_unlock_irq(&cgwb_lock);
628                 wb_shutdown(wb);
629                 spin_lock_irq(&cgwb_lock);
630         }
631         spin_unlock_irq(&cgwb_lock);
632         mutex_unlock(&bdi->cgwb_release_mutex);
633 }
634
635 /**
636  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
637  * @memcg: memcg being offlined
638  *
639  * Also prevents creation of any new wb's associated with @memcg.
640  */
641 void wb_memcg_offline(struct mem_cgroup *memcg)
642 {
643         struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
644         struct bdi_writeback *wb, *next;
645
646         spin_lock_irq(&cgwb_lock);
647         list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
648                 cgwb_kill(wb);
649         memcg_cgwb_list->next = NULL;   /* prevent new wb's */
650         spin_unlock_irq(&cgwb_lock);
651 }
652
653 /**
654  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
655  * @blkcg: blkcg being offlined
656  *
657  * Also prevents creation of any new wb's associated with @blkcg.
658  */
659 void wb_blkcg_offline(struct blkcg *blkcg)
660 {
661         struct bdi_writeback *wb, *next;
662
663         spin_lock_irq(&cgwb_lock);
664         list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
665                 cgwb_kill(wb);
666         blkcg->cgwb_list.next = NULL;   /* prevent new wb's */
667         spin_unlock_irq(&cgwb_lock);
668 }
669
670 static void cgwb_bdi_register(struct backing_dev_info *bdi)
671 {
672         spin_lock_irq(&cgwb_lock);
673         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
674         spin_unlock_irq(&cgwb_lock);
675 }
676
677 static int __init cgwb_init(void)
678 {
679         /*
680          * There can be many concurrent release work items overwhelming
681          * system_wq.  Put them in a separate wq and limit concurrency.
682          * There's no point in executing many of these in parallel.
683          */
684         cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
685         if (!cgwb_release_wq)
686                 return -ENOMEM;
687
688         return 0;
689 }
690 subsys_initcall(cgwb_init);
691
692 #else   /* CONFIG_CGROUP_WRITEBACK */
693
694 static int cgwb_bdi_init(struct backing_dev_info *bdi)
695 {
696         return wb_init(&bdi->wb, bdi, GFP_KERNEL);
697 }
698
699 static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
700
701 static void cgwb_bdi_register(struct backing_dev_info *bdi)
702 {
703         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
704 }
705
706 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
707 {
708         list_del_rcu(&wb->bdi_node);
709 }
710
711 #endif  /* CONFIG_CGROUP_WRITEBACK */
712
713 static int bdi_init(struct backing_dev_info *bdi)
714 {
715         int ret;
716
717         bdi->dev = NULL;
718
719         kref_init(&bdi->refcnt);
720         bdi->min_ratio = 0;
721         bdi->max_ratio = 100;
722         bdi->max_prop_frac = FPROP_FRAC_BASE;
723         INIT_LIST_HEAD(&bdi->bdi_list);
724         INIT_LIST_HEAD(&bdi->wb_list);
725         init_waitqueue_head(&bdi->wb_waitq);
726
727         ret = cgwb_bdi_init(bdi);
728
729         return ret;
730 }
731
732 struct backing_dev_info *bdi_alloc(int node_id)
733 {
734         struct backing_dev_info *bdi;
735
736         bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
737         if (!bdi)
738                 return NULL;
739
740         if (bdi_init(bdi)) {
741                 kfree(bdi);
742                 return NULL;
743         }
744         bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
745         bdi->ra_pages = VM_READAHEAD_PAGES;
746         bdi->io_pages = VM_READAHEAD_PAGES;
747         return bdi;
748 }
749 EXPORT_SYMBOL(bdi_alloc);
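/*
 * Illustrative life-cycle sketch (not code used by this file): a driver or
 * block-layer caller allocates a bdi, registers it under a device name and
 * drops its reference on teardown.  The "%u:%u" name format mirrors how
 * block devices are typically registered; the example_* helpers are
 * assumptions made for the sketch.
 *
 *	struct backing_dev_info *example_bdi_setup(int node, dev_t devt)
 *	{
 *		struct backing_dev_info *bdi;
 *
 *		bdi = bdi_alloc(node);
 *		if (!bdi)
 *			return NULL;
 *
 *		if (bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt))) {
 *			bdi_put(bdi);
 *			return NULL;
 *		}
 *		return bdi;
 *	}
 *
 *	void example_bdi_teardown(struct backing_dev_info *bdi)
 *	{
 *		bdi_unregister(bdi);
 *		bdi_put(bdi);
 *	}
 */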
750
751 static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
752 {
753         struct rb_node **p = &bdi_tree.rb_node;
754         struct rb_node *parent = NULL;
755         struct backing_dev_info *bdi;
756
757         lockdep_assert_held(&bdi_lock);
758
759         while (*p) {
760                 parent = *p;
761                 bdi = rb_entry(parent, struct backing_dev_info, rb_node);
762
763                 if (bdi->id > id)
764                         p = &(*p)->rb_left;
765                 else if (bdi->id < id)
766                         p = &(*p)->rb_right;
767                 else
768                         break;
769         }
770
771         if (parentp)
772                 *parentp = parent;
773         return p;
774 }
775
776 /**
777  * bdi_get_by_id - lookup and get bdi from its id
778  * @id: bdi id to lookup
779  *
780  * Find bdi matching @id and get it.  Returns NULL if the matching bdi
781  * doesn't exist or is already unregistered.
782  */
783 struct backing_dev_info *bdi_get_by_id(u64 id)
784 {
785         struct backing_dev_info *bdi = NULL;
786         struct rb_node **p;
787
788         spin_lock_bh(&bdi_lock);
789         p = bdi_lookup_rb_node(id, NULL);
790         if (*p) {
791                 bdi = rb_entry(*p, struct backing_dev_info, rb_node);
792                 bdi_get(bdi);
793         }
794         spin_unlock_bh(&bdi_lock);
795
796         return bdi;
797 }
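/*
 * Usage sketch (illustrative, identifiers invented): a caller that only
 * holds a bdi id, e.g. one recorded for foreign-inode writeback, resolves
 * and pins the bdi like this and must drop the reference with bdi_put().
 *
 *	static int example_writeback_by_id(u64 id)
 *	{
 *		struct backing_dev_info *bdi;
 *
 *		bdi = bdi_get_by_id(id);
 *		if (!bdi)
 *			return -ENOENT;
 *
 *		... kick writeback on bdi->wb (or a member wb) ...
 *
 *		bdi_put(bdi);
 *		return 0;
 *	}
 */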
798
799 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
800 {
801         struct device *dev;
802         struct rb_node *parent, **p;
803
804         if (bdi->dev)   /* The driver needs to use separate queues per device */
805                 return 0;
806
807         vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
808         dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
809         if (IS_ERR(dev))
810                 return PTR_ERR(dev);
811
812         cgwb_bdi_register(bdi);
813         bdi->dev = dev;
814
815         bdi_debug_register(bdi, dev_name(dev));
816         set_bit(WB_registered, &bdi->wb.state);
817
818         spin_lock_bh(&bdi_lock);
819
820         bdi->id = ++bdi_id_cursor;
821
822         p = bdi_lookup_rb_node(bdi->id, &parent);
823         rb_link_node(&bdi->rb_node, parent, p);
824         rb_insert_color(&bdi->rb_node, &bdi_tree);
825
826         list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
827
828         spin_unlock_bh(&bdi_lock);
829
830         trace_writeback_bdi_register(bdi);
831         return 0;
832 }
833
834 int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
835 {
836         va_list args;
837         int ret;
838
839         va_start(args, fmt);
840         ret = bdi_register_va(bdi, fmt, args);
841         va_end(args);
842         return ret;
843 }
844 EXPORT_SYMBOL(bdi_register);
845
846 void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
847 {
848         WARN_ON_ONCE(bdi->owner);
849         bdi->owner = owner;
850         get_device(owner);
851 }
852
853 /*
854  * Remove bdi from bdi_list, and ensure that it is no longer visible
855  */
856 static void bdi_remove_from_list(struct backing_dev_info *bdi)
857 {
858         spin_lock_bh(&bdi_lock);
859         rb_erase(&bdi->rb_node, &bdi_tree);
860         list_del_rcu(&bdi->bdi_list);
861         spin_unlock_bh(&bdi_lock);
862
863         synchronize_rcu_expedited();
864 }
865
866 void bdi_unregister(struct backing_dev_info *bdi)
867 {
868         /* make sure nobody finds us on the bdi_list anymore */
869         bdi_remove_from_list(bdi);
870         wb_shutdown(&bdi->wb);
871         cgwb_bdi_unregister(bdi);
872
873         if (bdi->dev) {
874                 bdi_debug_unregister(bdi);
875                 device_unregister(bdi->dev);
876                 bdi->dev = NULL;
877         }
878
879         if (bdi->owner) {
880                 put_device(bdi->owner);
881                 bdi->owner = NULL;
882         }
883 }
884
885 static void release_bdi(struct kref *ref)
886 {
887         struct backing_dev_info *bdi =
888                         container_of(ref, struct backing_dev_info, refcnt);
889
890         if (test_bit(WB_registered, &bdi->wb.state))
891                 bdi_unregister(bdi);
892         WARN_ON_ONCE(bdi->dev);
893         wb_exit(&bdi->wb);
894         kfree(bdi);
895 }
896
897 void bdi_put(struct backing_dev_info *bdi)
898 {
899         kref_put(&bdi->refcnt, release_bdi);
900 }
901 EXPORT_SYMBOL(bdi_put);
902
903 const char *bdi_dev_name(struct backing_dev_info *bdi)
904 {
905         if (!bdi || !bdi->dev)
906                 return bdi_unknown_name;
907         return bdi->dev_name;
908 }
909 EXPORT_SYMBOL_GPL(bdi_dev_name);
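/*
 * Example (hedged sketch): bdi_dev_name() tolerates a NULL or not yet
 * registered bdi, which makes it handy in diagnostics and tracepoints.
 * The message below is made up for illustration.
 *
 *	pr_warn("slow writeback on %s\n", bdi_dev_name(bdi));
 */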
910
911 static wait_queue_head_t congestion_wqh[2] = {
912                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
913                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
914         };
915 static atomic_t nr_wb_congested[2];
916
917 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
918 {
919         wait_queue_head_t *wqh = &congestion_wqh[sync];
920         enum wb_congested_state bit;
921
922         bit = sync ? WB_sync_congested : WB_async_congested;
923         if (test_and_clear_bit(bit, &bdi->wb.congested))
924                 atomic_dec(&nr_wb_congested[sync]);
925         smp_mb__after_atomic();
926         if (waitqueue_active(wqh))
927                 wake_up(wqh);
928 }
929 EXPORT_SYMBOL(clear_bdi_congested);
930
931 void set_bdi_congested(struct backing_dev_info *bdi, int sync)
932 {
933         enum wb_congested_state bit;
934
935         bit = sync ? WB_sync_congested : WB_async_congested;
936         if (!test_and_set_bit(bit, &bdi->wb.congested))
937                 atomic_inc(&nr_wb_congested[sync]);
938 }
939 EXPORT_SYMBOL(set_bdi_congested);
940
941 /**
942  * congestion_wait - wait for a backing_dev to become uncongested
943  * @sync: SYNC or ASYNC IO
944  * @timeout: timeout in jiffies
945  *
946  * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
947  * write congestion.  If no backing_devs are congested then just wait for the
948  * next write to be completed.
949  */
950 long congestion_wait(int sync, long timeout)
951 {
952         long ret;
953         unsigned long start = jiffies;
954         DEFINE_WAIT(wait);
955         wait_queue_head_t *wqh = &congestion_wqh[sync];
956
957         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
958         ret = io_schedule_timeout(timeout);
959         finish_wait(wqh, &wait);
960
961         trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
962                                         jiffies_to_usecs(jiffies - start));
963
964         return ret;
965 }
966 EXPORT_SYMBOL(congestion_wait);
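/*
 * Typical call pattern (illustrative): reclaim and allocation paths back
 * off briefly while writeback is congested, e.g.
 *
 *	congestion_wait(BLK_RW_ASYNC, HZ / 10);
 *
 * which sleeps for at most 100ms or until clear_bdi_congested() wakes the
 * congestion queue, whichever comes first.
 */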
967
968 /**
969  * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested
970  * @sync: SYNC or ASYNC IO
971  * @timeout: timeout in jiffies
972  *
973  * In the event of a congested backing_dev (any backing_dev) this waits
974  * for up to @timeout jiffies for either a BDI to exit congestion of the
975  * given @sync queue or a write to complete.
976  *
977  * The return value is 0 if the sleep is for the full timeout. Otherwise,
978  * it is the number of jiffies that were still remaining when the function
979  * returned. return_value == timeout implies the function did not sleep.
980  */
981 long wait_iff_congested(int sync, long timeout)
982 {
983         long ret;
984         unsigned long start = jiffies;
985         DEFINE_WAIT(wait);
986         wait_queue_head_t *wqh = &congestion_wqh[sync];
987
988         /*
989          * If there is no congestion, yield if necessary instead
990          * of sleeping on the congestion queue
991          */
992         if (atomic_read(&nr_wb_congested[sync]) == 0) {
993                 cond_resched();
994
995                 /* In case we scheduled, work out time remaining */
996                 ret = timeout - (jiffies - start);
997                 if (ret < 0)
998                         ret = 0;
999
1000                 goto out;
1001         }
1002
1003         /* Sleep until uncongested or a write happens */
1004         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1005         ret = io_schedule_timeout(timeout);
1006         finish_wait(wqh, &wait);
1007
1008 out:
1009         trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
1010                                         jiffies_to_usecs(jiffies - start));
1011
1012         return ret;
1013 }
1014 EXPORT_SYMBOL(wait_iff_congested);
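/*
 * Usage sketch (illustrative): unlike congestion_wait(), this only sleeps
 * when some bdi is actually congested, so a throttling site such as direct
 * reclaim can call e.g.
 *
 *	wait_iff_congested(BLK_RW_ASYNC, HZ / 10);
 *
 * and return almost immediately (after a cond_resched()) when nothing is
 * congested.
 */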