Merge branch 'opp/defer-probe' into HEAD
[linux-2.6-microblaze.git] / block / blk-cgroup.c
index 930212c..619a79b 100644
@@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu)
        css_put(&blkg->blkcg->css);
        if (blkg->parent)
                blkg_put(blkg->parent);
-
-       wb_congested_put(blkg->wb_congested);
-
        blkg_free(blkg);
 }
 
@@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct blkcg_gq *new_blkg)
 {
        struct blkcg_gq *blkg;
-       struct bdi_writeback_congested *wb_congested;
        int i, ret;
 
        WARN_ON_ONCE(!rcu_read_lock_held());
@@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                goto err_free_blkg;
        }
 
-       wb_congested = wb_congested_get_create(q->backing_dev_info,
-                                              blkcg->css.id,
-                                              GFP_NOWAIT | __GFP_NOWARN);
-       if (!wb_congested) {
-               ret = -ENOMEM;
-               goto err_put_css;
-       }
-
        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
-                       goto err_put_congested;
+                       goto err_put_css;
                }
        }
        blkg = new_blkg;
-       blkg->wb_congested = wb_congested;
 
        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
-                       goto err_put_congested;
+                       goto err_put_css;
                }
                blkg_get(blkg->parent);
        }
@@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
        blkg_put(blkg);
        return ERR_PTR(ret);
 
-err_put_congested:
-       wb_congested_put(wb_congested);
 err_put_css:
        css_put(&blkcg->css);
 err_free_blkg:
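With the wb_congested step removed, the unwind labels above only have to drop
the css reference and free the preallocated blkg.  A minimal standalone sketch
of that goto-unwind idiom, with made-up acquire()/setup() names rather than
kernel APIs:

#include <stdlib.h>

struct res { int dummy; };

static struct res *acquire(void)
{
        return calloc(1, sizeof(struct res));
}

/*
 * Each failure jumps to the label that undoes exactly what has already been
 * acquired, in reverse order -- the same shape as the error path above.
 */
static int setup(struct res **pa, struct res **pb)
{
        struct res *a, *b;
        int ret;

        a = acquire();
        if (!a)
                return -1;

        b = acquire();
        if (!b) {
                ret = -1;
                goto err_free_a;
        }

        *pa = a;
        *pb = b;
        return 0;

err_free_a:
        free(a);
        return ret;
}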
@@ -316,30 +301,35 @@ err_free_blkg:
 }
 
 /**
- * __blkg_lookup_create - lookup blkg, try to create one if not there
+ * blkg_lookup_create - lookup blkg, try to create one if not there
  * @blkcg: blkcg of interest
  * @q: request_queue of interest
  *
  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
  * create one.  blkg creation is performed recursively from blkcg_root such
  * that all non-root blkg's have access to the parent blkg.  This function
- * should be called under RCU read lock and @q->queue_lock.
+ * should be called under RCU read lock and takes @q->queue_lock.
  *
  * Returns the blkg or the closest blkg if blkg_create() fails as it walks
  * down from root.
  */
-struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
-                                     struct request_queue *q)
+static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+               struct request_queue *q)
 {
        struct blkcg_gq *blkg;
+       unsigned long flags;
 
        WARN_ON_ONCE(!rcu_read_lock_held());
-       lockdep_assert_held(&q->queue_lock);
 
-       blkg = __blkg_lookup(blkcg, q, true);
+       blkg = blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;
 
+       spin_lock_irqsave(&q->queue_lock, flags);
+       blkg = __blkg_lookup(blkcg, q, true);
+       if (blkg)
+               goto found;
+
        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
@@ -362,34 +352,16 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
                }
 
                blkg = blkg_create(pos, q, NULL);
-               if (IS_ERR(blkg))
-                       return ret_blkg;
+               if (IS_ERR(blkg)) {
+                       blkg = ret_blkg;
+                       break;
+               }
                if (pos == blkcg)
-                       return blkg;
-       }
-}
-
-/**
- * blkg_lookup_create - find or create a blkg
- * @blkcg: target block cgroup
- * @q: target request_queue
- *
- * This looks up or creates the blkg representing the unique pair
- * of the blkcg and the request_queue.
- */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-                                   struct request_queue *q)
-{
-       struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
-
-       if (unlikely(!blkg)) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&q->queue_lock, flags);
-               blkg = __blkg_lookup_create(blkcg, q);
-               spin_unlock_irqrestore(&q->queue_lock, flags);
+                       break;
        }
 
+found:
+       spin_unlock_irqrestore(&q->queue_lock, flags);
        return blkg;
 }
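The reworked blkg_lookup_create() above folds the old __blkg_lookup_create()
into one function: an unlocked fast-path lookup, then q->queue_lock is taken
and the lookup repeated before anything is created.  A minimal userspace
sketch of that lookup-then-lock-then-recheck pattern, with a pthread mutex
standing in for queue_lock and a plain unlocked read standing in for the
RCU-protected fast path (all names here are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct obj {
        int key;
        struct obj *next;
};

static struct obj *table;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *lookup(int key)
{
        struct obj *o;

        for (o = table; o; o = o->next)
                if (o->key == key)
                        return o;
        return NULL;
}

static struct obj *lookup_create(int key)
{
        struct obj *o = lookup(key);    /* fast path, no lock */

        if (o)
                return o;

        pthread_mutex_lock(&table_lock);
        o = lookup(key);                /* re-check under the lock */
        if (!o) {
                o = calloc(1, sizeof(*o));
                if (o) {
                        o->key = key;
                        o->next = table;
                        table = o;
                }
        }
        pthread_mutex_unlock(&table_lock);
        return o;
}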
 
@@ -739,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+       int i;
+
+       for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+               dst->bytes[i] = src->bytes[i];
+               dst->ios[i] = src->ios[i];
+       }
+}
+
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+       int i;
+
+       for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+               dst->bytes[i] += src->bytes[i];
+               dst->ios[i] += src->ios[i];
+       }
+}
+
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+       int i;
+
+       for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+               dst->bytes[i] -= src->bytes[i];
+               dst->ios[i] -= src->ios[i];
+       }
+}
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+       struct blkcg *blkcg = css_to_blkcg(css);
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               struct blkcg_gq *parent = blkg->parent;
+               struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
+               struct blkg_iostat cur, delta;
+               unsigned int seq;
+
+               /* fetch the current per-cpu values */
+               do {
+                       seq = u64_stats_fetch_begin(&bisc->sync);
+                       blkg_iostat_set(&cur, &bisc->cur);
+               } while (u64_stats_fetch_retry(&bisc->sync, seq));
+
+               /* propagate percpu delta to global */
+               u64_stats_update_begin(&blkg->iostat.sync);
+               blkg_iostat_set(&delta, &cur);
+               blkg_iostat_sub(&delta, &bisc->last);
+               blkg_iostat_add(&blkg->iostat.cur, &delta);
+               blkg_iostat_add(&bisc->last, &delta);
+               u64_stats_update_end(&blkg->iostat.sync);
+
+               /* propagate global delta to parent */
+               if (parent) {
+                       u64_stats_update_begin(&parent->iostat.sync);
+                       blkg_iostat_set(&delta, &blkg->iostat.cur);
+                       blkg_iostat_sub(&delta, &blkg->iostat.last);
+                       blkg_iostat_add(&parent->iostat.cur, &delta);
+                       blkg_iostat_add(&blkg->iostat.last, &delta);
+                       u64_stats_update_end(&parent->iostat.sync);
+               }
+       }
+
+       rcu_read_unlock();
+}
+
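blkcg_rstat_flush() only ever forwards deltas: each level remembers what it
has already pushed upward ("last") and propagates cur - last, so repeated
flushes never double-count.  A tiny standalone model of that bookkeeping,
without the per-cpu data or u64_stats seqcount protection of the real code
(names are illustrative):

#include <stdio.h>

struct node {
        struct node *parent;
        long long cur, last;
};

static void flush_to_parent(struct node *n)
{
        long long delta = n->cur - n->last;

        n->last += delta;
        if (n->parent)
                n->parent->cur += delta;
}

int main(void)
{
        struct node root = { 0 };
        struct node child = { .parent = &root };

        child.cur = 100;
        flush_to_parent(&child);
        child.cur = 140;
        flush_to_parent(&child);
        /* prints root=140 child.last=140 -- nothing counted twice */
        printf("root=%lld child.last=%lld\n", root.cur, child.last);
        return 0;
}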
+/*
+ * The rstat algorithms intentionally don't handle the root cgroup to avoid
+ * incurring overhead when no cgroups are defined. For that reason,
+ * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the
+ * iostat in the root cgroup's blkcg_gq.
+ *
+ * However, we would like to re-use the printing code between the root and
+ * non-root cgroups to the extent possible. For that reason, we simulate
+ * flushing the root cgroup's stats by explicitly filling in the iostat
+ * with disk level statistics.
+ */
+static void blkcg_fill_root_iostats(void)
+{
+       struct class_dev_iter iter;
+       struct device *dev;
+
+       class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+       while ((dev = class_dev_iter_next(&iter))) {
+               struct gendisk *disk = dev_to_disk(dev);
+               struct hd_struct *part = disk_get_part(disk, 0);
+               struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
+               struct blkg_iostat tmp;
+               int cpu;
+
+               memset(&tmp, 0, sizeof(tmp));
+               for_each_possible_cpu(cpu) {
+                       struct disk_stats *cpu_dkstats;
+
+                       cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
+                       tmp.ios[BLKG_IOSTAT_READ] +=
+                               cpu_dkstats->ios[STAT_READ];
+                       tmp.ios[BLKG_IOSTAT_WRITE] +=
+                               cpu_dkstats->ios[STAT_WRITE];
+                       tmp.ios[BLKG_IOSTAT_DISCARD] +=
+                               cpu_dkstats->ios[STAT_DISCARD];
+                       /* convert sectors to bytes */
+                       tmp.bytes[BLKG_IOSTAT_READ] +=
+                               cpu_dkstats->sectors[STAT_READ] << 9;
+                       tmp.bytes[BLKG_IOSTAT_WRITE] +=
+                               cpu_dkstats->sectors[STAT_WRITE] << 9;
+                       tmp.bytes[BLKG_IOSTAT_DISCARD] +=
+                               cpu_dkstats->sectors[STAT_DISCARD] << 9;
+
+                       u64_stats_update_begin(&blkg->iostat.sync);
+                       blkg_iostat_set(&blkg->iostat.cur, &tmp);
+                       u64_stats_update_end(&blkg->iostat.sync);
+               }
+       }
+}
+
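The "<< 9" shifts above are the usual 512-byte sector-to-byte conversion.  A
trivial standalone check of that arithmetic:

#include <assert.h>

int main(void)
{
        unsigned long long sectors = 8;

        /* one sector is 512 bytes, so a left shift by 9 multiplies by 512 */
        assert((sectors << 9) == sectors * 512ULL);
        assert((sectors << 9) == 4096ULL);
        return 0;
}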
 static int blkcg_print_stat(struct seq_file *sf, void *v)
 {
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;
 
-       cgroup_rstat_flush(blkcg->css.cgroup);
+       if (!seq_css(sf)->parent)
+               blkcg_fill_root_iostats();
+       else
+               cgroup_rstat_flush(blkcg->css.cgroup);
+
        rcu_read_lock();
 
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
@@ -833,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 static struct cftype blkcg_files[] = {
        {
                .name = "stat",
-               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = blkcg_print_stat,
        },
        { }     /* terminate */
@@ -1025,7 +1121,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
  * blkcg_init_queue - initialize blkcg part of request queue
  * @q: request_queue to initialize
  *
- * Called from __blk_alloc_queue(). Responsible for initializing blkcg
+ * Called from blk_alloc_queue(). Responsible for initializing blkcg
  * part of new request_queue @q.
  *
  * RETURNS:
@@ -1114,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset)
        return ret;
 }
 
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
-       int i;
-
-       for (i = 0; i < BLKG_IOSTAT_NR; i++) {
-               dst->bytes[i] = src->bytes[i];
-               dst->ios[i] = src->ios[i];
-       }
-}
-
-static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
-       int i;
-
-       for (i = 0; i < BLKG_IOSTAT_NR; i++) {
-               dst->bytes[i] += src->bytes[i];
-               dst->ios[i] += src->ios[i];
-       }
-}
-
-static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
-       int i;
-
-       for (i = 0; i < BLKG_IOSTAT_NR; i++) {
-               dst->bytes[i] -= src->bytes[i];
-               dst->ios[i] -= src->ios[i];
-       }
-}
-
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
-{
-       struct blkcg *blkcg = css_to_blkcg(css);
-       struct blkcg_gq *blkg;
-
-       rcu_read_lock();
-
-       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
-               struct blkcg_gq *parent = blkg->parent;
-               struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
-               struct blkg_iostat cur, delta;
-               unsigned seq;
-
-               /* fetch the current per-cpu values */
-               do {
-                       seq = u64_stats_fetch_begin(&bisc->sync);
-                       blkg_iostat_set(&cur, &bisc->cur);
-               } while (u64_stats_fetch_retry(&bisc->sync, seq));
-
-               /* propagate percpu delta to global */
-               u64_stats_update_begin(&blkg->iostat.sync);
-               blkg_iostat_set(&delta, &cur);
-               blkg_iostat_sub(&delta, &bisc->last);
-               blkg_iostat_add(&blkg->iostat.cur, &delta);
-               blkg_iostat_add(&bisc->last, &delta);
-               u64_stats_update_end(&blkg->iostat.sync);
-
-               /* propagate global delta to parent */
-               if (parent) {
-                       u64_stats_update_begin(&parent->iostat.sync);
-                       blkg_iostat_set(&delta, &blkg->iostat.cur);
-                       blkg_iostat_sub(&delta, &blkg->iostat.last);
-                       blkg_iostat_add(&parent->iostat.cur, &delta);
-                       blkg_iostat_add(&blkg->iostat.last, &delta);
-                       u64_stats_update_end(&parent->iostat.sync);
-               }
-       }
-
-       rcu_read_unlock();
-}
-
 static void blkcg_bind(struct cgroup_subsys_state *root_css)
 {
        int i;
@@ -1530,6 +1555,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
 {
        u64 old = atomic64_read(&blkg->delay_start);
 
+       /* negative use_delay means no scaling, see blkcg_set_delay() */
+       if (atomic_read(&blkg->use_delay) < 0)
+               return;
+
        /*
         * We only want to scale down every second.  The idea here is that we
         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
@@ -1717,10 +1746,145 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
  */
 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 {
+       if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+               return;
        blkcg_scale_delay(blkg, now);
        atomic64_add(delta, &blkg->delay_nsec);
 }
 
+/**
+ * blkg_tryget_closest - try and get a blkg ref on the closest blkg
+ * @bio: target bio
+ * @css: target css
+ *
+ * As the failure mode here is to walk up the blkg tree, this ensures that the
+ * blkg->parent pointers are always valid.  This returns the blkg that it ended
+ * up taking a reference on or %NULL if no reference was taken.
+ */
+static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
+               struct cgroup_subsys_state *css)
+{
+       struct blkcg_gq *blkg, *ret_blkg = NULL;
+
+       rcu_read_lock();
+       blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
+       while (blkg) {
+               if (blkg_tryget(blkg)) {
+                       ret_blkg = blkg;
+                       break;
+               }
+               blkg = blkg->parent;
+       }
+       rcu_read_unlock();
+
+       return ret_blkg;
+}
+
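blkg_tryget_closest() falls back to the parent whenever the reference on a
dying blkg cannot be taken.  A small standalone model of that
walk-up-until-pinned pattern, with a hypothetical node/tryget pair rather than
the kernel's percpu-ref machinery:

#include <stddef.h>

struct node {
        struct node *parent;
        int refcount;           /* 0 means the node is being torn down */
};

static int tryget(struct node *n)
{
        if (n->refcount == 0)
                return 0;
        n->refcount++;
        return 1;
}

/* Returns the closest ancestor (including @n itself) that could be pinned. */
static struct node *tryget_closest(struct node *n)
{
        while (n) {
                if (tryget(n))
                        return n;
                n = n->parent;
        }
        return NULL;
}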
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
+ *
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio.  An association failure is handled by walking up
+ * the blkg tree.  Therefore, the blkg associated can be anything between the
+ * blkg of @css and q->root_blkg.  This situation only happens when a cgroup is
+ * dying and then the remaining bios will spill to the closest alive blkg.
+ *
+ * A reference will be taken on the blkg and will be released when @bio is
+ * freed.
+ */
+void bio_associate_blkg_from_css(struct bio *bio,
+                                struct cgroup_subsys_state *css)
+{
+       if (bio->bi_blkg)
+               blkg_put(bio->bi_blkg);
+
+       if (css && css->parent) {
+               bio->bi_blkg = blkg_tryget_closest(bio, css);
+       } else {
+               blkg_get(bio->bi_disk->queue->root_blkg);
+               bio->bi_blkg = bio->bi_disk->queue->root_blkg;
+       }
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and request_queue.
+ * If one is not found, blkg_lookup_create() creates the blkg.  If a blkg is
+ * already associated, the css is reused and association redone as the
+ * request_queue may have changed.
+ */
+void bio_associate_blkg(struct bio *bio)
+{
+       struct cgroup_subsys_state *css;
+
+       rcu_read_lock();
+
+       if (bio->bi_blkg)
+               css = &bio_blkcg(bio)->css;
+       else
+               css = blkcg_css();
+
+       bio_associate_blkg_from_css(bio, css);
+
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
+
+/**
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
+ * @dst: destination bio
+ * @src: source bio
+ */
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+{
+       if (src->bi_blkg) {
+               if (dst->bi_blkg)
+                       blkg_put(dst->bi_blkg);
+               blkg_get(src->bi_blkg);
+               dst->bi_blkg = src->bi_blkg;
+       }
+}
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
+
+static int blk_cgroup_io_type(struct bio *bio)
+{
+       if (op_is_discard(bio->bi_opf))
+               return BLKG_IOSTAT_DISCARD;
+       if (op_is_write(bio->bi_opf))
+               return BLKG_IOSTAT_WRITE;
+       return BLKG_IOSTAT_READ;
+}
+
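The ordering above matters: discard operations also register as writes under
op_is_write(), so the discard check has to come before the write check.  A
standalone sketch of the same precedence using made-up flag values:

#include <stdio.h>

enum { MY_WRITE = 1 << 0, MY_DISCARD = 1 << 1 };
enum { IO_READ, IO_WRITE, IO_DISCARD };

static int io_type(unsigned int opf)
{
        if (opf & MY_DISCARD)
                return IO_DISCARD;
        if (opf & MY_WRITE)
                return IO_WRITE;
        return IO_READ;
}

int main(void)
{
        /* prints 0 1 2: read, write, and discard taking precedence over write */
        printf("%d %d %d\n", io_type(0), io_type(MY_WRITE),
               io_type(MY_WRITE | MY_DISCARD));
        return 0;
}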
+void blk_cgroup_bio_start(struct bio *bio)
+{
+       int rwd = blk_cgroup_io_type(bio), cpu;
+       struct blkg_iostat_set *bis;
+
+       cpu = get_cpu();
+       bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
+       u64_stats_update_begin(&bis->sync);
+
+       /*
+        * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
+        * bio and we would have already accounted for the size of the bio.
+        */
+       if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+               bio_set_flag(bio, BIO_CGROUP_ACCT);
+               bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+       }
+       bis->cur.ios[rwd]++;
+
+       u64_stats_update_end(&bis->sync);
+       if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+               cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+       put_cpu();
+}
+
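blk_cgroup_bio_start() charges the byte count only once per original bio
(guarded by BIO_CGROUP_ACCT) while the I/O count goes up each time a split
fragment of it is restarted.  A standalone model of that once-per-bio byte
accounting, with hypothetical types and none of the per-cpu or cgroup
plumbing:

#include <stdbool.h>
#include <stdio.h>

struct mybio {
        unsigned int size;
        bool acct_done;         /* stand-in for BIO_CGROUP_ACCT */
};

static unsigned long long bytes, ios;

static void account(struct mybio *bio)
{
        if (!bio->acct_done) {
                bio->acct_done = true;
                bytes += bio->size;     /* size charged only once */
        }
        ios++;                          /* every submission counts an I/O */
}

int main(void)
{
        struct mybio bio = { .size = 1 << 20 };

        account(&bio);          /* original submission */
        account(&bio);          /* remainder resubmitted after a split */
        printf("bytes=%llu ios=%llu\n", bytes, ios);    /* 1048576 2 */
        return 0;
}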
 static int __init blkcg_init(void)
 {
        blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",