Revert "block/mq-deadline: Prioritize high-priority requests"
authorJens Axboe <axboe@kernel.dk>
Thu, 26 Aug 2021 18:59:44 +0000 (12:59 -0600)
committerJens Axboe <axboe@kernel.dk>
Thu, 26 Aug 2021 18:59:44 +0000 (12:59 -0600)
This reverts commit fb926032b3209300f9dc454a36b8299582ae545c.

Zhen reports that this commit slows down mq-deadline on a 128 thread
box, going from 258K IOPS to 170-180K. My testing shows that Optane
gen2 IOPS goes from 2.3M IOPS to 1.2M IOPS on a 64 thread box.

Looking in detail at the code, the main culprit here is needing to sum
percpu counters in the dispatch hot path, leading to very high CPU
utilization there. To make matters worse, the code currently needs to
sum 2 percpu counters, and it does so in the most naive way of iterating
possible CPUs _twice_.

Since we're close to release, revert this commit and we can re-do it
with regular per-priority counters instead for the 5.15 kernel.

Link: https://lore.kernel.org/linux-block/20210826144039.2143-1-thunder.leizhen@huawei.com/
Reported-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/mq-deadline.c

index 18dc8ef..3692067 100644 (file)
  */
 static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
 static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-/*
- * Time after which to dispatch lower priority requests even if higher
- * priority requests are pending.
- */
-static const int aging_expire = 10 * HZ;
 static const int writes_starved = 2;    /* max times reads can starve a write */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
                                     by the above parameters. For throughput. */
@@ -103,7 +98,6 @@ struct deadline_data {
        int writes_starved;
        int front_merges;
        u32 async_depth;
-       int aging_expire;
 
        spinlock_t lock;
        spinlock_t zone_lock;
@@ -369,11 +363,10 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 
 /*
  * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc and with a start time <= @latest.
+ * read/write expire, fifo_batch, etc
  */
 static struct request *__dd_dispatch_request(struct deadline_data *dd,
-                                            struct dd_per_prio *per_prio,
-                                            u64 latest_start_ns)
+                                            struct dd_per_prio *per_prio)
 {
        struct request *rq, *next_rq;
        enum dd_data_dir data_dir;
@@ -385,8 +378,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
        if (!list_empty(&per_prio->dispatch)) {
                rq = list_first_entry(&per_prio->dispatch, struct request,
                                      queuelist);
-               if (rq->start_time_ns > latest_start_ns)
-                       return NULL;
                list_del_init(&rq->queuelist);
                goto done;
        }
@@ -464,8 +455,6 @@ dispatch_find_request:
        dd->batching = 0;
 
 dispatch_request:
-       if (rq->start_time_ns > latest_start_ns)
-               return NULL;
        /*
         * rq is the selected appropriate request.
         */
@@ -494,32 +483,15 @@ done:
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-       const u64 now_ns = ktime_get_ns();
-       struct request *rq = NULL;
+       struct request *rq;
        enum dd_prio prio;
 
        spin_lock(&dd->lock);
-       /*
-        * Start with dispatching requests whose deadline expired more than
-        * aging_expire jiffies ago.
-        */
-       for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
-               rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns -
-                                          jiffies_to_nsecs(dd->aging_expire));
-               if (rq)
-                       goto unlock;
-       }
-       /*
-        * Next, dispatch requests in priority order. Ignore lower priority
-        * requests if any higher priority requests are pending.
-        */
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
-               rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns);
-               if (rq || dd_queued(dd, prio))
+               rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
+               if (rq)
                        break;
        }
-
-unlock:
        spin_unlock(&dd->lock);
 
        return rq;
@@ -620,7 +592,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
        dd->front_merges = 1;
        dd->last_dir = DD_WRITE;
        dd->fifo_batch = fifo_batch;
-       dd->aging_expire = aging_expire;
        spin_lock_init(&dd->lock);
        spin_lock_init(&dd->zone_lock);
 
@@ -842,7 +813,6 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)         \
 #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
 SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
 SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
-SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire);
 SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
 SHOW_INT(deadline_front_merges_show, dd->front_merges);
 SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -872,7 +842,6 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
        STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
 STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
 STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
-STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX);
 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
 STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -891,7 +860,6 @@ static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(front_merges),
        DD_ATTR(async_depth),
        DD_ATTR(fifo_batch),
-       DD_ATTR(aging_expire),
        __ATTR_NULL
 };