diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9e81d10..b398dde 100644
@@ -158,7 +158,6 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
-BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS
 
 /* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1024,9 +1023,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
        else
                bfq_clear_bfqq_IO_bound(bfqq);
 
+       bfqq->last_serv_time_ns = bic->saved_last_serv_time_ns;
+       bfqq->inject_limit = bic->saved_inject_limit;
+       bfqq->decrease_time_jif = bic->saved_decrease_time_jif;
+
        bfqq->entity.new_weight = bic->saved_weight;
        bfqq->ttime = bic->saved_ttime;
+       bfqq->io_start_time = bic->saved_io_start_time;
+       bfqq->tot_idle_time = bic->saved_tot_idle_time;
        bfqq->wr_coeff = bic->saved_wr_coeff;
+       bfqq->service_from_wr = bic->saved_service_from_wr;
        bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
        bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
        bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
@@ -1647,6 +1653,8 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
        return bfqq_weight > in_serv_weight;
 }
 
+static bool bfq_better_to_idle(struct bfq_queue *bfqq);
+
 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
                                             struct bfq_queue *bfqq,
                                             int old_wr_coeff,
@@ -1671,15 +1679,19 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
         * - it is sync,
         * - it does not belong to a large burst,
         * - it has been idle for enough time or is soft real-time,
-        * - is linked to a bfq_io_cq (it is not shared in any sense).
+        * - is linked to a bfq_io_cq (it is not shared in any sense),
+        * - has a default weight (otherwise we assume the user wanted
+        *   to control its weight explicitly)
         */
        in_burst = bfq_bfqq_in_large_burst(bfqq);
        soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
                !BFQQ_TOTALLY_SEEKY(bfqq) &&
                !in_burst &&
                time_is_before_jiffies(bfqq->soft_rt_next_start) &&
-               bfqq->dispatched == 0;
-       *interactive = !in_burst && idle_for_long_time;
+               bfqq->dispatched == 0 &&
+               bfqq->entity.new_weight == 40;
+       *interactive = !in_burst && idle_for_long_time &&
+               bfqq->entity.new_weight == 40;
        wr_or_deserves_wr = bfqd->low_latency &&
                (bfqq->wr_coeff > 1 ||
                 (bfq_bfqq_sync(bfqq) &&
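
A note on the magic number 40 used in the two new checks above: it is the
weight that the default best-effort I/O priority (class be, level 4) is
expected to map to, assuming the usual conversion weight =
(IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF with IOPRIO_BE_NR == 8
and BFQ_WEIGHT_CONVERSION_COEFF == 10. The minimal user-space sketch below
only illustrates that arithmetic; the macro values and the helper name are
assumptions, not taken from this patch.

	#include <stdio.h>

	#define IOPRIO_BE_NR			8	/* assumed kernel value */
	#define BFQ_WEIGHT_CONVERSION_COEFF	10	/* assumed kernel value */

	/* mirrors what bfq_ioprio_to_weight() is assumed to compute */
	static unsigned short ioprio_to_weight(int ioprio)
	{
		return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
	}

	int main(void)
	{
		/* level 4 is the default best-effort priority level */
		printf("default weight = %d\n", ioprio_to_weight(4)); /* prints 40 */
		return 0;
	}
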
@@ -1717,17 +1729,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 
        bfq_clear_bfqq_just_created(bfqq);
 
-
-       if (!bfq_bfqq_IO_bound(bfqq)) {
-               if (arrived_in_time) {
-                       bfqq->requests_within_timer++;
-                       if (bfqq->requests_within_timer >=
-                           bfqd->bfq_requests_within_timer)
-                               bfq_mark_bfqq_IO_bound(bfqq);
-               } else
-                       bfqq->requests_within_timer = 0;
-       }
-
        if (bfqd->low_latency) {
                if (unlikely(time_is_after_jiffies(bfqq->split_time)))
                        /* wraparound */
@@ -1755,10 +1756,10 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
        bfq_add_bfqq_busy(bfqd, bfqq);
 
        /*
-        * Expire in-service queue only if preemption may be needed
-        * for guarantees. In particular, we care only about two
-        * cases. The first is that bfqq has to recover a service
-        * hole, as explained in the comments on
+        * Expire in-service queue if preemption may be needed for
+        * guarantees or throughput. As for guarantees, we care
+        * explicitly about two cases. The first is that bfqq has to
+        * recover a service hole, as explained in the comments on
         * bfq_bfqq_update_budg_for_activation(), i.e., that
         * bfqq_wants_to_preempt is true. However, if bfqq does not
         * carry time-critical I/O, then bfqq's bandwidth is less
@@ -1785,11 +1786,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
         * timestamps of the in-service queue would need to be
         * updated, and this operation is quite costly (see the
         * comments on bfq_bfqq_update_budg_for_activation()).
+        *
+        * As for throughput, we ask bfq_better_to_idle() whether we
+        * still need to plug I/O dispatching. If bfq_better_to_idle()
+        * says no, then plugging is not needed any longer, either to
+        * boost throughput or to preserve service guarantees. Then
+        * the best option is to stop plugging I/O, as not doing so
+        * would certainly lower throughput. We may end up in this
+        * case if: (1) upon a dispatch attempt, we detected that it
+        * was better to plug I/O dispatch, and to wait for a new
+        * request to arrive for the currently in-service queue, but
+        * (2) this switch of bfqq to busy changes the scenario.
         */
        if (bfqd->in_service_queue &&
            ((bfqq_wants_to_preempt &&
              bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
-            bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
+            bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) ||
+            !bfq_better_to_idle(bfqd->in_service_queue)) &&
            next_queue_may_preempt(bfqd))
                bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
                                false, BFQQE_PREEMPTED);
@@ -1861,6 +1874,138 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd,
        bfqq->decrease_time_jif = jiffies;
 }
 
+static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
+{
+       u64 tot_io_time = now_ns - bfqq->io_start_time;
+
+       if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0)
+               bfqq->tot_idle_time +=
+                       now_ns - bfqq->ttime.last_end_request;
+
+       if (unlikely(bfq_bfqq_just_created(bfqq)))
+               return;
+
+       /*
+        * Must be busy for at least about 80% of the time to be
+        * considered I/O bound.
+        */
+       if (bfqq->tot_idle_time * 5 > tot_io_time)
+               bfq_clear_bfqq_IO_bound(bfqq);
+       else
+               bfq_mark_bfqq_IO_bound(bfqq);
+
+       /*
+        * Keep an observation window of at most 200 ms in the past
+        * from now.
+        */
+       if (tot_io_time > 200 * NSEC_PER_MSEC) {
+               bfqq->io_start_time = now_ns - (tot_io_time>>1);
+               bfqq->tot_idle_time >>= 1;
+       }
+}
+
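
The multiplication by 5 above encodes the 80% threshold mentioned in the
comment: tot_idle_time * 5 > tot_io_time holds exactly when the queue has
been idle for more than 20% of the observation window, i.e., busy for less
than 80% of it. A minimal user-space sketch of the same bookkeeping follows;
the struct and function names are stand-ins, not kernel symbols.

	#include <stdbool.h>
	#include <stdint.h>

	#define NSEC_PER_MSEC 1000000ULL

	struct io_intensity {
		uint64_t io_start_time;	/* start of the observation window (ns) */
		uint64_t tot_idle_time;	/* idle time accumulated in the window (ns) */
	};

	/* Returns true if the queue counts as I/O bound at time now_ns. */
	bool io_bound(struct io_intensity *s, uint64_t now_ns)
	{
		uint64_t tot_io_time = now_ns - s->io_start_time;
		/* idle for more than 20% of the window => not I/O bound */
		bool bound = !(s->tot_idle_time * 5 > tot_io_time);

		if (tot_io_time > 200 * NSEC_PER_MSEC) {
			/* keep roughly 200 ms of history: halve the window */
			s->io_start_time = now_ns - (tot_io_time >> 1);
			s->tot_idle_time >>= 1;
		}
		return bound;
	}
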
+/*
+ * Detect whether bfqq's I/O seems synchronized with that of some
+ * other queue, i.e., whether bfqq, after remaining empty, happens to
+ * receive new I/O only right after some I/O request of the other
+ * queue has been completed. We call waker queue the other queue, and
+ * we assume, for simplicity, that bfqq may have at most one waker
+ * queue.
+ *
+ * A remarkable throughput boost can be reached by unconditionally
+ * injecting the I/O of the waker queue, every time a new
+ * bfq_dispatch_request happens to be invoked while I/O is being
+ * plugged for bfqq.  In addition to boosting throughput, this
+ * unblocks bfqq's I/O, thereby improving bandwidth and latency for
+ * bfqq. Note that these same results may be achieved with the general
+ * injection mechanism, but less effectively. For details on this
+ * aspect, see the comments on the choice of the queue for injection
+ * in bfq_select_queue().
+ *
+ * Turning back to the detection of a waker queue, a queue Q is deemed
+ * as a waker queue for bfqq if, for three consecutive times, bfqq
+ * happens to become non empty right after a request of Q has been
+ * completed. In particular, on the first time, Q is tentatively set
+ * as a candidate waker queue, while on the third consecutive time
+ * that Q is detected, the field waker_bfqq is set to Q, to confirm
+ * that Q is a waker queue for bfqq. These detection steps are
+ * performed only if bfqq has a long think time, so as to make it more
+ * likely that bfqq's I/O is actually being blocked by a
+ * synchronization. This last filter, plus the above three-times
+ * requirement, make false positives less likely.
+ *
+ * NOTE
+ *
+ * The sooner a waker queue is detected, the sooner throughput can be
+ * boosted by injecting I/O from the waker queue. Fortunately,
+ * detection is likely to be actually fast, for the following
+ * reasons. While blocked by synchronization, bfqq has a long think
+ * time. This implies that bfqq's inject limit is at least equal to 1
+ * (see the comments in bfq_update_inject_limit()). So, thanks to
+ * injection, the waker queue is likely to be served during the very
+ * first I/O-plugging time interval for bfqq. This triggers the first
+ * step of the detection mechanism. Thanks again to injection, the
+ * candidate waker queue is then likely to be confirmed no later than
+ * during the next I/O-plugging interval for bfqq.
+ *
+ * ISSUE
+ *
+ * On queue merging all waker information is lost.
+ */
+static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                           u64 now_ns)
+{
+       if (!bfqd->last_completed_rq_bfqq ||
+           bfqd->last_completed_rq_bfqq == bfqq ||
+           bfq_bfqq_has_short_ttime(bfqq) ||
+           now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
+           bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
+               return;
+
+       if (bfqd->last_completed_rq_bfqq !=
+           bfqq->tentative_waker_bfqq) {
+               /*
+                * First synchronization detected with a
+                * candidate waker queue, or with a different
+                * candidate waker queue from the current one.
+                */
+               bfqq->tentative_waker_bfqq =
+                       bfqd->last_completed_rq_bfqq;
+               bfqq->num_waker_detections = 1;
+       } else /* Same tentative waker queue detected again */
+               bfqq->num_waker_detections++;
+
+       if (bfqq->num_waker_detections == 3) {
+               bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+               bfqq->tentative_waker_bfqq = NULL;
+
+               /*
+                * If the waker queue disappears, then
+                * bfqq->waker_bfqq must be reset. To
+                * this goal, we maintain in each
+                * waker queue a list, woken_list, of
+                * all the queues that reference the
+                * waker queue through their
+                * waker_bfqq pointer. When the waker
+                * queue exits, the waker_bfqq pointer
+                * of all the queues in the woken_list
+                * is reset.
+                *
+                * In addition, if bfqq is already in
+                * the woken_list of a waker queue,
+                * then, before being inserted into
+                * the woken_list of a new waker
+                * queue, bfqq must be removed from
+                * the woken_list of the old waker
+                * queue.
+                */
+               if (!hlist_unhashed(&bfqq->woken_list_node))
+                       hlist_del_init(&bfqq->woken_list_node);
+               hlist_add_head(&bfqq->woken_list_node,
+                              &bfqd->last_completed_rq_bfqq->woken_list);
+       }
+}
+
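
To make the three-times rule above concrete, here is a condensed user-space
model (assumed names, not kernel symbols) of the confirmation logic in
bfq_check_waker(): a candidate becomes the confirmed waker only after three
consecutive detections, and a detection for a different queue restarts the
count.

	struct waker_state {
		const void *tentative;	/* current candidate waker queue */
		const void *confirmed;	/* confirmed waker queue, if any */
		int detections;		/* consecutive hits for the candidate */
	};

	/* Called when the queue turns non-empty right after q completes a request. */
	void note_wakeup_after_completion(struct waker_state *w, const void *q)
	{
		if (q == w->confirmed)
			return;		/* q is already the confirmed waker */

		if (q != w->tentative) {
			/* new candidate (or first detection): restart the count */
			w->tentative = q;
			w->detections = 1;
		} else if (++w->detections == 3) {
			/* third consecutive detection: confirm the waker */
			w->confirmed = q;
			w->tentative = NULL;
		}
	}
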
 static void bfq_add_request(struct request *rq)
 {
        struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1868,117 +2013,14 @@ static void bfq_add_request(struct request *rq)
        struct request *next_rq, *prev;
        unsigned int old_wr_coeff = bfqq->wr_coeff;
        bool interactive = false;
+       u64 now_ns = ktime_get_ns();
 
        bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
        bfqq->queued[rq_is_sync(rq)]++;
        bfqd->queued++;
 
        if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
-               /*
-                * Detect whether bfqq's I/O seems synchronized with
-                * that of some other queue, i.e., whether bfqq, after
-                * remaining empty, happens to receive new I/O only
-                * right after some I/O request of the other queue has
-                * been completed. We call waker queue the other
-                * queue, and we assume, for simplicity, that bfqq may
-                * have at most one waker queue.
-                *
-                * A remarkable throughput boost can be reached by
-                * unconditionally injecting the I/O of the waker
-                * queue, every time a new bfq_dispatch_request
-                * happens to be invoked while I/O is being plugged
-                * for bfqq.  In addition to boosting throughput, this
-                * unblocks bfqq's I/O, thereby improving bandwidth
-                * and latency for bfqq. Note that these same results
-                * may be achieved with the general injection
-                * mechanism, but less effectively. For details on
-                * this aspect, see the comments on the choice of the
-                * queue for injection in bfq_select_queue().
-                *
-                * Turning back to the detection of a waker queue, a
-                * queue Q is deemed as a waker queue for bfqq if, for
-                * two consecutive times, bfqq happens to become non
-                * empty right after a request of Q has been
-                * completed. In particular, on the first time, Q is
-                * tentatively set as a candidate waker queue, while
-                * on the second time, the flag
-                * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
-                * is a waker queue for bfqq. These detection steps
-                * are performed only if bfqq has a long think time,
-                * so as to make it more likely that bfqq's I/O is
-                * actually being blocked by a synchronization. This
-                * last filter, plus the above two-times requirement,
-                * make false positives less likely.
-                *
-                * NOTE
-                *
-                * The sooner a waker queue is detected, the sooner
-                * throughput can be boosted by injecting I/O from the
-                * waker queue. Fortunately, detection is likely to be
-                * actually fast, for the following reasons. While
-                * blocked by synchronization, bfqq has a long think
-                * time. This implies that bfqq's inject limit is at
-                * least equal to 1 (see the comments in
-                * bfq_update_inject_limit()). So, thanks to
-                * injection, the waker queue is likely to be served
-                * during the very first I/O-plugging time interval
-                * for bfqq. This triggers the first step of the
-                * detection mechanism. Thanks again to injection, the
-                * candidate waker queue is then likely to be
-                * confirmed no later than during the next
-                * I/O-plugging interval for bfqq.
-                */
-               if (bfqd->last_completed_rq_bfqq &&
-                   !bfq_bfqq_has_short_ttime(bfqq) &&
-                   ktime_get_ns() - bfqd->last_completion <
-                   200 * NSEC_PER_USEC) {
-                       if (bfqd->last_completed_rq_bfqq != bfqq &&
-                           bfqd->last_completed_rq_bfqq !=
-                           bfqq->waker_bfqq) {
-                               /*
-                                * First synchronization detected with
-                                * a candidate waker queue, or with a
-                                * different candidate waker queue
-                                * from the current one.
-                                */
-                               bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
-
-                               /*
-                                * If the waker queue disappears, then
-                                * bfqq->waker_bfqq must be reset. To
-                                * this goal, we maintain in each
-                                * waker queue a list, woken_list, of
-                                * all the queues that reference the
-                                * waker queue through their
-                                * waker_bfqq pointer. When the waker
-                                * queue exits, the waker_bfqq pointer
-                                * of all the queues in the woken_list
-                                * is reset.
-                                *
-                                * In addition, if bfqq is already in
-                                * the woken_list of a waker queue,
-                                * then, before being inserted into
-                                * the woken_list of a new waker
-                                * queue, bfqq must be removed from
-                                * the woken_list of the old waker
-                                * queue.
-                                */
-                               if (!hlist_unhashed(&bfqq->woken_list_node))
-                                       hlist_del_init(&bfqq->woken_list_node);
-                               hlist_add_head(&bfqq->woken_list_node,
-                                   &bfqd->last_completed_rq_bfqq->woken_list);
-
-                               bfq_clear_bfqq_has_waker(bfqq);
-                       } else if (bfqd->last_completed_rq_bfqq ==
-                                  bfqq->waker_bfqq &&
-                                  !bfq_bfqq_has_waker(bfqq)) {
-                               /*
-                                * synchronization with waker_bfqq
-                                * seen for the second time
-                                */
-                               bfq_mark_bfqq_has_waker(bfqq);
-                       }
-               }
+               bfq_check_waker(bfqd, bfqq, now_ns);
 
                /*
                 * Periodically reset inject limit, to make sure that
@@ -2047,6 +2089,9 @@ static void bfq_add_request(struct request *rq)
                }
        }
 
+       if (bfq_bfqq_sync(bfqq))
+               bfq_update_io_intensity(bfqq, now_ns);
+
        elv_rb_add(&bfqq->sort_list, rq);
 
        /*
@@ -2352,6 +2397,24 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 /* Must be called with bfqq != NULL */
 static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
 {
+       /*
+        * If bfqq has been enjoying interactive weight-raising, then
+        * reset soft_rt_next_start. We do it for the following
+        * reason. bfqq may have been conveying the I/O needed to load
+        * a soft real-time application. Such an application actually
+        * exhibits a soft real-time I/O pattern after it finishes
+        * loading, and finally starts doing its job. But, if bfqq has
+        * been receiving a lot of bandwidth so far (likely to happen
+        * on a fast device), then soft_rt_next_start now contains a
+        * very high value. So, without this reset, bfqq would be
+        * prevented from possibly being considered as soft_rt for a
+        * very long time.
+        */
+
+       if (bfqq->wr_cur_max_time !=
+           bfqq->bfqd->bfq_wr_rt_max_time)
+               bfqq->soft_rt_next_start = jiffies;
+
        if (bfq_bfqq_busy(bfqq))
                bfqq->bfqd->wr_busy_queues--;
        bfqq->wr_coeff = 1;
@@ -2686,10 +2749,16 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
        if (!bic)
                return;
 
+       bic->saved_last_serv_time_ns = bfqq->last_serv_time_ns;
+       bic->saved_inject_limit = bfqq->inject_limit;
+       bic->saved_decrease_time_jif = bfqq->decrease_time_jif;
+
        bic->saved_weight = bfqq->entity.orig_weight;
        bic->saved_ttime = bfqq->ttime;
        bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
        bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+       bic->saved_io_start_time = bfqq->io_start_time;
+       bic->saved_tot_idle_time = bfqq->tot_idle_time;
        bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
        bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
        if (unlikely(bfq_bfqq_just_created(bfqq) &&
@@ -2712,6 +2781,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
                bic->saved_wr_coeff = bfqq->wr_coeff;
                bic->saved_wr_start_at_switch_to_srt =
                        bfqq->wr_start_at_switch_to_srt;
+               bic->saved_service_from_wr = bfqq->service_from_wr;
                bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
                bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
        }
@@ -2937,6 +3007,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
        }
 
        bfqd->in_service_queue = bfqq;
+       bfqd->in_serv_last_pos = 0;
 }
 
 /*
@@ -3442,20 +3513,38 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
  * order until all the requests already queued in the device have been
  * served. The last sub-condition commented above somewhat mitigates
  * this problem for weight-raised queues.
+ *
+ * However, as an additional mitigation for this problem, we preserve
+ * plugging for a special symmetric case that may suddenly turn into
+ * asymmetric: the case where only bfqq is busy. In this case, not
+ * expiring bfqq does not cause any harm to any other queues in terms
+ * of service guarantees. In contrast, it avoids the following unlucky
+ * sequence of events: (1) bfqq is expired, (2) a new queue with a
+ * lower weight than bfqq becomes busy (or more queues), (3) the new
+ * queue is served until a new request arrives for bfqq, (4) when bfqq
+ * is finally served, there are so many requests of the new queue in
+ * the drive that the pending requests for bfqq take a lot of time to
+ * be served. In particular, event (2) may cause even already
+ * dispatched requests of bfqq to be delayed, inside the drive. So, to
+ * avoid this series of events, the scenario is preventively declared
+ * as asymmetric also if bfqq is the only busy queue.
  */
 static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
                                                 struct bfq_queue *bfqq)
 {
+       int tot_busy_queues = bfq_tot_busy_queues(bfqd);
+
        /* No point in idling for bfqq if it won't get requests any longer */
        if (unlikely(!bfqq_process_refs(bfqq)))
                return false;
 
        return (bfqq->wr_coeff > 1 &&
                (bfqd->wr_busy_queues <
-                bfq_tot_busy_queues(bfqd) ||
+                tot_busy_queues ||
                 bfqd->rq_in_driver >=
                 bfqq->dispatched + 4)) ||
-               bfq_asymmetric_scenario(bfqd, bfqq);
+               bfq_asymmetric_scenario(bfqd, bfqq) ||
+               tot_busy_queues == 1;
 }
 
 static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -3939,10 +4028,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
              bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))
                bfq_bfqq_charge_time(bfqd, bfqq, delta);
 
-       if (reason == BFQQE_TOO_IDLE &&
-           entity->service <= 2 * entity->budget / 10)
-               bfq_clear_bfqq_IO_bound(bfqq);
-
        if (bfqd->low_latency && bfqq->wr_coeff == 1)
                bfqq->last_wr_start_finish = jiffies;
 
@@ -3952,30 +4037,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
                 * If we get here, and there are no outstanding
                 * requests, then the request pattern is isochronous
                 * (see the comments on the function
-                * bfq_bfqq_softrt_next_start()). Thus we can compute
-                * soft_rt_next_start. And we do it, unless bfqq is in
-                * interactive weight raising. We do not do it in the
-                * latter subcase, for the following reason. bfqq may
-                * be conveying the I/O needed to load a soft
-                * real-time application. Such an application will
-                * actually exhibit a soft real-time I/O pattern after
-                * it finally starts doing its job. But, if
-                * soft_rt_next_start is computed here for an
-                * interactive bfqq, and bfqq had received a lot of
-                * service before remaining with no outstanding
-                * request (likely to happen on a fast device), then
-                * soft_rt_next_start would be assigned such a high
-                * value that, for a very long time, bfqq would be
-                * prevented from being possibly considered as soft
-                * real time.
+                * bfq_bfqq_softrt_next_start()). Therefore we can
+                * compute soft_rt_next_start.
                 *
                 * If, instead, the queue still has outstanding
                 * requests, then we have to wait for the completion
                 * of all the outstanding requests to discover whether
                 * the request pattern is actually isochronous.
                 */
-               if (bfqq->dispatched == 0 &&
-                   bfqq->wr_coeff != bfqd->bfq_wr_coeff)
+               if (bfqq->dispatched == 0)
                        bfqq->soft_rt_next_start =
                                bfq_bfqq_softrt_next_start(bfqd, bfqq);
                else if (bfqq->dispatched > 0) {
@@ -4497,9 +4567,9 @@ check_queue:
                    bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
                    bfq_bfqq_budget_left(async_bfqq))
                        bfqq = bfqq->bic->bfqq[0];
-               else if (bfq_bfqq_has_waker(bfqq) &&
+               else if (bfqq->waker_bfqq &&
                           bfq_bfqq_busy(bfqq->waker_bfqq) &&
-                          bfqq->next_rq &&
+                          bfqq->waker_bfqq->next_rq &&
                           bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
                                              bfqq->waker_bfqq) <=
                           bfq_bfqq_budget_left(bfqq->waker_bfqq)
@@ -4559,9 +4629,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
                                                bfqq->wr_cur_max_time)) {
                        if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
                        time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
-                                              bfq_wr_duration(bfqd)))
+                                              bfq_wr_duration(bfqd))) {
+                               /*
+                                * Either in interactive weight
+                                * raising, or in soft_rt weight
+                                * raising with the
+                                * interactive-weight-raising period
+                                * elapsed (so no switch back to
+                                * interactive weight raising).
+                                */
                                bfq_bfqq_end_wr(bfqq);
-                       else {
+                       } else { /*
+                                 * soft_rt finishing while still in
+                                 * interactive period, switch back to
+                                 * interactive weight raising
+                                 */
                                switch_back_to_interactive_wr(bfqq, bfqd);
                                bfqq->entity.prio_changed = 1;
                        }
@@ -4640,9 +4722,6 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
        struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
-       if (!atomic_read(&hctx->elevator_queued))
-               return false;
-
        /*
         * Avoiding lock: a race on bfqd->busy_queues should cause at
         * most a call to dispatch for nothing
@@ -4892,7 +4971,6 @@ void bfq_put_queue(struct bfq_queue *bfqq)
        hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
                                  woken_list_node) {
                item->waker_bfqq = NULL;
-               bfq_clear_bfqq_has_waker(item);
                hlist_del_init(&item->woken_list_node);
        }
 
@@ -5012,6 +5090,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
        }
 
        bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+       bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d",
+                    bfqq->new_ioprio, bfqq->entity.new_weight);
        bfqq->entity.prio_changed = 1;
 }
 
@@ -5049,6 +5129,8 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                          struct bfq_io_cq *bic, pid_t pid, int is_sync)
 {
+       u64 now_ns = ktime_get_ns();
+
        RB_CLEAR_NODE(&bfqq->entity.rb_node);
        INIT_LIST_HEAD(&bfqq->fifo);
        INIT_HLIST_NODE(&bfqq->burst_list_node);
@@ -5076,7 +5158,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                bfq_clear_bfqq_sync(bfqq);
 
        /* set end request to minus infinity from now */
-       bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+       bfqq->ttime.last_end_request = now_ns + 1;
+
+       bfqq->io_start_time = now_ns;
 
        bfq_mark_bfqq_IO_bound(bfqq);
 
@@ -5194,11 +5278,19 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
                                    struct bfq_queue *bfqq)
 {
        struct bfq_ttime *ttime = &bfqq->ttime;
-       u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+       u64 elapsed;
 
+       /*
+        * We are really interested in how long it takes for the queue to
+        * become busy when there is no outstanding IO for this queue. So
+        * ignore cases when the bfq queue already has IO queued.
+        */
+       if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
+               return;
+       elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
        elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
 
-       ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
+       ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
        ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
        ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
                                     ttime->ttime_samples);
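
For reference, the update above is a fixed-point exponentially weighted
moving average: both the pseudo sample count and the total decay by 7/8 on
every update (the count saturating at 256), so ttime_mean tracks recent
completion-to-next-request gaps, each already capped at twice bfq_slice_idle.
A user-space sketch of the same arithmetic, with assumed names:

	#include <stdint.h>

	struct ttime_est {
		uint64_t samples;	/* decayed sample count, saturates at 256 */
		uint64_t total;		/* decayed, 256-scaled sum of samples (ns) */
		uint64_t mean;		/* current estimate (ns) */
	};

	void ttime_update(struct ttime_est *t, uint64_t elapsed_ns)
	{
		t->samples = (7 * t->samples + 256) / 8;	/* never zero after this */
		t->total = (7 * t->total + 256 * elapsed_ns) / 8;
		t->mean = (t->total + 128) / t->samples;	/* rounded mean */
	}
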
@@ -5213,8 +5305,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
        if (bfqq->wr_coeff > 1 &&
            bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
-           BFQQ_TOTALLY_SEEKY(bfqq))
-               bfq_bfqq_end_wr(bfqq);
+           BFQQ_TOTALLY_SEEKY(bfqq)) {
+               if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+                                          bfq_wr_duration(bfqd))) {
+                       /*
+                        * In soft_rt weight raising with the
+                        * interactive-weight-raising period
+                        * elapsed (so no switch back to
+                        * interactive weight raising).
+                        */
+                       bfq_bfqq_end_wr(bfqq);
+               } else { /*
+                         * stopping soft_rt weight raising
+                         * while still in interactive period,
+                         * switch back to interactive weight
+                         * raising
+                         */
+                       switch_back_to_interactive_wr(bfqq, bfqd);
+                       bfqq->entity.prio_changed = 1;
+               }
+       }
 }
 
 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -5238,12 +5348,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
                return;
 
        /* Think time is infinite if no process is linked to
-        * bfqq. Otherwise check average think time to
-        * decide whether to mark as has_short_ttime
+        * bfqq. Otherwise check average think time to decide whether
+        * to mark as has_short_ttime. To this goal, compare average
+        * think time with half the I/O-plugging timeout.
         */
        if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
            (bfq_sample_valid(bfqq->ttime.ttime_samples) &&
-            bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
+            bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1))
                has_short_ttime = false;
 
        state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
@@ -5557,7 +5668,6 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
                rq = list_first_entry(list, struct request, queuelist);
                list_del_init(&rq->queuelist);
                bfq_insert_request(hctx, rq, at_head);
-               atomic_inc(&hctx->elevator_queued);
        }
 }
 
@@ -5925,7 +6035,6 @@ static void bfq_finish_requeue_request(struct request *rq)
 
                bfq_completed_request(bfqq, bfqd);
                bfq_finish_requeue_request_body(bfqq);
-               atomic_dec(&rq->mq_hctx->elevator_queued);
 
                spin_unlock_irqrestore(&bfqd->lock, flags);
        } else {
@@ -6489,8 +6598,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
        bfqd->bfq_slice_idle = bfq_slice_idle;
        bfqd->bfq_timeout = bfq_timeout;
 
-       bfqd->bfq_requests_within_timer = 120;
-
        bfqd->bfq_large_burst_thresh = 8;
        bfqd->bfq_burst_interval = msecs_to_jiffies(180);