block, bfq: reduce threshold for detecting command queueing

[linux-2.6-microblaze.git] / block / bfq-iosched.c
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

index cd30776..48b5790 100644 (file)
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -230,11 +230,16 @@ static struct kmem_cache *bfq_pool;
  #define BFQ_MIN_TT             (2 * NSEC_PER_MSEC)
  
  /* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_THRESHOLD 3
  #define BFQ_HW_QUEUE_SAMPLES   32
  
  #define BFQQ_SEEK_THR          (sector_t)(8 * 100)
  #define BFQQ_SECT_THR_NONROT   (sector_t)(2 * 32)
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
+       (get_sdist(last_pos, rq) >                      \
+        BFQQ_SEEK_THR &&                               \
+        (!blk_queue_nonrot(bfqd->queue) ||             \
+         blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
  #define BFQQ_CLOSE_THR         (sector_t)(8 * 1024)
  #define BFQQ_SEEKY(bfqq)       (hweight32(bfqq->seek_history) > 19)
  
@@ -623,26 +628,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
                 bfqq->pos_root = NULL;
  }
  
-/*
- * Tell whether there are active queues with different weights or
- * active groups.
- */
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
-{
-       /*
-        * For queue weights to differ, queue_weights_tree must contain
-        * at least two nodes.
-        */
-       return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
-               (bfqd->queue_weights_tree.rb_node->rb_left ||
-                bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-              ) ||
-               (bfqd->num_groups_with_pending_reqs > 0
-#endif
-              );
-}
-
  /*
   * The following function returns true if every queue must receive the
   * same share of the throughput (this condition is used when deciding
@@ -651,25 +636,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
   *
   * Such a scenario occurs when:
   * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- *    weight,
+ * 2) all active queues belong to the same I/O-priority class,
   * 3) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 4) all active groups at the same level in the groups tree have the same
   *    number of children.
   *
   * Unfortunately, keeping the necessary state for evaluating exactly
   * the last two symmetry sub-conditions above would be quite complex
- * and time consuming.  Therefore this function evaluates, instead,
- * only the following stronger two sub-conditions, for which it is
+ * and time consuming. Therefore this function evaluates, instead,
+ * only the following stronger three sub-conditions, for which it is
   * much easier to maintain the needed state:
   * 1) all active queues have the same weight,
- * 2) there are no active groups.
+ * 2) all active queues belong to the same I/O-priority class,
+ * 3) there are no active groups.
   * In particular, the last condition is always true if hierarchical
   * support or the cgroups interface are not enabled, thus no state
   * needs to be maintained in this case.
   */
  static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
  {
-       return !bfq_varied_queue_weights_or_active_groups(bfqd);
+       /*
+        * For queue weights to differ, queue_weights_tree must contain
+        * at least two nodes.
+        */
+       bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+               (bfqd->queue_weights_tree.rb_node->rb_left ||
+                bfqd->queue_weights_tree.rb_node->rb_right);
+
+       bool multiple_classes_busy =
+               (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
+               (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
+               (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
+
+       /*
+        * For queue weights to differ, queue_weights_tree must contain
+        * at least two nodes.
+        */
+       return !(varied_queue_weights || multiple_classes_busy
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+              || bfqd->num_groups_with_pending_reqs > 0
+#endif
+               );
  }
  
  /*
@@ -728,15 +736,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
         /*
          * In the unlucky event of an allocation failure, we just
          * exit. This will cause the weight of queue to not be
-        * considered in bfq_varied_queue_weights_or_active_groups,
-        * which, in its turn, causes the scenario to be deemed
-        * wrongly symmetric in case bfqq's weight would have been
-        * the only weight making the scenario asymmetric.  On the
-        * bright side, no unbalance will however occur when bfqq
-        * becomes inactive again (the invocation of this function
-        * is triggered by an activation of queue).  In fact,
-        * bfq_weights_tree_remove does nothing if
-        * !bfqq->weight_counter.
+        * considered in bfq_symmetric_scenario, which, in its turn,
+        * causes the scenario to be deemed wrongly symmetric in case
+        * bfqq's weight would have been the only weight making the
+        * scenario asymmetric.  On the bright side, no unbalance will
+        * however occur when bfqq becomes inactive again (the
+        * invocation of this function is triggered by an activation
+        * of queue).  In fact, bfq_weights_tree_remove does nothing
+        * if !bfqq->weight_counter.
          */
         if (unlikely(!bfqq->weight_counter))
                 return;
@@ -747,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  
  inc_counter:
         bfqq->weight_counter->num_active++;
+       bfqq->ref++;
  }
  
  /*
@@ -771,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
  
  reset_entity_pointer:
         bfqq->weight_counter = NULL;
+       bfq_put_queue(bfqq);
  }
  
  /*
@@ -782,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
  {
         struct bfq_entity *entity = bfqq->entity.parent;
  
-       __bfq_weights_tree_remove(bfqd, bfqq,
-                                 &bfqd->queue_weights_tree);
-
         for_each_entity(entity) {
                 struct bfq_sched_data *sd = entity->my_sched_data;
  
@@ -818,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
                         bfqd->num_groups_with_pending_reqs--;
                 }
         }
+
+       /*
+        * Next function is invoked last, because it causes bfqq to be
+        * freed if the following holds: bfqq is not in service and
+        * has no dispatched request. DO NOT use bfqq after the next
+        * function invocation.
+        */
+       __bfq_weights_tree_remove(bfqd, bfqq,
+                                 &bfqd->queue_weights_tree);
  }
  
  /*
@@ -907,8 +922,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
                  */
                 return;
  
-       new_budget = max_t(unsigned long, bfqq->max_budget,
-                          bfq_serv_to_charge(next_rq, bfqq));
+       new_budget = max_t(unsigned long,
+                          max_t(unsigned long, bfqq->max_budget,
+                                bfq_serv_to_charge(next_rq, bfqq)),
+                          entity->service);
         if (entity->budget != new_budget) {
                 entity->budget = new_budget;
                 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
@@ -1011,7 +1028,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
  
  static int bfqq_process_refs(struct bfq_queue *bfqq)
  {
-       return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+       return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
+               (bfqq->weight_counter != NULL);
  }
  
  /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1380,7 +1398,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
  {
         struct bfq_entity *entity = &bfqq->entity;
  
-       if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+       /*
+        * In the next compound condition, we check also whether there
+        * is some budget left, because otherwise there is no point in
+        * trying to go on serving bfqq with this same budget: bfqq
+        * would be expired immediately after being selected for
+        * service. This would only cause useless overhead.
+        */
+       if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
+           bfq_bfqq_budget_left(bfqq) > 0) {
                 /*
                  * We do not clear the flag non_blocking_wait_rq here, as
                  * the latter is used in bfq_activate_bfqq to signal
@@ -2217,7 +2243,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                 return NULL;
  
         /* If there is only one backlogged queue, don't search. */
-       if (bfqd->busy_queues == 1)
+       if (bfq_tot_busy_queues(bfqd) == 1)
                 return NULL;
  
         in_service_bfqq = bfqd->in_service_queue;
@@ -2742,7 +2768,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
  
         if ((bfqd->rq_in_driver > 0 ||
                 now_ns - bfqd->last_completion < BFQ_MIN_TT)
-            && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+           && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
                 bfqd->sequential_samples++;
  
         bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
@@ -3274,16 +3300,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
                  * requests, then the request pattern is isochronous
                  * (see the comments on the function
                  * bfq_bfqq_softrt_next_start()). Thus we can compute
-                * soft_rt_next_start. If, instead, the queue still
-                * has outstanding requests, then we have to wait for
-                * the completion of all the outstanding requests to
-                * discover whether the request pattern is actually
-                * isochronous.
+                * soft_rt_next_start. And we do it, unless bfqq is in
+                * interactive weight raising. We do not do it in the
+                * latter subcase, for the following reason. bfqq may
+                * be conveying the I/O needed to load a soft
+                * real-time application. Such an application will
+                * actually exhibit a soft real-time I/O pattern after
+                * it finally starts doing its job. But, if
+                * soft_rt_next_start is computed here for an
+                * interactive bfqq, and bfqq had received a lot of
+                * service before remaining with no outstanding
+                * request (likely to happen on a fast device), then
+                * soft_rt_next_start would be assigned such a high
+                * value that, for a very long time, bfqq would be
+                * prevented from being possibly considered as soft
+                * real time.
+                *
+                * If, instead, the queue still has outstanding
+                * requests, then we have to wait for the completion
+                * of all the outstanding requests to discover whether
+                * the request pattern is actually isochronous.
                  */
-               if (bfqq->dispatched == 0)
+               if (bfqq->dispatched == 0 &&
+                   bfqq->wr_coeff != bfqd->bfq_wr_coeff)
                         bfqq->soft_rt_next_start =
                                 bfq_bfqq_softrt_next_start(bfqd, bfqq);
-               else {
+               else if (bfqq->dispatched > 0) {
                         /*
                          * Schedule an update of soft_rt_next_start to when
                          * the task may be discovered to be isochronous.
@@ -3376,53 +3418,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
                 bfq_bfqq_budget_timeout(bfqq);
  }
  
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for the queue. As a consequence, since
- * device idling plays a critical role in both throughput boosting and
- * service guarantees, the return value of this function plays a
- * critical role in both these aspects as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * individually while introducing the variables.
- */
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+                                            struct bfq_queue *bfqq)
  {
-       struct bfq_data *bfqd = bfqq->bfqd;
         bool rot_without_queueing =
                 !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
                 bfqq_sequential_and_IO_bound,
-               idling_boosts_thr, idling_boosts_thr_without_issues,
-               idling_needed_for_service_guarantees,
-               asymmetric_scenario;
-
-       if (bfqd->strict_guarantees)
-               return true;
-
-       /*
-        * Idling is performed only if slice_idle > 0. In addition, we
-        * do not idle if
-        * (a) bfqq is async
-        * (b) bfqq is in the idle io prio class: in this case we do
-        * not idle because we want to minimize the bandwidth that
-        * queues in this class can steal to higher-priority queues
-        */
-       if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
-           bfq_class_idle(bfqq))
-               return false;
+               idling_boosts_thr;
  
         bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
                 bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@@ -3454,8 +3456,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
                  bfqq_sequential_and_IO_bound);
  
         /*
-        * The value of the next variable,
-        * idling_boosts_thr_without_issues, is equal to that of
+        * The return value of this function is equal to that of
          * idling_boosts_thr, unless a special case holds. In this
          * special case, described below, idling may cause problems to
          * weight-raised queues.
@@ -3472,217 +3473,252 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
          * which enqueue several requests in advance, and further
          * reorder internally-queued requests.
          *
-        * For this reason, we force to false the value of
-        * idling_boosts_thr_without_issues if there are weight-raised
-        * busy queues. In this case, and if bfqq is not weight-raised,
-        * this guarantees that the device is not idled for bfqq (if,
-        * instead, bfqq is weight-raised, then idling will be
-        * guaranteed by another variable, see below). Combined with
-        * the timestamping rules of BFQ (see [1] for details), this
-        * behavior causes bfqq, and hence any sync non-weight-raised
-        * queue, to get a lower number of requests served, and thus
-        * to ask for a lower number of requests from the request
-        * pool, before the busy weight-raised queues get served
-        * again. This often mitigates starvation problems in the
-        * presence of heavy write workloads and NCQ, thereby
-        * guaranteeing a higher application and system responsiveness
-        * in these hostile scenarios.
+        * For this reason, we force to false the return value if
+        * there are weight-raised busy queues. In this case, and if
+        * bfqq is not weight-raised, this guarantees that the device
+        * is not idled for bfqq (if, instead, bfqq is weight-raised,
+        * then idling will be guaranteed by another variable, see
+        * below). Combined with the timestamping rules of BFQ (see
+        * [1] for details), this behavior causes bfqq, and hence any
+        * sync non-weight-raised queue, to get a lower number of
+        * requests served, and thus to ask for a lower number of
+        * requests from the request pool, before the busy
+        * weight-raised queues get served again. This often mitigates
+        * starvation problems in the presence of heavy write
+        * workloads and NCQ, thereby guaranteeing a higher
+        * application and system responsiveness in these hostile
+        * scenarios.
          */
-       idling_boosts_thr_without_issues = idling_boosts_thr &&
+       return idling_boosts_thr &&
                 bfqd->wr_busy_queues == 0;
+}
  
-       /*
-        * There is then a case where idling must be performed not
-        * for throughput concerns, but to preserve service
-        * guarantees.
-        *
-        * To introduce this case, we can note that allowing the drive
-        * to enqueue more than one request at a time, and hence
-        * delegating de facto final scheduling decisions to the
-        * drive's internal scheduler, entails loss of control on the
-        * actual request service order. In particular, the critical
-        * situation is when requests from different processes happen
-        * to be present, at the same time, in the internal queue(s)
-        * of the drive. In such a situation, the drive, by deciding
-        * the service order of the internally-queued requests, does
-        * determine also the actual throughput distribution among
-        * these processes. But the drive typically has no notion or
-        * concern about per-process throughput distribution, and
-        * makes its decisions only on a per-request basis. Therefore,
-        * the service distribution enforced by the drive's internal
-        * scheduler is likely to coincide with the desired
-        * device-throughput distribution only in a completely
-        * symmetric scenario where:
-        * (i)  each of these processes must get the same throughput as
-        *      the others;
-        * (ii) the I/O of each process has the same properties, in
-        *      terms of locality (sequential or random), direction
-        *      (reads or writes), request sizes, greediness
-        *      (from I/O-bound to sporadic), and so on.
-        * In fact, in such a scenario, the drive tends to treat
-        * the requests of each of these processes in about the same
-        * way as the requests of the others, and thus to provide
-        * each of these processes with about the same throughput
-        * (which is exactly the desired throughput distribution). In
-        * contrast, in any asymmetric scenario, device idling is
-        * certainly needed to guarantee that bfqq receives its
-        * assigned fraction of the device throughput (see [1] for
-        * details).
-        * The problem is that idling may significantly reduce
-        * throughput with certain combinations of types of I/O and
-        * devices. An important example is sync random I/O, on flash
-        * storage with command queueing. So, unless bfqq falls in the
-        * above cases where idling also boosts throughput, it would
-        * be important to check conditions (i) and (ii) accurately,
-        * so as to avoid idling when not strictly needed for service
-        * guarantees.
-        *
-        * Unfortunately, it is extremely difficult to thoroughly
-        * check condition (ii). And, in case there are active groups,
-        * it becomes very difficult to check condition (i) too. In
-        * fact, if there are active groups, then, for condition (i)
-        * to become false, it is enough that an active group contains
-        * more active processes or sub-groups than some other active
-        * group. More precisely, for condition (i) to hold because of
-        * such a group, it is not even necessary that the group is
-        * (still) active: it is sufficient that, even if the group
-        * has become inactive, some of its descendant processes still
-        * have some request already dispatched but still waiting for
-        * completion. In fact, requests have still to be guaranteed
-        * their share of the throughput even after being
-        * dispatched. In this respect, it is easy to show that, if a
-        * group frequently becomes inactive while still having
-        * in-flight requests, and if, when this happens, the group is
-        * not considered in the calculation of whether the scenario
-        * is asymmetric, then the group may fail to be guaranteed its
-        * fair share of the throughput (basically because idling may
-        * not be performed for the descendant processes of the group,
-        * but it had to be).  We address this issue with the
-        * following bi-modal behavior, implemented in the function
-        * bfq_symmetric_scenario().
-        *
-        * If there are groups with requests waiting for completion
-        * (as commented above, some of these groups may even be
-        * already inactive), then the scenario is tagged as
-        * asymmetric, conservatively, without checking any of the
-        * conditions (i) and (ii). So the device is idled for bfqq.
-        * This behavior matches also the fact that groups are created
-        * exactly if controlling I/O is a primary concern (to
-        * preserve bandwidth and latency guarantees).
-        *
-        * On the opposite end, if there are no groups with requests
-        * waiting for completion, then only condition (i) is actually
-        * controlled, i.e., provided that condition (i) holds, idling
-        * is not performed, regardless of whether condition (ii)
-        * holds. In other words, only if condition (i) does not hold,
-        * then idling is allowed, and the device tends to be
-        * prevented from queueing many requests, possibly of several
-        * processes. Since there are no groups with requests waiting
-        * for completion, then, to control condition (i) it is enough
-        * to check just whether all the queues with requests waiting
-        * for completion also have the same weight.
-        *
-        * Not checking condition (ii) evidently exposes bfqq to the
-        * risk of getting less throughput than its fair share.
-        * However, for queues with the same weight, a further
-        * mechanism, preemption, mitigates or even eliminates this
-        * problem. And it does so without consequences on overall
-        * throughput. This mechanism and its benefits are explained
-        * in the next three paragraphs.
-        *
-        * Even if a queue, say Q, is expired when it remains idle, Q
-        * can still preempt the new in-service queue if the next
-        * request of Q arrives soon (see the comments on
-        * bfq_bfqq_update_budg_for_activation). If all queues and
-        * groups have the same weight, this form of preemption,
-        * combined with the hole-recovery heuristic described in the
-        * comments on function bfq_bfqq_update_budg_for_activation,
-        * are enough to preserve a correct bandwidth distribution in
-        * the mid term, even without idling. In fact, even if not
-        * idling allows the internal queues of the device to contain
-        * many requests, and thus to reorder requests, we can rather
-        * safely assume that the internal scheduler still preserves a
-        * minimum of mid-term fairness.
-        *
-        * More precisely, this preemption-based, idleless approach
-        * provides fairness in terms of IOPS, and not sectors per
-        * second. This can be seen with a simple example. Suppose
-        * that there are two queues with the same weight, but that
-        * the first queue receives requests of 8 sectors, while the
-        * second queue receives requests of 1024 sectors. In
-        * addition, suppose that each of the two queues contains at
-        * most one request at a time, which implies that each queue
-        * always remains idle after it is served. Finally, after
-        * remaining idle, each queue receives very quickly a new
-        * request. It follows that the two queues are served
-        * alternatively, preempting each other if needed. This
-        * implies that, although both queues have the same weight,
-        * the queue with large requests receives a service that is
-        * 1024/8 times as high as the service received by the other
-        * queue.
-        *
-        * The motivation for using preemption instead of idling (for
-        * queues with the same weight) is that, by not idling,
-        * service guarantees are preserved (completely or at least in
-        * part) without minimally sacrificing throughput. And, if
-        * there is no active group, then the primary expectation for
-        * this device is probably a high throughput.
-        *
-        * We are now left only with explaining the additional
-        * compound condition that is checked below for deciding
-        * whether the scenario is asymmetric. To explain this
-        * compound condition, we need to add that the function
-        * bfq_symmetric_scenario checks the weights of only
-        * non-weight-raised queues, for efficiency reasons (see
-        * comments on bfq_weights_tree_add()). Then the fact that
-        * bfqq is weight-raised is checked explicitly here. More
-        * precisely, the compound condition below takes into account
-        * also the fact that, even if bfqq is being weight-raised,
-        * the scenario is still symmetric if all queues with requests
-        * waiting for completion happen to be
-        * weight-raised. Actually, we should be even more precise
-        * here, and differentiate between interactive weight raising
-        * and soft real-time weight raising.
-        *
-        * As a side note, it is worth considering that the above
-        * device-idling countermeasures may however fail in the
-        * following unlucky scenario: if idling is (correctly)
-        * disabled in a time period during which all symmetry
-        * sub-conditions hold, and hence the device is allowed to
-        * enqueue many requests, but at some later point in time some
-        * sub-condition stops to hold, then it may become impossible
-        * to let requests be served in the desired order until all
-        * the requests already queued in the device have been served.
-        */
-       asymmetric_scenario = (bfqq->wr_coeff > 1 &&
-                              bfqd->wr_busy_queues < bfqd->busy_queues) ||
+/*
+ * There is a case where idling must be performed not for
+ * throughput concerns, but to preserve service guarantees.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired
+ * device-throughput distribution only in a completely
+ * symmetric scenario where:
+ * (i)  each of these processes must get the same throughput as
+ *      the others;
+ * (ii) the I/O of each process has the same properties, in
+ *      terms of locality (sequential or random), direction
+ *      (reads or writes), request sizes, greediness
+ *      (from I/O-bound to sporadic), and so on.
+ * In fact, in such a scenario, the drive tends to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
+ * The problem is that idling may significantly reduce
+ * throughput with certain combinations of types of I/O and
+ * devices. An important example is sync random I/O, on flash
+ * storage with command queueing. So, unless bfqq falls in the
+ * above cases where idling also boosts throughput, it would
+ * be important to check conditions (i) and (ii) accurately,
+ * so as to avoid idling when not strictly needed for service
+ * guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly
+ * check condition (ii). And, in case there are active groups,
+ * it becomes very difficult to check condition (i) too. In
+ * fact, if there are active groups, then, for condition (i)
+ * to become false, it is enough that an active group contains
+ * more active processes or sub-groups than some other active
+ * group. More precisely, for condition (i) to hold because of
+ * such a group, it is not even necessary that the group is
+ * (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still
+ * have some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed
+ * their share of the throughput even after being
+ * dispatched. In this respect, it is easy to show that, if a
+ * group frequently becomes inactive while still having
+ * in-flight requests, and if, when this happens, the group is
+ * not considered in the calculation of whether the scenario
+ * is asymmetric, then the group may fail to be guaranteed its
+ * fair share of the throughput (basically because idling may
+ * not be performed for the descendant processes of the group,
+ * but it had to be).  We address this issue with the
+ * following bi-modal behavior, implemented in the function
+ * bfq_symmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i) and (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests
+ * waiting for completion, then only condition (i) is actually
+ * controlled, i.e., provided that condition (i) holds, idling
+ * is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if condition (i) does not hold,
+ * then idling is allowed, and the device tends to be
+ * prevented from queueing many requests, possibly of several
+ * processes. Since there are no groups with requests waiting
+ * for completion, then, to control condition (i) it is enough
+ * to check just whether all the queues with requests waiting
+ * for completion also have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without minimally sacrificing throughput. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the additional
+ * compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain this
+ * compound condition, we need to add that the function
+ * bfq_symmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see
+ * comments on bfq_weights_tree_add()). Then the fact that
+ * bfqq is weight-raised is checked explicitly here. More
+ * precisely, the compound condition below takes into account
+ * also the fact that, even if bfqq is being weight-raised,
+ * the scenario is still symmetric if all queues with requests
+ * waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise
+ * here, and differentiate between interactive weight raising
+ * and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+                                                struct bfq_queue *bfqq)
+{
+       return (bfqq->wr_coeff > 1 &&
+               bfqd->wr_busy_queues <
+               bfq_tot_busy_queues(bfqd)) ||
                 !bfq_symmetric_scenario(bfqd);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * Most of the issues taken into account to get the return value of
+ * this function are not trivial. We discuss these issues in the two
+ * functions providing the main pieces of information needed by this
+ * function.
+ */
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+{
+       struct bfq_data *bfqd = bfqq->bfqd;
+       bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
+
+       if (unlikely(bfqd->strict_guarantees))
+               return true;
  
         /*
-        * Finally, there is a case where maximizing throughput is the
-        * best choice even if it may cause unfairness toward
-        * bfqq. Such a case is when bfqq became active in a burst of
-        * queue activations. Queues that became active during a large
-        * burst benefit only from throughput, as discussed in the
-        * comments on bfq_handle_burst. Thus, if bfqq became active
-        * in a burst and not idling the device maximizes throughput,
-        * then the device must no be idled, because not idling the
-        * device provides bfqq and all other queues in the burst with
-        * maximum benefit. Combining this and the above case, we can
-        * now establish when idling is actually needed to preserve
-        * service guarantees.
+        * Idling is performed only if slice_idle > 0. In addition, we
+        * do not idle if
+        * (a) bfqq is async
+        * (b) bfqq is in the idle io prio class: in this case we do
+        * not idle because we want to minimize the bandwidth that
+        * queues in this class can steal to higher-priority queues
          */
-       idling_needed_for_service_guarantees =
-               asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+       if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
+          bfq_class_idle(bfqq))
+               return false;
+
+       idling_boosts_thr_with_no_issue =
+               idling_boosts_thr_without_issues(bfqd, bfqq);
+
+       idling_needed_for_service_guar =
+               idling_needed_for_service_guarantees(bfqd, bfqq);
  
         /*
-        * We have now all the components we need to compute the
+        * We have now the two components we need to compute the
          * return value of the function, which is true only if idling
          * either boosts the throughput (without issues), or is
          * necessary to preserve service guarantees.
          */
-       return idling_boosts_thr_without_issues ||
-               idling_needed_for_service_guarantees;
+       return idling_boosts_thr_with_no_issue ||
+               idling_needed_for_service_guar;
  }
  
  /*
@@ -3934,7 +3970,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
          * belongs to CLASS_IDLE and other queues are waiting for
          * service.
          */
-       if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+       if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
                 goto return_rq;
  
         bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
@@ -3952,7 +3988,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
          * most a call to dispatch for nothing
          */
         return !list_empty_careful(&bfqd->dispatch) ||
-               bfqd->busy_queues > 0;
+               bfq_tot_busy_queues(bfqd) > 0;
  }
  
  static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@@ -4006,9 +4042,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
                 goto start_rq;
         }
  
-       bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+       bfq_log(bfqd, "dispatch requests: %d busy queues",
+               bfq_tot_busy_queues(bfqd));
  
-       if (bfqd->busy_queues == 0)
+       if (bfq_tot_busy_queues(bfqd) == 0)
                 goto exit;
  
         /*
@@ -4488,10 +4525,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                        struct request *rq)
  {
         bfqq->seek_history <<= 1;
-       bfqq->seek_history |=
-               get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
-               (!blk_queue_nonrot(bfqd->queue) ||
-                blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+       bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
  }
  
  static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -4560,28 +4594,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
  
                 /*
-                * There is just this request queued: if the request
-                * is small and the queue is not to be expired, then
-                * just exit.
+                * There is just this request queued: if
+                * - the request is small, and
+                * - we are idling to boost throughput, and
+                * - the queue is not to be expired,
+                * then just exit.
                  *
                  * In this way, if the device is being idled to wait
                  * for a new request from the in-service queue, we
                  * avoid unplugging the device and committing the
-                * device to serve just a small request. On the
-                * contrary, we wait for the block layer to decide
-                * when to unplug the device: hopefully, new requests
-                * will be merged to this one quickly, then the device
-                * will be unplugged and larger requests will be
-                * dispatched.
+                * device to serve just a small request. In contrast
+                * we wait for the block layer to decide when to
+                * unplug the device: hopefully, new requests will be
+                * merged to this one quickly, then the device will be
+                * unplugged and larger requests will be dispatched.
                  */
-               if (small_req && !budget_timeout)
+               if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
+                   !budget_timeout)
                         return;
  
                 /*
-                * A large enough request arrived, or the queue is to
-                * be expired: in both cases disk idling is to be
-                * stopped, so clear wait_request flag and reset
-                * timer.
+                * A large enough request arrived, or idling is being
+                * performed to preserve service guarantees, or
+                * finally the queue is to be expired: in all these
+                * cases disk idling is to be stopped, so clear
+                * wait_request flag and reset timer.
                  */
                 bfq_clear_bfqq_wait_request(bfqq);
                 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
@@ -4607,8 +4644,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
         bool waiting, idle_timer_disabled = false;
  
         if (new_bfqq) {
-               if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
-                       new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
                 /*
                  * Release the request's reference to the old bfqq
                  * and make sure one is taken to the shared queue.
@@ -4763,7 +4798,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
          * sum is not exact, as it's not taking into account deactivated
          * requests.
          */
-       if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+       if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
                 return;
  
         if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -4834,11 +4869,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
          * isochronous, and both requisites for this condition to hold
          * are now satisfied, then compute soft_rt_next_start (see the
          * comments on the function bfq_bfqq_softrt_next_start()). We
-        * schedule this delayed check when bfqq expires, if it still
-        * has in-flight requests.
+        * do not compute soft_rt_next_start if bfqq is in interactive
+        * weight raising (see the comments in bfq_bfqq_expire() for
+        * an explanation). We schedule this delayed update when bfqq
+        * expires, if it still has in-flight requests.
          */
         if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-           RB_EMPTY_ROOT(&bfqq->sort_list))
+           RB_EMPTY_ROOT(&bfqq->sort_list) &&
+           bfqq->wr_coeff != bfqd->bfq_wr_coeff)
                 bfqq->soft_rt_next_start =
                         bfq_bfqq_softrt_next_start(bfqd, bfqq);