kyber: fix out of bounds access when preempted
[linux-2.6-microblaze.git] / block / bfq-iosched.c
index 9558613..59b2499 100644 (file)
@@ -1012,7 +1012,7 @@ static void
 bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
                      struct bfq_io_cq *bic, bool bfq_already_existing)
 {
-       unsigned int old_wr_coeff = bfqq->wr_coeff;
+       unsigned int old_wr_coeff = 1;
        bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
 
        if (bic->saved_has_short_ttime)
@@ -1033,7 +1033,13 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
        bfqq->ttime = bic->saved_ttime;
        bfqq->io_start_time = bic->saved_io_start_time;
        bfqq->tot_idle_time = bic->saved_tot_idle_time;
-       bfqq->wr_coeff = bic->saved_wr_coeff;
+       /*
+        * Restore weight coefficient only if low_latency is on
+        */
+       if (bfqd->low_latency) {
+               old_wr_coeff = bfqq->wr_coeff;
+               bfqq->wr_coeff = bic->saved_wr_coeff;
+       }
        bfqq->service_from_wr = bic->saved_service_from_wr;
        bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
        bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
@@ -1069,7 +1075,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
 static int bfqq_process_refs(struct bfq_queue *bfqq)
 {
        return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
-               (bfqq->weight_counter != NULL);
+               (bfqq->weight_counter != NULL) - bfqq->stable_ref;
 }
 
 /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
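
A minimal userspace sketch of the reference accounting changed in the hunk above; the struct, its fields and the values are hypothetical stand-ins for bfq_queue, shown only to illustrate why a reference taken for a scheduled stable merge must not be counted as a process reference.

#include <stdio.h>

struct fake_bfqq {
        int ref;                /* total references held on the queue      */
        int allocated;          /* references owned by allocated requests  */
        int on_st_or_in_serv;   /* reference owned by the scheduler tree   */
        int has_weight_counter; /* 1 if a weight counter holds a reference */
        int stable_ref;         /* references taken for scheduled merges   */
};

/* Mirrors bfqq_process_refs(): only process-owned references count. */
static int process_refs(const struct fake_bfqq *q)
{
        return q->ref - q->allocated - q->on_st_or_in_serv -
                q->has_weight_counter - q->stable_ref;
}

int main(void)
{
        /*
         * One owning process, one allocated request, the queue sits in
         * the service tree, and one pending stable merge bumped both
         * ref and stable_ref.
         */
        struct fake_bfqq q = {
                .ref = 4, .allocated = 1, .on_st_or_in_serv = 1,
                .has_weight_counter = 0, .stable_ref = 1,
        };

        printf("process refs = %d\n", process_refs(&q)); /* prints 1 */
        return 0;
}
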
@@ -2257,10 +2263,9 @@ static void bfq_remove_request(struct request_queue *q,
 
 }
 
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
 {
-       struct request_queue *q = hctx->queue;
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct request *free = NULL;
        /*
@@ -2622,6 +2627,11 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
        return true;
 }
 
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+                                            struct bfq_queue *bfqq);
+
+static void bfq_put_stable_ref(struct bfq_queue *bfqq);
+
 /*
  * Attempt to schedule a merge of bfqq with the currently in-service
  * queue or with a close queue among the scheduled queues.  Return
@@ -2644,10 +2654,49 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
  */
 static struct bfq_queue *
 bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-                    void *io_struct, bool request)
+                    void *io_struct, bool request, struct bfq_io_cq *bic)
 {
        struct bfq_queue *in_service_bfqq, *new_bfqq;
 
+       /*
+        * Check delayed stable merge for rotational or non-queueing
+        * devs. For this branch to be executed, bfqq must not be
+        * currently merged with some other queue (i.e., bfqq->bic
+        * must be non null). If we considered also merged queues,
+        * must be non-NULL). If we also considered merged queues,
+        * merged with bic->stable_merge_bfqq. But this would be
+        * costly and complicated.
+        */
+       if (unlikely(!bfqd->nonrot_with_queueing)) {
+               if (bic->stable_merge_bfqq &&
+                   !bfq_bfqq_just_created(bfqq) &&
+                   time_is_after_jiffies(bfqq->split_time +
+                                         msecs_to_jiffies(200))) {
+                       struct bfq_queue *stable_merge_bfqq =
+                               bic->stable_merge_bfqq;
+                       int proc_ref = min(bfqq_process_refs(bfqq),
+                                          bfqq_process_refs(stable_merge_bfqq));
+
+                       /* deschedule stable merge, because done or aborted here */
+                       bfq_put_stable_ref(stable_merge_bfqq);
+
+                       bic->stable_merge_bfqq = NULL;
+
+                       if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
+                           proc_ref > 0) {
+                               /* next function will take at least one ref */
+                               struct bfq_queue *new_bfqq =
+                                       bfq_setup_merge(bfqq, stable_merge_bfqq);
+
+                               bic->stably_merged = true;
+                               if (new_bfqq && new_bfqq->bic)
+                                       new_bfqq->bic->stably_merged = true;
+                               return new_bfqq;
+                       } else
+                               return NULL;
+               }
+       }
+
        /*
         * Do not perform queue merging if the device is non
         * rotational and performs internal queueing. In fact, such a
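
The delayed-stable-merge branch above fires only while bfqq->split_time + 200 ms is still in the future according to jiffies. The following userspace sketch shows how such a wraparound-safe deadline test behaves; the tick counter, helper names and 1 ms tick length are assumptions made for illustration, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

static unsigned long fake_jiffies;      /* stand-in for the kernel's jiffies */

/* Assume one tick per millisecond, so 200 ms == 200 ticks. */
static unsigned long msecs_to_ticks(unsigned long ms)
{
        return ms;
}

/* true while the deadline is still in the future, wraparound-safe */
static bool deadline_in_future(unsigned long deadline)
{
        return (long)(deadline - fake_jiffies) > 0;
}

int main(void)
{
        unsigned long split_time = 1000;        /* when the queue was split */

        fake_jiffies = 1150;    /* 150 ms later: still inside the window */
        printf("window open: %d\n",
               deadline_in_future(split_time + msecs_to_ticks(200)));

        fake_jiffies = 1300;    /* 300 ms later: the window has closed */
        printf("window open: %d\n",
               deadline_in_future(split_time + msecs_to_ticks(200)));
        return 0;
}
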
@@ -2789,6 +2838,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
        }
 }
 
+
+static void
+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+{
+       if (cur_bfqq->entity.parent &&
+           cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
+               cur_bfqq->entity.parent->last_bfqq_created = new_bfqq;
+       else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq)
+               cur_bfqq->bfqd->last_bfqq_created = new_bfqq;
+}
+
 void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
        /*
@@ -2806,6 +2866,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
            bfqq != bfqd->in_service_queue)
                bfq_del_bfqq_busy(bfqd, bfqq, false);
 
+       bfq_reassign_last_bfqq(bfqq, NULL);
+
        bfq_put_queue(bfqq);
 }
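
bfq_reassign_last_bfqq() keeps the cached last-created-queue pointer from dangling once a queue is merged away or released (the NULL case used just above). A toy model of that bookkeeping, with simplified, hypothetical types in place of the kernel ones:

#include <stddef.h>
#include <stdio.h>

struct fake_bfqq { const char *name; };

struct fake_group {
        struct fake_bfqq *last_bfqq_created;    /* cached newest queue */
};

/* Mirrors bfq_reassign_last_bfqq(): redirect or clear the cache. */
static void reassign_last_created(struct fake_group *grp,
                                  struct fake_bfqq *cur,
                                  struct fake_bfqq *new)
{
        if (grp->last_bfqq_created == cur)
                grp->last_bfqq_created = new;   /* NULL when cur is released */
}

int main(void)
{
        struct fake_bfqq q1 = { "Q1" }, q2 = { "Q2" };
        struct fake_group grp = { .last_bfqq_created = &q1 };

        /* Q1 is merged into Q2: the cache now names Q2 instead. */
        reassign_last_created(&grp, &q1, &q2);
        printf("last created: %s\n", grp.last_bfqq_created->name);

        /* Q2 is released: the cache is cleared, not left dangling. */
        reassign_last_created(&grp, &q2, NULL);
        printf("last created: %s\n",
               grp.last_bfqq_created ? grp.last_bfqq_created->name : "none");
        return 0;
}
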
 
@@ -2822,6 +2884,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
                bfq_mark_bfqq_IO_bound(new_bfqq);
        bfq_clear_bfqq_IO_bound(bfqq);
 
+       /*
+        * The processes associated with bfqq are cooperators of the
+        * processes associated with new_bfqq. So, if bfqq has a
+        * waker, then assume that all these processes will be happy
+        * to let bfqq's waker freely inject I/O when they have no
+        * I/O.
+        */
+       if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq &&
+           bfqq->waker_bfqq != new_bfqq) {
+               new_bfqq->waker_bfqq = bfqq->waker_bfqq;
+               new_bfqq->tentative_waker_bfqq = NULL;
+
+               /*
+                * If the waker queue disappears, then
+                * new_bfqq->waker_bfqq must be reset. So insert
+                * new_bfqq into the woken_list of the waker. See
+                * bfq_check_waker for details.
+                */
+               hlist_add_head(&new_bfqq->woken_list_node,
+                              &new_bfqq->waker_bfqq->woken_list);
+
+       }
+
        /*
         * If bfqq is weight-raised, then let new_bfqq inherit
         * weight-raising. To reduce false positives, neglect the case
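
The woken_list insertion above maintains the invariant that every queue whose waker pointer names W is also reachable from W, so the pointer can be cleared if the waker goes away. A self-contained userspace sketch of that invariant, with a small array standing in for the kernel's hlist and all names hypothetical:

#include <stddef.h>
#include <stdio.h>

#define MAX_WOKEN 4

struct fq {
        const char *name;
        struct fq *waker;               /* queue whose I/O unblocks us  */
        struct fq *woken[MAX_WOKEN];    /* queues we are the waker of   */
        int nr_woken;
};

static void inherit_waker(struct fq *merged, struct fq *winner)
{
        /*
         * Mirrors the rule in bfq_merge_bfqqs(): inherit only if the
         * winner has no waker yet and the waker is not the winner itself.
         */
        if (merged->waker && !winner->waker && merged->waker != winner) {
                winner->waker = merged->waker;
                merged->waker->woken[merged->waker->nr_woken++] = winner;
        }
}

static void waker_goes_away(struct fq *w)
{
        for (int i = 0; i < w->nr_woken; i++)
                w->woken[i]->waker = NULL;      /* no dangling waker pointers */
        w->nr_woken = 0;
}

int main(void)
{
        struct fq journald = { .name = "journald" };
        struct fq q = { .name = "Q", .waker = &journald };
        struct fq shared = { .name = "shared" };

        journald.woken[journald.nr_woken++] = &q;

        inherit_waker(&q, &shared);     /* Q is merged into "shared" */
        printf("shared's waker: %s\n",
               shared.waker ? shared.waker->name : "none");

        waker_goes_away(&journald);
        printf("shared's waker: %s\n",
               shared.waker ? shared.waker->name : "none");
        return 0;
}
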
@@ -2879,6 +2964,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
         */
        new_bfqq->pid = -1;
        bfqq->bic = NULL;
+
+       bfq_reassign_last_bfqq(bfqq, new_bfqq);
+
        bfq_release_process_ref(bfqd, bfqq);
 }
 
@@ -2906,7 +2994,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
         * We take advantage of this function to perform an early merge
         * of the queues of possible cooperating processes.
         */
-       new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+       new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
        if (new_bfqq) {
                /*
                 * bic still points to bfqq, then it has not yet been
@@ -4491,9 +4579,15 @@ check_queue:
                        bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
                        bfqq->bic->bfqq[0]->next_rq ?
                        bfqq->bic->bfqq[0] : NULL;
+               struct bfq_queue *blocked_bfqq =
+                       !hlist_empty(&bfqq->woken_list) ?
+                       container_of(bfqq->woken_list.first,
+                                    struct bfq_queue,
+                                    woken_list_node)
+                       : NULL;
 
                /*
-                * The next three mutually-exclusive ifs decide
+                * The next four mutually-exclusive ifs decide
                 * whether to try injection, and choose the queue to
                 * pick an I/O request from.
                 *
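
blocked_bfqq above is recovered from the first woken_list node with container_of(). For readers unfamiliar with the pattern, here is a minimal userspace illustration; the list and queue types are simplified stand-ins, and only the basic pointer arithmetic is shown (the kernel macro adds type checking on top of this).

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_node { struct list_node *next; };

struct fq {
        const char *name;
        struct list_node woken_node;    /* links the queue into a woken list */
};

int main(void)
{
        struct fq q = { .name = "woken-queue" };
        struct list_node *head = &q.woken_node; /* first entry of the list */

        /* Recover the queue that embeds the list node we are looking at. */
        struct fq *blocked = container_of(head, struct fq, woken_node);

        printf("%s\n", blocked->name);
        return 0;
}
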
@@ -4526,7 +4620,15 @@ check_queue:
                 * next bfqq's I/O is brought forward dramatically,
                 * for it is not blocked for milliseconds.
                 *
-                * The third if checks whether bfqq is a queue for
+                * The third if checks whether there is a queue woken
+                * by bfqq that currently has pending I/O. Such a
+                * woken queue does not steal bandwidth from bfqq,
+                * because it will soon run out of I/O if bfqq is not
+                * served. So there is virtually no risk of loss of
+                * bandwidth for bfqq if this woken queue has I/O
+                * dispatched while bfqq is waiting for new I/O.
+                *
+                * The fourth if checks whether bfqq is a queue for
                 * which it is better to avoid injection. It is so if
                 * bfqq delivers more throughput when served without
                 * any further I/O from other queues in the middle, or
@@ -4546,11 +4648,11 @@ check_queue:
                 * bfq_update_has_short_ttime(), it is rather likely
                 * that, if I/O is being plugged for bfqq and the
                 * waker queue has pending I/O requests that are
-                * blocking bfqq's I/O, then the third alternative
+                * blocking bfqq's I/O, then the fourth alternative
                 * above lets the waker queue get served before the
                 * I/O-plugging timeout fires. So one may deem the
                 * second alternative superfluous. It is not, because
-                * the third alternative may be way less effective in
+                * the fourth alternative may be way less effective in
                 * case of a synchronization. For two main
                 * reasons. First, throughput may be low because the
                 * inject limit may be too low to guarantee the same
@@ -4559,7 +4661,7 @@ check_queue:
                 * guarantees (the second alternative unconditionally
                 * injects a pending I/O request of the waker queue
                 * for each bfq_dispatch_request()). Second, with the
-                * third alternative, the duration of the plugging,
+                * fourth alternative, the duration of the plugging,
                 * i.e., the time before bfqq finally receives new I/O,
                 * may not be minimized, because the waker queue may
                 * happen to be served only after other queues.
@@ -4577,6 +4679,14 @@ check_queue:
                           bfq_bfqq_budget_left(bfqq->waker_bfqq)
                        )
                        bfqq = bfqq->waker_bfqq;
+               else if (blocked_bfqq &&
+                          bfq_bfqq_busy(blocked_bfqq) &&
+                          blocked_bfqq->next_rq &&
+                          bfq_serv_to_charge(blocked_bfqq->next_rq,
+                                             blocked_bfqq) <=
+                          bfq_bfqq_budget_left(blocked_bfqq)
+                       )
+                       bfqq = blocked_bfqq;
                else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
                         (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
                          !bfq_bfqq_has_short_ttime(bfqq)))
@@ -4983,6 +5093,12 @@ void bfq_put_queue(struct bfq_queue *bfqq)
        bfqg_and_blkg_put(bfqg);
 }
 
+static void bfq_put_stable_ref(struct bfq_queue *bfqq)
+{
+       bfqq->stable_ref--;
+       bfq_put_queue(bfqq);
+}
+
 static void bfq_put_cooperator(struct bfq_queue *bfqq)
 {
        struct bfq_queue *__bfqq, *next;
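
bfq_put_stable_ref() undoes the paired ref/stable_ref increments taken when a delayed stable merge is scheduled (see bfq_do_or_sched_stable_merge() further down). A minimal model of that pairing, with a hypothetical struct in place of bfq_queue:

#include <stdio.h>

struct fake_bfqq { int ref; int stable_ref; };

static void get_stable_ref(struct fake_bfqq *q)
{
        q->ref++;        /* keep the queue alive until the merge decision */
        q->stable_ref++; /* remember that this ref is not a process ref   */
}

static void put_stable_ref(struct fake_bfqq *q)
{
        q->stable_ref--;
        q->ref--;        /* the kernel drops this via bfq_put_queue()     */
}

int main(void)
{
        struct fake_bfqq q = { .ref = 1, .stable_ref = 0 };

        get_stable_ref(&q);     /* delayed merge scheduled       */
        put_stable_ref(&q);     /* merge performed or aborted    */
        printf("ref=%d stable_ref=%d\n", q.ref, q.stable_ref); /* 1 0 */
        return 0;
}
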
@@ -5039,6 +5155,24 @@ static void bfq_exit_icq(struct io_cq *icq)
 {
        struct bfq_io_cq *bic = icq_to_bic(icq);
 
+       if (bic->stable_merge_bfqq) {
+               struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd;
+
+               /*
+                * bfqd is NULL if the scheduler has already exited, and
+                * in that case this is the last time bfqq is accessed.
+                */
+               if (bfqd) {
+                       unsigned long flags;
+
+                       spin_lock_irqsave(&bfqd->lock, flags);
+                       bfq_put_stable_ref(bic->stable_merge_bfqq);
+                       spin_unlock_irqrestore(&bfqd->lock, flags);
+               } else {
+                       bfq_put_stable_ref(bic->stable_merge_bfqq);
+               }
+       }
+
        bfq_exit_icq_bfqq(bic, true);
        bfq_exit_icq_bfqq(bic, false);
 }
@@ -5099,7 +5233,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
                                       struct bio *bio, bool is_sync,
-                                      struct bfq_io_cq *bic);
+                                      struct bfq_io_cq *bic,
+                                      bool respawn);
 
 static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 {
@@ -5119,7 +5254,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
        bfqq = bic_to_bfqq(bic, false);
        if (bfqq) {
                bfq_release_process_ref(bfqd, bfqq);
-               bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+               bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
                bic_set_bfqq(bic, bfqq, false);
        }
 
@@ -5162,6 +5297,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
        /* set end request to minus infinity from now */
        bfqq->ttime.last_end_request = now_ns + 1;
 
+       bfqq->creation_time = jiffies;
+
        bfqq->io_start_time = now_ns;
 
        bfq_mark_bfqq_IO_bound(bfqq);
@@ -5211,9 +5348,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
        }
 }
 
+static struct bfq_queue *
+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                         struct bfq_io_cq *bic,
+                         struct bfq_queue *last_bfqq_created)
+{
+       struct bfq_queue *new_bfqq =
+               bfq_setup_merge(bfqq, last_bfqq_created);
+
+       if (!new_bfqq)
+               return bfqq;
+
+       if (new_bfqq->bic)
+               new_bfqq->bic->stably_merged = true;
+       bic->stably_merged = true;
+
+       /*
+        * Reusing merge functions. This implies that
+        * bfqq->bic must be set too, for
+        * bfq_merge_bfqqs to correctly save bfqq's
+        * state before killing it.
+        */
+       bfqq->bic = bic;
+       bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+
+       return new_bfqq;
+}
+
+/*
+ * Many throughput-sensitive workloads are made of several parallel
+ * I/O flows, with all flows generated by the same application, or
+ * more generically by the same task (e.g., system boot). The most
+ * counterproductive action with these workloads is plugging I/O
+ * dispatch when one of the bfq_queues associated with these flows
+ * remains temporarily empty.
+ *
+ * To avoid this plugging, BFQ has been using a burst-handling
+ * mechanism for years now. This mechanism has proven effective for
+ * throughput, and not detrimental for service guarantees. The
+ * following function pushes this mechanism a little bit further,
+ * building on the following two facts.
+ *
+ * First, all the I/O flows of the same application or task
+ * contribute to the execution/completion of that common application
+ * or task. So the performance figures that matter are total
+ * throughput of the flows and task-wide I/O latency.  In particular,
+ * these flows do not need to be protected from each other, in terms
+ * of individual bandwidth or latency.
+ *
+ * Second, the above fact holds regardless of the number of flows.
+ *
+ * Putting these two facts together, this commit stably merges the
+ * bfq_queues associated with these I/O flows, i.e., with the
+ * processes that generate these I/O flows, regardless of how many
+ * processes are involved.
+ *
+ * To decide whether a set of bfq_queues is actually associated with
+ * the I/O flows of a common application or task, and to merge these
+ * queues stably, this function operates as follows: given a bfq_queue,
+ * say Q2, currently being created, and the last bfq_queue, say Q1,
+ * created before Q2, Q2 is merged stably with Q1 if
+ * - very little time has elapsed since when Q1 was created
+ * - Q2 has the same ioprio as Q1
+ * - Q2 belongs to the same group as Q1
+ *
+ * Merging bfq_queues also reduces scheduling overhead. A fio test
+ * with ten random readers on /dev/nullb shows a throughput boost of
+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of
+ * the total per-request processing time, the above throughput boost
+ * implies that BFQ's overhead is reduced by more than 50%.
+ *
+ * This new mechanism most certainly obsoletes the current
+ * burst-handling heuristics. We keep those heuristics for the moment.
+ */
+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
+                                                     struct bfq_queue *bfqq,
+                                                     struct bfq_io_cq *bic)
+{
+       struct bfq_queue **source_bfqq = bfqq->entity.parent ?
+               &bfqq->entity.parent->last_bfqq_created :
+               &bfqd->last_bfqq_created;
+
+       struct bfq_queue *last_bfqq_created = *source_bfqq;
+
+       /*
+        * If last_bfqq_created has not been set yet, then init it. If
+        * it has been set already, but too long ago, then move it
+        * forward to bfqq. Finally, move also if bfqq belongs to a
+        * forward to bfqq. Finally, also move it if bfqq belongs to a
+        * different ioprio or ioprio_class. If none of these
+        * conditions holds true, then try an early stable merge or
+        * schedule a delayed stable merge.
+        *
+        * A delayed merge is scheduled (instead of performing an
+        * early merge), in case bfqq might soon prove to be more
+        * throughput-beneficial if not merged. Currently this is
+        * possible only if bfqd is rotational with no queueing. For
+        * such a drive, not merging bfqq is better for throughput if
+        * bfqq happens to contain sequential I/O. So, we wait a
+        * little bit for enough I/O to flow through bfqq. After that,
+        * if such an I/O is sequential, then the merge is
+        * canceled. Otherwise the merge is finally performed.
+        */
+       if (!last_bfqq_created ||
+           time_before(last_bfqq_created->creation_time +
+                       bfqd->bfq_burst_interval,
+                       bfqq->creation_time) ||
+               bfqq->entity.parent != last_bfqq_created->entity.parent ||
+               bfqq->ioprio != last_bfqq_created->ioprio ||
+               bfqq->ioprio_class != last_bfqq_created->ioprio_class)
+               *source_bfqq = bfqq;
+       else if (time_after_eq(last_bfqq_created->creation_time +
+                                bfqd->bfq_burst_interval,
+                                bfqq->creation_time)) {
+               if (likely(bfqd->nonrot_with_queueing))
+                       /*
+                        * With this type of drive, leaving
+                        * bfqq alone may provide no
+                        * throughput benefits compared with
+                        * merging bfqq. So merge bfqq now.
+                        */
+                       bfqq = bfq_do_early_stable_merge(bfqd, bfqq,
+                                                        bic,
+                                                        last_bfqq_created);
+               else { /* schedule tentative stable merge */
+                       /*
+                        * get reference on last_bfqq_created,
+                        * to prevent it from being freed,
+                        * until we decide whether to merge
+                        */
+                       last_bfqq_created->ref++;
+                       /*
+                        * need to keep track of stable refs, to
+                        * compute process refs correctly
+                        */
+                       last_bfqq_created->stable_ref++;
+                       /*
+                        * Record the bfqq to merge to.
+                        */
+                       bic->stable_merge_bfqq = last_bfqq_created;
+               }
+       }
+
+       return bfqq;
+}
+
+
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
                                       struct bio *bio, bool is_sync,
-                                      struct bfq_io_cq *bic)
+                                      struct bfq_io_cq *bic,
+                                      bool respawn)
 {
        const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
        const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
@@ -5271,7 +5555,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
 out:
        bfqq->ref++; /* get a process reference to this queue */
-       bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+
+       if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn)
+               bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic);
+
        rcu_read_unlock();
        return bfqq;
 }
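
The decision taken by bfq_do_or_sched_stable_merge() can be summarized as a pure function over creation time, group and priority. The sketch below does that under simplifying assumptions (plain integer timestamps instead of jiffies, an integer parent id instead of the entity pointer, and none of the surrounding sync/oom checks); it is illustrative only, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

enum stable_merge_action {
        UPDATE_LAST_CREATED,    /* just remember this queue as the newest  */
        EARLY_MERGE,            /* merge right away (fast queueing device) */
        SCHEDULE_DELAYED_MERGE  /* take refs now, decide later             */
};

struct fq {
        unsigned long creation_time;
        int parent_id;
        int ioprio;
        int ioprio_class;
};

static enum stable_merge_action
decide(const struct fq *last, const struct fq *newq,
       unsigned long burst_interval, bool nonrot_with_queueing)
{
        if (!last ||
            last->creation_time + burst_interval < newq->creation_time ||
            last->parent_id != newq->parent_id ||
            last->ioprio != newq->ioprio ||
            last->ioprio_class != newq->ioprio_class)
                return UPDATE_LAST_CREATED;

        /* newq was created soon after last, same group and priority */
        return nonrot_with_queueing ? EARLY_MERGE : SCHEDULE_DELAYED_MERGE;
}

int main(void)
{
        struct fq last = { .creation_time = 100, .parent_id = 1, .ioprio = 4 };
        struct fq newq = { .creation_time = 110, .parent_id = 1, .ioprio = 4 };

        printf("%d\n", decide(&last, &newq, 50, true));  /* EARLY_MERGE */
        printf("%d\n", decide(&last, &newq, 50, false)); /* SCHEDULE_DELAYED_MERGE */
        newq.creation_time = 300;                        /* created too late */
        printf("%d\n", decide(&last, &newq, 50, true));  /* UPDATE_LAST_CREATED */
        return 0;
}
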
@@ -5521,7 +5808,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 {
        struct bfq_queue *bfqq = RQ_BFQQ(rq),
-               *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+               *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true,
+                                                RQ_BIC(rq));
        bool waiting, idle_timer_disabled = false;
 
        if (new_bfqq) {
@@ -5627,7 +5915,48 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
        spin_lock_irq(&bfqd->lock);
        bfqq = bfq_init_rq(rq);
-       if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
+
+       /*
+        * Reqs with at_head or passthrough flags set are to be put
+        * directly into the dispatch list. Additional case for putting rq
+        * directly into the dispatch queue: the only active
+        * bfq_queues are bfqq and either its waker bfq_queue or one
+        * of its woken bfq_queues. The rationale behind this
+        * additional condition is as follows:
+        * - consider a bfq_queue, say Q1, detected as a waker of
+        *   another bfq_queue, say Q2
+        * - by definition of a waker, Q1 blocks the I/O of Q2, i.e.,
+        *   some I/O of Q1 needs to be completed for new I/O of Q2
+        *   to arrive.  A notable example of a waker is journald
+        * - so, Q1 and Q2 are, in every respect, the queues of two
+        *   cooperating processes (or of two cooperating sets of
+        *   processes): the goal of Q1's I/O is doing what needs to
+        *   be done so that new Q2's I/O can finally be
+        *   issued. Therefore, if the service of Q1's I/O is delayed,
+        *   then Q2's I/O is delayed too.  Conversely, if Q2's I/O is
+        *   delayed, the goal of Q1's I/O is hindered.
+        * - as a consequence, if some I/O of Q1/Q2 arrives while
+        *   Q2/Q1 is the only queue in service, there is absolutely
+        *   no point in delaying the service of such an I/O. The
+        *   only possible result is a throughput loss
+        * - so, when the above condition holds, the best option is to
+        *   have the new I/O dispatched as soon as possible
+        * - the most effective and efficient way to attain the above
+        *   goal is to put the new I/O directly in the dispatch
+        *   list
+        * - as an additional restriction, Q1 and Q2 must be the only
+        *   busy queues for this commit to put the I/O of Q2/Q1 in
+        *   the dispatch list.  This is necessary, because, if also
+        *   other queues are waiting for service, then putting new
+        *   I/O directly in the dispatch list may evidently cause a
+        *   violation of service guarantees for the other queues
+        */
+       if (!bfqq ||
+           (bfqq != bfqd->in_service_queue &&
+            bfqd->in_service_queue != NULL &&
+            bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) &&
+            (bfqq->waker_bfqq == bfqd->in_service_queue ||
+             bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) {
                if (at_head)
                        list_add(&rq->queuelist, &bfqd->dispatch);
                else
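
The new condition above can be read as a predicate: dispatch directly when no bfq_queue is associated with the request, when the request is at_head, or when the only busy queues are bfqq and the in-service queue and one is the other's waker. A toy version of that predicate follows, with simplified, hypothetical types in place of the kernel structures.

#include <stdbool.h>
#include <stdio.h>

struct fq { bool busy; struct fq *waker; };

static bool dispatch_directly(const struct fq *bfqq,
                              const struct fq *in_service,
                              int tot_busy_queues, bool at_head)
{
        if (!bfqq || at_head)
                return true;
        /*
         * Only bfqq and the in-service queue are busy, and one wakes
         * the other: delaying this request could only hurt both.
         */
        return bfqq != in_service && in_service != NULL &&
               tot_busy_queues == 1 + (bfqq->busy ? 1 : 0) &&
               (bfqq->waker == in_service || in_service->waker == bfqq);
}

int main(void)
{
        struct fq journald = { .busy = true };          /* in service */
        struct fq woken = { .busy = false, .waker = &journald };

        /* Only journald is busy and it wakes 'woken': dispatch now. */
        printf("%d\n", dispatch_directly(&woken, &journald, 1, false));

        /* A second busy queue exists: keep normal scheduling instead. */
        printf("%d\n", dispatch_directly(&woken, &journald, 2, false));
        return 0;
}
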
@@ -5767,7 +6096,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
                        1UL<<(BFQ_RATE_SHIFT - 10))
                bfq_update_rate_reset(bfqd, NULL);
        bfqd->last_completion = now_ns;
-       bfqd->last_completed_rq_bfqq = bfqq;
+       /*
+        * Shared queues are likely to receive I/O at a high
+        * rate. This may deceptively let them be considered as wakers
+        * of other queues. But a false waker will unjustly steal
+        * bandwidth from its supposedly woken queue. So considering
+        * shared queues in the waking mechanism too may cause more
+        * control trouble than throughput benefit. Hence do not set
+        * last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
+        */
+       if (!bfq_bfqq_coop(bfqq))
+               bfqd->last_completed_rq_bfqq = bfqq;
 
        /*
         * If we are waiting to discover whether the request pattern
@@ -6124,7 +6463,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 
        if (bfqq)
                bfq_put_queue(bfqq);
-       bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+       bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
 
        bic_set_bfqq(bic, bfqq, is_sync);
        if (split && is_sync) {
@@ -6245,8 +6584,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
 
        if (likely(!new_queue)) {
                /* If the queue was seeky for too long, break it apart. */
-               if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
-                       bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+               if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
+                       !bic->stably_merged) {
+                       struct bfq_queue *old_bfqq = bfqq;
 
                        /* Update bic before losing reference to bfqq */
                        if (bfq_bfqq_in_large_burst(bfqq))
@@ -6255,11 +6595,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
                        bfqq = bfq_split_bfqq(bic, bfqq);
                        split = true;
 
-                       if (!bfqq)
+                       if (!bfqq) {
                                bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
                                                                 true, is_sync,
                                                                 NULL);
-                       else
+                               bfqq->waker_bfqq = old_bfqq->waker_bfqq;
+                               bfqq->tentative_waker_bfqq = NULL;
+
+                               /*
+                                * If the waker queue disappears, then
+                                * new_bfqq->waker_bfqq must be
+                                * reset. So insert new_bfqq into the
+                                * woken_list of the waker. See
+                                * bfq_check_waker for details.
+                                */
+                               if (bfqq->waker_bfqq)
+                                       hlist_add_head(&bfqq->woken_list_node,
+                                                      &bfqq->waker_bfqq->woken_list);
+                       } else
                                bfqq_already_existing = true;
                }
        }