X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=block%2Fbfq-iosched.c;h=7c0b7f60811cfe54e76cda5fb2fd4eadd3ceef30;hb=0d52af590552473666da5b6111e7182d6cd23f92;hp=bcb6d21baf1269becc997c50a72bd53fd5e20c4c;hpb=062076e861e3e2bf3cafc9313efa34fad7c827e5;p=linux-2.6-microblaze.git diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bcb6d21baf12..7c0b7f60811c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = HZ / 8; +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beggining of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator. After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; + static struct kmem_cache *bfq_pool; /* Below this threshold (in ns), we consider thinktime immediate. */ @@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool; #define BFQQ_SEEK_THR (sector_t)(8 * 100) #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) /* Min number of samples required to perform peak-rate update */ #define BFQ_RATE_MIN_SAMPLES 32 @@ -444,6 +458,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, return bfqq; } +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ + return bfqq->service_from_backlogged > 0 && + time_is_before_jiffies(bfqq->first_IO_time + + bfq_merge_time_limit); +} + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; @@ -454,6 +475,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->pos_root = NULL; } + /* + * bfqq cannot be merged any longer (see comments in + * bfq_setup_cooperator): no point in adding bfqq into the + * position tree. + */ + if (bfq_too_late_for_merging(bfqq)) + return; + if (bfq_class_idle(bfqq)) return; if (!bfqq->next_rq) @@ -1627,6 +1656,8 @@ static void bfq_remove_request(struct request_queue *q, rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } + } else { + bfq_pos_tree_add_move(bfqd, bfqq); } if (rq->cmd_flags & REQ_META) @@ -1933,6 +1964,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { + if (bfq_too_late_for_merging(new_bfqq)) + return false; + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || (bfqq->ioprio_class != new_bfqq->ioprio_class)) return false; @@ -1956,20 +1990,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, return true; } -/* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ - return bfqq->wr_coeff > 1 && - time_is_before_jiffies(bfqq->last_wr_start_finish + - msecs_to_jiffies(100)); -} - /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return @@ -1983,11 +2003,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - * * WARNING: queue merging may impair fairness among non-weight raised * queues, for at least two reasons: 1) the original weight of a * merged queue may change during the merged state, 2) even being the @@ -2001,12 +2016,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* + * Prevent bfqq from being merged if it has been created too + * long ago. The idea is that true cooperating processes, and + * thus their associated bfq_queues, are supposed to be + * created shortly after each other. This is the case, e.g., + * for KVM/QEMU and dump I/O threads. Basing on this + * assumption, the following filtering greatly reduces the + * probability that two non-cooperating processes, which just + * happen to do close I/O for some short time interval, have + * their queues merged by mistake. + */ + if (bfq_too_late_for_merging(bfqq)) + return NULL; + if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || - wr_from_too_long(bfqq) || - unlikely(bfqq == &bfqd->oom_bfqq)) + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; /* If there is only one backlogged queue, don't search. */ @@ -2015,12 +2042,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, in_service_bfqq = bfqd->in_service_queue; - if (!in_service_bfqq || in_service_bfqq == bfqq - || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + if (in_service_bfqq && in_service_bfqq != bfqq && + likely(in_service_bfqq != &bfqd->oom_bfqq) && + bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && bfqq->entity.parent == in_service_bfqq->entity.parent && bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2032,12 +2056,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ -check_scheduled: new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, bfq_io_struct_pos(io_struct, request)); - if (new_bfqq && !wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -2062,7 +2084,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && - !bfq_bfqq_in_large_burst(bfqq))) { + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { /* * bfqq being merged right after being created: bfqq * would have deserved interactive weight raising, but @@ -2917,45 +2940,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, * whereas soft_rt_next_start is set to infinity for applications that do * not. * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. + * + * (a) Current time plus: (1) the maximum time for which the arrival + * of a request is waited for when a sync queue becomes idle, + * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We + * postpone for a moment the reason for adding a few extra + * jiffies; we get back to it after next item (b). Lower-bounding + * the return value of this function with the current time plus + * bfqd->bfq_slice_idle tends to filter out greedy applications, + * because the latter issue their next request as soon as possible + * after the last one has been completed. In contrast, a soft + * real-time application spends some time processing data, after a + * batch of its requests has been completed. * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - * HZ=100. + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + * above, greedy applications may happen to meet both the + * bandwidth and isochrony requirements under heavy CPU or + * storage-device load. In more detail, in these scenarios, these + * applications happen, only for limited time periods, to do I/O + * slowly enough to meet all the requirements described so far, + * including the filtering in above item (a). These slow-speed + * time intervals are usually interspersed between other time + * intervals during which these applications do I/O at a very high + * speed. Fortunately, exactly because of the high speed of the + * I/O in the high-speed intervals, the values returned by this + * function happen to be so high, near the end of any such + * high-speed interval, to be likely to fall *after* the end of + * the low-speed time interval that follows. These high values are + * stored in bfqq->soft_rt_next_start after each invocation of + * this function. As a consequence, if the last value of + * bfqq->soft_rt_next_start is constantly used to lower-bound the + * next value that this function may return, then, from the very + * beginning of a low-speed interval, bfqq->soft_rt_next_start is + * likely to be constantly kept so high that any I/O request + * issued during the low-speed interval is considered as arriving + * to soon for the application to be deemed as soft + * real-time. Then, in the high-speed interval that follows, the + * application will not be deemed as soft real-time, just because + * it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + * devices with HZ=100. The time granularity may be so coarse + * that the approximation, in jiffies, of bfqd->bfq_slice_idle + * is rather lower than the exact value. * 2) jiffies, instead of increasing at a constant rate, may stop increasing * for a while, then suddenly 'jump' by several units to recover the lost * increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines. */ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); + return max3(bfqq->soft_rt_next_start, + bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } /** @@ -2999,17 +3064,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, */ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - /* - * Increase service_from_backlogged before next statement, - * because the possible next invocation of - * bfq_bfqq_charge_time would likely inflate - * entity->service. In contrast, service_from_backlogged must - * contain real service, to enable the soft real-time - * heuristic to correctly compute the bandwidth consumed by - * bfqq. - */ - bfqq->service_from_backlogged += entity->service; - /* * As above explained, charge slow (typically seeky) and * timed-out queues with the time and not the service @@ -3689,35 +3743,16 @@ exit: return rq; } -static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; -#endif - - spin_lock_irq(&bfqd->lock); - #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - - rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - -#else - rq = __bfq_dispatch_request(hctx); -#endif - spin_unlock_irq(&bfqd->lock); +static void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) +{ + struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) - return rq; + return; /* * rq and bfqq are guaranteed to exist until this function @@ -3732,7 +3767,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(hctx->queue->queue_lock); + spin_lock_irq(q->queue_lock); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -3751,9 +3786,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(hctx->queue->queue_lock); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) {} #endif +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled; + + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + + rq = __bfq_dispatch_request(hctx); + + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + + spin_unlock_irq(&bfqd->lock); + + bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, + idle_timer_disabled); + return rq; } @@ -4002,10 +4065,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->split_time = bfq_smallest_from_now(); /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. + * To not forget the possibly high bandwidth consumed by a + * process/queue in the recent past, + * bfq_bfqq_softrt_next_start() returns a value at least equal + * to the current value of bfqq->soft_rt_next_start (see + * comments on bfq_bfqq_softrt_next_start). Set + * soft_rt_next_start to now, to mean that bfqq has consumed + * no bandwidth so far. */ - bfqq->soft_rt_next_start = bfq_greatest_from_now(); + bfqq->soft_rt_next_start = jiffies; /* first request is almost certainly seeky */ bfqq->seek_history = 1; @@ -4276,16 +4344,46 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) return idle_timer_disabled; } +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) +{ + if (!bfqq) + return; + + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instructions. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) {} +#endif + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *bfqq = RQ_BFQQ(rq); bool idle_timer_disabled = false; unsigned int cmd_flags; -#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4304,7 +4402,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4312,9 +4409,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * redirected into a new queue. */ bfqq = RQ_BFQQ(rq); -#else - __bfq_insert_request(bfqd, rq); -#endif if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4323,35 +4417,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request * merge). */ cmd_flags = rq->cmd_flags; -#endif + spin_unlock_irq(&bfqd->lock); -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* - * bfqq still exists, because it can disappear only after - * either it is merged with another queue, or the process it - * is associated with exits. But both actions must be taken by - * the same process currently executing this flow of - * instruction. - * - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. - */ - spin_lock_irq(q->queue_lock); - bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); - if (idle_timer_disabled) - bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(q->queue_lock); -#endif + bfq_update_insert_stats(q, bfqq, idle_timer_disabled, + cmd_flags); } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4817,6 +4893,9 @@ static void bfq_exit_queue(struct elevator_queue *e) hrtimer_cancel(&bfqd->idle_slice_timer); + /* release oom-queue reference to root group */ + bfqg_and_blkg_put(bfqd->root_group); + #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else