obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
-mq-deadline-y += mq-deadline-main.o
-mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
- obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
+ obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
- /* some drivers (floppy) instantiate a queue w/o disk registered */
- if (blkg->q->backing_dev_info->dev)
- return bdi_dev_name(blkg->q->backing_dev_info);
- return NULL;
+ if (!blkg->q->disk || !blkg->q->disk->bdi->dev)
+ return NULL;
+ return bdi_dev_name(blkg->q->disk->bdi);
}
/**
struct blkcg_gq *parent = blkg->parent;
struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur, delta;
+ unsigned long flags;
unsigned int seq;
/* fetch the current per-cpu values */
} while (u64_stats_fetch_retry(&bisc->sync, seq));
/* propagate percpu delta to global */
- u64_stats_update_begin(&blkg->iostat.sync);
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
blkg_iostat_set(&delta, &cur);
blkg_iostat_sub(&delta, &bisc->last);
blkg_iostat_add(&blkg->iostat.cur, &delta);
blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end(&blkg->iostat.sync);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
/* propagate global delta to parent (unless that's root) */
if (parent && parent->parent) {
- u64_stats_update_begin(&parent->iostat.sync);
+ flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
blkg_iostat_set(&delta, &blkg->iostat.cur);
blkg_iostat_sub(&delta, &blkg->iostat.last);
blkg_iostat_add(&parent->iostat.cur, &delta);
blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end(&parent->iostat.sync);
+ u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
}
}
memset(&tmp, 0, sizeof(tmp));
for_each_possible_cpu(cpu) {
struct disk_stats *cpu_dkstats;
+ unsigned long flags;
cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
tmp.ios[BLKG_IOSTAT_READ] +=
tmp.bytes[BLKG_IOSTAT_DISCARD] +=
cpu_dkstats->sectors[STAT_DISCARD] << 9;
- u64_stats_update_begin(&blkg->iostat.sync);
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
blkg_iostat_set(&blkg->iostat.cur, &tmp);
- u64_stats_update_end(&blkg->iostat.sync);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
}
}
- static int blkcg_print_stat(struct seq_file *sf, void *v)
+ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
- struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
- struct blkcg_gq *blkg;
-
- if (!seq_css(sf)->parent)
- blkcg_fill_root_iostats();
- else
- cgroup_rstat_flush(blkcg->css.cgroup);
+ struct blkg_iostat_set *bis = &blkg->iostat;
+ u64 rbytes, wbytes, rios, wios, dbytes, dios;
+ bool has_stats = false;
+ const char *dname;
+ unsigned seq;
+ int i;
- rcu_read_lock();
+ if (!blkg->online)
+ return;
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkg_iostat_set *bis = &blkg->iostat;
- const char *dname;
- char *buf;
- u64 rbytes, wbytes, rios, wios, dbytes, dios;
- size_t size = seq_get_buf(sf, &buf), off = 0;
- int i;
- bool has_stats = false;
- unsigned seq;
+ dname = blkg_dev_name(blkg);
+ if (!dname)
+ return;
- spin_lock_irq(&blkg->q->queue_lock);
+ seq_printf(s, "%s ", dname);
- if (!blkg->online)
- goto skip;
+ do {
+ seq = u64_stats_fetch_begin(&bis->sync);
- dname = blkg_dev_name(blkg);
- if (!dname)
- goto skip;
+ rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+ wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+ dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+ rios = bis->cur.ios[BLKG_IOSTAT_READ];
+ wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+ dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+ } while (u64_stats_fetch_retry(&bis->sync, seq));
- /*
- * Hooray string manipulation, count is the size written NOT
- * INCLUDING THE \0, so size is now count+1 less than what we
- * had before, but we want to start writing the next bit from
- * the \0 so we only add count to buf.
- */
- off += scnprintf(buf+off, size-off, "%s ", dname);
+ if (rbytes || wbytes || rios || wios) {
+ has_stats = true;
+ seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+ rbytes, wbytes, rios, wios,
+ dbytes, dios);
+ }
- do {
- seq = u64_stats_fetch_begin(&bis->sync);
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+ has_stats = true;
+ seq_printf(s, " use_delay=%d delay_nsec=%llu",
+ atomic_read(&blkg->use_delay),
+ atomic64_read(&blkg->delay_nsec));
+ }
- rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
- wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
- dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
- rios = bis->cur.ios[BLKG_IOSTAT_READ];
- wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
- dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
- } while (u64_stats_fetch_retry(&bis->sync, seq));
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
- if (rbytes || wbytes || rios || wios) {
- has_stats = true;
- off += scnprintf(buf+off, size-off,
- "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
- rbytes, wbytes, rios, wios,
- dbytes, dios);
- }
+ if (!blkg->pd[i] || !pol->pd_stat_fn)
+ continue;
- if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+ if (pol->pd_stat_fn(blkg->pd[i], s))
has_stats = true;
- off += scnprintf(buf+off, size-off,
- " use_delay=%d delay_nsec=%llu",
- atomic_read(&blkg->use_delay),
- (unsigned long long)atomic64_read(&blkg->delay_nsec));
- }
+ }
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
- size_t written;
+ if (has_stats)
+ seq_printf(s, "\n");
+ }
- if (!blkg->pd[i] || !pol->pd_stat_fn)
- continue;
+ static int blkcg_print_stat(struct seq_file *sf, void *v)
+ {
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct blkcg_gq *blkg;
- written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
- if (written)
- has_stats = true;
- off += written;
- }
+ if (!seq_css(sf)->parent)
+ blkcg_fill_root_iostats();
+ else
+ cgroup_rstat_flush(blkcg->css.cgroup);
- if (has_stats) {
- if (off < size - 1) {
- off += scnprintf(buf+off, size-off, "\n");
- seq_commit(sf, off);
- } else {
- seq_commit(sf, -1);
- }
- }
- skip:
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ spin_lock_irq(&blkg->q->queue_lock);
+ blkcg_print_one_stat(blkg, sf);
spin_unlock_irq(&blkg->q->queue_lock);
}
-
rcu_read_unlock();
return 0;
}
*/
#include <linux/kernel.h>
#include <linux/module.h>
- #include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
- refcount_set(&rq->ref, 1);
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
/* for synchronous bio-based driver finish in-flight integrity i/o */
blk_flush_integrity();
- /* @q won't process any more request, flush async actions */
- del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
blk_sync_queue(q);
-
if (queue_is_mq(q))
blk_mq_exit_queue(q);
if (ret)
goto fail_id;
- q->backing_dev_info = bdi_alloc(node_id);
- if (!q->backing_dev_info)
- goto fail_split;
-
q->stats = blk_alloc_queue_stats();
if (!q->stats)
- goto fail_stats;
+ goto fail_split;
q->node = node_id;
atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
- timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
- laptop_mode_timer_fn, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
if (percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
- goto fail_bdi;
+ goto fail_stats;
if (blkcg_init_queue(q))
goto fail_ref;
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
- fail_bdi:
- blk_free_queue_stats(q->stats);
fail_stats:
- bdi_put(q->backing_dev_info);
+ blk_free_queue_stats(q->stats);
fail_split:
bioset_exit(&q->bio_split);
fail_id:
kfree(iocg);
}
- static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
+ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
struct ioc_gq *iocg = pd_to_iocg(pd);
struct ioc *ioc = iocg->ioc;
- size_t pos = 0;
if (!ioc->enabled)
- return 0;
+ return false;
if (iocg->level == 0) {
unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
ioc->vtime_base_rate * 10000,
VTIME_PER_USEC);
- pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
- vp10k / 100, vp10k % 100);
+ seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
}
- pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
- iocg->last_stat.usage_us);
+ seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
if (blkcg_debug_stats)
- pos += scnprintf(buf + pos, size - pos,
- " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
- iocg->last_stat.wait_us,
- iocg->last_stat.indebt_us,
- iocg->last_stat.indelay_us);
-
- return pos;
+ seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
+ iocg->last_stat.wait_us,
+ iocg->last_stat.indebt_us,
+ iocg->last_stat.indelay_us);
+ return true;
}
static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
return -EINVAL;
- spin_lock(&blkcg->lock);
+ spin_lock_irq(&blkcg->lock);
iocc->dfl_weight = v * WEIGHT_ONE;
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct ioc_gq *iocg = blkg_to_iocg(blkg);
if (iocg) {
- spin_lock_irq(&iocg->ioc->lock);
+ spin_lock(&iocg->ioc->lock);
ioc_now(iocg->ioc, &now);
weight_updated(iocg, &now);
- spin_unlock_irq(&iocg->ioc->lock);
+ spin_unlock(&iocg->ioc->lock);
}
}
- spin_unlock(&blkcg->lock);
+ spin_unlock_irq(&blkcg->lock);
return nbytes;
}
enable = iolatency_set_min_lat_nsec(blkg, lat_val);
if (enable) {
- WARN_ON_ONCE(!blk_get_queue(blkg->q));
+ if (!blk_get_queue(blkg->q)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
blkg_get(blkg);
}
return 0;
}
- static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
- size_t size)
+ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{
struct latency_stat stat;
int cpu;
preempt_enable();
if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
- (unsigned long long)stat.ps.missed,
- (unsigned long long)stat.ps.total);
- return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
- (unsigned long long)stat.ps.missed,
- (unsigned long long)stat.ps.total,
- iolat->rq_depth.max_depth);
+ seq_printf(s, " missed=%llu total=%llu depth=max",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total);
+ else
+ seq_printf(s, " missed=%llu total=%llu depth=%u",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total,
+ iolat->rq_depth.max_depth);
+ return true;
}
- static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
- size_t size)
+ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
unsigned long long avg_lat;
unsigned long long cur_win;
if (!blkcg_debug_stats)
- return 0;
+ return false;
if (iolat->ssd)
- return iolatency_ssd_stat(iolat, buf, size);
+ return iolatency_ssd_stat(iolat, s);
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
- avg_lat, cur_win);
-
- return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
- iolat->rq_depth.max_depth, avg_lat, cur_win);
+ seq_printf(s, " depth=max avg_lat=%llu win=%llu",
+ avg_lat, cur_win);
+ else
+ seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
+ iolat->rq_depth.max_depth, avg_lat, cur_win);
+ return true;
}
-
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
struct request_queue *q,
struct blkcg *blkcg)
__blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
- laptop_io_completion(q->backing_dev_info);
+ laptop_io_completion(q->disk->bdi);
rq_qos_done(q, rq);
* This is probably worse than completing the request on a different
* cache domain.
*/
- if (force_irqthreads)
+ if (force_irqthreads())
return false;
/* same CPU or cache domain? Complete locally */
void blk_mq_put_rq_ref(struct request *rq)
{
- if (is_flush_rq(rq, rq->mq_hctx))
+ if (is_flush_rq(rq))
rq->end_io(rq, 0);
else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
unsigned long *next = priv;
/*
- * Just do a quick check if it is expired before locking the request in
- * so we're not unnecessarilly synchronizing across CPUs.
- */
- if (!blk_mq_req_expired(rq, next))
- return true;
-
- /*
- * We have reason to believe the request may be expired. Take a
- * reference on the request to lock this request lifetime into its
- * currently allocated context to prevent it from being reallocated in
- * the event the completion by-passes this timeout handler.
- *
- * If the reference was already released, then the driver beat the
- * timeout handler to posting a natural completion.
- */
- if (!refcount_inc_not_zero(&rq->ref))
- return true;
-
- /*
- * The request is now locked and cannot be reallocated underneath the
- * timeout handler's processing. Re-verify this exact request is truly
- * expired; if it is not expired, then the request was completed and
- * reallocated as a new request.
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
+ * be reallocated underneath the timeout handler's processing, then
+ * the expire check is reliable. If the request is not expired, then
+ * it was completed and reallocated as a new request after returning
+ * from blk_mq_check_expired().
*/
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
-
- blk_mq_put_rq_ref(rq);
return true;
}
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared)
+ if (shared) {
hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
- else
+ } else {
+ blk_mq_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
+ }
}
}
}
EXPORT_SYMBOL(blk_mq_init_queue);
- struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
+ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+ struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
if (IS_ERR(q))
return ERR_CAST(q);
- disk = __alloc_disk_node(0, set->numa_node);
+ disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_cleanup_queue(q);
return ERR_PTR(-ENOMEM);
}
- disk->queue = q;
return disk;
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);
kobject_get(&q->kobj);
}
-static inline bool
-is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
-{
- return hctx->fq->flush_rq == req;
-}
+bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
gfp_t flags);
bip_next->bip_vec[0].bv_offset);
}
- void blk_integrity_add(struct gendisk *);
+ int blk_integrity_add(struct gendisk *disk);
void blk_integrity_del(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline bool blk_integrity_merge_rq(struct request_queue *rq,
static inline void bio_integrity_free(struct bio *bio)
{
}
- static inline void blk_integrity_add(struct gendisk *disk)
+ static inline int blk_integrity_add(struct gendisk *disk)
{
+ return 0;
}
static inline void blk_integrity_del(struct gendisk *disk)
{
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
+ extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
+ static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
int blk_alloc_ext_minor(void);
void blk_free_ext_minor(unsigned int minor);
- char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
- int bdev_add_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length);
- int bdev_del_partition(struct block_device *bdev, int partno);
- int bdev_resize_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length);
+ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length);
+ int bdev_del_partition(struct gendisk *disk, int partno);
+ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length);
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
struct request_queue *blk_alloc_queue(int node_id);
- void disk_alloc_events(struct gendisk *disk);
+ int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ * for the blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-debugfs.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+/*
+ * See Documentation/block/deadline-iosched.rst
+ */
+static const int read_expire = HZ / 2; /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2; /* max times reads can starve a write */
+static const int fifo_batch = 16; /* # of sequential requests treated as one
+ by the above parameters. For throughput. */
+
+enum dd_data_dir {
+ DD_READ = READ,
+ DD_WRITE = WRITE,
+};
+
+enum { DD_DIR_COUNT = 2 };
+
+enum dd_prio {
+ DD_RT_PRIO = 0,
+ DD_BE_PRIO = 1,
+ DD_IDLE_PRIO = 2,
+ DD_PRIO_MAX = 2,
+};
+
+enum { DD_PRIO_COUNT = 3 };
+
+/* I/O statistics per I/O priority. */
+struct io_stats_per_prio {
+ local_t inserted;
+ local_t merged;
+ local_t dispatched;
+ local_t completed;
+};
+
+/* I/O statistics for all I/O priorities (enum dd_prio). */
+struct io_stats {
+ struct io_stats_per_prio stats[DD_PRIO_COUNT];
+};
+
+/*
+ * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
+ * present on both sort_list[] and fifo_list[].
+ */
+struct dd_per_prio {
+ struct list_head dispatch;
+ struct rb_root sort_list[DD_DIR_COUNT];
+ struct list_head fifo_list[DD_DIR_COUNT];
+ /* Next request in FIFO order. Read, write or both are NULL. */
+ struct request *next_rq[DD_DIR_COUNT];
+};
+
+struct deadline_data {
+ /*
+ * run time data
+ */
+
+ struct dd_per_prio per_prio[DD_PRIO_COUNT];
+
+ /* Data direction of latest dispatched request. */
+ enum dd_data_dir last_dir;
+ unsigned int batching; /* number of sequential requests made */
+ unsigned int starved; /* times reads have starved writes */
+
+ struct io_stats __percpu *stats;
+
+ /*
+ * settings that change how the i/o scheduler behaves
+ */
+ int fifo_expire[DD_DIR_COUNT];
+ int fifo_batch;
+ int writes_starved;
+ int front_merges;
+ u32 async_depth;
+
+ spinlock_t lock;
+ spinlock_t zone_lock;
+};
+
+/* Count one event of type 'event_type' and with I/O priority 'prio' */
+#define dd_count(dd, event_type, prio) do { \
+ struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
+ \
+ BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
+ BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
+ local_inc(&io_stats->stats[(prio)].event_type); \
+ put_cpu_ptr(io_stats); \
+} while (0)
+
+/*
+ * Returns the total number of dd_count(dd, event_type, prio) calls across all
+ * CPUs. No locking or barriers since it is fine if the returned sum is slightly
+ * outdated.
+ */
+#define dd_sum(dd, event_type, prio) ({ \
+ unsigned int cpu; \
+ u32 sum = 0; \
+ \
+ BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
+ BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
+ for_each_present_cpu(cpu) \
+ sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
+ stats[(prio)].event_type); \
+ sum; \
+})
+
+/* Maps an I/O priority class to a deadline scheduler priority. */
+static const enum dd_prio ioprio_class_to_prio[] = {
+ [IOPRIO_CLASS_NONE] = DD_BE_PRIO,
+ [IOPRIO_CLASS_RT] = DD_RT_PRIO,
+ [IOPRIO_CLASS_BE] = DD_BE_PRIO,
+ [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO,
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
+{
+ return &per_prio->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
+ * request.
+ */
+static u8 dd_rq_ioclass(struct request *rq)
+{
+ return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+ struct rb_node *node = rb_next(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
+{
+ struct rb_root *root = deadline_rb_root(per_prio, rq);
+
+ elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
+{
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+
+ if (per_prio->next_rq[data_dir] == rq)
+ per_prio->next_rq[data_dir] = deadline_latter_request(rq);
+
+ elv_rb_del(deadline_rb_root(per_prio, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q,
+ struct dd_per_prio *per_prio,
+ struct request *rq)
+{
+ list_del_init(&rq->queuelist);
+
+ /*
+ * We might not be on the rbtree, if we are doing an insert merge
+ */
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ deadline_del_rq_rb(per_prio, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+ enum elv_merge type)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(req);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ /*
+ * if the merge was a front merge, we need to reposition request
+ */
+ if (type == ELEVATOR_FRONT_MERGE) {
+ elv_rb_del(deadline_rb_root(per_prio, req), req);
+ deadline_add_rq_rb(per_prio, req);
+ }
+}
+
+/*
+ * Callback function that is invoked after @next has been merged into @req.
+ */
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(next);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+
+ dd_count(dd, merged, prio);
+
+ /*
+ * if next expires before rq, assign its expire time to rq
+ * and move into next position (next will be deleted) in fifo
+ */
+ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+ if (time_before((unsigned long)next->fifo_time,
+ (unsigned long)req->fifo_time)) {
+ list_move(&req->queuelist, &next->queuelist);
+ req->fifo_time = next->fifo_time;
+ }
+ }
+
+ /*
+ * kill knowledge of next, this one is a goner
+ */
+ deadline_remove_request(q, &dd->per_prio[prio], next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ struct request *rq)
+{
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+
+ per_prio->next_rq[data_dir] = deadline_latter_request(rq);
+
+ /*
+ * take it off the sort and fifo list
+ */
+ deadline_remove_request(rq->q, per_prio, rq);
+}
+
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+ return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
+{
+ struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
+
+ /*
+ * rq is expired!
+ */
+ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ if (list_empty(&per_prio->fifo_list[data_dir]))
+ return NULL;
+
+ rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
+ if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+ * Look for a write request that can be dispatched, that is one with
+ * an unlocked target zone.
+ */
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ goto out;
+ }
+ rq = NULL;
+out:
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ return rq;
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ rq = per_prio->next_rq[data_dir];
+ if (!rq)
+ return NULL;
+
+ if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+ * Look for a write request that can be dispatched, that is one with
+ * an unlocked target zone.
+ */
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ while (rq) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ break;
+ rq = deadline_latter_request(rq);
+ }
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ return rq;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct deadline_data *dd,
+ struct dd_per_prio *per_prio)
+{
+ struct request *rq, *next_rq;
+ enum dd_data_dir data_dir;
+ enum dd_prio prio;
+ u8 ioprio_class;
+
+ lockdep_assert_held(&dd->lock);
+
+ if (!list_empty(&per_prio->dispatch)) {
+ rq = list_first_entry(&per_prio->dispatch, struct request,
+ queuelist);
+ list_del_init(&rq->queuelist);
+ goto done;
+ }
+
+ /*
+ * batches are currently reads XOR writes
+ */
+ rq = deadline_next_request(dd, per_prio, dd->last_dir);
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request are still entitled to batch */
+ goto dispatch_request;
+
+ /*
+ * at this point we are not running a batch. select the appropriate
+ * data direction (read / write)
+ */
+
+ if (!list_empty(&per_prio->fifo_list[DD_READ])) {
+ BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
+
+ if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
+ (dd->starved++ >= dd->writes_starved))
+ goto dispatch_writes;
+
+ data_dir = DD_READ;
+
+ goto dispatch_find_request;
+ }
+
+ /*
+ * there are either no reads or writes have been starved
+ */
+
+ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
+dispatch_writes:
+ BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
+
+ dd->starved = 0;
+
+ data_dir = DD_WRITE;
+
+ goto dispatch_find_request;
+ }
+
+ return NULL;
+
+dispatch_find_request:
+ /*
+ * we are not running a batch, find best request for selected data_dir
+ */
+ next_rq = deadline_next_request(dd, per_prio, data_dir);
+ if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
+ /*
+ * A deadline has expired, the last request was in the other
+ * direction, or we have run out of higher-sectored requests.
+ * Start again from the request with the earliest expiry time.
+ */
+ rq = deadline_fifo_request(dd, per_prio, data_dir);
+ } else {
+ /*
+ * The last req was the same dir and we have a next request in
+ * sort order. No expired requests so continue on from here.
+ */
+ rq = next_rq;
+ }
+
+ /*
+ * For a zoned block device, if we only have writes queued and none of
+ * them can be dispatched, rq will be NULL.
+ */
+ if (!rq)
+ return NULL;
+
+ dd->last_dir = data_dir;
+ dd->batching = 0;
+
+dispatch_request:
+ /*
+ * rq is the selected appropriate request.
+ */
+ dd->batching++;
+ deadline_move_request(dd, per_prio, rq);
+done:
+ ioprio_class = dd_rq_ioclass(rq);
+ prio = ioprio_class_to_prio[ioprio_class];
+ dd_count(dd, dispatched, prio);
+ /*
+ * If the request needs its target zone locked, do it.
+ */
+ blk_req_zone_write_lock(rq);
+ rq->rq_flags |= RQF_STARTED;
+ return rq;
+}
+
+/*
+ * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
+ *
+ * One confusing aspect here is that we get called for a specific
+ * hardware queue, but we may return a request that is for a
+ * different hardware queue. This is because mq-deadline has shared
+ * state for all hardware queues, in terms of sorting, FIFOs, etc.
+ */
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+ enum dd_prio prio;
+
+ spin_lock(&dd->lock);
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+ rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
+ if (rq)
+ break;
+ }
+ spin_unlock(&dd->lock);
+
+ return rq;
+}
+
+/*
+ * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
+ * function is used by __blk_mq_get_tag().
+ */
+static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+ struct deadline_data *dd = data->q->elevator->elevator_data;
+
+ /* Do not throttle synchronous reads. */
+ if (op_is_sync(op) && !op_is_write(op))
+ return;
+
+ /*
+ * Throttle asynchronous requests and writes such that these requests
+ * do not block the allocation of synchronous requests.
+ */
+ data->shallow_depth = dd->async_depth;
+}
+
+/* Called by blk_mq_update_nr_requests(). */
+static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct blk_mq_tags *tags = hctx->sched_tags;
+
+ dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
+
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
+}
+
+/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
+static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ dd_depth_updated(hctx);
+ return 0;
+}
+
+static void dd_exit_sched(struct elevator_queue *e)
+{
+ struct deadline_data *dd = e->elevator_data;
+ enum dd_prio prio;
+
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
+ WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
+ }
+
+ free_percpu(dd->stats);
+
+ kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+ struct deadline_data *dd;
+ struct elevator_queue *eq;
+ enum dd_prio prio;
+ int ret = -ENOMEM;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return ret;
+
+ dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+ if (!dd)
+ goto put_eq;
+
+ eq->elevator_data = dd;
+
+ dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!dd->stats)
+ goto free_dd;
+
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ INIT_LIST_HEAD(&per_prio->dispatch);
+ INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
+ INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
+ per_prio->sort_list[DD_READ] = RB_ROOT;
+ per_prio->sort_list[DD_WRITE] = RB_ROOT;
+ }
+ dd->fifo_expire[DD_READ] = read_expire;
+ dd->fifo_expire[DD_WRITE] = write_expire;
+ dd->writes_starved = writes_starved;
+ dd->front_merges = 1;
+ dd->last_dir = DD_WRITE;
+ dd->fifo_batch = fifo_batch;
+ spin_lock_init(&dd->lock);
+ spin_lock_init(&dd->zone_lock);
+
+ q->elevator = eq;
+ return 0;
+
+free_dd:
+ kfree(dd);
+
+put_eq:
+ kobject_put(&eq->kobj);
+ return ret;
+}
+
+/*
+ * Try to merge @bio into an existing request. If @bio has been merged into
+ * an existing request, store the pointer to that request into *@rq.
+ */
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+ struct bio *bio)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+ sector_t sector = bio_end_sector(bio);
+ struct request *__rq;
+
+ if (!dd->front_merges)
+ return ELEVATOR_NO_MERGE;
+
+ __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
+ if (__rq) {
+ BUG_ON(sector != blk_rq_pos(__rq));
+
+ if (elv_bio_merge_ok(__rq, bio)) {
+ *rq = __rq;
++ if (blk_discard_mergable(__rq))
++ return ELEVATOR_DISCARD_MERGE;
+ return ELEVATOR_FRONT_MERGE;
+ }
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+/*
+ * Attempt to merge a bio into an existing request. This function is called
+ * before @bio is associated with a request.
+ */
+static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct request *free = NULL;
+ bool ret;
+
+ spin_lock(&dd->lock);
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
+ spin_unlock(&dd->lock);
+
+ if (free)
+ blk_mq_free_request(free);
+
+ return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+ u16 ioprio = req_get_ioprio(rq);
+ u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
+ struct dd_per_prio *per_prio;
+ enum dd_prio prio;
+ LIST_HEAD(free);
+
+ lockdep_assert_held(&dd->lock);
+
+ /*
+ * This may be a requeue of a write request that has locked its
+ * target zone. If it is the case, this releases the zone lock.
+ */
+ blk_req_zone_write_unlock(rq);
+
+ prio = ioprio_class_to_prio[ioprio_class];
+ dd_count(dd, inserted, prio);
+ rq->elv.priv[0] = (void *)(uintptr_t)1;
+
+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+ blk_mq_free_requests(&free);
+ return;
+ }
+
+ trace_block_rq_insert(rq);
+
+ per_prio = &dd->per_prio[prio];
+ if (at_head) {
+ list_add(&rq->queuelist, &per_prio->dispatch);
+ } else {
+ deadline_add_rq_rb(per_prio, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+
+ /*
+ * set expire time and add to fifo list
+ */
+ rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+ list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
+ }
+}
+
+/*
+ * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests().
+ */
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list, bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ spin_lock(&dd->lock);
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ dd_insert_request(hctx, rq, at_head);
+ }
+ spin_unlock(&dd->lock);
+}
+
+/* Callback from inside blk_mq_rq_ctx_init(). */
+static void dd_prepare_request(struct request *rq)
+{
+ rq->elv.priv[0] = NULL;
+}
+
+/*
+ * Callback from inside blk_mq_free_request().
+ *
+ * For zoned block devices, write unlock the target zone of
+ * completed write requests. Do this while holding the zone lock
+ * spinlock so that the zone is never unlocked while deadline_fifo_request()
+ * or deadline_next_request() are executing. This function is called for
+ * all requests, whether or not these requests complete successfully.
+ *
+ * For a zoned block device, __dd_dispatch_request() may have stopped
+ * dispatching requests if all the queued requests are write requests directed
+ * at zones that are already locked due to on-going write requests. To ensure
+ * write request dispatch progress in this case, mark the queue as needing a
+ * restart to ensure that the queue is run again after completion of the
+ * request and zones being unlocked.
+ */
+static void dd_finish_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(rq);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ /*
+ * The block layer core may call dd_finish_request() without having
+ * called dd_insert_requests(). Hence only update statistics for
+ * requests for which dd_insert_requests() has been called. See also
+ * blk_mq_request_bypass_insert().
+ */
+ if (rq->elv.priv[0])
+ dd_count(dd, completed, prio);
+
+ if (blk_queue_is_zoned(q)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ blk_req_zone_write_unlock(rq);
+ if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+ }
+}
+
+static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
+{
+ return !list_empty_careful(&per_prio->dispatch) ||
+ !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
+ !list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ enum dd_prio prio;
+
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++)
+ if (dd_has_work_for_prio(&dd->per_prio[prio]))
+ return true;
+
+ return false;
+}
+
+/*
+ * sysfs parts below
+ */
+#define SHOW_INT(__FUNC, __VAR) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct deadline_data *dd = e->elevator_data; \
+ \
+ return sysfs_emit(page, "%d\n", __VAR); \
+}
+#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
+SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
+SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
+SHOW_INT(deadline_front_merges_show, dd->front_merges);
+SHOW_INT(deadline_async_depth_show, dd->front_merges);
+SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
+#undef SHOW_INT
+#undef SHOW_JIFFIES
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{ \
+ struct deadline_data *dd = e->elevator_data; \
+ int __data, __ret; \
+ \
+ __ret = kstrtoint(page, 0, &__data); \
+ if (__ret < 0) \
+ return __ret; \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ *(__PTR) = __CONV(__data); \
+ return count; \
+}
+#define STORE_INT(__FUNC, __PTR, MIN, MAX) \
+ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
+#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \
+ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
+STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
+STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
+STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
+STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
+STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
+#undef STORE_FUNCTION
+#undef STORE_INT
+#undef STORE_JIFFIES
+
+#define DD_ATTR(name) \
+ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+ DD_ATTR(read_expire),
+ DD_ATTR(write_expire),
+ DD_ATTR(writes_starved),
+ DD_ATTR(front_merges),
+ DD_ATTR(async_depth),
+ DD_ATTR(fifo_batch),
+ __ATTR_NULL
+};
+
+#ifdef CONFIG_BLK_DEBUG_FS
+#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \
+static void *deadline_##name##_fifo_start(struct seq_file *m, \
+ loff_t *pos) \
+ __acquires(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ spin_lock(&dd->lock); \
+ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \
+} \
+ \
+static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
+ loff_t *pos) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \
+} \
+ \
+static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
+ __releases(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ \
+ spin_unlock(&dd->lock); \
+} \
+ \
+static const struct seq_operations deadline_##name##_fifo_seq_ops = { \
+ .start = deadline_##name##_fifo_start, \
+ .next = deadline_##name##_fifo_next, \
+ .stop = deadline_##name##_fifo_stop, \
+ .show = blk_mq_debugfs_rq_show, \
+}; \
+ \
+static int deadline_##name##_next_rq_show(void *data, \
+ struct seq_file *m) \
+{ \
+ struct request_queue *q = data; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ struct request *rq = per_prio->next_rq[data_dir]; \
+ \
+ if (rq) \
+ __blk_mq_debugfs_rq_show(m, rq); \
+ return 0; \
+}
+
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
+#undef DEADLINE_DEBUGFS_DDIR_ATTRS
+
+static int deadline_batching_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ seq_printf(m, "%u\n", dd->batching);
+ return 0;
+}
+
+static int deadline_starved_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ seq_printf(m, "%u\n", dd->starved);
+ return 0;
+}
+
+static int dd_async_depth_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ seq_printf(m, "%u\n", dd->async_depth);
+ return 0;
+}
+
+static int dd_queued_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
+ dd_queued(dd, DD_BE_PRIO),
+ dd_queued(dd, DD_IDLE_PRIO));
+ return 0;
+}
+
+/* Number of requests owned by the block driver for a given priority. */
+static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
+{
+ return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
+ - dd_sum(dd, completed, prio);
+}
+
+static int dd_owned_by_driver_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
+ dd_owned_by_driver(dd, DD_BE_PRIO),
+ dd_owned_by_driver(dd, DD_IDLE_PRIO));
+ return 0;
+}
+
+#define DEADLINE_DISPATCH_ATTR(prio) \
+static void *deadline_dispatch##prio##_start(struct seq_file *m, \
+ loff_t *pos) \
+ __acquires(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ spin_lock(&dd->lock); \
+ return seq_list_start(&per_prio->dispatch, *pos); \
+} \
+ \
+static void *deadline_dispatch##prio##_next(struct seq_file *m, \
+ void *v, loff_t *pos) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ return seq_list_next(v, &per_prio->dispatch, pos); \
+} \
+ \
+static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \
+ __releases(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ \
+ spin_unlock(&dd->lock); \
+} \
+ \
+static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
+ .start = deadline_dispatch##prio##_start, \
+ .next = deadline_dispatch##prio##_next, \
+ .stop = deadline_dispatch##prio##_stop, \
+ .show = blk_mq_debugfs_rq_show, \
+}
+
+DEADLINE_DISPATCH_ATTR(0);
+DEADLINE_DISPATCH_ATTR(1);
+DEADLINE_DISPATCH_ATTR(2);
+#undef DEADLINE_DISPATCH_ATTR
+
+#define DEADLINE_QUEUE_DDIR_ATTRS(name) \
+ {#name "_fifo_list", 0400, \
+ .seq_ops = &deadline_##name##_fifo_seq_ops}
+#define DEADLINE_NEXT_RQ_ATTR(name) \
+ {#name "_next_rq", 0400, deadline_##name##_next_rq_show}
+static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
+ DEADLINE_QUEUE_DDIR_ATTRS(read0),
+ DEADLINE_QUEUE_DDIR_ATTRS(write0),
+ DEADLINE_QUEUE_DDIR_ATTRS(read1),
+ DEADLINE_QUEUE_DDIR_ATTRS(write1),
+ DEADLINE_QUEUE_DDIR_ATTRS(read2),
+ DEADLINE_QUEUE_DDIR_ATTRS(write2),
+ DEADLINE_NEXT_RQ_ATTR(read0),
+ DEADLINE_NEXT_RQ_ATTR(write0),
+ DEADLINE_NEXT_RQ_ATTR(read1),
+ DEADLINE_NEXT_RQ_ATTR(write1),
+ DEADLINE_NEXT_RQ_ATTR(read2),
+ DEADLINE_NEXT_RQ_ATTR(write2),
+ {"batching", 0400, deadline_batching_show},
+ {"starved", 0400, deadline_starved_show},
+ {"async_depth", 0400, dd_async_depth_show},
+ {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
+ {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
+ {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
+ {"owned_by_driver", 0400, dd_owned_by_driver_show},
+ {"queued", 0400, dd_queued_show},
+ {},
+};
+#undef DEADLINE_QUEUE_DDIR_ATTRS
+#endif
+
+static struct elevator_type mq_deadline = {
+ .ops = {
+ .depth_updated = dd_depth_updated,
+ .limit_depth = dd_limit_depth,
+ .insert_requests = dd_insert_requests,
+ .dispatch_request = dd_dispatch_request,
+ .prepare_request = dd_prepare_request,
+ .finish_request = dd_finish_request,
+ .next_request = elv_rb_latter_request,
+ .former_request = elv_rb_former_request,
+ .bio_merge = dd_bio_merge,
+ .request_merge = dd_request_merge,
+ .requests_merged = dd_merged_requests,
+ .request_merged = dd_request_merged,
+ .has_work = dd_has_work,
+ .init_sched = dd_init_sched,
+ .exit_sched = dd_exit_sched,
+ .init_hctx = dd_init_hctx,
+ },
+
+#ifdef CONFIG_BLK_DEBUG_FS
+ .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
+#endif
+ .elevator_attrs = deadline_attrs,
+ .elevator_name = "mq-deadline",
+ .elevator_alias = "deadline",
+ .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
+ .elevator_owner = THIS_MODULE,
+};
+MODULE_ALIAS("mq-deadline-iosched");
+
+static int __init deadline_init(void)
+{
+ return elv_register(&mq_deadline);
+}
+
+static void __exit deadline_exit(void)
+{
+ elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
}
}
- num_sects = state->bdev->bd_inode->i_size >> 9;
+ num_sects = get_capacity(state->disk);
if ((ph[0]->config_start > num_sects) ||
((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
/**
* ldm_validate_tocblocks - Validate the table of contents and its backups
* @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
+ * @base: Offset, into @state->disk, of the database
* @ldb: Cache of the database structures
*
* Find and compare the four tables of contents of the LDM Database stored on
- * @state->bdev and return the parsed information into @toc1.
+ * @state->disk and return the parsed information into @toc1.
*
* The offsets and sizes of the configs are range-checked against a privhead.
*
* only likely to happen if the underlying device is strange. If that IS
* the case we should return zero to let someone else try.
*
- * Return: 'true' @state->bdev is a dynamic disk
- * 'false' @state->bdev is not a dynamic disk, or an error occurred
+ * Return: 'true' @state->disk is a dynamic disk
+ * 'false' @state->disk is not a dynamic disk, or an error occurred
*/
static bool ldm_validate_partition_table(struct parsed_partitions *state)
{
/**
* ldm_get_vblks - Read the on-disk database of VBLKs into memory
* @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
+ * @base: Offset, into @state->disk, of the database
* @ldb: Cache of the database structures
*
* To use the information from the VBLKs, they need to be read from the disk,
* example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
* and so on: the actual data containing partitions.
*
- * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
- * 0 Success, @state->bdev is not a dynamic disk
+ * Return: 1 Success, @state->disk is a dynamic disk and we handled it
+ * 0 Success, @state->disk is not a dynamic disk
* -1 An error occurred before enough information had been read
- * Or @state->bdev is a dynamic disk, but it may be corrupted
+ * Or @state->disk is a dynamic disk, but it may be corrupted
*/
int ldm_partition(struct parsed_partitions *state)
{
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
- if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- kfree(page_address(req->special_vec.bv_page) +
- req->special_vec.bv_offset);
- }
-
+ if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+ kfree(bvec_virt(&req->special_vec));
blk_mq_end_request(req, virtblk_result(vbr));
}
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
+static int virtblk_validate(struct virtio_device *vdev)
+{
+ u32 blk_size;
+
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ if (!virtio_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE))
+ return 0;
+
+ blk_size = virtio_cread32(vdev,
+ offsetof(struct virtio_blk_config, blk_size));
+
+ if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)
+ __virtio_clear_bit(vdev, VIRTIO_BLK_F_BLK_SIZE);
+
+ return 0;
+}
+
static int virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
u8 physical_block_exp, alignment_offset;
unsigned int queue_depth;
- if (!vdev->config->get) {
- dev_err(&vdev->dev, "%s failure: config access disabled\n",
- __func__);
- return -EINVAL;
- }
-
err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
GFP_KERNEL);
if (err < 0)
else
blk_size = queue_logical_block_size(q);
- goto err_cleanup_disk;
+ if (unlikely(blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)) {
+ dev_err(&vdev->dev,
+ "block size is changed unexpectedly, now is %u\n",
+ blk_size);
+ err = -EINVAL;
++ goto out_cleanup_disk;
+ }
+
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
virtblk_update_capacity(vblk, false);
virtio_device_ready(vdev);
- device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
+ err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
+ if (err)
+ goto out_cleanup_disk;
+
return 0;
- err_cleanup_disk:
+ out_cleanup_disk:
blk_cleanup_disk(vblk->disk);
out_free_tags:
blk_mq_free_tag_set(&vblk->tag_set);
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
+ .validate = virtblk_validate,
.probe = virtblk_probe,
.remove = virtblk_remove,
.config_changed = virtblk_config_changed,
static void dasd_eckd_store_conf_data(struct dasd_device *device,
struct dasd_conf_data *conf_data, int chp)
{
+ struct dasd_eckd_private *private = device->private;
struct channel_path_desc_fmt0 *chp_desc;
struct subchannel_id sch_id;
+ void *cdp;
- ccw_device_get_schid(device->cdev, &sch_id);
/*
* path handling and read_conf allocate data
* free it before replacing the pointer
+ * also replace the old private->conf_data pointer
+ * with the new one if this points to the same data
*/
- kfree(device->path[chp].conf_data);
+ cdp = device->path[chp].conf_data;
+ if (private->conf_data == cdp) {
+ private->conf_data = (void *)conf_data;
+ dasd_eckd_identify_conf_parts(private);
+ }
+ ccw_device_get_schid(device->cdev, &sch_id);
device->path[chp].conf_data = conf_data;
device->path[chp].cssid = sch_id.cssid;
device->path[chp].ssid = sch_id.ssid;
if (chp_desc)
device->path[chp].chpid = chp_desc->chpid;
kfree(chp_desc);
+ kfree(cdp);
}
static void dasd_eckd_clear_conf_data(struct dasd_device *device)
end_blk = (curr_trk + 1) * recs_per_trk;
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
for (off = 0; off < bv.bv_len; off += blksize) {
if (first_blk + blk_count >= end_blk) {
cqr->proc_bytes = blk_count * blksize;
last_rec - recid + 1, cmd, basedev, blksize);
}
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
if (dasd_page_cache) {
char *copy = kmem_cache_alloc(dasd_page_cache,
GFP_DMA | __GFP_NOWARN);
idaw_dst = NULL;
idaw_len = 0;
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
seg_len = bv.bv_len;
while (seg_len) {
if (new_track) {
new_track = 1;
recid = first_rec;
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
seg_len = bv.bv_len;
while (seg_len) {
if (new_track) {
}
} else {
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
last_tidaw = itcw_add_tidaw(itcw, 0x00,
dst, bv.bv_len);
if (IS_ERR(last_tidaw)) {
idaws = idal_create_words(idaws, rawpadpage, PAGE_SIZE);
}
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
seg_len = bv.bv_len;
if (cmd == DASD_ECKD_CCW_READ_TRACK)
memset(dst, 0, seg_len);
if (private->uses_cdl == 0 || recid > 2*blk_per_trk)
ccw++;
rq_for_each_segment(bv, req, iter) {
- dst = page_address(bv.bv_page) + bv.bv_offset;
+ dst = bvec_virt(&bv);
for (off = 0; off < bv.bv_len; off += blksize) {
/* Skip locate record. */
if (private->uses_cdl && recid <= 2*blk_per_trk)
static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG];
static DEFINE_SPINLOCK(sr_index_lock);
+ static struct lock_class_key sr_bio_compl_lkclass;
+
/* This semaphore is used to mediate the 0->1 reference get in the
* face of object destruction (i.e. we can't allow a get on an
* object after last put) */
else if (med->media_event_code == 2)
return DISK_EVENT_MEDIA_CHANGE;
else if (med->media_event_code == 3)
- return DISK_EVENT_EJECT_REQUEST;
+ return DISK_EVENT_MEDIA_CHANGE;
return 0;
}
kref_init(&cd->kref);
- disk = alloc_disk(1);
+ disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE,
+ &sr_bio_compl_lkclass);
if (!disk)
goto fail_free;
mutex_init(&cd->lock);
disk->major = SCSI_CDROM_MAJOR;
disk->first_minor = minor;
+ disk->minors = 1;
sprintf(disk->disk_name, "sr%d", minor);
disk->fops = &sr_bdops;
disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
set_capacity(disk, cd->capacity);
disk->private_data = &cd->driver;
- disk->queue = sdev->request_queue;
if (register_cdrom(disk, &cd->cdi))
goto fail_minor;
* struct address_space - Contents of a cacheable, mappable object.
* @host: Owner, either the inode or the block_device.
* @i_pages: Cached pages.
+ * @invalidate_lock: Guards coherency between page cache contents and
+ * file offset->disk block mappings in the filesystem during invalidates.
+ * It is also used to block modification of page cache contents through
+ * memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
* @i_mmap_writable: Number of VM_SHARED mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
struct address_space {
struct inode *host;
struct xarray i_pages;
+ struct rw_semaphore invalidate_lock;
gfp_t gfp_mask;
atomic_t i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
down_read_nested(&inode->i_rwsem, subclass);
}
+static inline void filemap_invalidate_lock(struct address_space *mapping)
+{
+ down_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock(struct address_space *mapping)
+{
+ up_write(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+{
+ down_read(&mapping->invalidate_lock);
+}
+
+static inline int filemap_invalidate_trylock_shared(
+ struct address_space *mapping)
+{
+ return down_read_trylock(&mapping->invalidate_lock);
+}
+
+static inline void filemap_invalidate_unlock_shared(
+ struct address_space *mapping)
+{
+ up_read(&mapping->invalidate_lock);
+}
+
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2);
+
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
- /* Pending fsnotify inode refs */
- atomic_long_t s_fsnotify_inode_refs;
+ /*
+ * Number of inode/mount/sb objects that are being watched, note that
+ * inodes objects are currently double-accounted.
+ */
+ atomic_long_t s_fsnotify_connectors;
/* Being remounted read-only */
int s_readonly_remount;
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
+ struct lock_class_key invalidate_lock_key;
struct lock_class_key i_mutex_dir_key;
};
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
-#ifdef CONFIG_MANDATORY_FILE_LOCKING
-extern int locks_mandatory_locked(struct file *);
-extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
-
-/*
- * Candidates for mandatory locking have the setgid bit set
- * but no group execute bit - an otherwise meaningless combination.
- */
-
-static inline int __mandatory_lock(struct inode *ino)
-{
- return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
-}
-
-/*
- * ... and these candidates should be on SB_MANDLOCK mounted fs,
- * otherwise these will be advisory locks
- */
-
-static inline int mandatory_lock(struct inode *ino)
-{
- return IS_MANDLOCK(ino) && __mandatory_lock(ino);
-}
-
-static inline int locks_verify_locked(struct file *file)
-{
- if (mandatory_lock(locks_inode(file)))
- return locks_mandatory_locked(file);
- return 0;
-}
-
-static inline int locks_verify_truncate(struct inode *inode,
- struct file *f,
- loff_t size)
-{
- if (!inode->i_flctx || !mandatory_lock(inode))
- return 0;
-
- if (size < inode->i_size) {
- return locks_mandatory_area(inode, f, size, inode->i_size - 1,
- F_WRLCK);
- } else {
- return locks_mandatory_area(inode, f, inode->i_size, size - 1,
- F_WRLCK);
- }
-}
-
-#else /* !CONFIG_MANDATORY_FILE_LOCKING */
-
-static inline int locks_mandatory_locked(struct file *file)
-{
- return 0;
-}
-
-static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
- loff_t start, loff_t end, unsigned char type)
-{
- return 0;
-}
-
-static inline int __mandatory_lock(struct inode *inode)
-{
- return 0;
-}
-
-static inline int mandatory_lock(struct inode *inode)
-{
- return 0;
-}
-
-static inline int locks_verify_locked(struct file *file)
-{
- return 0;
-}
-
-static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
- size_t size)
-{
- return 0;
-}
-
-#endif /* CONFIG_MANDATORY_FILE_LOCKING */
-
-
#ifdef CONFIG_FILE_LOCKING
static inline int break_lease(struct inode *inode, unsigned int mode)
{
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
struct iov_iter *iter);
- /* fs/block_dev.c */
- extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync);
-
/* fs/splice.c */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
select LOCKDEP
select DEBUG_SPINLOCK
- select DEBUG_MUTEXES
+ select DEBUG_MUTEXES if !PREEMPT_RT
select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_RWSEMS
select DEBUG_WW_MUTEX_SLOWPATH
depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
select LOCKDEP
select DEBUG_SPINLOCK
- select DEBUG_MUTEXES
+ select DEBUG_MUTEXES if !PREEMPT_RT
select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_LOCK_ALLOC
default n
config DEBUG_MUTEXES
bool "Mutex debugging: basic checks"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !PREEMPT_RT
help
This feature allows mutex semantics violations to be detected and
reported.
depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
select DEBUG_LOCK_ALLOC
select DEBUG_SPINLOCK
- select DEBUG_MUTEXES
+ select DEBUG_MUTEXES if !PREEMPT_RT
+ select DEBUG_RT_MUTEXES if PREEMPT_RT
help
This feature enables slowpath testing for w/w mutex users by
injecting additional -EDEADLK wound/backoff cases. Together with
bool "Lock debugging: detect incorrect freeing of live locks"
depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
select DEBUG_SPINLOCK
- select DEBUG_MUTEXES
+ select DEBUG_MUTEXES if !PREEMPT_RT
select DEBUG_RT_MUTEXES if RT_MUTEXES
select LOCKDEP
help
feature by default. When enabled, memory and cache locality will
be impacted.
- config DEBUG_BLOCK_EXT_DEVT
- bool "Force extended block device numbers and spread them"
- depends on DEBUG_KERNEL
- depends on BLOCK
- default n
- help
- BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON
- SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT
- YOU ARE DOING. Distros, please enable this and fix whatever
- is broken.
-
- Conventionally, block device numbers are allocated from
- predetermined contiguous area. However, extended block area
- may introduce non-contiguous block device numbers. This
- option forces most block device numbers to be allocated from
- the extended space and spreads them to discover kernel or
- userland code paths which assume predetermined contiguous
- device number allocation.
-
- Note that turning on this debug option shuffles all the
- device numbers for all IDE and SCSI devices including libata
- ones, so root partition specified using device number
- directly (via rdev or root=MAJ:MIN) won't work anymore.
- Textual device names (root=/dev/sdXn) will continue to work.
-
- Say N if you are unsure.
-
config CPU_HOTPLUG_STATE_CONTROL
bool "Enable CPU hotplug state control"
depends on DEBUG_KERNEL