block: don't release queue's sysfs lock during switching elevator
author Ming Lei <ming.lei@redhat.com>
Mon, 23 Sep 2019 15:12:09 +0000 (23:12 +0800)
committer Jens Axboe <axboe@kernel.dk>
Thu, 26 Sep 2019 06:45:51 +0000 (00:45 -0600)
Commit cecf5d87ff20 ("block: split .sysfs_lock into two locks") started
to release sysfs_lock before, and re-acquire it after, registering or
un-registering the elevator queue while switching elevators, in order to
avoid a potential deadlock between showing & storing 'queue/iosched'
attributes and removing the elevator's kobject.

It turns out there is no such deadlock, because 'q->sysfs_lock' isn't
required in the .show & .store of queue/iosched's attributes; only the
elevator's sysfs lock is acquired in elv_iosched_store() and
elv_iosched_show(). So it is safe to hold the queue's sysfs lock when
registering/un-registering the elevator queue.

The biggest issue is that commit cecf5d87ff20 assumes that concurrent
writes to 'queue/scheduler' can't happen. However, this assumption isn't
true: kernfs_fop_write() only guarantees that concurrent writes aren't
issued on the same open file, but writes can still come from different
opens of the file. So we can't release & re-acquire the queue's sysfs
lock while switching elevators; otherwise a use-after-free on the
elevator could be triggered.

Fix the issue by not releasing the queue's sysfs lock while switching
elevators.

Fixes: cecf5d87ff20 ("block: split .sysfs_lock into two locks")
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-sysfs.c
block/elevator.c

index b82736c..962fc0c 100644 (file)
@@ -989,13 +989,11 @@ int blk_register_queue(struct gendisk *disk)
                blk_mq_debugfs_register(q);
        }
 
-       /*
-        * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
-        * switch won't happen at all.
-        */
+       mutex_lock(&q->sysfs_lock);
        if (q->elevator) {
                ret = elv_register_queue(q, false);
                if (ret) {
+                       mutex_unlock(&q->sysfs_lock);
                        mutex_unlock(&q->sysfs_dir_lock);
                        kobject_del(&q->kobj);
                        blk_trace_remove_sysfs(dev);
@@ -1005,7 +1003,6 @@ int blk_register_queue(struct gendisk *disk)
                has_elevator = true;
        }
 
-       mutex_lock(&q->sysfs_lock);
        blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
        wbt_enable_default(q);
        blk_throtl_register_queue(q);
@@ -1062,12 +1059,10 @@ void blk_unregister_queue(struct gendisk *disk)
        kobject_del(&q->kobj);
        blk_trace_remove_sysfs(disk_to_dev(disk));
 
-       /*
-        * q->kobj has been removed, so it is safe to check if elevator
-        * exists without holding q->sysfs_lock.
-        */
+       mutex_lock(&q->sysfs_lock);
        if (q->elevator)
                elv_unregister_queue(q);
+       mutex_unlock(&q->sysfs_lock);
        mutex_unlock(&q->sysfs_dir_lock);
 
        kobject_put(&disk_to_dev(disk)->kobj);
index bba10e8..5437059 100644 (file)
@@ -503,9 +503,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
                if (uevent)
                        kobject_uevent(&e->kobj, KOBJ_ADD);
 
-               mutex_lock(&q->sysfs_lock);
                e->registered = 1;
-               mutex_unlock(&q->sysfs_lock);
        }
        return error;
 }
@@ -523,11 +521,9 @@ void elv_unregister_queue(struct request_queue *q)
                kobject_uevent(&e->kobj, KOBJ_REMOVE);
                kobject_del(&e->kobj);
 
-               mutex_lock(&q->sysfs_lock);
                e->registered = 0;
                /* Re-enable throttling in case elevator disabled it */
                wbt_enable_default(q);
-               mutex_unlock(&q->sysfs_lock);
        }
 }
 
@@ -590,32 +586,11 @@ int elevator_switch_mq(struct request_queue *q,
        lockdep_assert_held(&q->sysfs_lock);
 
        if (q->elevator) {
-               if (q->elevator->registered) {
-                       mutex_unlock(&q->sysfs_lock);
-
-                       /*
-                        * Concurrent elevator switch can't happen becasue
-                        * sysfs write is always exclusively on same file.
-                        *
-                        * Also the elevator queue won't be freed after
-                        * sysfs_lock is released becasue kobject_del() in
-                        * blk_unregister_queue() waits for completion of
-                        * .store & .show on its attributes.
-                        */
+               if (q->elevator->registered)
                        elv_unregister_queue(q);
 
-                       mutex_lock(&q->sysfs_lock);
-               }
                ioc_clear_queue(q);
                elevator_exit(q, q->elevator);
-
-               /*
-                * sysfs_lock may be dropped, so re-check if queue is
-                * unregistered. If yes, don't switch to new elevator
-                * any more
-                */
-               if (!blk_queue_registered(q))
-                       return 0;
        }
 
        ret = blk_mq_init_sched(q, new_e);
@@ -623,11 +598,7 @@ int elevator_switch_mq(struct request_queue *q,
                goto out;
 
        if (new_e) {
-               mutex_unlock(&q->sysfs_lock);
-
                ret = elv_register_queue(q, true);
-
-               mutex_lock(&q->sysfs_lock);
                if (ret) {
                        elevator_exit(q, q->elevator);
                        goto out;