Merge tag 'v4.18-rc6' into for-4.19/block2
author Jens Axboe <axboe@kernel.dk>
Mon, 6 Aug 2018 01:32:09 +0000 (19:32 -0600)
committer Jens Axboe <axboe@kernel.dk>
Mon, 6 Aug 2018 01:32:09 +0000 (19:32 -0600)
Pull in 4.18-rc6 to get the NVMe core AEN change to avoid a
merge conflict down the line.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
157 files changed:
Documentation/ABI/testing/procfs-diskstats
Documentation/admin-guide/cgroup-v2.rst
Documentation/block/null_blk.txt
Documentation/block/stat.txt
Documentation/iostats.txt
block/Kconfig
block/Makefile
block/bfq-iosched.c
block/bfq-iosched.h
block/bfq-wf2q.c
block/bio-integrity.c
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-ioc.c
block/blk-iolatency.c [new file with mode: 0644]
block/blk-lib.c
block/blk-mq-debugfs-zoned.c [new file with mode: 0644]
block/blk-mq-debugfs.c
block/blk-mq-debugfs.h
block/blk-mq-pci.c
block/blk-mq-sched.c
block/blk-mq-tag.c
block/blk-mq.c
block/blk-mq.h
block/blk-rq-qos.c [new file with mode: 0644]
block/blk-rq-qos.h [new file with mode: 0644]
block/blk-settings.c
block/blk-stat.c
block/blk-stat.h
block/blk-sysfs.c
block/blk-throttle.c
block/blk-wbt.c
block/blk-wbt.h
block/blk-zoned.c
block/blk.h
block/bounce.c
block/bsg-lib.c
block/bsg.c
block/genhd.c
block/partition-generic.c
block/partitions/aix.c
block/partitions/ldm.c
block/t10-pi.c
drivers/Makefile
drivers/ata/libata-scsi.c
drivers/block/DAC960.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/aoe/aoecmd.c
drivers/block/brd.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_worker.c
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/null_blk.c [deleted file]
drivers/block/null_blk.h [new file with mode: 0644]
drivers/block/null_blk_main.c [new file with mode: 0644]
drivers/block/null_blk_zoned.c [new file with mode: 0644]
drivers/block/paride/bpck.c
drivers/block/pktcdvd.c
drivers/block/rsxx/dev.c
drivers/block/skd_main.c
drivers/block/xen-blkfront.c
drivers/block/zram/zram_drv.c
drivers/cdrom/cdrom.c
drivers/ide/ide-cd.c
drivers/ide/ide-cd.h
drivers/ide/ide-cd_ioctl.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/lightnvm/Kconfig
drivers/lightnvm/pblk-cache.c
drivers/lightnvm/pblk-core.c
drivers/lightnvm/pblk-gc.c
drivers/lightnvm/pblk-init.c
drivers/lightnvm/pblk-rb.c
drivers/lightnvm/pblk-read.c
drivers/lightnvm/pblk-recovery.c
drivers/lightnvm/pblk-sysfs.c
drivers/lightnvm/pblk-write.c
drivers/lightnvm/pblk.h
drivers/md/bcache/bcache.h
drivers/md/bcache/btree.c
drivers/md/bcache/debug.c
drivers/md/bcache/journal.c
drivers/md/bcache/request.c
drivers/md/bcache/super.c
drivers/md/bcache/writeback.c
drivers/md/bcache/writeback.h
drivers/md/dm.c
drivers/md/md.c
drivers/nvdimm/btt.c
drivers/nvdimm/nd.h
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/fc.c
drivers/nvme/host/lightnvm.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/trace.c
drivers/nvme/host/trace.h
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/io-cmd-file.c
drivers/nvme/target/loop.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c
drivers/scsi/Makefile
drivers/scsi/cxlflash/superpipe.c
drivers/scsi/cxlflash/vlun.c
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/scsi_lib.c
drivers/scsi/sd.c
drivers/scsi/sd.h
drivers/scsi/sd_dif.c
drivers/scsi/sr_ioctl.c
drivers/scsi/virtio_scsi.c
drivers/target/Kconfig
fs/block_dev.c
fs/exofs/ore.c
fs/ext4/super.c
fs/ext4/sysfs.c
fs/f2fs/f2fs.h
fs/f2fs/super.c
fs/mpage.c
include/linux/bio.h
include/linux/blk-cgroup.h
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/cdrom.h
include/linux/cgroup-defs.h
include/linux/genhd.h
include/linux/memcontrol.h
include/linux/nvme.h
include/linux/sched.h
include/linux/swap.h
include/linux/t10-pi.h
include/linux/tracehook.h
include/scsi/scsi_cmnd.h
include/scsi/scsi_device.h
include/uapi/linux/bcache.h
include/uapi/linux/blkzoned.h
kernel/fork.c
kernel/trace/blktrace.c
mm/huge_memory.c
mm/memcontrol.c
mm/memory.c
mm/page_io.c
mm/readahead.c
mm/shmem.c
mm/swapfile.c

index f91a973..abac31d 100644 (file)
@@ -5,6 +5,7 @@ Description:
                The /proc/diskstats file displays the I/O statistics
                of block devices. Each line contains the following 14
                fields:
+
                 1 - major number
                  2 - minor number
                 3 - device name
@@ -19,4 +20,13 @@ Description:
                12 - I/Os currently in progress
                13 - time spent doing I/Os (ms)
                14 - weighted time spent doing I/Os (ms)
+
+               Kernel 4.18+ appends four more fields for discard
+               tracking, putting the total at 18:
+
+               15 - discards completed successfully
+               16 - discards merged
+               17 - sectors discarded
+               18 - time spent discarding
+
                For more details refer to Documentation/iostats.txt
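
As an aside for consumers of this interface: below is a minimal userspace sketch (illustrative only, not part of the patch) that reads /proc/diskstats and prints the four new discard fields; lines from pre-4.18 kernels, which carry only 14 fields, are skipped.

/*
 * Minimal sketch: print the discard counters from the 18-field
 * /proc/diskstats format (kernel 4.18+).  Assumption: older 14-field
 * lines are simply ignored.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/diskstats", "r");
	char line[512], name[64];
	unsigned int major, minor;
	unsigned long long s[15];	/* stat fields 4..18 */

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		int n = sscanf(line,
			"%u %u %63s %llu %llu %llu %llu %llu %llu %llu "
			"%llu %llu %llu %llu %llu %llu %llu %llu",
			&major, &minor, name,
			&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
			&s[7], &s[8], &s[9], &s[10],
			&s[11], &s[12], &s[13], &s[14]);
		if (n < 18)	/* pre-4.18 kernel: no discard fields */
			continue;
		/* s[11]..s[14] are fields 15..18: discard stats */
		printf("%-10s discards=%llu merges=%llu sectors=%llu ticks=%llums\n",
		       name, s[11], s[12], s[13], s[14]);
	}
	fclose(f);
	return 0;
}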
index 8a2c52d..1746131 100644 (file)
@@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/.
      5-3. IO
        5-3-1. IO Interface Files
        5-3-2. Writeback
+       5-3-3. IO Latency
+         5-3-3-1. How IO Latency Throttling Works
+         5-3-3-2. IO Latency Interface Files
      5-4. PID
        5-4-1. PID Interface Files
      5-5. Device
@@ -1314,17 +1317,19 @@ IO Interface Files
        Lines are keyed by $MAJ:$MIN device numbers and not ordered.
        The following nested keys are defined.
 
-         ======        ===================
+         ======        =====================
          rbytes        Bytes read
          wbytes        Bytes written
          rios          Number of read IOs
          wios          Number of write IOs
-         ======        ===================
+         dbytes        Bytes discarded
+         dios          Number of discard IOs
+         ======        =====================
 
        An example read output follows:
 
-         8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
-         8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+         8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
+         8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
 
   io.weight
        A read-write flat-keyed file which exists on non-root cgroups.
@@ -1446,6 +1451,85 @@ writeback as follows.
        vm.dirty[_background]_ratio.
 
 
+IO Latency
+~~~~~~~~~~
+
+This is a cgroup v2 controller for IO workload protection.  You provide a group
+with a latency target, and if the average latency exceeds that target the
+controller will throttle any peers that have a higher latency target than the
+protected workload.
+
+The limits are only applied at the peer level in the hierarchy.  This means that
+in the diagram below, only groups A, B, and C will influence each other, and
+groups D and F will influence each other.  Group G will influence nobody.
+
+                       [root]
+               /          |            \
+               A          B            C
+              /  \        |
+             D    F       G
+
+
+So the ideal way to configure this is to set io.latency in groups A, B, and C.
+Generally you do not want to set a value lower than the latency your device
+supports.  Experiment to find the value that works best for your workload.
+Start at higher than the expected latency for your device and watch the
+avg_lat value in io.stat for your workload group to get an idea of the
+latency you see during normal operation.  Use the avg_lat value as a basis for
+your real setting, setting at 10-15% higher than the value in io.stat.
+
+How IO Latency Throttling Works
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+io.latency is work conserving; so as long as everybody is meeting their latency
+target the controller doesn't do anything.  Once a group starts missing its
+target it begins throttling any peer group that has a higher target than itself.
+This throttling takes 2 forms:
+
+- Queue depth throttling.  This is the number of outstanding IO's a group is
+  allowed to have.  We will clamp down relatively quickly, starting at no limit
+  and going all the way down to 1 IO at a time.
+
+- Artificial delay induction.  There are certain types of IO that cannot be
+  throttled without possibly adversely affecting higher priority groups.  This
+  includes swapping and metadata IO.  These types of IO are allowed to occur
+  normally, however they are "charged" to the originating group.  If the
+  originating group is being throttled you will see the use_delay and delay
+  fields in io.stat increase.  The delay value is how many microseconds that are
+  being added to any process that runs in this group.  Because this number can
+  grow quite large if there is a lot of swapping or metadata IO occurring we
+  limit the individual delay events to 1 second at a time.
+
+Once the victimized group starts meeting its latency target again it will start
+unthrottling any peer groups that were throttled previously.  If the victimized
+group simply stops doing IO the global counter will unthrottle appropriately.
+
+IO Latency Interface Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  io.latency
+       This takes a similar format as the other controllers.
+
+               "MAJOR:MINOR target=<target time in microseconds>"
+
+  io.stat
+       If the controller is enabled you will see extra stats in io.stat in
+       addition to the normal ones.
+
+         depth
+               This is the current queue depth for the group.
+
+         avg_lat
+               This is an exponential moving average with a decay rate of 1/exp
+               bound by the sampling interval.  The decay rate interval can be
+               calculated by multiplying the win value in io.stat by the
+               corresponding number of samples based on the win value.
+
+         win
+               The sampling window size in milliseconds.  This is the minimum
+               duration of time between evaluation events.  Windows only elapse
+               with IO activity.  Idle periods extend the most recent window.
+
 PID
 ---
 
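
To make the new interface concrete, here is a hedged userspace sketch of the configuration flow described above: write a latency target for a device into io.latency, then poll io.stat for avg_lat/depth while tuning. The cgroup path and the 8:0 / 2000us values are assumptions for illustration, not anything mandated by the patch.

/*
 * Sketch: set an io.latency target and dump io.stat for one cgroup.
 * The cgroup path and device numbers are assumptions.
 */
#include <stdio.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/protected";	/* assumed cgroup */
	char path[256], line[512];
	FILE *f;

	/* Protect the workload: ask for <= 2ms average latency on 8:0. */
	snprintf(path, sizeof(path), "%s/io.latency", cg);
	f = fopen(path, "w");
	if (!f)
		return 1;
	fprintf(f, "8:0 target=2000\n");
	fclose(f);

	/* Watch avg_lat/depth in io.stat to refine the target. */
	snprintf(path, sizeof(path), "%s/io.stat", cg);
	f = fopen(path, "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}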
index 07f1473..ea2dafe 100644 (file)
@@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
   0: Tag set is not shared.
   1: Tag set shared between devices for blk-mq. Only makes sense with
      nr_devices > 1, otherwise there's no tag set to share.
+
+zoned=[0/1]: Default: 0
+  0: Block device is exposed as a random-access block device.
+  1: Block device is exposed as a host-managed zoned block device.
+
+zone_size=[MB]: Default: 256
+  Per zone size when exposed as a zoned block device. Must be a power of two.
index 0dbc946..0aace9c 100644 (file)
@@ -31,28 +31,32 @@ write ticks     milliseconds  total wait time for write requests
 in_flight       requests      number of I/Os currently in flight
 io_ticks        milliseconds  total time this block device has been active
 time_in_queue   milliseconds  total wait time for all requests
+discard I/Os    requests      number of discard I/Os processed
+discard merges  requests      number of discard I/Os merged with in-queue I/O
+discard sectors sectors       number of sectors discarded
+discard ticks   milliseconds  total wait time for discard requests
 
-read I/Os, write I/Os
-=====================
+read I/Os, write I/Os, discard I/Os
+===================================
 
 These values increment when an I/O request completes.
 
-read merges, write merges
-=========================
+read merges, write merges, discard merges
+=========================================
 
 These values increment when an I/O request is merged with an
 already-queued I/O request.
 
-read sectors, write sectors
-===========================
+read sectors, write sectors, discard sectors
+============================================
 
-These values count the number of sectors read from or written to this
-block device.  The "sectors" in question are the standard UNIX 512-byte
-sectors, not any device- or filesystem-specific block size.  The
-counters are incremented when the I/O completes.
+These values count the number of sectors read from, written to, or
+discarded from this block device.  The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size.  The counters are incremented when the I/O completes.
 
-read ticks, write ticks
-=======================
+read ticks, write ticks, discard ticks
+======================================
 
 These values count the number of milliseconds that I/O requests have
 waited on this block device.  If there are multiple I/O requests waiting,
index 04d394a..49df45f 100644 (file)
@@ -31,6 +31,9 @@ Here are examples of these different formats::
       3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
       3    1   hda1 35486 38030 38030 38030
 
+   4.18+ diskstats:
+      3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
 On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
 a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
 
@@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os
     last update of this field.  This can provide an easy measure of both
     I/O completion time and the backlog that may be accumulating.
 
+Field 12 -- # of discards completed
+    This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+    See the description of field 2
+
+Field 14 -- # of sectors discarded
+    This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+    This is the total number of milliseconds spent by all discards (as
+    measured from __make_request() to end_that_request_last()).
 
 To avoid introducing performance bottlenecks, no locks are held while
 modifying these counters.  This implies that minor inaccuracies may be
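
The discard counters are consumed the same delta-based way as the read/write fields above. A small self-contained sketch follows (the sample numbers are invented for illustration) of deriving discard throughput and a rough per-discard completion time from two snapshots of fields 12-15:

/*
 * Sketch: turn two snapshots of the discard fields into rates.
 * 512-byte sectors, field numbering as in the text above.
 */
#include <stdio.h>

struct discard_sample {
	unsigned long long ios;		/* field 12: discards completed */
	unsigned long long sectors;	/* field 14: sectors discarded  */
	unsigned long long ticks_ms;	/* field 15: ms spent discarding */
};

static void discard_rates(const struct discard_sample *a,
			  const struct discard_sample *b, double interval_s)
{
	unsigned long long ios = b->ios - a->ios;
	unsigned long long sect = b->sectors - a->sectors;

	printf("discard throughput: %.1f MiB/s\n",
	       sect * 512.0 / (1024 * 1024) / interval_s);
	printf("avg time per discard: %.2f ms\n",
	       ios ? (double)(b->ticks_ms - a->ticks_ms) / ios : 0.0);
}

int main(void)
{
	struct discard_sample t0 = { 100, 1 << 20, 400 };
	struct discard_sample t1 = { 160, 3 << 20, 640 };

	discard_rates(&t0, &t1, 1.0);	/* samples taken 1s apart */
	return 0;
}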
index eb50fd4..1f2469a 100644 (file)
@@ -149,6 +149,18 @@ config BLK_WBT
        dynamically on an algorithm loosely based on CoDel, factoring in
        the realtime performance of the disk.
 
+config BLK_CGROUP_IOLATENCY
+       bool "Enable support for latency based cgroup IO protection"
+       depends on BLK_CGROUP=y
+       default n
+       ---help---
+       Enabling this option enables the .latency interface for IO throttling.
+       The IO controller will attempt to maintain average IO latencies below
+       the configured latency target, throttling anybody with a higher latency
+       target than the victimized group.
+
+       Note, this is an experimental interface and could be changed someday.
+
 config BLK_WBT_SQ
        bool "Single queue writeback throttling"
        default n
@@ -177,6 +189,10 @@ config BLK_DEBUG_FS
        Unless you are building a kernel for a tiny system, you should
        say Y here.
 
+config BLK_DEBUG_FS_ZONED
+       bool
+       default BLK_DEBUG_FS && BLK_DEV_ZONED
+
 config BLK_SED_OPAL
        bool "Logic for interfacing with Opal enabled SEDs"
        ---help---
index 6a56303..572b33f 100644 (file)
@@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
                        blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
                        genhd.o partition-generic.o ioprio.o \
-                       badblocks.o partitions/
+                       badblocks.o partitions/ blk-rq-qos.o
 
 obj-$(CONFIG_BOUNCE)           += bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
@@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG)     += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)   += bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)       += blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)       += blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOLATENCY)     += blk-iolatency.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
@@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA)     += blk-mq-rdma.o
 obj-$(CONFIG_BLK_DEV_ZONED)    += blk-zoned.o
 obj-$(CONFIG_BLK_WBT)          += blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)     += blk-mq-debugfs.o
+obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)     += sed-opal.o
index 495b9dd..41d9036 100644 (file)
@@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
  * The following function returns true if every queue must receive the
  * same share of the throughput (this condition is used when deciding
  * whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
+ * bfq_better_to_idle()).
  *
  * Such a scenario occurs when:
  * 1) all active queues have the same weight,
@@ -742,8 +742,9 @@ inc_counter:
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
-                            struct rb_root *root)
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+                              struct bfq_entity *entity,
+                              struct rb_root *root)
 {
        if (!entity->weight_counter)
                return;
@@ -759,6 +760,43 @@ reset_entity_pointer:
        entity->weight_counter = NULL;
 }
 
+/*
+ * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
+ * parent entities.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+                            struct bfq_queue *bfqq)
+{
+       struct bfq_entity *entity = bfqq->entity.parent;
+
+       __bfq_weights_tree_remove(bfqd, &bfqq->entity,
+                                 &bfqd->queue_weights_tree);
+
+       for_each_entity(entity) {
+               struct bfq_sched_data *sd = entity->my_sched_data;
+
+               if (sd->next_in_service || sd->in_service_entity) {
+                       /*
+                        * entity is still active, because either
+                        * next_in_service or in_service_entity is not
+                        * NULL (see the comments on the definition of
+                        * next_in_service for details on why
+                        * in_service_entity must be checked too).
+                        *
+                        * As a consequence, the weight of entity is
+                        * not to be removed. In addition, if entity
+                        * is active, then its parent entities are
+                        * active as well, and thus their weights are
+                        * not to be removed either. In the end, this
+                        * loop must stop here.
+                        */
+                       break;
+               }
+               __bfq_weights_tree_remove(bfqd, entity,
+                                         &bfqd->group_weights_tree);
+       }
+}
+
 /*
  * Return expired entry, or NULL to just start from scratch in rbtree.
  */
@@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
                 * remain unchanged after such an expiration, and the
                 * following statement therefore assigns to
                 * entity->budget the remaining budget on such an
-                * expiration. For clarity, entity->service is not
-                * updated on expiration in any case, and, in normal
-                * operation, is reset only when bfqq is selected for
-                * service (see bfq_get_next_queue).
+                * expiration.
                 */
                entity->budget = min_t(unsigned long,
                                       bfq_bfqq_budget_left(bfqq),
                                       bfqq->max_budget);
 
+               /*
+                * At this point, we have used entity->service to get
+                * the budget left (needed for updating
+                * entity->budget). Thus we finally can, and have to,
+                * reset entity->service. The latter must be reset
+                * because bfqq would otherwise be charged again for
+                * the service it has received during its previous
+                * service slot(s).
+                */
+               entity->service = 0;
+
                return true;
        }
 
+       /*
+        * We can finally complete expiration, by setting service to 0.
+        */
+       entity->service = 0;
        entity->budget = max_t(unsigned long, bfqq->max_budget,
                               bfq_serv_to_charge(bfqq->next_rq, bfqq));
        bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
@@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
        ref = bfqq->ref;
        __bfq_bfqq_expire(bfqd, bfqq);
 
+       if (ref == 1) /* bfqq is gone, no more actions on it */
+               return;
+
        /* mark bfqq as waiting a request only if a bic still points to it */
-       if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+       if (!bfq_bfqq_busy(bfqq) &&
            reason != BFQQE_BUDGET_TIMEOUT &&
-           reason != BFQQE_BUDGET_EXHAUSTED)
+           reason != BFQQE_BUDGET_EXHAUSTED) {
                bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+               /*
+                * Not setting service to 0, because, if the next rq
+                * arrives in time, the queue will go on receiving
+                * service with this same budget (as if it never expired)
+                */
+       } else
+               entity->service = 0;
 }
 
 /*
@@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
  * issues taken into account are not trivial. We discuss these issues
  * individually while introducing the variables.
  */
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
 {
        struct bfq_data *bfqd = bfqq->bfqd;
        bool rot_without_queueing =
@@ -3528,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 }
 
 /*
- * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * If the in-service queue is empty but the function bfq_better_to_idle
  * returns true, then:
  * 1) the queue must remain in service and cannot be expired, and
  * 2) the device must be idled to wait for the possible arrival of a new
  *    request for the queue.
- * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * See the comments on the function bfq_better_to_idle for the reasons
  * why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * and preserve service guarantees when bfq_better_to_idle itself
  * returns true.
  */
 static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
 {
-       return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq);
+       return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
 }
 
 /*
@@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 
        bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
 
+       /*
+        * Do not expire bfqq for budget timeout if bfqq may be about
+        * to enjoy device idling. The reason why, in this case, we
+        * prevent bfqq from expiring is the same as in the comments
+        * on the case where bfq_bfqq_must_idle() returns true, in
+        * bfq_completed_request().
+        */
        if (bfq_may_expire_for_budg_timeout(bfqq) &&
-           !bfq_bfqq_wait_request(bfqq) &&
            !bfq_bfqq_must_idle(bfqq))
                goto expire;
 
@@ -3620,7 +3686,7 @@ check_queue:
         * may idle after their completion, then keep it anyway.
         */
        if (bfq_bfqq_wait_request(bfqq) ||
-           (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+           (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
                bfqq = NULL;
                goto keep_queue;
        }
@@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
                 */
                bfqq->budget_timeout = jiffies;
 
-               bfq_weights_tree_remove(bfqd, &bfqq->entity,
-                                       &bfqd->queue_weights_tree);
+               bfq_weights_tree_remove(bfqd, bfqq);
        }
 
        now_ns = ktime_get_ns();
@@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
         * or if we want to idle in case it has no pending requests.
         */
        if (bfqd->in_service_queue == bfqq) {
-               if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
-                       bfq_arm_slice_timer(bfqd);
+               if (bfq_bfqq_must_idle(bfqq)) {
+                       if (bfqq->dispatched == 0)
+                               bfq_arm_slice_timer(bfqd);
+                       /*
+                        * If we get here, we do not expire bfqq, even
+                        * if bfqq was in budget timeout or had no
+                        * more requests (as controlled in the next
+                        * conditional instructions). The reason for
+                        * not expiring bfqq is as follows.
+                        *
+                        * Here bfqq->dispatched > 0 holds, but
+                        * bfq_bfqq_must_idle() returned true. This
+                        * implies that, even if no request arrives
+                        * for bfqq before bfqq->dispatched reaches 0,
+                        * bfqq will, however, not be expired on the
+                        * completion event that causes bfqq->dispatch
+                        * to reach zero. In contrast, on this event,
+                        * bfqq will start enjoying device idling
+                        * (I/O-dispatch plugging).
+                        *
+                        * But, if we expired bfqq here, bfqq would
+                        * not have the chance to enjoy device idling
+                        * when bfqq->dispatched finally reaches
+                        * zero. This would expose bfqq to violation
+                        * of its reserved service guarantees.
+                        */
                        return;
                } else if (bfq_may_expire_for_budg_timeout(bfqq))
                        bfq_bfqq_expire(bfqd, bfqq, false,
                                        BFQQE_BUDGET_TIMEOUT);
                else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
                         (bfqq->dispatched == 0 ||
-                         !bfq_bfqq_may_idle(bfqq)))
+                         !bfq_better_to_idle(bfqq)))
                        bfq_bfqq_expire(bfqd, bfqq, false,
                                        BFQQE_NO_MORE_REQUESTS);
        }
index 0f712e0..a8a2e5a 100644 (file)
@@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                          struct rb_root *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
-                            struct rb_root *root);
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+                              struct bfq_entity *entity,
+                              struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+                            struct bfq_queue *bfqq);
 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                     bool compensate, enum bfqq_expiration reason);
 void bfq_put_queue(struct bfq_queue *bfqq);
index 4498c43..dbc07b4 100644 (file)
@@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
        if (bfqq)
                list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-       else /* bfq_group */
-               bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
-
        if (bfqg != bfqd->root_group)
                bfqg->active_entities++;
 #endif
@@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,
        if (bfqq)
                list_del(&bfqq->bfqq_list);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-       else /* bfq_group */
-               bfq_weights_tree_remove(bfqd, entity,
-                                       &bfqd->group_weights_tree);
-
        if (bfqg != bfqd->root_group)
                bfqg->active_entities--;
 #endif
@@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
                if (prev_weight != new_weight) {
                        root = bfqq ? &bfqd->queue_weights_tree :
                                      &bfqd->group_weights_tree;
-                       bfq_weights_tree_remove(bfqd, entity, root);
+                       __bfq_weights_tree_remove(bfqd, entity, root);
                }
                entity->weight = new_weight;
                /*
@@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
  * one of its children receives a new request.
  *
  * Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possibly extracting it
+ * inserts entity into its active tree, after possibly extracting it
  * from its idle tree.
  */
 static void __bfq_activate_entity(struct bfq_entity *entity,
@@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
                entity->on_st = true;
        }
 
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+       if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
+               struct bfq_group *bfqg =
+                       container_of(entity, struct bfq_group, entity);
+
+               bfq_weights_tree_add(bfqg->bfqd, entity,
+                                    &bfqd->group_weights_tree);
+       }
+#endif
+
        bfq_update_fin_time_enqueue(entity, st, backshifted);
 }
 
@@ -1541,12 +1544,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
                entity = sd->next_in_service;
                sd->in_service_entity = entity;
 
-               /*
-                * Reset the accumulator of the amount of service that
-                * the entity is about to receive.
-                */
-               entity->service = 0;
-
                /*
                 * If entity is no longer a candidate for next
                 * service, then it must be extracted from its active
@@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
        bfqd->busy_queues--;
 
        if (!bfqq->dispatched)
-               bfq_weights_tree_remove(bfqd, &bfqq->entity,
-                                       &bfqd->queue_weights_tree);
+               bfq_weights_tree_remove(bfqd, bfqq);
 
        if (bfqq->wr_coeff > 1)
                bfqd->wr_busy_queues--;
index add7c7c..67b5fb8 100644 (file)
@@ -159,28 +159,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
-/**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi:                blk_integrity profile for device
- * @sectors:   Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device.  Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-                                                  unsigned int sectors)
-{
-       return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-                                              unsigned int sectors)
-{
-       return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:       bio to generate/verify integrity metadata for
index 67eff5e..b832151 100644 (file)
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
+#include "blk-rq-qos.h"
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -644,83 +646,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 }
 EXPORT_SYMBOL(bio_clone_fast);
 
-/**
- *     bio_clone_bioset - clone a bio
- *     @bio_src: bio to clone
- *     @gfp_mask: allocation priority
- *     @bs: bio_set to allocate from
- *
- *     Clone bio. Caller will own the returned bio, but not the actual data it
- *     points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-                            struct bio_set *bs)
-{
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       struct bio *bio;
-
-       /*
-        * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
-        * bio_src->bi_io_vec to bio->bi_io_vec.
-        *
-        * We can't do that anymore, because:
-        *
-        *  - The point of cloning the biovec is to produce a bio with a biovec
-        *    the caller can modify: bi_idx and bi_bvec_done should be 0.
-        *
-        *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
-        *    we tried to clone the whole thing bio_alloc_bioset() would fail.
-        *    But the clone should succeed as long as the number of biovecs we
-        *    actually need to allocate is fewer than BIO_MAX_PAGES.
-        *
-        *  - Lastly, bi_vcnt should not be looked at or relied upon by code
-        *    that does not own the bio - reason being drivers don't use it for
-        *    iterating over the biovec anymore, so expecting it to be kept up
-        *    to date (i.e. for clones that share the parent biovec) is just
-        *    asking for trouble and would force extra work on
-        *    __bio_clone_fast() anyways.
-        */
-
-       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-       if (!bio)
-               return NULL;
-       bio->bi_disk            = bio_src->bi_disk;
-       bio->bi_opf             = bio_src->bi_opf;
-       bio->bi_write_hint      = bio_src->bi_write_hint;
-       bio->bi_iter.bi_sector  = bio_src->bi_iter.bi_sector;
-       bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
-
-       switch (bio_op(bio)) {
-       case REQ_OP_DISCARD:
-       case REQ_OP_SECURE_ERASE:
-       case REQ_OP_WRITE_ZEROES:
-               break;
-       case REQ_OP_WRITE_SAME:
-               bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-               break;
-       default:
-               bio_for_each_segment(bv, bio_src, iter)
-                       bio->bi_io_vec[bio->bi_vcnt++] = bv;
-               break;
-       }
-
-       if (bio_integrity(bio_src)) {
-               int ret;
-
-               ret = bio_integrity_clone(bio, bio_src, gfp_mask);
-               if (ret < 0) {
-                       bio_put(bio);
-                       return NULL;
-               }
-       }
-
-       bio_clone_blkcg_association(bio, bio_src);
-
-       return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
 /**
  *     bio_add_pc_page -       attempt to add page to bio
  *     @q: the target queue
@@ -1634,10 +1559,8 @@ void bio_set_pages_dirty(struct bio *bio)
        int i;
 
        bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
-
-               if (page && !PageCompound(page))
-                       set_page_dirty_lock(page);
+               if (!PageCompound(bvec->bv_page))
+                       set_page_dirty_lock(bvec->bv_page);
        }
 }
 EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@ -1647,19 +1570,15 @@ static void bio_release_pages(struct bio *bio)
        struct bio_vec *bvec;
        int i;
 
-       bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
-
-               if (page)
-                       put_page(page);
-       }
+       bio_for_each_segment_all(bvec, bio, i)
+               put_page(bvec->bv_page);
 }
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
  * If they are, then fine.  If, however, some pages are clean then they must
  * have been written out during the direct-IO read.  So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
+ * the BIO and re-dirty the pages in process context.
  *
  * It is expected that bio_check_pages_dirty() will wholly own the BIO from
  * here on.  It will run one put_page() against each page and will run one
@@ -1677,78 +1596,70 @@ static struct bio *bio_dirty_list;
  */
 static void bio_dirty_fn(struct work_struct *work)
 {
-       unsigned long flags;
-       struct bio *bio;
+       struct bio *bio, *next;
 
-       spin_lock_irqsave(&bio_dirty_lock, flags);
-       bio = bio_dirty_list;
+       spin_lock_irq(&bio_dirty_lock);
+       next = bio_dirty_list;
        bio_dirty_list = NULL;
-       spin_unlock_irqrestore(&bio_dirty_lock, flags);
+       spin_unlock_irq(&bio_dirty_lock);
 
-       while (bio) {
-               struct bio *next = bio->bi_private;
+       while ((bio = next) != NULL) {
+               next = bio->bi_private;
 
                bio_set_pages_dirty(bio);
                bio_release_pages(bio);
                bio_put(bio);
-               bio = next;
        }
 }
 
 void bio_check_pages_dirty(struct bio *bio)
 {
        struct bio_vec *bvec;
-       int nr_clean_pages = 0;
+       unsigned long flags;
        int i;
 
        bio_for_each_segment_all(bvec, bio, i) {
-               struct page *page = bvec->bv_page;
-
-               if (PageDirty(page) || PageCompound(page)) {
-                       put_page(page);
-                       bvec->bv_page = NULL;
-               } else {
-                       nr_clean_pages++;
-               }
+               if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+                       goto defer;
        }
 
-       if (nr_clean_pages) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&bio_dirty_lock, flags);
-               bio->bi_private = bio_dirty_list;
-               bio_dirty_list = bio;
-               spin_unlock_irqrestore(&bio_dirty_lock, flags);
-               schedule_work(&bio_dirty_work);
-       } else {
-               bio_put(bio);
-       }
+       bio_release_pages(bio);
+       bio_put(bio);
+       return;
+defer:
+       spin_lock_irqsave(&bio_dirty_lock, flags);
+       bio->bi_private = bio_dirty_list;
+       bio_dirty_list = bio;
+       spin_unlock_irqrestore(&bio_dirty_lock, flags);
+       schedule_work(&bio_dirty_work);
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
                           unsigned long sectors, struct hd_struct *part)
 {
+       const int sgrp = op_stat_group(op);
        int cpu = part_stat_lock();
 
        part_round_stats(q, cpu, part);
-       part_stat_inc(cpu, part, ios[rw]);
-       part_stat_add(cpu, part, sectors[rw], sectors);
-       part_inc_in_flight(q, part, rw);
+       part_stat_inc(cpu, part, ios[sgrp]);
+       part_stat_add(cpu, part, sectors[sgrp], sectors);
+       part_inc_in_flight(q, part, op_is_write(op));
 
        part_stat_unlock();
 }
 EXPORT_SYMBOL(generic_start_io_acct);
 
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int req_op,
                         struct hd_struct *part, unsigned long start_time)
 {
        unsigned long duration = jiffies - start_time;
+       const int sgrp = op_stat_group(req_op);
        int cpu = part_stat_lock();
 
-       part_stat_add(cpu, part, ticks[rw], duration);
+       part_stat_add(cpu, part, ticks[sgrp], duration);
        part_round_stats(q, cpu, part);
-       part_dec_in_flight(q, part, rw);
+       part_dec_in_flight(q, part, op_is_write(req_op));
 
        part_stat_unlock();
 }
@@ -1807,6 +1718,9 @@ again:
        if (!bio_integrity_endio(bio))
                return;
 
+       if (bio->bi_disk)
+               rq_qos_done_bio(bio->bi_disk->queue, bio);
+
        /*
         * Need to have a real endio function for chained bios, otherwise
         * various corner cases will break (like stacking block devices that
@@ -2014,6 +1928,30 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
+#ifdef CONFIG_MEMCG
+/**
+ * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkcg from @page's owning memcg.  This works like
+ * every other associate function wrt references.
+ */
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+{
+       struct cgroup_subsys_state *blkcg_css;
+
+       if (unlikely(bio->bi_css))
+               return -EBUSY;
+       if (!page->mem_cgroup)
+               return 0;
+       blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+                                    &io_cgrp_subsys);
+       bio->bi_css = blkcg_css;
+       return 0;
+}
+#endif /* CONFIG_MEMCG */
+
 /**
  * bio_associate_blkcg - associate a bio with the specified blkcg
  * @bio: target bio
@@ -2036,6 +1974,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
+/**
+ * bio_associate_blkg - associate a bio with the specified blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * Associate @bio with the blkg specified by @blkg.  This is the queue specific
+ * blkcg information associated with the @bio, a reference will be taken on the
+ * @blkg and will be freed when the bio is freed.
+ */
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+{
+       if (unlikely(bio->bi_blkg))
+               return -EBUSY;
+       blkg_get(blkg);
+       bio->bi_blkg = blkg;
+       return 0;
+}
+
 /**
  * bio_disassociate_task - undo bio_associate_current()
  * @bio: target bio
@@ -2050,6 +2006,10 @@ void bio_disassociate_task(struct bio *bio)
                css_put(bio->bi_css);
                bio->bi_css = NULL;
        }
+       if (bio->bi_blkg) {
+               blkg_put(bio->bi_blkg);
+               bio->bi_blkg = NULL;
+       }
 }
 
 /**
index eb85cb8..694595b 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static LIST_HEAD(all_blkcgs);          /* protected by blkcg_pol_mutex */
 
+static bool blkcg_debug_stats = false;
+
 static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
 {
@@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
+               [BLKG_RWSTAT_DISCARD]   = "Discard",
        };
        const char *dname = blkg_dev_name(pd->blkg);
        u64 v;
@@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                           (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
        v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
-               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
 }
@@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                const char *dname;
+               char *buf;
                struct blkg_rwstat rwstat;
-               u64 rbytes, wbytes, rios, wios;
+               u64 rbytes, wbytes, rios, wios, dbytes, dios;
+               size_t size = seq_get_buf(sf, &buf), off = 0;
+               int i;
+               bool has_stats = false;
 
                dname = blkg_dev_name(blkg);
                if (!dname)
                        continue;
 
+               /*
+                * Hooray string manipulation, count is the size written NOT
+                * INCLUDING THE \0, so size is now count+1 less than what we
+                * had before, but we want to start writing the next bit from
+                * the \0 so we only add count to buf.
+                */
+               off += scnprintf(buf+off, size-off, "%s ", dname);
+
                spin_lock_irq(blkg->q->queue_lock);
 
                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
                                        offsetof(struct blkcg_gq, stat_bytes));
                rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
                wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+               dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
                rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
                                        offsetof(struct blkcg_gq, stat_ios));
                rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
                wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+               dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
                spin_unlock_irq(blkg->q->queue_lock);
 
-               if (rbytes || wbytes || rios || wios)
-                       seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
-                                  dname, rbytes, wbytes, rios, wios);
+               if (rbytes || wbytes || rios || wios) {
+                       has_stats = true;
+                       off += scnprintf(buf+off, size-off,
+                                        "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+                                        rbytes, wbytes, rios, wios,
+                                        dbytes, dios);
+               }
+
+               if (!blkcg_debug_stats)
+                       goto next;
+
+               if (atomic_read(&blkg->use_delay)) {
+                       has_stats = true;
+                       off += scnprintf(buf+off, size-off,
+                                        " use_delay=%d delay_nsec=%llu",
+                                        atomic_read(&blkg->use_delay),
+                                       (unsigned long long)atomic64_read(&blkg->delay_nsec));
+               }
+
+               for (i = 0; i < BLKCG_MAX_POLS; i++) {
+                       struct blkcg_policy *pol = blkcg_policy[i];
+                       size_t written;
+
+                       if (!blkg->pd[i] || !pol->pd_stat_fn)
+                               continue;
+
+                       written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
+                       if (written)
+                               has_stats = true;
+                       off += written;
+               }
+next:
+               if (has_stats) {
+                       off += scnprintf(buf+off, size-off, "\n");
+                       seq_commit(sf, off);
+               }
        }
 
        rcu_read_unlock();
@@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q)
        if (preloaded)
                radix_tree_preload_end();
 
+       ret = blk_iolatency_init(q);
+       if (ret) {
+               spin_lock_irq(q->queue_lock);
+               blkg_destroy_all(q);
+               spin_unlock_irq(q->queue_lock);
+               return ret;
+       }
+
        ret = blk_throtl_init(q);
        if (ret) {
                spin_lock_irq(q->queue_lock);
@@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
        mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+       if (tsk->throttle_queue)
+               blk_put_queue(tsk->throttle_queue);
+       tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
@@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = {
        .dfl_cftypes = blkcg_files,
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
+       .exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
@@ -1547,3 +1615,209 @@ out_unlock:
        mutex_unlock(&blkcg_pol_register_mutex);
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay.  We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+       u64 old = atomic64_read(&blkg->delay_start);
+
+       /*
+        * We only want to scale down every second.  The idea here is that we
+        * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+        * time window.  We only want to throttle tasks for recent delay that
+        * has occurred, in 1 second time windows since that's the maximum
+        * things can be throttled.  We save the current delay window in
+        * blkg->last_delay so we know what amount is still left to be charged
+        * to the blkg from this point onward.  blkg->last_use keeps track of
+        * the use_delay counter.  The idea is if we're unthrottling the blkg we
+        * are ok with whatever is happening now, and we can take away more of
+        * the accumulated delay as we've already throttled enough that
+        * everybody is happy with their IO latencies.
+        */
+       if (time_before64(old + NSEC_PER_SEC, now) &&
+           atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+               u64 cur = atomic64_read(&blkg->delay_nsec);
+               u64 sub = min_t(u64, blkg->last_delay, now - old);
+               int cur_use = atomic_read(&blkg->use_delay);
+
+               /*
+                * We've been unthrottled, subtract a larger chunk of our
+                * accumulated delay.
+                */
+               if (cur_use < blkg->last_use)
+                       sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+               /*
+                * This shouldn't happen, but handle it anyway.  Our delay_nsec
+                * should only ever be growing except here where we subtract out
+                * min(last_delay, 1 second), but lord knows bugs happen and I'd
+                * rather not end up with negative numbers.
+                */
+               if (unlikely(cur < sub)) {
+                       atomic64_set(&blkg->delay_nsec, 0);
+                       blkg->last_delay = 0;
+               } else {
+                       atomic64_sub(sub, &blkg->delay_nsec);
+                       blkg->last_delay = cur - sub;
+               }
+               blkg->last_use = cur_use;
+       }
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay.  This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+       u64 now = ktime_to_ns(ktime_get());
+       u64 exp;
+       u64 delay_nsec = 0;
+       int tok;
+
+       while (blkg->parent) {
+               if (atomic_read(&blkg->use_delay)) {
+                       blkcg_scale_delay(blkg, now);
+                       delay_nsec = max_t(u64, delay_nsec,
+                                          atomic64_read(&blkg->delay_nsec));
+               }
+               blkg = blkg->parent;
+       }
+
+       if (!delay_nsec)
+               return;
+
+       /*
+        * Let's not sleep for all eternity if we've amassed a huge delay.
+        * Swapping or metadata IO can accumulate 10's of seconds worth of
+        * delay, and we want userspace to be able to do _something_ so cap the
+        * delays at 1 second.  If there's 10's of seconds worth of delay then
+        * the tasks will be delayed for 1 second for every syscall.
+        */
+       delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+       /*
+        * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+        * that hasn't landed upstream yet.  Once that stuff is in place we need
+        * to do a psi_memstall_enter/leave if memdelay is set.
+        */
+
+       exp = ktime_add_ns(now, delay_nsec);
+       tok = io_schedule_prepare();
+       do {
+               __set_current_state(TASK_KILLABLE);
+               if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+                       break;
+       } while (!fatal_signal_pending(current));
+       io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume().  Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything.  This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+       struct request_queue *q = current->throttle_queue;
+       struct cgroup_subsys_state *css;
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       bool use_memdelay = current->use_memdelay;
+
+       if (!q)
+               return;
+
+       current->throttle_queue = NULL;
+       current->use_memdelay = false;
+
+       rcu_read_lock();
+       css = kthread_blkcg();
+       if (css)
+               blkcg = css_to_blkcg(css);
+       else
+               blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+       if (!blkcg)
+               goto out;
+       blkg = blkg_lookup(blkcg, q);
+       if (!blkg)
+               goto out;
+       blkg = blkg_try_get(blkg);
+       if (!blkg)
+               goto out;
+       rcu_read_unlock();
+
+       blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+       blkg_put(blkg);
+       blk_put_queue(q);
+       return;
+out:
+       rcu_read_unlock();
+       blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task.  We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point.  This sets the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall.  You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once.  If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+       if (unlikely(current->flags & PF_KTHREAD))
+               return;
+
+       if (!blk_get_queue(q))
+               return;
+
+       if (current->throttle_queue)
+               blk_put_queue(current->throttle_queue);
+       current->throttle_queue = q;
+       if (use_memdelay)
+               current->use_memdelay = use_memdelay;
+       set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @blkg: the blkg of interest
+ * @now: the current time in nanoseconds
+ * @delta: how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation.  This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+       blkcg_scale_delay(blkg, now);
+       atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
+module_param(blkcg_debug_stats, bool, 0644);
+MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
index f84a9b7..f9ad73d 100644 (file)
@@ -42,7 +42,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
-#include "blk-wbt.h"
+#include "blk-rq-qos.h"
 
 #ifdef CONFIG_DEBUG_FS
 struct dentry *blk_debugfs_root;
@@ -762,9 +762,13 @@ void blk_cleanup_queue(struct request_queue *q)
         * make sure all in-progress dispatch are completed because
         * blk_freeze_queue() can only complete all requests, and
         * dispatch may still be in-progress since we dispatch requests
-        * from more than one contexts
+        * from more than one context.
+        *
+        * No need to quiesce queue if it isn't initialized yet since
+        * blk_freeze_queue() should be enough for cases of passthrough
+        * request.
         */
-       if (q->mq_ops)
+       if (q->mq_ops && blk_queue_init_done(q))
                blk_mq_quiesce_queue(q);
 
        /* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -1180,6 +1184,7 @@ out_exit_flush_rq:
                q->exit_rq_fn(q, q->fq->flush_rq);
 out_free_flush_queue:
        blk_free_flush_queue(q->fq);
+       q->fq = NULL;
        return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1641,7 +1646,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
        blk_delete_timer(rq);
        blk_clear_rq_complete(rq);
        trace_block_rq_requeue(q, rq);
-       wbt_requeue(q->rq_wb, rq);
+       rq_qos_requeue(q, rq);
 
        if (rq->rq_flags & RQF_QUEUED)
                blk_queue_end_tag(q, rq);
@@ -1748,7 +1753,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
        /* this is a bio leak */
        WARN_ON(req->bio != NULL);
 
-       wbt_done(q->rq_wb, req);
+       rq_qos_done(q, req);
 
        /*
         * Request may not have originated from ll_rw_blk. if not,
@@ -1982,7 +1987,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
        int where = ELEVATOR_INSERT_SORT;
        struct request *req, *free;
        unsigned int request_count = 0;
-       unsigned int wb_acct;
 
        /*
         * low level driver can indicate that it wants pages above a
@@ -2040,7 +2044,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
        }
 
 get_rq:
-       wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+       rq_qos_throttle(q, bio, q->queue_lock);
 
        /*
         * Grab a free request. This is might sleep but can not fail.
@@ -2050,7 +2054,7 @@ get_rq:
        req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
        if (IS_ERR(req)) {
                blk_queue_exit(q);
-               __wbt_done(q->rq_wb, wb_acct);
+               rq_qos_cleanup(q, bio);
                if (PTR_ERR(req) == -ENOMEM)
                        bio->bi_status = BLK_STS_RESOURCE;
                else
@@ -2059,7 +2063,7 @@ get_rq:
                goto out_unlock;
        }
 
-       wbt_track(req, wb_acct);
+       rq_qos_track(q, req, bio);
 
        /*
         * After dropping the lock and possibly sleeping here, our request
@@ -2699,13 +2703,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
 void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
        if (blk_do_io_stat(req)) {
-               const int rw = rq_data_dir(req);
+               const int sgrp = op_stat_group(req_op(req));
                struct hd_struct *part;
                int cpu;
 
                cpu = part_stat_lock();
                part = req->part;
-               part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+               part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
                part_stat_unlock();
        }
 }
@@ -2719,7 +2723,7 @@ void blk_account_io_done(struct request *req, u64 now)
         */
        if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
                unsigned long duration;
-               const int rw = rq_data_dir(req);
+               const int sgrp = op_stat_group(req_op(req));
                struct hd_struct *part;
                int cpu;
 
@@ -2727,10 +2731,10 @@ void blk_account_io_done(struct request *req, u64 now)
                cpu = part_stat_lock();
                part = req->part;
 
-               part_stat_inc(cpu, part, ios[rw]);
-               part_stat_add(cpu, part, ticks[rw], duration);
+               part_stat_inc(cpu, part, ios[sgrp]);
+               part_stat_add(cpu, part, ticks[sgrp], duration);
                part_round_stats(req->q, cpu, part);
-               part_dec_in_flight(req->q, part, rw);
+               part_dec_in_flight(req->q, part, rq_data_dir(req));
 
                hd_struct_put(part);
                part_stat_unlock();
@@ -2750,9 +2754,9 @@ static bool blk_pm_allow_request(struct request *rq)
                return rq->rq_flags & RQF_PM;
        case RPM_SUSPENDED:
                return false;
+       default:
+               return true;
        }
-
-       return true;
 }
 #else
 static bool blk_pm_allow_request(struct request *rq)
@@ -2979,7 +2983,7 @@ void blk_start_request(struct request *req)
                req->throtl_size = blk_rq_sectors(req);
 #endif
                req->rq_flags |= RQF_STATS;
-               wbt_issue(req->q->rq_wb, req);
+               rq_qos_issue(req->q, req);
        }
 
        BUG_ON(blk_rq_is_complete(req));
@@ -3052,6 +3056,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
  *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
  *     %false return from this function.
  *
+ * Note:
+ *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
+ *     blk_rq_bytes() and blk_update_request().
+ *
  * Return:
  *     %false - this request doesn't have any more data
  *     %true  - this request has more data
@@ -3199,7 +3207,7 @@ void blk_finish_request(struct request *req, blk_status_t error)
        blk_account_io_done(req, now);
 
        if (req->end_io) {
-               wbt_done(req->q->rq_wb, req);
+               rq_qos_done(q, req);
                req->end_io(req, error);
        } else {
                if (blk_bidi_rq(req))
@@ -3762,9 +3770,11 @@ EXPORT_SYMBOL(blk_finish_plug);
  */
 void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
 {
-       /* not support for RQF_PM and ->rpm_status in blk-mq yet */
-       if (q->mq_ops)
+       /* Don't enable runtime PM for blk-mq until it is ready */
+       if (q->mq_ops) {
+               pm_runtime_disable(dev);
                return;
+       }
 
        q->dev = dev;
        q->rpm_status = RPM_ACTIVE;
index f23311e..01580f8 100644 (file)
@@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
        atomic_set(&ioc->nr_tasks, 1);
        atomic_set(&ioc->active_ref, 1);
        spin_lock_init(&ioc->lock);
-       INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
+       INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&ioc->icq_list);
        INIT_WORK(&ioc->release_work, ioc_release_fn);
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
new file mode 100644 (file)
index 0000000..19923f8
--- /dev/null
@@ -0,0 +1,955 @@
+/*
+ * Block rq-qos base io controller
+ *
+ * This works similarly to wbt, with a few exceptions:
+ *
+ * - It's bio based, so the latency covers the whole block layer in addition to
+ *   the actual io.
+ * - We will throttle all IO that comes in here if we need to.
+ * - We use the mean latency over the 100ms window.  This is because writes can
+ *   be particularly fast, which could give us a false sense of the impact of
+ *   other workloads on our protected workload.
+ * - By default there's no throttling; we set the queue_depth to UINT_MAX so
+ *   that we can have as many outstanding bios as we're allowed to.  Only at
+ *   throttle time do we pay attention to the actual queue depth.
+ *
+ * The hierarchy works like the cpu controller does: we track the latency at
+ * every configured node, and each configured node has its own independent
+ * queue depth.  This means that we only care about our latency targets at the
+ * peer level.  Some group at the bottom of the hierarchy isn't going to affect
+ * a group at the end of some other path if we're only configured at leaf level.
+ *
+ * Consider the following
+ *
+ *                   root blkg
+ *             /                     \
+ *        fast (target=5ms)     slow (target=10ms)
+ *         /     \                  /        \
+ *       a        b          normal(15ms)   unloved
+ *
+ * "a" and "b" have no target, but their combined io under "fast" cannot exceed
+ * an average latency of 5ms.  If it does then we will throttle the "slow"
+ * group.  In the case of "normal", if it exceeds its 15ms target, we will
+ * throttle "unloved", but nobody else.
+ *
+ * In this example "fast", "slow", and "normal" will be the only groups actually
+ * accounting their io latencies.  We have to walk up the hierarchy to the root
+ * on every submit and complete so we can do the appropriate stat recording and
+ * adjust the queue depth of ourselves if needed.
+ *
+ * There are 2 ways we throttle IO.
+ *
+ * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
+ * number of IOs we're allowed to have in flight.  This starts at UINT_MAX down
+ * to 1.  If the group is only ever submitting IO for itself then this is the
+ * only way we throttle.
+ *
+ * 2) Induced delay throttling.  This is for the case that a group is generating
+ * IO that has to be issued by the root cg to avoid priority inversion. So think
+ * REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting a lot
+ * of work done for us on behalf of the root cg and are being asked to scale
+ * down further, then we induce a latency at userspace return.  We accumulate the
+ * total amount of time we need to be punished by doing
+ *
+ * total_time += min_lat_nsec - actual_io_completion
+ *
+ * and then at throttle time will do
+ *
+ * throttle_time = min(total_time, NSEC_PER_SEC)
+ *
+ * This induced delay will throttle back the activity that is generating the
+ * root cg issued IOs, whether that's some metadata intensive operation or the
+ * group is using so much memory that it is pushing us into swap.
+ *
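+ * For example, if min_lat_nsec is 5ms and a root-issued IO completes in 2ms,
+ * we add 3ms to total_time, and that accumulated punishment is paid off
+ * (capped as above) the next time the task returns to userspace.
+ *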
+ * Copyright (C) 2018 Josef Bacik
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/signal.h>
+#include <trace/events/block.h>
+#include "blk-rq-qos.h"
+#include "blk-stat.h"
+
+#define DEFAULT_SCALE_COOKIE 1000000U
+
+static struct blkcg_policy blkcg_policy_iolatency;
+struct iolatency_grp;
+
+struct blk_iolatency {
+       struct rq_qos rqos;
+       struct timer_list timer;
+       atomic_t enabled;
+};
+
+static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
+{
+       return container_of(rqos, struct blk_iolatency, rqos);
+}
+
+static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
+{
+       return atomic_read(&blkiolat->enabled) > 0;
+}
+
+struct child_latency_info {
+       spinlock_t lock;
+
+       /* Last time we adjusted the scale of everybody. */
+       u64 last_scale_event;
+
+       /* The latency that we missed. */
+       u64 scale_lat;
+
+       /* Total IOs from all of our children for the last summation. */
+       u64 nr_samples;
+
+       /* The group that actually changed the latency numbers. */
+       struct iolatency_grp *scale_grp;
+
+       /* Cookie to tell if we need to scale up or down. */
+       atomic_t scale_cookie;
+};
+
+struct iolatency_grp {
+       struct blkg_policy_data pd;
+       struct blk_rq_stat __percpu *stats;
+       struct blk_iolatency *blkiolat;
+       struct rq_depth rq_depth;
+       struct rq_wait rq_wait;
+       atomic64_t window_start;
+       atomic_t scale_cookie;
+       u64 min_lat_nsec;
+       u64 cur_win_nsec;
+
+       /* total running average of our io latency. */
+       u64 lat_avg;
+
+       /* Our current number of IOs for the last summation. */
+       u64 nr_samples;
+
+       struct child_latency_info child_lat;
+};
+
+#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
+/*
+ * These are the constants used to fake the fixed-point moving average
+ * calculation just like load average.  The call to CALC_LOAD folds
+ * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
+ * window size is bucketed to try to approximately calculate average
+ * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
+ * elapse immediately.  Note, windows only elapse with IO activity.  Idle
+ * periods extend the most recent window.
+ */
+#define BLKIOLATENCY_NR_EXP_FACTORS 5
+#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
+                                     (BLKIOLATENCY_NR_EXP_FACTORS - 1))
+static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
+       2045, // exp(1/600) - 600 samples
+       2039, // exp(1/240) - 240 samples
+       2031, // exp(1/120) - 120 samples
+       2023, // exp(1/80)  - 80 samples
+       2014, // exp(1/60)  - 60 samples
+};
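+/*
+ * For example, a group using the minimum 100ms window falls in the first
+ * bucket (2045, a 1/600 decay), so roughly 600 back-to-back windows, about a
+ * minute of continuous IO, are needed for an old sample to mostly decay out of
+ * lat_avg.  The maximum 1s window lands in the last bucket (2014, 1/60).
+ */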
+
+static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
+{
+       return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
+}
+
+static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
+{
+       return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
+}
+
+static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
+{
+       return pd_to_blkg(&iolat->pd);
+}
+
+static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
+                                      wait_queue_entry_t *wait,
+                                      bool first_block)
+{
+       struct rq_wait *rqw = &iolat->rq_wait;
+
+       if (first_block && waitqueue_active(&rqw->wait) &&
+           rqw->wait.head.next != &wait->entry)
+               return false;
+       return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+}
+
+static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
+                                      struct iolatency_grp *iolat,
+                                      spinlock_t *lock, bool issue_as_root,
+                                      bool use_memdelay)
+       __releases(lock)
+       __acquires(lock)
+{
+       struct rq_wait *rqw = &iolat->rq_wait;
+       unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
+       DEFINE_WAIT(wait);
+       bool first_block = true;
+
+       if (use_delay)
+               blkcg_schedule_throttle(rqos->q, use_memdelay);
+
+       /*
+        * To avoid priority inversions we want to just take a slot if we are
+        * issuing as root.  If we're being killed off there's no point in
+        * delaying things, we may have been killed by OOM so throttling may
+        * make recovery take even longer, so just let the IO's through so the
+        * task can go away.
+        */
+       if (issue_as_root || fatal_signal_pending(current)) {
+               atomic_inc(&rqw->inflight);
+               return;
+       }
+
+       if (iolatency_may_queue(iolat, &wait, first_block))
+               return;
+
+       do {
+               prepare_to_wait_exclusive(&rqw->wait, &wait,
+                                         TASK_UNINTERRUPTIBLE);
+
+               if (iolatency_may_queue(iolat, &wait, first_block))
+                       break;
+               first_block = false;
+
+               if (lock) {
+                       spin_unlock_irq(lock);
+                       io_schedule();
+                       spin_lock_irq(lock);
+               } else {
+                       io_schedule();
+               }
+       } while (1);
+
+       finish_wait(&rqw->wait, &wait);
+}
+
+#define SCALE_DOWN_FACTOR 2
+#define SCALE_UP_FACTOR 4
+
+static inline unsigned long scale_amount(unsigned long qd, bool up)
+{
+       return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
+}
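+/*
+ * For example, with a device queue depth of 128 this gives steps of
+ * 128 >> 4 = 8 when scaling up and 128 >> 2 = 32 when scaling down, so we
+ * back off quickly under pressure but release the throttle more cautiously.
+ */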
+
+/*
+ * We scale the qd down faster than we scale up, so we need to use this helper
+ * to adjust the scale_cookie accordingly so we don't prematurely get
+ * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
+ *
+ * Each group has its own local copy of the last scale cookie it saw, so if
+ * the global scale cookie goes up or down it knows which way it needs to go
+ * based on its last knowledge of it.
+ */
+static void scale_cookie_change(struct blk_iolatency *blkiolat,
+                               struct child_latency_info *lat_info,
+                               bool up)
+{
+       unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+       unsigned long scale = scale_amount(qd, up);
+       unsigned long old = atomic_read(&lat_info->scale_cookie);
+       unsigned long max_scale = qd << 1;
+       unsigned long diff = 0;
+
+       if (old < DEFAULT_SCALE_COOKIE)
+               diff = DEFAULT_SCALE_COOKIE - old;
+
+       if (up) {
+               if (scale + old > DEFAULT_SCALE_COOKIE)
+                       atomic_set(&lat_info->scale_cookie,
+                                  DEFAULT_SCALE_COOKIE);
+               else if (diff > qd)
+                       atomic_inc(&lat_info->scale_cookie);
+               else
+                       atomic_add(scale, &lat_info->scale_cookie);
+       } else {
+               /*
+                * We don't want to dig a hole so deep that it takes us hours to
+                * dig out of it.  Just enough that we don't throttle/unthrottle
+                * with jagged workloads but can still unthrottle once pressure
+                * has sufficiently dissipated.
+                */
+               if (diff > qd) {
+                       if (diff < max_scale)
+                               atomic_dec(&lat_info->scale_cookie);
+               } else {
+                       atomic_sub(scale, &lat_info->scale_cookie);
+               }
+       }
+}
+
+/*
+ * Change the queue depth of the iolatency_grp.  We add 1/16th of the device's
+ * queue depth when scaling up and halve the group's depth when scaling down,
+ * so we don't get wild swings and hopefully dial in to a fairer distribution
+ * of the overall queue depth.
+ */
+static void scale_change(struct iolatency_grp *iolat, bool up)
+{
+       unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+       unsigned long scale = scale_amount(qd, up);
+       unsigned long old = iolat->rq_depth.max_depth;
+       bool changed = false;
+
+       if (old > qd)
+               old = qd;
+
+       if (up) {
+               if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
+                       return;
+
+               if (old < qd) {
+                       changed = true;
+                       old += scale;
+                       old = min(old, qd);
+                       iolat->rq_depth.max_depth = old;
+                       wake_up_all(&iolat->rq_wait.wait);
+               }
+       } else if (old > 1) {
+               old >>= 1;
+               changed = true;
+               iolat->rq_depth.max_depth = max(old, 1UL);
+       }
+}
+
+/* Check our parent and see if the scale cookie has changed. */
+static void check_scale_change(struct iolatency_grp *iolat)
+{
+       struct iolatency_grp *parent;
+       struct child_latency_info *lat_info;
+       unsigned int cur_cookie;
+       unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
+       u64 scale_lat;
+       unsigned int old;
+       int direction = 0;
+
+       if (lat_to_blkg(iolat)->parent == NULL)
+               return;
+
+       parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
+       if (!parent)
+               return;
+
+       lat_info = &parent->child_lat;
+       cur_cookie = atomic_read(&lat_info->scale_cookie);
+       scale_lat = READ_ONCE(lat_info->scale_lat);
+
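+       /*
+        * A parent cookie lower than our cached copy means the parent scaled
+        * down since we last looked, so we need to throttle harder; a higher
+        * cookie means pressure has eased and we can scale back up.
+        */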
+       if (cur_cookie < our_cookie)
+               direction = -1;
+       else if (cur_cookie > our_cookie)
+               direction = 1;
+       else
+               return;
+
+       old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
+
+       /* Somebody beat us to the punch, just bail. */
+       if (old != our_cookie)
+               return;
+
+       if (direction < 0 && iolat->min_lat_nsec) {
+               u64 samples_thresh;
+
+               if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
+                       return;
+
+               /*
+                * Sometimes high priority groups are their own worst enemy, so
+                * instead of taking it out on some poor other group that did 5%
+                * or less of the IOs for the last summation, just skip this
+                * scale down event.
+                */
+               samples_thresh = lat_info->nr_samples * 5;
+               samples_thresh = div64_u64(samples_thresh, 100);
+               if (iolat->nr_samples <= samples_thresh)
+                       return;
+       }
+
+       /* We're as low as we can go. */
+       if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+               blkcg_use_delay(lat_to_blkg(iolat));
+               return;
+       }
+
+       /* We're back to the default cookie, unthrottle all the things. */
+       if (cur_cookie == DEFAULT_SCALE_COOKIE) {
+               blkcg_clear_delay(lat_to_blkg(iolat));
+               iolat->rq_depth.max_depth = UINT_MAX;
+               wake_up_all(&iolat->rq_wait.wait);
+               return;
+       }
+
+       scale_change(iolat, direction > 0);
+}
+
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
+                                    spinlock_t *lock)
+{
+       struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       struct request_queue *q = rqos->q;
+       bool issue_as_root = bio_issue_as_root_blkg(bio);
+
+       if (!blk_iolatency_enabled(blkiolat))
+               return;
+
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+       bio_associate_blkcg(bio, &blkcg->css);
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg)) {
+               if (!lock)
+                       spin_lock_irq(q->queue_lock);
+               blkg = blkg_lookup_create(blkcg, q);
+               if (IS_ERR(blkg))
+                       blkg = NULL;
+               if (!lock)
+                       spin_unlock_irq(q->queue_lock);
+       }
+       if (!blkg)
+               goto out;
+
+       bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+       bio_associate_blkg(bio, blkg);
+out:
+       rcu_read_unlock();
+       while (blkg && blkg->parent) {
+               struct iolatency_grp *iolat = blkg_to_lat(blkg);
+               if (!iolat) {
+                       blkg = blkg->parent;
+                       continue;
+               }
+
+               check_scale_change(iolat);
+               __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+                                    (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+               blkg = blkg->parent;
+       }
+       if (!timer_pending(&blkiolat->timer))
+               mod_timer(&blkiolat->timer, jiffies + HZ);
+}
+
+static void iolatency_record_time(struct iolatency_grp *iolat,
+                                 struct bio_issue *issue, u64 now,
+                                 bool issue_as_root)
+{
+       struct blk_rq_stat *rq_stat;
+       u64 start = bio_issue_time(issue);
+       u64 req_time;
+
+       /*
+        * Truncate now to the same granularity that the issue time was
+        * truncated to, so the two timestamps are directly comparable.
+        */
+       now = __bio_issue_time(now);
+
+       if (now <= start)
+               return;
+
+       req_time = now - start;
+
+       /*
+        * We don't want to count issue_as_root bios in the cgroup's latency
+        * statistics as they could skew the numbers downwards.
+        */
+       if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
+               u64 sub = iolat->min_lat_nsec;
+               if (req_time < sub)
+                       blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
+               return;
+       }
+
+       rq_stat = get_cpu_ptr(iolat->stats);
+       blk_rq_stat_add(rq_stat, req_time);
+       put_cpu_ptr(rq_stat);
+}
+
+#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
+
+static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
+{
+       struct blkcg_gq *blkg = lat_to_blkg(iolat);
+       struct iolatency_grp *parent;
+       struct child_latency_info *lat_info;
+       struct blk_rq_stat stat;
+       unsigned long flags;
+       int cpu, exp_idx;
+
+       blk_rq_stat_init(&stat);
+       preempt_disable();
+       for_each_online_cpu(cpu) {
+               struct blk_rq_stat *s;
+               s = per_cpu_ptr(iolat->stats, cpu);
+               blk_rq_stat_sum(&stat, s);
+               blk_rq_stat_init(s);
+       }
+       preempt_enable();
+
+       parent = blkg_to_lat(blkg->parent);
+       if (!parent)
+               return;
+
+       lat_info = &parent->child_lat;
+
+       /*
+        * CALC_LOAD takes in a number stored in fixed point representation.
+        * Because we are using this for IO time in ns, the values stored
+        * are significantly larger than the FIXED_1 denominator (2048).
+        * Therefore, rounding errors in the calculation are negligible and
+        * can be ignored.
+        */
+       exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+                       div64_u64(iolat->cur_win_nsec,
+                                 BLKIOLATENCY_EXP_BUCKET_SIZE));
+       CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+
+       /* Everything is ok and we don't need to adjust the scale. */
+       if (stat.mean <= iolat->min_lat_nsec &&
+           atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
+               return;
+
+       /* Update our sample totals under the lock before deciding to scale. */
+       spin_lock_irqsave(&lat_info->lock, flags);
+       lat_info->nr_samples -= iolat->nr_samples;
+       lat_info->nr_samples += stat.nr_samples;
+       iolat->nr_samples = stat.nr_samples;
+
+       if ((lat_info->last_scale_event >= now ||
+           now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
+           lat_info->scale_lat <= iolat->min_lat_nsec)
+               goto out;
+
+       if (stat.mean <= iolat->min_lat_nsec &&
+           stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+               if (lat_info->scale_grp == iolat) {
+                       lat_info->last_scale_event = now;
+                       scale_cookie_change(iolat->blkiolat, lat_info, true);
+               }
+       } else if (stat.mean > iolat->min_lat_nsec) {
+               lat_info->last_scale_event = now;
+               if (!lat_info->scale_grp ||
+                   lat_info->scale_lat > iolat->min_lat_nsec) {
+                       WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
+                       lat_info->scale_grp = iolat;
+               }
+               scale_cookie_change(iolat->blkiolat, lat_info, false);
+       }
+out:
+       spin_unlock_irqrestore(&lat_info->lock, flags);
+}
+
+static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
+{
+       struct blkcg_gq *blkg;
+       struct rq_wait *rqw;
+       struct iolatency_grp *iolat;
+       u64 window_start;
+       u64 now = ktime_to_ns(ktime_get());
+       bool issue_as_root = bio_issue_as_root_blkg(bio);
+       bool enabled = false;
+
+       blkg = bio->bi_blkg;
+       if (!blkg)
+               return;
+
+       iolat = blkg_to_lat(bio->bi_blkg);
+       if (!iolat)
+               return;
+
+       enabled = blk_iolatency_enabled(iolat->blkiolat);
+       while (blkg && blkg->parent) {
+               iolat = blkg_to_lat(blkg);
+               if (!iolat) {
+                       blkg = blkg->parent;
+                       continue;
+               }
+               rqw = &iolat->rq_wait;
+
+               atomic_dec(&rqw->inflight);
+               if (!enabled || iolat->min_lat_nsec == 0)
+                       goto next;
+               iolatency_record_time(iolat, &bio->bi_issue, now,
+                                     issue_as_root);
+               window_start = atomic64_read(&iolat->window_start);
+               if (now > window_start &&
+                   (now - window_start) >= iolat->cur_win_nsec) {
+                       if (atomic64_cmpxchg(&iolat->window_start,
+                                       window_start, now) == window_start)
+                               iolatency_check_latencies(iolat, now);
+               }
+next:
+               wake_up(&rqw->wait);
+               blkg = blkg->parent;
+       }
+}
+
+static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+       struct blkcg_gq *blkg;
+
+       blkg = bio->bi_blkg;
+       while (blkg && blkg->parent) {
+               struct rq_wait *rqw;
+               struct iolatency_grp *iolat;
+
+               iolat = blkg_to_lat(blkg);
+               if (!iolat)
+                       goto next;
+
+               rqw = &iolat->rq_wait;
+               atomic_dec(&rqw->inflight);
+               wake_up(&rqw->wait);
+next:
+               blkg = blkg->parent;
+       }
+}
+
+static void blkcg_iolatency_exit(struct rq_qos *rqos)
+{
+       struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+
+       del_timer_sync(&blkiolat->timer);
+       blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
+       kfree(blkiolat);
+}
+
+static struct rq_qos_ops blkcg_iolatency_ops = {
+       .throttle = blkcg_iolatency_throttle,
+       .cleanup = blkcg_iolatency_cleanup,
+       .done_bio = blkcg_iolatency_done_bio,
+       .exit = blkcg_iolatency_exit,
+};
+
+static void blkiolatency_timer_fn(struct timer_list *t)
+{
+       struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
+       struct blkcg_gq *blkg;
+       struct cgroup_subsys_state *pos_css;
+       u64 now = ktime_to_ns(ktime_get());
+
+       rcu_read_lock();
+       blkg_for_each_descendant_pre(blkg, pos_css,
+                                    blkiolat->rqos.q->root_blkg) {
+               struct iolatency_grp *iolat;
+               struct child_latency_info *lat_info;
+               unsigned long flags;
+               u64 cookie;
+
+               /*
+                * We could be exiting, don't access the pd unless we have a
+                * ref on the blkg.
+                */
+               if (!blkg_try_get(blkg))
+                       continue;
+
+               iolat = blkg_to_lat(blkg);
+               if (!iolat)
+                       goto next;
+
+               lat_info = &iolat->child_lat;
+               cookie = atomic_read(&lat_info->scale_cookie);
+
+               if (cookie >= DEFAULT_SCALE_COOKIE)
+                       goto next;
+
+               spin_lock_irqsave(&lat_info->lock, flags);
+               if (lat_info->last_scale_event >= now)
+                       goto next_lock;
+
+               /*
+                * We scaled down but don't have a scale_grp, scale up and carry
+                * on.
+                */
+               if (lat_info->scale_grp == NULL) {
+                       scale_cookie_change(iolat->blkiolat, lat_info, true);
+                       goto next_lock;
+               }
+
+               /*
+                * It's been 5 seconds since our last scale event, clear the
+                * scale grp in case the group that needed the scale down isn't
+                * doing any IO currently.
+                */
+               if (now - lat_info->last_scale_event >=
+                   ((u64)NSEC_PER_SEC * 5))
+                       lat_info->scale_grp = NULL;
+next_lock:
+               spin_unlock_irqrestore(&lat_info->lock, flags);
+next:
+               blkg_put(blkg);
+       }
+       rcu_read_unlock();
+}
+
+int blk_iolatency_init(struct request_queue *q)
+{
+       struct blk_iolatency *blkiolat;
+       struct rq_qos *rqos;
+       int ret;
+
+       blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
+       if (!blkiolat)
+               return -ENOMEM;
+
+       rqos = &blkiolat->rqos;
+       rqos->id = RQ_QOS_CGROUP;
+       rqos->ops = &blkcg_iolatency_ops;
+       rqos->q = q;
+
+       rq_qos_add(q, rqos);
+
+       ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
+       if (ret) {
+               rq_qos_del(q, rqos);
+               kfree(blkiolat);
+               return ret;
+       }
+
+       timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
+
+       return 0;
+}
+
+static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+{
+       struct iolatency_grp *iolat = blkg_to_lat(blkg);
+       struct blk_iolatency *blkiolat = iolat->blkiolat;
+       u64 oldval = iolat->min_lat_nsec;
+
+       iolat->min_lat_nsec = val;
+       iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
+       iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
+                                   BLKIOLATENCY_MAX_WIN_SIZE);
+
+       if (!oldval && val)
+               atomic_inc(&blkiolat->enabled);
+       if (oldval && !val)
+               atomic_dec(&blkiolat->enabled);
+}
+
+static void iolatency_clear_scaling(struct blkcg_gq *blkg)
+{
+       if (blkg->parent) {
+               struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
+               struct child_latency_info *lat_info;
+               if (!iolat)
+                       return;
+
+               lat_info = &iolat->child_lat;
+               spin_lock(&lat_info->lock);
+               atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
+               lat_info->last_scale_event = 0;
+               lat_info->scale_grp = NULL;
+               lat_info->scale_lat = 0;
+               spin_unlock(&lat_info->lock);
+       }
+}
+
+static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
+                            size_t nbytes, loff_t off)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkcg_gq *blkg;
+       struct blk_iolatency *blkiolat;
+       struct blkg_conf_ctx ctx;
+       struct iolatency_grp *iolat;
+       char *p, *tok;
+       u64 lat_val = 0;
+       u64 oldval;
+       int ret;
+
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
+       if (ret)
+               return ret;
+
+       iolat = blkg_to_lat(ctx.blkg);
+       blkiolat = iolat->blkiolat;
+       p = ctx.body;
+
+       ret = -EINVAL;
+       while ((tok = strsep(&p, " "))) {
+               char key[16];
+               char val[21];   /* 18446744073709551616 */
+
+               if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
+                       goto out;
+
+               if (!strcmp(key, "target")) {
+                       u64 v;
+
+                       if (!strcmp(val, "max"))
+                               lat_val = 0;
+                       else if (sscanf(val, "%llu", &v) == 1)
+                               lat_val = v * NSEC_PER_USEC;
+                       else
+                               goto out;
+               } else {
+                       goto out;
+               }
+       }
+
+       /* Walk up the tree to see if our new val is lower than it should be. */
+       blkg = ctx.blkg;
+       oldval = iolat->min_lat_nsec;
+
+       iolatency_set_min_lat_nsec(blkg, lat_val);
+       if (oldval != iolat->min_lat_nsec) {
+               iolatency_clear_scaling(blkg);
+       }
+
+       ret = 0;
+out:
+       blkg_conf_finish(&ctx);
+       return ret ?: nbytes;
+}
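+/*
+ * Example usage (device numbers illustrative): writing "259:0 target=2000" to
+ * io.latency sets a 2ms latency target for that device, and "259:0 target=max"
+ * clears the target again.
+ */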
+
+static u64 iolatency_prfill_limit(struct seq_file *sf,
+                                 struct blkg_policy_data *pd, int off)
+{
+       struct iolatency_grp *iolat = pd_to_lat(pd);
+       const char *dname = blkg_dev_name(pd->blkg);
+
+       if (!dname || !iolat->min_lat_nsec)
+               return 0;
+       seq_printf(sf, "%s target=%llu\n",
+                  dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
+       return 0;
+}
+
+static int iolatency_print_limit(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         iolatency_prfill_limit,
+                         &blkcg_policy_iolatency, seq_cft(sf)->private, false);
+       return 0;
+}
+
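+/*
+ * This string is appended to the group's per-device line in io.stat, e.g.
+ * " depth=32 avg_lat=1250 win=100" (illustrative values; avg_lat is in usec,
+ * win in msec, and depth=max means the group is currently unthrottled).
+ */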
+static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
+                               size_t size)
+{
+       struct iolatency_grp *iolat = pd_to_lat(pd);
+       unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+       unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+
+       if (iolat->rq_depth.max_depth == UINT_MAX)
+               return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
+                                avg_lat, cur_win);
+
+       return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
+                        iolat->rq_depth.max_depth, avg_lat, cur_win);
+}
+
+
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+{
+       struct iolatency_grp *iolat;
+
+       iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+       if (!iolat)
+               return NULL;
+       iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
+                                      __alignof__(struct blk_rq_stat), gfp);
+       if (!iolat->stats) {
+               kfree(iolat);
+               return NULL;
+       }
+       return &iolat->pd;
+}
+
+static void iolatency_pd_init(struct blkg_policy_data *pd)
+{
+       struct iolatency_grp *iolat = pd_to_lat(pd);
+       struct blkcg_gq *blkg = lat_to_blkg(iolat);
+       struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
+       struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
+       u64 now = ktime_to_ns(ktime_get());
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct blk_rq_stat *stat;
+               stat = per_cpu_ptr(iolat->stats, cpu);
+               blk_rq_stat_init(stat);
+       }
+
+       rq_wait_init(&iolat->rq_wait);
+       spin_lock_init(&iolat->child_lat.lock);
+       iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+       iolat->rq_depth.max_depth = UINT_MAX;
+       iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+       iolat->blkiolat = blkiolat;
+       iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
+       atomic64_set(&iolat->window_start, now);
+
+       /*
+        * We init things in list order, so the pd for the parent may not be
+        * init'ed yet for whatever reason.
+        */
+       if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
+               struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
+               atomic_set(&iolat->scale_cookie,
+                          atomic_read(&parent->child_lat.scale_cookie));
+       } else {
+               atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
+       }
+
+       atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
+}
+
+static void iolatency_pd_offline(struct blkg_policy_data *pd)
+{
+       struct iolatency_grp *iolat = pd_to_lat(pd);
+       struct blkcg_gq *blkg = lat_to_blkg(iolat);
+
+       iolatency_set_min_lat_nsec(blkg, 0);
+       iolatency_clear_scaling(blkg);
+}
+
+static void iolatency_pd_free(struct blkg_policy_data *pd)
+{
+       struct iolatency_grp *iolat = pd_to_lat(pd);
+       free_percpu(iolat->stats);
+       kfree(iolat);
+}
+
+static struct cftype iolatency_files[] = {
+       {
+               .name = "latency",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = iolatency_print_limit,
+               .write = iolatency_set_limit,
+       },
+       {}
+};
+
+static struct blkcg_policy blkcg_policy_iolatency = {
+       .dfl_cftypes    = iolatency_files,
+       .pd_alloc_fn    = iolatency_pd_alloc,
+       .pd_init_fn     = iolatency_pd_init,
+       .pd_offline_fn  = iolatency_pd_offline,
+       .pd_free_fn     = iolatency_pd_free,
+       .pd_stat_fn     = iolatency_pd_stat,
+};
+
+static int __init iolatency_init(void)
+{
+       return blkcg_policy_register(&blkcg_policy_iolatency);
+}
+
+static void __exit iolatency_exit(void)
+{
+       return blkcg_policy_unregister(&blkcg_policy_iolatency);
+}
+
+module_init(iolatency_init);
+module_exit(iolatency_exit);
index 8faa70f..d1b9dd0 100644 (file)
@@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                 */
                req_sects = min_t(sector_t, nr_sects,
                                        q->limits.max_discard_sectors);
+               if (!req_sects)
+                       goto fail;
                if (req_sects > UINT_MAX >> 9)
                        req_sects = UINT_MAX >> 9;
 
@@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
        *biop = bio;
        return 0;
+
+fail:
+       if (bio) {
+               submit_bio_wait(bio);
+               bio_put(bio);
+       }
+       *biop = NULL;
+       return -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(__blkdev_issue_discard);
 
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
new file mode 100644 (file)
index 0000000..fb2c82c
--- /dev/null
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Western Digital Corporation or its affiliates.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/blkdev.h>
+#include "blk-mq-debugfs.h"
+
+int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+       struct request_queue *q = data;
+       unsigned int i;
+
+       if (!q->seq_zones_wlock)
+               return 0;
+
+       for (i = 0; i < q->nr_zones; i++)
+               if (test_bit(i, q->seq_zones_wlock))
+                       seq_printf(m, "%u\n", i);
+
+       return 0;
+}
index 1c4532e..cb1e6cf 100644 (file)
@@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
        return count;
 }
 
-static int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
-       struct request_queue *q = data;
-       unsigned int i;
-
-       if (!q->seq_zones_wlock)
-               return 0;
-
-       for (i = 0; i < blk_queue_nr_zones(q); i++)
-               if (test_bit(i, q->seq_zones_wlock))
-                       seq_printf(m, "%u\n", i);
-
-       return 0;
-}
-
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
        { "poll_stat", 0400, queue_poll_stat_show },
        { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
@@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
        return 0;
 }
 
+static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
+{
+       struct blk_mq_hw_ctx *hctx = data;
+
+       seq_printf(m, "%u\n", hctx->dispatch_busy);
+       return 0;
+}
+
 static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
        __acquires(&ctx->lock)
 {
@@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
        {"queued", 0600, hctx_queued_show, hctx_queued_write},
        {"run", 0600, hctx_run_show, hctx_run_write},
        {"active", 0400, hctx_active_show},
+       {"dispatch_busy", 0400, hctx_dispatch_busy_show},
        {},
 };
 
index b9d366e..a9160be 100644 (file)
@@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
 }
 #endif
 
+#ifdef CONFIG_BLK_DEBUG_FS_ZONED
+int queue_zone_wlock_show(void *data, struct seq_file *m);
+#else
+static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+{
+       return 0;
+}
+#endif
+
 #endif
index e233996..db644ec 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/pci.h>
 #include <linux/module.h>
 
+#include "blk-mq.h"
+
 /**
  * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
  * @set:       tagset to provide the mapping for
@@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
 
 fallback:
        WARN_ON_ONCE(set->nr_hw_queues > 1);
-       for_each_possible_cpu(cpu)
-               set->mq_map[cpu] = 0;
+       blk_mq_clear_mq_map(set);
        return 0;
 }
 EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
index 56c493c..cf9c66c 100644 (file)
@@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return;
 
-       if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-               struct request_queue *q = hctx->queue;
-
-               if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                       atomic_inc(&q->shared_hctx_restart);
-       } else
-               set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+       set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
 
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-               return false;
-
-       if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
-               struct request_queue *q = hctx->queue;
-
-               if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                       atomic_dec(&q->shared_hctx_restart);
-       } else
-               clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+               return;
+       clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 
-       return blk_mq_run_hw_queue(hctx, true);
+       blk_mq_run_hw_queue(hctx, true);
 }
 
 /*
@@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
                }
        } else if (has_sched_dispatch) {
                blk_mq_do_dispatch_sched(hctx);
-       } else if (q->mq_ops->get_budget) {
-               /*
-                * If we need to get budget before queuing request, we
-                * dequeue request one by one from sw queue for avoiding
-                * to mess up I/O merge when dispatch runs out of resource.
-                *
-                * TODO: get more budgets, and dequeue more requests in
-                * one time.
-                */
+       } else if (hctx->dispatch_busy) {
+               /* dequeue requests one by one from sw queue if hctx is busy */
                blk_mq_do_dispatch_ctx(hctx);
        } else {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
@@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
                return e->type->ops.mq.bio_merge(hctx, bio);
        }
 
-       if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
+       if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
+                       !list_empty_careful(&ctx->rq_list)) {
                /* default per sw-queue merge */
                spin_lock(&ctx->lock);
                ret = blk_mq_attempt_merge(q, ctx, bio);
@@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
        return false;
 }
 
-/**
- * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
- * @pos:    loop cursor.
- * @skip:   the list element that will not be examined. Iteration starts at
- *          @skip->next.
- * @head:   head of the list to examine. This list must have at least one
- *          element, namely @skip.
- * @member: name of the list_head structure within typeof(*pos).
- */
-#define list_for_each_entry_rcu_rr(pos, skip, head, member)            \
-       for ((pos) = (skip);                                            \
-            (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
-                       (pos)->member.next, typeof(*pos), member) :     \
-             list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
-            (pos) != (skip); )
-
-/*
- * Called after a driver tag has been freed to check whether a hctx needs to
- * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
- * queues in a round-robin fashion if the tag set of @hctx is shared with other
- * hardware queues.
- */
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
-{
-       struct blk_mq_tags *const tags = hctx->tags;
-       struct blk_mq_tag_set *const set = hctx->queue->tag_set;
-       struct request_queue *const queue = hctx->queue, *q;
-       struct blk_mq_hw_ctx *hctx2;
-       unsigned int i, j;
-
-       if (set->flags & BLK_MQ_F_TAG_SHARED) {
-               /*
-                * If this is 0, then we know that no hardware queues
-                * have RESTART marked. We're done.
-                */
-               if (!atomic_read(&queue->shared_hctx_restart))
-                       return;
-
-               rcu_read_lock();
-               list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
-                                          tag_set_list) {
-                       queue_for_each_hw_ctx(q, hctx2, i)
-                               if (hctx2->tags == tags &&
-                                   blk_mq_sched_restart_hctx(hctx2))
-                                       goto done;
-               }
-               j = hctx->queue_num + 1;
-               for (i = 0; i < queue->nr_hw_queues; i++, j++) {
-                       if (j == queue->nr_hw_queues)
-                               j = 0;
-                       hctx2 = queue->queue_hw_ctx[j];
-                       if (hctx2->tags == tags &&
-                           blk_mq_sched_restart_hctx(hctx2))
-                               break;
-               }
-done:
-               rcu_read_unlock();
-       } else {
-               blk_mq_sched_restart_hctx(hctx);
-       }
-}
-
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async)
 {
@@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
 
        if (e && e->type->ops.mq.insert_requests)
                e->type->ops.mq.insert_requests(hctx, list, false);
-       else
+       else {
+               /*
+                * With the 'none' scheduler, try to issue requests directly
+                * if the hw queue isn't busy; this may save us an extra
+                * enqueue and dequeue to the sw queue.
+                */
+               if (!hctx->dispatch_busy && !e && !run_queue_async) {
+                       blk_mq_try_issue_list_directly(hctx, list);
+                       if (list_empty(list))
+                               return;
+               }
                blk_mq_insert_requests(hctx, ctx, list);
+       }
 
        blk_mq_run_hw_queue(hctx, run_queue_async);
 }
index 09b2ee6..c43b339 100644 (file)
@@ -399,8 +399,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;
 
-       tdepth -= tags->nr_reserved_tags;
-
        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
@@ -420,7 +418,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;
 
-               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+               new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
+                               tags->nr_reserved_tags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
@@ -437,7 +436,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
-               sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+               sbitmap_queue_resize(&tags->bitmap_tags,
+                               tdepth - tags->nr_reserved_tags);
        }
 
        return 0;
index 9591926..e13bdc2 100644 (file)
@@ -34,8 +34,8 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
-#include "blk-wbt.h"
 #include "blk-mq-sched.h"
+#include "blk-rq-qos.h"
 
 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -504,7 +504,7 @@ void blk_mq_free_request(struct request *rq)
        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                laptop_io_completion(q->backing_dev_info);
 
-       wbt_done(q->rq_wb, rq);
+       rq_qos_done(q, rq);
 
        if (blk_rq_rl(rq))
                blk_put_rl(blk_rq_rl(rq));
@@ -527,7 +527,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
        blk_account_io_done(rq, now);
 
        if (rq->end_io) {
-               wbt_done(rq->q->rq_wb, rq);
+               rq_qos_done(rq->q, rq);
                rq->end_io(rq, error);
        } else {
                if (unlikely(blk_bidi_rq(rq)))
@@ -641,7 +641,7 @@ void blk_mq_start_request(struct request *rq)
                rq->throtl_size = blk_rq_sectors(rq);
 #endif
                rq->rq_flags |= RQF_STATS;
-               wbt_issue(q->rq_wb, rq);
+               rq_qos_issue(q, rq);
        }
 
        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@ -667,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq)
        blk_mq_put_driver_tag(rq);
 
        trace_block_rq_requeue(q, rq);
-       wbt_requeue(q->rq_wb, rq);
+       rq_qos_requeue(q, rq);
 
        if (blk_mq_request_started(rq)) {
                WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -964,17 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued)
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
-bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
-                          bool wait)
+bool blk_mq_get_driver_tag(struct request *rq)
 {
        struct blk_mq_alloc_data data = {
                .q = rq->q,
                .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
-               .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+               .flags = BLK_MQ_REQ_NOWAIT,
        };
 
-       might_sleep_if(wait);
-
        if (rq->tag != -1)
                goto done;
 
@@ -991,8 +988,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
        }
 
 done:
-       if (hctx)
-               *hctx = data.hctx;
        return rq->tag != -1;
 }
 
@@ -1003,7 +998,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 
        hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 
+       spin_lock(&hctx->dispatch_wait_lock);
        list_del_init(&wait->entry);
+       spin_unlock(&hctx->dispatch_wait_lock);
+
        blk_mq_run_hw_queue(hctx, true);
        return 1;
 }
@@ -1014,17 +1012,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
  * restart. For both cases, take care to check the condition again after
  * marking us as waiting.
  */
-static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
 {
-       struct blk_mq_hw_ctx *this_hctx = *hctx;
-       struct sbq_wait_state *ws;
+       struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;
 
-       if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
-               if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
-                       set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+               if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+                       set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 
                /*
                 * It's possible that a tag was freed in the window between the
@@ -1034,30 +1031,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
                 * Don't clear RESTART here, someone else could have set it.
                 * At most this will cost an extra queue run.
                 */
-               return blk_mq_get_driver_tag(rq, hctx, false);
+               return blk_mq_get_driver_tag(rq);
        }
 
-       wait = &this_hctx->dispatch_wait;
+       wait = &hctx->dispatch_wait;
        if (!list_empty_careful(&wait->entry))
                return false;
 
-       spin_lock(&this_hctx->lock);
+       wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
+
+       spin_lock_irq(&wq->lock);
+       spin_lock(&hctx->dispatch_wait_lock);
        if (!list_empty(&wait->entry)) {
-               spin_unlock(&this_hctx->lock);
+               spin_unlock(&hctx->dispatch_wait_lock);
+               spin_unlock_irq(&wq->lock);
                return false;
        }
 
-       ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-       add_wait_queue(&ws->wait, wait);
+       wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+       __add_wait_queue(wq, wait);
 
        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
-       ret = blk_mq_get_driver_tag(rq, hctx, false);
+       ret = blk_mq_get_driver_tag(rq);
        if (!ret) {
-               spin_unlock(&this_hctx->lock);
+               spin_unlock(&hctx->dispatch_wait_lock);
+               spin_unlock_irq(&wq->lock);
                return false;
        }
 
@@ -1065,14 +1067,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
         * We got a tag, remove ourselves from the wait queue to ensure
         * someone else gets the wakeup.
         */
-       spin_lock_irq(&ws->wait.lock);
        list_del_init(&wait->entry);
-       spin_unlock_irq(&ws->wait.lock);
-       spin_unlock(&this_hctx->lock);
+       spin_unlock(&hctx->dispatch_wait_lock);
+       spin_unlock_irq(&wq->lock);
 
        return true;
 }
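/*
 * Annotation (not part of the patch): the lock order above mirrors the wake
 * side. blk_mq_dispatch_wake() runs with the sbitmap waitqueue lock already
 * held by the waker and takes dispatch_wait_lock nested inside it, so the
 * slow path here takes wq->lock (irqs disabled) first and dispatch_wait_lock
 * second before re-checking the wait entry and retrying the tag allocation.
 */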
 
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
+/*
+ * Update the dispatch-busy state with an Exponentially Weighted Moving
+ * Average (EWMA):
+ * - EWMA is a simple way to maintain a running average
+ * - weights of 7/8 (old) and 1/8 (new) make the value decay exponentially
+ * - a busy event contributes 1 << FACTOR (16) rather than 1, so the integer
+ *   division doesn't truncate the result straight to 0; the exact factor
+ *   matters little because of the exponential decay
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+       unsigned int ewma;
+
+       if (hctx->queue->elevator)
+               return;
+
+       ewma = hctx->dispatch_busy;
+
+       if (!ewma && !busy)
+               return;
+
+       ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+       if (busy)
+               ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+       ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+       hctx->dispatch_busy = ewma;
+}
+
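As an illustrative aside (not part of the patch), the update above can be exercised in isolation; the sketch below mirrors the same arithmetic in userspace with WEIGHT = 8 and FACTOR = 4, so each busy dispatch mixes in 16 and each idle one decays the average by 1/8.

#include <stdbool.h>
#include <stdio.h>

/* Userspace mirror of blk_mq_update_dispatch_busy()'s EWMA step. */
static unsigned int ewma_step(unsigned int ewma, bool busy)
{
	ewma *= 8 - 1;			/* keep 7/8 of the old value */
	if (busy)
		ewma += 1 << 4;		/* mix in 16 for a busy dispatch */
	return ewma / 8;
}

int main(void)
{
	unsigned int ewma = 0;

	ewma = ewma_step(ewma, true);	/* 2 */
	ewma = ewma_step(ewma, true);	/* 3 */
	ewma = ewma_step(ewma, false);	/* 2 */
	ewma = ewma_step(ewma, false);	/* 1 */
	printf("%u\n", ewma);		/* prints 1 */
	return 0;
}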
 #define BLK_MQ_RESOURCE_DELAY  3               /* ms units */
 
 /*
@@ -1105,7 +1135,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
                        break;
 
-               if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+               if (!blk_mq_get_driver_tag(rq)) {
                        /*
                         * The initial allocation attempt failed, so we need to
                         * rerun the hardware queue when a tag is freed. The
@@ -1113,7 +1143,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                         * before we add this entry back on the dispatch list,
                         * we'll re-run it below.
                         */
-                       if (!blk_mq_mark_tag_wait(&hctx, rq)) {
+                       if (!blk_mq_mark_tag_wait(hctx, rq)) {
                                blk_mq_put_dispatch_budget(hctx);
                                /*
                                 * For non-shared tags, the RESTART check
@@ -1137,7 +1167,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                        bd.last = true;
                else {
                        nxt = list_first_entry(list, struct request, queuelist);
-                       bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+                       bd.last = !blk_mq_get_driver_tag(nxt);
                }
 
                ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1209,8 +1239,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                else if (needs_restart && (ret == BLK_STS_RESOURCE))
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
+               blk_mq_update_dispatch_busy(hctx, true);
                return false;
-       }
+       } else
+               blk_mq_update_dispatch_busy(hctx, false);
 
        /*
         * If the host/device is unable to accept more work, inform the
@@ -1544,19 +1576,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                            struct list_head *list)
 
 {
+       struct request *rq;
+
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
         */
-       spin_lock(&ctx->lock);
-       while (!list_empty(list)) {
-               struct request *rq;
-
-               rq = list_first_entry(list, struct request, queuelist);
+       list_for_each_entry(rq, list, queuelist) {
                BUG_ON(rq->mq_ctx != ctx);
-               list_del_init(&rq->queuelist);
-               __blk_mq_insert_req_list(hctx, rq, false);
+               trace_block_rq_insert(hctx->queue, rq);
        }
+
+       spin_lock(&ctx->lock);
+       list_splice_tail_init(list, &ctx->rq_list);
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
 }
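/*
 * Annotation (not part of the patch): the trace points fire in the
 * open-coded loop above because list_splice_tail_init() reinitialises
 * 'list' to empty; once the requests have been spliced onto ctx->rq_list
 * under ctx->lock there is nothing left on 'list' to iterate over.
 */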
@@ -1659,13 +1691,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
        ret = q->mq_ops->queue_rq(hctx, &bd);
        switch (ret) {
        case BLK_STS_OK:
+               blk_mq_update_dispatch_busy(hctx, false);
                *cookie = new_cookie;
                break;
        case BLK_STS_RESOURCE:
        case BLK_STS_DEV_RESOURCE:
+               blk_mq_update_dispatch_busy(hctx, true);
                __blk_mq_requeue_request(rq);
                break;
        default:
+               blk_mq_update_dispatch_busy(hctx, false);
                *cookie = BLK_QC_T_NONE;
                break;
        }
@@ -1700,7 +1735,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
        if (!blk_mq_get_dispatch_budget(hctx))
                goto insert;
 
-       if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+       if (!blk_mq_get_driver_tag(rq)) {
                blk_mq_put_dispatch_budget(hctx);
                goto insert;
        }
@@ -1748,6 +1783,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
        return ret;
 }
 
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+               struct list_head *list)
+{
+       while (!list_empty(list)) {
+               blk_status_t ret;
+               struct request *rq = list_first_entry(list, struct request,
+                               queuelist);
+
+               list_del_init(&rq->queuelist);
+               ret = blk_mq_request_issue_directly(rq);
+               if (ret != BLK_STS_OK) {
+                       if (ret == BLK_STS_RESOURCE ||
+                                       ret == BLK_STS_DEV_RESOURCE) {
+                               list_add(&rq->queuelist, list);
+                               break;
+                       }
+                       blk_mq_end_request(rq, ret);
+               }
+       }
+}
+
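/*
 * Annotation (not part of the patch): on BLK_STS_RESOURCE or
 * BLK_STS_DEV_RESOURCE the failing request is put back at the head of
 * 'list' and the loop stops, leaving the remainder for the caller to
 * insert through the normal path; any other error ends the request.
 */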
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
@@ -1758,7 +1814,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
-       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
@@ -1774,19 +1829,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        if (blk_mq_sched_bio_merge(q, bio))
                return BLK_QC_T_NONE;
 
-       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+       rq_qos_throttle(q, bio, NULL);
 
        trace_block_getrq(q, bio, bio->bi_opf);
 
        rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
-               __wbt_done(q->rq_wb, wb_acct);
+               rq_qos_cleanup(q, bio);
                if (bio->bi_opf & REQ_NOWAIT)
                        bio_wouldblock_error(bio);
                return BLK_QC_T_NONE;
        }
 
-       wbt_track(rq, wb_acct);
+       rq_qos_track(q, rq, bio);
 
        cookie = request_to_qc_t(data.hctx, rq);
 
@@ -1849,7 +1904,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                        blk_mq_try_issue_directly(data.hctx, same_queue_rq,
                                        &cookie);
                }
-       } else if (q->nr_hw_queues > 1 && is_sync) {
+       } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+                       !data.hctx->dispatch_busy)) {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
                blk_mq_try_issue_directly(data.hctx, rq, &cookie);
@@ -2148,6 +2204,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 
        hctx->nr_ctx = 0;
 
+       spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
 
@@ -2333,15 +2390,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               if (shared) {
-                       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                               atomic_inc(&q->shared_hctx_restart);
+               if (shared)
                        hctx->flags |= BLK_MQ_F_TAG_SHARED;
-               } else {
-                       if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-                               atomic_dec(&q->shared_hctx_restart);
+               else
                        hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
-               }
        }
 }
 
@@ -2372,7 +2424,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
                blk_mq_update_tag_set_depth(set, false);
        }
        mutex_unlock(&set->tag_list_lock);
-       synchronize_rcu();
        INIT_LIST_HEAD(&q->tag_set_list);
 }
 
@@ -2687,7 +2738,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 {
        if (set->ops->map_queues) {
-               int cpu;
                /*
                 * transport .map_queues is usually done in the following
                 * way:
@@ -2702,8 +2752,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
                 * killing stale mapping since one CPU may not be mapped
                 * to any hw queue.
                 */
-               for_each_possible_cpu(cpu)
-                       set->mq_map[cpu] = 0;
+               blk_mq_clear_mq_map(set);
 
                return set->ops->map_queues(set);
        } else
@@ -2713,7 +2762,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
- * requested depth down, if if it too large. In that case, the set
+ * requested depth down, if it's too large. In that case, the set
  * value will be stored in set->queue_depth.
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
index 89231e4..9497b47 100644 (file)
@@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
-                               bool wait);
+bool blk_mq_get_driver_tag(struct request *rq);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start);
 
@@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 /* Used by blk_insert_cloned_request() to issue request directly */
 blk_status_t blk_mq_request_issue_directly(struct request *rq);
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+                                   struct list_head *list);
 
 /*
  * CPU -> queue mappings
@@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
        __blk_mq_put_driver_tag(hctx, rq);
 }
 
+static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               set->mq_map[cpu] = 0;
+}
+
 #endif
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
new file mode 100644 (file)
index 0000000..0005dfd
--- /dev/null
@@ -0,0 +1,194 @@
+#include "blk-rq-qos.h"
+
+/*
+ * Increment 'v' if 'v' is below 'below'. Returns true if the increment
+ * happened, false if 'v' was already at or above 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, unsigned int below)
+{
+       unsigned int cur = atomic_read(v);
+
+       for (;;) {
+               unsigned int old;
+
+               if (cur >= below)
+                       return false;
+               old = atomic_cmpxchg(v, cur, cur + 1);
+               if (old == cur)
+                       break;
+               cur = old;
+       }
+
+       return true;
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
+{
+       return atomic_inc_below(&rq_wait->inflight, limit);
+}
+
+void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->cleanup)
+                       rqos->ops->cleanup(rqos, bio);
+       }
+}
+
+void rq_qos_done(struct request_queue *q, struct request *rq)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->done)
+                       rqos->ops->done(rqos, rq);
+       }
+}
+
+void rq_qos_issue(struct request_queue *q, struct request *rq)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->issue)
+                       rqos->ops->issue(rqos, rq);
+       }
+}
+
+void rq_qos_requeue(struct request_queue *q, struct request *rq)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->requeue)
+                       rqos->ops->requeue(rqos, rq);
+       }
+}
+
+void rq_qos_throttle(struct request_queue *q, struct bio *bio,
+                    spinlock_t *lock)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->throttle)
+                       rqos->ops->throttle(rqos, bio, lock);
+       }
+}
+
+void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->track)
+                       rqos->ops->track(rqos, rq, bio);
+       }
+}
+
+void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+{
+       struct rq_qos *rqos;
+
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->ops->done_bio)
+                       rqos->ops->done_bio(rqos, bio);
+       }
+}
+
+/*
+ * Return true if the depth can't be increased any further by scaling.
+ */
+bool rq_depth_calc_max_depth(struct rq_depth *rqd)
+{
+       unsigned int depth;
+       bool ret = false;
+
+       /*
+        * For QD=1 devices, this is a special case. It's important for those
+        * to have one request ready when one completes, so force a depth of
+        * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+        * since the device can't have more than that in flight. If we're
+        * scaling down, then keep a setting of 1/1/1.
+        */
+       if (rqd->queue_depth == 1) {
+               if (rqd->scale_step > 0)
+                       rqd->max_depth = 1;
+               else {
+                       rqd->max_depth = 2;
+                       ret = true;
+               }
+       } else {
+               /*
+                * scale_step == 0 is our default state. If we have suffered
+                * latency spikes, step will be > 0, and we shrink the
+                * allowed write depths. If step is < 0, we're only doing
+                * writes, and we allow a temporarily higher depth to
+                * increase performance.
+                */
+               depth = min_t(unsigned int, rqd->default_depth,
+                             rqd->queue_depth);
+               if (rqd->scale_step > 0)
+                       depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
+               else if (rqd->scale_step < 0) {
+                       unsigned int maxd = 3 * rqd->queue_depth / 4;
+
+                       depth = 1 + ((depth - 1) << -rqd->scale_step);
+                       if (depth > maxd) {
+                               depth = maxd;
+                               ret = true;
+                       }
+               }
+
+               rqd->max_depth = depth;
+       }
+
+       return ret;
+}
+
+void rq_depth_scale_up(struct rq_depth *rqd)
+{
+       /*
+        * Hit max in previous round, stop here
+        */
+       if (rqd->scaled_max)
+               return;
+
+       rqd->scale_step--;
+
+       rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+}
+
+/*
+ * Scale the depth down. If 'hard_throttle' is set, do it more aggressively,
+ * since we had a latency violation.
+ */
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+{
+       /*
+        * Stop scaling down when we've hit the limit. This also prevents
+        * ->scale_step from going to crazy values, if the device can't
+        * keep up.
+        */
+       if (rqd->max_depth == 1)
+               return;
+
+       if (rqd->scale_step < 0 && hard_throttle)
+               rqd->scale_step = 0;
+       else
+               rqd->scale_step++;
+
+       rqd->scaled_max = false;
+       rq_depth_calc_max_depth(rqd);
+}
+
+void rq_qos_exit(struct request_queue *q)
+{
+       while (q->rq_qos) {
+               struct rq_qos *rqos = q->rq_qos;
+               q->rq_qos = rqos->next;
+               rqos->ops->exit(rqos);
+       }
+}
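To make the scaling arithmetic in rq_depth_calc_max_depth() above concrete, here is a worked example with assumed values (default_depth = 64 and queue_depth = 128 are illustrative, not taken from the patch):

/*
 * scale_step =  0: depth = min(64, 128)           = 64  (returns false)
 * scale_step =  2: depth = 1 + ((64 - 1) >> 2)    = 16  (returns false)
 * scale_step = -1: depth = 1 + ((64 - 1) << 1)    = 127,
 *                  capped at 3 * 128 / 4          = 96  (returns true)
 */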
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
new file mode 100644 (file)
index 0000000..32b02ef
--- /dev/null
@@ -0,0 +1,109 @@
+#ifndef RQ_QOS_H
+#define RQ_QOS_H
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk_types.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
+enum rq_qos_id {
+       RQ_QOS_WBT,
+       RQ_QOS_CGROUP,
+};
+
+struct rq_wait {
+       wait_queue_head_t wait;
+       atomic_t inflight;
+};
+
+struct rq_qos {
+       struct rq_qos_ops *ops;
+       struct request_queue *q;
+       enum rq_qos_id id;
+       struct rq_qos *next;
+};
+
+struct rq_qos_ops {
+       void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *);
+       void (*track)(struct rq_qos *, struct request *, struct bio *);
+       void (*issue)(struct rq_qos *, struct request *);
+       void (*requeue)(struct rq_qos *, struct request *);
+       void (*done)(struct rq_qos *, struct request *);
+       void (*done_bio)(struct rq_qos *, struct bio *);
+       void (*cleanup)(struct rq_qos *, struct bio *);
+       void (*exit)(struct rq_qos *);
+};
+
+struct rq_depth {
+       unsigned int max_depth;
+
+       int scale_step;
+       bool scaled_max;
+
+       unsigned int queue_depth;
+       unsigned int default_depth;
+};
+
+static inline struct rq_qos *rq_qos_id(struct request_queue *q,
+                                      enum rq_qos_id id)
+{
+       struct rq_qos *rqos;
+       for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+               if (rqos->id == id)
+                       break;
+       }
+       return rqos;
+}
+
+static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
+{
+       return rq_qos_id(q, RQ_QOS_WBT);
+}
+
+static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
+{
+       return rq_qos_id(q, RQ_QOS_CGROUP);
+}
+
+static inline void rq_wait_init(struct rq_wait *rq_wait)
+{
+       atomic_set(&rq_wait->inflight, 0);
+       init_waitqueue_head(&rq_wait->wait);
+}
+
+static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+{
+       rqos->next = q->rq_qos;
+       q->rq_qos = rqos;
+}
+
+static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+{
+       struct rq_qos *cur, *prev = NULL;
+       for (cur = q->rq_qos; cur; cur = cur->next) {
+               if (cur == rqos) {
+                       if (prev)
+                               prev->next = rqos->next;
+                       else
+                               q->rq_qos = rqos->next;
+                       break;
+               }
+               prev = cur;
+       }
+}
+
+bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
+void rq_depth_scale_up(struct rq_depth *rqd);
+void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_calc_max_depth(struct rq_depth *rqd);
+
+void rq_qos_cleanup(struct request_queue *, struct bio *);
+void rq_qos_done(struct request_queue *, struct request *);
+void rq_qos_issue(struct request_queue *, struct request *);
+void rq_qos_requeue(struct request_queue *, struct request *);
+void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
+void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
+void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
+void rq_qos_exit(struct request_queue *);
+#endif
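For orientation, here is a minimal sketch of how a policy hooks into this chain: embed a struct rq_qos, fill in only the ops it needs, and attach it with rq_qos_add(). The 'rq_count' policy below is hypothetical (it is not part of the patch, and a real policy would want its own rq_qos_id); it only shows the registration pattern that wbt and blk-iolatency follow.

/* Hypothetical example policy: count completed requests on a queue. */
struct rq_count {
	struct rq_qos rqos;
	atomic_long_t completed;
};

static void rq_count_done(struct rq_qos *rqos, struct request *rq)
{
	struct rq_count *rqc = container_of(rqos, struct rq_count, rqos);

	atomic_long_inc(&rqc->completed);
}

static void rq_count_exit(struct rq_qos *rqos)
{
	kfree(container_of(rqos, struct rq_count, rqos));
}

static struct rq_qos_ops rq_count_ops = {
	.done = rq_count_done,
	.exit = rq_count_exit,
};

static int rq_count_init(struct request_queue *q)
{
	struct rq_count *rqc = kzalloc(sizeof(*rqc), GFP_KERNEL);

	if (!rqc)
		return -ENOMEM;

	rqc->rqos.id = RQ_QOS_CGROUP;	/* assumption: reuse an existing id */
	rqc->rqos.ops = &rq_count_ops;
	rqc->rqos.q = q;
	rq_qos_add(q, &rqc->rqos);	/* rq_qos_done() will now call us */
	return 0;
}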
index d1de711..ffd4599 100644 (file)
@@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 
        /* Inherit limits from component devices */
        lim->max_segments = USHRT_MAX;
-       lim->max_discard_segments = 1;
+       lim->max_discard_segments = USHRT_MAX;
        lim->max_hw_sectors = UINT_MAX;
        lim->max_segment_size = UINT_MAX;
        lim->max_sectors = UINT_MAX;
@@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
        q->queue_depth = depth;
-       wbt_set_queue_depth(q->rq_wb, depth);
+       wbt_set_queue_depth(q, depth);
 }
 EXPORT_SYMBOL(blk_set_queue_depth);
 
@@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
                queue_flag_clear(QUEUE_FLAG_FUA, q);
        spin_unlock_irq(q->queue_lock);
 
-       wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+       wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
index 175c143..7587b1c 100644 (file)
@@ -17,7 +17,7 @@ struct blk_queue_stats {
        bool enable_accounting;
 };
 
-static void blk_stat_init(struct blk_rq_stat *stat)
+void blk_rq_stat_init(struct blk_rq_stat *stat)
 {
        stat->min = -1ULL;
        stat->max = stat->nr_samples = stat->mean = 0;
@@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat)
 }
 
 /* src is a per-cpu stat, mean isn't initialized */
-static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 {
        if (!src->nr_samples)
                return;
@@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
        dst->nr_samples += src->nr_samples;
 }
 
-static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
+void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
 {
        stat->min = min(stat->min, value);
        stat->max = max(stat->max, value);
@@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now)
                        continue;
 
                stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
-               __blk_stat_add(stat, value);
+               blk_rq_stat_add(stat, value);
                put_cpu_ptr(cb->cpu_stat);
        }
        rcu_read_unlock();
@@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t)
        int cpu;
 
        for (bucket = 0; bucket < cb->buckets; bucket++)
-               blk_stat_init(&cb->stat[bucket]);
+               blk_rq_stat_init(&cb->stat[bucket]);
 
        for_each_online_cpu(cpu) {
                struct blk_rq_stat *cpu_stat;
 
                cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
                for (bucket = 0; bucket < cb->buckets; bucket++) {
-                       blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
-                       blk_stat_init(&cpu_stat[bucket]);
+                       blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+                       blk_rq_stat_init(&cpu_stat[bucket]);
                }
        }
 
@@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q,
 
                cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
                for (bucket = 0; bucket < cb->buckets; bucket++)
-                       blk_stat_init(&cpu_stat[bucket]);
+                       blk_rq_stat_init(&cpu_stat[bucket]);
        }
 
        spin_lock(&q->stats->lock);
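With the blk_stat_* helpers renamed to blk_rq_stat_* and exported through blk-stat.h, other block-layer code (blk-iolatency being the obvious consumer) can reuse them. A minimal sketch of the pattern, assuming a caller that keeps its own struct blk_rq_stat aggregate; fold_samples() is a hypothetical helper, not part of the patch:

/* Hypothetical helper: fold a batch of nanosecond samples into 'aggregate'. */
static void fold_samples(struct blk_rq_stat *aggregate,
			 const u64 *samples, int nr)
{
	struct blk_rq_stat batch;
	int i;

	blk_rq_stat_init(&batch);		/* min = -1ULL, counters zeroed */
	for (i = 0; i < nr; i++)
		blk_rq_stat_add(&batch, samples[i]);
	blk_rq_stat_sum(aggregate, &batch);	/* merges min/max/mean/nr_samples */
}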
index 78399cd..f4a1568 100644 (file)
@@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
        mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
 }
 
+void blk_rq_stat_add(struct blk_rq_stat *, u64);
+void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+void blk_rq_stat_init(struct blk_rq_stat *);
+
 #endif
index 94987b1..49c29a5 100644 (file)
@@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
-       if (!q->rq_wb)
+       if (!wbt_rq_qos(q))
                return -EINVAL;
 
-       return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+       return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
 }
 
 static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
                                  size_t count)
 {
-       struct rq_wb *rwb;
+       struct rq_qos *rqos;
        ssize_t ret;
        s64 val;
 
@@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
        if (val < -1)
                return -EINVAL;
 
-       rwb = q->rq_wb;
-       if (!rwb) {
+       rqos = wbt_rq_qos(q);
+       if (!rqos) {
                ret = wbt_init(q);
                if (ret)
                        return ret;
        }
 
-       rwb = q->rq_wb;
        if (val == -1)
-               rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+               val = wbt_default_latency_nsec(q);
        else if (val >= 0)
-               rwb->min_lat_nsec = val * 1000ULL;
+               val *= 1000ULL;
 
-       if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
-               rwb->enable_state = WBT_STATE_ON_MANUAL;
+       wbt_set_min_lat(q, val);
 
-       wbt_update_limits(rwb);
+       wbt_update_limits(q);
        return count;
 }
 
@@ -964,7 +962,7 @@ void blk_unregister_queue(struct gendisk *disk)
        kobject_del(&q->kobj);
        blk_trace_remove_sysfs(disk_to_dev(disk));
 
-       wbt_exit(q);
+       rq_qos_exit(q);
 
        mutex_lock(&q->sysfs_lock);
        if (q->request_fn || (q->mq_ops && q->elevator))
index 82282e6..caaabbe 100644 (file)
@@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
                struct throtl_grp *tg = blkg_to_tg(blkg);
 
                if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
-                   tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+                   tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
                        low_valid = true;
+                       break;
+               }
        }
        rcu_read_unlock();
 
@@ -2132,12 +2134,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
 {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       if (bio->bi_css) {
-               if (bio->bi_cg_private)
-                       blkg_put(tg_to_blkg(bio->bi_cg_private));
-               bio->bi_cg_private = tg;
-               blkg_get(tg_to_blkg(tg));
-       }
+       if (bio->bi_css)
+               bio_associate_blkg(bio, tg_to_blkg(tg));
        bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
@@ -2285,6 +2283,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
 
 void blk_throtl_bio_endio(struct bio *bio)
 {
+       struct blkcg_gq *blkg;
        struct throtl_grp *tg;
        u64 finish_time_ns;
        unsigned long finish_time;
@@ -2292,20 +2291,18 @@ void blk_throtl_bio_endio(struct bio *bio)
        unsigned long lat;
        int rw = bio_data_dir(bio);
 
-       tg = bio->bi_cg_private;
-       if (!tg)
+       blkg = bio->bi_blkg;
+       if (!blkg)
                return;
-       bio->bi_cg_private = NULL;
+       tg = blkg_to_tg(blkg);
 
        finish_time_ns = ktime_get_ns();
        tg->last_finish_time = finish_time_ns >> 10;
 
        start_time = bio_issue_time(&bio->bi_issue) >> 10;
        finish_time = __bio_issue_time(finish_time_ns) >> 10;
-       if (!start_time || finish_time <= start_time) {
-               blkg_put(tg_to_blkg(tg));
+       if (!start_time || finish_time <= start_time)
                return;
-       }
 
        lat = finish_time - start_time;
        /* this is only for bio based driver */
@@ -2334,8 +2331,6 @@ void blk_throtl_bio_endio(struct bio *bio)
                tg->bio_cnt /= 2;
                tg->bad_bio_cnt /= 2;
        }
-
-       blkg_put(tg_to_blkg(tg));
 }
 #endif
 
index 4f89b28..461a9af 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/swap.h>
 
 #include "blk-wbt.h"
+#include "blk-rq-qos.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
@@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb)
        return rwb && rwb->wb_normal != 0;
 }
 
-/*
- * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- * false if 'v' + 1 would be bigger than 'below'.
- */
-static bool atomic_inc_below(atomic_t *v, int below)
-{
-       int cur = atomic_read(v);
-
-       for (;;) {
-               int old;
-
-               if (cur >= below)
-                       return false;
-               old = atomic_cmpxchg(v, cur, cur + 1);
-               if (old == cur)
-                       break;
-               cur = old;
-       }
-
-       return true;
-}
-
 static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
 {
        if (rwb_enabled(rwb)) {
@@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-       struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
+       struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
 
        return time_before(jiffies, wb->dirty_sleep + HZ);
 }
@@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb)
        }
 }
 
-void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
 {
+       struct rq_wb *rwb = RQWB(rqos);
        struct rq_wait *rqw;
        int inflight, limit;
 
@@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
  * Called on completion of a request. Note that it's also called when
  * a request is merged, when the request gets freed.
  */
-void wbt_done(struct rq_wb *rwb, struct request *rq)
+static void wbt_done(struct rq_qos *rqos, struct request *rq)
 {
-       if (!rwb)
-               return;
+       struct rq_wb *rwb = RQWB(rqos);
 
        if (!wbt_is_tracked(rq)) {
                if (rwb->sync_cookie == rq) {
@@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq)
                        wb_timestamp(rwb, &rwb->last_comp);
        } else {
                WARN_ON_ONCE(rq == rwb->sync_cookie);
-               __wbt_done(rwb, wbt_flags(rq));
+               __wbt_done(rqos, wbt_flags(rq));
        }
        wbt_clear_state(rq);
 }
 
-/*
- * Return true, if we can't increase the depth further by scaling
- */
-static bool calc_wb_limits(struct rq_wb *rwb)
-{
-       unsigned int depth;
-       bool ret = false;
-
-       if (!rwb->min_lat_nsec) {
-               rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
-               return false;
-       }
-
-       /*
-        * For QD=1 devices, this is a special case. It's important for those
-        * to have one request ready when one completes, so force a depth of
-        * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
-        * since the device can't have more than that in flight. If we're
-        * scaling down, then keep a setting of 1/1/1.
-        */
-       if (rwb->queue_depth == 1) {
-               if (rwb->scale_step > 0)
-                       rwb->wb_max = rwb->wb_normal = 1;
-               else {
-                       rwb->wb_max = rwb->wb_normal = 2;
-                       ret = true;
-               }
-               rwb->wb_background = 1;
-       } else {
-               /*
-                * scale_step == 0 is our default state. If we have suffered
-                * latency spikes, step will be > 0, and we shrink the
-                * allowed write depths. If step is < 0, we're only doing
-                * writes, and we allow a temporarily higher depth to
-                * increase performance.
-                */
-               depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
-               if (rwb->scale_step > 0)
-                       depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
-               else if (rwb->scale_step < 0) {
-                       unsigned int maxd = 3 * rwb->queue_depth / 4;
-
-                       depth = 1 + ((depth - 1) << -rwb->scale_step);
-                       if (depth > maxd) {
-                               depth = maxd;
-                               ret = true;
-                       }
-               }
-
-               /*
-                * Set our max/normal/bg queue depths based on how far
-                * we have scaled down (->scale_step).
-                */
-               rwb->wb_max = depth;
-               rwb->wb_normal = (rwb->wb_max + 1) / 2;
-               rwb->wb_background = (rwb->wb_max + 3) / 4;
-       }
-
-       return ret;
-}
-
 static inline bool stat_sample_valid(struct blk_rq_stat *stat)
 {
        /*
@@ -307,7 +225,8 @@ enum {
 
 static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
-       struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+       struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+       struct rq_depth *rqd = &rwb->rq_depth;
        u64 thislat;
 
        /*
@@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
                return LAT_EXCEEDED;
        }
 
-       if (rwb->scale_step)
+       if (rqd->scale_step)
                trace_wbt_stat(bdi, stat);
 
        return LAT_OK;
@@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
-       struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
+       struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+       struct rq_depth *rqd = &rwb->rq_depth;
 
-       trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
-                       rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+       trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
+                       rwb->wb_background, rwb->wb_normal, rqd->max_depth);
 }
 
-static void scale_up(struct rq_wb *rwb)
+static void calc_wb_limits(struct rq_wb *rwb)
 {
-       /*
-        * Hit max in previous round, stop here
-        */
-       if (rwb->scaled_max)
-               return;
+       if (rwb->min_lat_nsec == 0) {
+               rwb->wb_normal = rwb->wb_background = 0;
+       } else if (rwb->rq_depth.max_depth <= 2) {
+               rwb->wb_normal = rwb->rq_depth.max_depth;
+               rwb->wb_background = 1;
+       } else {
+               rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
+               rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
+       }
+}
 
-       rwb->scale_step--;
+static void scale_up(struct rq_wb *rwb)
+{
+       rq_depth_scale_up(&rwb->rq_depth);
+       calc_wb_limits(rwb);
        rwb->unknown_cnt = 0;
-
-       rwb->scaled_max = calc_wb_limits(rwb);
-
-       rwb_wake_all(rwb);
-
-       rwb_trace_step(rwb, "step up");
+       rwb_trace_step(rwb, "scale up");
 }
 
-/*
- * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
- */
 static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 {
-       /*
-        * Stop scaling down when we've hit the limit. This also prevents
-        * ->scale_step from going to crazy values, if the device can't
-        * keep up.
-        */
-       if (rwb->wb_max == 1)
-               return;
-
-       if (rwb->scale_step < 0 && hard_throttle)
-               rwb->scale_step = 0;
-       else
-               rwb->scale_step++;
-
-       rwb->scaled_max = false;
-       rwb->unknown_cnt = 0;
+       rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
        calc_wb_limits(rwb);
-       rwb_trace_step(rwb, "step down");
+       rwb->unknown_cnt = 0;
+       rwb_wake_all(rwb);
+       rwb_trace_step(rwb, "scale down");
 }
 
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
-       if (rwb->scale_step > 0) {
+       struct rq_depth *rqd = &rwb->rq_depth;
+
+       if (rqd->scale_step > 0) {
                /*
                 * We should speed this up, using some variant of a fast
                 * integer inverse square root calculation. Since we only do
@@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
                 * though.
                 */
                rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
-                                       int_sqrt((rwb->scale_step + 1) << 8));
+                                       int_sqrt((rqd->scale_step + 1) << 8));
        } else {
                /*
                 * For step < 0, we don't want to increase/decrease the
@@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 static void wb_timer_fn(struct blk_stat_callback *cb)
 {
        struct rq_wb *rwb = cb->data;
+       struct rq_depth *rqd = &rwb->rq_depth;
        unsigned int inflight = wbt_inflight(rwb);
        int status;
 
        status = latency_exceeded(rwb, cb->stat);
 
-       trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
+       trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
                        inflight);
 
        /*
@@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
                 * currently don't have a valid read/write sample. For that
                 * case, slowly return to center state (step == 0).
                 */
-               if (rwb->scale_step > 0)
+               if (rqd->scale_step > 0)
                        scale_up(rwb);
-               else if (rwb->scale_step < 0)
+               else if (rqd->scale_step < 0)
                        scale_down(rwb, false);
                break;
        default:
@@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
        /*
         * Re-arm timer, if we have IO in flight
         */
-       if (rwb->scale_step || inflight)
+       if (rqd->scale_step || inflight)
                rwb_arm_timer(rwb);
 }
 
-void wbt_update_limits(struct rq_wb *rwb)
+static void __wbt_update_limits(struct rq_wb *rwb)
 {
-       rwb->scale_step = 0;
-       rwb->scaled_max = false;
+       struct rq_depth *rqd = &rwb->rq_depth;
+
+       rqd->scale_step = 0;
+       rqd->scaled_max = false;
+
+       rq_depth_calc_max_depth(rqd);
        calc_wb_limits(rwb);
 
        rwb_wake_all(rwb);
 }
 
+void wbt_update_limits(struct request_queue *q)
+{
+       struct rq_qos *rqos = wbt_rq_qos(q);
+       if (!rqos)
+               return;
+       __wbt_update_limits(RQWB(rqos));
+}
+
+u64 wbt_get_min_lat(struct request_queue *q)
+{
+       struct rq_qos *rqos = wbt_rq_qos(q);
+       if (!rqos)
+               return 0;
+       return RQWB(rqos)->min_lat_nsec;
+}
+
+void wbt_set_min_lat(struct request_queue *q, u64 val)
+{
+       struct rq_qos *rqos = wbt_rq_qos(q);
+       if (!rqos)
+               return;
+       RQWB(rqos)->min_lat_nsec = val;
+       RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+       __wbt_update_limits(RQWB(rqos));
+}
+
 static bool close_io(struct rq_wb *rwb)
 {
        const unsigned long now = jiffies;
@@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
         * IO for a bit.
         */
        if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
-               limit = rwb->wb_max;
+               limit = rwb->rq_depth.max_depth;
        else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
                /*
                 * If less than 100ms since we completed unrelated IO,
@@ -554,7 +495,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
            rqw->wait.head.next != &wait->entry)
                return false;
 
-       return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+       return rq_wait_inc_below(rqw, get_limit(rwb, rw));
 }
 
 /*
@@ -608,43 +549,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
        }
 }
 
+static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
+{
+       enum wbt_flags flags = 0;
+
+       if (bio_op(bio) == REQ_OP_READ) {
+               flags = WBT_READ;
+       } else if (wbt_should_throttle(rwb, bio)) {
+               if (current_is_kswapd())
+                       flags |= WBT_KSWAPD;
+               if (bio_op(bio) == REQ_OP_DISCARD)
+                       flags |= WBT_DISCARD;
+               flags |= WBT_TRACKED;
+       }
+       return flags;
+}
+
+static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
+{
+       struct rq_wb *rwb = RQWB(rqos);
+       enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
+       __wbt_done(rqos, flags);
+}
+
 /*
  * Throttle the bio if needed. May sleep if we have exceeded the writeback
  * limits. Caller can pass in an irq held spinlock, if it holds one when
  * calling this function. If we do sleep, we'll release and re-grab it.
  */
-enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
 {
-       enum wbt_flags ret = 0;
+       struct rq_wb *rwb = RQWB(rqos);
+       enum wbt_flags flags;
 
        if (!rwb_enabled(rwb))
-               return 0;
+               return;
 
-       if (bio_op(bio) == REQ_OP_READ)
-               ret = WBT_READ;
+       flags = bio_to_wbt_flags(rwb, bio);
 
        if (!wbt_should_throttle(rwb, bio)) {
-               if (ret & WBT_READ)
+               if (flags & WBT_READ)
                        wb_timestamp(rwb, &rwb->last_issue);
-               return ret;
+               return;
        }
 
        if (current_is_kswapd())
-               ret |= WBT_KSWAPD;
+               flags |= WBT_KSWAPD;
        if (bio_op(bio) == REQ_OP_DISCARD)
-               ret |= WBT_DISCARD;
+               flags |= WBT_DISCARD;
 
-       __wbt_wait(rwb, ret, bio->bi_opf, lock);
+       __wbt_wait(rwb, flags, bio->bi_opf, lock);
 
        if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
+}
 
-       return ret | WBT_TRACKED;
+static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
+{
+       struct rq_wb *rwb = RQWB(rqos);
+       rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
 }
 
-void wbt_issue(struct rq_wb *rwb, struct request *rq)
+void wbt_issue(struct rq_qos *rqos, struct request *rq)
 {
+       struct rq_wb *rwb = RQWB(rqos);
+
        if (!rwb_enabled(rwb))
                return;
 
@@ -661,8 +631,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq)
        }
 }
 
-void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 {
+       struct rq_wb *rwb = RQWB(rqos);
        if (!rwb_enabled(rwb))
                return;
        if (rq == rwb->sync_cookie) {
@@ -671,39 +642,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq)
        }
 }
 
-void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
-       if (rwb) {
-               rwb->queue_depth = depth;
-               wbt_update_limits(rwb);
+       struct rq_qos *rqos = wbt_rq_qos(q);
+       if (rqos) {
+               RQWB(rqos)->rq_depth.queue_depth = depth;
+               __wbt_update_limits(RQWB(rqos));
        }
 }
 
-void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
-{
-       if (rwb)
-               rwb->wc = write_cache_on;
-}
-
-/*
- * Disable wbt, if enabled by default.
- */
-void wbt_disable_default(struct request_queue *q)
+void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
 {
-       struct rq_wb *rwb = q->rq_wb;
-
-       if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
-               wbt_exit(q);
+       struct rq_qos *rqos = wbt_rq_qos(q);
+       if (rqos)
+               RQWB(rqos)->wc = write_cache_on;
 }
-EXPORT_SYMBOL_GPL(wbt_disable_default);
 
 /*
  * Enable wbt if defaults are configured that way
  */
 void wbt_enable_default(struct request_queue *q)
 {
+       struct rq_qos *rqos = wbt_rq_qos(q);
        /* Throttling already enabled? */
-       if (q->rq_wb)
+       if (rqos)
                return;
 
        /* Queue not registered? Maybe shutting down... */
@@ -741,6 +703,42 @@ static int wbt_data_dir(const struct request *rq)
        return -1;
 }
 
+static void wbt_exit(struct rq_qos *rqos)
+{
+       struct rq_wb *rwb = RQWB(rqos);
+       struct request_queue *q = rqos->q;
+
+       blk_stat_remove_callback(q, rwb->cb);
+       blk_stat_free_callback(rwb->cb);
+       kfree(rwb);
+}
+
+/*
+ * Disable wbt if it was enabled by default.
+ */
+void wbt_disable_default(struct request_queue *q)
+{
+       struct rq_qos *rqos = wbt_rq_qos(q);
+       struct rq_wb *rwb;
+       if (!rqos)
+               return;
+       rwb = RQWB(rqos);
+       if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+               rwb->wb_normal = 0;
+}
+EXPORT_SYMBOL_GPL(wbt_disable_default);
+
+static struct rq_qos_ops wbt_rqos_ops = {
+       .throttle = wbt_wait,
+       .issue = wbt_issue,
+       .track = wbt_track,
+       .requeue = wbt_requeue,
+       .done = wbt_done,
+       .cleanup = wbt_cleanup,
+       .exit = wbt_exit,
+};
+
 int wbt_init(struct request_queue *q)
 {
        struct rq_wb *rwb;
@@ -756,39 +754,29 @@ int wbt_init(struct request_queue *q)
                return -ENOMEM;
        }
 
-       for (i = 0; i < WBT_NUM_RWQ; i++) {
-               atomic_set(&rwb->rq_wait[i].inflight, 0);
-               init_waitqueue_head(&rwb->rq_wait[i].wait);
-       }
+       for (i = 0; i < WBT_NUM_RWQ; i++)
+               rq_wait_init(&rwb->rq_wait[i]);
 
+       rwb->rqos.id = RQ_QOS_WBT;
+       rwb->rqos.ops = &wbt_rqos_ops;
+       rwb->rqos.q = q;
        rwb->last_comp = rwb->last_issue = jiffies;
-       rwb->queue = q;
        rwb->win_nsec = RWB_WINDOW_NSEC;
        rwb->enable_state = WBT_STATE_ON_DEFAULT;
-       wbt_update_limits(rwb);
+       rwb->wc = 1;
+       rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
+       __wbt_update_limits(rwb);
 
        /*
         * Assign rwb and add the stats callback.
         */
-       q->rq_wb = rwb;
+       rq_qos_add(q, &rwb->rqos);
        blk_stat_add_callback(q, rwb->cb);
 
        rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
-       wbt_set_queue_depth(rwb, blk_queue_depth(q));
-       wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+       wbt_set_queue_depth(q, blk_queue_depth(q));
+       wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 
        return 0;
 }
-
-void wbt_exit(struct request_queue *q)
-{
-       struct rq_wb *rwb = q->rq_wb;
-
-       if (rwb) {
-               blk_stat_remove_callback(q, rwb->cb);
-               blk_stat_free_callback(rwb->cb);
-               q->rq_wb = NULL;
-               kfree(rwb);
-       }
-}
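After the depth calculation moves into rq_depth, calc_wb_limits() above only derives the two wbt buckets from rq_depth.max_depth; with illustrative numbers:

/*
 * max_depth = 16:  wb_normal = (16 + 1) / 2 = 8, wb_background = (16 + 3) / 4 = 4
 * max_depth =  2:  wb_normal = 2,                wb_background = 1
 * min_lat_nsec = 0 (throttling off):  wb_normal = wb_background = 0
 */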
index 300df53..f47218d 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/ktime.h>
 
 #include "blk-stat.h"
+#include "blk-rq-qos.h"
 
 enum wbt_flags {
        WBT_TRACKED             = 1,    /* write, tracked for throttling */
@@ -35,20 +36,12 @@ enum {
        WBT_STATE_ON_MANUAL     = 2,
 };
 
-struct rq_wait {
-       wait_queue_head_t wait;
-       atomic_t inflight;
-};
-
 struct rq_wb {
        /*
         * Settings that govern how we throttle
         */
        unsigned int wb_background;             /* background writeback */
        unsigned int wb_normal;                 /* normal writeback */
-       unsigned int wb_max;                    /* max throughput writeback */
-       int scale_step;
-       bool scaled_max;
 
        short enable_state;                     /* WBT_STATE_* */
 
@@ -67,15 +60,20 @@ struct rq_wb {
        void *sync_cookie;
 
        unsigned int wc;
-       unsigned int queue_depth;
 
        unsigned long last_issue;               /* last non-throttled issue */
        unsigned long last_comp;                /* last non-throttled comp */
        unsigned long min_lat_nsec;
-       struct request_queue *queue;
+       struct rq_qos rqos;
        struct rq_wait rq_wait[WBT_NUM_RWQ];
+       struct rq_depth rq_depth;
 };
 
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+       return container_of(rqos, struct rq_wb, rqos);
+}
+
 static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 {
        unsigned int i, ret = 0;
@@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
        return ret;
 }
 
-#ifdef CONFIG_BLK_WBT
 
-static inline void wbt_track(struct request *rq, enum wbt_flags flags)
-{
-       rq->wbt_flags |= flags;
-}
+#ifdef CONFIG_BLK_WBT
 
-void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct request *);
-enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
 int wbt_init(struct request_queue *);
-void wbt_exit(struct request_queue *);
-void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct request *);
-void wbt_issue(struct rq_wb *, struct request *);
+void wbt_update_limits(struct request_queue *);
 void wbt_disable_default(struct request_queue *);
 void wbt_enable_default(struct request_queue *);
 
-void wbt_set_queue_depth(struct rq_wb *, unsigned int);
-void wbt_set_write_cache(struct rq_wb *, bool);
+u64 wbt_get_min_lat(struct request_queue *q);
+void wbt_set_min_lat(struct request_queue *q, u64 val);
+
+void wbt_set_queue_depth(struct request_queue *, unsigned int);
+void wbt_set_write_cache(struct request_queue *, bool);
 
 u64 wbt_default_latency_nsec(struct request_queue *);
 
@@ -114,43 +105,30 @@ u64 wbt_default_latency_nsec(struct request_queue *);
 static inline void wbt_track(struct request *rq, enum wbt_flags flags)
 {
 }
-static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
-{
-}
-static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
-{
-}
-static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
-                                     spinlock_t *lock)
-{
-       return 0;
-}
 static inline int wbt_init(struct request_queue *q)
 {
        return -EINVAL;
 }
-static inline void wbt_exit(struct request_queue *q)
-{
-}
-static inline void wbt_update_limits(struct rq_wb *rwb)
+static inline void wbt_update_limits(struct request_queue *q)
 {
 }
-static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_disable_default(struct request_queue *q)
 {
 }
-static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
+static inline void wbt_enable_default(struct request_queue *q)
 {
 }
-static inline void wbt_disable_default(struct request_queue *q)
+static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
 }
-static inline void wbt_enable_default(struct request_queue *q)
+static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
 {
 }
-static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+static inline u64 wbt_get_min_lat(struct request_queue *q)
 {
+       return 0;
 }
-static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
 {
 }
 static inline u64 wbt_default_latency_nsec(struct request_queue *q)
index 5100091..c461cf6 100644 (file)
@@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev,
                /* Get header in the first page */
                ofst = 0;
                if (!nr_rep) {
-                       hdr = (struct blk_zone_report_hdr *) addr;
+                       hdr = addr;
                        nr_rep = hdr->nr_zones;
                        ofst = sizeof(struct blk_zone_report_hdr);
                }
index 8d23aea..69b14cd 100644 (file)
@@ -412,4 +412,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 
 extern void blk_drain_queue(struct request_queue *q);
 
+#ifdef CONFIG_BLK_CGROUP_IOLATENCY
+extern int blk_iolatency_init(struct request_queue *q);
+#else
+static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
+#endif
+
 #endif /* BLK_INTERNAL_H */
index fd31347..bc63b3a 100644 (file)
@@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio)
        __bounce_end_io_read(bio, &isa_page_pool);
 }
 
+static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
+               struct bio_set *bs)
+{
+       struct bvec_iter iter;
+       struct bio_vec bv;
+       struct bio *bio;
+
+       /*
+        * Before immutable biovecs, __bio_clone() used to just do a memcpy from
+        * bio_src->bi_io_vec to bio->bi_io_vec.
+        *
+        * We can't do that anymore, because:
+        *
+        *  - The point of cloning the biovec is to produce a bio with a biovec
+        *    the caller can modify: bi_idx and bi_bvec_done should be 0.
+        *
+        *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
+        *    we tried to clone the whole thing bio_alloc_bioset() would fail.
+        *    But the clone should succeed as long as the number of biovecs we
+        *    actually need to allocate is fewer than BIO_MAX_PAGES.
+        *
+        *  - Lastly, bi_vcnt should not be looked at or relied upon by code
+        *    that does not own the bio - reason being drivers don't use it for
+        *    iterating over the biovec anymore, so expecting it to be kept up
+        *    to date (i.e. for clones that share the parent biovec) is just
+        *    asking for trouble and would force extra work on
+        *    __bio_clone_fast() anyway.
+        */
+
+       bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
+       if (!bio)
+               return NULL;
+       bio->bi_disk            = bio_src->bi_disk;
+       bio->bi_opf             = bio_src->bi_opf;
+       bio->bi_write_hint      = bio_src->bi_write_hint;
+       bio->bi_iter.bi_sector  = bio_src->bi_iter.bi_sector;
+       bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
+
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_SECURE_ERASE:
+       case REQ_OP_WRITE_ZEROES:
+               break;
+       case REQ_OP_WRITE_SAME:
+               bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+               break;
+       default:
+               bio_for_each_segment(bv, bio_src, iter)
+                       bio->bi_io_vec[bio->bi_vcnt++] = bv;
+               break;
+       }
+
+       if (bio_integrity(bio_src)) {
+               int ret;
+
+               ret = bio_integrity_clone(bio, bio_src, gfp_mask);
+               if (ret < 0) {
+                       bio_put(bio);
+                       return NULL;
+               }
+       }
+
+       bio_clone_blkcg_association(bio, bio_src);
+
+       return bio;
+}
+
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                               mempool_t *pool)
 {
@@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                generic_make_request(*bio_orig);
                *bio_orig = bio;
        }
-       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+       bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
                        &bounce_bio_set);
 
        bio_for_each_segment_all(to, bio, i) {
index 9419def..f3501cd 100644 (file)
@@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
 
        job->request_len = hdr->request_len;
        job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
-       if (IS_ERR(job->request))
-               return PTR_ERR(job->request);
-       return 0;
+
+       return PTR_ERR_OR_ZERO(job->request);
 }
 
 static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
index 3da540f..db588ad 100644 (file)
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/blkdev.h>
-#include <linux/poll.h>
 #include <linux/cdev.h>
 #include <linux/jiffies.h>
 #include <linux/percpu.h>
-#include <linux/uio.h>
 #include <linux/idr.h>
 #include <linux/bsg.h>
 #include <linux/slab.h>
 struct bsg_device {
        struct request_queue *queue;
        spinlock_t lock;
-       struct list_head busy_list;
-       struct list_head done_list;
        struct hlist_node dev_list;
        atomic_t ref_count;
-       int queued_cmds;
-       int done_cmds;
-       wait_queue_head_t wq_done;
-       wait_queue_head_t wq_free;
        char name[20];
        int max_queue;
-       unsigned long flags;
-};
-
-enum {
-       BSG_F_BLOCK             = 1,
 };
 
 #define BSG_DEFAULT_CMDS       64
@@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
 static struct class *bsg_class;
 static int bsg_major;
 
-static struct kmem_cache *bsg_cmd_cachep;
-
-/*
- * our internal command type
- */
-struct bsg_command {
-       struct bsg_device *bd;
-       struct list_head list;
-       struct request *rq;
-       struct bio *bio;
-       struct bio *bidi_bio;
-       int err;
-       struct sg_io_v4 hdr;
-};
-
-static void bsg_free_command(struct bsg_command *bc)
-{
-       struct bsg_device *bd = bc->bd;
-       unsigned long flags;
-
-       kmem_cache_free(bsg_cmd_cachep, bc);
-
-       spin_lock_irqsave(&bd->lock, flags);
-       bd->queued_cmds--;
-       spin_unlock_irqrestore(&bd->lock, flags);
-
-       wake_up(&bd->wq_free);
-}
-
-static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
-{
-       struct bsg_command *bc = ERR_PTR(-EINVAL);
-
-       spin_lock_irq(&bd->lock);
-
-       if (bd->queued_cmds >= bd->max_queue)
-               goto out;
-
-       bd->queued_cmds++;
-       spin_unlock_irq(&bd->lock);
-
-       bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL);
-       if (unlikely(!bc)) {
-               spin_lock_irq(&bd->lock);
-               bd->queued_cmds--;
-               bc = ERR_PTR(-ENOMEM);
-               goto out;
-       }
-
-       bc->bd = bd;
-       INIT_LIST_HEAD(&bc->list);
-       bsg_dbg(bd, "returning free cmd %p\n", bc);
-       return bc;
-out:
-       spin_unlock_irq(&bd->lock);
-       return bc;
-}
-
 static inline struct hlist_head *bsg_dev_idx_hash(int index)
 {
        return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
@@ -285,101 +214,6 @@ out:
        return ERR_PTR(ret);
 }
 
-/*
- * async completion call-back from the block layer, when scsi/ide/whatever
- * calls end_that_request_last() on a request
- */
-static void bsg_rq_end_io(struct request *rq, blk_status_t status)
-{
-       struct bsg_command *bc = rq->end_io_data;
-       struct bsg_device *bd = bc->bd;
-       unsigned long flags;
-
-       bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
-               rq, bc, bc->bio);
-
-       bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
-
-       spin_lock_irqsave(&bd->lock, flags);
-       list_move_tail(&bc->list, &bd->done_list);
-       bd->done_cmds++;
-       spin_unlock_irqrestore(&bd->lock, flags);
-
-       wake_up(&bd->wq_done);
-}
-
-/*
- * do final setup of a 'bc' and submit the matching 'rq' to the block
- * layer for io
- */
-static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
-                           struct bsg_command *bc, struct request *rq)
-{
-       int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL));
-
-       /*
-        * add bc command to busy queue and submit rq for io
-        */
-       bc->rq = rq;
-       bc->bio = rq->bio;
-       if (rq->next_rq)
-               bc->bidi_bio = rq->next_rq->bio;
-       bc->hdr.duration = jiffies;
-       spin_lock_irq(&bd->lock);
-       list_add_tail(&bc->list, &bd->busy_list);
-       spin_unlock_irq(&bd->lock);
-
-       bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
-
-       rq->end_io_data = bc;
-       blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
-}
-
-static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
-{
-       struct bsg_command *bc = NULL;
-
-       spin_lock_irq(&bd->lock);
-       if (bd->done_cmds) {
-               bc = list_first_entry(&bd->done_list, struct bsg_command, list);
-               list_del(&bc->list);
-               bd->done_cmds--;
-       }
-       spin_unlock_irq(&bd->lock);
-
-       return bc;
-}
-
-/*
- * Get a finished command from the done list
- */
-static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
-{
-       struct bsg_command *bc;
-       int ret;
-
-       do {
-               bc = bsg_next_done_cmd(bd);
-               if (bc)
-                       break;
-
-               if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
-                       bc = ERR_PTR(-EAGAIN);
-                       break;
-               }
-
-               ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
-               if (ret) {
-                       bc = ERR_PTR(-ERESTARTSYS);
-                       break;
-               }
-       } while (1);
-
-       bsg_dbg(bd, "returning done %p\n", bc);
-
-       return bc;
-}
-
 static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
                                    struct bio *bio, struct bio *bidi_bio)
 {
@@ -398,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
        return ret;
 }
 
-static bool bsg_complete(struct bsg_device *bd)
-{
-       bool ret = false;
-       bool spin;
-
-       do {
-               spin_lock_irq(&bd->lock);
-
-               BUG_ON(bd->done_cmds > bd->queued_cmds);
-
-               /*
-                * All commands consumed.
-                */
-               if (bd->done_cmds == bd->queued_cmds)
-                       ret = true;
-
-               spin = !test_bit(BSG_F_BLOCK, &bd->flags);
-
-               spin_unlock_irq(&bd->lock);
-       } while (!ret && spin);
-
-       return ret;
-}
-
-static int bsg_complete_all_commands(struct bsg_device *bd)
-{
-       struct bsg_command *bc;
-       int ret, tret;
-
-       bsg_dbg(bd, "entered\n");
-
-       /*
-        * wait for all commands to complete
-        */
-       io_wait_event(bd->wq_done, bsg_complete(bd));
-
-       /*
-        * discard done commands
-        */
-       ret = 0;
-       do {
-               spin_lock_irq(&bd->lock);
-               if (!bd->queued_cmds) {
-                       spin_unlock_irq(&bd->lock);
-                       break;
-               }
-               spin_unlock_irq(&bd->lock);
-
-               bc = bsg_get_done_cmd(bd);
-               if (IS_ERR(bc))
-                       break;
-
-               tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
-                                               bc->bidi_bio);
-               if (!ret)
-                       ret = tret;
-
-               bsg_free_command(bc);
-       } while (1);
-
-       return ret;
-}
-
-static int
-__bsg_read(char __user *buf, size_t count, struct bsg_device *bd,
-          const struct iovec *iov, ssize_t *bytes_read)
-{
-       struct bsg_command *bc;
-       int nr_commands, ret;
-
-       if (count % sizeof(struct sg_io_v4))
-               return -EINVAL;
-
-       ret = 0;
-       nr_commands = count / sizeof(struct sg_io_v4);
-       while (nr_commands) {
-               bc = bsg_get_done_cmd(bd);
-               if (IS_ERR(bc)) {
-                       ret = PTR_ERR(bc);
-                       break;
-               }
-
-               /*
-                * this is the only case where we need to copy data back
-                * after completing the request. so do that here,
-                * bsg_complete_work() cannot do that for us
-                */
-               ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
-                                              bc->bidi_bio);
-
-               if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
-                       ret = -EFAULT;
-
-               bsg_free_command(bc);
-
-               if (ret)
-                       break;
-
-               buf += sizeof(struct sg_io_v4);
-               *bytes_read += sizeof(struct sg_io_v4);
-               nr_commands--;
-       }
-
-       return ret;
-}
-
-static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
-{
-       if (file->f_flags & O_NONBLOCK)
-               clear_bit(BSG_F_BLOCK, &bd->flags);
-       else
-               set_bit(BSG_F_BLOCK, &bd->flags);
-}
-
-/*
- * Check if the error is a "real" error that we should return.
- */
-static inline int err_block_err(int ret)
-{
-       if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
-               return 1;
-
-       return 0;
-}
-
-static ssize_t
-bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
-       struct bsg_device *bd = file->private_data;
-       int ret;
-       ssize_t bytes_read;
-
-       bsg_dbg(bd, "read %zd bytes\n", count);
-
-       bsg_set_block(bd, file);
-
-       bytes_read = 0;
-       ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
-       *ppos = bytes_read;
-
-       if (!bytes_read || err_block_err(ret))
-               bytes_read = ret;
-
-       return bytes_read;
-}
-
-static int __bsg_write(struct bsg_device *bd, const char __user *buf,
-                      size_t count, ssize_t *bytes_written, fmode_t mode)
-{
-       struct bsg_command *bc;
-       struct request *rq;
-       int ret, nr_commands;
-
-       if (count % sizeof(struct sg_io_v4))
-               return -EINVAL;
-
-       nr_commands = count / sizeof(struct sg_io_v4);
-       rq = NULL;
-       bc = NULL;
-       ret = 0;
-       while (nr_commands) {
-               struct request_queue *q = bd->queue;
-
-               bc = bsg_alloc_command(bd);
-               if (IS_ERR(bc)) {
-                       ret = PTR_ERR(bc);
-                       bc = NULL;
-                       break;
-               }
-
-               if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
-                       ret = -EFAULT;
-                       break;
-               }
-
-               /*
-                * get a request, fill in the blanks, and add to request queue
-                */
-               rq = bsg_map_hdr(bd->queue, &bc->hdr, mode);
-               if (IS_ERR(rq)) {
-                       ret = PTR_ERR(rq);
-                       rq = NULL;
-                       break;
-               }
-
-               bsg_add_command(bd, q, bc, rq);
-               bc = NULL;
-               rq = NULL;
-               nr_commands--;
-               buf += sizeof(struct sg_io_v4);
-               *bytes_written += sizeof(struct sg_io_v4);
-       }
-
-       if (bc)
-               bsg_free_command(bc);
-
-       return ret;
-}
-
-static ssize_t
-bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
-{
-       struct bsg_device *bd = file->private_data;
-       ssize_t bytes_written;
-       int ret;
-
-       bsg_dbg(bd, "write %zd bytes\n", count);
-
-       if (unlikely(uaccess_kernel()))
-               return -EINVAL;
-
-       bsg_set_block(bd, file);
-
-       bytes_written = 0;
-       ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
-
-       *ppos = bytes_written;
-
-       /*
-        * return bytes written on non-fatal errors
-        */
-       if (!bytes_written || err_block_err(ret))
-               bytes_written = ret;
-
-       bsg_dbg(bd, "returning %zd\n", bytes_written);
-       return bytes_written;
-}
-
 static struct bsg_device *bsg_alloc_device(void)
 {
        struct bsg_device *bd;
@@ -635,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void)
                return NULL;
 
        spin_lock_init(&bd->lock);
-
        bd->max_queue = BSG_DEFAULT_CMDS;
-
-       INIT_LIST_HEAD(&bd->busy_list);
-       INIT_LIST_HEAD(&bd->done_list);
        INIT_HLIST_NODE(&bd->dev_list);
-
-       init_waitqueue_head(&bd->wq_free);
-       init_waitqueue_head(&bd->wq_done);
        return bd;
 }
 
 static int bsg_put_device(struct bsg_device *bd)
 {
-       int ret = 0, do_free;
        struct request_queue *q = bd->queue;
 
        mutex_lock(&bsg_mutex);
 
-       do_free = atomic_dec_and_test(&bd->ref_count);
-       if (!do_free) {
+       if (!atomic_dec_and_test(&bd->ref_count)) {
                mutex_unlock(&bsg_mutex);
-               goto out;
+               return 0;
        }
 
        hlist_del(&bd->dev_list);
@@ -668,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd)
        /*
         * close can always block
         */
-       set_bit(BSG_F_BLOCK, &bd->flags);
-
-       /*
-        * correct error detection baddies here again. it's the responsibility
-        * of the app to properly reap commands before close() if it wants
-        * fool-proof error detection
-        */
-       ret = bsg_complete_all_commands(bd);
-
        kfree(bd);
-out:
-       if (do_free)
-               blk_put_queue(q);
-       return ret;
+       blk_put_queue(q);
+       return 0;
 }
 
 static struct bsg_device *bsg_add_device(struct inode *inode,
@@ -704,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 
        bd->queue = rq;
 
-       bsg_set_block(bd, file);
-
        atomic_set(&bd->ref_count, 1);
        hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
 
@@ -779,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file)
        return bsg_put_device(bd);
 }
 
-static __poll_t bsg_poll(struct file *file, poll_table *wait)
-{
-       struct bsg_device *bd = file->private_data;
-       __poll_t mask = 0;
-
-       poll_wait(file, &bd->wq_done, wait);
-       poll_wait(file, &bd->wq_free, wait);
-
-       spin_lock_irq(&bd->lock);
-       if (!list_empty(&bd->done_list))
-               mask |= EPOLLIN | EPOLLRDNORM;
-       if (bd->queued_cmds < bd->max_queue)
-               mask |= EPOLLOUT;
-       spin_unlock_irq(&bd->lock);
-
-       return mask;
-}
-
 static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct bsg_device *bd = file->private_data;
@@ -870,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 }
 
 static const struct file_operations bsg_fops = {
-       .read           =       bsg_read,
-       .write          =       bsg_write,
-       .poll           =       bsg_poll,
        .open           =       bsg_open,
        .release        =       bsg_release,
        .unlocked_ioctl =       bsg_ioctl,
@@ -977,21 +540,12 @@ static int __init bsg_init(void)
        int ret, i;
        dev_t devid;
 
-       bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
-                               sizeof(struct bsg_command), 0, 0, NULL);
-       if (!bsg_cmd_cachep) {
-               printk(KERN_ERR "bsg: failed creating slab cache\n");
-               return -ENOMEM;
-       }
-
        for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
                INIT_HLIST_HEAD(&bsg_device_list[i]);
 
        bsg_class = class_create(THIS_MODULE, "bsg");
-       if (IS_ERR(bsg_class)) {
-               ret = PTR_ERR(bsg_class);
-               goto destroy_kmemcache;
-       }
+       if (IS_ERR(bsg_class))
+               return PTR_ERR(bsg_class);
        bsg_class->devnode = bsg_devnode;
 
        ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
@@ -1012,8 +566,6 @@ unregister_chrdev:
        unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
 destroy_bsg_class:
        class_destroy(bsg_class);
-destroy_kmemcache:
-       kmem_cache_destroy(bsg_cmd_cachep);
        return ret;
 }
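
With bsg_read(), bsg_write() and bsg_poll() removed, commands reach a bsg node only through the SG_IO ioctl carrying a struct sg_io_v4. A hedged userspace sketch of that path follows; the helper name, device path handling and the TEST UNIT READY CDB are illustrative, only the sg_io_v4/SG_IO usage reflects the UAPI:

	#include <fcntl.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <scsi/sg.h>		/* SG_IO */
	#include <linux/bsg.h>		/* struct sg_io_v4, BSG_PROTOCOL_SCSI */

	/* Hypothetical helper: issue TEST UNIT READY through a /dev/bsg node. */
	int bsg_test_unit_ready(const char *path)
	{
		unsigned char cdb[6] = { 0 };	/* TEST UNIT READY */
		unsigned char sense[32];
		struct sg_io_v4 hdr;
		int fd, ret;

		fd = open(path, O_RDWR);
		if (fd < 0)
			return -1;

		memset(&hdr, 0, sizeof(hdr));
		hdr.guard = 'Q';
		hdr.protocol = BSG_PROTOCOL_SCSI;
		hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD;
		hdr.request = (uintptr_t)cdb;
		hdr.request_len = sizeof(cdb);
		hdr.response = (uintptr_t)sense;
		hdr.max_response_len = sizeof(sense);
		hdr.timeout = 5000;		/* milliseconds */

		ret = ioctl(fd, SG_IO, &hdr);
		close(fd);
		return ret;
	}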
 
index f1543a4..8cc719a 100644 (file)
@@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v)
                part_round_stats(gp->queue, cpu, hd);
                part_stat_unlock();
                part_in_flight(gp->queue, hd, inflight);
-               seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
-                          "%u %lu %lu %lu %u %u %u %u\n",
+               seq_printf(seqf, "%4d %7d %s "
+                          "%lu %lu %lu %u "
+                          "%lu %lu %lu %u "
+                          "%u %u %u "
+                          "%lu %lu %lu %u\n",
                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
                           disk_name(gp, hd->partno, buf),
-                          part_stat_read(hd, ios[READ]),
-                          part_stat_read(hd, merges[READ]),
-                          part_stat_read(hd, sectors[READ]),
-                          jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
-                          part_stat_read(hd, ios[WRITE]),
-                          part_stat_read(hd, merges[WRITE]),
-                          part_stat_read(hd, sectors[WRITE]),
-                          jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
+                          part_stat_read(hd, ios[STAT_READ]),
+                          part_stat_read(hd, merges[STAT_READ]),
+                          part_stat_read(hd, sectors[STAT_READ]),
+                          jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
+                          part_stat_read(hd, ios[STAT_WRITE]),
+                          part_stat_read(hd, merges[STAT_WRITE]),
+                          part_stat_read(hd, sectors[STAT_WRITE]),
+                          jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
                           inflight[0],
                           jiffies_to_msecs(part_stat_read(hd, io_ticks)),
-                          jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+                          jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
+                          part_stat_read(hd, ios[STAT_DISCARD]),
+                          part_stat_read(hd, merges[STAT_DISCARD]),
+                          part_stat_read(hd, sectors[STAT_DISCARD]),
+                          jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
                        );
        }
        disk_part_iter_exit(&piter);
index 3dcfd4e..5a8975a 100644 (file)
@@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev,
        return sprintf(buf,
                "%8lu %8lu %8llu %8u "
                "%8lu %8lu %8llu %8u "
-               "%8u %8u %8u"
+               "%8u %8u %8u "
+               "%8lu %8lu %8llu %8u"
                "\n",
-               part_stat_read(p, ios[READ]),
-               part_stat_read(p, merges[READ]),
-               (unsigned long long)part_stat_read(p, sectors[READ]),
-               jiffies_to_msecs(part_stat_read(p, ticks[READ])),
-               part_stat_read(p, ios[WRITE]),
-               part_stat_read(p, merges[WRITE]),
-               (unsigned long long)part_stat_read(p, sectors[WRITE]),
-               jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
+               part_stat_read(p, ios[STAT_READ]),
+               part_stat_read(p, merges[STAT_READ]),
+               (unsigned long long)part_stat_read(p, sectors[STAT_READ]),
+               jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
+               part_stat_read(p, ios[STAT_WRITE]),
+               part_stat_read(p, merges[STAT_WRITE]),
+               (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
+               jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
                inflight[0],
                jiffies_to_msecs(part_stat_read(p, io_ticks)),
-               jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+               jiffies_to_msecs(part_stat_read(p, time_in_queue)),
+               part_stat_read(p, ios[STAT_DISCARD]),
+               part_stat_read(p, merges[STAT_DISCARD]),
+               (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
+               jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
 }
 
 ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
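
The genhd.c and partition-generic.c hunks above append a discard group (ios, merges, sectors, ticks) after the original eleven fields. A small userspace sketch of parsing the extended per-device stat line; the helper name is an example, while the field order follows the format strings shown above:

	#include <stdio.h>

	/* Example: read the 15-field stat line (11 classic fields + 4 discard fields). */
	int read_disk_stat(const char *path)
	{
		unsigned long rd_ios, rd_merges, wr_ios, wr_merges, ds_ios, ds_merges;
		unsigned long long rd_sectors, wr_sectors, ds_sectors;
		unsigned int rd_ticks, wr_ticks, in_flight, io_ticks, time_in_queue, ds_ticks;
		FILE *f = fopen(path, "r");
		int n;

		if (!f)
			return -1;
		n = fscanf(f, "%lu %lu %llu %u %lu %lu %llu %u %u %u %u %lu %lu %llu %u",
			   &rd_ios, &rd_merges, &rd_sectors, &rd_ticks,
			   &wr_ios, &wr_merges, &wr_sectors, &wr_ticks,
			   &in_flight, &io_ticks, &time_in_queue,
			   &ds_ios, &ds_merges, &ds_sectors, &ds_ticks);
		fclose(f);
		return n == 15 ? 0 : -1;	/* kernels without this series report only 11 */
	}

For /proc/diskstats the same four discard fields appear after the existing columns, following the major/minor/name prefix.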
index 007f95e..903f3ed 100644 (file)
@@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state)
        u32 vgda_sector = 0;
        u32 vgda_len = 0;
        int numlvs = 0;
-       struct pvd *pvd;
+       struct pvd *pvd = NULL;
        struct lv_info {
                unsigned short pps_per_lv;
                unsigned short pps_found;
@@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state)
                                if (lvip[i].pps_per_lv)
                                        foundlvs += 1;
                        }
+                       /* pvd loops depend on n[].name and lvip[].pps_per_lv */
+                       pvd = alloc_pvd(state, vgda_sector + 17);
                }
                put_dev_sector(sect);
        }
-       pvd = alloc_pvd(state, vgda_sector + 17);
        if (pvd) {
                int numpps = be16_to_cpu(pvd->pp_count);
                int psn_part1 = be32_to_cpu(pvd->psn_part1);
@@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state)
                                next_lp_ix += 1;
                }
                for (i = 0; i < state->limit; i += 1)
-                       if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
+                       if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) {
+                               char tmp[sizeof(n[i].name) + 1]; // null char
+
+                               snprintf(tmp, sizeof(tmp), "%s", n[i].name);
                                pr_warn("partition %s (%u pp's found) is "
                                        "not contiguous\n",
-                                       n[i].name, lvip[i].pps_found);
+                                       tmp, lvip[i].pps_found);
+                       }
                kfree(pvd);
        }
        kfree(n);
index 0417937..16766f2 100644 (file)
@@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
 {
        char buf[64];
        int r_objid, r_name, r_id1, r_id2, len;
-       struct vblk_dgrp *dgrp;
 
        BUG_ON (!buffer || !vb);
 
@@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
        if (len != get_unaligned_be32(buffer + 0x14))
                return false;
 
-       dgrp = &vb->vblk.dgrp;
-
        ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
        return true;
 }
index a98db38..62aed77 100644 (file)
@@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
        .verify_fn              = t10_pi_type3_verify_ip,
 };
 EXPORT_SYMBOL(t10_pi_type3_ip);
+
+/**
+ * t10_pi_prepare - prepare PI prior to submitting a request to the device
+ * @rq:              request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc., the actual physical start sector is
+ * likely to be different. Remap protection information to match the
+ * physical LBA.
+ *
+ * Type 3 does not have a reference tag, so no remapping is required.
+ */
+void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+       const int tuple_sz = rq->q->integrity.tuple_size;
+       u32 ref_tag = t10_pi_ref_tag(rq);
+       struct bio *bio;
+
+       if (protection_type == T10_PI_TYPE3_PROTECTION)
+               return;
+
+       __rq_for_each_bio(bio, rq) {
+               struct bio_integrity_payload *bip = bio_integrity(bio);
+               u32 virt = bip_get_seed(bip) & 0xffffffff;
+               struct bio_vec iv;
+               struct bvec_iter iter;
+
+               /* Already remapped? */
+               if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+                       break;
+
+               bip_for_each_vec(iv, bip, iter) {
+                       void *p, *pmap;
+                       unsigned int j;
+
+                       pmap = kmap_atomic(iv.bv_page);
+                       p = pmap + iv.bv_offset;
+                       for (j = 0; j < iv.bv_len; j += tuple_sz) {
+                               struct t10_pi_tuple *pi = p;
+
+                               if (be32_to_cpu(pi->ref_tag) == virt)
+                                       pi->ref_tag = cpu_to_be32(ref_tag);
+                               virt++;
+                               ref_tag++;
+                               p += tuple_sz;
+                       }
+
+                       kunmap_atomic(pmap);
+               }
+
+               bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+       }
+}
+EXPORT_SYMBOL(t10_pi_prepare);
+
+/**
+ * t10_pi_complete - prepare PI prior to returning the request to the block layer
+ * @rq:              request with PI that should be prepared
+ * @protection_type: PI type (Type 1/Type 2/Type 3)
+ * @intervals:       total protection intervals to remap
+ *
+ * For Type 1/Type 2, the virtual start sector is the one that was
+ * originally submitted by the block layer for the ref_tag usage. Due to
+ * partitioning, MD/DM cloning, etc., the actual physical start sector is
+ * likely to be different. Since the physical start sector was submitted
+ * to the device, we should remap it back to virtual values expected by the
+ * block layer.
+ *
+ * Type 3 does not have a reference tag, so no remapping is required.
+ */
+void t10_pi_complete(struct request *rq, u8 protection_type,
+                    unsigned int intervals)
+{
+       const int tuple_sz = rq->q->integrity.tuple_size;
+       u32 ref_tag = t10_pi_ref_tag(rq);
+       struct bio *bio;
+
+       if (protection_type == T10_PI_TYPE3_PROTECTION)
+               return;
+
+       __rq_for_each_bio(bio, rq) {
+               struct bio_integrity_payload *bip = bio_integrity(bio);
+               u32 virt = bip_get_seed(bip) & 0xffffffff;
+               struct bio_vec iv;
+               struct bvec_iter iter;
+
+               bip_for_each_vec(iv, bip, iter) {
+                       void *p, *pmap;
+                       unsigned int j;
+
+                       pmap = kmap_atomic(iv.bv_page);
+                       p = pmap + iv.bv_offset;
+                       for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
+                               struct t10_pi_tuple *pi = p;
+
+                               if (be32_to_cpu(pi->ref_tag) == ref_tag)
+                                       pi->ref_tag = cpu_to_be32(virt);
+                               virt++;
+                               ref_tag++;
+                               intervals--;
+                               p += tuple_sz;
+                       }
+
+                       kunmap_atomic(pmap);
+               }
+       }
+}
+EXPORT_SYMBOL(t10_pi_complete);
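
A hedged sketch of how a disk driver would bracket a protected request with the two exported helpers; the function names, the good_bytes/interval_bytes parameters and the call placement are illustrative, not the actual sd.c wiring:

	/* Illustrative only: remap reference tags around the hardware round trip. */
	static void example_issue_protected_rq(struct request *rq, u8 prot_type)
	{
		if (blk_integrity_rq(rq))
			t10_pi_prepare(rq, prot_type);	/* virtual -> physical ref tags */

		/* ... build and queue the command to the HBA ... */
	}

	static void example_complete_protected_rq(struct request *rq, u8 prot_type,
						  unsigned int good_bytes,
						  unsigned int interval_bytes)
	{
		if (blk_integrity_rq(rq))
			t10_pi_complete(rq, prot_type, good_bytes / interval_bytes);

		/* ... return the request to the block layer ... */
	}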
index 24cd470..a6abd7a 100644 (file)
@@ -76,7 +76,7 @@ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)            += nubus/
 obj-y                          += macintosh/
 obj-$(CONFIG_IDE)              += ide/
-obj-$(CONFIG_SCSI)             += scsi/
+obj-y                          += scsi/
 obj-y                          += nvme/
 obj-$(CONFIG_ATA)              += ata/
 obj-$(CONFIG_TARGET_CORE)      += target/
index aad1b01..8e27096 100644 (file)
@@ -597,8 +597,9 @@ static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev,
 int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
 {
        int rc = 0;
+       u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
        u8 scsi_cmd[MAX_COMMAND_SIZE];
-       u8 args[4], *argbuf = NULL, *sensebuf = NULL;
+       u8 args[4], *argbuf = NULL;
        int argsize = 0;
        enum dma_data_direction data_dir;
        struct scsi_sense_hdr sshdr;
@@ -610,10 +611,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
        if (copy_from_user(args, arg, sizeof(args)))
                return -EFAULT;
 
-       sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
-       if (!sensebuf)
-               return -ENOMEM;
-
+       memset(sensebuf, 0, sizeof(sensebuf));
        memset(scsi_cmd, 0, sizeof(scsi_cmd));
 
        if (args[3]) {
@@ -685,7 +683,6 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
         && copy_to_user(arg + sizeof(args), argbuf, argsize))
                rc = -EFAULT;
 error:
-       kfree(sensebuf);
        kfree(argbuf);
        return rc;
 }
@@ -704,8 +701,9 @@ error:
 int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
 {
        int rc = 0;
+       u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
        u8 scsi_cmd[MAX_COMMAND_SIZE];
-       u8 args[7], *sensebuf = NULL;
+       u8 args[7];
        struct scsi_sense_hdr sshdr;
        int cmd_result;
 
@@ -715,10 +713,7 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
        if (copy_from_user(args, arg, sizeof(args)))
                return -EFAULT;
 
-       sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);
-       if (!sensebuf)
-               return -ENOMEM;
-
+       memset(sensebuf, 0, sizeof(sensebuf));
        memset(scsi_cmd, 0, sizeof(scsi_cmd));
        scsi_cmd[0]  = ATA_16;
        scsi_cmd[1]  = (3 << 1); /* Non-data */
@@ -769,7 +764,6 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
        }
 
  error:
-       kfree(sensebuf);
        return rc;
 }
 
index f651806..f99e5c8 100644 (file)
@@ -21,6 +21,7 @@
 #define DAC960_DriverDate                      "21 Aug 2007"
 
 
+#include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/miscdevice.h>
@@ -6426,7 +6427,7 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
   return true;
 }
 
-static int dac960_proc_show(struct seq_file *m, void *v)
+static int __maybe_unused dac960_proc_show(struct seq_file *m, void *v)
 {
   unsigned char *StatusMessage = "OK\n";
   int ControllerNumber;
@@ -6446,14 +6447,16 @@ static int dac960_proc_show(struct seq_file *m, void *v)
   return 0;
 }
 
-static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
+static int __maybe_unused dac960_initial_status_proc_show(struct seq_file *m,
+                                                         void *v)
 {
        DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
        seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer);
        return 0;
 }
 
-static int dac960_current_status_proc_show(struct seq_file *m, void *v)
+static int __maybe_unused dac960_current_status_proc_show(struct seq_file *m,
+                                                         void *v)
 {
   DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
   unsigned char *StatusMessage =
index ad9b687..d491351 100644 (file)
@@ -74,12 +74,12 @@ config AMIGA_Z2RAM
 
 config CDROM
        tristate
+       select BLK_SCSI_REQUEST
 
 config GDROM
        tristate "SEGA Dreamcast GD-ROM drive"
        depends on SH_DREAMCAST
        select CDROM
-       select BLK_SCSI_REQUEST # only for the generic cdrom code
        help
          A standard SEGA Dreamcast comes with a modified CD ROM drive called a
          "GD-ROM" by SEGA to signify it is capable of reading special disks
index dc06115..8566b18 100644 (file)
@@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
 obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
 
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
-obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
 
+obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
+null_blk-objs  := null_blk_main.o
+null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
+
 skd-y          := skd_main.o
 swim_mod-y     := swim.o swim_asm.o
index 096882e..136dc50 100644 (file)
@@ -1137,6 +1137,7 @@ noskb:            if (buf)
                        break;
                }
                bvcpy(skb, f->buf->bio, f->iter, n);
+               /* fall through */
        case ATA_CMD_PIO_WRITE:
        case ATA_CMD_PIO_WRITE_EXT:
                spin_lock_irq(&d->lock);
index bb97659..df8103d 100644 (file)
@@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
  * Process a single bvec of a bio.
  */
 static int brd_do_bvec(struct brd_device *brd, struct page *page,
-                       unsigned int len, unsigned int off, bool is_write,
+                       unsigned int len, unsigned int off, unsigned int op,
                        sector_t sector)
 {
        void *mem;
        int err = 0;
 
-       if (is_write) {
+       if (op_is_write(op)) {
                err = copy_to_brd_setup(brd, sector, len);
                if (err)
                        goto out;
        }
 
        mem = kmap_atomic(page);
-       if (!is_write) {
+       if (!op_is_write(op)) {
                copy_from_brd(mem + off, brd, sector, len);
                flush_dcache_page(page);
        } else {
@@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
                int err;
 
                err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
-                                       op_is_write(bio_op(bio)), sector);
+                                 bio_op(bio), sector);
                if (err)
                        goto io_error;
                sector += len >> SECTOR_SHIFT;
@@ -310,15 +310,15 @@ io_error:
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
-                      struct page *page, bool is_write)
+                      struct page *page, unsigned int op)
 {
        struct brd_device *brd = bdev->bd_disk->private_data;
        int err;
 
        if (PageTransHuge(page))
                return -ENOTSUPP;
-       err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
-       page_endio(page, is_write, err);
+       err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
+       page_endio(page, op_is_write(op), err);
        return err;
 }
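
brd_do_bvec() and brd_rw_page() now take the raw REQ_OP value and derive the direction with op_is_write(). For reference, that helper boils down to a single bit test (reproduced from memory of include/linux/blk_types.h, so treat it as an assumption):

	/* REQ_OP_* opcodes are numbered so that data-out (write-style) ops are odd. */
	static inline bool op_is_write(unsigned int op)
	{
		return (op & 1);
	}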
 
index bc4ed2e..e35a234 100644 (file)
 # define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
 # define __protected_read_by(x)  __attribute__((require_context(x,1,999,"read")))
 # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
-# define __must_hold(x)       __attribute__((context(x,1,1), require_context(x,1,999,"call")))
 #else
 # define __protected_by(x)
 # define __protected_read_by(x)
 # define __protected_write_by(x)
-# define __must_hold(x)
 #endif
 
 /* shared module parameters, defined in drbd_main.c */
index be9450f..75f6b47 100644 (file)
@@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
        if (c_min_rate == 0)
                return false;
 
-       curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
-                     (int)part_stat_read(&disk->part0, sectors[1]) -
+       curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
                        atomic_read(&device->rs_sect_ev);
 
        if (atomic_read(&device->ap_actlog_cnt)
@@ -2790,6 +2789,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
                   then we would do something smarter here than reading
                   the block... */
                peer_req->flags |= EE_RS_THIN_REQ;
+               /* fall through */
        case P_RS_DATA_REQUEST:
                peer_req->w.cb = w_e_end_rsdata_req;
                fault_type = DRBD_FAULT_RS_RD;
@@ -2968,6 +2968,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
                /* Else fall through to one of the other strategies... */
                drbd_warn(device, "Discard younger/older primary did not find a decision\n"
                     "Using discard-least-changes instead\n");
+               /* fall through */
        case ASB_DISCARD_ZERO_CHG:
                if (ch_peer == 0 && ch_self == 0) {
                        rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
@@ -2979,6 +2980,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
                }
                if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
                        break;
+               /* else: fall through */
        case ASB_DISCARD_LEAST_CHG:
                if      (ch_self < ch_peer)
                        rv = -1;
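
The drbd hunks above fold the open-coded sectors[0] + sectors[1] sum into part_stat_read_accum(). A sketch of the assumed accumulator, mirroring the STAT_READ/STAT_WRITE/STAT_DISCARD indices used in the genhd.c and partition-generic.c hunks:

	/* Assumed shape: sum one per-partition statistic over all accounted op groups. */
	#define part_stat_read_accum(part, field)				\
		(part_stat_read(part, field[STAT_READ]) +			\
		 part_stat_read(part, field[STAT_WRITE]) +			\
		 part_stat_read(part, field[STAT_DISCARD]))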
index d146fed..19cac36 100644 (file)
@@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
 {
        struct request_queue *q = device->rq_queue;
 
-       generic_start_io_acct(q, bio_data_dir(req->master_bio),
+       generic_start_io_acct(q, bio_op(req->master_bio),
                                req->i.size >> 9, &device->vdisk->part0);
 }
 
@@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
 {
        struct request_queue *q = device->rq_queue;
 
-       generic_end_io_acct(q, bio_data_dir(req->master_bio),
+       generic_end_io_acct(q, bio_op(req->master_bio),
                            &device->vdisk->part0, req->start_jif);
 }
 
index 5e793dd..b8f77e8 100644 (file)
@@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device)
        atomic_set(&device->rs_sect_in, 0);
        atomic_set(&device->rs_sect_ev, 0);
        device->rs_in_flight = 0;
-       device->rs_last_events =
-               (int)part_stat_read(&disk->part0, sectors[0]) +
-               (int)part_stat_read(&disk->part0, sectors[1]);
+       device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors);
 
        /* Updating the RCU protected object in place is necessary since
           this function gets called from atomic context.
index 8871b50..48f6227 100644 (file)
@@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void)
        int i;
        int r;
        int flags;
-       int dflags;
        unsigned long ready_date;
        void (*function)(void);
 
@@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void)
                if (fd_wait_for_completion(ready_date, function))
                        return;
        }
-       dflags = DRS->flags;
-
        if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE))
                setup_DMA();
 
index 4cb1d1b..ea9debf 100644 (file)
@@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
                          unsigned int arg)
 {
        struct file     *file, *old_file;
-       struct inode    *inode;
        int             error;
 
        error = -ENXIO;
@@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
        if (error)
                goto out_putf;
 
-       inode = file->f_mapping->host;
        old_file = lo->lo_backing_file;
 
        error = -EINVAL;
@@ -1611,6 +1609,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
        case LOOP_GET_STATUS64:
        case LOOP_SET_STATUS64:
                arg = (unsigned long) compat_ptr(arg);
+               /* fall through */
        case LOOP_SET_FD:
        case LOOP_CHANGE_FD:
        case LOOP_SET_BLOCK_SIZE:
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
deleted file mode 100644 (file)
index 042c778..0000000
+++ /dev/null
@@ -1,1955 +0,0 @@
-/*
- * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
- * Shaohua Li <shli@fb.com>
- */
-#include <linux/module.h>
-
-#include <linux/moduleparam.h>
-#include <linux/sched.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/hrtimer.h>
-#include <linux/configfs.h>
-#include <linux/badblocks.h>
-#include <linux/fault-inject.h>
-
-#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
-#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
-#define SECTOR_MASK            (PAGE_SECTORS - 1)
-
-#define FREE_BATCH             16
-
-#define TICKS_PER_SEC          50ULL
-#define TIMER_INTERVAL         (NSEC_PER_SEC / TICKS_PER_SEC)
-
-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-static DECLARE_FAULT_ATTR(null_timeout_attr);
-static DECLARE_FAULT_ATTR(null_requeue_attr);
-#endif
-
-static inline u64 mb_per_tick(int mbps)
-{
-       return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
-}
-
-struct nullb_cmd {
-       struct list_head list;
-       struct llist_node ll_list;
-       struct __call_single_data csd;
-       struct request *rq;
-       struct bio *bio;
-       unsigned int tag;
-       blk_status_t error;
-       struct nullb_queue *nq;
-       struct hrtimer timer;
-};
-
-struct nullb_queue {
-       unsigned long *tag_map;
-       wait_queue_head_t wait;
-       unsigned int queue_depth;
-       struct nullb_device *dev;
-       unsigned int requeue_selection;
-
-       struct nullb_cmd *cmds;
-};
-
-/*
- * Status flags for nullb_device.
- *
- * CONFIGURED: Device has been configured and turned on. Cannot reconfigure.
- * UP:         Device is currently on and visible in userspace.
- * THROTTLED:  Device is being throttled.
- * CACHE:      Device is using a write-back cache.
- */
-enum nullb_device_flags {
-       NULLB_DEV_FL_CONFIGURED = 0,
-       NULLB_DEV_FL_UP         = 1,
-       NULLB_DEV_FL_THROTTLED  = 2,
-       NULLB_DEV_FL_CACHE      = 3,
-};
-
-#define MAP_SZ         ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
-/*
- * nullb_page is a page in memory for nullb devices.
- *
- * @page:      The page holding the data.
- * @bitmap:    The bitmap represents which sector in the page has data.
- *             Each bit represents one block size. For example, sector 8
- *             will use the 7th bit
- * The highest 2 bits of the bitmap are for special purposes. LOCK means the cache
- * page is being flushed to storage. FREE means the cache page is freed and
- * should be skipped when flushing to storage. Please see
- * null_make_cache_space
- */
-struct nullb_page {
-       struct page *page;
-       DECLARE_BITMAP(bitmap, MAP_SZ);
-};
-#define NULLB_PAGE_LOCK (MAP_SZ - 1)
-#define NULLB_PAGE_FREE (MAP_SZ - 2)
-
-struct nullb_device {
-       struct nullb *nullb;
-       struct config_item item;
-       struct radix_tree_root data; /* data stored in the disk */
-       struct radix_tree_root cache; /* disk cache data */
-       unsigned long flags; /* device flags */
-       unsigned int curr_cache;
-       struct badblocks badblocks;
-
-       unsigned long size; /* device size in MB */
-       unsigned long completion_nsec; /* time in ns to complete a request */
-       unsigned long cache_size; /* disk cache size in MB */
-       unsigned int submit_queues; /* number of submission queues */
-       unsigned int home_node; /* home node for the device */
-       unsigned int queue_mode; /* block interface */
-       unsigned int blocksize; /* block size */
-       unsigned int irqmode; /* IRQ completion handler */
-       unsigned int hw_queue_depth; /* queue depth */
-       unsigned int index; /* index of the disk, only valid with a disk */
-       unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
-       bool blocking; /* blocking blk-mq device */
-       bool use_per_node_hctx; /* use per-node allocation for hardware context */
-       bool power; /* power on/off the device */
-       bool memory_backed; /* if data is stored in memory */
-       bool discard; /* if support discard */
-};
-
-struct nullb {
-       struct nullb_device *dev;
-       struct list_head list;
-       unsigned int index;
-       struct request_queue *q;
-       struct gendisk *disk;
-       struct blk_mq_tag_set *tag_set;
-       struct blk_mq_tag_set __tag_set;
-       unsigned int queue_depth;
-       atomic_long_t cur_bytes;
-       struct hrtimer bw_timer;
-       unsigned long cache_flush_pos;
-       spinlock_t lock;
-
-       struct nullb_queue *queues;
-       unsigned int nr_queues;
-       char disk_name[DISK_NAME_LEN];
-};
-
-static LIST_HEAD(nullb_list);
-static struct mutex lock;
-static int null_major;
-static DEFINE_IDA(nullb_indexes);
-static struct blk_mq_tag_set tag_set;
-
-enum {
-       NULL_IRQ_NONE           = 0,
-       NULL_IRQ_SOFTIRQ        = 1,
-       NULL_IRQ_TIMER          = 2,
-};
-
-enum {
-       NULL_Q_BIO              = 0,
-       NULL_Q_RQ               = 1,
-       NULL_Q_MQ               = 2,
-};
-
-static int g_no_sched;
-module_param_named(no_sched, g_no_sched, int, 0444);
-MODULE_PARM_DESC(no_sched, "No io scheduler");
-
-static int g_submit_queues = 1;
-module_param_named(submit_queues, g_submit_queues, int, 0444);
-MODULE_PARM_DESC(submit_queues, "Number of submission queues");
-
-static int g_home_node = NUMA_NO_NODE;
-module_param_named(home_node, g_home_node, int, 0444);
-MODULE_PARM_DESC(home_node, "Home node for the device");
-
-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-static char g_timeout_str[80];
-module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
-
-static char g_requeue_str[80];
-module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
-#endif
-
-static int g_queue_mode = NULL_Q_MQ;
-
-static int null_param_store_val(const char *str, int *val, int min, int max)
-{
-       int ret, new_val;
-
-       ret = kstrtoint(str, 10, &new_val);
-       if (ret)
-               return -EINVAL;
-
-       if (new_val < min || new_val > max)
-               return -EINVAL;
-
-       *val = new_val;
-       return 0;
-}
-
-static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
-{
-       return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
-}
-
-static const struct kernel_param_ops null_queue_mode_param_ops = {
-       .set    = null_set_queue_mode,
-       .get    = param_get_int,
-};
-
-device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
-MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
-
-static int g_gb = 250;
-module_param_named(gb, g_gb, int, 0444);
-MODULE_PARM_DESC(gb, "Size in GB");
-
-static int g_bs = 512;
-module_param_named(bs, g_bs, int, 0444);
-MODULE_PARM_DESC(bs, "Block size (in bytes)");
-
-static int nr_devices = 1;
-module_param(nr_devices, int, 0444);
-MODULE_PARM_DESC(nr_devices, "Number of devices to register");
-
-static bool g_blocking;
-module_param_named(blocking, g_blocking, bool, 0444);
-MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
-
-static bool shared_tags;
-module_param(shared_tags, bool, 0444);
-MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
-
-static int g_irqmode = NULL_IRQ_SOFTIRQ;
-
-static int null_set_irqmode(const char *str, const struct kernel_param *kp)
-{
-       return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
-                                       NULL_IRQ_TIMER);
-}
-
-static const struct kernel_param_ops null_irqmode_param_ops = {
-       .set    = null_set_irqmode,
-       .get    = param_get_int,
-};
-
-device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
-MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
-
-static unsigned long g_completion_nsec = 10000;
-module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
-MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
-
-static int g_hw_queue_depth = 64;
-module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
-MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
-
-static bool g_use_per_node_hctx;
-module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
-MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
-
-static struct nullb_device *null_alloc_dev(void);
-static void null_free_dev(struct nullb_device *dev);
-static void null_del_dev(struct nullb *nullb);
-static int null_add_dev(struct nullb_device *dev);
-static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
-
-static inline struct nullb_device *to_nullb_device(struct config_item *item)
-{
-       return item ? container_of(item, struct nullb_device, item) : NULL;
-}
-
-static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
-{
-       return snprintf(page, PAGE_SIZE, "%u\n", val);
-}
-
-static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
-       char *page)
-{
-       return snprintf(page, PAGE_SIZE, "%lu\n", val);
-}
-
-static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
-{
-       return snprintf(page, PAGE_SIZE, "%u\n", val);
-}
-
-static ssize_t nullb_device_uint_attr_store(unsigned int *val,
-       const char *page, size_t count)
-{
-       unsigned int tmp;
-       int result;
-
-       result = kstrtouint(page, 0, &tmp);
-       if (result)
-               return result;
-
-       *val = tmp;
-       return count;
-}
-
-static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
-       const char *page, size_t count)
-{
-       int result;
-       unsigned long tmp;
-
-       result = kstrtoul(page, 0, &tmp);
-       if (result)
-               return result;
-
-       *val = tmp;
-       return count;
-}
-
-static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
-       size_t count)
-{
-       bool tmp;
-       int result;
-
-       result = kstrtobool(page,  &tmp);
-       if (result)
-               return result;
-
-       *val = tmp;
-       return count;
-}
-
-/* The following macro should only be used with TYPE = {uint, ulong, bool}. */
-#define NULLB_DEVICE_ATTR(NAME, TYPE)                                          \
-static ssize_t                                                                 \
-nullb_device_##NAME##_show(struct config_item *item, char *page)               \
-{                                                                              \
-       return nullb_device_##TYPE##_attr_show(                                 \
-                               to_nullb_device(item)->NAME, page);             \
-}                                                                              \
-static ssize_t                                                                 \
-nullb_device_##NAME##_store(struct config_item *item, const char *page,                \
-                           size_t count)                                       \
-{                                                                              \
-       if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags))   \
-               return -EBUSY;                                                  \
-       return nullb_device_##TYPE##_attr_store(                                \
-                       &to_nullb_device(item)->NAME, page, count);             \
-}                                                                              \
-CONFIGFS_ATTR(nullb_device_, NAME);
-
-NULLB_DEVICE_ATTR(size, ulong);
-NULLB_DEVICE_ATTR(completion_nsec, ulong);
-NULLB_DEVICE_ATTR(submit_queues, uint);
-NULLB_DEVICE_ATTR(home_node, uint);
-NULLB_DEVICE_ATTR(queue_mode, uint);
-NULLB_DEVICE_ATTR(blocksize, uint);
-NULLB_DEVICE_ATTR(irqmode, uint);
-NULLB_DEVICE_ATTR(hw_queue_depth, uint);
-NULLB_DEVICE_ATTR(index, uint);
-NULLB_DEVICE_ATTR(blocking, bool);
-NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
-NULLB_DEVICE_ATTR(memory_backed, bool);
-NULLB_DEVICE_ATTR(discard, bool);
-NULLB_DEVICE_ATTR(mbps, uint);
-NULLB_DEVICE_ATTR(cache_size, ulong);
-
-static ssize_t nullb_device_power_show(struct config_item *item, char *page)
-{
-       return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
-}
-
-static ssize_t nullb_device_power_store(struct config_item *item,
-                                    const char *page, size_t count)
-{
-       struct nullb_device *dev = to_nullb_device(item);
-       bool newp = false;
-       ssize_t ret;
-
-       ret = nullb_device_bool_attr_store(&newp, page, count);
-       if (ret < 0)
-               return ret;
-
-       if (!dev->power && newp) {
-               if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
-                       return count;
-               if (null_add_dev(dev)) {
-                       clear_bit(NULLB_DEV_FL_UP, &dev->flags);
-                       return -ENOMEM;
-               }
-
-               set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
-               dev->power = newp;
-       } else if (dev->power && !newp) {
-               mutex_lock(&lock);
-               dev->power = newp;
-               null_del_dev(dev->nullb);
-               mutex_unlock(&lock);
-               clear_bit(NULLB_DEV_FL_UP, &dev->flags);
-       }
-
-       return count;
-}
-
-CONFIGFS_ATTR(nullb_device_, power);
-
-static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
-{
-       struct nullb_device *t_dev = to_nullb_device(item);
-
-       return badblocks_show(&t_dev->badblocks, page, 0);
-}
-
-static ssize_t nullb_device_badblocks_store(struct config_item *item,
-                                    const char *page, size_t count)
-{
-       struct nullb_device *t_dev = to_nullb_device(item);
-       char *orig, *buf, *tmp;
-       u64 start, end;
-       int ret;
-
-       orig = kstrndup(page, count, GFP_KERNEL);
-       if (!orig)
-               return -ENOMEM;
-
-       buf = strstrip(orig);
-
-       ret = -EINVAL;
-       if (buf[0] != '+' && buf[0] != '-')
-               goto out;
-       tmp = strchr(&buf[1], '-');
-       if (!tmp)
-               goto out;
-       *tmp = '\0';
-       ret = kstrtoull(buf + 1, 0, &start);
-       if (ret)
-               goto out;
-       ret = kstrtoull(tmp + 1, 0, &end);
-       if (ret)
-               goto out;
-       ret = -EINVAL;
-       if (start > end)
-               goto out;
-       /* enable badblocks */
-       cmpxchg(&t_dev->badblocks.shift, -1, 0);
-       if (buf[0] == '+')
-               ret = badblocks_set(&t_dev->badblocks, start,
-                       end - start + 1, 1);
-       else
-               ret = badblocks_clear(&t_dev->badblocks, start,
-                       end - start + 1);
-       if (ret == 0)
-               ret = count;
-out:
-       kfree(orig);
-       return ret;
-}
-CONFIGFS_ATTR(nullb_device_, badblocks);
-
-static struct configfs_attribute *nullb_device_attrs[] = {
-       &nullb_device_attr_size,
-       &nullb_device_attr_completion_nsec,
-       &nullb_device_attr_submit_queues,
-       &nullb_device_attr_home_node,
-       &nullb_device_attr_queue_mode,
-       &nullb_device_attr_blocksize,
-       &nullb_device_attr_irqmode,
-       &nullb_device_attr_hw_queue_depth,
-       &nullb_device_attr_index,
-       &nullb_device_attr_blocking,
-       &nullb_device_attr_use_per_node_hctx,
-       &nullb_device_attr_power,
-       &nullb_device_attr_memory_backed,
-       &nullb_device_attr_discard,
-       &nullb_device_attr_mbps,
-       &nullb_device_attr_cache_size,
-       &nullb_device_attr_badblocks,
-       NULL,
-};
-
-static void nullb_device_release(struct config_item *item)
-{
-       struct nullb_device *dev = to_nullb_device(item);
-
-       null_free_device_storage(dev, false);
-       null_free_dev(dev);
-}
-
-static struct configfs_item_operations nullb_device_ops = {
-       .release        = nullb_device_release,
-};
-
-static const struct config_item_type nullb_device_type = {
-       .ct_item_ops    = &nullb_device_ops,
-       .ct_attrs       = nullb_device_attrs,
-       .ct_owner       = THIS_MODULE,
-};
-
-static struct
-config_item *nullb_group_make_item(struct config_group *group, const char *name)
-{
-       struct nullb_device *dev;
-
-       dev = null_alloc_dev();
-       if (!dev)
-               return ERR_PTR(-ENOMEM);
-
-       config_item_init_type_name(&dev->item, name, &nullb_device_type);
-
-       return &dev->item;
-}
-
-static void
-nullb_group_drop_item(struct config_group *group, struct config_item *item)
-{
-       struct nullb_device *dev = to_nullb_device(item);
-
-       if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
-               mutex_lock(&lock);
-               dev->power = false;
-               null_del_dev(dev->nullb);
-               mutex_unlock(&lock);
-       }
-
-       config_item_put(item);
-}
-
-static ssize_t memb_group_features_show(struct config_item *item, char *page)
-{
-       return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
-}
-
-CONFIGFS_ATTR_RO(memb_group_, features);
-
-static struct configfs_attribute *nullb_group_attrs[] = {
-       &memb_group_attr_features,
-       NULL,
-};
-
-static struct configfs_group_operations nullb_group_ops = {
-       .make_item      = nullb_group_make_item,
-       .drop_item      = nullb_group_drop_item,
-};
-
-static const struct config_item_type nullb_group_type = {
-       .ct_group_ops   = &nullb_group_ops,
-       .ct_attrs       = nullb_group_attrs,
-       .ct_owner       = THIS_MODULE,
-};
-
-static struct configfs_subsystem nullb_subsys = {
-       .su_group = {
-               .cg_item = {
-                       .ci_namebuf = "nullb",
-                       .ci_type = &nullb_group_type,
-               },
-       },
-};
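Editorial note (not part of the diff): the configfs plumbing above is what the userspace null_blk workflow drives. Below is a minimal userspace sketch of that workflow, assuming configfs is mounted at the conventional /sys/kernel/config and using a hypothetical item name "foo"; mkdir reaches nullb_group_make_item(), the "power" write reaches nullb_device_power_store()/null_add_dev(), and rmdir would reach nullb_group_drop_item().

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int write_attr(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *base = "/sys/kernel/config/nullb/foo";	/* hypothetical item */
	char path[256];

	/* mkdir invokes nullb_group_make_item() -> null_alloc_dev() */
	if (mkdir(base, 0755) && errno != EEXIST)
		return 1;

	snprintf(path, sizeof(path), "%s/memory_backed", base);
	write_attr(path, "1");

	/* "+start-end" sets, "-start-end" clears; the range is inclusive */
	snprintf(path, sizeof(path), "%s/badblocks", base);
	write_attr(path, "+0-7");

	/* "1" reaches nullb_device_power_store() -> null_add_dev() */
	snprintf(path, sizeof(path), "%s/power", base);
	write_attr(path, "1");

	/* rmdir(base) would invoke nullb_group_drop_item() -> null_del_dev() */
	return 0;
}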
-
-static inline int null_cache_active(struct nullb *nullb)
-{
-       return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
-}
-
-static struct nullb_device *null_alloc_dev(void)
-{
-       struct nullb_device *dev;
-
-       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-       if (!dev)
-               return NULL;
-       INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
-       INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
-       if (badblocks_init(&dev->badblocks, 0)) {
-               kfree(dev);
-               return NULL;
-       }
-
-       dev->size = g_gb * 1024;
-       dev->completion_nsec = g_completion_nsec;
-       dev->submit_queues = g_submit_queues;
-       dev->home_node = g_home_node;
-       dev->queue_mode = g_queue_mode;
-       dev->blocksize = g_bs;
-       dev->irqmode = g_irqmode;
-       dev->hw_queue_depth = g_hw_queue_depth;
-       dev->blocking = g_blocking;
-       dev->use_per_node_hctx = g_use_per_node_hctx;
-       return dev;
-}
-
-static void null_free_dev(struct nullb_device *dev)
-{
-       if (!dev)
-               return;
-
-       badblocks_exit(&dev->badblocks);
-       kfree(dev);
-}
-
-static void put_tag(struct nullb_queue *nq, unsigned int tag)
-{
-       clear_bit_unlock(tag, nq->tag_map);
-
-       if (waitqueue_active(&nq->wait))
-               wake_up(&nq->wait);
-}
-
-static unsigned int get_tag(struct nullb_queue *nq)
-{
-       unsigned int tag;
-
-       do {
-               tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
-               if (tag >= nq->queue_depth)
-                       return -1U;
-       } while (test_and_set_bit_lock(tag, nq->tag_map));
-
-       return tag;
-}
-
-static void free_cmd(struct nullb_cmd *cmd)
-{
-       put_tag(cmd->nq, cmd->tag);
-}
-
-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
-
-static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
-{
-       struct nullb_cmd *cmd;
-       unsigned int tag;
-
-       tag = get_tag(nq);
-       if (tag != -1U) {
-               cmd = &nq->cmds[tag];
-               cmd->tag = tag;
-               cmd->nq = nq;
-               if (nq->dev->irqmode == NULL_IRQ_TIMER) {
-                       hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
-                                    HRTIMER_MODE_REL);
-                       cmd->timer.function = null_cmd_timer_expired;
-               }
-               return cmd;
-       }
-
-       return NULL;
-}
-
-static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
-{
-       struct nullb_cmd *cmd;
-       DEFINE_WAIT(wait);
-
-       cmd = __alloc_cmd(nq);
-       if (cmd || !can_wait)
-               return cmd;
-
-       do {
-               prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
-               cmd = __alloc_cmd(nq);
-               if (cmd)
-                       break;
-
-               io_schedule();
-       } while (1);
-
-       finish_wait(&nq->wait, &wait);
-       return cmd;
-}
-
-static void end_cmd(struct nullb_cmd *cmd)
-{
-       struct request_queue *q = NULL;
-       int queue_mode = cmd->nq->dev->queue_mode;
-
-       if (cmd->rq)
-               q = cmd->rq->q;
-
-       switch (queue_mode)  {
-       case NULL_Q_MQ:
-               blk_mq_end_request(cmd->rq, cmd->error);
-               return;
-       case NULL_Q_RQ:
-               INIT_LIST_HEAD(&cmd->rq->queuelist);
-               blk_end_request_all(cmd->rq, cmd->error);
-               break;
-       case NULL_Q_BIO:
-               cmd->bio->bi_status = cmd->error;
-               bio_endio(cmd->bio);
-               break;
-       }
-
-       free_cmd(cmd);
-
-       /* Restart queue if needed, as we are freeing a tag */
-       if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) {
-               unsigned long flags;
-
-               spin_lock_irqsave(q->queue_lock, flags);
-               blk_start_queue_async(q);
-               spin_unlock_irqrestore(q->queue_lock, flags);
-       }
-}
-
-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
-{
-       end_cmd(container_of(timer, struct nullb_cmd, timer));
-
-       return HRTIMER_NORESTART;
-}
-
-static void null_cmd_end_timer(struct nullb_cmd *cmd)
-{
-       ktime_t kt = cmd->nq->dev->completion_nsec;
-
-       hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
-}
-
-static void null_softirq_done_fn(struct request *rq)
-{
-       struct nullb *nullb = rq->q->queuedata;
-
-       if (nullb->dev->queue_mode == NULL_Q_MQ)
-               end_cmd(blk_mq_rq_to_pdu(rq));
-       else
-               end_cmd(rq->special);
-}
-
-static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
-{
-       struct nullb_page *t_page;
-
-       t_page = kmalloc(sizeof(struct nullb_page), gfp_flags);
-       if (!t_page)
-               goto out;
-
-       t_page->page = alloc_pages(gfp_flags, 0);
-       if (!t_page->page)
-               goto out_freepage;
-
-       memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
-       return t_page;
-out_freepage:
-       kfree(t_page);
-out:
-       return NULL;
-}
-
-static void null_free_page(struct nullb_page *t_page)
-{
-       __set_bit(NULLB_PAGE_FREE, t_page->bitmap);
-       if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
-               return;
-       __free_page(t_page->page);
-       kfree(t_page);
-}
-
-static bool null_page_empty(struct nullb_page *page)
-{
-       int size = MAP_SZ - 2;
-
-       return find_first_bit(page->bitmap, size) == size;
-}
-
-static void null_free_sector(struct nullb *nullb, sector_t sector,
-       bool is_cache)
-{
-       unsigned int sector_bit;
-       u64 idx;
-       struct nullb_page *t_page, *ret;
-       struct radix_tree_root *root;
-
-       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
-       idx = sector >> PAGE_SECTORS_SHIFT;
-       sector_bit = (sector & SECTOR_MASK);
-
-       t_page = radix_tree_lookup(root, idx);
-       if (t_page) {
-               __clear_bit(sector_bit, t_page->bitmap);
-
-               if (null_page_empty(t_page)) {
-                       ret = radix_tree_delete_item(root, idx, t_page);
-                       WARN_ON(ret != t_page);
-                       null_free_page(ret);
-                       if (is_cache)
-                               nullb->dev->curr_cache -= PAGE_SIZE;
-               }
-       }
-}
-
-static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
-       struct nullb_page *t_page, bool is_cache)
-{
-       struct radix_tree_root *root;
-
-       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
-
-       if (radix_tree_insert(root, idx, t_page)) {
-               null_free_page(t_page);
-               t_page = radix_tree_lookup(root, idx);
-               WARN_ON(!t_page || t_page->page->index != idx);
-       } else if (is_cache)
-               nullb->dev->curr_cache += PAGE_SIZE;
-
-       return t_page;
-}
-
-static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
-{
-       unsigned long pos = 0;
-       int nr_pages;
-       struct nullb_page *ret, *t_pages[FREE_BATCH];
-       struct radix_tree_root *root;
-
-       root = is_cache ? &dev->cache : &dev->data;
-
-       do {
-               int i;
-
-               nr_pages = radix_tree_gang_lookup(root,
-                               (void **)t_pages, pos, FREE_BATCH);
-
-               for (i = 0; i < nr_pages; i++) {
-                       pos = t_pages[i]->page->index;
-                       ret = radix_tree_delete_item(root, pos, t_pages[i]);
-                       WARN_ON(ret != t_pages[i]);
-                       null_free_page(ret);
-               }
-
-               pos++;
-       } while (nr_pages == FREE_BATCH);
-
-       if (is_cache)
-               dev->curr_cache = 0;
-}
-
-static struct nullb_page *__null_lookup_page(struct nullb *nullb,
-       sector_t sector, bool for_write, bool is_cache)
-{
-       unsigned int sector_bit;
-       u64 idx;
-       struct nullb_page *t_page;
-       struct radix_tree_root *root;
-
-       idx = sector >> PAGE_SECTORS_SHIFT;
-       sector_bit = (sector & SECTOR_MASK);
-
-       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
-       t_page = radix_tree_lookup(root, idx);
-       WARN_ON(t_page && t_page->page->index != idx);
-
-       if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
-               return t_page;
-
-       return NULL;
-}
-
-static struct nullb_page *null_lookup_page(struct nullb *nullb,
-       sector_t sector, bool for_write, bool ignore_cache)
-{
-       struct nullb_page *page = NULL;
-
-       if (!ignore_cache)
-               page = __null_lookup_page(nullb, sector, for_write, true);
-       if (page)
-               return page;
-       return __null_lookup_page(nullb, sector, for_write, false);
-}
-
-static struct nullb_page *null_insert_page(struct nullb *nullb,
-       sector_t sector, bool ignore_cache)
-{
-       u64 idx;
-       struct nullb_page *t_page;
-
-       t_page = null_lookup_page(nullb, sector, true, ignore_cache);
-       if (t_page)
-               return t_page;
-
-       spin_unlock_irq(&nullb->lock);
-
-       t_page = null_alloc_page(GFP_NOIO);
-       if (!t_page)
-               goto out_lock;
-
-       if (radix_tree_preload(GFP_NOIO))
-               goto out_freepage;
-
-       spin_lock_irq(&nullb->lock);
-       idx = sector >> PAGE_SECTORS_SHIFT;
-       t_page->page->index = idx;
-       t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
-       radix_tree_preload_end();
-
-       return t_page;
-out_freepage:
-       null_free_page(t_page);
-out_lock:
-       spin_lock_irq(&nullb->lock);
-       return null_lookup_page(nullb, sector, true, ignore_cache);
-}
-
-static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
-{
-       int i;
-       unsigned int offset;
-       u64 idx;
-       struct nullb_page *t_page, *ret;
-       void *dst, *src;
-
-       idx = c_page->page->index;
-
-       t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
-
-       __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
-       if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
-               null_free_page(c_page);
-               if (t_page && null_page_empty(t_page)) {
-                       ret = radix_tree_delete_item(&nullb->dev->data,
-                               idx, t_page);
-                       null_free_page(t_page);
-               }
-               return 0;
-       }
-
-       if (!t_page)
-               return -ENOMEM;
-
-       src = kmap_atomic(c_page->page);
-       dst = kmap_atomic(t_page->page);
-
-       for (i = 0; i < PAGE_SECTORS;
-                       i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
-               if (test_bit(i, c_page->bitmap)) {
-                       offset = (i << SECTOR_SHIFT);
-                       memcpy(dst + offset, src + offset,
-                               nullb->dev->blocksize);
-                       __set_bit(i, t_page->bitmap);
-               }
-       }
-
-       kunmap_atomic(dst);
-       kunmap_atomic(src);
-
-       ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
-       null_free_page(ret);
-       nullb->dev->curr_cache -= PAGE_SIZE;
-
-       return 0;
-}
-
-static int null_make_cache_space(struct nullb *nullb, unsigned long n)
-{
-       int i, err, nr_pages;
-       struct nullb_page *c_pages[FREE_BATCH];
-       unsigned long flushed = 0, one_round;
-
-again:
-       if ((nullb->dev->cache_size * 1024 * 1024) >
-            nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
-               return 0;
-
-       nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
-                       (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
-       /*
-        * null_flush_cache_page() may drop the lock before using the c_pages.
-        * To avoid a race, don't allow these pages to be freed meanwhile.
-        */
-       for (i = 0; i < nr_pages; i++) {
-               nullb->cache_flush_pos = c_pages[i]->page->index;
-               /*
-                * We found the page which is being flushed to disk by other
-                * threads
-                */
-               if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
-                       c_pages[i] = NULL;
-               else
-                       __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
-       }
-
-       one_round = 0;
-       for (i = 0; i < nr_pages; i++) {
-               if (c_pages[i] == NULL)
-                       continue;
-               err = null_flush_cache_page(nullb, c_pages[i]);
-               if (err)
-                       return err;
-               one_round++;
-       }
-       flushed += one_round << PAGE_SHIFT;
-
-       if (n > flushed) {
-               if (nr_pages == 0)
-                       nullb->cache_flush_pos = 0;
-               if (one_round == 0) {
-                       /* give other threads a chance */
-                       spin_unlock_irq(&nullb->lock);
-                       spin_lock_irq(&nullb->lock);
-               }
-               goto again;
-       }
-       return 0;
-}
-
-static int copy_to_nullb(struct nullb *nullb, struct page *source,
-       unsigned int off, sector_t sector, size_t n, bool is_fua)
-{
-       size_t temp, count = 0;
-       unsigned int offset;
-       struct nullb_page *t_page;
-       void *dst, *src;
-
-       while (count < n) {
-               temp = min_t(size_t, nullb->dev->blocksize, n - count);
-
-               if (null_cache_active(nullb) && !is_fua)
-                       null_make_cache_space(nullb, PAGE_SIZE);
-
-               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
-               t_page = null_insert_page(nullb, sector,
-                       !null_cache_active(nullb) || is_fua);
-               if (!t_page)
-                       return -ENOSPC;
-
-               src = kmap_atomic(source);
-               dst = kmap_atomic(t_page->page);
-               memcpy(dst + offset, src + off + count, temp);
-               kunmap_atomic(dst);
-               kunmap_atomic(src);
-
-               __set_bit(sector & SECTOR_MASK, t_page->bitmap);
-
-               if (is_fua)
-                       null_free_sector(nullb, sector, true);
-
-               count += temp;
-               sector += temp >> SECTOR_SHIFT;
-       }
-       return 0;
-}
-
-static int copy_from_nullb(struct nullb *nullb, struct page *dest,
-       unsigned int off, sector_t sector, size_t n)
-{
-       size_t temp, count = 0;
-       unsigned int offset;
-       struct nullb_page *t_page;
-       void *dst, *src;
-
-       while (count < n) {
-               temp = min_t(size_t, nullb->dev->blocksize, n - count);
-
-               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
-               t_page = null_lookup_page(nullb, sector, false,
-                       !null_cache_active(nullb));
-
-               dst = kmap_atomic(dest);
-               if (!t_page) {
-                       memset(dst + off + count, 0, temp);
-                       goto next;
-               }
-               src = kmap_atomic(t_page->page);
-               memcpy(dst + off + count, src + offset, temp);
-               kunmap_atomic(src);
-next:
-               kunmap_atomic(dst);
-
-               count += temp;
-               sector += temp >> SECTOR_SHIFT;
-       }
-       return 0;
-}
-
-static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
-{
-       size_t temp;
-
-       spin_lock_irq(&nullb->lock);
-       while (n > 0) {
-               temp = min_t(size_t, n, nullb->dev->blocksize);
-               null_free_sector(nullb, sector, false);
-               if (null_cache_active(nullb))
-                       null_free_sector(nullb, sector, true);
-               sector += temp >> SECTOR_SHIFT;
-               n -= temp;
-       }
-       spin_unlock_irq(&nullb->lock);
-}
-
-static int null_handle_flush(struct nullb *nullb)
-{
-       int err;
-
-       if (!null_cache_active(nullb))
-               return 0;
-
-       spin_lock_irq(&nullb->lock);
-       while (true) {
-               err = null_make_cache_space(nullb,
-                       nullb->dev->cache_size * 1024 * 1024);
-               if (err || nullb->dev->curr_cache == 0)
-                       break;
-       }
-
-       WARN_ON(!radix_tree_empty(&nullb->dev->cache));
-       spin_unlock_irq(&nullb->lock);
-       return err;
-}
-
-static int null_transfer(struct nullb *nullb, struct page *page,
-       unsigned int len, unsigned int off, bool is_write, sector_t sector,
-       bool is_fua)
-{
-       int err = 0;
-
-       if (!is_write) {
-               err = copy_from_nullb(nullb, page, off, sector, len);
-               flush_dcache_page(page);
-       } else {
-               flush_dcache_page(page);
-               err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
-       }
-
-       return err;
-}
-
-static int null_handle_rq(struct nullb_cmd *cmd)
-{
-       struct request *rq = cmd->rq;
-       struct nullb *nullb = cmd->nq->dev->nullb;
-       int err;
-       unsigned int len;
-       sector_t sector;
-       struct req_iterator iter;
-       struct bio_vec bvec;
-
-       sector = blk_rq_pos(rq);
-
-       if (req_op(rq) == REQ_OP_DISCARD) {
-               null_handle_discard(nullb, sector, blk_rq_bytes(rq));
-               return 0;
-       }
-
-       spin_lock_irq(&nullb->lock);
-       rq_for_each_segment(bvec, rq, iter) {
-               len = bvec.bv_len;
-               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
-                                    op_is_write(req_op(rq)), sector,
-                                    req_op(rq) & REQ_FUA);
-               if (err) {
-                       spin_unlock_irq(&nullb->lock);
-                       return err;
-               }
-               sector += len >> SECTOR_SHIFT;
-       }
-       spin_unlock_irq(&nullb->lock);
-
-       return 0;
-}
-
-static int null_handle_bio(struct nullb_cmd *cmd)
-{
-       struct bio *bio = cmd->bio;
-       struct nullb *nullb = cmd->nq->dev->nullb;
-       int err;
-       unsigned int len;
-       sector_t sector;
-       struct bio_vec bvec;
-       struct bvec_iter iter;
-
-       sector = bio->bi_iter.bi_sector;
-
-       if (bio_op(bio) == REQ_OP_DISCARD) {
-               null_handle_discard(nullb, sector,
-                       bio_sectors(bio) << SECTOR_SHIFT);
-               return 0;
-       }
-
-       spin_lock_irq(&nullb->lock);
-       bio_for_each_segment(bvec, bio, iter) {
-               len = bvec.bv_len;
-               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
-                                    op_is_write(bio_op(bio)), sector,
-                                    bio_op(bio) & REQ_FUA);
-               if (err) {
-                       spin_unlock_irq(&nullb->lock);
-                       return err;
-               }
-               sector += len >> SECTOR_SHIFT;
-       }
-       spin_unlock_irq(&nullb->lock);
-       return 0;
-}
-
-static void null_stop_queue(struct nullb *nullb)
-{
-       struct request_queue *q = nullb->q;
-
-       if (nullb->dev->queue_mode == NULL_Q_MQ)
-               blk_mq_stop_hw_queues(q);
-       else {
-               spin_lock_irq(q->queue_lock);
-               blk_stop_queue(q);
-               spin_unlock_irq(q->queue_lock);
-       }
-}
-
-static void null_restart_queue_async(struct nullb *nullb)
-{
-       struct request_queue *q = nullb->q;
-       unsigned long flags;
-
-       if (nullb->dev->queue_mode == NULL_Q_MQ)
-               blk_mq_start_stopped_hw_queues(q, true);
-       else {
-               spin_lock_irqsave(q->queue_lock, flags);
-               blk_start_queue_async(q);
-               spin_unlock_irqrestore(q->queue_lock, flags);
-       }
-}
-
-static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
-{
-       struct nullb_device *dev = cmd->nq->dev;
-       struct nullb *nullb = dev->nullb;
-       int err = 0;
-
-       if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
-               struct request *rq = cmd->rq;
-
-               if (!hrtimer_active(&nullb->bw_timer))
-                       hrtimer_restart(&nullb->bw_timer);
-
-               if (atomic_long_sub_return(blk_rq_bytes(rq),
-                               &nullb->cur_bytes) < 0) {
-                       null_stop_queue(nullb);
-                       /* race with timer */
-                       if (atomic_long_read(&nullb->cur_bytes) > 0)
-                               null_restart_queue_async(nullb);
-                       if (dev->queue_mode == NULL_Q_RQ) {
-                               struct request_queue *q = nullb->q;
-
-                               spin_lock_irq(q->queue_lock);
-                               rq->rq_flags |= RQF_DONTPREP;
-                               blk_requeue_request(q, rq);
-                               spin_unlock_irq(q->queue_lock);
-                               return BLK_STS_OK;
-                       } else
-                               /* requeue request */
-                               return BLK_STS_DEV_RESOURCE;
-               }
-       }
-
-       if (nullb->dev->badblocks.shift != -1) {
-               int bad_sectors;
-               sector_t sector, size, first_bad;
-               bool is_flush = true;
-
-               if (dev->queue_mode == NULL_Q_BIO &&
-                               bio_op(cmd->bio) != REQ_OP_FLUSH) {
-                       is_flush = false;
-                       sector = cmd->bio->bi_iter.bi_sector;
-                       size = bio_sectors(cmd->bio);
-               }
-               if (dev->queue_mode != NULL_Q_BIO &&
-                               req_op(cmd->rq) != REQ_OP_FLUSH) {
-                       is_flush = false;
-                       sector = blk_rq_pos(cmd->rq);
-                       size = blk_rq_sectors(cmd->rq);
-               }
-               if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
-                               size, &first_bad, &bad_sectors)) {
-                       cmd->error = BLK_STS_IOERR;
-                       goto out;
-               }
-       }
-
-       if (dev->memory_backed) {
-               if (dev->queue_mode == NULL_Q_BIO) {
-                       if (bio_op(cmd->bio) == REQ_OP_FLUSH)
-                               err = null_handle_flush(nullb);
-                       else
-                               err = null_handle_bio(cmd);
-               } else {
-                       if (req_op(cmd->rq) == REQ_OP_FLUSH)
-                               err = null_handle_flush(nullb);
-                       else
-                               err = null_handle_rq(cmd);
-               }
-       }
-       cmd->error = errno_to_blk_status(err);
-out:
-       /* Complete IO by inline, softirq or timer */
-       switch (dev->irqmode) {
-       case NULL_IRQ_SOFTIRQ:
-               switch (dev->queue_mode)  {
-               case NULL_Q_MQ:
-                       blk_mq_complete_request(cmd->rq);
-                       break;
-               case NULL_Q_RQ:
-                       blk_complete_request(cmd->rq);
-                       break;
-               case NULL_Q_BIO:
-                       /*
-                        * XXX: no proper submitting cpu information available.
-                        */
-                       end_cmd(cmd);
-                       break;
-               }
-               break;
-       case NULL_IRQ_NONE:
-               end_cmd(cmd);
-               break;
-       case NULL_IRQ_TIMER:
-               null_cmd_end_timer(cmd);
-               break;
-       }
-       return BLK_STS_OK;
-}
-
-static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
-{
-       struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
-       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
-       unsigned int mbps = nullb->dev->mbps;
-
-       if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
-               return HRTIMER_NORESTART;
-
-       atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
-       null_restart_queue_async(nullb);
-
-       hrtimer_forward_now(&nullb->bw_timer, timer_interval);
-
-       return HRTIMER_RESTART;
-}
-
-static void nullb_setup_bwtimer(struct nullb *nullb)
-{
-       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
-
-       hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       nullb->bw_timer.function = nullb_bwtimer_fn;
-       atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
-       hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
-}
-
-static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
-{
-       int index = 0;
-
-       if (nullb->nr_queues != 1)
-               index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
-
-       return &nullb->queues[index];
-}
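Editorial note (not part of the diff): the index computation in nullb_to_queue() is a ceiling division that spreads CPUs evenly across the available queues. A small self-contained check of the same arithmetic, with the current CPU and system constants passed in as plain parameters (values below are illustrative only):

#include <stdio.h>

/* Same mapping as nullb_to_queue(): ceil(nr_cpu_ids / nr_queues) CPUs per queue. */
static unsigned int queue_index(unsigned int cpu, unsigned int nr_cpu_ids,
				unsigned int nr_queues)
{
	if (nr_queues == 1)
		return 0;
	return cpu / ((nr_cpu_ids + nr_queues - 1) / nr_queues);
}

int main(void)
{
	unsigned int cpu;

	/* e.g. 8 CPUs, 3 queues: CPUs 0-2 -> queue 0, 3-5 -> queue 1, 6-7 -> queue 2 */
	for (cpu = 0; cpu < 8; cpu++)
		printf("cpu %u -> queue %u\n", cpu, queue_index(cpu, 8, 3));
	return 0;
}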
-
-static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
-{
-       struct nullb *nullb = q->queuedata;
-       struct nullb_queue *nq = nullb_to_queue(nullb);
-       struct nullb_cmd *cmd;
-
-       cmd = alloc_cmd(nq, 1);
-       cmd->bio = bio;
-
-       null_handle_cmd(cmd);
-       return BLK_QC_T_NONE;
-}
-
-static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
-{
-       pr_info("null: rq %p timed out\n", rq);
-       __blk_complete_request(rq);
-       return BLK_EH_DONE;
-}
-
-static int null_rq_prep_fn(struct request_queue *q, struct request *req)
-{
-       struct nullb *nullb = q->queuedata;
-       struct nullb_queue *nq = nullb_to_queue(nullb);
-       struct nullb_cmd *cmd;
-
-       cmd = alloc_cmd(nq, 0);
-       if (cmd) {
-               cmd->rq = req;
-               req->special = cmd;
-               return BLKPREP_OK;
-       }
-       blk_stop_queue(q);
-
-       return BLKPREP_DEFER;
-}
-
-static bool should_timeout_request(struct request *rq)
-{
-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-       if (g_timeout_str[0])
-               return should_fail(&null_timeout_attr, 1);
-#endif
-       return false;
-}
-
-static bool should_requeue_request(struct request *rq)
-{
-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-       if (g_requeue_str[0])
-               return should_fail(&null_requeue_attr, 1);
-#endif
-       return false;
-}
-
-static void null_request_fn(struct request_queue *q)
-{
-       struct request *rq;
-
-       while ((rq = blk_fetch_request(q)) != NULL) {
-               struct nullb_cmd *cmd = rq->special;
-
-               /* just ignore the request */
-               if (should_timeout_request(rq))
-                       continue;
-               if (should_requeue_request(rq)) {
-                       blk_requeue_request(q, rq);
-                       continue;
-               }
-
-               spin_unlock_irq(q->queue_lock);
-               null_handle_cmd(cmd);
-               spin_lock_irq(q->queue_lock);
-       }
-}
-
-static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
-{
-       pr_info("null: rq %p timed out\n", rq);
-       blk_mq_complete_request(rq);
-       return BLK_EH_DONE;
-}
-
-static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
-                        const struct blk_mq_queue_data *bd)
-{
-       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
-       struct nullb_queue *nq = hctx->driver_data;
-
-       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
-       if (nq->dev->irqmode == NULL_IRQ_TIMER) {
-               hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-               cmd->timer.function = null_cmd_timer_expired;
-       }
-       cmd->rq = bd->rq;
-       cmd->nq = nq;
-
-       blk_mq_start_request(bd->rq);
-
-       if (should_requeue_request(bd->rq)) {
-               /*
-                * Alternate between hitting the core BUSY path, and the
-                * driver driven requeue path
-                */
-               nq->requeue_selection++;
-               if (nq->requeue_selection & 1)
-                       return BLK_STS_RESOURCE;
-               else {
-                       blk_mq_requeue_request(bd->rq, true);
-                       return BLK_STS_OK;
-               }
-       }
-       if (should_timeout_request(bd->rq))
-               return BLK_STS_OK;
-
-       return null_handle_cmd(cmd);
-}
-
-static const struct blk_mq_ops null_mq_ops = {
-       .queue_rq       = null_queue_rq,
-       .complete       = null_softirq_done_fn,
-       .timeout        = null_timeout_rq,
-};
-
-static void cleanup_queue(struct nullb_queue *nq)
-{
-       kfree(nq->tag_map);
-       kfree(nq->cmds);
-}
-
-static void cleanup_queues(struct nullb *nullb)
-{
-       int i;
-
-       for (i = 0; i < nullb->nr_queues; i++)
-               cleanup_queue(&nullb->queues[i]);
-
-       kfree(nullb->queues);
-}
-
-static void null_del_dev(struct nullb *nullb)
-{
-       struct nullb_device *dev = nullb->dev;
-
-       ida_simple_remove(&nullb_indexes, nullb->index);
-
-       list_del_init(&nullb->list);
-
-       del_gendisk(nullb->disk);
-
-       if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
-               hrtimer_cancel(&nullb->bw_timer);
-               atomic_long_set(&nullb->cur_bytes, LONG_MAX);
-               null_restart_queue_async(nullb);
-       }
-
-       blk_cleanup_queue(nullb->q);
-       if (dev->queue_mode == NULL_Q_MQ &&
-           nullb->tag_set == &nullb->__tag_set)
-               blk_mq_free_tag_set(nullb->tag_set);
-       put_disk(nullb->disk);
-       cleanup_queues(nullb);
-       if (null_cache_active(nullb))
-               null_free_device_storage(nullb->dev, true);
-       kfree(nullb);
-       dev->nullb = NULL;
-}
-
-static void null_config_discard(struct nullb *nullb)
-{
-       if (nullb->dev->discard == false)
-               return;
-       nullb->q->limits.discard_granularity = nullb->dev->blocksize;
-       nullb->q->limits.discard_alignment = nullb->dev->blocksize;
-       blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
-       blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q);
-}
-
-static int null_open(struct block_device *bdev, fmode_t mode)
-{
-       return 0;
-}
-
-static void null_release(struct gendisk *disk, fmode_t mode)
-{
-}
-
-static const struct block_device_operations null_fops = {
-       .owner =        THIS_MODULE,
-       .open =         null_open,
-       .release =      null_release,
-};
-
-static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
-{
-       BUG_ON(!nullb);
-       BUG_ON(!nq);
-
-       init_waitqueue_head(&nq->wait);
-       nq->queue_depth = nullb->queue_depth;
-       nq->dev = nullb->dev;
-}
-
-static void null_init_queues(struct nullb *nullb)
-{
-       struct request_queue *q = nullb->q;
-       struct blk_mq_hw_ctx *hctx;
-       struct nullb_queue *nq;
-       int i;
-
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (!hctx->nr_ctx || !hctx->tags)
-                       continue;
-               nq = &nullb->queues[i];
-               hctx->driver_data = nq;
-               null_init_queue(nullb, nq);
-               nullb->nr_queues++;
-       }
-}
-
-static int setup_commands(struct nullb_queue *nq)
-{
-       struct nullb_cmd *cmd;
-       int i, tag_size;
-
-       nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
-       if (!nq->cmds)
-               return -ENOMEM;
-
-       tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
-       nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
-       if (!nq->tag_map) {
-               kfree(nq->cmds);
-               return -ENOMEM;
-       }
-
-       for (i = 0; i < nq->queue_depth; i++) {
-               cmd = &nq->cmds[i];
-               INIT_LIST_HEAD(&cmd->list);
-               cmd->ll_list.next = NULL;
-               cmd->tag = -1U;
-       }
-
-       return 0;
-}
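Editorial note (not part of the diff): tag_size in setup_commands() is simply the number of unsigned longs needed to hold queue_depth bits. A quick check of that rounding, mirroring the ALIGN() arithmetic above:

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define ALIGN_UP(x, a)	(((x) + (a) - 1) / (a) * (a))

int main(void)
{
	unsigned int depth;

	/* number of longs backing tag_map for a few queue depths */
	for (depth = 1; depth <= 129; depth += 64)
		printf("depth %3u -> %zu longs\n", depth,
		       ALIGN_UP(depth, BITS_PER_LONG) / BITS_PER_LONG);
	return 0;
}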
-
-static int setup_queues(struct nullb *nullb)
-{
-       nullb->queues = kcalloc(nullb->dev->submit_queues,
-                               sizeof(struct nullb_queue),
-                               GFP_KERNEL);
-       if (!nullb->queues)
-               return -ENOMEM;
-
-       nullb->nr_queues = 0;
-       nullb->queue_depth = nullb->dev->hw_queue_depth;
-
-       return 0;
-}
-
-static int init_driver_queues(struct nullb *nullb)
-{
-       struct nullb_queue *nq;
-       int i, ret = 0;
-
-       for (i = 0; i < nullb->dev->submit_queues; i++) {
-               nq = &nullb->queues[i];
-
-               null_init_queue(nullb, nq);
-
-               ret = setup_commands(nq);
-               if (ret)
-                       return ret;
-               nullb->nr_queues++;
-       }
-       return 0;
-}
-
-static int null_gendisk_register(struct nullb *nullb)
-{
-       struct gendisk *disk;
-       sector_t size;
-
-       disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
-       if (!disk)
-               return -ENOMEM;
-       size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
-       set_capacity(disk, size >> 9);
-
-       disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
-       disk->major             = null_major;
-       disk->first_minor       = nullb->index;
-       disk->fops              = &null_fops;
-       disk->private_data      = nullb;
-       disk->queue             = nullb->q;
-       strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
-
-       add_disk(disk);
-       return 0;
-}
-
-static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
-{
-       set->ops = &null_mq_ops;
-       set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
-                                               g_submit_queues;
-       set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
-                                               g_hw_queue_depth;
-       set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
-       set->cmd_size   = sizeof(struct nullb_cmd);
-       set->flags = BLK_MQ_F_SHOULD_MERGE;
-       if (g_no_sched)
-               set->flags |= BLK_MQ_F_NO_SCHED;
-       set->driver_data = NULL;
-
-       if ((nullb && nullb->dev->blocking) || g_blocking)
-               set->flags |= BLK_MQ_F_BLOCKING;
-
-       return blk_mq_alloc_tag_set(set);
-}
-
-static void null_validate_conf(struct nullb_device *dev)
-{
-       dev->blocksize = round_down(dev->blocksize, 512);
-       dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
-
-       if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
-               if (dev->submit_queues != nr_online_nodes)
-                       dev->submit_queues = nr_online_nodes;
-       } else if (dev->submit_queues > nr_cpu_ids)
-               dev->submit_queues = nr_cpu_ids;
-       else if (dev->submit_queues == 0)
-               dev->submit_queues = 1;
-
-       dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
-       dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
-
-       /* Do memory allocation, so set blocking */
-       if (dev->memory_backed)
-               dev->blocking = true;
-       else /* cache is meaningless */
-               dev->cache_size = 0;
-       dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
-                                               dev->cache_size);
-       dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
-       /* can not stop a queue */
-       if (dev->queue_mode == NULL_Q_BIO)
-               dev->mbps = 0;
-}
-
-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-static bool __null_setup_fault(struct fault_attr *attr, char *str)
-{
-       if (!str[0])
-               return true;
-
-       if (!setup_fault_attr(attr, str))
-               return false;
-
-       attr->verbose = 0;
-       return true;
-}
-#endif
-
-static bool null_setup_fault(void)
-{
-#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-       if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
-               return false;
-       if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
-               return false;
-#endif
-       return true;
-}
-
-static int null_add_dev(struct nullb_device *dev)
-{
-       struct nullb *nullb;
-       int rv;
-
-       null_validate_conf(dev);
-
-       nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
-       if (!nullb) {
-               rv = -ENOMEM;
-               goto out;
-       }
-       nullb->dev = dev;
-       dev->nullb = nullb;
-
-       spin_lock_init(&nullb->lock);
-
-       rv = setup_queues(nullb);
-       if (rv)
-               goto out_free_nullb;
-
-       if (dev->queue_mode == NULL_Q_MQ) {
-               if (shared_tags) {
-                       nullb->tag_set = &tag_set;
-                       rv = 0;
-               } else {
-                       nullb->tag_set = &nullb->__tag_set;
-                       rv = null_init_tag_set(nullb, nullb->tag_set);
-               }
-
-               if (rv)
-                       goto out_cleanup_queues;
-
-               if (!null_setup_fault())
-                       goto out_cleanup_queues;
-
-               nullb->tag_set->timeout = 5 * HZ;
-               nullb->q = blk_mq_init_queue(nullb->tag_set);
-               if (IS_ERR(nullb->q)) {
-                       rv = -ENOMEM;
-                       goto out_cleanup_tags;
-               }
-               null_init_queues(nullb);
-       } else if (dev->queue_mode == NULL_Q_BIO) {
-               nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node,
-                                               NULL);
-               if (!nullb->q) {
-                       rv = -ENOMEM;
-                       goto out_cleanup_queues;
-               }
-               blk_queue_make_request(nullb->q, null_queue_bio);
-               rv = init_driver_queues(nullb);
-               if (rv)
-                       goto out_cleanup_blk_queue;
-       } else {
-               nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock,
-                                               dev->home_node);
-               if (!nullb->q) {
-                       rv = -ENOMEM;
-                       goto out_cleanup_queues;
-               }
-
-               if (!null_setup_fault())
-                       goto out_cleanup_blk_queue;
-
-               blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
-               blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
-               blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
-               nullb->q->rq_timeout = 5 * HZ;
-               rv = init_driver_queues(nullb);
-               if (rv)
-                       goto out_cleanup_blk_queue;
-       }
-
-       if (dev->mbps) {
-               set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
-               nullb_setup_bwtimer(nullb);
-       }
-
-       if (dev->cache_size > 0) {
-               set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
-               blk_queue_write_cache(nullb->q, true, true);
-               blk_queue_flush_queueable(nullb->q, true);
-       }
-
-       nullb->q->queuedata = nullb;
-       blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
-       blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
-
-       mutex_lock(&lock);
-       nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
-       dev->index = nullb->index;
-       mutex_unlock(&lock);
-
-       blk_queue_logical_block_size(nullb->q, dev->blocksize);
-       blk_queue_physical_block_size(nullb->q, dev->blocksize);
-
-       null_config_discard(nullb);
-
-       sprintf(nullb->disk_name, "nullb%d", nullb->index);
-
-       rv = null_gendisk_register(nullb);
-       if (rv)
-               goto out_cleanup_blk_queue;
-
-       mutex_lock(&lock);
-       list_add_tail(&nullb->list, &nullb_list);
-       mutex_unlock(&lock);
-
-       return 0;
-out_cleanup_blk_queue:
-       blk_cleanup_queue(nullb->q);
-out_cleanup_tags:
-       if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
-               blk_mq_free_tag_set(nullb->tag_set);
-out_cleanup_queues:
-       cleanup_queues(nullb);
-out_free_nullb:
-       kfree(nullb);
-out:
-       return rv;
-}
-
-static int __init null_init(void)
-{
-       int ret = 0;
-       unsigned int i;
-       struct nullb *nullb;
-       struct nullb_device *dev;
-
-       if (g_bs > PAGE_SIZE) {
-               pr_warn("null_blk: invalid block size\n");
-               pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
-               g_bs = PAGE_SIZE;
-       }
-
-       if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
-               if (g_submit_queues != nr_online_nodes) {
-                       pr_warn("null_blk: submit_queues param is set to %u.\n",
-                                                       nr_online_nodes);
-                       g_submit_queues = nr_online_nodes;
-               }
-       } else if (g_submit_queues > nr_cpu_ids)
-               g_submit_queues = nr_cpu_ids;
-       else if (g_submit_queues <= 0)
-               g_submit_queues = 1;
-
-       if (g_queue_mode == NULL_Q_MQ && shared_tags) {
-               ret = null_init_tag_set(NULL, &tag_set);
-               if (ret)
-                       return ret;
-       }
-
-       config_group_init(&nullb_subsys.su_group);
-       mutex_init(&nullb_subsys.su_mutex);
-
-       ret = configfs_register_subsystem(&nullb_subsys);
-       if (ret)
-               goto err_tagset;
-
-       mutex_init(&lock);
-
-       null_major = register_blkdev(0, "nullb");
-       if (null_major < 0) {
-               ret = null_major;
-               goto err_conf;
-       }
-
-       for (i = 0; i < nr_devices; i++) {
-               dev = null_alloc_dev();
-               if (!dev) {
-                       ret = -ENOMEM;
-                       goto err_dev;
-               }
-               ret = null_add_dev(dev);
-               if (ret) {
-                       null_free_dev(dev);
-                       goto err_dev;
-               }
-       }
-
-       pr_info("null: module loaded\n");
-       return 0;
-
-err_dev:
-       while (!list_empty(&nullb_list)) {
-               nullb = list_entry(nullb_list.next, struct nullb, list);
-               dev = nullb->dev;
-               null_del_dev(nullb);
-               null_free_dev(dev);
-       }
-       unregister_blkdev(null_major, "nullb");
-err_conf:
-       configfs_unregister_subsystem(&nullb_subsys);
-err_tagset:
-       if (g_queue_mode == NULL_Q_MQ && shared_tags)
-               blk_mq_free_tag_set(&tag_set);
-       return ret;
-}
-
-static void __exit null_exit(void)
-{
-       struct nullb *nullb;
-
-       configfs_unregister_subsystem(&nullb_subsys);
-
-       unregister_blkdev(null_major, "nullb");
-
-       mutex_lock(&lock);
-       while (!list_empty(&nullb_list)) {
-               struct nullb_device *dev;
-
-               nullb = list_entry(nullb_list.next, struct nullb, list);
-               dev = nullb->dev;
-               null_del_dev(nullb);
-               null_free_dev(dev);
-       }
-       mutex_unlock(&lock);
-
-       if (g_queue_mode == NULL_Q_MQ && shared_tags)
-               blk_mq_free_tag_set(&tag_set);
-}
-
-module_init(null_init);
-module_exit(null_exit);
-
-MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
new file mode 100644
index 0000000..d81781f
--- /dev/null
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_NULL_BLK_H
+#define __BLK_NULL_BLK_H
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+#include <linux/configfs.h>
+#include <linux/badblocks.h>
+#include <linux/fault-inject.h>
+
+struct nullb_cmd {
+       struct list_head list;
+       struct llist_node ll_list;
+       struct __call_single_data csd;
+       struct request *rq;
+       struct bio *bio;
+       unsigned int tag;
+       blk_status_t error;
+       struct nullb_queue *nq;
+       struct hrtimer timer;
+};
+
+struct nullb_queue {
+       unsigned long *tag_map;
+       wait_queue_head_t wait;
+       unsigned int queue_depth;
+       struct nullb_device *dev;
+       unsigned int requeue_selection;
+
+       struct nullb_cmd *cmds;
+};
+
+struct nullb_device {
+       struct nullb *nullb;
+       struct config_item item;
+       struct radix_tree_root data; /* data stored in the disk */
+       struct radix_tree_root cache; /* disk cache data */
+       unsigned long flags; /* device flags */
+       unsigned int curr_cache;
+       struct badblocks badblocks;
+
+       unsigned int nr_zones;
+       struct blk_zone *zones;
+       sector_t zone_size_sects;
+
+       unsigned long size; /* device size in MB */
+       unsigned long completion_nsec; /* time in ns to complete a request */
+       unsigned long cache_size; /* disk cache size in MB */
+       unsigned long zone_size; /* zone size in MB if device is zoned */
+       unsigned int submit_queues; /* number of submission queues */
+       unsigned int home_node; /* home node for the device */
+       unsigned int queue_mode; /* block interface */
+       unsigned int blocksize; /* block size */
+       unsigned int irqmode; /* IRQ completion handler */
+       unsigned int hw_queue_depth; /* queue depth */
+       unsigned int index; /* index of the disk, only valid with a disk */
+       unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+       bool blocking; /* blocking blk-mq device */
+       bool use_per_node_hctx; /* use per-node allocation for hardware context */
+       bool power; /* power on/off the device */
+       bool memory_backed; /* if data is stored in memory */
+       bool discard; /* if support discard */
+       bool zoned; /* if device is zoned */
+};
+
+struct nullb {
+       struct nullb_device *dev;
+       struct list_head list;
+       unsigned int index;
+       struct request_queue *q;
+       struct gendisk *disk;
+       struct blk_mq_tag_set *tag_set;
+       struct blk_mq_tag_set __tag_set;
+       unsigned int queue_depth;
+       atomic_long_t cur_bytes;
+       struct hrtimer bw_timer;
+       unsigned long cache_flush_pos;
+       spinlock_t lock;
+
+       struct nullb_queue *queues;
+       unsigned int nr_queues;
+       char disk_name[DISK_NAME_LEN];
+};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int null_zone_init(struct nullb_device *dev);
+void null_zone_exit(struct nullb_device *dev);
+blk_status_t null_zone_report(struct nullb *nullb,
+                                           struct nullb_cmd *cmd);
+void null_zone_write(struct nullb_cmd *cmd);
+void null_zone_reset(struct nullb_cmd *cmd);
+#else
+static inline int null_zone_init(struct nullb_device *dev)
+{
+       return -EINVAL;
+}
+static inline void null_zone_exit(struct nullb_device *dev) {}
+static inline blk_status_t null_zone_report(struct nullb *nullb,
+                                           struct nullb_cmd *cmd)
+{
+       return BLK_STS_NOTSUPP;
+}
+static inline void null_zone_write(struct nullb_cmd *cmd) {}
+static inline void null_zone_reset(struct nullb_cmd *cmd) {}
+#endif /* CONFIG_BLK_DEV_ZONED */
+#endif /* __BLK_NULL_BLK_H */
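Editorial note (not part of the diff): the !CONFIG_BLK_DEV_ZONED stubs above let the rest of the driver call the zone helpers unconditionally, so enabling the zoned attribute on a kernel without zoned support fails cleanly rather than failing to build. A hedged sketch of that call pattern; null_maybe_init_zones() is a hypothetical helper, not part of the patch:

/* Hypothetical caller, for illustration only. */
static int null_maybe_init_zones(struct nullb_device *dev)
{
	if (!dev->zoned)
		return 0;
	/* With CONFIG_BLK_DEV_ZONED unset, the stub above returns -EINVAL. */
	return null_zone_init(dev);
}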
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
new file mode 100644
index 0000000..86cafa6
--- /dev/null
@@ -0,0 +1,1926 @@
+/*
+ * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
+ * Shaohua Li <shli@fb.com>
+ */
+#include <linux/module.h>
+
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include "null_blk.h"
+
+#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK            (PAGE_SECTORS - 1)
+
+#define FREE_BATCH             16
+
+#define TICKS_PER_SEC          50ULL
+#define TIMER_INTERVAL         (NSEC_PER_SEC / TICKS_PER_SEC)
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static DECLARE_FAULT_ATTR(null_timeout_attr);
+static DECLARE_FAULT_ATTR(null_requeue_attr);
+#endif
+
+static inline u64 mb_per_tick(int mbps)
+{
+       return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
+}
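Editorial note (not part of the diff): with TICKS_PER_SEC = 50 the bandwidth timer fires every 20 ms, and mb_per_tick() is the byte budget refilled into cur_bytes on each tick. A quick standalone check of the arithmetic (values are illustrative):

#include <stdio.h>

#define TICKS_PER_SEC	50ULL

static unsigned long long mb_per_tick(int mbps)
{
	return (1 << 20) / TICKS_PER_SEC * (unsigned long long)mbps;
}

int main(void)
{
	/* 100 MB/s cap: 1048576 / 50 = 20971, so 2097100 bytes per 20 ms tick */
	printf("%llu bytes per tick\n", mb_per_tick(100));
	return 0;
}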
+
+/*
+ * Status flags for nullb_device.
+ *
+ * CONFIGURED: Device has been configured and turned on. Cannot reconfigure.
+ * UP:         Device is currently on and visible in userspace.
+ * THROTTLED:  Device is being throttled.
+ * CACHE:      Device is using a write-back cache.
+ */
+enum nullb_device_flags {
+       NULLB_DEV_FL_CONFIGURED = 0,
+       NULLB_DEV_FL_UP         = 1,
+       NULLB_DEV_FL_THROTTLED  = 2,
+       NULLB_DEV_FL_CACHE      = 3,
+};
+
+#define MAP_SZ         ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
+/*
+ * nullb_page is a page in memory for nullb devices.
+ *
+ * @page:      The page holding the data.
+ * @bitmap:    The bitmap represents which sector in the page has data.
+ *             Each bit represents one block size. For example, sector 8
+ *             will use the 7th bit
+ * The highest 2 bits of the bitmap are for special purposes. LOCK means the
+ * cache page is being flushed to storage. FREE means the cache page has been
+ * freed and should be skipped when flushing to storage. Please see
+ * null_make_cache_space().
+ */
+struct nullb_page {
+       struct page *page;
+       DECLARE_BITMAP(bitmap, MAP_SZ);
+};
+#define NULLB_PAGE_LOCK (MAP_SZ - 1)
+#define NULLB_PAGE_FREE (MAP_SZ - 2)
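Editorial note (not part of the diff): assuming 4 KiB pages, MAP_SZ works out to (4096 >> 9) + 2 = 10 bits: bits 0-7 track the page's eight 512-byte sectors, bit 8 (NULLB_PAGE_FREE) marks a freed cache page, and bit 9 (NULLB_PAGE_LOCK) marks a cache page currently being flushed. A tiny sanity check under that assumption:

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumption: 4 KiB pages */
#define SECTOR_SHIFT	9
#define MAP_SZ		((PAGE_SIZE >> SECTOR_SHIFT) + 2)

int main(void)
{
	printf("MAP_SZ=%lu LOCK=bit %lu FREE=bit %lu data bits=0..%lu\n",
	       MAP_SZ, MAP_SZ - 1, MAP_SZ - 2, MAP_SZ - 3);
	return 0;
}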
+
+static LIST_HEAD(nullb_list);
+static struct mutex lock;
+static int null_major;
+static DEFINE_IDA(nullb_indexes);
+static struct blk_mq_tag_set tag_set;
+
+enum {
+       NULL_IRQ_NONE           = 0,
+       NULL_IRQ_SOFTIRQ        = 1,
+       NULL_IRQ_TIMER          = 2,
+};
+
+enum {
+       NULL_Q_BIO              = 0,
+       NULL_Q_RQ               = 1,
+       NULL_Q_MQ               = 2,
+};
+
+static int g_no_sched;
+module_param_named(no_sched, g_no_sched, int, 0444);
+MODULE_PARM_DESC(no_sched, "No io scheduler");
+
+static int g_submit_queues = 1;
+module_param_named(submit_queues, g_submit_queues, int, 0444);
+MODULE_PARM_DESC(submit_queues, "Number of submission queues");
+
+static int g_home_node = NUMA_NO_NODE;
+module_param_named(home_node, g_home_node, int, 0444);
+MODULE_PARM_DESC(home_node, "Home node for the device");
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static char g_timeout_str[80];
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
+
+static char g_requeue_str[80];
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
+#endif
+
+static int g_queue_mode = NULL_Q_MQ;
+
+static int null_param_store_val(const char *str, int *val, int min, int max)
+{
+       int ret, new_val;
+
+       ret = kstrtoint(str, 10, &new_val);
+       if (ret)
+               return -EINVAL;
+
+       if (new_val < min || new_val > max)
+               return -EINVAL;
+
+       *val = new_val;
+       return 0;
+}
+
+static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
+{
+       return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+}
+
+static const struct kernel_param_ops null_queue_mode_param_ops = {
+       .set    = null_set_queue_mode,
+       .get    = param_get_int,
+};
+
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
+MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
+
+static int g_gb = 250;
+module_param_named(gb, g_gb, int, 0444);
+MODULE_PARM_DESC(gb, "Size in GB");
+
+static int g_bs = 512;
+module_param_named(bs, g_bs, int, 0444);
+MODULE_PARM_DESC(bs, "Block size (in bytes)");
+
+static int nr_devices = 1;
+module_param(nr_devices, int, 0444);
+MODULE_PARM_DESC(nr_devices, "Number of devices to register");
+
+static bool g_blocking;
+module_param_named(blocking, g_blocking, bool, 0444);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
+static bool shared_tags;
+module_param(shared_tags, bool, 0444);
+MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
+
+static int g_irqmode = NULL_IRQ_SOFTIRQ;
+
+static int null_set_irqmode(const char *str, const struct kernel_param *kp)
+{
+       return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
+                                       NULL_IRQ_TIMER);
+}
+
+static const struct kernel_param_ops null_irqmode_param_ops = {
+       .set    = null_set_irqmode,
+       .get    = param_get_int,
+};
+
+device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
+MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
+
+static unsigned long g_completion_nsec = 10000;
+module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
+MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
+
+static int g_hw_queue_depth = 64;
+module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
+MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
+
+static bool g_use_per_node_hctx;
+module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
+MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+
+static bool g_zoned;
+module_param_named(zoned, g_zoned, bool, S_IRUGO);
+MODULE_PARM_DESC(zoned, "Make the device a host-managed zoned block device. Default: false");
+
+static unsigned long g_zone_size = 256;
+module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be a power of two. Default: 256");
+
+static struct nullb_device *null_alloc_dev(void);
+static void null_free_dev(struct nullb_device *dev);
+static void null_del_dev(struct nullb *nullb);
+static int null_add_dev(struct nullb_device *dev);
+static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
+
+static inline struct nullb_device *to_nullb_device(struct config_item *item)
+{
+       return item ? container_of(item, struct nullb_device, item) : NULL;
+}
+
+static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
+{
+       return snprintf(page, PAGE_SIZE, "%u\n", val);
+}
+
+static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
+       char *page)
+{
+       return snprintf(page, PAGE_SIZE, "%lu\n", val);
+}
+
+static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
+{
+       return snprintf(page, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t nullb_device_uint_attr_store(unsigned int *val,
+       const char *page, size_t count)
+{
+       unsigned int tmp;
+       int result;
+
+       result = kstrtouint(page, 0, &tmp);
+       if (result)
+               return result;
+
+       *val = tmp;
+       return count;
+}
+
+static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
+       const char *page, size_t count)
+{
+       int result;
+       unsigned long tmp;
+
+       result = kstrtoul(page, 0, &tmp);
+       if (result)
+               return result;
+
+       *val = tmp;
+       return count;
+}
+
+static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
+       size_t count)
+{
+       bool tmp;
+       int result;
+
+       result = kstrtobool(page, &tmp);
+       if (result)
+               return result;
+
+       *val = tmp;
+       return count;
+}
+
+/* The following macro should only be used with TYPE = {uint, ulong, bool}. */
+#define NULLB_DEVICE_ATTR(NAME, TYPE)                                          \
+static ssize_t                                                                 \
+nullb_device_##NAME##_show(struct config_item *item, char *page)               \
+{                                                                              \
+       return nullb_device_##TYPE##_attr_show(                                 \
+                               to_nullb_device(item)->NAME, page);             \
+}                                                                              \
+static ssize_t                                                                 \
+nullb_device_##NAME##_store(struct config_item *item, const char *page,                \
+                           size_t count)                                       \
+{                                                                              \
+       if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags))   \
+               return -EBUSY;                                                  \
+       return nullb_device_##TYPE##_attr_store(                                \
+                       &to_nullb_device(item)->NAME, page, count);             \
+}                                                                              \
+CONFIGFS_ATTR(nullb_device_, NAME);
+
+NULLB_DEVICE_ATTR(size, ulong);
+NULLB_DEVICE_ATTR(completion_nsec, ulong);
+NULLB_DEVICE_ATTR(submit_queues, uint);
+NULLB_DEVICE_ATTR(home_node, uint);
+NULLB_DEVICE_ATTR(queue_mode, uint);
+NULLB_DEVICE_ATTR(blocksize, uint);
+NULLB_DEVICE_ATTR(irqmode, uint);
+NULLB_DEVICE_ATTR(hw_queue_depth, uint);
+NULLB_DEVICE_ATTR(index, uint);
+NULLB_DEVICE_ATTR(blocking, bool);
+NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
+NULLB_DEVICE_ATTR(memory_backed, bool);
+NULLB_DEVICE_ATTR(discard, bool);
+NULLB_DEVICE_ATTR(mbps, uint);
+NULLB_DEVICE_ATTR(cache_size, ulong);
+NULLB_DEVICE_ATTR(zoned, bool);
+NULLB_DEVICE_ATTR(zone_size, ulong);
+
+static ssize_t nullb_device_power_show(struct config_item *item, char *page)
+{
+       return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
+}
+
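+/*
+ * Writing 1 to the "power" attribute creates the block device for this
+ * configfs item; writing 0 tears it down again.
+ */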
+static ssize_t nullb_device_power_store(struct config_item *item,
+                                    const char *page, size_t count)
+{
+       struct nullb_device *dev = to_nullb_device(item);
+       bool newp = false;
+       ssize_t ret;
+
+       ret = nullb_device_bool_attr_store(&newp, page, count);
+       if (ret < 0)
+               return ret;
+
+       if (!dev->power && newp) {
+               if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
+                       return count;
+               if (null_add_dev(dev)) {
+                       clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+                       return -ENOMEM;
+               }
+
+               set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+               dev->power = newp;
+       } else if (dev->power && !newp) {
+               mutex_lock(&lock);
+               dev->power = newp;
+               null_del_dev(dev->nullb);
+               mutex_unlock(&lock);
+               clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+               clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+       }
+
+       return count;
+}
+
+CONFIGFS_ATTR(nullb_device_, power);
+
+static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
+{
+       struct nullb_device *t_dev = to_nullb_device(item);
+
+       return badblocks_show(&t_dev->badblocks, page, 0);
+}
+
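+/*
+ * Bad block ranges are written as "+<start>-<end>" to set a range and
+ * "-<start>-<end>" to clear it, with start and end given as sector numbers.
+ */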
+static ssize_t nullb_device_badblocks_store(struct config_item *item,
+                                    const char *page, size_t count)
+{
+       struct nullb_device *t_dev = to_nullb_device(item);
+       char *orig, *buf, *tmp;
+       u64 start, end;
+       int ret;
+
+       orig = kstrndup(page, count, GFP_KERNEL);
+       if (!orig)
+               return -ENOMEM;
+
+       buf = strstrip(orig);
+
+       ret = -EINVAL;
+       if (buf[0] != '+' && buf[0] != '-')
+               goto out;
+       tmp = strchr(&buf[1], '-');
+       if (!tmp)
+               goto out;
+       *tmp = '\0';
+       ret = kstrtoull(buf + 1, 0, &start);
+       if (ret)
+               goto out;
+       ret = kstrtoull(tmp + 1, 0, &end);
+       if (ret)
+               goto out;
+       ret = -EINVAL;
+       if (start > end)
+               goto out;
+       /* enable badblocks */
+       cmpxchg(&t_dev->badblocks.shift, -1, 0);
+       if (buf[0] == '+')
+               ret = badblocks_set(&t_dev->badblocks, start,
+                       end - start + 1, 1);
+       else
+               ret = badblocks_clear(&t_dev->badblocks, start,
+                       end - start + 1);
+       if (ret == 0)
+               ret = count;
+out:
+       kfree(orig);
+       return ret;
+}
+CONFIGFS_ATTR(nullb_device_, badblocks);
+
+static struct configfs_attribute *nullb_device_attrs[] = {
+       &nullb_device_attr_size,
+       &nullb_device_attr_completion_nsec,
+       &nullb_device_attr_submit_queues,
+       &nullb_device_attr_home_node,
+       &nullb_device_attr_queue_mode,
+       &nullb_device_attr_blocksize,
+       &nullb_device_attr_irqmode,
+       &nullb_device_attr_hw_queue_depth,
+       &nullb_device_attr_index,
+       &nullb_device_attr_blocking,
+       &nullb_device_attr_use_per_node_hctx,
+       &nullb_device_attr_power,
+       &nullb_device_attr_memory_backed,
+       &nullb_device_attr_discard,
+       &nullb_device_attr_mbps,
+       &nullb_device_attr_cache_size,
+       &nullb_device_attr_badblocks,
+       &nullb_device_attr_zoned,
+       &nullb_device_attr_zone_size,
+       NULL,
+};
+
+static void nullb_device_release(struct config_item *item)
+{
+       struct nullb_device *dev = to_nullb_device(item);
+
+       null_free_device_storage(dev, false);
+       null_free_dev(dev);
+}
+
+static struct configfs_item_operations nullb_device_ops = {
+       .release        = nullb_device_release,
+};
+
+static const struct config_item_type nullb_device_type = {
+       .ct_item_ops    = &nullb_device_ops,
+       .ct_attrs       = nullb_device_attrs,
+       .ct_owner       = THIS_MODULE,
+};
+
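+/*
+ * Devices are created and torn down through configfs. A hypothetical session,
+ * assuming configfs is mounted at /sys/kernel/config:
+ *
+ *   mkdir /sys/kernel/config/nullb/mydev
+ *   echo 4096 > /sys/kernel/config/nullb/mydev/blocksize
+ *   echo 1 > /sys/kernel/config/nullb/mydev/power
+ *   echo 0 > /sys/kernel/config/nullb/mydev/power
+ *   rmdir /sys/kernel/config/nullb/mydev
+ */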
+static struct
+config_item *nullb_group_make_item(struct config_group *group, const char *name)
+{
+       struct nullb_device *dev;
+
+       dev = null_alloc_dev();
+       if (!dev)
+               return ERR_PTR(-ENOMEM);
+
+       config_item_init_type_name(&dev->item, name, &nullb_device_type);
+
+       return &dev->item;
+}
+
+static void
+nullb_group_drop_item(struct config_group *group, struct config_item *item)
+{
+       struct nullb_device *dev = to_nullb_device(item);
+
+       if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
+               mutex_lock(&lock);
+               dev->power = false;
+               null_del_dev(dev->nullb);
+               mutex_unlock(&lock);
+       }
+
+       config_item_put(item);
+}
+
+static ssize_t memb_group_features_show(struct config_item *item, char *page)
+{
+       return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
+}
+
+CONFIGFS_ATTR_RO(memb_group_, features);
+
+static struct configfs_attribute *nullb_group_attrs[] = {
+       &memb_group_attr_features,
+       NULL,
+};
+
+static struct configfs_group_operations nullb_group_ops = {
+       .make_item      = nullb_group_make_item,
+       .drop_item      = nullb_group_drop_item,
+};
+
+static const struct config_item_type nullb_group_type = {
+       .ct_group_ops   = &nullb_group_ops,
+       .ct_attrs       = nullb_group_attrs,
+       .ct_owner       = THIS_MODULE,
+};
+
+static struct configfs_subsystem nullb_subsys = {
+       .su_group = {
+               .cg_item = {
+                       .ci_namebuf = "nullb",
+                       .ci_type = &nullb_group_type,
+               },
+       },
+};
+
+static inline int null_cache_active(struct nullb *nullb)
+{
+       return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+}
+
+static struct nullb_device *null_alloc_dev(void)
+{
+       struct nullb_device *dev;
+
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return NULL;
+       INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
+       INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
+       if (badblocks_init(&dev->badblocks, 0)) {
+               kfree(dev);
+               return NULL;
+       }
+
+       dev->size = g_gb * 1024;
+       dev->completion_nsec = g_completion_nsec;
+       dev->submit_queues = g_submit_queues;
+       dev->home_node = g_home_node;
+       dev->queue_mode = g_queue_mode;
+       dev->blocksize = g_bs;
+       dev->irqmode = g_irqmode;
+       dev->hw_queue_depth = g_hw_queue_depth;
+       dev->blocking = g_blocking;
+       dev->use_per_node_hctx = g_use_per_node_hctx;
+       dev->zoned = g_zoned;
+       dev->zone_size = g_zone_size;
+       return dev;
+}
+
+static void null_free_dev(struct nullb_device *dev)
+{
+       if (!dev)
+               return;
+
+       null_zone_exit(dev);
+       badblocks_exit(&dev->badblocks);
+       kfree(dev);
+}
+
+static void put_tag(struct nullb_queue *nq, unsigned int tag)
+{
+       clear_bit_unlock(tag, nq->tag_map);
+
+       if (waitqueue_active(&nq->wait))
+               wake_up(&nq->wait);
+}
+
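+/*
+ * Find a free tag in the queue's bitmap without holding a lock; retry if
+ * another context wins the race for the bit between the lookup and the
+ * atomic test_and_set.
+ */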
+static unsigned int get_tag(struct nullb_queue *nq)
+{
+       unsigned int tag;
+
+       do {
+               tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
+               if (tag >= nq->queue_depth)
+                       return -1U;
+       } while (test_and_set_bit_lock(tag, nq->tag_map));
+
+       return tag;
+}
+
+static void free_cmd(struct nullb_cmd *cmd)
+{
+       put_tag(cmd->nq, cmd->tag);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
+
+static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
+{
+       struct nullb_cmd *cmd;
+       unsigned int tag;
+
+       tag = get_tag(nq);
+       if (tag != -1U) {
+               cmd = &nq->cmds[tag];
+               cmd->tag = tag;
+               cmd->nq = nq;
+               if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+                       hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
+                                    HRTIMER_MODE_REL);
+                       cmd->timer.function = null_cmd_timer_expired;
+               }
+               return cmd;
+       }
+
+       return NULL;
+}
+
+static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
+{
+       struct nullb_cmd *cmd;
+       DEFINE_WAIT(wait);
+
+       cmd = __alloc_cmd(nq);
+       if (cmd || !can_wait)
+               return cmd;
+
+       do {
+               prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
+               cmd = __alloc_cmd(nq);
+               if (cmd)
+                       break;
+
+               io_schedule();
+       } while (1);
+
+       finish_wait(&nq->wait, &wait);
+       return cmd;
+}
+
+static void end_cmd(struct nullb_cmd *cmd)
+{
+       struct request_queue *q = NULL;
+       int queue_mode = cmd->nq->dev->queue_mode;
+
+       if (cmd->rq)
+               q = cmd->rq->q;
+
+       switch (queue_mode)  {
+       case NULL_Q_MQ:
+               blk_mq_end_request(cmd->rq, cmd->error);
+               return;
+       case NULL_Q_RQ:
+               INIT_LIST_HEAD(&cmd->rq->queuelist);
+               blk_end_request_all(cmd->rq, cmd->error);
+               break;
+       case NULL_Q_BIO:
+               cmd->bio->bi_status = cmd->error;
+               bio_endio(cmd->bio);
+               break;
+       }
+
+       free_cmd(cmd);
+
+       /* Restart queue if needed, as we are freeing a tag */
+       if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) {
+               unsigned long flags;
+
+               spin_lock_irqsave(q->queue_lock, flags);
+               blk_start_queue_async(q);
+               spin_unlock_irqrestore(q->queue_lock, flags);
+       }
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
+{
+       end_cmd(container_of(timer, struct nullb_cmd, timer));
+
+       return HRTIMER_NORESTART;
+}
+
+static void null_cmd_end_timer(struct nullb_cmd *cmd)
+{
+       ktime_t kt = cmd->nq->dev->completion_nsec;
+
+       hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
+}
+
+static void null_softirq_done_fn(struct request *rq)
+{
+       struct nullb *nullb = rq->q->queuedata;
+
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
+               end_cmd(blk_mq_rq_to_pdu(rq));
+       else
+               end_cmd(rq->special);
+}
+
+static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
+{
+       struct nullb_page *t_page;
+
+       t_page = kmalloc(sizeof(struct nullb_page), gfp_flags);
+       if (!t_page)
+               goto out;
+
+       t_page->page = alloc_pages(gfp_flags, 0);
+       if (!t_page->page)
+               goto out_freepage;
+
+       memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
+       return t_page;
+out_freepage:
+       kfree(t_page);
+out:
+       return NULL;
+}
+
+static void null_free_page(struct nullb_page *t_page)
+{
+       __set_bit(NULLB_PAGE_FREE, t_page->bitmap);
+       if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
+               return;
+       __free_page(t_page->page);
+       kfree(t_page);
+}
+
+static bool null_page_empty(struct nullb_page *page)
+{
+       int size = MAP_SZ - 2;
+
+       return find_first_bit(page->bitmap, size) == size;
+}
+
+static void null_free_sector(struct nullb *nullb, sector_t sector,
+       bool is_cache)
+{
+       unsigned int sector_bit;
+       u64 idx;
+       struct nullb_page *t_page, *ret;
+       struct radix_tree_root *root;
+
+       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       sector_bit = (sector & SECTOR_MASK);
+
+       t_page = radix_tree_lookup(root, idx);
+       if (t_page) {
+               __clear_bit(sector_bit, t_page->bitmap);
+
+               if (null_page_empty(t_page)) {
+                       ret = radix_tree_delete_item(root, idx, t_page);
+                       WARN_ON(ret != t_page);
+                       null_free_page(ret);
+                       if (is_cache)
+                               nullb->dev->curr_cache -= PAGE_SIZE;
+               }
+       }
+}
+
+static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
+       struct nullb_page *t_page, bool is_cache)
+{
+       struct radix_tree_root *root;
+
+       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+
+       if (radix_tree_insert(root, idx, t_page)) {
+               null_free_page(t_page);
+               t_page = radix_tree_lookup(root, idx);
+               WARN_ON(!t_page || t_page->page->index != idx);
+       } else if (is_cache)
+               nullb->dev->curr_cache += PAGE_SIZE;
+
+       return t_page;
+}
+
+static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
+{
+       unsigned long pos = 0;
+       int nr_pages;
+       struct nullb_page *ret, *t_pages[FREE_BATCH];
+       struct radix_tree_root *root;
+
+       root = is_cache ? &dev->cache : &dev->data;
+
+       do {
+               int i;
+
+               nr_pages = radix_tree_gang_lookup(root,
+                               (void **)t_pages, pos, FREE_BATCH);
+
+               for (i = 0; i < nr_pages; i++) {
+                       pos = t_pages[i]->page->index;
+                       ret = radix_tree_delete_item(root, pos, t_pages[i]);
+                       WARN_ON(ret != t_pages[i]);
+                       null_free_page(ret);
+               }
+
+               pos++;
+       } while (nr_pages == FREE_BATCH);
+
+       if (is_cache)
+               dev->curr_cache = 0;
+}
+
+static struct nullb_page *__null_lookup_page(struct nullb *nullb,
+       sector_t sector, bool for_write, bool is_cache)
+{
+       unsigned int sector_bit;
+       u64 idx;
+       struct nullb_page *t_page;
+       struct radix_tree_root *root;
+
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       sector_bit = (sector & SECTOR_MASK);
+
+       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+       t_page = radix_tree_lookup(root, idx);
+       WARN_ON(t_page && t_page->page->index != idx);
+
+       if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
+               return t_page;
+
+       return NULL;
+}
+
+static struct nullb_page *null_lookup_page(struct nullb *nullb,
+       sector_t sector, bool for_write, bool ignore_cache)
+{
+       struct nullb_page *page = NULL;
+
+       if (!ignore_cache)
+               page = __null_lookup_page(nullb, sector, for_write, true);
+       if (page)
+               return page;
+       return __null_lookup_page(nullb, sector, for_write, false);
+}
+
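+/*
+ * Return the page backing @sector, allocating and inserting it if necessary.
+ * nullb->lock is dropped while allocating, so the radix tree may have changed
+ * by the time the lock is re-acquired.
+ */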
+static struct nullb_page *null_insert_page(struct nullb *nullb,
+       sector_t sector, bool ignore_cache)
+{
+       u64 idx;
+       struct nullb_page *t_page;
+
+       t_page = null_lookup_page(nullb, sector, true, ignore_cache);
+       if (t_page)
+               return t_page;
+
+       spin_unlock_irq(&nullb->lock);
+
+       t_page = null_alloc_page(GFP_NOIO);
+       if (!t_page)
+               goto out_lock;
+
+       if (radix_tree_preload(GFP_NOIO))
+               goto out_freepage;
+
+       spin_lock_irq(&nullb->lock);
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       t_page->page->index = idx;
+       t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
+       radix_tree_preload_end();
+
+       return t_page;
+out_freepage:
+       null_free_page(t_page);
+out_lock:
+       spin_lock_irq(&nullb->lock);
+       return null_lookup_page(nullb, sector, true, ignore_cache);
+}
+
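+/*
+ * Write one cache page back into the corresponding data page and drop it
+ * from the cache radix tree.
+ */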
+static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
+{
+       int i;
+       unsigned int offset;
+       u64 idx;
+       struct nullb_page *t_page, *ret;
+       void *dst, *src;
+
+       idx = c_page->page->index;
+
+       t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
+
+       __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
+       if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
+               null_free_page(c_page);
+               if (t_page && null_page_empty(t_page)) {
+                       ret = radix_tree_delete_item(&nullb->dev->data,
+                               idx, t_page);
+                       null_free_page(t_page);
+               }
+               return 0;
+       }
+
+       if (!t_page)
+               return -ENOMEM;
+
+       src = kmap_atomic(c_page->page);
+       dst = kmap_atomic(t_page->page);
+
+       for (i = 0; i < PAGE_SECTORS;
+                       i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
+               if (test_bit(i, c_page->bitmap)) {
+                       offset = (i << SECTOR_SHIFT);
+                       memcpy(dst + offset, src + offset,
+                               nullb->dev->blocksize);
+                       __set_bit(i, t_page->bitmap);
+               }
+       }
+
+       kunmap_atomic(dst);
+       kunmap_atomic(src);
+
+       ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
+       null_free_page(ret);
+       nullb->dev->curr_cache -= PAGE_SIZE;
+
+       return 0;
+}
+
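+/*
+ * Flush cached pages into the data radix tree until the cache has room for
+ * @n more bytes (or is empty). Called with nullb->lock held.
+ */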
+static int null_make_cache_space(struct nullb *nullb, unsigned long n)
+{
+       int i, err, nr_pages;
+       struct nullb_page *c_pages[FREE_BATCH];
+       unsigned long flushed = 0, one_round;
+
+again:
+       if ((nullb->dev->cache_size * 1024 * 1024) >
+            nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
+               return 0;
+
+       nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
+                       (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
+       /*
+        * null_flush_cache_page() below may drop nullb->lock before it is
+        * done with the c_pages. To avoid a race, mark the pages locked so
+        * they cannot be freed in the meantime.
+        */
+       for (i = 0; i < nr_pages; i++) {
+               nullb->cache_flush_pos = c_pages[i]->page->index;
+               /*
+                * If the page is already being flushed to disk by another
+                * thread, skip it.
+                */
+               if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
+                       c_pages[i] = NULL;
+               else
+                       __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
+       }
+
+       one_round = 0;
+       for (i = 0; i < nr_pages; i++) {
+               if (c_pages[i] == NULL)
+                       continue;
+               err = null_flush_cache_page(nullb, c_pages[i]);
+               if (err)
+                       return err;
+               one_round++;
+       }
+       flushed += one_round << PAGE_SHIFT;
+
+       if (n > flushed) {
+               if (nr_pages == 0)
+                       nullb->cache_flush_pos = 0;
+               if (one_round == 0) {
+                       /* give other threads a chance */
+                       spin_unlock_irq(&nullb->lock);
+                       spin_lock_irq(&nullb->lock);
+               }
+               goto again;
+       }
+       return 0;
+}
+
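+/*
+ * Copy @n bytes from @source into the device's backing pages, going through
+ * the write cache unless caching is disabled or the write is FUA.
+ */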
+static int copy_to_nullb(struct nullb *nullb, struct page *source,
+       unsigned int off, sector_t sector, size_t n, bool is_fua)
+{
+       size_t temp, count = 0;
+       unsigned int offset;
+       struct nullb_page *t_page;
+       void *dst, *src;
+
+       while (count < n) {
+               temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+               if (null_cache_active(nullb) && !is_fua)
+                       null_make_cache_space(nullb, PAGE_SIZE);
+
+               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+               t_page = null_insert_page(nullb, sector,
+                       !null_cache_active(nullb) || is_fua);
+               if (!t_page)
+                       return -ENOSPC;
+
+               src = kmap_atomic(source);
+               dst = kmap_atomic(t_page->page);
+               memcpy(dst + offset, src + off + count, temp);
+               kunmap_atomic(dst);
+               kunmap_atomic(src);
+
+               __set_bit(sector & SECTOR_MASK, t_page->bitmap);
+
+               if (is_fua)
+                       null_free_sector(nullb, sector, true);
+
+               count += temp;
+               sector += temp >> SECTOR_SHIFT;
+       }
+       return 0;
+}
+
+static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+       unsigned int off, sector_t sector, size_t n)
+{
+       size_t temp, count = 0;
+       unsigned int offset;
+       struct nullb_page *t_page;
+       void *dst, *src;
+
+       while (count < n) {
+               temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+               t_page = null_lookup_page(nullb, sector, false,
+                       !null_cache_active(nullb));
+
+               dst = kmap_atomic(dest);
+               if (!t_page) {
+                       memset(dst + off + count, 0, temp);
+                       goto next;
+               }
+               src = kmap_atomic(t_page->page);
+               memcpy(dst + off + count, src + offset, temp);
+               kunmap_atomic(src);
+next:
+               kunmap_atomic(dst);
+
+               count += temp;
+               sector += temp >> SECTOR_SHIFT;
+       }
+       return 0;
+}
+
+static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
+{
+       size_t temp;
+
+       spin_lock_irq(&nullb->lock);
+       while (n > 0) {
+               temp = min_t(size_t, n, nullb->dev->blocksize);
+               null_free_sector(nullb, sector, false);
+               if (null_cache_active(nullb))
+                       null_free_sector(nullb, sector, true);
+               sector += temp >> SECTOR_SHIFT;
+               n -= temp;
+       }
+       spin_unlock_irq(&nullb->lock);
+}
+
+static int null_handle_flush(struct nullb *nullb)
+{
+       int err;
+
+       if (!null_cache_active(nullb))
+               return 0;
+
+       spin_lock_irq(&nullb->lock);
+       while (true) {
+               err = null_make_cache_space(nullb,
+                       nullb->dev->cache_size * 1024 * 1024);
+               if (err || nullb->dev->curr_cache == 0)
+                       break;
+       }
+
+       WARN_ON(!radix_tree_empty(&nullb->dev->cache));
+       spin_unlock_irq(&nullb->lock);
+       return err;
+}
+
+static int null_transfer(struct nullb *nullb, struct page *page,
+       unsigned int len, unsigned int off, bool is_write, sector_t sector,
+       bool is_fua)
+{
+       int err = 0;
+
+       if (!is_write) {
+               err = copy_from_nullb(nullb, page, off, sector, len);
+               flush_dcache_page(page);
+       } else {
+               flush_dcache_page(page);
+               err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+       }
+
+       return err;
+}
+
+static int null_handle_rq(struct nullb_cmd *cmd)
+{
+       struct request *rq = cmd->rq;
+       struct nullb *nullb = cmd->nq->dev->nullb;
+       int err;
+       unsigned int len;
+       sector_t sector;
+       struct req_iterator iter;
+       struct bio_vec bvec;
+
+       sector = blk_rq_pos(rq);
+
+       if (req_op(rq) == REQ_OP_DISCARD) {
+               null_handle_discard(nullb, sector, blk_rq_bytes(rq));
+               return 0;
+       }
+
+       spin_lock_irq(&nullb->lock);
+       rq_for_each_segment(bvec, rq, iter) {
+               len = bvec.bv_len;
+               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+                                    op_is_write(req_op(rq)), sector,
+                                    req_op(rq) & REQ_FUA);
+               if (err) {
+                       spin_unlock_irq(&nullb->lock);
+                       return err;
+               }
+               sector += len >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+
+       return 0;
+}
+
+static int null_handle_bio(struct nullb_cmd *cmd)
+{
+       struct bio *bio = cmd->bio;
+       struct nullb *nullb = cmd->nq->dev->nullb;
+       int err;
+       unsigned int len;
+       sector_t sector;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
+
+       sector = bio->bi_iter.bi_sector;
+
+       if (bio_op(bio) == REQ_OP_DISCARD) {
+               null_handle_discard(nullb, sector,
+                       bio_sectors(bio) << SECTOR_SHIFT);
+               return 0;
+       }
+
+       spin_lock_irq(&nullb->lock);
+       bio_for_each_segment(bvec, bio, iter) {
+               len = bvec.bv_len;
+               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+                                    op_is_write(bio_op(bio)), sector,
+                                    bio_op(bio) & REQ_FUA);
+               if (err) {
+                       spin_unlock_irq(&nullb->lock);
+                       return err;
+               }
+               sector += len >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+       return 0;
+}
+
+static void null_stop_queue(struct nullb *nullb)
+{
+       struct request_queue *q = nullb->q;
+
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
+               blk_mq_stop_hw_queues(q);
+       else {
+               spin_lock_irq(q->queue_lock);
+               blk_stop_queue(q);
+               spin_unlock_irq(q->queue_lock);
+       }
+}
+
+static void null_restart_queue_async(struct nullb *nullb)
+{
+       struct request_queue *q = nullb->q;
+       unsigned long flags;
+
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
+               blk_mq_start_stopped_hw_queues(q, true);
+       else {
+               spin_lock_irqsave(q->queue_lock, flags);
+               blk_start_queue_async(q);
+               spin_unlock_irqrestore(q->queue_lock, flags);
+       }
+}
+
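+/*
+ * Main per-command path: apply bandwidth throttling, badblocks checking,
+ * optional memory backing and zone handling, then complete the command
+ * according to the configured irqmode.
+ */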
+static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
+{
+       struct nullb_device *dev = cmd->nq->dev;
+       struct nullb *nullb = dev->nullb;
+       int err = 0;
+
+       if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
+               cmd->error = null_zone_report(nullb, cmd);
+               goto out;
+       }
+
+       if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
+               struct request *rq = cmd->rq;
+
+               if (!hrtimer_active(&nullb->bw_timer))
+                       hrtimer_restart(&nullb->bw_timer);
+
+               if (atomic_long_sub_return(blk_rq_bytes(rq),
+                               &nullb->cur_bytes) < 0) {
+                       null_stop_queue(nullb);
+                       /* race with timer */
+                       if (atomic_long_read(&nullb->cur_bytes) > 0)
+                               null_restart_queue_async(nullb);
+                       if (dev->queue_mode == NULL_Q_RQ) {
+                               struct request_queue *q = nullb->q;
+
+                               spin_lock_irq(q->queue_lock);
+                               rq->rq_flags |= RQF_DONTPREP;
+                               blk_requeue_request(q, rq);
+                               spin_unlock_irq(q->queue_lock);
+                               return BLK_STS_OK;
+                       } else
+                               /* requeue request */
+                               return BLK_STS_DEV_RESOURCE;
+               }
+       }
+
+       if (nullb->dev->badblocks.shift != -1) {
+               int bad_sectors;
+               sector_t sector, size, first_bad;
+               bool is_flush = true;
+
+               if (dev->queue_mode == NULL_Q_BIO &&
+                               bio_op(cmd->bio) != REQ_OP_FLUSH) {
+                       is_flush = false;
+                       sector = cmd->bio->bi_iter.bi_sector;
+                       size = bio_sectors(cmd->bio);
+               }
+               if (dev->queue_mode != NULL_Q_BIO &&
+                               req_op(cmd->rq) != REQ_OP_FLUSH) {
+                       is_flush = false;
+                       sector = blk_rq_pos(cmd->rq);
+                       size = blk_rq_sectors(cmd->rq);
+               }
+               if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
+                               size, &first_bad, &bad_sectors)) {
+                       cmd->error = BLK_STS_IOERR;
+                       goto out;
+               }
+       }
+
+       if (dev->memory_backed) {
+               if (dev->queue_mode == NULL_Q_BIO) {
+                       if (bio_op(cmd->bio) == REQ_OP_FLUSH)
+                               err = null_handle_flush(nullb);
+                       else
+                               err = null_handle_bio(cmd);
+               } else {
+                       if (req_op(cmd->rq) == REQ_OP_FLUSH)
+                               err = null_handle_flush(nullb);
+                       else
+                               err = null_handle_rq(cmd);
+               }
+       }
+       cmd->error = errno_to_blk_status(err);
+
+       if (!cmd->error && dev->zoned) {
+               if (req_op(cmd->rq) == REQ_OP_WRITE)
+                       null_zone_write(cmd);
+               else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET)
+                       null_zone_reset(cmd);
+       }
+out:
+       /* Complete IO by inline, softirq or timer */
+       switch (dev->irqmode) {
+       case NULL_IRQ_SOFTIRQ:
+               switch (dev->queue_mode)  {
+               case NULL_Q_MQ:
+                       blk_mq_complete_request(cmd->rq);
+                       break;
+               case NULL_Q_RQ:
+                       blk_complete_request(cmd->rq);
+                       break;
+               case NULL_Q_BIO:
+                       /*
+                        * XXX: no proper submitting cpu information available.
+                        */
+                       end_cmd(cmd);
+                       break;
+               }
+               break;
+       case NULL_IRQ_NONE:
+               end_cmd(cmd);
+               break;
+       case NULL_IRQ_TIMER:
+               null_cmd_end_timer(cmd);
+               break;
+       }
+       return BLK_STS_OK;
+}
+
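+/*
+ * Bandwidth throttling: every TIMER_INTERVAL the timer refills the byte
+ * budget for the tick and restarts any queue that was stopped when the
+ * budget ran out.
+ */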
+static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
+{
+       struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
+       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+       unsigned int mbps = nullb->dev->mbps;
+
+       if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
+               return HRTIMER_NORESTART;
+
+       atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
+       null_restart_queue_async(nullb);
+
+       hrtimer_forward_now(&nullb->bw_timer, timer_interval);
+
+       return HRTIMER_RESTART;
+}
+
+static void nullb_setup_bwtimer(struct nullb *nullb)
+{
+       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+
+       hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       nullb->bw_timer.function = nullb_bwtimer_fn;
+       atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
+       hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
+}
+
+static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
+{
+       int index = 0;
+
+       if (nullb->nr_queues != 1)
+               index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
+
+       return &nullb->queues[index];
+}
+
+static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
+{
+       struct nullb *nullb = q->queuedata;
+       struct nullb_queue *nq = nullb_to_queue(nullb);
+       struct nullb_cmd *cmd;
+
+       cmd = alloc_cmd(nq, 1);
+       cmd->bio = bio;
+
+       null_handle_cmd(cmd);
+       return BLK_QC_T_NONE;
+}
+
+static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
+{
+       pr_info("null: rq %p timed out\n", rq);
+       __blk_complete_request(rq);
+       return BLK_EH_DONE;
+}
+
+static int null_rq_prep_fn(struct request_queue *q, struct request *req)
+{
+       struct nullb *nullb = q->queuedata;
+       struct nullb_queue *nq = nullb_to_queue(nullb);
+       struct nullb_cmd *cmd;
+
+       cmd = alloc_cmd(nq, 0);
+       if (cmd) {
+               cmd->rq = req;
+               req->special = cmd;
+               return BLKPREP_OK;
+       }
+       blk_stop_queue(q);
+
+       return BLKPREP_DEFER;
+}
+
+static bool should_timeout_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (g_timeout_str[0])
+               return should_fail(&null_timeout_attr, 1);
+#endif
+       return false;
+}
+
+static bool should_requeue_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (g_requeue_str[0])
+               return should_fail(&null_requeue_attr, 1);
+#endif
+       return false;
+}
+
+static void null_request_fn(struct request_queue *q)
+{
+       struct request *rq;
+
+       while ((rq = blk_fetch_request(q)) != NULL) {
+               struct nullb_cmd *cmd = rq->special;
+
+               /* just ignore the request */
+               if (should_timeout_request(rq))
+                       continue;
+               if (should_requeue_request(rq)) {
+                       blk_requeue_request(q, rq);
+                       continue;
+               }
+
+               spin_unlock_irq(q->queue_lock);
+               null_handle_cmd(cmd);
+               spin_lock_irq(q->queue_lock);
+       }
+}
+
+static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+{
+       pr_info("null: rq %p timed out\n", rq);
+       blk_mq_complete_request(rq);
+       return BLK_EH_DONE;
+}
+
+static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
+                        const struct blk_mq_queue_data *bd)
+{
+       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+       struct nullb_queue *nq = hctx->driver_data;
+
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
+       if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+               hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+               cmd->timer.function = null_cmd_timer_expired;
+       }
+       cmd->rq = bd->rq;
+       cmd->nq = nq;
+
+       blk_mq_start_request(bd->rq);
+
+       if (should_requeue_request(bd->rq)) {
+               /*
+                * Alternate between hitting the core BUSY path and the
+                * driver-driven requeue path
+                */
+               nq->requeue_selection++;
+               if (nq->requeue_selection & 1)
+                       return BLK_STS_RESOURCE;
+               else {
+                       blk_mq_requeue_request(bd->rq, true);
+                       return BLK_STS_OK;
+               }
+       }
+       if (should_timeout_request(bd->rq))
+               return BLK_STS_OK;
+
+       return null_handle_cmd(cmd);
+}
+
+static const struct blk_mq_ops null_mq_ops = {
+       .queue_rq       = null_queue_rq,
+       .complete       = null_softirq_done_fn,
+       .timeout        = null_timeout_rq,
+};
+
+static void cleanup_queue(struct nullb_queue *nq)
+{
+       kfree(nq->tag_map);
+       kfree(nq->cmds);
+}
+
+static void cleanup_queues(struct nullb *nullb)
+{
+       int i;
+
+       for (i = 0; i < nullb->nr_queues; i++)
+               cleanup_queue(&nullb->queues[i]);
+
+       kfree(nullb->queues);
+}
+
+static void null_del_dev(struct nullb *nullb)
+{
+       struct nullb_device *dev = nullb->dev;
+
+       ida_simple_remove(&nullb_indexes, nullb->index);
+
+       list_del_init(&nullb->list);
+
+       del_gendisk(nullb->disk);
+
+       if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
+               hrtimer_cancel(&nullb->bw_timer);
+               atomic_long_set(&nullb->cur_bytes, LONG_MAX);
+               null_restart_queue_async(nullb);
+       }
+
+       blk_cleanup_queue(nullb->q);
+       if (dev->queue_mode == NULL_Q_MQ &&
+           nullb->tag_set == &nullb->__tag_set)
+               blk_mq_free_tag_set(nullb->tag_set);
+       put_disk(nullb->disk);
+       cleanup_queues(nullb);
+       if (null_cache_active(nullb))
+               null_free_device_storage(nullb->dev, true);
+       kfree(nullb);
+       dev->nullb = NULL;
+}
+
+static void null_config_discard(struct nullb *nullb)
+{
+       if (nullb->dev->discard == false)
+               return;
+       nullb->q->limits.discard_granularity = nullb->dev->blocksize;
+       nullb->q->limits.discard_alignment = nullb->dev->blocksize;
+       blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+       blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q);
+}
+
+static int null_open(struct block_device *bdev, fmode_t mode)
+{
+       return 0;
+}
+
+static void null_release(struct gendisk *disk, fmode_t mode)
+{
+}
+
+static const struct block_device_operations null_fops = {
+       .owner =        THIS_MODULE,
+       .open =         null_open,
+       .release =      null_release,
+};
+
+static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
+{
+       BUG_ON(!nullb);
+       BUG_ON(!nq);
+
+       init_waitqueue_head(&nq->wait);
+       nq->queue_depth = nullb->queue_depth;
+       nq->dev = nullb->dev;
+}
+
+static void null_init_queues(struct nullb *nullb)
+{
+       struct request_queue *q = nullb->q;
+       struct blk_mq_hw_ctx *hctx;
+       struct nullb_queue *nq;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (!hctx->nr_ctx || !hctx->tags)
+                       continue;
+               nq = &nullb->queues[i];
+               hctx->driver_data = nq;
+               null_init_queue(nullb, nq);
+               nullb->nr_queues++;
+       }
+}
+
+static int setup_commands(struct nullb_queue *nq)
+{
+       struct nullb_cmd *cmd;
+       int i, tag_size;
+
+       nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
+       if (!nq->cmds)
+               return -ENOMEM;
+
+       tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
+       nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL);
+       if (!nq->tag_map) {
+               kfree(nq->cmds);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < nq->queue_depth; i++) {
+               cmd = &nq->cmds[i];
+               INIT_LIST_HEAD(&cmd->list);
+               cmd->ll_list.next = NULL;
+               cmd->tag = -1U;
+       }
+
+       return 0;
+}
+
+static int setup_queues(struct nullb *nullb)
+{
+       nullb->queues = kcalloc(nullb->dev->submit_queues,
+                               sizeof(struct nullb_queue),
+                               GFP_KERNEL);
+       if (!nullb->queues)
+               return -ENOMEM;
+
+       nullb->nr_queues = 0;
+       nullb->queue_depth = nullb->dev->hw_queue_depth;
+
+       return 0;
+}
+
+static int init_driver_queues(struct nullb *nullb)
+{
+       struct nullb_queue *nq;
+       int i, ret = 0;
+
+       for (i = 0; i < nullb->dev->submit_queues; i++) {
+               nq = &nullb->queues[i];
+
+               null_init_queue(nullb, nq);
+
+               ret = setup_commands(nq);
+               if (ret)
+                       return ret;
+               nullb->nr_queues++;
+       }
+       return 0;
+}
+
+static int null_gendisk_register(struct nullb *nullb)
+{
+       struct gendisk *disk;
+       sector_t size;
+
+       disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
+       if (!disk)
+               return -ENOMEM;
+       size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
+       set_capacity(disk, size >> 9);
+
+       disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
+       disk->major             = null_major;
+       disk->first_minor       = nullb->index;
+       disk->fops              = &null_fops;
+       disk->private_data      = nullb;
+       disk->queue             = nullb->q;
+       strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+
+       add_disk(disk);
+       return 0;
+}
+
+static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
+{
+       set->ops = &null_mq_ops;
+       set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
+                                               g_submit_queues;
+       set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
+                                               g_hw_queue_depth;
+       set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
+       set->cmd_size   = sizeof(struct nullb_cmd);
+       set->flags = BLK_MQ_F_SHOULD_MERGE;
+       if (g_no_sched)
+               set->flags |= BLK_MQ_F_NO_SCHED;
+       set->driver_data = NULL;
+
+       if ((nullb && nullb->dev->blocking) || g_blocking)
+               set->flags |= BLK_MQ_F_BLOCKING;
+
+       return blk_mq_alloc_tag_set(set);
+}
+
+static void null_validate_conf(struct nullb_device *dev)
+{
+       dev->blocksize = round_down(dev->blocksize, 512);
+       dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
+
+       if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
+               if (dev->submit_queues != nr_online_nodes)
+                       dev->submit_queues = nr_online_nodes;
+       } else if (dev->submit_queues > nr_cpu_ids)
+               dev->submit_queues = nr_cpu_ids;
+       else if (dev->submit_queues == 0)
+               dev->submit_queues = 1;
+
+       dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
+       dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+
+       /* Memory backing allocates pages in the I/O path, so force blocking */
+       if (dev->memory_backed)
+               dev->blocking = true;
+       else /* a cache is meaningless without memory backing */
+               dev->cache_size = 0;
+       dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
+                                               dev->cache_size);
+       dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
+       /* a bio-based queue cannot be stopped, so no bandwidth limiting */
+       if (dev->queue_mode == NULL_Q_BIO)
+               dev->mbps = 0;
+}
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static bool __null_setup_fault(struct fault_attr *attr, char *str)
+{
+       if (!str[0])
+               return true;
+
+       if (!setup_fault_attr(attr, str))
+               return false;
+
+       attr->verbose = 0;
+       return true;
+}
+#endif
+
+static bool null_setup_fault(void)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
+               return false;
+       if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
+               return false;
+#endif
+       return true;
+}
+
+static int null_add_dev(struct nullb_device *dev)
+{
+       struct nullb *nullb;
+       int rv;
+
+       null_validate_conf(dev);
+
+       nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
+       if (!nullb) {
+               rv = -ENOMEM;
+               goto out;
+       }
+       nullb->dev = dev;
+       dev->nullb = nullb;
+
+       spin_lock_init(&nullb->lock);
+
+       rv = setup_queues(nullb);
+       if (rv)
+               goto out_free_nullb;
+
+       if (dev->queue_mode == NULL_Q_MQ) {
+               if (shared_tags) {
+                       nullb->tag_set = &tag_set;
+                       rv = 0;
+               } else {
+                       nullb->tag_set = &nullb->__tag_set;
+                       rv = null_init_tag_set(nullb, nullb->tag_set);
+               }
+
+               if (rv)
+                       goto out_cleanup_queues;
+
+               if (!null_setup_fault())
+                       goto out_cleanup_queues;
+
+               nullb->tag_set->timeout = 5 * HZ;
+               nullb->q = blk_mq_init_queue(nullb->tag_set);
+               if (IS_ERR(nullb->q)) {
+                       rv = -ENOMEM;
+                       goto out_cleanup_tags;
+               }
+               null_init_queues(nullb);
+       } else if (dev->queue_mode == NULL_Q_BIO) {
+               nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node,
+                                               NULL);
+               if (!nullb->q) {
+                       rv = -ENOMEM;
+                       goto out_cleanup_queues;
+               }
+               blk_queue_make_request(nullb->q, null_queue_bio);
+               rv = init_driver_queues(nullb);
+               if (rv)
+                       goto out_cleanup_blk_queue;
+       } else {
+               nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock,
+                                               dev->home_node);
+               if (!nullb->q) {
+                       rv = -ENOMEM;
+                       goto out_cleanup_queues;
+               }
+
+               if (!null_setup_fault())
+                       goto out_cleanup_blk_queue;
+
+               blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
+               blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+               blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
+               nullb->q->rq_timeout = 5 * HZ;
+               rv = init_driver_queues(nullb);
+               if (rv)
+                       goto out_cleanup_blk_queue;
+       }
+
+       if (dev->mbps) {
+               set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
+               nullb_setup_bwtimer(nullb);
+       }
+
+       if (dev->cache_size > 0) {
+               set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+               blk_queue_write_cache(nullb->q, true, true);
+               blk_queue_flush_queueable(nullb->q, true);
+       }
+
+       if (dev->zoned) {
+               rv = null_zone_init(dev);
+               if (rv)
+                       goto out_cleanup_blk_queue;
+
+               blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
+               nullb->q->limits.zoned = BLK_ZONED_HM;
+       }
+
+       nullb->q->queuedata = nullb;
+       blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
+       blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
+
+       mutex_lock(&lock);
+       nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+       dev->index = nullb->index;
+       mutex_unlock(&lock);
+
+       blk_queue_logical_block_size(nullb->q, dev->blocksize);
+       blk_queue_physical_block_size(nullb->q, dev->blocksize);
+
+       null_config_discard(nullb);
+
+       sprintf(nullb->disk_name, "nullb%d", nullb->index);
+
+       rv = null_gendisk_register(nullb);
+       if (rv)
+               goto out_cleanup_zone;
+
+       mutex_lock(&lock);
+       list_add_tail(&nullb->list, &nullb_list);
+       mutex_unlock(&lock);
+
+       return 0;
+out_cleanup_zone:
+       if (dev->zoned)
+               null_zone_exit(dev);
+out_cleanup_blk_queue:
+       blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+       if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+               blk_mq_free_tag_set(nullb->tag_set);
+out_cleanup_queues:
+       cleanup_queues(nullb);
+out_free_nullb:
+       kfree(nullb);
+out:
+       return rv;
+}
+
+static int __init null_init(void)
+{
+       int ret = 0;
+       unsigned int i;
+       struct nullb *nullb;
+       struct nullb_device *dev;
+
+       if (g_bs > PAGE_SIZE) {
+               pr_warn("null_blk: invalid block size\n");
+               pr_warn("null_blk: defaulting block size to %lu\n", PAGE_SIZE);
+               g_bs = PAGE_SIZE;
+       }
+
+       if (!is_power_of_2(g_zone_size)) {
+               pr_err("null_blk: zone_size must be power-of-two\n");
+               return -EINVAL;
+       }
+
+       if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+               if (g_submit_queues != nr_online_nodes) {
+                       pr_warn("null_blk: submit_queues param is set to %u.\n",
+                                                       nr_online_nodes);
+                       g_submit_queues = nr_online_nodes;
+               }
+       } else if (g_submit_queues > nr_cpu_ids)
+               g_submit_queues = nr_cpu_ids;
+       else if (g_submit_queues <= 0)
+               g_submit_queues = 1;
+
+       if (g_queue_mode == NULL_Q_MQ && shared_tags) {
+               ret = null_init_tag_set(NULL, &tag_set);
+               if (ret)
+                       return ret;
+       }
+
+       config_group_init(&nullb_subsys.su_group);
+       mutex_init(&nullb_subsys.su_mutex);
+
+       ret = configfs_register_subsystem(&nullb_subsys);
+       if (ret)
+               goto err_tagset;
+
+       mutex_init(&lock);
+
+       null_major = register_blkdev(0, "nullb");
+       if (null_major < 0) {
+               ret = null_major;
+               goto err_conf;
+       }
+
+       for (i = 0; i < nr_devices; i++) {
+               dev = null_alloc_dev();
+               if (!dev) {
+                       ret = -ENOMEM;
+                       goto err_dev;
+               }
+               ret = null_add_dev(dev);
+               if (ret) {
+                       null_free_dev(dev);
+                       goto err_dev;
+               }
+       }
+
+       pr_info("null: module loaded\n");
+       return 0;
+
+err_dev:
+       while (!list_empty(&nullb_list)) {
+               nullb = list_entry(nullb_list.next, struct nullb, list);
+               dev = nullb->dev;
+               null_del_dev(nullb);
+               null_free_dev(dev);
+       }
+       unregister_blkdev(null_major, "nullb");
+err_conf:
+       configfs_unregister_subsystem(&nullb_subsys);
+err_tagset:
+       if (g_queue_mode == NULL_Q_MQ && shared_tags)
+               blk_mq_free_tag_set(&tag_set);
+       return ret;
+}
+
+static void __exit null_exit(void)
+{
+       struct nullb *nullb;
+
+       configfs_unregister_subsystem(&nullb_subsys);
+
+       unregister_blkdev(null_major, "nullb");
+
+       mutex_lock(&lock);
+       while (!list_empty(&nullb_list)) {
+               struct nullb_device *dev;
+
+               nullb = list_entry(nullb_list.next, struct nullb, list);
+               dev = nullb->dev;
+               null_del_dev(nullb);
+               null_free_dev(dev);
+       }
+       mutex_unlock(&lock);
+
+       if (g_queue_mode == NULL_Q_MQ && shared_tags)
+               blk_mq_free_tag_set(&tag_set);
+}
+
+module_init(null_init);
+module_exit(null_exit);
+
+MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
new file mode 100644
index 0000000..a979ca0
--- /dev/null
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/vmalloc.h>
+#include "null_blk.h"
+
+/* Shift to convert a zone_size in MB into a number of 512-byte sectors. */
+#define ZONE_SIZE_SHIFT                11
+
+static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
+{
+       return sect >> ilog2(dev->zone_size_sects);
+}
+
+int null_zone_init(struct nullb_device *dev)
+{
+       sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
+       sector_t sector = 0;
+       unsigned int i;
+
+       if (!is_power_of_2(dev->zone_size)) {
+               pr_err("null_blk: zone_size must be power-of-two\n");
+               return -EINVAL;
+       }
+
+       dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
+       dev->nr_zones = dev_size >>
+                               (SECTOR_SHIFT + ilog2(dev->zone_size_sects));
+       dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
+                       GFP_KERNEL | __GFP_ZERO);
+       if (!dev->zones)
+               return -ENOMEM;
+
+       for (i = 0; i < dev->nr_zones; i++) {
+               struct blk_zone *zone = &dev->zones[i];
+
+               zone->start = zone->wp = sector;
+               zone->len = dev->zone_size_sects;
+               zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+               zone->cond = BLK_ZONE_COND_EMPTY;
+
+               sector += dev->zone_size_sects;
+       }
+
+       return 0;
+}
+
+void null_zone_exit(struct nullb_device *dev)
+{
+       kvfree(dev->zones);
+}
+
+static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq,
+                             unsigned int zno, unsigned int nr_zones)
+{
+       struct blk_zone_report_hdr *hdr = NULL;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
+       void *addr;
+       unsigned int zones_to_cpy;
+
+       bio_for_each_segment(bvec, rq->bio, iter) {
+               addr = kmap_atomic(bvec.bv_page);
+
+               zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone);
+
+               if (!hdr) {
+                       hdr = (struct blk_zone_report_hdr *)addr;
+                       hdr->nr_zones = nr_zones;
+                       zones_to_cpy--;
+                       addr += sizeof(struct blk_zone_report_hdr);
+               }
+
+               zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones);
+
+               memcpy(addr, &dev->zones[zno],
+                               zones_to_cpy * sizeof(struct blk_zone));
+
+               kunmap_atomic(addr);
+
+               nr_zones -= zones_to_cpy;
+               zno += zones_to_cpy;
+
+               if (!nr_zones)
+                       break;
+       }
+}
+
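+/*
+ * Service a REQ_OP_ZONE_REPORT request: the payload starts with a
+ * blk_zone_report_hdr followed by as many struct blk_zone entries as fit.
+ */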
+blk_status_t null_zone_report(struct nullb *nullb,
+                                    struct nullb_cmd *cmd)
+{
+       struct nullb_device *dev = nullb->dev;
+       struct request *rq = cmd->rq;
+       unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+       unsigned int nr_zones = dev->nr_zones - zno;
+       unsigned int max_zones = (blk_rq_bytes(rq) /
+                                       sizeof(struct blk_zone)) - 1;
+
+       nr_zones = min_t(unsigned int, nr_zones, max_zones);
+
+       null_zone_fill_rq(nullb->dev, rq, zno, nr_zones);
+
+       return BLK_STS_OK;
+}
+
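+/*
+ * Sequential write rule for host-managed zones: a write must start at the
+ * zone's write pointer; the write pointer then advances and the zone goes
+ * to FULL once it reaches the end of the zone.
+ */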
+void null_zone_write(struct nullb_cmd *cmd)
+{
+       struct nullb_device *dev = cmd->nq->dev;
+       struct request *rq = cmd->rq;
+       sector_t sector = blk_rq_pos(rq);
+       unsigned int rq_sectors = blk_rq_sectors(rq);
+       unsigned int zno = null_zone_no(dev, sector);
+       struct blk_zone *zone = &dev->zones[zno];
+
+       switch (zone->cond) {
+       case BLK_ZONE_COND_FULL:
+               /* Cannot write to a full zone */
+               cmd->error = BLK_STS_IOERR;
+               break;
+       case BLK_ZONE_COND_EMPTY:
+       case BLK_ZONE_COND_IMP_OPEN:
+               /* Writes must be at the write pointer position */
+               if (blk_rq_pos(rq) != zone->wp) {
+                       cmd->error = BLK_STS_IOERR;
+                       break;
+               }
+
+               if (zone->cond == BLK_ZONE_COND_EMPTY)
+                       zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+               zone->wp += rq_sectors;
+               if (zone->wp == zone->start + zone->len)
+                       zone->cond = BLK_ZONE_COND_FULL;
+               break;
+       default:
+               /* Invalid zone condition */
+               cmd->error = BLK_STS_IOERR;
+               break;
+       }
+}
+
+void null_zone_reset(struct nullb_cmd *cmd)
+{
+       struct nullb_device *dev = cmd->nq->dev;
+       struct request *rq = cmd->rq;
+       unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+       struct blk_zone *zone = &dev->zones[zno];
+
+       zone->cond = BLK_ZONE_COND_EMPTY;
+       zone->wp = zone->start;
+}
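
A quick illustration of the zone geometry set up above: zone_size is given in MB, ZONE_SIZE_SHIFT (11) converts MB to 512-byte sectors (2^11 * 512 bytes = 1 MB), nr_zones is the device size divided by the zone size, and null_zone_no() maps a sector to its zone by shifting by ilog2(zone_size_sects), which is also why zone_size must be a power of two. The userspace sketch below reproduces only that arithmetic, with assumed inputs (a 1024 MB device with 256 MB zones); ilog2_u64() is a local stand-in for the kernel's ilog2().

    /*
     * Standalone sketch of the arithmetic in null_zone_init() and
     * null_zone_no(). Inputs (1024 MB device, 256 MB zones) are assumed
     * for illustration only.
     */
    #include <stdio.h>
    #include <inttypes.h>

    #define SECTOR_SHIFT    9   /* 512-byte sectors */
    #define ZONE_SIZE_SHIFT 11  /* MB -> sectors: 1 MB = 2^11 * 512 bytes */

    static unsigned int ilog2_u64(uint64_t v)
    {
            unsigned int r = 0;

            while (v >>= 1)
                    r++;
            return r;
    }

    int main(void)
    {
            uint64_t size_mb = 1024, zone_size_mb = 256;   /* assumed module params */
            uint64_t dev_size = size_mb * 1024 * 1024;     /* device size in bytes */
            uint64_t zone_size_sects = zone_size_mb << ZONE_SIZE_SHIFT;
            uint64_t nr_zones = dev_size >> (SECTOR_SHIFT + ilog2_u64(zone_size_sects));
            uint64_t sect = 600000;                        /* arbitrary request sector */

            /* 256 MB zones -> 524288 sectors/zone, 4 zones, sector 600000 lands in zone 1 */
            printf("zone_size_sects=%" PRIu64 " nr_zones=%" PRIu64 " zone_no=%" PRIu64 "\n",
                   zone_size_sects, nr_zones, sect >> ilog2_u64(zone_size_sects));
            return 0;
    }
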
index 4f27e73..f5f63ca 100644 (file)
@@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
 
 static void bpck_read_eeprom ( PIA *pi, char * buf )
 
-{       int i,j,k,n,p,v,f, om, od;
+{       int i, j, k, p, v, f, om, od;
 
        bpck_force_spp(pi);
 
@@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf )
 
        bpck_connect(pi);
        
-       n = 0;
        WR(4,0);
        for (i=0;i<64;i++) {
            WR(6,8);  
index b3f83cd..e285413 100644 (file)
@@ -67,7 +67,7 @@
 #include <scsi/scsi.h>
 #include <linux/debugfs.h>
 #include <linux/device.h>
-
+#include <linux/nospec.h>
 #include <linux/uaccess.h>
 
 #define DRIVER_NAME    "pktcdvd"
@@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index)
 static void pkt_dump_sense(struct pktcdvd_device *pd,
                           struct packet_command *cgc)
 {
-       struct request_sense *sense = cgc->sense;
+       struct scsi_sense_hdr *sshdr = cgc->sshdr;
 
-       if (sense)
+       if (sshdr)
                pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
                        CDROM_PACKET_SIZE, cgc->cmd,
-                       sense->sense_key, sense->asc, sense->ascq,
-                       sense_key_string(sense->sense_key));
+                       sshdr->sense_key, sshdr->asc, sshdr->ascq,
+                       sense_key_string(sshdr->sense_key));
        else
                pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
 }
@@ -787,18 +787,19 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
                                unsigned write_speed, unsigned read_speed)
 {
        struct packet_command cgc;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        int ret;
 
        init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-       cgc.sense = &sense;
+       cgc.sshdr = &sshdr;
        cgc.cmd[0] = GPCMD_SET_SPEED;
        cgc.cmd[2] = (read_speed >> 8) & 0xff;
        cgc.cmd[3] = read_speed & 0xff;
        cgc.cmd[4] = (write_speed >> 8) & 0xff;
        cgc.cmd[5] = write_speed & 0xff;
 
-       if ((ret = pkt_generic_packet(pd, &cgc)))
+       ret = pkt_generic_packet(pd, &cgc);
+       if (ret)
                pkt_dump_sense(pd, &cgc);
 
        return ret;
@@ -1562,7 +1563,8 @@ static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
        cgc.cmd[8] = cgc.buflen = 2;
        cgc.quiet = 1;
 
-       if ((ret = pkt_generic_packet(pd, &cgc)))
+       ret = pkt_generic_packet(pd, &cgc);
+       if (ret)
                return ret;
 
        /* not all drives have the same disc_info length, so requeue
@@ -1591,7 +1593,8 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type,
        cgc.cmd[8] = 8;
        cgc.quiet = 1;
 
-       if ((ret = pkt_generic_packet(pd, &cgc)))
+       ret = pkt_generic_packet(pd, &cgc);
+       if (ret)
                return ret;
 
        cgc.buflen = be16_to_cpu(ti->track_information_length) +
@@ -1612,17 +1615,20 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
        __u32 last_track;
        int ret = -1;
 
-       if ((ret = pkt_get_disc_info(pd, &di)))
+       ret = pkt_get_disc_info(pd, &di);
+       if (ret)
                return ret;
 
        last_track = (di.last_track_msb << 8) | di.last_track_lsb;
-       if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
+       ret = pkt_get_track_info(pd, last_track, 1, &ti);
+       if (ret)
                return ret;
 
        /* if this track is blank, try the previous. */
        if (ti.blank) {
                last_track--;
-               if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
+               ret = pkt_get_track_info(pd, last_track, 1, &ti);
+               if (ret)
                        return ret;
        }
 
@@ -1645,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
 static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 {
        struct packet_command cgc;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        write_param_page *wp;
        char buffer[128];
        int ret, size;
@@ -1656,8 +1662,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
 
        memset(buffer, 0, sizeof(buffer));
        init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
-       cgc.sense = &sense;
-       if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
+       cgc.sshdr = &sshdr;
+       ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
+       if (ret) {
                pkt_dump_sense(pd, &cgc);
                return ret;
        }
@@ -1671,8 +1678,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
         * now get it all
         */
        init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
-       cgc.sense = &sense;
-       if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
+       cgc.sshdr = &sshdr;
+       ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
+       if (ret) {
                pkt_dump_sense(pd, &cgc);
                return ret;
        }
@@ -1714,7 +1722,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
        wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
 
        cgc.buflen = cgc.cmd[8] = size;
-       if ((ret = pkt_mode_select(pd, &cgc))) {
+       ret = pkt_mode_select(pd, &cgc);
+       if (ret) {
                pkt_dump_sense(pd, &cgc);
                return ret;
        }
@@ -1819,7 +1828,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
        memset(&di, 0, sizeof(disc_information));
        memset(&ti, 0, sizeof(track_information));
 
-       if ((ret = pkt_get_disc_info(pd, &di))) {
+       ret = pkt_get_disc_info(pd, &di);
+       if (ret) {
                pkt_err(pd, "failed get_disc\n");
                return ret;
        }
@@ -1830,7 +1840,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
        pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
 
        track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
-       if ((ret = pkt_get_track_info(pd, track, 1, &ti))) {
+       ret = pkt_get_track_info(pd, track, 1, &ti);
+       if (ret) {
                pkt_err(pd, "failed get_track\n");
                return ret;
        }
@@ -1905,12 +1916,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
                                                int set)
 {
        struct packet_command cgc;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        unsigned char buf[64];
        int ret;
 
        init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
-       cgc.sense = &sense;
+       cgc.sshdr = &sshdr;
        cgc.buflen = pd->mode_offset + 12;
 
        /*
@@ -1918,7 +1929,8 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
         */
        cgc.quiet = 1;
 
-       if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0)))
+       ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
+       if (ret)
                return ret;
 
        buf[pd->mode_offset + 10] |= (!!set << 2);
@@ -1950,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
                                                unsigned *write_speed)
 {
        struct packet_command cgc;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        unsigned char buf[256+18];
        unsigned char *cap_buf;
        int ret, offset;
 
        cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
        init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
-       cgc.sense = &sense;
+       cgc.sshdr = &sshdr;
 
        ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
        if (ret) {
@@ -2011,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
                                                unsigned *speed)
 {
        struct packet_command cgc;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        unsigned char buf[64];
        unsigned int size, st, sp;
        int ret;
 
        init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
-       cgc.sense = &sense;
+       cgc.sshdr = &sshdr;
        cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
        cgc.cmd[1] = 2;
        cgc.cmd[2] = 4; /* READ ATIP */
@@ -2032,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
                size = sizeof(buf);
 
        init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
-       cgc.sense = &sense;
+       cgc.sshdr = &sshdr;
        cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
        cgc.cmd[1] = 2;
        cgc.cmd[2] = 4;
@@ -2083,17 +2095,18 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
 static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
 {
        struct packet_command cgc;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        int ret;
 
        pkt_dbg(2, pd, "Performing OPC\n");
 
        init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-       cgc.sense = &sense;
+       cgc.sshdr = &sshdr;
        cgc.timeout = 60*HZ;
        cgc.cmd[0] = GPCMD_SEND_OPC;
        cgc.cmd[1] = 1;
-       if ((ret = pkt_generic_packet(pd, &cgc)))
+       ret = pkt_generic_packet(pd, &cgc);
+       if (ret)
                pkt_dump_sense(pd, &cgc);
        return ret;
 }
@@ -2103,19 +2116,22 @@ static int pkt_open_write(struct pktcdvd_device *pd)
        int ret;
        unsigned int write_speed, media_write_speed, read_speed;
 
-       if ((ret = pkt_probe_settings(pd))) {
+       ret = pkt_probe_settings(pd);
+       if (ret) {
                pkt_dbg(2, pd, "failed probe\n");
                return ret;
        }
 
-       if ((ret = pkt_set_write_settings(pd))) {
+       ret = pkt_set_write_settings(pd);
+       if (ret) {
                pkt_dbg(1, pd, "failed saving write settings\n");
                return -EIO;
        }
 
        pkt_write_caching(pd, USE_WCACHING);
 
-       if ((ret = pkt_get_max_speed(pd, &write_speed)))
+       ret = pkt_get_max_speed(pd, &write_speed);
+       if (ret)
                write_speed = 16 * 177;
        switch (pd->mmc3_profile) {
                case 0x13: /* DVD-RW */
@@ -2124,7 +2140,8 @@ static int pkt_open_write(struct pktcdvd_device *pd)
                        pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
                        break;
                default:
-                       if ((ret = pkt_media_speed(pd, &media_write_speed)))
+                       ret = pkt_media_speed(pd, &media_write_speed);
+                       if (ret)
                                media_write_speed = 16;
                        write_speed = min(write_speed, media_write_speed * 177);
                        pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
@@ -2132,14 +2149,16 @@ static int pkt_open_write(struct pktcdvd_device *pd)
        }
        read_speed = write_speed;
 
-       if ((ret = pkt_set_speed(pd, write_speed, read_speed))) {
+       ret = pkt_set_speed(pd, write_speed, read_speed);
+       if (ret) {
                pkt_dbg(1, pd, "couldn't set write speed\n");
                return -EIO;
        }
        pd->write_speed = write_speed;
        pd->read_speed = read_speed;
 
-       if ((ret = pkt_perform_opc(pd))) {
+       ret = pkt_perform_opc(pd);
+       if (ret) {
                pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
        }
 
@@ -2161,10 +2180,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
         * so bdget() can't fail.
         */
        bdget(pd->bdev->bd_dev);
-       if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
+       ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd);
+       if (ret)
                goto out;
 
-       if ((ret = pkt_get_last_written(pd, &lba))) {
+       ret = pkt_get_last_written(pd, &lba);
+       if (ret) {
                pkt_err(pd, "pkt_get_last_written failed\n");
                goto out_putdev;
        }
@@ -2175,7 +2196,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 
        q = bdev_get_queue(pd->bdev);
        if (write) {
-               if ((ret = pkt_open_write(pd)))
+               ret = pkt_open_write(pd);
+               if (ret)
                        goto out_putdev;
                /*
                 * Some CDRW drives can not handle writes larger than one packet,
@@ -2190,7 +2212,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
                clear_bit(PACKET_WRITABLE, &pd->flags);
        }
 
-       if ((ret = pkt_set_segment_merging(pd, q)))
+       ret = pkt_set_segment_merging(pd, q);
+       if (ret)
                goto out_putdev;
 
        if (write) {
@@ -2231,6 +2254,8 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
 {
        if (dev_minor >= MAX_WRITERS)
                return NULL;
+
+       dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
        return pkt_devs[dev_minor];
 }
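
Most of the pktcdvd.c hunks are two mechanical conversions: struct request_sense is replaced by struct scsi_sense_hdr in struct packet_command, and assignments are hoisted out of if-conditions (the old "if ((ret = f()))" pattern). The one functional addition is in pkt_find_dev_from_minor(), where array_index_nospec() from the newly included <linux/nospec.h> clamps the already bounds-checked minor number before it indexes pkt_devs[], so a mispredicted branch cannot speculatively read out of bounds. The sketch below only illustrates the assignment-hoisting style; do_packet_command() is a made-up stand-in, not a pktcdvd function.

    /* Error-handling style after the conversion: assign first, then test. */
    #include <stdio.h>

    static int do_packet_command(void)
    {
            return -5;      /* pretend the command failed (-EIO) */
    }

    int main(void)
    {
            int ret;

            ret = do_packet_command();      /* was: if ((ret = do_packet_command())) */
            if (ret)
                    printf("command failed: %d\n", ret);
            return ret ? 1 : 0;
    }
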
 
index dddb3f2..1a92f9e 100644 (file)
@@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = {
 
 static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
 {
-       generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio),
+       generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio),
                             &card->gendisk->part0);
 }
 
@@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card,
                                struct bio *bio,
                                unsigned long start_time)
 {
-       generic_end_io_acct(card->queue, bio_data_dir(bio),
-                               &card->gendisk->part0, start_time);
+       generic_end_io_acct(card->queue, bio_op(bio),
+                           &card->gendisk->part0, start_time);
 }
 
 static void bio_dma_done_cb(struct rsxx_cardinfo *card,
index bc7aea6..87b9e7f 100644 (file)
@@ -657,8 +657,8 @@ static bool skd_preop_sg_list(struct skd_device *skdev,
 
        if (unlikely(skdev->dbg_level > 1)) {
                dev_dbg(&skdev->pdev->dev,
-                       "skreq=%x sksg_list=%p sksg_dma=%llx\n",
-                       skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+                       "skreq=%x sksg_list=%p sksg_dma=%pad\n",
+                       skreq->id, skreq->sksg_list, &skreq->sksg_dma_address);
                for (i = 0; i < n_sg; i++) {
                        struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
 
@@ -1190,8 +1190,8 @@ static void skd_send_fitmsg(struct skd_device *skdev,
 {
        u64 qcmd;
 
-       dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n",
-               skmsg->mb_dma_address, skd_in_flight(skdev));
+       dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n",
+               &skmsg->mb_dma_address, skd_in_flight(skdev));
        dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf);
 
        qcmd = skmsg->mb_dma_address;
@@ -1250,9 +1250,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev,
                }
 
                dev_dbg(&skdev->pdev->dev,
-                       "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
+                       "skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n",
                        skspcl, skspcl->req.id, skspcl->req.sksg_list,
-                       skspcl->req.sksg_dma_address);
+                       &skspcl->req.sksg_dma_address);
                for (i = 0; i < skspcl->req.n_sg; i++) {
                        struct fit_sg_descriptor *sgd =
                                &skspcl->req.sksg_list[i];
@@ -2685,8 +2685,8 @@ static int skd_cons_skmsg(struct skd_device *skdev)
 
                WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) &
                     (FIT_QCMD_ALIGN - 1),
-                    "not aligned: msg_buf %p mb_dma_address %#llx\n",
-                    skmsg->msg_buf, skmsg->mb_dma_address);
+                    "not aligned: msg_buf %p mb_dma_address %pad\n",
+                    skmsg->msg_buf, &skmsg->mb_dma_address);
                memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
        }
 
index b5cedcc..94300db 100644 (file)
@@ -251,14 +251,9 @@ static DEFINE_SPINLOCK(minor_lock);
 #define GRANTS_PER_INDIRECT_FRAME \
        (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
 
-#define PSEGS_PER_INDIRECT_FRAME       \
-       (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS)
-
 #define INDIRECT_GREFS(_grants)                \
        DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
 
-#define GREFS(_psegs)  ((_psegs) * GRANTS_PER_PSEG)
-
 static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
 static void blkfront_gather_backend_features(struct blkfront_info *info);
 static int negotiate_mq(struct blkfront_info *info);
index 7436b2d..2907a81 100644 (file)
@@ -1274,17 +1274,16 @@ static void zram_bio_discard(struct zram *zram, u32 index,
  * Returns 1 if IO request was successfully submitted.
  */
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-                       int offset, bool is_write, struct bio *bio)
+                       int offset, unsigned int op, struct bio *bio)
 {
        unsigned long start_time = jiffies;
-       int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
        struct request_queue *q = zram->disk->queue;
        int ret;
 
-       generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+       generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
                        &zram->disk->part0);
 
-       if (!is_write) {
+       if (!op_is_write(op)) {
                atomic64_inc(&zram->stats.num_reads);
                ret = zram_bvec_read(zram, bvec, index, offset, bio);
                flush_dcache_page(bvec->bv_page);
@@ -1293,14 +1292,14 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
                ret = zram_bvec_write(zram, bvec, index, offset, bio);
        }
 
-       generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
+       generic_end_io_acct(q, op, &zram->disk->part0, start_time);
 
        zram_slot_lock(zram, index);
        zram_accessed(zram, index);
        zram_slot_unlock(zram, index);
 
        if (unlikely(ret < 0)) {
-               if (!is_write)
+               if (!op_is_write(op))
                        atomic64_inc(&zram->stats.failed_reads);
                else
                        atomic64_inc(&zram->stats.failed_writes);
@@ -1338,7 +1337,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
                        bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
                                                        unwritten);
                        if (zram_bvec_rw(zram, &bv, index, offset,
-                                       op_is_write(bio_op(bio)), bio) < 0)
+                                        bio_op(bio), bio) < 0)
                                goto out;
 
                        bv.bv_offset += bv.bv_len;
@@ -1390,7 +1389,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 }
 
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
-                      struct page *page, bool is_write)
+                      struct page *page, unsigned int op)
 {
        int offset, ret;
        u32 index;
@@ -1414,7 +1413,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
        bv.bv_len = PAGE_SIZE;
        bv.bv_offset = 0;
 
-       ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
+       ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
 out:
        /*
         * If I/O fails, just return error(ie, non-zero) without
@@ -1429,7 +1428,7 @@ out:
 
        switch (ret) {
        case 0:
-               page_endio(page, is_write, 0);
+               page_endio(page, op_is_write(op), 0);
                break;
        case 1:
                ret = 0;
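
The zram conversion threads the request operation (bio_op()) through zram_bvec_rw() and zram_rw_page() instead of a precomputed is_write flag, so the accounting helpers see the real REQ_OP_* value and the direction test becomes op_is_write(). The sketch below shows only that direction test; the REQ_OP_* values are reproduced here for illustration and follow the kernel convention that write-like (data-out) operations have odd numbers.

    /* Sketch of the op_is_write() test that replaces the old is_write flag.
     * The enum values mirror include/linux/blk_types.h, copied here only
     * so the example is self-contained. */
    #include <stdbool.h>
    #include <stdio.h>

    enum req_op {
            REQ_OP_READ    = 0,
            REQ_OP_WRITE   = 1,
            REQ_OP_DISCARD = 3,
    };

    static bool op_is_write(unsigned int op)
    {
            return op & 1;  /* odd op numbers carry data to the device */
    }

    int main(void)
    {
            printf("READ    is write? %d\n", op_is_write(REQ_OP_READ));
            printf("WRITE   is write? %d\n", op_is_write(REQ_OP_WRITE));
            printf("DISCARD is write? %d\n", op_is_write(REQ_OP_DISCARD));
            return 0;
    }
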
index a78b8e7..113fc6e 100644 (file)
 #include <linux/blkdev.h>
 #include <linux/times.h>
 #include <linux/uaccess.h>
+#include <scsi/scsi_common.h>
 #include <scsi/scsi_request.h>
 
 /* used to tell the module to turn on full debugging messages */
@@ -345,10 +346,10 @@ static LIST_HEAD(cdrom_list);
 int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
                               struct packet_command *cgc)
 {
-       if (cgc->sense) {
-               cgc->sense->sense_key = 0x05;
-               cgc->sense->asc = 0x20;
-               cgc->sense->ascq = 0x00;
+       if (cgc->sshdr) {
+               cgc->sshdr->sense_key = 0x05;
+               cgc->sshdr->asc = 0x20;
+               cgc->sshdr->ascq = 0x00;
        }
 
        cgc->stat = -EIO;
@@ -2222,9 +2223,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
                blk_execute_rq(q, cdi->disk, rq, 0);
                if (scsi_req(rq)->result) {
-                       struct request_sense *s = req->sense;
+                       struct scsi_sense_hdr sshdr;
+
                        ret = -EIO;
-                       cdi->last_sense = s->sense_key;
+                       scsi_normalize_sense(req->sense, req->sense_len,
+                                            &sshdr);
+                       cdi->last_sense = sshdr.sense_key;
                }
 
                if (blk_rq_unmap_user(bio))
@@ -2943,7 +2947,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
                                              struct packet_command *cgc,
                                              int cmd)
 {
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        struct cdrom_msf msf;
        int blocksize = 0, format = 0, lba;
        int ret;
@@ -2971,13 +2975,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
        if (cgc->buffer == NULL)
                return -ENOMEM;
 
-       memset(&sense, 0, sizeof(sense));
-       cgc->sense = &sense;
+       memset(&sshdr, 0, sizeof(sshdr));
+       cgc->sshdr = &sshdr;
        cgc->data_direction = CGC_DATA_READ;
        ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
-       if (ret && sense.sense_key == 0x05 &&
-           sense.asc == 0x20 &&
-           sense.ascq == 0x00) {
+       if (ret && sshdr.sense_key == 0x05 &&
+           sshdr.asc == 0x20 &&
+           sshdr.ascq == 0x00) {
                /*
                 * SCSI-II devices are not required to support
                 * READ_CD, so let's try switching block size
@@ -2986,7 +2990,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
                ret = cdrom_switch_blocksize(cdi, blocksize);
                if (ret)
                        goto out;
-               cgc->sense = NULL;
+               cgc->sshdr = NULL;
                ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
                ret |= cdrom_switch_blocksize(cdi, blocksize);
        }
index 5f17838..44a7a25 100644 (file)
@@ -419,10 +419,11 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
                    int write, void *buffer, unsigned *bufflen,
-                   struct request_sense *sense, int timeout,
+                   struct scsi_sense_hdr *sshdr, int timeout,
                    req_flags_t rq_flags)
 {
        struct cdrom_info *info = drive->driver_data;
+       struct scsi_sense_hdr local_sshdr;
        int retries = 10;
        bool failed;
 
@@ -430,6 +431,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
                                  "rq_flags: 0x%x",
                                  cmd[0], write, timeout, rq_flags);
 
+       if (!sshdr)
+               sshdr = &local_sshdr;
+
        /* start of retry loop */
        do {
                struct request *rq;
@@ -456,8 +460,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 
                if (buffer)
                        *bufflen = scsi_req(rq)->resid_len;
-               if (sense)
-                       memcpy(sense, scsi_req(rq)->sense, sizeof(*sense));
+               scsi_normalize_sense(scsi_req(rq)->sense,
+                                    scsi_req(rq)->sense_len, sshdr);
 
                /*
                 * FIXME: we should probably abort/retry or something in case of
@@ -469,12 +473,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
                         * The request failed.  Retry if it was due to a unit
                         * attention status (usually means media was changed).
                         */
-                       struct request_sense *reqbuf = scsi_req(rq)->sense;
-
-                       if (reqbuf->sense_key == UNIT_ATTENTION)
+                       if (sshdr->sense_key == UNIT_ATTENTION)
                                cdrom_saw_media_change(drive);
-                       else if (reqbuf->sense_key == NOT_READY &&
-                                reqbuf->asc == 4 && reqbuf->ascq != 4) {
+                       else if (sshdr->sense_key == NOT_READY &&
+                                sshdr->asc == 4 && sshdr->ascq != 4) {
                                /*
                                 * The drive is in the process of loading
                                 * a disk.  Retry, but wait a little to give
@@ -864,7 +866,7 @@ static void msf_from_bcd(struct atapi_msf *msf)
        msf->frame  = bcd2bin(msf->frame);
 }
 
-int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
+int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr)
 {
        struct cdrom_info *info = drive->driver_data;
        struct cdrom_device_info *cdi;
@@ -886,12 +888,11 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
         */
        cmd[7] = cdi->sanyo_slot % 3;
 
-       return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET);
+       return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET);
 }
 
 static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
-                              unsigned long *sectors_per_frame,
-                              struct request_sense *sense)
+                              unsigned long *sectors_per_frame)
 {
        struct {
                __be32 lba;
@@ -908,7 +909,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
        memset(cmd, 0, BLK_MAX_CDB);
        cmd[0] = GPCMD_READ_CDVD_CAPACITY;
 
-       stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0,
+       stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0,
                               RQF_QUIET);
        if (stat)
                return stat;
@@ -944,8 +945,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
 }
 
 static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
-                               int format, char *buf, int buflen,
-                               struct request_sense *sense)
+                               int format, char *buf, int buflen)
 {
        unsigned char cmd[BLK_MAX_CDB];
 
@@ -962,11 +962,11 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
        if (msf_flag)
                cmd[1] = 2;
 
-       return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET);
+       return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET);
 }
 
 /* Try to read the entire TOC for the disk into our internal buffer. */
-int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
+int ide_cd_read_toc(ide_drive_t *drive)
 {
        int stat, ntracks, i;
        struct cdrom_info *info = drive->driver_data;
@@ -996,14 +996,13 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
         * Check to see if the existing data is still valid. If it is,
         * just return.
         */
-       (void) cdrom_check_status(drive, sense);
+       (void) cdrom_check_status(drive, NULL);
 
        if (drive->atapi_flags & IDE_AFLAG_TOC_VALID)
                return 0;
 
        /* try to get the total cdrom capacity and sector size */
-       stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame,
-                                  sense);
+       stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame);
        if (stat)
                toc->capacity = 0x1fffff;
 
@@ -1016,7 +1015,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
 
        /* first read just the header, so we know how long the TOC is */
        stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
-                                   sizeof(struct atapi_toc_header), sense);
+                                   sizeof(struct atapi_toc_header));
        if (stat)
                return stat;
 
@@ -1036,7 +1035,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
                                  (char *)&toc->hdr,
                                   sizeof(struct atapi_toc_header) +
                                   (ntracks + 1) *
-                                  sizeof(struct atapi_toc_entry), sense);
+                                  sizeof(struct atapi_toc_entry));
 
        if (stat && toc->hdr.first_track > 1) {
                /*
@@ -1056,8 +1055,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
                                           (char *)&toc->hdr,
                                           sizeof(struct atapi_toc_header) +
                                           (ntracks + 1) *
-                                          sizeof(struct atapi_toc_entry),
-                                          sense);
+                                          sizeof(struct atapi_toc_entry));
                if (stat)
                        return stat;
 
@@ -1094,7 +1092,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
        if (toc->hdr.first_track != CDROM_LEADOUT) {
                /* read the multisession information */
                stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp,
-                                          sizeof(ms_tmp), sense);
+                                          sizeof(ms_tmp));
                if (stat)
                        return stat;
 
@@ -1108,7 +1106,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
        if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) {
                /* re-read multisession information using MSF format */
                stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp,
-                                          sizeof(ms_tmp), sense);
+                                          sizeof(ms_tmp));
                if (stat)
                        return stat;
 
@@ -1412,7 +1410,7 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive)
 {
        unsigned long capacity, sectors_per_frame;
 
-       if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame, NULL))
+       if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame))
                return 0;
 
        return capacity * sectors_per_frame;
@@ -1710,9 +1708,8 @@ static unsigned int idecd_check_events(struct gendisk *disk,
 static int idecd_revalidate_disk(struct gendisk *disk)
 {
        struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
-       struct request_sense sense;
 
-       ide_cd_read_toc(info->drive, &sense);
+       ide_cd_read_toc(info->drive);
 
        return  0;
 }
@@ -1736,7 +1733,6 @@ static int ide_cd_probe(ide_drive_t *drive)
 {
        struct cdrom_info *info;
        struct gendisk *g;
-       struct request_sense sense;
 
        ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x",
                                     drive->driver_req, drive->media);
@@ -1785,7 +1781,7 @@ static int ide_cd_probe(ide_drive_t *drive)
                goto failed;
        }
 
-       ide_cd_read_toc(drive, &sense);
+       ide_cd_read_toc(drive);
        g->fops = &idecd_ops;
        g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
        device_add_disk(&drive->gendev, g);
index 04f0f31..a69dc7f 100644 (file)
@@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *);
 
 /* ide-cd.c functions used by ide-cd_ioctl.c */
 int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
-                   unsigned *, struct request_sense *, int, req_flags_t);
-int ide_cd_read_toc(ide_drive_t *, struct request_sense *);
+                   unsigned *, struct scsi_sense_hdr *, int, req_flags_t);
+int ide_cd_read_toc(ide_drive_t *);
 int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
 void ide_cdrom_update_speed(ide_drive_t *, u8 *);
-int cdrom_check_status(ide_drive_t *, struct request_sense *);
+int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *);
 
 /* ide-cd_ioctl.c */
 int ide_cdrom_open_real(struct cdrom_device_info *, int);
index b132240..4a6e1a4 100644 (file)
@@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
 {
        ide_drive_t *drive = cdi->handle;
        struct media_event_desc med;
-       struct request_sense sense;
+       struct scsi_sense_hdr sshdr;
        int stat;
 
        if (slot_nr != CDSL_CURRENT)
                return -EINVAL;
 
-       stat = cdrom_check_status(drive, &sense);
-       if (!stat || sense.sense_key == UNIT_ATTENTION)
+       stat = cdrom_check_status(drive, &sshdr);
+       if (!stat || sshdr.sense_key == UNIT_ATTENTION)
                return CDS_DISC_OK;
 
        if (!cdrom_get_media_event(cdi, &med)) {
@@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
                        return CDS_NO_DISC;
        }
 
-       if (sense.sense_key == NOT_READY && sense.asc == 0x04
-                       && sense.ascq == 0x04)
+       if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04
+                       && sshdr.ascq == 0x04)
                return CDS_DISC_OK;
 
        /*
@@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
         * just return TRAY_OPEN since ATAPI doesn't provide
         * any other way to detect this...
         */
-       if (sense.sense_key == NOT_READY) {
-               if (sense.asc == 0x3a && sense.ascq == 1)
+       if (sshdr.sense_key == NOT_READY) {
+               if (sshdr.asc == 0x3a && sshdr.ascq == 1)
                        return CDS_NO_DISC;
                else
                        return CDS_TRAY_OPEN;
@@ -105,8 +105,7 @@ unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
 /* Eject the disk if EJECTFLAG is 0.
    If EJECTFLAG is 1, try to reload the disk. */
 static
-int cdrom_eject(ide_drive_t *drive, int ejectflag,
-               struct request_sense *sense)
+int cdrom_eject(ide_drive_t *drive, int ejectflag)
 {
        struct cdrom_info *cd = drive->driver_data;
        struct cdrom_device_info *cdi = &cd->devinfo;
@@ -129,20 +128,16 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag,
        cmd[0] = GPCMD_START_STOP_UNIT;
        cmd[4] = loej | (ejectflag != 0);
 
-       return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, 0);
+       return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
 }
 
 /* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */
 static
-int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
-                   struct request_sense *sense)
+int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
 {
-       struct request_sense my_sense;
+       struct scsi_sense_hdr sshdr;
        int stat;
 
-       if (sense == NULL)
-               sense = &my_sense;
-
        /* If the drive cannot lock the door, just pretend. */
        if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
                stat = 0;
@@ -155,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
                cmd[4] = lockflag ? 1 : 0;
 
                stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL,
-                                      sense, 0, 0);
+                                      &sshdr, 0, 0);
        }
 
        /* If we got an illegal field error, the drive
           probably cannot lock the door. */
        if (stat != 0 &&
-           sense->sense_key == ILLEGAL_REQUEST &&
-           (sense->asc == 0x24 || sense->asc == 0x20)) {
+           sshdr.sense_key == ILLEGAL_REQUEST &&
+           (sshdr.asc == 0x24 || sshdr.asc == 0x20)) {
                printk(KERN_ERR "%s: door locking not supported\n",
                        drive->name);
                drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
@@ -170,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
        }
 
        /* no medium, that's alright. */
-       if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a)
+       if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a)
                stat = 0;
 
        if (stat == 0) {
@@ -186,23 +181,22 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
 int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position)
 {
        ide_drive_t *drive = cdi->handle;
-       struct request_sense sense;
 
        if (position) {
-               int stat = ide_cd_lockdoor(drive, 0, &sense);
+               int stat = ide_cd_lockdoor(drive, 0);
 
                if (stat)
                        return stat;
        }
 
-       return cdrom_eject(drive, !position, &sense);
+       return cdrom_eject(drive, !position);
 }
 
 int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock)
 {
        ide_drive_t *drive = cdi->handle;
 
-       return ide_cd_lockdoor(drive, lock, NULL);
+       return ide_cd_lockdoor(drive, lock);
 }
 
 /*
@@ -213,7 +207,6 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
 {
        ide_drive_t *drive = cdi->handle;
        struct cdrom_info *cd = drive->driver_data;
-       struct request_sense sense;
        u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE];
        int stat;
        unsigned char cmd[BLK_MAX_CDB];
@@ -236,7 +229,7 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
                cmd[5] = speed & 0xff;
        }
 
-       stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0);
+       stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
 
        if (!ide_cdrom_get_capabilities(drive, buf)) {
                ide_cdrom_update_speed(drive, buf);
@@ -252,11 +245,10 @@ int ide_cdrom_get_last_session(struct cdrom_device_info *cdi,
        struct atapi_toc *toc;
        ide_drive_t *drive = cdi->handle;
        struct cdrom_info *info = drive->driver_data;
-       struct request_sense sense;
        int ret;
 
        if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) {
-               ret = ide_cd_read_toc(drive, &sense);
+               ret = ide_cd_read_toc(drive);
                if (ret)
                        return ret;
        }
@@ -300,7 +292,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 {
        ide_drive_t *drive = cdi->handle;
        struct cdrom_info *cd = drive->driver_data;
-       struct request_sense sense;
        struct request *rq;
        int ret;
 
@@ -315,7 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
         * lock it again.
         */
        if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED)
-               (void)ide_cd_lockdoor(drive, 1, &sense);
+               (void)ide_cd_lockdoor(drive, 1);
 
        return ret;
 }
@@ -355,7 +346,6 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
        struct atapi_toc_entry *first_toc, *last_toc;
        unsigned long lba_start, lba_end;
        int stat;
-       struct request_sense sense;
        unsigned char cmd[BLK_MAX_CDB];
 
        stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc);
@@ -380,7 +370,7 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
        lba_to_msf(lba_start,   &cmd[3], &cmd[4], &cmd[5]);
        lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]);
 
-       return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0);
+       return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
 }
 
 static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
@@ -391,7 +381,7 @@ static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
        int stat;
 
        /* Make sure our saved TOC is valid. */
-       stat = ide_cd_read_toc(drive, NULL);
+       stat = ide_cd_read_toc(drive);
        if (stat)
                return stat;
 
@@ -461,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
           layer. the packet must be complete, as we do not
           touch it at all. */
 
-       if (cgc->sense)
-               memset(cgc->sense, 0, sizeof(struct request_sense));
+       if (cgc->sshdr)
+               memset(cgc->sshdr, 0, sizeof(*cgc->sshdr));
 
        if (cgc->quiet)
                flags |= RQF_QUIET;
@@ -470,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
        cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
                                    cgc->data_direction == CGC_DATA_WRITE,
                                    cgc->buffer, &len,
-                                   cgc->sense, cgc->timeout, flags);
+                                   cgc->sshdr, cgc->timeout, flags);
        if (!cgc->stat)
                cgc->buflen -= len;
        return cgc->stat;
index ca844a9..130bf16 100644 (file)
@@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
 {
        domain->sig_type = IB_SIG_TYPE_T10_DIF;
        domain->sig.dif.pi_interval = scsi_prot_interval(sc);
-       domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
+       domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
        /*
         * At the moment we hard code those, but in the future
         * we will take them from sc.
index 9c03f35..439bf90 100644 (file)
@@ -17,23 +17,25 @@ menuconfig NVM
 
 if NVM
 
-config NVM_DEBUG
-       bool "Open-Channel SSD debugging support"
-       default n
-       ---help---
-       Exposes a debug management interface to create/remove targets at:
+config NVM_PBLK
+       tristate "Physical Block Device Open-Channel SSD target"
+       help
+         Allows an open-channel SSD to be exposed as a block device to the
+         host. The target assumes the device exposes raw flash and must be
+         explicitly managed by the host.
 
-         /sys/module/lnvm/parameters/configure_debug
+         Please note the disk format is considered EXPERIMENTAL for now.
 
-       It is required to create/remove targets without IOCTLs.
+if NVM_PBLK
 
-config NVM_PBLK
-       tristate "Physical Block Device Open-Channel SSD target"
-       ---help---
-       Allows an open-channel SSD to be exposed as a block device to the
-       host. The target assumes the device exposes raw flash and must be
-       explicitly managed by the host.
+config NVM_PBLK_DEBUG
+       bool "PBlk Debug Support"
+       default n
+       help
+         Enables debug support for pblk. This includes extra checks, more
+         vocal error messages, and extra tracking fields in the pblk sysfs
+         entries.
 
-       Please note the disk format is considered EXPERIMENTAL for now.
+endif # NVM_PBLK
 
 endif # NVM
index b1c6d7e..f565a56 100644 (file)
@@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
        int nr_entries = pblk_get_secs(bio);
        int i, ret;
 
-       generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
+       generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
+                             &pblk->disk->part0);
 
        /* Update the write buffer head (mem) with the entries that we can
         * write. The write in itself cannot fail, so there is no need to
@@ -67,7 +68,7 @@ retry:
 
        atomic64_add(nr_entries, &pblk->user_wa);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(nr_entries, &pblk->inflight_writes);
        atomic_long_add(nr_entries, &pblk->req_writes);
 #endif
@@ -75,7 +76,7 @@ retry:
        pblk_rl_inserted(&pblk->rl, nr_entries);
 
 out:
-       generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
+       generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
        pblk_write_should_kick(pblk);
        return ret;
 }
@@ -123,7 +124,7 @@ retry:
 
        atomic64_add(valid_entries, &pblk->gc_wa);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(valid_entries, &pblk->inflight_writes);
        atomic_long_add(valid_entries, &pblk->recov_gc_writes);
 #endif
index ed9cc97..00984b4 100644 (file)
@@ -35,7 +35,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
                line = &pblk->lines[pblk_ppa_to_line(*ppa)];
                pos = pblk_ppa_to_pos(&dev->geo, *ppa);
 
-               pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+               pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n",
                                line->id, pos);
        }
 
@@ -51,12 +51,12 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
        struct ppa_addr *ppa;
        int pos = pblk_ppa_to_pos(geo, ppa_addr);
 
-       pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
+       pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos);
        atomic_long_inc(&pblk->erase_failed);
 
        atomic_dec(&line->blk_in_line);
        if (test_and_set_bit(pos, line->blk_bitmap))
-               pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
+               pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n",
                                                        line->id, pos);
 
        /* Not necessary to mark bad blocks on 2.0 spec. */
@@ -194,7 +194,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
        u64 paddr;
        int line_id;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Callers must ensure that the ppa points to a device address */
        BUG_ON(pblk_addr_in_cache(ppa));
        BUG_ON(pblk_ppa_empty(ppa));
@@ -264,6 +264,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
        switch (type) {
        case PBLK_WRITE:
                kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
+               /* fall through */
        case PBLK_WRITE_INT:
                pool = &pblk->w_rq_pool;
                break;
@@ -274,7 +275,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
                pool = &pblk->e_rq_pool;
                break;
        default:
-               pr_err("pblk: trying to free unknown rqd type\n");
+               pblk_err(pblk, "trying to free unknown rqd type\n");
                return;
        }
 
@@ -310,7 +311,7 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
 
                ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
                if (ret != PBLK_EXPOSED_PAGE_SIZE) {
-                       pr_err("pblk: could not add page to bio\n");
+                       pblk_err(pblk, "could not add page to bio\n");
                        mempool_free(page, &pblk->page_bio_pool);
                        goto err;
                }
@@ -410,7 +411,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
                line->state = PBLK_LINESTATE_CORRUPT;
                line->gc_group = PBLK_LINEGC_NONE;
                move_list =  &l_mg->corrupt_list;
-               pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+               pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
                                                line->id, vsc,
                                                line->sec_in_line,
                                                lm->high_thrs, lm->mid_thrs);
@@ -430,7 +431,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio)
 void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
 {
        atomic_long_inc(&pblk->write_failed);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        pblk_print_failed_rqd(pblk, rqd, rqd->error);
 #endif
 }
@@ -452,9 +453,9 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
                atomic_long_inc(&pblk->read_failed);
                break;
        default:
-               pr_err("pblk: unknown read error:%d\n", rqd->error);
+               pblk_err(pblk, "unknown read error:%d\n", rqd->error);
        }
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        pblk_print_failed_rqd(pblk, rqd, rqd->error);
 #endif
 }
@@ -470,7 +471,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
 
        atomic_inc(&pblk->inflight_io);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        if (pblk_check_io(pblk, rqd))
                return NVM_IO_ERR;
 #endif
@@ -484,7 +485,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
 
        atomic_inc(&pblk->inflight_io);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        if (pblk_check_io(pblk, rqd))
                return NVM_IO_ERR;
 #endif
@@ -517,7 +518,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
        for (i = 0; i < nr_secs; i++) {
                page = vmalloc_to_page(kaddr);
                if (!page) {
-                       pr_err("pblk: could not map vmalloc bio\n");
+                       pblk_err(pblk, "could not map vmalloc bio\n");
                        bio_put(bio);
                        bio = ERR_PTR(-ENOMEM);
                        goto out;
@@ -525,7 +526,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
 
                ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
                if (ret != PAGE_SIZE) {
-                       pr_err("pblk: could not add page to bio\n");
+                       pblk_err(pblk, "could not add page to bio\n");
                        bio_put(bio);
                        bio = ERR_PTR(-ENOMEM);
                        goto out;
@@ -711,7 +712,7 @@ next_rq:
                        while (test_bit(pos, line->blk_bitmap)) {
                                paddr += min;
                                if (pblk_boundary_paddr_checks(pblk, paddr)) {
-                                       pr_err("pblk: corrupt emeta line:%d\n",
+                                       pblk_err(pblk, "corrupt emeta line:%d\n",
                                                                line->id);
                                        bio_put(bio);
                                        ret = -EINTR;
@@ -723,7 +724,7 @@ next_rq:
                        }
 
                        if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
-                               pr_err("pblk: corrupt emeta line:%d\n",
+                               pblk_err(pblk, "corrupt emeta line:%d\n",
                                                                line->id);
                                bio_put(bio);
                                ret = -EINTR;
@@ -738,7 +739,7 @@ next_rq:
 
        ret = pblk_submit_io_sync(pblk, &rqd);
        if (ret) {
-               pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
                bio_put(bio);
                goto free_rqd_dma;
        }
@@ -843,7 +844,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
         */
        ret = pblk_submit_io_sync(pblk, &rqd);
        if (ret) {
-               pr_err("pblk: smeta I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
                bio_put(bio);
                goto free_ppa_list;
        }
@@ -905,7 +906,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
                struct nvm_tgt_dev *dev = pblk->dev;
                struct nvm_geo *geo = &dev->geo;
 
-               pr_err("pblk: could not sync erase line:%d,blk:%d\n",
+               pblk_err(pblk, "could not sync erase line:%d,blk:%d\n",
                                        pblk_ppa_to_line(ppa),
                                        pblk_ppa_to_pos(geo, ppa));
 
@@ -945,7 +946,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
 
                ret = pblk_blk_erase_sync(pblk, ppa);
                if (ret) {
-                       pr_err("pblk: failed to erase line %d\n", line->id);
+                       pblk_err(pblk, "failed to erase line %d\n", line->id);
                        return ret;
                }
        } while (1);
@@ -1012,7 +1013,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
                list_add_tail(&line->list, &l_mg->bad_list);
                spin_unlock(&l_mg->free_lock);
 
-               pr_debug("pblk: line %d is bad\n", line->id);
+               pblk_debug(pblk, "line %d is bad\n", line->id);
 
                return 0;
        }
@@ -1122,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
        line->cur_sec = off + lm->smeta_sec;
 
        if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
-               pr_debug("pblk: line smeta I/O failed. Retry\n");
+               pblk_debug(pblk, "line smeta I/O failed. Retry\n");
                return 0;
        }
 
@@ -1154,7 +1155,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
                spin_unlock(&line->lock);
 
                list_add_tail(&line->list, &l_mg->bad_list);
-               pr_err("pblk: unexpected line %d is bad\n", line->id);
+               pblk_err(pblk, "unexpected line %d is bad\n", line->id);
 
                return 0;
        }
@@ -1299,7 +1300,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
 
 retry:
        if (list_empty(&l_mg->free_list)) {
-               pr_err("pblk: no free lines\n");
+               pblk_err(pblk, "no free lines\n");
                return NULL;
        }
 
@@ -1315,7 +1316,7 @@ retry:
 
                list_add_tail(&line->list, &l_mg->bad_list);
 
-               pr_debug("pblk: line %d is bad\n", line->id);
+               pblk_debug(pblk, "line %d is bad\n", line->id);
                goto retry;
        }
 
@@ -1329,7 +1330,7 @@ retry:
                        list_add(&line->list, &l_mg->corrupt_list);
                        goto retry;
                default:
-                       pr_err("pblk: failed to prepare line %d\n", line->id);
+                       pblk_err(pblk, "failed to prepare line %d\n", line->id);
                        list_add(&line->list, &l_mg->free_list);
                        l_mg->nr_free_lines++;
                        return NULL;
@@ -1477,7 +1478,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
 
                        ret = pblk_submit_meta_io(pblk, line);
                        if (ret) {
-                               pr_err("pblk: sync meta line %d failed (%d)\n",
+                               pblk_err(pblk, "sync meta line %d failed (%d)\n",
                                                        line->id, ret);
                                return;
                        }
@@ -1507,7 +1508,7 @@ void __pblk_pipeline_flush(struct pblk *pblk)
 
        ret = pblk_recov_pad(pblk);
        if (ret) {
-               pr_err("pblk: could not close data on teardown(%d)\n", ret);
+               pblk_err(pblk, "could not close data on teardown(%d)\n", ret);
                return;
        }
 
@@ -1687,7 +1688,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
                struct nvm_tgt_dev *dev = pblk->dev;
                struct nvm_geo *geo = &dev->geo;
 
-               pr_err("pblk: could not async erase line:%d,blk:%d\n",
+               pblk_err(pblk, "could not async erase line:%d,blk:%d\n",
                                        pblk_ppa_to_line(ppa),
                                        pblk_ppa_to_pos(geo, ppa));
        }
@@ -1726,7 +1727,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
        struct list_head *move_list;
        int i;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
                                "pblk: corrupt closed line %d\n", line->id);
 #endif
@@ -1856,7 +1857,7 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
         * Only send one inflight I/O per LUN. Since we map at a page
         * granurality, all ppas in the I/O will map to the same LUN
         */
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        int i;
 
        for (i = 1; i < nr_ppas; i++)
@@ -1866,7 +1867,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
 
        ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
        if (ret == -ETIME || ret == -EINTR)
-               pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret);
+               pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
+                               -ret);
 }
 
 void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
@@ -1901,7 +1903,7 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
        struct pblk_lun *rlun;
        int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        int i;
 
        for (i = 1; i < nr_ppas; i++)
@@ -1951,7 +1953,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
 void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
 {
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Callers must ensure that the ppa points to a cache address */
        BUG_ON(!pblk_addr_in_cache(ppa));
        BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
@@ -1966,7 +1968,7 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
        struct ppa_addr ppa_l2p, ppa_gc;
        int ret = 1;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Callers must ensure that the ppa points to a cache address */
        BUG_ON(!pblk_addr_in_cache(ppa_new));
        BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
@@ -2003,14 +2005,14 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
 {
        struct ppa_addr ppa_l2p;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Callers must ensure that the ppa points to a device address */
        BUG_ON(pblk_addr_in_cache(ppa_mapped));
 #endif
        /* Invalidate and discard padded entries */
        if (lba == ADDR_EMPTY) {
                atomic64_inc(&pblk->pad_wa);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                atomic_long_inc(&pblk->padded_wb);
 #endif
                if (!pblk_ppa_empty(ppa_mapped))
@@ -2036,7 +2038,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
                goto out;
        }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
 #endif
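
The hunks above, and most of those that follow, make a mechanical conversion: pblk's bare pr_err()/pr_warn()/pr_info()/pr_debug() calls become the instance-aware pblk_err()/pblk_warn()/pblk_info()/pblk_debug() helpers, and the CONFIG_NVM_DEBUG guard is renamed to CONFIG_NVM_PBLK_DEBUG. The helper definitions themselves are not part of this excerpt; assuming they live in drivers/lightnvm/pblk.h and do nothing more than prefix the target's disk name, they would look roughly like this:

        /* hedged sketch -- the real definitions live in pblk.h and may differ */
        #define pblk_err(pblk, fmt, ...) \
                pr_err("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
        #define pblk_warn(pblk, fmt, ...) \
                pr_warn("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
        #define pblk_info(pblk, fmt, ...) \
                pr_info("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
        #define pblk_debug(pblk, fmt, ...) \
                pr_debug("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)

With helpers of that shape, a call such as pblk_err(pblk, "no free lines\n") would land in the log as something like "pblk mydisk: no free lines", where "mydisk" stands in for whatever the target is actually named, so setups running several pblk instances can tell the messages apart. This is also why most call sites in these hunks now need the pblk pointer in scope.
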
 
index 080469d..157c256 100644
@@ -90,7 +90,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
 
        gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
        if (!gc_rq->data) {
-               pr_err("pblk: could not GC line:%d (%d/%d)\n",
+               pblk_err(pblk, "could not GC line:%d (%d/%d)\n",
                                        line->id, *line->vsc, gc_rq->nr_secs);
                goto out;
        }
@@ -98,7 +98,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
        /* Read from GC victim block */
        ret = pblk_submit_read_gc(pblk, gc_rq);
        if (ret) {
-               pr_err("pblk: failed GC read in line:%d (err:%d)\n",
+               pblk_err(pblk, "failed GC read in line:%d (err:%d)\n",
                                                                line->id, ret);
                goto out;
        }
@@ -146,7 +146,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
 
        ret = pblk_line_read_emeta(pblk, line, emeta_buf);
        if (ret) {
-               pr_err("pblk: line %d read emeta failed (%d)\n",
+               pblk_err(pblk, "line %d read emeta failed (%d)\n",
                                line->id, ret);
                pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
                return NULL;
@@ -160,7 +160,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
 
        ret = pblk_recov_check_emeta(pblk, emeta_buf);
        if (ret) {
-               pr_err("pblk: inconsistent emeta (line %d)\n",
+               pblk_err(pblk, "inconsistent emeta (line %d)\n",
                                line->id);
                pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
                return NULL;
@@ -201,7 +201,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
        } else {
                lba_list = get_lba_list_from_emeta(pblk, line);
                if (!lba_list) {
-                       pr_err("pblk: could not interpret emeta (line %d)\n",
+                       pblk_err(pblk, "could not interpret emeta (line %d)\n",
                                        line->id);
                        goto fail_free_invalid_bitmap;
                }
@@ -213,7 +213,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
        spin_unlock(&line->lock);
 
        if (sec_left < 0) {
-               pr_err("pblk: corrupted GC line (%d)\n", line->id);
+               pblk_err(pblk, "corrupted GC line (%d)\n", line->id);
                goto fail_free_lba_list;
        }
 
@@ -289,7 +289,7 @@ fail_free_ws:
        kref_put(&line->ref, pblk_line_put);
        atomic_dec(&gc->read_inflight_gc);
 
-       pr_err("pblk: Failed to GC line %d\n", line->id);
+       pblk_err(pblk, "failed to GC line %d\n", line->id);
 }
 
 static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
@@ -297,7 +297,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
        struct pblk_gc *gc = &pblk->gc;
        struct pblk_line_ws *line_ws;
 
-       pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
+       pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id);
 
        line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
        if (!line_ws)
@@ -351,7 +351,7 @@ static int pblk_gc_read(struct pblk *pblk)
        pblk_gc_kick(pblk);
 
        if (pblk_gc_line(pblk, line))
-               pr_err("pblk: failed to GC line %d\n", line->id);
+               pblk_err(pblk, "failed to GC line %d\n", line->id);
 
        return 0;
 }
@@ -522,8 +522,8 @@ static int pblk_gc_reader_ts(void *data)
                io_schedule();
        }
 
-#ifdef CONFIG_NVM_DEBUG
-       pr_info("pblk: flushing gc pipeline, %d lines left\n",
+#ifdef CONFIG_NVM_PBLK_DEBUG
+       pblk_info(pblk, "flushing gc pipeline, %d lines left\n",
                atomic_read(&gc->pipeline_gc));
 #endif
 
@@ -540,7 +540,7 @@ static int pblk_gc_reader_ts(void *data)
 static void pblk_gc_start(struct pblk *pblk)
 {
        pblk->gc.gc_active = 1;
-       pr_debug("pblk: gc start\n");
+       pblk_debug(pblk, "gc start\n");
 }
 
 void pblk_gc_should_start(struct pblk *pblk)
@@ -605,14 +605,14 @@ int pblk_gc_init(struct pblk *pblk)
 
        gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
        if (IS_ERR(gc->gc_ts)) {
-               pr_err("pblk: could not allocate GC main kthread\n");
+               pblk_err(pblk, "could not allocate GC main kthread\n");
                return PTR_ERR(gc->gc_ts);
        }
 
        gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
                                                        "pblk-gc-writer-ts");
        if (IS_ERR(gc->gc_writer_ts)) {
-               pr_err("pblk: could not allocate GC writer kthread\n");
+               pblk_err(pblk, "could not allocate GC writer kthread\n");
                ret = PTR_ERR(gc->gc_writer_ts);
                goto fail_free_main_kthread;
        }
@@ -620,7 +620,7 @@ int pblk_gc_init(struct pblk *pblk)
        gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
                                                        "pblk-gc-reader-ts");
        if (IS_ERR(gc->gc_reader_ts)) {
-               pr_err("pblk: could not allocate GC reader kthread\n");
+               pblk_err(pblk, "could not allocate GC reader kthread\n");
                ret = PTR_ERR(gc->gc_reader_ts);
                goto fail_free_writer_kthread;
        }
@@ -641,7 +641,7 @@ int pblk_gc_init(struct pblk *pblk)
        gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
                        WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
        if (!gc->gc_line_reader_wq) {
-               pr_err("pblk: could not allocate GC line reader workqueue\n");
+               pblk_err(pblk, "could not allocate GC line reader workqueue\n");
                ret = -ENOMEM;
                goto fail_free_reader_kthread;
        }
@@ -650,7 +650,7 @@ int pblk_gc_init(struct pblk *pblk)
        gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
                                        WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
        if (!gc->gc_reader_wq) {
-               pr_err("pblk: could not allocate GC reader workqueue\n");
+               pblk_err(pblk, "could not allocate GC reader workqueue\n");
                ret = -ENOMEM;
                goto fail_free_reader_line_wq;
        }
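
The garbage-collector hunks (apparently pblk-gc.c, going by the function names) follow the same pattern; the only behavioural nuance is that the "flushing gc pipeline" message is both switched to pblk_info() and gated on the renamed CONFIG_NVM_PBLK_DEBUG, so non-debug builds still drop it. The surrounding pblk_gc_init() code also shows the usual kthread error-handling idiom three times over; for readers less familiar with it, kthread_create() never returns NULL -- it returns either a valid task pointer or an encoded errno -- so a generic caller checks it like this (sketch only; worker_fn and data are placeholders, not names from this patch):

        struct task_struct *ts;

        ts = kthread_create(worker_fn, data, "example-ts");
        if (IS_ERR(ts))
                return PTR_ERR(ts);     /* negative errno, e.g. -ENOMEM */
        wake_up_process(ts);            /* kthread_create() leaves the thread stopped */
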
index b57f764..537e98f 100644
@@ -91,7 +91,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk)
        return entry_size * pblk->rl.nr_secs;
 }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
 static u32 pblk_l2p_crc(struct pblk *pblk)
 {
        size_t map_size;
@@ -117,13 +117,13 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
        } else {
                line = pblk_recov_l2p(pblk);
                if (IS_ERR(line)) {
-                       pr_err("pblk: could not recover l2p table\n");
+                       pblk_err(pblk, "could not recover l2p table\n");
                        return -EFAULT;
                }
        }
 
-#ifdef CONFIG_NVM_DEBUG
-       pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#ifdef CONFIG_NVM_PBLK_DEBUG
+       pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
 #endif
 
        /* Free full lines directly as GC has not been started yet */
@@ -166,7 +166,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
 static void pblk_rwb_free(struct pblk *pblk)
 {
        if (pblk_rb_tear_down_check(&pblk->rwb))
-               pr_err("pblk: write buffer error on tear down\n");
+               pblk_err(pblk, "write buffer error on tear down\n");
 
        pblk_rb_data_free(&pblk->rwb);
        vfree(pblk_rb_entries_ref(&pblk->rwb));
@@ -179,11 +179,14 @@ static int pblk_rwb_init(struct pblk *pblk)
        struct pblk_rb_entry *entries;
        unsigned long nr_entries, buffer_size;
        unsigned int power_size, power_seg_sz;
+       int pgs_in_buffer;
 
-       if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer))
+       pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns;
+
+       if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
                buffer_size = write_buffer_size;
        else
-               buffer_size = pblk->pgs_in_buffer;
+               buffer_size = pgs_in_buffer;
 
        nr_entries = pblk_rb_calculate_size(buffer_size);
 
@@ -200,7 +203,8 @@ static int pblk_rwb_init(struct pblk *pblk)
 /* Minimum pages needed within a lun */
 #define ADDR_POOL_SIZE 64
 
-static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
+static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
+                            struct nvm_addrf_12 *dst)
 {
        struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
        int power_len;
@@ -208,14 +212,14 @@ static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
        /* Re-calculate channel and lun format to adapt to configuration */
        power_len = get_count_order(geo->num_ch);
        if (1 << power_len != geo->num_ch) {
-               pr_err("pblk: supports only power-of-two channel config.\n");
+               pblk_err(pblk, "supports only power-of-two channel config.\n");
                return -EINVAL;
        }
        dst->ch_len = power_len;
 
        power_len = get_count_order(geo->num_lun);
        if (1 << power_len != geo->num_lun) {
-               pr_err("pblk: supports only power-of-two LUN config.\n");
+               pblk_err(pblk, "supports only power-of-two LUN config.\n");
                return -EINVAL;
        }
        dst->lun_len = power_len;
@@ -282,18 +286,19 @@ static int pblk_set_addrf(struct pblk *pblk)
        case NVM_OCSSD_SPEC_12:
                div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
                if (mod) {
-                       pr_err("pblk: bad configuration of sectors/pages\n");
+                       pblk_err(pblk, "bad configuration of sectors/pages\n");
                        return -EINVAL;
                }
 
-               pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf);
+               pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
+                                                       (void *)&pblk->addrf);
                break;
        case NVM_OCSSD_SPEC_20:
                pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
-                                                               &pblk->uaddrf);
+                                                       &pblk->uaddrf);
                break;
        default:
-               pr_err("pblk: OCSSD revision not supported (%d)\n",
+               pblk_err(pblk, "OCSSD revision not supported (%d)\n",
                                                                geo->version);
                return -EINVAL;
        }
@@ -366,15 +371,13 @@ static int pblk_core_init(struct pblk *pblk)
        atomic64_set(&pblk->nr_flush, 0);
        pblk->nr_flush_rst = 0;
 
-       pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns;
-
        pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
        max_write_ppas = pblk->min_write_pgs * geo->all_luns;
        pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
        pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
 
        if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
-               pr_err("pblk: vector list too big(%u > %u)\n",
+               pblk_err(pblk, "vector list too big(%u > %u)\n",
                                pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
                return -EINVAL;
        }
@@ -607,7 +610,7 @@ static int pblk_luns_init(struct pblk *pblk)
 
        /* TODO: Implement unbalanced LUN support */
        if (geo->num_lun < 0) {
-               pr_err("pblk: unbalanced LUN config.\n");
+               pblk_err(pblk, "unbalanced LUN config.\n");
                return -EINVAL;
        }
 
@@ -716,10 +719,11 @@ static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
 
                /*
                 * In 1.2 spec. chunk state is not persisted by the device. Thus
-                * some of the values are reset each time pblk is instantiated.
+                * some of the values are reset each time pblk is instantiated,
+                * so we have to assume that the block is closed.
                 */
                if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
-                       chunk->state =  NVM_CHK_ST_FREE;
+                       chunk->state =  NVM_CHK_ST_CLOSED;
                else
                        chunk->state = NVM_CHK_ST_OFFLINE;
 
@@ -1026,7 +1030,7 @@ add_emeta_page:
                                        lm->emeta_sec[0], geo->clba);
 
        if (lm->min_blk_line > lm->blk_per_line) {
-               pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
+               pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
                                                        lm->blk_per_line);
                return -EINVAL;
        }
@@ -1078,7 +1082,7 @@ static int pblk_lines_init(struct pblk *pblk)
        }
 
        if (!nr_free_chks) {
-               pr_err("pblk: too many bad blocks prevent for sane instance\n");
+               pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
                return -EINTR;
        }
 
@@ -1108,7 +1112,7 @@ static int pblk_writer_init(struct pblk *pblk)
                int err = PTR_ERR(pblk->writer_ts);
 
                if (err != -EINTR)
-                       pr_err("pblk: could not allocate writer kthread (%d)\n",
+                       pblk_err(pblk, "could not allocate writer kthread (%d)\n",
                                        err);
                return err;
        }
@@ -1154,7 +1158,7 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful)
        pblk_rb_sync_l2p(&pblk->rwb);
        pblk_rl_free(&pblk->rl);
 
-       pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful);
+       pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
 }
 
 static void pblk_exit(void *private, bool graceful)
@@ -1165,8 +1169,8 @@ static void pblk_exit(void *private, bool graceful)
        pblk_gc_exit(pblk, graceful);
        pblk_tear_down(pblk, graceful);
 
-#ifdef CONFIG_NVM_DEBUG
-       pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#ifdef CONFIG_NVM_PBLK_DEBUG
+       pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
 #endif
 
        pblk_free(pblk);
@@ -1189,34 +1193,35 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
        struct pblk *pblk;
        int ret;
 
-       /* pblk supports 1.2 and 2.0 versions */
+       pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+       if (!pblk)
+               return ERR_PTR(-ENOMEM);
+
+       pblk->dev = dev;
+       pblk->disk = tdisk;
+       pblk->state = PBLK_STATE_RUNNING;
+       pblk->gc.gc_enabled = 0;
+
        if (!(geo->version == NVM_OCSSD_SPEC_12 ||
                                        geo->version == NVM_OCSSD_SPEC_20)) {
-               pr_err("pblk: OCSSD version not supported (%u)\n",
+               pblk_err(pblk, "OCSSD version not supported (%u)\n",
                                                        geo->version);
+               kfree(pblk);
                return ERR_PTR(-EINVAL);
        }
 
        if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
-               pr_err("pblk: host-side L2P table not supported. (%x)\n",
+               pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
                                                        geo->dom);
+               kfree(pblk);
                return ERR_PTR(-EINVAL);
        }
 
-       pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
-       if (!pblk)
-               return ERR_PTR(-ENOMEM);
-
-       pblk->dev = dev;
-       pblk->disk = tdisk;
-       pblk->state = PBLK_STATE_RUNNING;
-       pblk->gc.gc_enabled = 0;
-
        spin_lock_init(&pblk->resubmit_lock);
        spin_lock_init(&pblk->trans_lock);
        spin_lock_init(&pblk->lock);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_set(&pblk->inflight_writes, 0);
        atomic_long_set(&pblk->padded_writes, 0);
        atomic_long_set(&pblk->padded_wb, 0);
@@ -1241,38 +1246,38 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 
        ret = pblk_core_init(pblk);
        if (ret) {
-               pr_err("pblk: could not initialize core\n");
+               pblk_err(pblk, "could not initialize core\n");
                goto fail;
        }
 
        ret = pblk_lines_init(pblk);
        if (ret) {
-               pr_err("pblk: could not initialize lines\n");
+               pblk_err(pblk, "could not initialize lines\n");
                goto fail_free_core;
        }
 
        ret = pblk_rwb_init(pblk);
        if (ret) {
-               pr_err("pblk: could not initialize write buffer\n");
+               pblk_err(pblk, "could not initialize write buffer\n");
                goto fail_free_lines;
        }
 
        ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
        if (ret) {
-               pr_err("pblk: could not initialize maps\n");
+               pblk_err(pblk, "could not initialize maps\n");
                goto fail_free_rwb;
        }
 
        ret = pblk_writer_init(pblk);
        if (ret) {
                if (ret != -EINTR)
-                       pr_err("pblk: could not initialize write thread\n");
+                       pblk_err(pblk, "could not initialize write thread\n");
                goto fail_free_l2p;
        }
 
        ret = pblk_gc_init(pblk);
        if (ret) {
-               pr_err("pblk: could not initialize gc\n");
+               pblk_err(pblk, "could not initialize gc\n");
                goto fail_stop_writer;
        }
 
@@ -1287,8 +1292,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
        blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
        blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);
 
-       pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
-                       tdisk->disk_name,
+       pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
                        geo->all_luns, pblk->l_mg.nr_lines,
                        (unsigned long long)pblk->rl.nr_secs,
                        pblk->rwb.nr_entries);
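
Besides the print conversion, the initialization hunks above (apparently pblk-init.c) carry three functional changes. The write-buffer size is no longer cached in pblk->pgs_in_buffer but computed locally in pblk_rwb_init(), and the formula becomes max(mw_cunits, ws_opt) * all_luns rather than mw_cunits * all_luns. For 1.2-spec devices, blocks reported free by the bad-block table are now assumed to be NVM_CHK_ST_CLOSED instead of NVM_CHK_ST_FREE since, as the updated comment says, chunk state is not persisted by those devices. And the pblk structure is now allocated before the geometry and version checks so the new print helpers have a valid instance to name, which is why both early error paths gain a kfree(pblk). A worked illustration of the new buffer sizing, with invented geometry numbers purely to show the arithmetic:

        int pgs_in_buffer;

        /* hypothetical geometry: mw_cunits = 8, ws_opt = 4, all_luns = 64 */
        pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns;
        /* -> max(8, 4) * 64 = 512 pages buffered before data goes to media */
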
index 55e9442..f6eec02 100644
@@ -111,7 +111,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
        } while (iter > 0);
        up_write(&pblk_rb_lock);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_set(&rb->inflight_flush_point, 0);
 #endif
 
@@ -308,7 +308,7 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
 
        entry = &rb->entries[ring_pos];
        flags = READ_ONCE(entry->w_ctx.flags);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Caller must guarantee that the entry is free */
        BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
 #endif
@@ -332,7 +332,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
 
        entry = &rb->entries[ring_pos];
        flags = READ_ONCE(entry->w_ctx.flags);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Caller must guarantee that the entry is free */
        BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
 #endif
@@ -362,7 +362,7 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
                return 0;
        }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_inc(&rb->inflight_flush_point);
 #endif
 
@@ -547,7 +547,7 @@ try:
 
                page = virt_to_page(entry->data);
                if (!page) {
-                       pr_err("pblk: could not allocate write bio page\n");
+                       pblk_err(pblk, "could not allocate write bio page\n");
                        flags &= ~PBLK_WRITTEN_DATA;
                        flags |= PBLK_SUBMITTED_ENTRY;
                        /* Release flags on context. Protect from writes */
@@ -557,7 +557,7 @@ try:
 
                if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) !=
                                                                rb->seg_size) {
-                       pr_err("pblk: could not add page to write bio\n");
+                       pblk_err(pblk, "could not add page to write bio\n");
                        flags &= ~PBLK_WRITTEN_DATA;
                        flags |= PBLK_SUBMITTED_ENTRY;
                        /* Release flags on context. Protect from writes */
@@ -576,19 +576,19 @@ try:
 
        if (pad) {
                if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) {
-                       pr_err("pblk: could not pad page in write bio\n");
+                       pblk_err(pblk, "could not pad page in write bio\n");
                        return NVM_IO_ERR;
                }
 
                if (pad < pblk->min_write_pgs)
                        atomic64_inc(&pblk->pad_dist[pad - 1]);
                else
-                       pr_warn("pblk: padding more than min. sectors\n");
+                       pblk_warn(pblk, "padding more than min. sectors\n");
 
                atomic64_add(pad, &pblk->pad_wa);
        }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(pad, &pblk->padded_writes);
 #endif
 
@@ -613,7 +613,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
        int ret = 1;
 
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Caller must ensure that the access will not cause an overflow */
        BUG_ON(pos >= rb->nr_entries);
 #endif
@@ -820,7 +820,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
                        rb->subm,
                        rb->sync,
                        rb->l2p_update,
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                        atomic_read(&rb->inflight_flush_point),
 #else
                        0,
@@ -838,7 +838,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
                        rb->subm,
                        rb->sync,
                        rb->l2p_update,
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                        atomic_read(&rb->inflight_flush_point),
 #else
                        0,
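
Both pblk_rb_sysfs() hunks above make the same point: the inflight flush-point counter only exists on CONFIG_NVM_PBLK_DEBUG builds, and non-debug builds print a literal 0 in its place, so the column layout of the sysfs output stays identical across configurations.
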
index 1869469..5a46d7f 100644
@@ -28,7 +28,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
                                sector_t lba, struct ppa_addr ppa,
                                int bio_iter, bool advanced_bio)
 {
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Callers must ensure that the ppa points to a cache address */
        BUG_ON(pblk_ppa_empty(ppa));
        BUG_ON(!pblk_addr_in_cache(ppa));
@@ -79,7 +79,7 @@ retry:
                        WARN_ON(test_and_set_bit(i, read_bitmap));
                        meta_list[i].lba = cpu_to_le64(lba);
                        advanced_bio = true;
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                        atomic_long_inc(&pblk->cache_reads);
 #endif
                } else {
@@ -97,7 +97,7 @@ next:
        else
                rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(nr_secs, &pblk->inflight_reads);
 #endif
 }
@@ -117,13 +117,13 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
                        continue;
 
                if (lba != blba + i) {
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                        struct ppa_addr *p;
 
                        p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
-                       print_ppa(&pblk->dev->geo, p, "seq", i);
+                       print_ppa(pblk, p, "seq", i);
 #endif
-                       pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+                       pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
                                                        lba, (u64)blba + i);
                        WARN_ON(1);
                }
@@ -149,14 +149,14 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
                meta_lba = le64_to_cpu(meta_lba_list[j].lba);
 
                if (lba != meta_lba) {
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                        struct ppa_addr *p;
                        int nr_ppas = rqd->nr_ppas;
 
                        p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
-                       print_ppa(&pblk->dev->geo, p, "seq", j);
+                       print_ppa(pblk, p, "seq", j);
 #endif
-                       pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+                       pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
                                                                lba, meta_lba);
                        WARN_ON(1);
                }
@@ -185,7 +185,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
 
 static void pblk_end_user_read(struct bio *bio)
 {
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
 #endif
        bio_endio(bio);
@@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
        struct bio *int_bio = rqd->bio;
        unsigned long start_time = r_ctx->start_time;
 
-       generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
+       generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
 
        if (rqd->error)
                pblk_log_read_err(pblk, rqd);
@@ -212,7 +212,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
        if (put_line)
                pblk_read_put_rqd_kref(pblk, rqd);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
        atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
 #endif
@@ -231,74 +231,36 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
        __pblk_end_io_read(pblk, rqd, true);
 }
 
-static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
-                            struct bio *orig_bio, unsigned int bio_init_idx,
-                            unsigned long *read_bitmap)
+static void pblk_end_partial_read(struct nvm_rq *rqd)
 {
-       struct pblk_sec_meta *meta_list = rqd->meta_list;
-       struct bio *new_bio;
+       struct pblk *pblk = rqd->private;
+       struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+       struct pblk_pr_ctx *pr_ctx = r_ctx->private;
+       struct bio *new_bio = rqd->bio;
+       struct bio *bio = pr_ctx->orig_bio;
        struct bio_vec src_bv, dst_bv;
-       void *ppa_ptr = NULL;
-       void *src_p, *dst_p;
-       dma_addr_t dma_ppa_list = 0;
-       __le64 *lba_list_mem, *lba_list_media;
-       int nr_secs = rqd->nr_ppas;
+       struct pblk_sec_meta *meta_list = rqd->meta_list;
+       int bio_init_idx = pr_ctx->bio_init_idx;
+       unsigned long *read_bitmap = pr_ctx->bitmap;
+       int nr_secs = pr_ctx->orig_nr_secs;
        int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
-       int i, ret, hole;
-
-       /* Re-use allocated memory for intermediate lbas */
-       lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
-       lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
-
-       new_bio = bio_alloc(GFP_KERNEL, nr_holes);
-
-       if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
-               goto fail_add_pages;
-
-       if (nr_holes != new_bio->bi_vcnt) {
-               pr_err("pblk: malformed bio\n");
-               goto fail;
-       }
-
-       for (i = 0; i < nr_secs; i++)
-               lba_list_mem[i] = meta_list[i].lba;
-
-       new_bio->bi_iter.bi_sector = 0; /* internal bio */
-       bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
-
-       rqd->bio = new_bio;
-       rqd->nr_ppas = nr_holes;
-       rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
-
-       if (unlikely(nr_holes == 1)) {
-               ppa_ptr = rqd->ppa_list;
-               dma_ppa_list = rqd->dma_ppa_list;
-               rqd->ppa_addr = rqd->ppa_list[0];
-       }
-
-       ret = pblk_submit_io_sync(pblk, rqd);
-       if (ret) {
-               bio_put(rqd->bio);
-               pr_err("pblk: sync read IO submission failed\n");
-               goto fail;
-       }
-
-       if (rqd->error) {
-               atomic_long_inc(&pblk->read_failed);
-#ifdef CONFIG_NVM_DEBUG
-               pblk_print_failed_rqd(pblk, rqd, rqd->error);
-#endif
-       }
+       __le64 *lba_list_mem, *lba_list_media;
+       void *src_p, *dst_p;
+       int hole, i;
 
        if (unlikely(nr_holes == 1)) {
                struct ppa_addr ppa;
 
                ppa = rqd->ppa_addr;
-               rqd->ppa_list = ppa_ptr;
-               rqd->dma_ppa_list = dma_ppa_list;
+               rqd->ppa_list = pr_ctx->ppa_ptr;
+               rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
                rqd->ppa_list[0] = ppa;
        }
 
+       /* Re-use allocated memory for intermediate lbas */
+       lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+       lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
+
        for (i = 0; i < nr_secs; i++) {
                lba_list_media[i] = meta_list[i].lba;
                meta_list[i].lba = lba_list_mem[i];
@@ -316,7 +278,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
                meta_list[hole].lba = lba_list_media[i];
 
                src_bv = new_bio->bi_io_vec[i++];
-               dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole];
+               dst_bv = bio->bi_io_vec[bio_init_idx + hole];
 
                src_p = kmap_atomic(src_bv.bv_page);
                dst_p = kmap_atomic(dst_bv.bv_page);
@@ -334,19 +296,107 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
        } while (hole < nr_secs);
 
        bio_put(new_bio);
+       kfree(pr_ctx);
 
        /* restore original request */
        rqd->bio = NULL;
        rqd->nr_ppas = nr_secs;
 
+       bio_endio(bio);
        __pblk_end_io_read(pblk, rqd, false);
-       return NVM_IO_DONE;
+}
 
-fail:
-       /* Free allocated pages in new bio */
+static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+                           unsigned int bio_init_idx,
+                           unsigned long *read_bitmap,
+                           int nr_holes)
+{
+       struct pblk_sec_meta *meta_list = rqd->meta_list;
+       struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+       struct pblk_pr_ctx *pr_ctx;
+       struct bio *new_bio, *bio = r_ctx->private;
+       __le64 *lba_list_mem;
+       int nr_secs = rqd->nr_ppas;
+       int i;
+
+       /* Re-use allocated memory for intermediate lbas */
+       lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+
+       new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+
+       if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+               goto fail_bio_put;
+
+       if (nr_holes != new_bio->bi_vcnt) {
+               WARN_ONCE(1, "pblk: malformed bio\n");
+               goto fail_free_pages;
+       }
+
+       pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
+       if (!pr_ctx)
+               goto fail_free_pages;
+
+       for (i = 0; i < nr_secs; i++)
+               lba_list_mem[i] = meta_list[i].lba;
+
+       new_bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+
+       rqd->bio = new_bio;
+       rqd->nr_ppas = nr_holes;
+       rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
+
+       pr_ctx->ppa_ptr = NULL;
+       pr_ctx->orig_bio = bio;
+       bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
+       pr_ctx->bio_init_idx = bio_init_idx;
+       pr_ctx->orig_nr_secs = nr_secs;
+       r_ctx->private = pr_ctx;
+
+       if (unlikely(nr_holes == 1)) {
+               pr_ctx->ppa_ptr = rqd->ppa_list;
+               pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
+               rqd->ppa_addr = rqd->ppa_list[0];
+       }
+       return 0;
+
+fail_free_pages:
        pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
-fail_add_pages:
-       pr_err("pblk: failed to perform partial read\n");
+fail_bio_put:
+       bio_put(new_bio);
+
+       return -ENOMEM;
+}
+
+static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+                                unsigned int bio_init_idx,
+                                unsigned long *read_bitmap, int nr_secs)
+{
+       int nr_holes;
+       int ret;
+
+       nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+
+       if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
+                                   nr_holes))
+               return NVM_IO_ERR;
+
+       rqd->end_io = pblk_end_partial_read;
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               bio_put(rqd->bio);
+               pblk_err(pblk, "partial read IO submission failed\n");
+               goto err;
+       }
+
+       return NVM_IO_OK;
+
+err:
+       pblk_err(pblk, "failed to perform partial read\n");
+
+       /* Free allocated pages in new bio */
+       pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
        __pblk_end_io_read(pblk, rqd, false);
        return NVM_IO_ERR;
 }
@@ -359,7 +409,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
 
        pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_inc(&pblk->inflight_reads);
 #endif
 
@@ -382,7 +432,7 @@ retry:
                WARN_ON(test_and_set_bit(0, read_bitmap));
                meta_list[0].lba = cpu_to_le64(lba);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                atomic_long_inc(&pblk->cache_reads);
 #endif
        } else {
@@ -401,7 +451,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
        struct pblk_g_ctx *r_ctx;
        struct nvm_rq *rqd;
        unsigned int bio_init_idx;
-       unsigned long read_bitmap; /* Max 64 ppas per request */
+       DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
        int ret = NVM_IO_ERR;
 
        /* logic error: lba out-of-bounds. Ignore read request */
@@ -411,9 +461,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
                return NVM_IO_ERR;
        }
 
-       generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
+       generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
+                             &pblk->disk->part0);
 
-       bitmap_zero(&read_bitmap, nr_secs);
+       bitmap_zero(read_bitmap, nr_secs);
 
        rqd = pblk_alloc_rqd(pblk, PBLK_READ);
 
@@ -436,7 +487,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
        rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
                                                        &rqd->dma_meta_list);
        if (!rqd->meta_list) {
-               pr_err("pblk: not able to allocate ppa list\n");
+               pblk_err(pblk, "not able to allocate ppa list\n");
                goto fail_rqd_free;
        }
 
@@ -444,32 +495,32 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
                rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
                rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
 
-               pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap);
+               pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
        } else {
-               pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap);
+               pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
        }
 
-       if (bitmap_full(&read_bitmap, nr_secs)) {
+       if (bitmap_full(read_bitmap, nr_secs)) {
                atomic_inc(&pblk->inflight_io);
                __pblk_end_io_read(pblk, rqd, false);
                return NVM_IO_DONE;
        }
 
        /* All sectors are to be read from the device */
-       if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
+       if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
                struct bio *int_bio = NULL;
 
                /* Clone read bio to deal with read errors internally */
                int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
                if (!int_bio) {
-                       pr_err("pblk: could not clone read bio\n");
+                       pblk_err(pblk, "could not clone read bio\n");
                        goto fail_end_io;
                }
 
                rqd->bio = int_bio;
 
                if (pblk_submit_io(pblk, rqd)) {
-                       pr_err("pblk: read IO submission failed\n");
+                       pblk_err(pblk, "read IO submission failed\n");
                        ret = NVM_IO_ERR;
                        goto fail_end_io;
                }
@@ -480,8 +531,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
        /* The read bio request could be partially filled by the write buffer,
         * but there are some holes that need to be read from the drive.
         */
-       return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap);
+       ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
+                                   nr_secs);
+       if (ret)
+               goto fail_meta_free;
+
+       return NVM_IO_OK;
 
+fail_meta_free:
+       nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
 fail_rqd_free:
        pblk_free_rqd(pblk, rqd, PBLK_READ);
        return ret;
@@ -514,7 +572,7 @@ static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
                rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
        }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(valid_secs, &pblk->inflight_reads);
 #endif
 
@@ -548,7 +606,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
        rqd->ppa_addr = ppa_l2p;
        valid_secs = 1;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_inc(&pblk->inflight_reads);
 #endif
 
@@ -595,7 +653,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
        bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
                                                PBLK_VMALLOC_META, GFP_KERNEL);
        if (IS_ERR(bio)) {
-               pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
+               pblk_err(pblk, "could not allocate GC bio (%lu)\n",
+                               PTR_ERR(bio));
                goto err_free_dma;
        }
 
@@ -609,7 +668,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
 
        if (pblk_submit_io_sync(pblk, &rqd)) {
                ret = -EIO;
-               pr_err("pblk: GC read request failed\n");
+               pblk_err(pblk, "GC read request failed\n");
                goto err_free_bio;
        }
 
@@ -619,12 +678,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
 
        if (rqd.error) {
                atomic_long_inc(&pblk->read_failed_gc);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                pblk_print_failed_rqd(pblk, &rqd, rqd.error);
 #endif
        }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
        atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
        atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
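
The read-path hunks above (apparently pblk-read.c) contain the one real refactor in this excerpt: the old synchronous pblk_partial_read() is split into pblk_setup_partial_read(), which builds the hole-filling bio and stashes the original request state in a struct pblk_pr_ctx, and pblk_end_partial_read(), which runs as the rqd completion, copies the device data back into the original bio, ends that bio itself and only then releases the request. In support of that, the per-request bitmap grows from a single unsigned long to DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA), and the I/O accounting calls pass REQ_OP_READ instead of the old READ direction flag, presumably tracking an updated generic_start_io_acct()/generic_end_io_acct() signature elsewhere in the tree. DECLARE_BITMAP() is only an array declaration, so the bitmap_zero()/bitmap_full()/bitmap_empty() calls above keep working unchanged; roughly:

        /* approximate expansion of DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA) */
        unsigned long read_bitmap[BITS_TO_LONGS(NVM_MAX_VLBA)];
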
index 3a50691..e232e47 100644
@@ -77,7 +77,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
        }
 
        if (nr_valid_lbas != nr_lbas)
-               pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n",
+               pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n",
                                line->id, nr_valid_lbas, nr_lbas);
 
        line->left_msecs = 0;
@@ -184,7 +184,7 @@ next_read_rq:
        /* If read fails, more padding is needed */
        ret = pblk_submit_io_sync(pblk, rqd);
        if (ret) {
-               pr_err("pblk: I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "I/O submission failed: %d\n", ret);
                return ret;
        }
 
@@ -194,7 +194,7 @@ next_read_rq:
         * we cannot recover from here. Need FTL log.
         */
        if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
-               pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
+               pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error);
                return -EINTR;
        }
 
@@ -273,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
 next_pad_rq:
        rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
        if (rq_ppas < pblk->min_write_pgs) {
-               pr_err("pblk: corrupted pad line %d\n", line->id);
+               pblk_err(pblk, "corrupted pad line %d\n", line->id);
                goto fail_free_pad;
        }
 
@@ -342,7 +342,7 @@ next_pad_rq:
 
        ret = pblk_submit_io(pblk, rqd);
        if (ret) {
-               pr_err("pblk: I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "I/O submission failed: %d\n", ret);
                pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
                goto fail_free_bio;
        }
@@ -356,12 +356,12 @@ next_pad_rq:
 
        if (!wait_for_completion_io_timeout(&pad_rq->wait,
                                msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
-               pr_err("pblk: pad write timed out\n");
+               pblk_err(pblk, "pad write timed out\n");
                ret = -ETIME;
        }
 
        if (!pblk_line_is_full(line))
-               pr_err("pblk: corrupted padded line: %d\n", line->id);
+               pblk_err(pblk, "corrupted padded line: %d\n", line->id);
 
        vfree(data);
 free_rq:
@@ -461,7 +461,7 @@ next_rq:
 
        ret = pblk_submit_io_sync(pblk, rqd);
        if (ret) {
-               pr_err("pblk: I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "I/O submission failed: %d\n", ret);
                return ret;
        }
 
@@ -501,11 +501,11 @@ next_rq:
 
                ret = pblk_recov_pad_oob(pblk, line, pad_secs);
                if (ret)
-                       pr_err("pblk: OOB padding failed (err:%d)\n", ret);
+                       pblk_err(pblk, "OOB padding failed (err:%d)\n", ret);
 
                ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
                if (ret)
-                       pr_err("pblk: OOB read failed (err:%d)\n", ret);
+                       pblk_err(pblk, "OOB read failed (err:%d)\n", ret);
 
                left_ppas = 0;
        }
@@ -592,7 +592,7 @@ next_rq:
 
        ret = pblk_submit_io_sync(pblk, rqd);
        if (ret) {
-               pr_err("pblk: I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "I/O submission failed: %d\n", ret);
                bio_put(bio);
                return ret;
        }
@@ -671,14 +671,14 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
 
        ret = pblk_recov_scan_oob(pblk, line, p, &done);
        if (ret) {
-               pr_err("pblk: could not recover L2P from OOB\n");
+               pblk_err(pblk, "could not recover L2P from OOB\n");
                goto out;
        }
 
        if (!done) {
                ret = pblk_recov_scan_all_oob(pblk, line, p);
                if (ret) {
-                       pr_err("pblk: could not recover L2P from OOB\n");
+                       pblk_err(pblk, "could not recover L2P from OOB\n");
                        goto out;
                }
        }
@@ -737,14 +737,15 @@ static int pblk_recov_check_line_version(struct pblk *pblk,
        struct line_header *header = &emeta->header;
 
        if (header->version_major != EMETA_VERSION_MAJOR) {
-               pr_err("pblk: line major version mismatch: %d, expected: %d\n",
-                      header->version_major, EMETA_VERSION_MAJOR);
+               pblk_err(pblk, "line major version mismatch: %d, expected: %d\n",
+                        header->version_major, EMETA_VERSION_MAJOR);
                return 1;
        }
 
-#ifdef NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        if (header->version_minor > EMETA_VERSION_MINOR)
-               pr_info("pblk: newer line minor version found: %d\n", line_v);
+               pblk_info(pblk, "newer line minor version found: %d\n",
+                               header->version_minor);
 #endif
 
        return 0;
@@ -851,7 +852,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
                        continue;
 
                if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
-                       pr_err("pblk: found incompatible line version %u\n",
+                       pblk_err(pblk, "found incompatible line version %u\n",
                                        smeta_buf->header.version_major);
                        return ERR_PTR(-EINVAL);
                }
@@ -863,7 +864,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
                }
 
                if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
-                       pr_debug("pblk: ignore line %u due to uuid mismatch\n",
+                       pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
                                        i);
                        continue;
                }
@@ -887,7 +888,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 
                pblk_recov_line_add_ordered(&recov_list, line);
                found_lines++;
-               pr_debug("pblk: recovering data line %d, seq:%llu\n",
+               pblk_debug(pblk, "recovering data line %d, seq:%llu\n",
                                                line->id, smeta_buf->seq_nr);
        }
 
@@ -947,7 +948,7 @@ next:
                        line->emeta = NULL;
                } else {
                        if (open_lines > 1)
-                               pr_err("pblk: failed to recover L2P\n");
+                               pblk_err(pblk, "failed to recover L2P\n");
 
                        open_lines++;
                        line->meta_line = meta_line;
@@ -976,7 +977,7 @@ next:
 
 out:
        if (found_lines != recovered_lines)
-               pr_err("pblk: failed to recover all found lines %d/%d\n",
+               pblk_err(pblk, "failed to recover all found lines %d/%d\n",
                                                found_lines, recovered_lines);
 
        return data_line;
@@ -999,7 +1000,7 @@ int pblk_recov_pad(struct pblk *pblk)
 
        ret = pblk_recov_pad_oob(pblk, line, left_msecs);
        if (ret) {
-               pr_err("pblk: Tear down padding failed (%d)\n", ret);
+               pblk_err(pblk, "tear down padding failed (%d)\n", ret);
                return ret;
        }
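
One hunk in the recovery code above is more than a mechanical swap: the minor-version warning in pblk_recov_check_line_version() used to sit behind #ifdef NVM_DEBUG -- without the CONFIG_ prefix that symbol is never defined, so the block was dead code -- and it printed a line_v variable that is not declared in the function. Switching the guard to CONFIG_NVM_PBLK_DEBUG and printing header->version_minor lets the warning actually compile and fire on debug builds.
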
 
index 88a0a7c..9fc3dfa 100644
@@ -268,7 +268,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
        spin_unlock(&l_mg->free_lock);
 
        if (nr_free_lines != free_line_cnt)
-               pr_err("pblk: corrupted free line list:%d/%d\n",
+               pblk_err(pblk, "corrupted free line list:%d/%d\n",
                                                nr_free_lines, free_line_cnt);
 
        sz = snprintf(page, PAGE_SIZE - sz,
@@ -421,7 +421,7 @@ static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
        return sz;
 }
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
 static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
 {
        return snprintf(page, PAGE_SIZE,
@@ -598,7 +598,7 @@ static struct attribute sys_padding_dist = {
        .mode = 0644,
 };
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
 static struct attribute sys_stats_debug_attr = {
        .name = "stats",
        .mode = 0444,
@@ -619,7 +619,7 @@ static struct attribute *pblk_attrs[] = {
        &sys_write_amp_mileage,
        &sys_write_amp_trip,
        &sys_padding_dist,
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        &sys_stats_debug_attr,
 #endif
        NULL,
@@ -654,7 +654,7 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
                return pblk_sysfs_get_write_amp_trip(pblk, buf);
        else if (strcmp(attr->name, "padding_dist") == 0)
                return pblk_sysfs_get_padding_dist(pblk, buf);
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        else if (strcmp(attr->name, "stats") == 0)
                return pblk_sysfs_stats_debug(pblk, buf);
 #endif
@@ -697,8 +697,7 @@ int pblk_sysfs_init(struct gendisk *tdisk)
                                        kobject_get(&parent_dev->kobj),
                                        "%s", "pblk");
        if (ret) {
-               pr_err("pblk: could not register %s/pblk\n",
-                                               tdisk->disk_name);
+               pblk_err(pblk, "could not register\n");
                return ret;
        }
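
The sysfs registration hunk just above is a good illustration of why the helpers pay off: the failure message used to interpolate tdisk->disk_name by hand, and can now drop that entirely because the pblk_err() prefix is expected to carry the instance name already.
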
 
index f353e52..ee774a8 100644
@@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
                        /* Release flags on context. Protect from writes */
                        smp_store_release(&w_ctx->flags, flags);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
                        atomic_dec(&rwb->inflight_flush_point);
 #endif
                }
@@ -51,7 +51,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
                pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
                                                        c_ctx->nr_padded);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
 #endif
 
@@ -78,7 +78,7 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
        unsigned long flags;
        unsigned long pos;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
 #endif
 
@@ -196,7 +196,7 @@ static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
        list_add_tail(&r_ctx->list, &pblk->resubmit_list);
        spin_unlock(&pblk->resubmit_lock);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
 #endif
 }
@@ -238,7 +238,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
 
        recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
        if (!recovery) {
-               pr_err("pblk: could not allocate recovery work\n");
+               pblk_err(pblk, "could not allocate recovery work\n");
                return;
        }
 
@@ -258,7 +258,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
                pblk_end_w_fail(pblk, rqd);
                return;
        }
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        else
                WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
 #endif
@@ -279,7 +279,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
 
        if (rqd->error) {
                pblk_log_write_err(pblk, rqd);
-               pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
+               pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id);
                line->w_err_gc->has_write_err = 1;
        }
 
@@ -356,11 +356,11 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
 
        secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        if ((!secs_to_sync && secs_to_flush)
                        || (secs_to_sync < 0)
                        || (secs_to_sync > secs_avail && !secs_to_flush)) {
-               pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
+               pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n",
                                secs_avail, secs_to_sync, secs_to_flush);
        }
 #endif
@@ -397,7 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
        bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
                                        l_mg->emeta_alloc_type, GFP_KERNEL);
        if (IS_ERR(bio)) {
-               pr_err("pblk: failed to map emeta io");
+               pblk_err(pblk, "failed to map emeta io");
                ret = PTR_ERR(bio);
                goto fail_free_rqd;
        }
@@ -428,7 +428,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
 
        ret = pblk_submit_io(pblk, rqd);
        if (ret) {
-               pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+               pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
                goto fail_rollback;
        }
 
@@ -518,7 +518,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
        /* Assign lbas to ppas and populate request structure */
        err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
        if (err) {
-               pr_err("pblk: could not setup write request: %d\n", err);
+               pblk_err(pblk, "could not setup write request: %d\n", err);
                return NVM_IO_ERR;
        }
 
@@ -527,7 +527,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
        /* Submit data write for current data line */
        err = pblk_submit_io(pblk, rqd);
        if (err) {
-               pr_err("pblk: data I/O submission failed: %d\n", err);
+               pblk_err(pblk, "data I/O submission failed: %d\n", err);
                return NVM_IO_ERR;
        }
 
@@ -549,7 +549,8 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
                /* Submit metadata write for previous data line */
                err = pblk_submit_meta_io(pblk, meta_line);
                if (err) {
-                       pr_err("pblk: metadata I/O submission failed: %d", err);
+                       pblk_err(pblk, "metadata I/O submission failed: %d",
+                                       err);
                        return NVM_IO_ERR;
                }
        }
@@ -614,7 +615,7 @@ static int pblk_submit_write(struct pblk *pblk)
                secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
                                        secs_to_flush);
                if (secs_to_sync > pblk->max_write_pgs) {
-                       pr_err("pblk: bad buffer sync calculation\n");
+                       pblk_err(pblk, "bad buffer sync calculation\n");
                        return 1;
                }
 
@@ -633,14 +634,14 @@ static int pblk_submit_write(struct pblk *pblk)
 
        if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
                                                                secs_avail)) {
-               pr_err("pblk: corrupted write bio\n");
+               pblk_err(pblk, "corrupted write bio\n");
                goto fail_put_bio;
        }
 
        if (pblk_submit_io_set(pblk, rqd))
                goto fail_free_bio;
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_long_add(secs_to_sync, &pblk->sub_writes);
 #endif
 
index 34cc1d6..4760af7 100644 (file)
@@ -119,6 +119,16 @@ struct pblk_g_ctx {
        u64 lba;
 };
 
+/* partial read context */
+struct pblk_pr_ctx {
+       struct bio *orig_bio;
+       DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
+       unsigned int orig_nr_secs;
+       unsigned int bio_init_idx;
+       void *ppa_ptr;
+       dma_addr_t dma_ppa_list;
+};
+
 /* Pad context */
 struct pblk_pad_rq {
        struct pblk *pblk;
@@ -193,7 +203,7 @@ struct pblk_rb {
        spinlock_t w_lock;              /* Write lock */
        spinlock_t s_lock;              /* Sync lock */
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        atomic_t inflight_flush_point;  /* Not served REQ_FLUSH | REQ_FUA */
 #endif
 };
@@ -608,9 +618,6 @@ struct pblk {
 
        int min_write_pgs; /* Minimum amount of pages required by controller */
        int max_write_pgs; /* Maximum amount of pages supported by controller */
-       int pgs_in_buffer; /* Number of pages that need to be held in buffer to
-                           * guarantee successful reads.
-                           */
 
        sector_t capacity; /* Device capacity when bad blocks are subtracted */
 
@@ -639,7 +646,7 @@ struct pblk {
        u64 nr_flush_rst;               /* Flushes reset value for pad dist.*/
        atomic64_t nr_flush;            /* Number of flush/fua I/O */
 
-#ifdef CONFIG_NVM_DEBUG
+#ifdef CONFIG_NVM_PBLK_DEBUG
        /* Non-persistent debug counters, 4kb sector I/Os */
        atomic_long_t inflight_writes;  /* Inflight writes (user and gc) */
        atomic_long_t padded_writes;    /* Sectors padded due to flush/fua */
@@ -706,6 +713,15 @@ struct pblk_line_ws {
 #define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
 #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
 
+#define pblk_err(pblk, fmt, ...)                       \
+       pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+#define pblk_info(pblk, fmt, ...)                      \
+       pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+#define pblk_warn(pblk, fmt, ...)                      \
+       pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+#define pblk_debug(pblk, fmt, ...)                     \
+       pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
+
 /*
  * pblk ring buffer operations
  */
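
The pblk_err()/pblk_info()/pblk_warn()/pblk_debug() wrappers added above prefix every message with the instance's disk name, which is why the call sites converted earlier in this series can drop the hand-written "pblk:" prefixes. A minimal usage sketch, assuming only a valid struct pblk whose gendisk name has been set up at target creation (pblk_example_submit() is a hypothetical caller, not a function from the patch):

/* Sketch only: relies on the pblk_err()/pblk_debug() macros defined above
 * and on pblk->disk->disk_name being initialized when the target is created.
 */
static int pblk_example_submit(struct pblk *pblk, struct nvm_rq *rqd)
{
        int ret;

        ret = pblk_submit_io(pblk, rqd);        /* existing pblk helper */
        if (ret) {
                /* prints e.g. "pblk mydev: I/O submission failed: -5" */
                pblk_err(pblk, "I/O submission failed: %d\n", ret);
                return ret;
        }

        pblk_debug(pblk, "submitted %d ppas\n", rqd->nr_ppas);
        return 0;
}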
@@ -1282,20 +1298,22 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs)
        return !(nr_secs % pblk->min_write_pgs);
 }
 
-#ifdef CONFIG_NVM_DEBUG
-static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p,
+#ifdef CONFIG_NVM_PBLK_DEBUG
+static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p,
                             char *msg, int error)
 {
+       struct nvm_geo *geo = &pblk->dev->geo;
+
        if (p->c.is_cached) {
-               pr_err("ppa: (%s: %x) cache line: %llu\n",
+               pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n",
                                msg, error, (u64)p->c.line);
        } else if (geo->version == NVM_OCSSD_SPEC_12) {
-               pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+               pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
                        msg, error,
                        p->g.ch, p->g.lun, p->g.blk,
                        p->g.pg, p->g.pl, p->g.sec);
        } else {
-               pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
+               pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
                        msg, error,
                        p->m.grp, p->m.pu, p->m.chk, p->m.sec);
        }
@@ -1307,16 +1325,16 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
        int bit = -1;
 
        if (rqd->nr_ppas ==  1) {
-               print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error);
+               print_ppa(pblk, &rqd->ppa_addr, "rqd", error);
                return;
        }
 
        while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
                                                bit + 1)) < rqd->nr_ppas) {
-               print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error);
+               print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error);
        }
 
-       pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+       pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
 }
 
 static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
@@ -1347,7 +1365,7 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
                                continue;
                }
 
-               print_ppa(geo, ppa, "boundary", i);
+               print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i);
 
                return 1;
        }
@@ -1377,7 +1395,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
 
                        spin_lock(&line->lock);
                        if (line->state != PBLK_LINESTATE_OPEN) {
-                               pr_err("pblk: bad ppa: line:%d,state:%d\n",
+                               pblk_err(pblk, "bad ppa: line:%d,state:%d\n",
                                                        line->id, line->state);
                                WARN_ON(1);
                                spin_unlock(&line->lock);
index d6bf294..872ef4d 100644 (file)
@@ -474,6 +474,7 @@ struct cache {
 
 struct gc_stat {
        size_t                  nodes;
+       size_t                  nodes_pre;
        size_t                  key_bytes;
 
        size_t                  nkeys;
@@ -525,6 +526,7 @@ struct cache_set {
        unsigned                devices_max_used;
        struct list_head        cached_devs;
        uint64_t                cached_dev_sectors;
+       atomic_long_t           flash_dev_dirty_sectors;
        struct closure          caching;
 
        struct closure          sb_write;
@@ -602,6 +604,10 @@ struct cache_set {
         * rescale; when it hits 0 we rescale all the bucket priorities.
         */
        atomic_t                rescale;
+       /*
+        * used for GC; identifies whether any front-side I/O is in flight
+        */
+       atomic_t                search_inflight;
        /*
         * When we invalidate buckets, we use both the priority and the amount
         * of good data to determine which buckets to reuse first - to weight
index 547c9ee..475008f 100644 (file)
@@ -90,6 +90,9 @@
 
 #define MAX_NEED_GC            64
 #define MAX_SAVE_PRIO          72
+#define MAX_GC_TIMES           100
+#define MIN_GC_NODES           100
+#define GC_SLEEP_MS            100
 
 #define PTR_DIRTY_BIT          (((uint64_t) 1 << 36))
 
@@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
        return ret;
 }
 
+static size_t btree_gc_min_nodes(struct cache_set *c)
+{
+       size_t min_nodes;
+
+       /*
+        * Incremental GC yields for 100ms whenever front-side I/O
+        * arrives. If GC processed only a constant number (100) of
+        * nodes per slice, a large btree would make GC last a very
+        * long time and front-side I/O would run out of buckets (no
+        * new bucket can be allocated during GC) and be blocked
+        * again. So scale the per-slice work with the size of the
+        * btree by dividing the whole GC into a constant (100)
+        * number of slices: large btrees get more nodes per slice,
+        * small ones get fewer, but never fewer than MIN_GC_NODES.
+        */
+       min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
+       if (min_nodes < MIN_GC_NODES)
+               min_nodes = MIN_GC_NODES;
+
+       return min_nodes;
+}
+
+
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                            struct closure *writes, struct gc_stat *gc)
 {
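
As a rough worked example of the sizing above (node counts are made up for illustration, not taken from the patch): a cache set with c->gc_stats.nodes == 80000 gives min_nodes = 80000 / MAX_GC_TIMES = 800, so each GC slice handles at least 800 nodes; a small set with 2000 nodes gives 2000 / 100 = 20, which falls below the floor and is clamped up to MIN_GC_NODES = 100.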
@@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
                memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
                r->b = NULL;
 
+               if (atomic_read(&b->c->search_inflight) &&
+                   gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+                       gc->nodes_pre =  gc->nodes;
+                       ret = -EAGAIN;
+                       break;
+               }
+
                if (need_resched()) {
                        ret = -EAGAIN;
                        break;
@@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
                closure_sync(&writes);
                cond_resched();
 
-               if (ret && ret != -EAGAIN)
+               if (ret == -EAGAIN)
+                       schedule_timeout_interruptible(msecs_to_jiffies
+                                                      (GC_SLEEP_MS));
+               else if (ret)
                        pr_warn("gc failed!");
        } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
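
Taken together, the hunks above turn bcache GC into a cooperative, incremental loop: btree_gc_recurse() bails out with -EAGAIN once a slice of nodes has been processed while front-side I/O (search_inflight) is pending, and bch_btree_gc() sleeps GC_SLEEP_MS before retrying instead of treating -EAGAIN as a failure. A stripped-down sketch of that pattern, with do_gc_chunk() as a hypothetical stand-in for the recursion:

/* Sketch of the incremental pattern used above; do_gc_chunk() is a
 * hypothetical placeholder for btree_gc_recurse(), which returns -EAGAIN
 * after roughly btree_gc_min_nodes() nodes once front-side I/O is pending.
 */
static void incremental_gc(struct cache_set *c)
{
        int ret;

        do {
                ret = do_gc_chunk(c);
                if (ret == -EAGAIN)
                        /* yield so front-side I/O can make progress */
                        schedule_timeout_interruptible(
                                        msecs_to_jiffies(GC_SLEEP_MS));
                else if (ret)
                        pr_warn("gc failed!");
        } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
}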
 
@@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
                do {
                        k = bch_btree_iter_next_filter(&iter, &b->keys,
                                                       bch_ptr_bad);
-                       if (k)
+                       if (k) {
                                btree_node_prefetch(b, k);
+                               /*
+                                * initialize c->gc_stats.nodes
+                                * for incremental GC
+                                */
+                               b->c->gc_stats.nodes++;
+                       }
 
                        if (p)
                                ret = btree(check_recurse, p, b, op);
index d030ce3..04d1467 100644 (file)
@@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
        struct bio_vec bv, cbv;
        struct bvec_iter iter, citer = { 0 };
 
-       check = bio_clone_kmalloc(bio, GFP_NOIO);
+       check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
        if (!check)
                return;
+       check->bi_disk = bio->bi_disk;
        check->bi_opf = REQ_OP_READ;
+       check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+       check->bi_iter.bi_size = bio->bi_iter.bi_size;
 
+       bch_bio_map(check, NULL);
        if (bch_bio_alloc_pages(check, GFP_NOIO))
                goto out_put;
 
index 18f1b52..10748c6 100644 (file)
@@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
        free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
        free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
        free_fifo(&c->journal.pin);
+       free_heap(&c->flush_btree);
 }
 
 int bch_journal_alloc(struct cache_set *c)
index ae67f5f..43af905 100644 (file)
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
 static void bio_complete(struct search *s)
 {
        if (s->orig_bio) {
-               generic_end_io_acct(s->d->disk->queue,
-                                   bio_data_dir(s->orig_bio),
+               generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
                                    &s->d->disk->part0, s->start_time);
 
                trace_bcache_request_end(s->d, s->orig_bio);
@@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
 {
        struct search *s = container_of(cl, struct search, cl);
 
+       atomic_dec(&s->d->c->search_inflight);
+
        if (s->iop.bio)
                bio_put(s->iop.bio);
 
@@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
 
        closure_init(&s->cl, NULL);
        do_bio_hook(s, bio, request_endio);
+       atomic_inc(&d->c->search_inflight);
 
        s->orig_bio             = bio;
        s->cache_miss           = NULL;
@@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
        bio->bi_end_io = ddip->bi_end_io;
        bio->bi_private = ddip->bi_private;
 
-       generic_end_io_acct(ddip->d->disk->queue,
-                           bio_data_dir(bio),
+       generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
                            &ddip->d->disk->part0, ddip->start_time);
 
        if (bio->bi_status) {
@@ -1120,7 +1121,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
        }
 
        atomic_set(&dc->backing_idle, 0);
-       generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+       generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
 
        bio_set_dev(bio, dc->bdev);
        bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1230,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
        struct search *s;
        struct closure *cl;
        struct bcache_device *d = bio->bi_disk->private_data;
-       int rw = bio_data_dir(bio);
 
        if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
                bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1237,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
                return BLK_QC_T_NONE;
        }
 
-       generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+       generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
 
        s = search_alloc(bio, d);
        cl = &s->cl;
@@ -1254,7 +1254,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
                                      flash_dev_nodata,
                                      bcache_wq);
                return BLK_QC_T_NONE;
-       } else if (rw) {
+       } else if (bio_data_dir(bio)) {
                bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
                                        &KEY(d->id, bio->bi_iter.bi_sector, 0),
                                        &KEY(d->id, bio_end_sector(bio), 0));
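
All of the accounting hunks in this file follow one conversion: generic_start_io_acct() and generic_end_io_acct() now take the request operation (bio_op(bio)) rather than only the read/write direction, so the stats code can bin discards separately from writes. A minimal sketch of the new call pattern in a make_request-style function (example_make_request() is illustrative; 'd' stands in for bcache's struct bcache_device and error handling is omitted):

/* Sketch only: mirrors the op-aware accounting calls in the hunks above. */
static blk_qc_t example_make_request(struct request_queue *q, struct bio *bio)
{
        struct bcache_device *d = bio->bi_disk->private_data;
        unsigned long start_time = jiffies;

        generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);

        /* ... service the bio ... */

        generic_end_io_acct(q, bio_op(bio), &d->disk->part0, start_time);
        bio_endio(bio);
        return BLK_QC_T_NONE;
}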
index fa4058e..e0a9210 100644 (file)
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
                goto err;
        }
 
-       sb->last_mount = get_seconds();
+       sb->last_mount = (u32)ktime_get_real_seconds();
        err = NULL;
 
        get_page(bh->b_page);
@@ -701,7 +701,7 @@ static void bcache_device_detach(struct bcache_device *d)
 
                SET_UUID_FLASH_ONLY(u, 0);
                memcpy(u->uuid, invalid_uuid, 16);
-               u->invalidated = cpu_to_le32(get_seconds());
+               u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
                bch_uuid_write(d->c);
        }
 
@@ -796,11 +796,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
                return idx;
 
        if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
-                       BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
-           !(d->disk = alloc_disk(BCACHE_MINORS))) {
-               ida_simple_remove(&bcache_device_idx, idx);
-               return -ENOMEM;
-       }
+                       BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+               goto err;
+
+       d->disk = alloc_disk(BCACHE_MINORS);
+       if (!d->disk)
+               goto err;
 
        set_capacity(d->disk, sectors);
        snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -834,6 +835,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
        blk_queue_write_cache(q, true, true);
 
        return 0;
+
+err:
+       ida_simple_remove(&bcache_device_idx, idx);
+       return -ENOMEM;
+
 }
 
 /* Cached device */
@@ -1027,7 +1033,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
                          uint8_t *set_uuid)
 {
-       uint32_t rtime = cpu_to_le32(get_seconds());
+       uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
        struct uuid_entry *u;
        struct cached_dev *exist_dc, *t;
 
@@ -1070,7 +1076,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
            (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
             BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
                memcpy(u->uuid, invalid_uuid, 16);
-               u->invalidated = cpu_to_le32(get_seconds());
+               u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
                u = NULL;
        }
 
@@ -1311,6 +1317,8 @@ static void flash_dev_free(struct closure *cl)
 {
        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
        mutex_lock(&bch_register_lock);
+       atomic_long_sub(bcache_dev_sectors_dirty(d),
+                       &d->c->flash_dev_dirty_sectors);
        bcache_device_free(d);
        mutex_unlock(&bch_register_lock);
        kobject_put(&d->kobj);
@@ -1390,7 +1398,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 
        get_random_bytes(u->uuid, 16);
        memset(u->label, 0, 32);
-       u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+       u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
 
        SET_UUID_FLASH_ONLY(u, 1);
        u->sectors = size >> 9;
@@ -1894,7 +1902,7 @@ static void run_cache_set(struct cache_set *c)
                goto err;
 
        closure_sync(&cl);
-       c->sb.last_mount = get_seconds();
+       c->sb.last_mount = (u32)ktime_get_real_seconds();
        bcache_write_super(c);
 
        list_for_each_entry_safe(dc, t, &uncached_devices, list)
@@ -2163,8 +2171,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
        if (!try_module_get(THIS_MODULE))
                return -EBUSY;
 
-       if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
-           !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+       path = kstrndup(buffer, size, GFP_KERNEL);
+       if (!path)
+               goto err;
+
+       sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
+       if (!sb)
                goto err;
 
        err = "failed to open device";
@@ -2324,9 +2336,15 @@ static int __init bcache_init(void)
                return bcache_major;
        }
 
-       if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
-           !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
-           bch_request_init() ||
+       bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+       if (!bcache_wq)
+               goto err;
+
+       bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
+       if (!bcache_kobj)
+               goto err;
+
+       if (bch_request_init() ||
            bch_debug_init(bcache_kobj) || closure_debug_init() ||
            sysfs_create_files(bcache_kobj, files))
                goto err;
index ad45ebe..912e969 100644 (file)
@@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
         * flash-only devices
         */
        uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
-                               bcache_flash_devs_sectors_dirty(c);
+                               atomic_long_read(&c->flash_dev_dirty_sectors);
 
        /*
         * Unfortunately there is no control of global dirty data.  If the
@@ -476,6 +476,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
        if (!d)
                return;
 
+       if (UUID_FLASH_ONLY(&c->uuids[inode]))
+               atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
+
        stripe = offset_to_stripe(d, offset);
        stripe_offset = offset & (d->stripe_size - 1);
 
@@ -673,10 +676,14 @@ static int bch_writeback_thread(void *arg)
 }
 
 /* Init */
+#define INIT_KEYS_EACH_TIME    500000
+#define INIT_KEYS_SLEEP_MS     100
 
 struct sectors_dirty_init {
        struct btree_op op;
        unsigned        inode;
+       size_t          count;
+       struct bkey     start;
 };
 
 static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -691,18 +698,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
                bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
                                             KEY_START(k), KEY_SIZE(k));
 
+       op->count++;
+       if (atomic_read(&b->c->search_inflight) &&
+           !(op->count % INIT_KEYS_EACH_TIME)) {
+               bkey_copy_key(&op->start, k);
+               return -EAGAIN;
+       }
+
        return MAP_CONTINUE;
 }
 
 void bch_sectors_dirty_init(struct bcache_device *d)
 {
        struct sectors_dirty_init op;
+       int ret;
 
        bch_btree_op_init(&op.op, -1);
        op.inode = d->id;
-
-       bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
-                          sectors_dirty_init_fn, 0);
+       op.count = 0;
+       op.start = KEY(op.inode, 0, 0);
+
+       do {
+               ret = bch_btree_map_keys(&op.op, d->c, &op.start,
+                                        sectors_dirty_init_fn, 0);
+               if (ret == -EAGAIN)
+                       schedule_timeout_interruptible(
+                               msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+               else if (ret < 0) {
+                       pr_warn("sectors dirty init failed, ret=%d!", ret);
+                       break;
+               }
+       } while (ret == -EAGAIN);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
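
The bch_sectors_dirty_init() rework above uses the same yield-and-resume idea as the GC change, with one extra piece of state: when the map-keys callback wants to yield, it records the current key in op.start via bkey_copy_key() and returns -EAGAIN, and the outer loop restarts bch_btree_map_keys() from that key after sleeping INIT_KEYS_SLEEP_MS. A compact sketch of the resumable-iteration idiom (count_keys_fn() and struct resumable_op are hypothetical, modeled on the hunk above):

/* Sketch of the resume-from-saved-key pattern used above. */
struct resumable_op {
        struct btree_op op;
        size_t          count;
        struct bkey     start;          /* where to resume after -EAGAIN */
};

static int count_keys_fn(struct btree_op *_op, struct btree *b, struct bkey *k)
{
        struct resumable_op *op = container_of(_op, struct resumable_op, op);

        op->count++;
        if (!(op->count % INIT_KEYS_EACH_TIME)) {
                bkey_copy_key(&op->start, k);   /* remember where we stopped */
                return -EAGAIN;
        }
        return MAP_CONTINUE;
}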
index 610fb01..3745d70 100644 (file)
@@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
        return ret;
 }
 
-static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
-{
-       uint64_t i, ret = 0;
-
-       mutex_lock(&bch_register_lock);
-
-       for (i = 0; i < c->devices_max_used; i++) {
-               struct bcache_device *d = c->devices[i];
-
-               if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
-                       continue;
-               ret += bcache_dev_sectors_dirty(d);
-       }
-
-       mutex_unlock(&bch_register_lock);
-
-       return ret;
-}
-
 static inline unsigned offset_to_stripe(struct bcache_device *d,
                                        uint64_t offset)
 {
index b0dd702..20f7e4e 100644 (file)
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
 
        io->start_time = jiffies;
 
-       generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+       generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
+                             &dm_disk(md)->part0);
 
        atomic_set(&dm_disk(md)->part0.in_flight[rw],
                   atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
        int pending;
        int rw = bio_data_dir(bio);
 
-       generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
+       generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
+                           io->start_time);
 
        if (unlikely(dm_stats_used(&md->stats)))
                dm_stats_account_io(&md->stats, bio_data_dir(bio),
index 994aed2..cb4eb5f 100644 (file)
@@ -204,10 +204,6 @@ static int start_readonly;
  */
 static bool create_on_open = true;
 
-/* bio_clone_mddev
- * like bio_clone_bioset, but with a local bio set
- */
-
 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                            struct mddev *mddev)
 {
@@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request);
 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 {
        const int rw = bio_data_dir(bio);
+       const int sgrp = op_stat_group(bio_op(bio));
        struct mddev *mddev = q->queuedata;
        unsigned int sectors;
        int cpu;
@@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
        md_handle_request(mddev, bio);
 
        cpu = part_stat_lock();
-       part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-       part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+       part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
+       part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
        part_stat_unlock();
 
        return BLK_QC_T_NONE;
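
The md accounting change above switches from indexing part stats by data direction to indexing them by op_stat_group(bio_op(bio)), so reads, writes and discards each get their own ios[]/sectors[] bucket. As a rough illustration of what the grouping amounts to (a simplified stand-in, not the block layer's actual implementation):

/* Simplified illustration of the op -> stat group mapping; the real helper
 * is op_stat_group() in the block layer, this is only a sketch of the idea.
 */
static inline int example_stat_group(unsigned int op)
{
        if (op == REQ_OP_DISCARD)
                return STAT_DISCARD;
        return op_is_write(op) ? STAT_WRITE : STAT_READ;
}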
@@ -8046,8 +8043,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
        rcu_read_lock();
        rdev_for_each_rcu(rdev, mddev) {
                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-               curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
-                             (int)part_stat_read(&disk->part0, sectors[1]) -
+               curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
                              atomic_read(&disk->sync_io);
                /* sync IO will cause sync_io to increase before the disk_stats
                 * as sync_io is counted when a request starts, and
index 85de805..0360c01 100644 (file)
@@ -1423,11 +1423,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 
 static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
                        struct page *page, unsigned int len, unsigned int off,
-                       bool is_write, sector_t sector)
+                       unsigned int op, sector_t sector)
 {
        int ret;
 
-       if (!is_write) {
+       if (!op_is_write(op)) {
                ret = btt_read_pg(btt, bip, page, off, sector, len);
                flush_dcache_page(page);
        } else {
@@ -1464,7 +1464,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
                }
 
                err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
-                                 op_is_write(bio_op(bio)), iter.bi_sector);
+                                 bio_op(bio), iter.bi_sector);
                if (err) {
                        dev_err(&btt->nd_btt->dev,
                                        "io error in %s sector %lld, len %d,\n",
@@ -1483,16 +1483,16 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
-               struct page *page, bool is_write)
+               struct page *page, unsigned int op)
 {
        struct btt *btt = bdev->bd_disk->private_data;
        int rc;
        unsigned int len;
 
        len = hpage_nr_pages(page) * PAGE_SIZE;
-       rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
+       rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
        if (rc == 0)
-               page_endio(page, is_write, 0);
+               page_endio(page, op_is_write(op), 0);
 
        return rc;
 }
index 32e0364..6ee7fd7 100644 (file)
@@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
                return false;
 
        *start = jiffies;
-       generic_start_io_acct(disk->queue, bio_data_dir(bio),
-                             bio_sectors(bio), &disk->part0);
+       generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio),
+                             &disk->part0);
        return true;
 }
 static inline void nd_iostat_end(struct bio *bio, unsigned long start)
 {
        struct gendisk *disk = bio->bi_disk;
 
-       generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0,
-                               start);
+       generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start);
 }
 static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
                unsigned int len)
index 8b1fd7f..dd17acd 100644 (file)
@@ -120,7 +120,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
 }
 
 static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-                       unsigned int len, unsigned int off, bool is_write,
+                       unsigned int len, unsigned int off, unsigned int op,
                        sector_t sector)
 {
        blk_status_t rc = BLK_STS_OK;
@@ -131,7 +131,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                bad_pmem = true;
 
-       if (!is_write) {
+       if (!op_is_write(op)) {
                if (unlikely(bad_pmem))
                        rc = BLK_STS_IOERR;
                else {
@@ -180,8 +180,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
        do_acct = nd_iostat_start(bio, &start);
        bio_for_each_segment(bvec, bio, iter) {
                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-                               bvec.bv_offset, op_is_write(bio_op(bio)),
-                               iter.bi_sector);
+                               bvec.bv_offset, bio_op(bio), iter.bi_sector);
                if (rc) {
                        bio->bi_status = rc;
                        break;
@@ -198,13 +197,13 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
-                      struct page *page, bool is_write)
+                      struct page *page, unsigned int op)
 {
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        blk_status_t rc;
 
        rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
-                         0, is_write, sector);
+                         0, op, sector);
 
        /*
         * The ->rw_page interface is subtle and tricky.  The core
@@ -213,7 +212,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
         * caused by double completion.
         */
        if (rc == 0)
-               page_endio(page, is_write, 0);
+               page_endio(page, op_is_write(op), 0);
 
        return blk_status_to_errno(rc);
 }
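
Both the btt and pmem hunks above make the same interface change: the per-bvec helpers and ->rw_page() now receive the full request op (e.g. REQ_OP_READ, REQ_OP_WRITE) instead of a pre-computed is_write flag, and derive the direction locally with op_is_write(). A minimal sketch of a do_bvec-style dispatcher under that convention (example_do_read()/example_do_write() are hypothetical placeholders):

/* Sketch only: dispatches on the op the way btt_do_bvec()/pmem_do_bvec()
 * do above; example_do_read()/example_do_write() are placeholders.
 */
static int example_do_bvec(struct page *page, unsigned int len,
                           unsigned int off, unsigned int op, sector_t sector)
{
        if (!op_is_write(op)) {
                /* read path, e.g. the btt/pmem read helpers above */
                return example_do_read(page, off, sector, len);
        }
        return example_do_write(page, off, sector, len);
}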
index bf65501..9347f20 100644 (file)
@@ -617,6 +617,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
                        if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
                                return BLK_STS_NOTSUPP;
                        control |= NVME_RW_PRINFO_PRACT;
+               } else if (req_op(req) == REQ_OP_WRITE) {
+                       t10_pi_prepare(req, ns->pi_type);
                }
 
                switch (ns->pi_type) {
@@ -627,8 +629,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
                case NVME_NS_DPS_PI_TYPE2:
                        control |= NVME_RW_PRINFO_PRCHK_GUARD |
                                        NVME_RW_PRINFO_PRCHK_REF;
-                       cmnd->rw.reftag = cpu_to_le32(
-                                       nvme_block_nr(ns, blk_rq_pos(req)));
+                       cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
                        break;
                }
        }
@@ -638,6 +639,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
        return 0;
 }
 
+void nvme_cleanup_cmd(struct request *req)
+{
+       if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+           nvme_req(req)->status == 0) {
+               struct nvme_ns *ns = req->rq_disk->private_data;
+
+               t10_pi_complete(req, ns->pi_type,
+                               blk_rq_bytes(req) >> ns->lba_shift);
+       }
+       if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+               kfree(page_address(req->special_vec.bv_page) +
+                     req->special_vec.bv_offset);
+       }
+}
+EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
+
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
                struct nvme_command *cmd)
 {
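
The two hunks above (together with the removal of the driver-local ref-tag remapping code from pci.c further down) move NVMe's Type 1/Type 2 protection-information handling onto the generic block-layer helpers: t10_pi_prepare() is called before a write is issued, and t10_pi_complete() is called from nvme_cleanup_cmd() after a successful read. A compressed sketch of the flow as it appears in these hunks, stripped of the surrounding command setup, so treat it as an outline rather than the full driver path:

/* Outline of the prepare/complete pairing used above; nvme_req(),
 * blk_integrity_rq() and the t10_pi_* helpers are those referenced
 * in the hunks.
 */
static void example_setup(struct request *req, struct nvme_ns *ns)
{
        if (req_op(req) == REQ_OP_WRITE)
                t10_pi_prepare(req, ns->pi_type);       /* remap ref tags before submit */
}

static void example_cleanup(struct request *req, struct nvme_ns *ns)
{
        if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
            nvme_req(req)->status == 0)
                t10_pi_complete(req, ns->pi_type,
                                blk_rq_bytes(req) >> ns->lba_shift);    /* restore seed on read */
}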
@@ -668,10 +685,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
        }
 
        cmd->common.command_id = req->tag;
-       if (ns)
-               trace_nvme_setup_nvm_cmd(req->q->id, cmd);
-       else
-               trace_nvme_setup_admin_cmd(cmd);
+       trace_nvme_setup_cmd(req, cmd);
        return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -864,9 +878,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
        if (unlikely(ctrl->kato == 0))
                return;
 
-       INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
-       memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
-       ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
        schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
 
@@ -3499,6 +3510,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
        INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
        INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
 
+       INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+       memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
+       ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+
        ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
        if (ret < 0)
                goto out;
index 41d45a1..9cc3375 100644 (file)
@@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
        struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
 
+       nvme_req(rq)->ctrl = &ctrl->ctrl;
        return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
 }
 
index 41279da..d9e4ccc 100644 (file)
@@ -583,7 +583,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
        struct ppa_addr ppa;
        size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
        size_t log_pos, offset, len;
-       int ret, i;
+       int ret, i, max_len;
+
+       /*
+        * limit requests to 256K max to avoid issuing arbitrarily large
+        * requests when the device does not specify a maximum transfer size.
+        */
+       max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024);
 
        /* Normalize lba address space to obtain log offset */
        ppa.ppa = slba;
@@ -596,7 +602,7 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
        offset = log_pos * sizeof(struct nvme_nvm_chk_meta);
 
        while (left) {
-               len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9);
+               len = min_t(unsigned int, left, max_len);
 
                ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK,
                                dev_meta, len, offset);
@@ -662,12 +668,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
 
        rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
-       if (rqd->bio) {
+       if (rqd->bio)
                blk_init_request_from_bio(rq, rqd->bio);
-       } else {
+       else
                rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
-               rq->__data_len = 0;
-       }
 
        return rq;
 }
index 0c4a33d..cf970f9 100644 (file)
@@ -102,6 +102,7 @@ struct nvme_request {
        u8                      retries;
        u8                      flags;
        u16                     status;
+       struct nvme_ctrl        *ctrl;
 };
 
 /*
@@ -119,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req)
        return blk_mq_rq_to_pdu(req);
 }
 
+static inline u16 nvme_req_qid(struct request *req)
+{
+       if (!req->rq_disk)
+               return 0;
+       return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;
+}
+
 /* The below value is the specific amount of delay needed before checking
  * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
  * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
@@ -356,14 +364,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
        return (sector >> (ns->lba_shift - 9));
 }
 
-static inline void nvme_cleanup_cmd(struct request *req)
-{
-       if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-               kfree(page_address(req->special_vec.bv_page) +
-                     req->special_vec.bv_offset);
-       }
-}
-
 static inline void nvme_end_request(struct request *req, __le16 status,
                union nvme_result result)
 {
@@ -420,6 +420,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
+void nvme_cleanup_cmd(struct request *req);
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
                struct nvme_command *cmd);
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
index ddd441b..1b9951d 100644 (file)
@@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
 
        BUG_ON(!nvmeq);
        iod->nvmeq = nvmeq;
+
+       nvme_req(req)->ctrl = &dev->ctrl;
        return 0;
 }
 
@@ -535,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
                mempool_free(iod->sg, dev->iod_mempool);
 }
 
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-       if (be32_to_cpu(pi->ref_tag) == v)
-               pi->ref_tag = cpu_to_be32(p);
-}
-
-static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-       if (be32_to_cpu(pi->ref_tag) == p)
-               pi->ref_tag = cpu_to_be32(v);
-}
-
-/**
- * nvme_dif_remap - remaps ref tags to bip seed and physical lba
- *
- * The virtual start sector is the one that was originally submitted by the
- * block layer.        Due to partitioning, MD/DM cloning, etc. the actual physical
- * start sector may be different. Remap protection information to match the
- * physical LBA on writes, and back to the original seed on reads.
- *
- * Type 0 and 3 do not have a ref tag, so no remapping required.
- */
-static void nvme_dif_remap(struct request *req,
-                       void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
-{
-       struct nvme_ns *ns = req->rq_disk->private_data;
-       struct bio_integrity_payload *bip;
-       struct t10_pi_tuple *pi;
-       void *p, *pmap;
-       u32 i, nlb, ts, phys, virt;
-
-       if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
-               return;
-
-       bip = bio_integrity(req->bio);
-       if (!bip)
-               return;
-
-       pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
-
-       p = pmap;
-       virt = bip_get_seed(bip);
-       phys = nvme_block_nr(ns, blk_rq_pos(req));
-       nlb = (blk_rq_bytes(req) >> ns->lba_shift);
-       ts = ns->disk->queue->integrity.tuple_size;
-
-       for (i = 0; i < nlb; i++, virt++, phys++) {
-               pi = (struct t10_pi_tuple *)p;
-               dif_swap(phys, virt, pi);
-               p += ts;
-       }
-       kunmap_atomic(pmap);
-}
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_dif_remap(struct request *req,
-                       void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
-{
-}
-static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-}
-static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-}
-#endif
-
 static void nvme_print_sgl(struct scatterlist *sgl, int nents)
 {
        int i;
@@ -827,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
                        goto out_unmap;
 
-               if (req_op(req) == REQ_OP_WRITE)
-                       nvme_dif_remap(req, nvme_dif_prep);
-
                if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
                        goto out_unmap;
        }
@@ -852,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 
        if (iod->nents) {
                dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
-               if (blk_integrity_rq(req)) {
-                       if (req_op(req) == REQ_OP_READ)
-                               nvme_dif_remap(req, nvme_dif_complete);
+               if (blk_integrity_rq(req))
                        dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
-               }
        }
 
        nvme_cleanup_cmd(req);
index 518c5b0..13a6064 100644 (file)
 
 #define NVME_RDMA_MAX_SEGMENTS         256
 
-#define NVME_RDMA_MAX_INLINE_SEGMENTS  1
+#define NVME_RDMA_MAX_INLINE_SEGMENTS  4
 
 struct nvme_rdma_device {
        struct ib_device        *dev;
        struct ib_pd            *pd;
        struct kref             ref;
        struct list_head        entry;
+       unsigned int            num_inline_segments;
 };
 
 struct nvme_rdma_qe {
@@ -117,6 +118,7 @@ struct nvme_rdma_ctrl {
        struct sockaddr_storage src_addr;
 
        struct nvme_ctrl        ctrl;
+       bool                    use_inline_data;
 };
 
 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
        /* +1 for drain */
        init_attr.cap.max_recv_wr = queue->queue_size + 1;
        init_attr.cap.max_recv_sge = 1;
-       init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
+       init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
        init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        init_attr.qp_type = IB_QPT_RC;
        init_attr.send_cq = queue->ib_cq;
@@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
        struct ib_device *ibdev = dev->dev;
        int ret;
 
+       nvme_req(rq)->ctrl = &ctrl->ctrl;
        ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
                        DMA_TO_DEVICE);
        if (ret)
@@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
                goto out_free_pd;
        }
 
+       ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
+                                       ndev->dev->attrs.max_sge - 1);
        list_add(&ndev->entry, &device_list);
 out_unlock:
        mutex_unlock(&device_list_mutex);
@@ -868,6 +873,31 @@ out_free_io_queues:
        return ret;
 }
 
+static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+               bool remove)
+{
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_stop_queue(&ctrl->queues[0]);
+       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+                       &ctrl->ctrl);
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+               bool remove)
+{
+       if (ctrl->ctrl.queue_count > 1) {
+               nvme_stop_queues(&ctrl->ctrl);
+               nvme_rdma_stop_io_queues(ctrl);
+               blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
+                               &ctrl->ctrl);
+               if (remove)
+                       nvme_start_queues(&ctrl->ctrl);
+               nvme_rdma_destroy_io_queues(ctrl, remove);
+       }
+}
+
 static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
 {
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
        }
 }
 
-static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
 {
-       struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
-                       struct nvme_rdma_ctrl, reconnect_work);
+       int ret = -EINVAL;
        bool changed;
-       int ret;
 
-       ++ctrl->ctrl.nr_reconnects;
-
-       ret = nvme_rdma_configure_admin_queue(ctrl, false);
+       ret = nvme_rdma_configure_admin_queue(ctrl, new);
        if (ret)
-               goto requeue;
+               return ret;
+
+       if (ctrl->ctrl.icdoff) {
+               dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
+               goto destroy_admin;
+       }
+
+       if (!(ctrl->ctrl.sgls & (1 << 2))) {
+               dev_err(ctrl->ctrl.device,
+                       "Mandatory keyed sgls are not supported!\n");
+               goto destroy_admin;
+       }
+
+       if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
+               dev_warn(ctrl->ctrl.device,
+                       "queue_size %zu > ctrl sqsize %u, clamping down\n",
+                       ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
+       }
+
+       if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+               dev_warn(ctrl->ctrl.device,
+                       "sqsize %u > ctrl maxcmd %u, clamping down\n",
+                       ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+               ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
+       }
+
+       if (ctrl->ctrl.sgls & (1 << 20))
+               ctrl->use_inline_data = true;
 
        if (ctrl->ctrl.queue_count > 1) {
-               ret = nvme_rdma_configure_io_queues(ctrl, false);
+               ret = nvme_rdma_configure_io_queues(ctrl, new);
                if (ret)
                        goto destroy_admin;
        }
@@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
        if (!changed) {
                /* state change failure is ok if we're in DELETING state */
                WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-               return;
+               ret = -EINVAL;
+               goto destroy_io;
        }
 
        nvme_start_ctrl(&ctrl->ctrl);
+       return 0;
+
+destroy_io:
+       if (ctrl->ctrl.queue_count > 1)
+               nvme_rdma_destroy_io_queues(ctrl, new);
+destroy_admin:
+       nvme_rdma_stop_queue(&ctrl->queues[0]);
+       nvme_rdma_destroy_admin_queue(ctrl, new);
+       return ret;
+}
+
+static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+{
+       struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
+                       struct nvme_rdma_ctrl, reconnect_work);
+
+       ++ctrl->ctrl.nr_reconnects;
+
+       if (nvme_rdma_setup_ctrl(ctrl, false))
+               goto requeue;
 
        dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
                        ctrl->ctrl.nr_reconnects);
@@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
        return;
 
-destroy_admin:
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
 requeue:
        dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
                        ctrl->ctrl.nr_reconnects);
@@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
                        struct nvme_rdma_ctrl, err_work);
 
        nvme_stop_keep_alive(&ctrl->ctrl);
-
-       if (ctrl->ctrl.queue_count > 1) {
-               nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-               blk_mq_tagset_busy_iter(&ctrl->tag_set,
-                                       nvme_cancel_request, &ctrl->ctrl);
-               nvme_rdma_destroy_io_queues(ctrl, false);
-       }
-
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-                               nvme_cancel_request, &ctrl->ctrl);
-       nvme_rdma_destroy_admin_queue(ctrl, false);
-
-       /*
-        * queues are not a live anymore, so restart the queues to fail fast
-        * new IO
-        */
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       nvme_rdma_teardown_io_queues(ctrl, false);
        nvme_start_queues(&ctrl->ctrl);
+       nvme_rdma_teardown_admin_queue(ctrl, false);
 
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we're in DELETING state */
@@ -1090,19 +1143,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c)
 }
 
 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
-               struct nvme_rdma_request *req, struct nvme_command *c)
+               struct nvme_rdma_request *req, struct nvme_command *c,
+               int count)
 {
        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+       struct scatterlist *sgl = req->sg_table.sgl;
+       struct ib_sge *sge = &req->sge[1];
+       u32 len = 0;
+       int i;
 
-       req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
-       req->sge[1].length = sg_dma_len(req->sg_table.sgl);
-       req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+       for (i = 0; i < count; i++, sgl++, sge++) {
+               sge->addr = sg_dma_address(sgl);
+               sge->length = sg_dma_len(sgl);
+               sge->lkey = queue->device->pd->local_dma_lkey;
+               len += sge->length;
+       }
 
        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
-       sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+       sg->length = cpu_to_le32(len);
        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
 
-       req->num_sge++;
+       req->num_sge += count;
        return 0;
 }
 
@@ -1195,15 +1256,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
                goto out_free_table;
        }
 
-       if (count == 1) {
+       if (count <= dev->num_inline_segments) {
                if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+                   queue->ctrl->use_inline_data &&
                    blk_rq_payload_bytes(rq) <=
                                nvme_rdma_inline_data_size(queue)) {
-                       ret = nvme_rdma_map_sg_inline(queue, req, c);
+                       ret = nvme_rdma_map_sg_inline(queue, req, c, count);
                        goto out;
                }
 
-               if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+               if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
                        ret = nvme_rdma_map_sg_single(queue, req, c);
                        goto out;
                }
@@ -1574,6 +1636,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
                nvme_rdma_destroy_queue_ib(queue);
+               /* fall through */
        case RDMA_CM_EVENT_ADDR_ERROR:
                dev_dbg(queue->ctrl->ctrl.device,
                        "CM error event %d\n", ev->event);
@@ -1736,25 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
-       if (ctrl->ctrl.queue_count > 1) {
-               nvme_stop_queues(&ctrl->ctrl);
-               nvme_rdma_stop_io_queues(ctrl);
-               blk_mq_tagset_busy_iter(&ctrl->tag_set,
-                                       nvme_cancel_request, &ctrl->ctrl);
-               nvme_rdma_destroy_io_queues(ctrl, shutdown);
-       }
-
+       nvme_rdma_teardown_io_queues(ctrl, shutdown);
        if (shutdown)
                nvme_shutdown_ctrl(&ctrl->ctrl);
        else
                nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
-
-       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-                               nvme_cancel_request, &ctrl->ctrl);
-       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-       nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+       nvme_rdma_teardown_admin_queue(ctrl, shutdown);
 }
 
 static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@ -1766,8 +1816,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 {
        struct nvme_rdma_ctrl *ctrl =
                container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
-       int ret;
-       bool changed;
 
        nvme_stop_ctrl(&ctrl->ctrl);
        nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -1778,25 +1826,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
                return;
        }
 
-       ret = nvme_rdma_configure_admin_queue(ctrl, false);
-       if (ret)
+       if (nvme_rdma_setup_ctrl(ctrl, false))
                goto out_fail;
 
-       if (ctrl->ctrl.queue_count > 1) {
-               ret = nvme_rdma_configure_io_queues(ctrl, false);
-               if (ret)
-                       goto out_fail;
-       }
-
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       if (!changed) {
-               /* state change failure is ok if we're in DELETING state */
-               WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-               return;
-       }
-
-       nvme_start_ctrl(&ctrl->ctrl);
-
        return;
 
 out_fail:
@@ -1959,49 +1991,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
        WARN_ON_ONCE(!changed);
 
-       ret = nvme_rdma_configure_admin_queue(ctrl, true);
+       ret = nvme_rdma_setup_ctrl(ctrl, true);
        if (ret)
                goto out_uninit_ctrl;
 
-       /* sanity check icdoff */
-       if (ctrl->ctrl.icdoff) {
-               dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
-               ret = -EINVAL;
-               goto out_remove_admin_queue;
-       }
-
-       /* sanity check keyed sgls */
-       if (!(ctrl->ctrl.sgls & (1 << 2))) {
-               dev_err(ctrl->ctrl.device,
-                       "Mandatory keyed sgls are not supported!\n");
-               ret = -EINVAL;
-               goto out_remove_admin_queue;
-       }
-
-       /* only warn if argument is too large here, will clamp later */
-       if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
-               dev_warn(ctrl->ctrl.device,
-                       "queue_size %zu > ctrl sqsize %u, clamping down\n",
-                       opts->queue_size, ctrl->ctrl.sqsize + 1);
-       }
-
-       /* warn if maxcmd is lower than sqsize+1 */
-       if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
-               dev_warn(ctrl->ctrl.device,
-                       "sqsize %u > ctrl maxcmd %u, clamping down\n",
-                       ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
-               ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
-       }
-
-       if (opts->nr_io_queues) {
-               ret = nvme_rdma_configure_io_queues(ctrl, true);
-               if (ret)
-                       goto out_remove_admin_queue;
-       }
-
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       WARN_ON_ONCE(!changed);
-
        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 
@@ -2011,13 +2004,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);
 
-       nvme_start_ctrl(&ctrl->ctrl);
-
        return &ctrl->ctrl;
 
-out_remove_admin_queue:
-       nvme_rdma_stop_queue(&ctrl->queues[0]);
-       nvme_rdma_destroy_admin_queue(ctrl, true);
 out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
        nvme_put_ctrl(&ctrl->ctrl);
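
Both the reset and create paths above now call nvme_rdma_setup_ctrl(); judging from the code they drop, its shape is presumably along these lines (a sketch only, the error unwinding is illustrative):

static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
	int ret;

	ret = nvme_rdma_configure_admin_queue(ctrl, new);
	if (ret)
		return ret;

	/*
	 * Sanity checks moved out of create_ctrl: icdoff must be zero, keyed
	 * SGLs must be supported, queue_size/sqsize are clamped to maxcmd.
	 */

	if (ctrl->ctrl.opts->nr_io_queues) {
		ret = nvme_rdma_configure_io_queues(ctrl, new);
		if (ret)
			goto out_destroy_admin;
	}

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) {
		/* state change failure is ok if we're in DELETING state */
		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
		return -EINVAL;
	}

	nvme_start_ctrl(&ctrl->ctrl);
	return 0;

out_destroy_admin:
	nvme_rdma_stop_queue(&ctrl->queues[0]);
	nvme_rdma_destroy_admin_queue(ctrl, new);
	return ret;
}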
index 41944bb..25b0e31 100644 (file)
@@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
                return nvme_trace_common(p, cdw10);
        }
 }
+
+const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
+{
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       if (*name)
+               trace_seq_printf(p, "disk=%s, ", name);
+       trace_seq_putc(p, 0);
+
+       return ret;
+}
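
The helper prints "disk=%s, " only when a disk name was captured and NUL-terminates the trace_seq buffer, so __print_disk_name() can sit directly inside a TP_printk() format (see the events further down). With the new format, a trace line would look roughly like this (values made up):

  nvme_setup_cmd: nvme0: disk=nvme0n1, qid=1, cmdid=61, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read slba=2048, len=7, ctrl=0x0, dsmgmt=0, reftag=0)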
index 01390f0..a490790 100644 (file)
                nvme_admin_opcode_name(nvme_admin_security_recv),       \
                nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
 
-const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
-                                      u8 *cdw10);
-#define __parse_nvme_admin_cmd(opcode, cdw10) \
-       nvme_trace_parse_admin_cmd(p, opcode, cdw10)
-
 #define nvme_opcode_name(opcode)       { opcode, #opcode }
-#define show_opcode_name(val)                                  \
+#define show_nvm_opcode_name(val)                              \
        __print_symbolic(val,                                   \
                nvme_opcode_name(nvme_cmd_flush),               \
                nvme_opcode_name(nvme_cmd_write),               \
@@ -70,85 +65,92 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
                nvme_opcode_name(nvme_cmd_resv_acquire),        \
                nvme_opcode_name(nvme_cmd_resv_release))
 
-const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
-                                    u8 *cdw10);
-#define __parse_nvme_cmd(opcode, cdw10) \
-       nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
-
-TRACE_EVENT(nvme_setup_admin_cmd,
-           TP_PROTO(struct nvme_command *cmd),
-           TP_ARGS(cmd),
-           TP_STRUCT__entry(
-                   __field(u8, opcode)
-                   __field(u8, flags)
-                   __field(u16, cid)
-                   __field(u64, metadata)
-                   __array(u8, cdw10, 24)
-           ),
-           TP_fast_assign(
-                   __entry->opcode = cmd->common.opcode;
-                   __entry->flags = cmd->common.flags;
-                   __entry->cid = cmd->common.command_id;
-                   __entry->metadata = le64_to_cpu(cmd->common.metadata);
-                   memcpy(__entry->cdw10, cmd->common.cdw10,
-                          sizeof(__entry->cdw10));
-           ),
-           TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
-                     __entry->cid, __entry->flags, __entry->metadata,
-                     show_admin_opcode_name(__entry->opcode),
-                     __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
-);
-
+#define show_opcode_name(qid, opcode)                                  \
+       (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
 
-TRACE_EVENT(nvme_setup_nvm_cmd,
-           TP_PROTO(int qid, struct nvme_command *cmd),
-           TP_ARGS(qid, cmd),
+const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
+               u8 *cdw10);
+const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
+               u8 *cdw10);
+
+#define parse_nvme_cmd(qid, opcode, cdw10)                     \
+       (qid ?                                                  \
+        nvme_trace_parse_nvm_cmd(p, opcode, cdw10) :           \
+        nvme_trace_parse_admin_cmd(p, opcode, cdw10))
+
+const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
+#define __print_disk_name(name)                                \
+       nvme_trace_disk_name(p, name)
+
+#ifndef TRACE_HEADER_MULTI_READ
+static inline void __assign_disk_name(char *name, struct gendisk *disk)
+{
+       if (disk)
+               memcpy(name, disk->disk_name, DISK_NAME_LEN);
+       else
+               memset(name, 0, DISK_NAME_LEN);
+}
+#endif
+
+TRACE_EVENT(nvme_setup_cmd,
+           TP_PROTO(struct request *req, struct nvme_command *cmd),
+           TP_ARGS(req, cmd),
            TP_STRUCT__entry(
-                   __field(int, qid)
-                   __field(u8, opcode)
-                   __field(u8, flags)
-                   __field(u16, cid)
-                   __field(u32, nsid)
-                   __field(u64, metadata)
-                   __array(u8, cdw10, 24)
+               __array(char, disk, DISK_NAME_LEN)
+               __field(int, ctrl_id)
+               __field(int, qid)
+               __field(u8, opcode)
+               __field(u8, flags)
+               __field(u16, cid)
+               __field(u32, nsid)
+               __field(u64, metadata)
+               __array(u8, cdw10, 24)
            ),
            TP_fast_assign(
-                   __entry->qid = qid;
-                   __entry->opcode = cmd->common.opcode;
-                   __entry->flags = cmd->common.flags;
-                   __entry->cid = cmd->common.command_id;
-                   __entry->nsid = le32_to_cpu(cmd->common.nsid);
-                   __entry->metadata = le64_to_cpu(cmd->common.metadata);
-                   memcpy(__entry->cdw10, cmd->common.cdw10,
-                          sizeof(__entry->cdw10));
+               __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+               __entry->qid = nvme_req_qid(req);
+               __entry->opcode = cmd->common.opcode;
+               __entry->flags = cmd->common.flags;
+               __entry->cid = cmd->common.command_id;
+               __entry->nsid = le32_to_cpu(cmd->common.nsid);
+               __entry->metadata = le64_to_cpu(cmd->common.metadata);
+               __assign_disk_name(__entry->disk, req->rq_disk);
+               memcpy(__entry->cdw10, cmd->common.cdw10,
+                      sizeof(__entry->cdw10));
            ),
-           TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
-                     __entry->qid, __entry->nsid, __entry->cid,
+           TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
+                     __entry->ctrl_id, __print_disk_name(__entry->disk),
+                     __entry->qid, __entry->cid, __entry->nsid,
                      __entry->flags, __entry->metadata,
-                     show_opcode_name(__entry->opcode),
-                     __parse_nvme_cmd(__entry->opcode, __entry->cdw10))
+                     show_opcode_name(__entry->qid, __entry->opcode),
+                     parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10))
 );
 
 TRACE_EVENT(nvme_complete_rq,
            TP_PROTO(struct request *req),
            TP_ARGS(req),
            TP_STRUCT__entry(
-                   __field(int, qid)
-                   __field(int, cid)
-                   __field(u64, result)
-                   __field(u8, retries)
-                   __field(u8, flags)
-                   __field(u16, status)
+               __array(char, disk, DISK_NAME_LEN)
+               __field(int, ctrl_id)
+               __field(int, qid)
+               __field(int, cid)
+               __field(u64, result)
+               __field(u8, retries)
+               __field(u8, flags)
+               __field(u16, status)
            ),
            TP_fast_assign(
-                   __entry->qid = req->q->id;
-                   __entry->cid = req->tag;
-                   __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
-                   __entry->retries = nvme_req(req)->retries;
-                   __entry->flags = nvme_req(req)->flags;
-                   __entry->status = nvme_req(req)->status;
+               __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+               __entry->qid = nvme_req_qid(req);
+               __entry->cid = req->tag;
+               __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
+               __entry->retries = nvme_req(req)->retries;
+               __entry->flags = nvme_req(req)->flags;
+               __entry->status = nvme_req(req)->status;
+               __assign_disk_name(__entry->disk, req->rq_disk);
            ),
-           TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+           TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+                     __entry->ctrl_id, __print_disk_name(__entry->disk),
                      __entry->qid, __entry->cid, __entry->result,
                      __entry->retries, __entry->flags, __entry->status)
 
index 3880357..16a9b24 100644 (file)
@@ -128,6 +128,36 @@ out:
        nvmet_req_complete(req, status);
 }
 
+static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
+{
+       u16 status = NVME_SC_INTERNAL;
+       struct nvme_effects_log *log;
+
+       log = kzalloc(sizeof(*log), GFP_KERNEL);
+       if (!log)
+               goto out;
+
+       log->acs[nvme_admin_get_log_page]       = cpu_to_le32(1 << 0);
+       log->acs[nvme_admin_identify]           = cpu_to_le32(1 << 0);
+       log->acs[nvme_admin_abort_cmd]          = cpu_to_le32(1 << 0);
+       log->acs[nvme_admin_set_features]       = cpu_to_le32(1 << 0);
+       log->acs[nvme_admin_get_features]       = cpu_to_le32(1 << 0);
+       log->acs[nvme_admin_async_event]        = cpu_to_le32(1 << 0);
+       log->acs[nvme_admin_keep_alive]         = cpu_to_le32(1 << 0);
+
+       log->iocs[nvme_cmd_read]                = cpu_to_le32(1 << 0);
+       log->iocs[nvme_cmd_write]               = cpu_to_le32(1 << 0);
+       log->iocs[nvme_cmd_flush]               = cpu_to_le32(1 << 0);
+       log->iocs[nvme_cmd_dsm]                 = cpu_to_le32(1 << 0);
+       log->iocs[nvme_cmd_write_zeroes]        = cpu_to_le32(1 << 0);
+
+       status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+
+       kfree(log);
+out:
+       nvmet_req_complete(req, status);
+}
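
Bit 0 of each 32-bit entry in the effects log advertises "command supported"; no other effect bits are set here. A host-side check would look roughly like this (sketch, not part of the patch):

static bool nvme_cmd_supported(struct nvme_effects_log *log, u8 opcode)
{
	/* log as returned by Get Log Page 06h; use log->acs[] for admin opcodes */
	return le32_to_cpu(log->iocs[opcode]) & (1 << 0);
}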
+
 static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
 {
        struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -208,7 +238,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
        /* first slot is read-only, only one slot supported */
        id->frmw = (1 << 0) | (1 << 1);
-       id->lpa = (1 << 0) | (1 << 2);
+       id->lpa = (1 << 0) | (1 << 1) | (1 << 2);
        id->elpe = NVMET_ERROR_LOG_SLOTS - 1;
        id->npss = 0;
 
@@ -238,14 +268,14 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
        id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
        if (ctrl->ops->has_keyed_sgls)
                id->sgls |= cpu_to_le32(1 << 2);
-       if (ctrl->ops->sqe_inline_size)
+       if (req->port->inline_data_size)
                id->sgls |= cpu_to_le32(1 << 20);
 
        strcpy(id->subnqn, ctrl->subsys->subsysnqn);
 
        /* Max command capsule size is sqe + single page of in-capsule data */
        id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
-                                 ctrl->ops->sqe_inline_size) / 16);
+                                 req->port->inline_data_size) / 16);
        /* Max response capsule size is cqe */
        id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
 
@@ -308,7 +338,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
         */
        id->nmic = (1 << 0);
 
-       memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le));
+       memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));
 
        id->lbaf[0].ds = ns->blksize_shift;
 
@@ -586,6 +616,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
                case NVME_LOG_CHANGED_NS:
                        req->execute = nvmet_execute_get_log_changed_ns;
                        return 0;
+               case NVME_LOG_CMD_EFFECTS:
+                       req->execute = nvmet_execute_get_log_cmd_effects_ns;
+                       return 0;
                }
                break;
        case nvme_admin_identify:
index d3f3b3e..3ba5ea5 100644 (file)
@@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_, addr_trsvcid);
 
+static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
+               char *page)
+{
+       struct nvmet_port *port = to_nvmet_port(item);
+
+       return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size);
+}
+
+static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
+               const char *page, size_t count)
+{
+       struct nvmet_port *port = to_nvmet_port(item);
+       int ret;
+
+       if (port->enabled) {
+               pr_err("Cannot modify inline_data_size while port enabled\n");
+               pr_err("Disable the port before modifying\n");
+               return -EACCES;
+       }
+       ret = kstrtoint(page, 0, &port->inline_data_size);
+       if (ret) {
+               pr_err("Invalid value '%s' for inline_data_size\n", page);
+               return -EINVAL;
+       }
+       return count;
+}
+
+CONFIGFS_ATTR(nvmet_, param_inline_data_size);
+
 static ssize_t nvmet_addr_trtype_show(struct config_item *item,
                char *page)
 {
@@ -407,11 +436,40 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_ns_, enable);
 
+static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page)
+{
+       return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io);
+}
+
+static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
+               const char *page, size_t count)
+{
+       struct nvmet_ns *ns = to_nvmet_ns(item);
+       bool val;
+
+       if (strtobool(page, &val))
+               return -EINVAL;
+
+       mutex_lock(&ns->subsys->lock);
+       if (ns->enabled) {
+               pr_err("disable ns before setting buffered_io value.\n");
+               mutex_unlock(&ns->subsys->lock);
+               return -EINVAL;
+       }
+
+       ns->buffered_io = val;
+       mutex_unlock(&ns->subsys->lock);
+       return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, buffered_io);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
        &nvmet_ns_attr_device_path,
        &nvmet_ns_attr_device_nguid,
        &nvmet_ns_attr_device_uuid,
        &nvmet_ns_attr_enable,
+       &nvmet_ns_attr_buffered_io,
        NULL,
 };
 
@@ -874,6 +932,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
        &nvmet_attr_addr_traddr,
        &nvmet_attr_addr_trsvcid,
        &nvmet_attr_addr_trtype,
+       &nvmet_attr_param_inline_data_size,
        NULL,
 };
 
@@ -903,6 +962,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
        INIT_LIST_HEAD(&port->entry);
        INIT_LIST_HEAD(&port->subsystems);
        INIT_LIST_HEAD(&port->referrals);
+       port->inline_data_size = -1;    /* < 0 == let the transport choose */
 
        port->disc_addr.portid = cpu_to_le16(portid);
        config_group_init_type_name(&port->group, name, &nvmet_port_type);
index 74d4b78..ddd8571 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "nvmet.h"
 
+struct workqueue_struct *buffered_io_wq;
 static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
 static DEFINE_IDA(cntlid_ida);
 
@@ -241,6 +242,10 @@ int nvmet_enable_port(struct nvmet_port *port)
                return ret;
        }
 
+       /* If the transport didn't set inline_data_size, then disable it. */
+       if (port->inline_data_size < 0)
+               port->inline_data_size = 0;
+
        port->enabled = true;
        return 0;
 }
@@ -437,6 +442,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
        ns->nsid = nsid;
        ns->subsys = subsys;
        uuid_gen(&ns->uuid);
+       ns->buffered_io = false;
 
        return ns;
 }
@@ -1109,6 +1115,12 @@ static int __init nvmet_init(void)
 {
        int error;
 
+       buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
+                       WQ_MEM_RECLAIM, 0);
+       if (!buffered_io_wq) {
+               error = -ENOMEM;
+               goto out;
+       }
        error = nvmet_init_discovery();
        if (error)
                goto out;
@@ -1129,6 +1141,7 @@ static void __exit nvmet_exit(void)
        nvmet_exit_configfs();
        nvmet_exit_discovery();
        ida_destroy(&cntlid_ida);
+       destroy_workqueue(buffered_io_wq);
 
        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
index 08656b8..eae29f4 100644 (file)
@@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
        id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
        if (ctrl->ops->has_keyed_sgls)
                id->sgls |= cpu_to_le32(1 << 2);
-       if (ctrl->ops->sqe_inline_size)
+       if (req->port->inline_data_size)
                id->sgls |= cpu_to_le32(1 << 20);
 
        strcpy(id->subnqn, ctrl->subsys->subsysnqn);
index 8c42b3a..c2d0d08 100644 (file)
@@ -16,6 +16,8 @@
 void nvmet_file_ns_disable(struct nvmet_ns *ns)
 {
        if (ns->file) {
+               if (ns->buffered_io)
+                       flush_workqueue(buffered_io_wq);
                mempool_destroy(ns->bvec_pool);
                ns->bvec_pool = NULL;
                kmem_cache_destroy(ns->bvec_cache);
@@ -27,11 +29,14 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns)
 
 int nvmet_file_ns_enable(struct nvmet_ns *ns)
 {
-       int ret;
+       int flags = O_RDWR | O_LARGEFILE;
        struct kstat stat;
+       int ret;
 
-       ns->file = filp_open(ns->device_path,
-                       O_RDWR | O_LARGEFILE | O_DIRECT, 0);
+       if (!ns->buffered_io)
+               flags |= O_DIRECT;
+
+       ns->file = filp_open(ns->device_path, flags, 0);
        if (IS_ERR(ns->file)) {
                pr_err("failed to open file %s: (%ld)\n",
                                ns->device_path, PTR_ERR(ns->file));
@@ -100,7 +105,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
 
        iocb->ki_pos = pos;
        iocb->ki_filp = req->ns->file;
-       iocb->ki_flags = IOCB_DIRECT | ki_flags;
+       iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
 
        ret = call_iter(iocb, &iter);
 
@@ -140,6 +145,12 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
                return;
        }
 
+       pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
+       if (unlikely(pos + req->data_len > req->ns->size)) {
+               nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+               return;
+       }
+
        if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
                req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
                                GFP_KERNEL);
@@ -155,8 +166,6 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
                        is_sync = true;
        }
 
-       pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
-
        memset(&req->f.iocb, 0, sizeof(struct kiocb));
        for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) {
                nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter);
@@ -189,6 +198,19 @@ out:
        nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
 }
 
+static void nvmet_file_buffered_io_work(struct work_struct *w)
+{
+       struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+
+       nvmet_file_execute_rw(req);
+}
+
+static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req)
+{
+       INIT_WORK(&req->f.work, nvmet_file_buffered_io_work);
+       queue_work(buffered_io_wq, &req->f.work);
+}
+
 static void nvmet_file_flush_work(struct work_struct *w)
 {
        struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
@@ -209,22 +231,30 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
 {
        int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
        struct nvme_dsm_range range;
-       loff_t offset;
-       loff_t len;
-       int i, ret;
+       loff_t offset, len;
+       u16 ret;
+       int i;
 
        for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
-               if (nvmet_copy_from_sgl(req, i * sizeof(range), &range,
-                                       sizeof(range)))
+               ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+                                       sizeof(range));
+               if (ret)
                        break;
+
                offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
                len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
-               ret = vfs_fallocate(req->ns->file, mode, offset, len);
-               if (ret)
+               if (offset + len > req->ns->size) {
+                       ret = NVME_SC_LBA_RANGE | NVME_SC_DNR;
                        break;
+               }
+
+               if (vfs_fallocate(req->ns->file, mode, offset, len)) {
+                       ret = NVME_SC_INTERNAL | NVME_SC_DNR;
+                       break;
+               }
        }
 
-       nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+       nvmet_req_complete(req, ret);
 }
 
 static void nvmet_file_dsm_work(struct work_struct *w)
@@ -263,6 +293,11 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
        len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
                        req->ns->blksize_shift);
 
+       if (unlikely(offset + len > req->ns->size)) {
+               nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+               return;
+       }
+
        ret = vfs_fallocate(req->ns->file, mode, offset, len);
        nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
 }
@@ -280,7 +315,10 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
        switch (cmd->common.opcode) {
        case nvme_cmd_read:
        case nvme_cmd_write:
-               req->execute = nvmet_file_execute_rw;
+               if (req->ns->buffered_io)
+                       req->execute = nvmet_file_execute_rw_buffered_io;
+               else
+                       req->execute = nvmet_file_execute_rw;
                req->data_len = nvmet_rw_len(req);
                return 0;
        case nvme_cmd_flush:
index d8d91f0..af7fbf4 100644 (file)
@@ -227,6 +227,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set,
 {
        struct nvme_loop_ctrl *ctrl = set->driver_data;
 
+       nvme_req(req)->ctrl = &ctrl->ctrl;
        return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req),
                        (set == &ctrl->tag_set) ? hctx_idx + 1 : 0);
 }
index 480dfe1..6889938 100644 (file)
@@ -65,6 +65,7 @@ struct nvmet_ns {
        u8                      nguid[16];
        uuid_t                  uuid;
 
+       bool                    buffered_io;
        bool                    enabled;
        struct nvmet_subsys     *subsys;
        const char              *device_path;
@@ -116,6 +117,7 @@ struct nvmet_port {
        struct list_head                referrals;
        void                            *priv;
        bool                            enabled;
+       int                             inline_data_size;
 };
 
 static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
@@ -225,7 +227,6 @@ struct nvmet_req;
 struct nvmet_fabrics_ops {
        struct module *owner;
        unsigned int type;
-       unsigned int sqe_inline_size;
        unsigned int msdbd;
        bool has_keyed_sgls : 1;
        void (*queue_response)(struct nvmet_req *req);
@@ -269,6 +270,8 @@ struct nvmet_req {
        const struct nvmet_fabrics_ops *ops;
 };
 
+extern struct workqueue_struct *buffered_io_wq;
+
 static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
 {
        req->rsp->status = cpu_to_le16(status << 1);
index 52e0c5d..e7f43d1 100644 (file)
 #include "nvmet.h"
 
 /*
- * We allow up to a page of inline data to go with the SQE
+ * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
  */
-#define NVMET_RDMA_INLINE_DATA_SIZE    PAGE_SIZE
+#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE    PAGE_SIZE
+#define NVMET_RDMA_MAX_INLINE_SGE              4
+#define NVMET_RDMA_MAX_INLINE_DATA_SIZE                max_t(int, SZ_16K, PAGE_SIZE)
 
 struct nvmet_rdma_cmd {
-       struct ib_sge           sge[2];
+       struct ib_sge           sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
        struct ib_cqe           cqe;
        struct ib_recv_wr       wr;
-       struct scatterlist      inline_sg;
-       struct page             *inline_page;
+       struct scatterlist      inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
        struct nvme_command     *nvme_cmd;
        struct nvmet_rdma_queue *queue;
 };
@@ -116,6 +117,8 @@ struct nvmet_rdma_device {
        size_t                  srq_size;
        struct kref             ref;
        struct list_head        entry;
+       int                     inline_data_size;
+       int                     inline_page_count;
 };
 
 static bool nvmet_rdma_use_srq;
@@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
 
 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 
+static int num_pages(int len)
+{
+       return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
+}
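
num_pages() rounds a byte count up to whole pages; one extra SGE always carries the SQE itself, which is where the "1 + inline_page_count" SGE counts further down come from. A worked example, assuming 4 KiB pages:

	/*
	 * With PAGE_SIZE == 4096:
	 *   num_pages(1)     == 1
	 *   num_pages(4096)  == 1
	 *   num_pages(4097)  == 2
	 *   num_pages(16384) == 4  -> 4 data SGEs + 1 SGE for the command
	 */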
+
 /* XXX: really should move to a generic header sooner or later.. */
 static inline u32 get_unaligned_le24(const u8 *p)
 {
@@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
        spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
 }
 
+static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
+                               struct nvmet_rdma_cmd *c)
+{
+       struct scatterlist *sg;
+       struct ib_sge *sge;
+       int i;
+
+       if (!ndev->inline_data_size)
+               return;
+
+       sg = c->inline_sg;
+       sge = &c->sge[1];
+
+       for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+               if (sge->length)
+                       ib_dma_unmap_page(ndev->device, sge->addr,
+                                       sge->length, DMA_FROM_DEVICE);
+               if (sg_page(sg))
+                       __free_page(sg_page(sg));
+       }
+}
+
+static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
+                               struct nvmet_rdma_cmd *c)
+{
+       struct scatterlist *sg;
+       struct ib_sge *sge;
+       struct page *pg;
+       int len;
+       int i;
+
+       if (!ndev->inline_data_size)
+               return 0;
+
+       sg = c->inline_sg;
+       sg_init_table(sg, ndev->inline_page_count);
+       sge = &c->sge[1];
+       len = ndev->inline_data_size;
+
+       for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+               pg = alloc_page(GFP_KERNEL);
+               if (!pg)
+                       goto out_err;
+               sg_assign_page(sg, pg);
+               sge->addr = ib_dma_map_page(ndev->device,
+                       pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+               if (ib_dma_mapping_error(ndev->device, sge->addr))
+                       goto out_err;
+               sge->length = min_t(int, len, PAGE_SIZE);
+               sge->lkey = ndev->pd->local_dma_lkey;
+               len -= sge->length;
+       }
+
+       return 0;
+out_err:
+       for (; i >= 0; i--, sg--, sge--) {
+               if (sge->length)
+                       ib_dma_unmap_page(ndev->device, sge->addr,
+                                       sge->length, DMA_FROM_DEVICE);
+               if (sg_page(sg))
+                       __free_page(sg_page(sg));
+       }
+       return -ENOMEM;
+}
+
 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
                        struct nvmet_rdma_cmd *c, bool admin)
 {
@@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
        c->sge[0].length = sizeof(*c->nvme_cmd);
        c->sge[0].lkey = ndev->pd->local_dma_lkey;
 
-       if (!admin) {
-               c->inline_page = alloc_pages(GFP_KERNEL,
-                               get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-               if (!c->inline_page)
-                       goto out_unmap_cmd;
-               c->sge[1].addr = ib_dma_map_page(ndev->device,
-                               c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
-                               DMA_FROM_DEVICE);
-               if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
-                       goto out_free_inline_page;
-               c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
-               c->sge[1].lkey = ndev->pd->local_dma_lkey;
-       }
+       if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
+               goto out_unmap_cmd;
 
        c->cqe.done = nvmet_rdma_recv_done;
 
        c->wr.wr_cqe = &c->cqe;
        c->wr.sg_list = c->sge;
-       c->wr.num_sge = admin ? 1 : 2;
+       c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
 
        return 0;
 
-out_free_inline_page:
-       if (!admin) {
-               __free_pages(c->inline_page,
-                               get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-       }
 out_unmap_cmd:
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
@@ -240,12 +297,8 @@ out:
 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c, bool admin)
 {
-       if (!admin) {
-               ib_dma_unmap_page(ndev->device, c->sge[1].addr,
-                               NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
-               __free_pages(c->inline_page,
-                               get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-       }
+       if (!admin)
+               nvmet_rdma_free_inline_pages(ndev, c);
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                                sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
        kfree(c->nvme_cmd);
@@ -383,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *cmd)
 {
        struct ib_recv_wr *bad_wr;
+       int ret;
 
        ib_dma_sync_single_for_device(ndev->device,
                cmd->sge[0].addr, cmd->sge[0].length,
                DMA_FROM_DEVICE);
 
        if (ndev->srq)
-               return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
-       return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
+               ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
+       else
+               ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
+
+       if (unlikely(ret))
+               pr_err("post_recv cmd failed\n");
+
+       return ret;
 }
 
 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
@@ -429,7 +489,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
                                rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        }
 
-       if (rsp->req.sg != &rsp->cmd->inline_sg)
+       if (rsp->req.sg != rsp->cmd->inline_sg)
                sgl_free(rsp->req.sg);
 
        if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
@@ -493,7 +553,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
                rsp->send_sge.addr, rsp->send_sge.length,
                DMA_TO_DEVICE);
 
-       if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
+       if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) {
                pr_err("sending cmd response failed\n");
                nvmet_rdma_release_rsp(rsp);
        }
@@ -529,10 +589,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
                u64 off)
 {
-       sg_init_table(&rsp->cmd->inline_sg, 1);
-       sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
-       rsp->req.sg = &rsp->cmd->inline_sg;
-       rsp->req.sg_cnt = 1;
+       int sg_count = num_pages(len);
+       struct scatterlist *sg;
+       int i;
+
+       sg = rsp->cmd->inline_sg;
+       for (i = 0; i < sg_count; i++, sg++) {
+               if (i < sg_count - 1)
+                       sg_unmark_end(sg);
+               else
+                       sg_mark_end(sg);
+               sg->offset = off;
+               sg->length = min_t(int, len, PAGE_SIZE - off);
+               len -= sg->length;
+               if (!i)
+                       off = 0;
+       }
+
+       rsp->req.sg = rsp->cmd->inline_sg;
+       rsp->req.sg_cnt = sg_count;
 }
 
 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
@@ -544,7 +619,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
        if (!nvme_is_write(rsp->req.cmd))
                return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 
-       if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
+       if (off + len > rsp->queue->dev->inline_data_size) {
                pr_err("invalid inline data offset!\n");
                return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
        }
@@ -743,7 +818,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
        srq_size = 4095;        /* XXX: tune */
 
        srq_attr.attr.max_wr = srq_size;
-       srq_attr.attr.max_sge = 2;
+       srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
        srq_attr.attr.srq_limit = 0;
        srq_attr.srq_type = IB_SRQT_BASIC;
        srq = ib_create_srq(ndev->pd, &srq_attr);
@@ -765,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
        ndev->srq = srq;
        ndev->srq_size = srq_size;
 
-       for (i = 0; i < srq_size; i++)
-               nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
+       for (i = 0; i < srq_size; i++) {
+               ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
+               if (ret)
+                       goto out_free_cmds;
+       }
 
        return 0;
 
+out_free_cmds:
+       nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
 out_destroy_srq:
        ib_destroy_srq(srq);
        return ret;
@@ -793,7 +873,10 @@ static void nvmet_rdma_free_dev(struct kref *ref)
 static struct nvmet_rdma_device *
 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 {
+       struct nvmet_port *port = cm_id->context;
        struct nvmet_rdma_device *ndev;
+       int inline_page_count;
+       int inline_sge_count;
        int ret;
 
        mutex_lock(&device_list_mutex);
@@ -807,6 +890,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
        if (!ndev)
                goto out_err;
 
+       inline_page_count = num_pages(port->inline_data_size);
+       inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
+                               cm_id->device->attrs.max_sge) - 1;
+       if (inline_page_count > inline_sge_count) {
+               pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
+                       port->inline_data_size, cm_id->device->name,
+                       inline_sge_count * PAGE_SIZE);
+               port->inline_data_size = inline_sge_count * PAGE_SIZE;
+               inline_page_count = inline_sge_count;
+       }
+       ndev->inline_data_size = port->inline_data_size;
+       ndev->inline_page_count = inline_page_count;
        ndev->device = cm_id->device;
        kref_init(&ndev->ref);
 
@@ -881,7 +976,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
        } else {
                /* +1 for drain */
                qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
-               qp_attr.cap.max_recv_sge = 2;
+               qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
        }
 
        ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
@@ -899,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
        if (!ndev->srq) {
                for (i = 0; i < queue->recv_queue_size; i++) {
                        queue->cmds[i].queue = queue;
-                       nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
+                       ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
+                       if (ret)
+                               goto err_destroy_qp;
                }
        }
 
 out:
        return ret;
 
+err_destroy_qp:
+       rdma_destroy_qp(queue->cm_id);
 err_destroy_cq:
        ib_free_cq(queue->cq);
        goto out;
@@ -1379,6 +1478,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
                return -EINVAL;
        }
 
+       if (port->inline_data_size < 0) {
+               port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
+       } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
+               pr_warn("inline_data_size %u is too large, reducing to %u\n",
+                       port->inline_data_size,
+                       NVMET_RDMA_MAX_INLINE_DATA_SIZE);
+               port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
+       }
+
        ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
                        port->disc_addr.trsvcid, &addr);
        if (ret) {
@@ -1456,7 +1564,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
-       .sqe_inline_size        = NVMET_RDMA_INLINE_DATA_SIZE,
        .msdbd                  = 1,
        .has_keyed_sgls         = 1,
        .add_port               = nvmet_rdma_add_port,
index 80aca24..7689538 100644 (file)
@@ -21,6 +21,7 @@ CFLAGS_gdth.o    = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS
 obj-$(CONFIG_PCMCIA)           += pcmcia/
 
 obj-$(CONFIG_SCSI)             += scsi_mod.o
+obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_common.o
 
 obj-$(CONFIG_RAID_ATTRS)       += raid_class.o
 
@@ -156,7 +157,6 @@ obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/
 obj-$(CONFIG_SCSI_DEBUG)       += scsi_debug.o
 scsi_mod-y                     += scsi.o hosts.o scsi_ioctl.o \
                                   scsicam.o scsi_error.o scsi_lib.o
-scsi_mod-y                     += scsi_common.o
 scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o
 scsi_mod-$(CONFIG_SCSI_DMA)    += scsi_lib_dma.o
 scsi_mod-y                     += scsi_scan.o scsi_sysfs.o scsi_devinfo.o
index e489d89..379890c 100644 (file)
@@ -339,7 +339,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli)
        struct scsi_sense_hdr sshdr;
        u8 *cmd_buf = NULL;
        u8 *scsi_cmd = NULL;
-       u8 *sense_buf = NULL;
        int rc = 0;
        int result = 0;
        int retry_cnt = 0;
@@ -348,8 +347,7 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli)
 retry:
        cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL);
        scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL);
-       sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
-       if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) {
+       if (unlikely(!cmd_buf || !scsi_cmd)) {
                rc = -ENOMEM;
                goto out;
        }
@@ -364,7 +362,7 @@ retry:
        /* Drop the ioctl read semaphore across lengthy call */
        up_read(&cfg->ioctl_rwsem);
        result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf,
-                             CMD_BUFSIZE, sense_buf, &sshdr, to, CMD_RETRIES,
+                             CMD_BUFSIZE, NULL, &sshdr, to, CMD_RETRIES,
                              0, 0, NULL);
        down_read(&cfg->ioctl_rwsem);
        rc = check_state(cfg);
@@ -395,7 +393,6 @@ retry:
                                        if (retry_cnt++ < 1) {
                                                kfree(cmd_buf);
                                                kfree(scsi_cmd);
-                                               kfree(sense_buf);
                                                goto retry;
                                        }
                                }
@@ -426,7 +423,6 @@ retry:
 out:
        kfree(cmd_buf);
        kfree(scsi_cmd);
-       kfree(sense_buf);
 
        dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n",
                __func__, gli->max_lba, gli->blk_len, rc);
index 66e445a..2c904bf 100644 (file)
@@ -426,7 +426,6 @@ static int write_same16(struct scsi_device *sdev,
 {
        u8 *cmd_buf = NULL;
        u8 *scsi_cmd = NULL;
-       u8 *sense_buf = NULL;
        int rc = 0;
        int result = 0;
        u64 offset = lba;
@@ -440,8 +439,7 @@ static int write_same16(struct scsi_device *sdev,
 
        cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL);
        scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL);
-       sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
-       if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) {
+       if (unlikely(!cmd_buf || !scsi_cmd)) {
                rc = -ENOMEM;
                goto out;
        }
@@ -457,7 +455,7 @@ static int write_same16(struct scsi_device *sdev,
                /* Drop the ioctl read semaphore across lengthy call */
                up_read(&cfg->ioctl_rwsem);
                result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf,
-                                     CMD_BUFSIZE, sense_buf, NULL, to,
+                                     CMD_BUFSIZE, NULL, NULL, to,
                                      CMD_RETRIES, 0, 0, NULL);
                down_read(&cfg->ioctl_rwsem);
                rc = check_state(cfg);
@@ -482,7 +480,6 @@ static int write_same16(struct scsi_device *sdev,
 out:
        kfree(cmd_buf);
        kfree(scsi_cmd);
-       kfree(sense_buf);
        dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc);
        return rc;
 }
index b8d131a..dd738ae 100644 (file)
@@ -4568,7 +4568,7 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                    MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG |
                    MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
                mpi_request->CDB.EEDP32.PrimaryReferenceTag =
-                   cpu_to_be32(scsi_prot_ref_tag(scmd));
+                   cpu_to_be32(t10_pi_ref_tag(scmd->request));
                break;
 
        case SCSI_PROT_DIF_TYPE3:
index 41e9ac9..9cb9a16 100644 (file)
@@ -238,7 +238,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
 
 
 /**
- * scsi_execute - insert request and wait for the result
+ * __scsi_execute - insert request and wait for the result
  * @sdev:      scsi device
  * @cmd:       scsi command
  * @data_direction: data direction
@@ -255,7 +255,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
  * Returns the scsi_cmnd result field if a command was executed, or a negative
  * Linux error code if we didn't get that far.
  */
-int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                 int data_direction, void *buffer, unsigned bufflen,
                 unsigned char *sense, struct scsi_sense_hdr *sshdr,
                 int timeout, int retries, u64 flags, req_flags_t rq_flags,
@@ -309,7 +309,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 
        return ret;
 }
-EXPORT_SYMBOL(scsi_execute);
+EXPORT_SYMBOL(__scsi_execute);
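
Callers keep using scsi_execute(); presumably a wrapper in include/scsi/scsi_device.h now forwards to __scsi_execute(), along the lines of the sketch below (the compile-time sense-buffer-size check is an assumption):

#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense,	\
		     sshdr, timeout, retries, flags, rq_flags, resid)	\
({									\
	BUILD_BUG_ON((sense) != NULL &&					\
		     sizeof(sense) != SCSI_SENSE_BUFFERSIZE);		\
	__scsi_execute(sdev, cmd, data_direction, buffer, bufflen,	\
		       sense, sshdr, timeout, retries, flags, rq_flags,	\
		       resid);						\
})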
 
 /*
  * Function:    scsi_init_cmd_errh()
index 9421d98..bbebdc3 100644 (file)
@@ -1119,7 +1119,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
                SCpnt->cmnd[0] = WRITE_6;
 
                if (blk_integrity_rq(rq))
-                       sd_dif_prepare(SCpnt);
+                       t10_pi_prepare(SCpnt->request, sdkp->protection_type);
 
        } else if (rq_data_dir(rq) == READ) {
                SCpnt->cmnd[0] = READ_6;
@@ -2047,8 +2047,10 @@ static int sd_done(struct scsi_cmnd *SCpnt)
                                           "sd_done: completed %d of %d bytes\n",
                                           good_bytes, scsi_bufflen(SCpnt)));
 
-       if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt))
-               sd_dif_complete(SCpnt, good_bytes);
+       if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) &&
+           good_bytes)
+               t10_pi_complete(SCpnt->request, sdkp->protection_type,
+                               good_bytes / scsi_prot_interval(SCpnt));
 
        return good_bytes;
 }
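
sd_dif_prepare()/sd_dif_complete() move into the block layer's generic T10 PI helpers as t10_pi_prepare()/t10_pi_complete(); from the call sites above their prototypes are presumably:

extern void t10_pi_prepare(struct request *rq, u8 protection_type);
extern void t10_pi_complete(struct request *rq, u8 protection_type,
			    unsigned int intervals);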
index 392c7d0..a7d4f50 100644 (file)
@@ -254,21 +254,12 @@ static inline unsigned int sd_prot_flag_mask(unsigned int prot_op)
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 
 extern void sd_dif_config_host(struct scsi_disk *);
-extern void sd_dif_prepare(struct scsi_cmnd *scmd);
-extern void sd_dif_complete(struct scsi_cmnd *, unsigned int);
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 static inline void sd_dif_config_host(struct scsi_disk *disk)
 {
 }
-static inline int sd_dif_prepare(struct scsi_cmnd *scmd)
-{
-       return 0;
-}
-static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a)
-{
-}
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
index 9035380..db72c82 100644 (file)
@@ -95,116 +95,3 @@ out:
        blk_integrity_register(disk, &bi);
 }
 
-/*
- * The virtual start sector is the one that was originally submitted
- * by the block layer. Due to partitioning, MD/DM cloning, etc. the
- * actual physical start sector is likely to be different.  Remap
- * protection information to match the physical LBA.
- *
- * From a protocol perspective there's a slight difference between
- * Type 1 and 2.  The latter uses 32-byte CDBs exclusively, and the
- * reference tag is seeded in the CDB.  This gives us the potential to
- * avoid virt->phys remapping during write.  However, at read time we
- * don't know whether the virt sector is the same as when we wrote it
- * (we could be reading from real disk as opposed to MD/DM device.  So
- * we always remap Type 2 making it identical to Type 1.
- *
- * Type 3 does not have a reference tag so no remapping is required.
- */
-void sd_dif_prepare(struct scsi_cmnd *scmd)
-{
-       const int tuple_sz = sizeof(struct t10_pi_tuple);
-       struct bio *bio;
-       struct scsi_disk *sdkp;
-       struct t10_pi_tuple *pi;
-       u32 phys, virt;
-
-       sdkp = scsi_disk(scmd->request->rq_disk);
-
-       if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION)
-               return;
-
-       phys = scsi_prot_ref_tag(scmd);
-
-       __rq_for_each_bio(bio, scmd->request) {
-               struct bio_integrity_payload *bip = bio_integrity(bio);
-               struct bio_vec iv;
-               struct bvec_iter iter;
-               unsigned int j;
-
-               /* Already remapped? */
-               if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
-                       break;
-
-               virt = bip_get_seed(bip) & 0xffffffff;
-
-               bip_for_each_vec(iv, bip, iter) {
-                       pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
-                       for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
-                               if (be32_to_cpu(pi->ref_tag) == virt)
-                                       pi->ref_tag = cpu_to_be32(phys);
-
-                               virt++;
-                               phys++;
-                       }
-
-                       kunmap_atomic(pi);
-               }
-
-               bip->bip_flags |= BIP_MAPPED_INTEGRITY;
-       }
-}
-
-/*
- * Remap physical sector values in the reference tag to the virtual
- * values expected by the block layer.
- */
-void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
-{
-       const int tuple_sz = sizeof(struct t10_pi_tuple);
-       struct scsi_disk *sdkp;
-       struct bio *bio;
-       struct t10_pi_tuple *pi;
-       unsigned int j, intervals;
-       u32 phys, virt;
-
-       sdkp = scsi_disk(scmd->request->rq_disk);
-
-       if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION || good_bytes == 0)
-               return;
-
-       intervals = good_bytes / scsi_prot_interval(scmd);
-       phys = scsi_prot_ref_tag(scmd);
-
-       __rq_for_each_bio(bio, scmd->request) {
-               struct bio_integrity_payload *bip = bio_integrity(bio);
-               struct bio_vec iv;
-               struct bvec_iter iter;
-
-               virt = bip_get_seed(bip) & 0xffffffff;
-
-               bip_for_each_vec(iv, bip, iter) {
-                       pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
-                       for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
-                               if (intervals == 0) {
-                                       kunmap_atomic(pi);
-                                       return;
-                               }
-
-                               if (be32_to_cpu(pi->ref_tag) == phys)
-                                       pi->ref_tag = cpu_to_be32(virt);
-
-                               virt++;
-                               phys++;
-                               intervals--;
-                       }
-
-                       kunmap_atomic(pi);
-               }
-       }
-}
-
index 35fab1e..ffcf902 100644 (file)
@@ -186,14 +186,13 @@ static int sr_play_trkind(struct cdrom_device_info *cdi,
 int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 {
        struct scsi_device *SDev;
-       struct scsi_sense_hdr sshdr;
+       struct scsi_sense_hdr local_sshdr, *sshdr = &local_sshdr;
        int result, err = 0, retries = 0;
-       unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL;
 
        SDev = cd->device;
 
-       if (cgc->sense)
-               senseptr = sense_buffer;
+       if (cgc->sshdr)
+               sshdr = cgc->sshdr;
 
       retry:
        if (!scsi_block_when_processing_errors(SDev)) {
@@ -202,15 +201,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
        }
 
        result = scsi_execute(SDev, cgc->cmd, cgc->data_direction,
-                             cgc->buffer, cgc->buflen, senseptr, &sshdr,
+                             cgc->buffer, cgc->buflen, NULL, sshdr,
                              cgc->timeout, IOCTL_RETRIES, 0, 0, NULL);
 
-       if (cgc->sense)
-               memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense));
-
        /* Minimal error checking.  Ignore cases we know about, and report the rest. */
        if (driver_byte(result) != 0) {
-               switch (sshdr.sense_key) {
+               switch (sshdr->sense_key) {
                case UNIT_ATTENTION:
                        SDev->changed = 1;
                        if (!cgc->quiet)
@@ -221,8 +217,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
                        err = -ENOMEDIUM;
                        break;
                case NOT_READY: /* This happens if there is no disc in drive */
-                       if (sshdr.asc == 0x04 &&
-                           sshdr.ascq == 0x01) {
+                       if (sshdr->asc == 0x04 &&
+                           sshdr->ascq == 0x01) {
                                /* sense: Logical unit is in process of becoming ready */
                                if (!cgc->quiet)
                                        sr_printk(KERN_INFO, cd,
@@ -245,8 +241,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
                        break;
                case ILLEGAL_REQUEST:
                        err = -EIO;
-                       if (sshdr.asc == 0x20 &&
-                           sshdr.ascq == 0x00)
+                       if (sshdr->asc == 0x20 &&
+                           sshdr->ascq == 0x00)
                                /* sense: Invalid command operation code */
                                err = -EDRIVE_CANT_DO_THIS;
                        break;
index 6dc8891..1c72db9 100644 (file)
@@ -513,12 +513,12 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev,
 
        if (sc->sc_data_direction == DMA_TO_DEVICE)
                cmd_pi->pi_bytesout = cpu_to_virtio32(vdev,
-                                                       blk_rq_sectors(rq) *
-                                                       bi->tuple_size);
+                                                     bio_integrity_bytes(bi,
+                                                       blk_rq_sectors(rq)));
        else if (sc->sc_data_direction == DMA_FROM_DEVICE)
                cmd_pi->pi_bytesin = cpu_to_virtio32(vdev,
-                                                      blk_rq_sectors(rq) *
-                                                      bi->tuple_size);
+                                                    bio_integrity_bytes(bi,
+                                                       blk_rq_sectors(rq)));
 }
 #endif
 
index 4c44d7b..cb6f32c 100644 (file)
@@ -1,10 +1,10 @@
 
 menuconfig TARGET_CORE
        tristate "Generic Target Core Mod (TCM) and ConfigFS Infrastructure"
-       depends on SCSI && BLOCK
+       depends on BLOCK
        select CONFIGFS_FS
        select CRC_T10DIF
-       select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
+       select BLK_SCSI_REQUEST
        select SGL_ALLOC
        default n
        help
@@ -29,6 +29,7 @@ config TCM_FILEIO
 
 config TCM_PSCSI
        tristate "TCM/pSCSI Subsystem Plugin for Linux/SCSI"
+       depends on SCSI
        help
        Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered
        passthrough access to Linux/SCSI device
index 0dd87aa..496fb51 100644 (file)
@@ -665,7 +665,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
        result = blk_queue_enter(bdev->bd_queue, 0);
        if (result)
                return result;
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_READ);
        blk_queue_exit(bdev->bd_queue);
        return result;
 }
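
Both call sites now pass an explicit REQ_OP_* value instead of a bool, so the rw_page method in block_device_operations presumably becomes:

	int (*rw_page)(struct block_device *bdev, sector_t sector,
		       struct page *page, unsigned int op);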
@@ -703,7 +704,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
                return result;
 
        set_page_writeback(page);
-       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+       result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+                             REQ_OP_WRITE);
        if (result) {
                end_page_writeback(page);
        } else {
index 1b8b446..5331a15 100644 (file)
@@ -873,8 +873,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
                        struct bio *bio;
 
                        if (per_dev != master_dev) {
-                               bio = bio_clone_kmalloc(master_dev->bio,
-                                                       GFP_KERNEL);
+                               bio = bio_clone_fast(master_dev->bio,
+                                                    GFP_KERNEL, NULL);
                                if (unlikely(!bio)) {
                                        ORE_DBGMSG(
                                              "Failed to allocate BIO size=%u\n",
index ba2396a..4b8aef9 100644 (file)
@@ -3514,7 +3514,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_sb_block = sb_block;
        if (sb->s_bdev->bd_part)
                sbi->s_sectors_written_start =
-                       part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+                       part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
 
        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');
@@ -4824,7 +4824,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        if (sb->s_bdev->bd_part)
                es->s_kbytes_written =
                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-                           ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                           ((part_stat_read(sb->s_bdev->bd_part,
+                                            sectors[STAT_WRITE]) -
                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
        else
                es->s_kbytes_written =
index f34da0b..2be9ad7 100644 (file)
@@ -56,7 +56,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
        if (!sb->s_bdev->bd_part)
                return snprintf(buf, PAGE_SIZE, "0\n");
        return snprintf(buf, PAGE_SIZE, "%lu\n",
-                       (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                       (part_stat_read(sb->s_bdev->bd_part,
+                                       sectors[STAT_WRITE]) -
                         sbi->s_sectors_written_start) >> 1);
 }
 
@@ -68,7 +69,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
                return snprintf(buf, PAGE_SIZE, "0\n");
        return snprintf(buf, PAGE_SIZE, "%llu\n",
                        (unsigned long long)(sbi->s_kbytes_written +
-                       ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                       ((part_stat_read(sb->s_bdev->bd_part,
+                                        sectors[STAT_WRITE]) -
                          EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
index 4d8b1de..6799c3f 100644 (file)
@@ -1304,7 +1304,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
 #define BD_PART_WRITTEN(s)                                              \
-(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) -           \
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) -   \
                (s)->sectors_written_start) >> 1)
 
 static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
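
The ext4 and f2fs hunks above only rename the array index (sectors[1] becomes sectors[STAT_WRITE]); the surrounding ">> 1" is the pre-existing conversion from 512-byte sectors to kilobytes, two sectors per KiB. A trivial stand-alone sketch of that arithmetic with made-up counter values:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed sample values for the cumulative write-sector counter. */
            unsigned long long sectors_now   = 123456;  /* sectors[STAT_WRITE] */
            unsigned long long sectors_start = 100000;  /* snapshot taken at mount */

            /* 512-byte sectors -> KiB: two sectors per KiB, hence ">> 1". */
            unsigned long long kbytes_written = (sectors_now - sectors_start) >> 1;

            printf("%llu KiB written since mount\n", kbytes_written);  /* 11728 */
            return 0;
    }
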
index 3995e92..17bcff7 100644 (file)
@@ -2882,7 +2882,8 @@ try_onemore:
        /* For write statistics */
        if (sb->s_bdev->bd_part)
                sbi->sectors_written_start =
-                       (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+                       (u64)part_stat_read(sb->s_bdev->bd_part,
+                                           sectors[STAT_WRITE]);
 
        /* Read accumulated write IO statistics if exists */
        seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
index b7e7f57..b73638d 100644 (file)
@@ -51,8 +51,8 @@ static void mpage_end_io(struct bio *bio)
 
        bio_for_each_segment_all(bv, bio, i) {
                struct page *page = bv->bv_page;
-               page_endio(page, op_is_write(bio_op(bio)),
-                               blk_status_to_errno(bio->bi_status));
+               page_endio(page, bio_op(bio),
+                          blk_status_to_errno(bio->bi_status));
        }
 
        bio_put(bio);
index f08f5fe..5137174 100644 (file)
@@ -429,7 +429,6 @@ extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
 extern struct bio_set fs_bio_set;
 
@@ -443,12 +442,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
        return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
 }
 
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
-       return bio_clone_bioset(bio, gfp_mask, NULL);
-
-}
-
 extern blk_qc_t submit_bio(struct bio *);
 
 extern void bio_endio(struct bio *);
@@ -496,9 +489,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
                                unsigned long sectors, struct hd_struct *part);
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int op,
                                struct hd_struct *part,
                                unsigned long start_time);
 
@@ -553,8 +546,16 @@ do {                                               \
 #define bio_dev(bio) \
        disk_devt((bio)->bi_disk)
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page);
+#else
+static inline int bio_associate_blkcg_from_page(struct bio *bio,
+                                               struct page *page) {  return 0; }
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else  /* CONFIG_BLK_CGROUP */
index 6c666fd..f7b9107 100644 (file)
@@ -35,6 +35,7 @@ enum blkg_rwstat_type {
        BLKG_RWSTAT_WRITE,
        BLKG_RWSTAT_SYNC,
        BLKG_RWSTAT_ASYNC,
+       BLKG_RWSTAT_DISCARD,
 
        BLKG_RWSTAT_NR,
        BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
@@ -136,6 +137,12 @@ struct blkcg_gq {
        struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
 
        struct rcu_head                 rcu_head;
+
+       atomic_t                        use_delay;
+       atomic64_t                      delay_nsec;
+       atomic64_t                      delay_start;
+       u64                             last_delay;
+       int                             last_use;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -148,6 +155,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf,
+                                     size_t size);
 
 struct blkcg_policy {
        int                             plid;
@@ -167,6 +176,7 @@ struct blkcg_policy {
        blkcg_pol_offline_pd_fn         *pd_offline_fn;
        blkcg_pol_free_pd_fn            *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
+       blkcg_pol_stat_pd_fn            *pd_stat_fn;
 };
 
 extern struct blkcg blkcg_root;
@@ -238,6 +248,42 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
        return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
+static inline bool blk_cgroup_congested(void)
+{
+       struct cgroup_subsys_state *css;
+       bool ret = false;
+
+       rcu_read_lock();
+       css = kthread_blkcg();
+       if (!css)
+               css = task_css(current, io_cgrp_id);
+       while (css) {
+               if (atomic_read(&css->cgroup->congestion_count)) {
+                       ret = true;
+                       break;
+               }
+               css = css->parent;
+       }
+       rcu_read_unlock();
+       return ret;
+}
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg.  The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio.  Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+       return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
 /**
  * blkcg_parent - get the parent of a blkcg
  * @blkcg: blkcg of interest
@@ -355,6 +401,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
        atomic_inc(&blkg->refcnt);
 }
 
+/**
+ * blkg_try_get - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+{
+       if (atomic_inc_not_zero(&blkg->refcnt))
+               return blkg;
+       return NULL;
+}
+
+
 void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
@@ -589,7 +650,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 {
        struct percpu_counter *cnt;
 
-       if (op_is_write(op))
+       if (op_is_discard(op))
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+       else if (op_is_write(op))
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
        else
                cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
@@ -706,8 +769,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
        if (!throtl) {
                blkg = blkg ?: q->root_blkg;
-               blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-                               bio->bi_iter.bi_size);
+               /*
+                * If the bio is flagged with BIO_QUEUE_ENTERED it means this
+                * is a split bio and we would have already accounted for the
+                * size of the bio.
+                */
+               if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
+                       blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
+                                       bio->bi_iter.bi_size);
                blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
        }
 
@@ -715,6 +784,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
        return !throtl;
 }
 
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+       if (atomic_add_return(1, &blkg->use_delay) == 1)
+               atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+       int old = atomic_read(&blkg->use_delay);
+
+       if (old == 0)
+               return 0;
+
+       /*
+        * We do this song and dance because we can race with somebody else
+        * adding or removing delay.  If we just did an atomic_dec we'd end up
+        * negative and we'd already be in trouble.  We need to subtract 1 and
+        * then check to see if we were the last delay so we can drop the
+        * congestion count on the cgroup.
+        */
+       while (old) {
+               int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+               if (cur == old)
+                       break;
+               old = cur;
+       }
+
+       if (old == 0)
+               return 0;
+       if (old == 1)
+               atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+       return 1;
+}
+
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+       int old = atomic_read(&blkg->use_delay);
+       if (!old)
+               return;
+       /* We only want 1 person clearing the congestion count for this blkg. */
+       while (old) {
+               int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
+               if (cur == old) {
+                       atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+                       break;
+               }
+               old = cur;
+       }
+}
+
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
 #else  /* CONFIG_BLK_CGROUP */
 
 struct blkcg {
@@ -734,8 +856,13 @@ struct blkcg_policy {
 
 #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline void blkcg_maybe_throttle_current(void) { }
+static inline bool blk_cgroup_congested(void) { return false; }
+
 #ifdef CONFIG_BLOCK
 
+static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
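
The blk-cgroup additions above pair a per-blkg use_delay counter with a per-cgroup congestion_count, and blkcg_unuse_delay() spells out a cmpxchg loop so racing decrements can never drive the counter negative and only the 1-to-0 transition drops the congestion count. A stand-alone C11 sketch of that decrement pattern; the names and the single-threaded driver in main() are illustrative only:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int use_delay = 0;
    static atomic_int congestion_count = 0;

    static void use(void)
    {
            /* The first user marks the group congested. */
            if (atomic_fetch_add(&use_delay, 1) + 1 == 1)
                    atomic_fetch_add(&congestion_count, 1);
    }

    /* Decrement use_delay without ever going below zero; the caller that
     * observes the 1 -> 0 transition drops the congestion count. */
    static int unuse(void)
    {
            int old = atomic_load(&use_delay);

            while (old) {
                    if (atomic_compare_exchange_weak(&use_delay, &old, old - 1))
                            break;
                    /* "old" was reloaded by the failed CAS; retry. */
            }
            if (old == 0)
                    return 0;       /* somebody else already cleared it */
            if (old == 1)
                    atomic_fetch_sub(&congestion_count, 1);
            return 1;
    }

    int main(void)
    {
            use();
            use();
            unuse();
            unuse();
            unuse();        /* extra unuse is a no-op: counter stays at 0 */
            printf("use_delay=%d congestion=%d\n",
                   atomic_load(&use_delay), atomic_load(&congestion_count));
            return 0;
    }
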
index e3147eb..d710e92 100644 (file)
@@ -35,10 +35,12 @@ struct blk_mq_hw_ctx {
        struct sbitmap          ctx_map;
 
        struct blk_mq_ctx       *dispatch_from;
+       unsigned int            dispatch_busy;
 
-       struct blk_mq_ctx       **ctxs;
        unsigned int            nr_ctx;
+       struct blk_mq_ctx       **ctxs;
 
+       spinlock_t              dispatch_wait_lock;
        wait_queue_entry_t      dispatch_wait;
        atomic_t                wait_index;
 
index 3c4f390..f6dfb30 100644 (file)
@@ -179,10 +179,8 @@ struct bio {
         */
        struct io_context       *bi_ioc;
        struct cgroup_subsys_state *bi_css;
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       void                    *bi_cg_private;
+       struct blkcg_gq         *bi_blkg;
        struct bio_issue        bi_issue;
-#endif
 #endif
        union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -329,7 +327,7 @@ enum req_flag_bits {
 
        /* for driver use */
        __REQ_DRV,
-
+       __REQ_SWAP,             /* swapping request. */
        __REQ_NR_BITS,          /* stops here */
 };
 
@@ -351,6 +349,7 @@ enum req_flag_bits {
 #define REQ_NOUNMAP            (1ULL << __REQ_NOUNMAP)
 
 #define REQ_DRV                        (1ULL << __REQ_DRV)
+#define REQ_SWAP               (1ULL << __REQ_SWAP)
 
 #define REQ_FAILFAST_MASK \
        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -358,6 +357,14 @@ enum req_flag_bits {
 #define REQ_NOMERGE_FLAGS \
        (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
 
+enum stat_group {
+       STAT_READ,
+       STAT_WRITE,
+       STAT_DISCARD,
+
+       NR_STAT_GROUPS
+};
+
 #define bio_op(bio) \
        ((bio)->bi_opf & REQ_OP_MASK)
 #define req_op(req) \
@@ -395,6 +402,18 @@ static inline bool op_is_sync(unsigned int op)
                (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
 }
 
+static inline bool op_is_discard(unsigned int op)
+{
+       return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
+}
+
+static inline int op_stat_group(unsigned int op)
+{
+       if (op_is_discard(op))
+               return STAT_DISCARD;
+       return op_is_write(op);
+}
+
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE          -1U
 #define BLK_QC_T_SHIFT         16
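
The new enum stat_group and op_stat_group() above split accounting into read, write and discard buckets; discards are filtered out first because they are also write-direction ops, and op_is_write() returning 0 or 1 is what maps the remainder onto STAT_READ/STAT_WRITE. A small user-space sketch of the mapping, using simplified stand-ins for the op encoding:

    #include <stdio.h>

    enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, NR_STAT_GROUPS };

    /* Simplified stand-ins: low bit = data direction, value 3 = discard. */
    enum demo_op { DEMO_READ = 0, DEMO_WRITE = 1, DEMO_DISCARD = 3 };

    static int op_is_write(unsigned int op)   { return op & 1; }
    static int op_is_discard(unsigned int op) { return op == DEMO_DISCARD; }

    static int op_stat_group(unsigned int op)
    {
            if (op_is_discard(op))          /* must come before the write test */
                    return STAT_DISCARD;
            return op_is_write(op);         /* 0 -> STAT_READ, 1 -> STAT_WRITE */
    }

    int main(void)
    {
            unsigned long sectors[NR_STAT_GROUPS] = { 0 };

            sectors[op_stat_group(DEMO_READ)]    += 8;
            sectors[op_stat_group(DEMO_WRITE)]   += 16;
            sectors[op_stat_group(DEMO_DISCARD)] += 2048;

            printf("read=%lu write=%lu discard=%lu\n",
                   sectors[STAT_READ], sectors[STAT_WRITE], sectors[STAT_DISCARD]);
            return 0;
    }
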
index 79226ca..050d599 100644 (file)
@@ -42,7 +42,7 @@ struct bsg_job;
 struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
-struct rq_wb;
+struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
 
@@ -442,10 +442,8 @@ struct request_queue {
        int                     nr_rqs[2];      /* # allocated [a]sync rqs */
        int                     nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
 
-       atomic_t                shared_hctx_restart;
-
        struct blk_queue_stats  *stats;
-       struct rq_wb            *rq_wb;
+       struct rq_qos           *rq_qos;
 
        /*
         * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
@@ -592,6 +590,7 @@ struct request_queue {
 
        struct queue_limits     limits;
 
+#ifdef CONFIG_BLK_DEV_ZONED
        /*
         * Zoned block device information for request dispatch control.
         * nr_zones is the total number of zones of the device. This is always
@@ -612,6 +611,7 @@ struct request_queue {
        unsigned int            nr_zones;
        unsigned long           *seq_zones_bitmap;
        unsigned long           *seq_zones_wlock;
+#endif /* CONFIG_BLK_DEV_ZONED */
 
        /*
         * sg stuff
@@ -800,11 +800,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
        return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
-static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
-{
-       return q->nr_zones;
-}
-
+#ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_queue_zone_no(struct request_queue *q,
                                             sector_t sector)
 {
@@ -820,6 +816,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
                return false;
        return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
 }
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 static inline bool rq_is_sync(struct request *rq)
 {
@@ -1070,6 +1067,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
        return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
        return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
@@ -1079,6 +1077,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 {
        return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
 }
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
@@ -1437,8 +1436,6 @@ enum blk_default_limits {
        BLK_SEG_BOUNDARY_MASK   = 0xFFFFFFFFUL,
 };
 
-#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
-
 static inline unsigned long queue_segment_boundary(struct request_queue *q)
 {
        return q->limits.seg_boundary_mask;
@@ -1639,15 +1636,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
        return 0;
 }
 
-static inline unsigned int bdev_nr_zones(struct block_device *bdev)
-{
-       struct request_queue *q = bdev_get_queue(bdev);
-
-       if (q)
-               return blk_queue_nr_zones(q);
-       return 0;
-}
-
 static inline int queue_dma_alignment(struct request_queue *q)
 {
        return q ? q->dma_alignment : 511;
@@ -1877,6 +1865,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
                                bip_next->bip_vec[0].bv_offset);
 }
 
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi:                blk_integrity profile for device
+ * @sectors:   Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+                                                  unsigned int sectors)
+{
+       return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+                                              unsigned int sectors)
+{
+       return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct bio;
@@ -1950,12 +1960,24 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
        return false;
 }
 
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+                                                  unsigned int sectors)
+{
+       return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+                                              unsigned int sectors)
+{
+       return 0;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct block_device_operations {
        int (*open) (struct block_device *, fmode_t);
        void (*release) (struct gendisk *, fmode_t);
-       int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
+       int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        unsigned int (*check_events) (struct gendisk *disk,
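
The block_device_operations change above widens ->rw_page() from a write boolean to the request op, matching the bdev_read_page()/bdev_write_page() hunks near the top of this diff that now pass REQ_OP_READ/REQ_OP_WRITE. A hedged user-space sketch of why dispatching on an op value is more extensible than a bool; the enum values and callback here are illustrative, not the kernel's:

    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's REQ_OP_* values. */
    enum demo_req_op { DEMO_REQ_OP_READ = 0, DEMO_REQ_OP_WRITE = 1 };

    /* An rw_page-style callback keyed on the operation rather than a
     * "bool is_write": new operations can be rejected or handled without
     * adding another flag argument. */
    static int demo_rw_page(unsigned long long sector, void *page,
                            enum demo_req_op op)
    {
            switch (op) {
            case DEMO_REQ_OP_READ:
                    printf("read  sector %llu into %p\n", sector, page);
                    return 0;
            case DEMO_REQ_OP_WRITE:
                    printf("write sector %llu from %p\n", sector, page);
                    return 0;
            default:
                    return -1;      /* unsupported operation */
            }
    }

    int main(void)
    {
            char page[4096];

            demo_rw_page(0, page, DEMO_REQ_OP_READ);
            demo_rw_page(8, page, DEMO_REQ_OP_WRITE);
            return 0;
    }
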
index e75dfd1..528271c 100644 (file)
@@ -13,6 +13,7 @@
 
 #include <linux/fs.h>          /* not really needed, later.. */
 #include <linux/list.h>
+#include <scsi/scsi_common.h>
 #include <uapi/linux/cdrom.h>
 
 struct packet_command
@@ -21,7 +22,7 @@ struct packet_command
        unsigned char           *buffer;
        unsigned int            buflen;
        int                     stat;
-       struct request_sense    *sense;
+       struct scsi_sense_hdr   *sshdr;
        unsigned char           data_direction;
        int                     quiet;
        int                     timeout;
index c0e68f9..ff20b67 100644 (file)
@@ -438,6 +438,9 @@ struct cgroup {
        /* used to store eBPF programs */
        struct cgroup_bpf bpf;
 
+       /* If there is block congestion on this cgroup. */
+       atomic_t congestion_count;
+
        /* ids of the ancestors at each level including self */
        int ancestor_ids[];
 };
index 6cb8a57..5786442 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
+#include <linux/blk_types.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -82,10 +83,10 @@ struct partition {
 } __attribute__((packed));
 
 struct disk_stats {
-       unsigned long sectors[2];       /* READs and WRITEs */
-       unsigned long ios[2];
-       unsigned long merges[2];
-       unsigned long ticks[2];
+       unsigned long sectors[NR_STAT_GROUPS];
+       unsigned long ios[NR_STAT_GROUPS];
+       unsigned long merges[NR_STAT_GROUPS];
+       unsigned long ticks[NR_STAT_GROUPS];
        unsigned long io_ticks;
        unsigned long time_in_queue;
 };
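
struct disk_stats above now sizes every counter array by NR_STAT_GROUPS, and the part_stat_read_accum() macro added in the next hunk sums one field across the read, write and discard groups. A minimal stand-alone version of that accumulation; the struct and macro names are illustrative stand-ins:

    #include <stdio.h>

    enum { STAT_READ, STAT_WRITE, STAT_DISCARD, NR_STAT_GROUPS };

    /* Flattened stand-in for the per-cpu disk statistics. */
    struct demo_disk_stats {
            unsigned long sectors[NR_STAT_GROUPS];
            unsigned long ios[NR_STAT_GROUPS];
    };

    /* Sum one field across all three stat groups, same token-pasting
     * style as the kernel macro. */
    #define demo_stat_read_accum(stats, field)              \
            ((stats)->field[STAT_READ] +                    \
             (stats)->field[STAT_WRITE] +                   \
             (stats)->field[STAT_DISCARD])

    int main(void)
    {
            struct demo_disk_stats st = {
                    .sectors = { 1000, 5000, 2048 },
                    .ios     = {   10,   40,    2 },
            };

            printf("total sectors=%lu total ios=%lu\n",
                   demo_stat_read_accum(&st, sectors),   /* 8048 */
                   demo_stat_read_accum(&st, ios));      /* 52 */
            return 0;
    }
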
@@ -353,6 +354,11 @@ static inline void free_part_stats(struct hd_struct *part)
 
 #endif /* CONFIG_SMP */
 
+#define part_stat_read_accum(part, field)                              \
+       (part_stat_read(part, field[STAT_READ]) +                       \
+        part_stat_read(part, field[STAT_WRITE]) +                      \
+        part_stat_read(part, field[STAT_DISCARD]))
+
 #define part_stat_add(cpu, part, field, addnd) do {                    \
        __part_stat_add((cpu), (part), field, addnd);                   \
        if ((part)->partno)                                             \
index 6c6fb11..680d339 100644 (file)
@@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                          gfp_t gfp_mask, struct mem_cgroup **memcgp,
                          bool compound);
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound);
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
                              bool lrucare, bool compound);
 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
@@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
        return 0;
 }
 
+static inline int mem_cgroup_try_charge_delay(struct page *page,
+                                             struct mm_struct *mm,
+                                             gfp_t gfp_mask,
+                                             struct mem_cgroup **memcgp,
+                                             bool compound)
+{
+       *memcgp = NULL;
+       return 0;
+}
+
 static inline void mem_cgroup_commit_charge(struct page *page,
                                            struct mem_cgroup *memcg,
                                            bool lrucare, bool compound)
index 2950ce9..80dfedc 100644 (file)
@@ -749,6 +749,11 @@ enum {
        NVME_FEAT_HOST_MEM_BUF  = 0x0d,
        NVME_FEAT_TIMESTAMP     = 0x0e,
        NVME_FEAT_KATO          = 0x0f,
+       NVME_FEAT_HCTM          = 0x10,
+       NVME_FEAT_NOPSC         = 0x11,
+       NVME_FEAT_RRL           = 0x12,
+       NVME_FEAT_PLM_CONFIG    = 0x13,
+       NVME_FEAT_PLM_WINDOW    = 0x14,
        NVME_FEAT_SW_PROGRESS   = 0x80,
        NVME_FEAT_HOST_ID       = 0x81,
        NVME_FEAT_RESV_MASK     = 0x82,
index 43731fe..c2e993d 100644 (file)
@@ -734,6 +734,10 @@ struct task_struct {
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
 #endif
+#ifdef CONFIG_BLK_CGROUP
+       /* to be used once the psi infrastructure lands upstream. */
+       unsigned                        use_memdelay:1;
+#endif
 
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
 
@@ -1151,6 +1155,10 @@ struct task_struct {
        unsigned int                    memcg_nr_pages_over_high;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+       struct request_queue            *throttle_queue;
+#endif
+
 #ifdef CONFIG_UPROBES
        struct uprobe_task              *utask;
 #endif
index c063443..1a8bd05 100644 (file)
@@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 
        return memcg->swappiness;
 }
-
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
@@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 }
 #endif
 
+#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+                                        gfp_t gfp_mask);
+#else
+static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg,
+                                               int node, gfp_t gfp_mask)
+{
+}
+#endif
+
 #ifdef CONFIG_MEMCG_SWAP
 extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
 extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
index c6aa8a3..b9626aa 100644 (file)
@@ -37,9 +37,33 @@ struct t10_pi_tuple {
 #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff)
 #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff)
 
+static inline u32 t10_pi_ref_tag(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+       return blk_rq_pos(rq) >>
+               (rq->q->integrity.interval_exp - 9) & 0xffffffff;
+#else
+       return -1U;
+#endif
+}
+
 extern const struct blk_integrity_profile t10_pi_type1_crc;
 extern const struct blk_integrity_profile t10_pi_type1_ip;
 extern const struct blk_integrity_profile t10_pi_type3_crc;
 extern const struct blk_integrity_profile t10_pi_type3_ip;
 
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+extern void t10_pi_prepare(struct request *rq, u8 protection_type);
+extern void t10_pi_complete(struct request *rq, u8 protection_type,
+                           unsigned int intervals);
+#else
+static inline void t10_pi_complete(struct request *rq, u8 protection_type,
+                                  unsigned int intervals)
+{
+}
+static inline void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+}
+#endif
+
 #endif
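
t10_pi_ref_tag() above derives the reference tag from the request position and the queue's integrity interval, replacing the scsi_prot_ref_tag() helper removed further down: the 512-byte-sector position is shifted down to interval units and truncated to 32 bits. A quick user-space check of that arithmetic with an assumed 4096-byte protection interval:

    #include <stdio.h>

    /* ref tag = lower 32 bits of the position expressed in protection
     * intervals, where the position is in 512-byte sectors and the
     * interval is 2^interval_exp bytes. */
    static unsigned int demo_ref_tag(unsigned long long pos_sectors,
                                     unsigned int interval_exp)
    {
            return (unsigned int)(pos_sectors >> (interval_exp - 9));
    }

    int main(void)
    {
            /* interval_exp = 12 (4096-byte interval, assumed):
             * eight 512-byte sectors per interval. */
            printf("%u\n", demo_ref_tag(0, 12));        /* 0 */
            printf("%u\n", demo_ref_tag(8, 12));        /* 1 */
            printf("%u\n", demo_ref_tag(1 << 20, 12));  /* 131072 */
            return 0;
    }
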
index 4a88419..05589a3 100644 (file)
@@ -51,6 +51,7 @@
 #include <linux/security.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
 struct linux_binprm;
 
 /*
@@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
                task_work_run();
 
        mem_cgroup_handle_over_high();
+       blkcg_maybe_throttle_current();
 }
 
 #endif /* <linux/tracehook.h> */
index aaf1e97..c891ada 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
+#include <linux/t10-pi.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/timer.h>
@@ -14,8 +15,6 @@
 struct Scsi_Host;
 struct scsi_driver;
 
-#include <scsi/scsi_device.h>
-
 /*
  * MAX_COMMAND_SIZE is:
  * The longest fixed-length SCSI CDB as per the SCSI standard.
@@ -120,11 +119,11 @@ struct scsi_cmnd {
        struct request *request;        /* The command we are
                                           working on */
 
-#define SCSI_SENSE_BUFFERSIZE  96
        unsigned char *sense_buffer;
                                /* obtained by REQUEST SENSE when
                                 * CHECK CONDITION is received on original
-                                * command (auto-sense) */
+                                * command (auto-sense). Length must be
+                                * SCSI_SENSE_BUFFERSIZE bytes. */
 
        /* Low-level done function - can be used by low-level driver to point
         *        to completion function.  Not used by mid/upper level code. */
@@ -313,12 +312,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
        return scmd->device->sector_size;
 }
 
-static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
-{
-       return blk_rq_pos(scmd->request) >>
-               (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff;
-}
-
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
 {
        return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
index 4c36af6..202f4d6 100644 (file)
@@ -17,6 +17,8 @@ struct scsi_sense_hdr;
 
 typedef __u64 __bitwise blist_flags_t;
 
+#define SCSI_SENSE_BUFFERSIZE  96
+
 struct scsi_mode_data {
        __u32   length;
        __u16   block_descriptor_length;
@@ -426,11 +428,21 @@ extern const char *scsi_device_state_name(enum scsi_device_state);
 extern int scsi_is_sdev_device(const struct device *);
 extern int scsi_is_target_device(const struct device *);
 extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);
-extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                        int data_direction, void *buffer, unsigned bufflen,
                        unsigned char *sense, struct scsi_sense_hdr *sshdr,
                        int timeout, int retries, u64 flags,
                        req_flags_t rq_flags, int *resid);
+/* Make sure any sense buffer is the correct size. */
+#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense,        \
+                    sshdr, timeout, retries, flags, rq_flags, resid)   \
+({                                                                     \
+       BUILD_BUG_ON((sense) != NULL &&                                 \
+                    sizeof(sense) != SCSI_SENSE_BUFFERSIZE);           \
+       __scsi_execute(sdev, cmd, data_direction, buffer, bufflen,      \
+                      sense, sshdr, timeout, retries, flags, rq_flags, \
+                      resid);                                          \
+})
 static inline int scsi_execute_req(struct scsi_device *sdev,
        const unsigned char *cmd, int data_direction, void *buffer,
        unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
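
The scsi_execute() wrapper above exists so the compiler can check, at build time, that any sense buffer handed in is a real SCSI_SENSE_BUFFERSIZE-byte array rather than a decayed pointer, since sizeof() only sees the array size when the argument is an array in the caller's scope. A user-space sketch of the same trick, using a GNU statement expression and _Static_assert in place of BUILD_BUG_ON; unlike the kernel macro it does not accept a NULL sense argument, and all names here are illustrative:

    #include <stdio.h>
    #include <string.h>

    #define DEMO_SENSE_BUFFERSIZE 96

    static int demo_execute_impl(const char *cmd, unsigned char *sense)
    {
            memset(sense, 0, DEMO_SENSE_BUFFERSIZE);
            printf("executing %s\n", cmd);
            return 0;
    }

    /* Compile-time guard: only accept a real 96-byte sense array.
     * sizeof() sees the array type here, so passing a plain pointer
     * (sizeof == 8 on most 64-bit targets) breaks the build. */
    #define demo_execute(cmd, sense)                                        \
            ({                                                              \
                    _Static_assert(sizeof(sense) == DEMO_SENSE_BUFFERSIZE,  \
                                   "sense buffer has the wrong size");      \
                    demo_execute_impl((cmd), (sense));                      \
            })

    int main(void)
    {
            unsigned char sense[DEMO_SENSE_BUFFERSIZE];

            return demo_execute("TEST UNIT READY", sense);
            /* unsigned char *p = sense;
             * demo_execute("TEST UNIT READY", p);   would fail to compile */
    }
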
index 821f71a..8d19e02 100644 (file)
@@ -195,7 +195,7 @@ struct cache_sb {
        };
        };
 
-       __u32                   last_mount;     /* time_t */
+       __u32                   last_mount;     /* time overflow in y2106 */
 
        __u16                   first_bucket;
        union {
@@ -318,7 +318,7 @@ struct uuid_entry {
                struct {
                        __u8    uuid[16];
                        __u8    label[32];
-                       __u32   first_reg;
+                       __u32   first_reg; /* time overflow in y2106 */
                        __u32   last_reg;
                        __u32   invalidated;
 
index e3c70fe..ff5a5db 100644 (file)
@@ -117,7 +117,7 @@ struct blk_zone_report {
        __u32           nr_zones;
        __u8            reserved[4];
        struct blk_zone zones[0];
-} __packed;
+};
 
 /**
  * struct blk_zone_range - BLKRESETZONE ioctl request
index a191c05..f40c82b 100644 (file)
@@ -868,6 +868,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->fail_nth = 0;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+       tsk->throttle_queue = NULL;
+       tsk->use_memdelay = 0;
+#endif
+
        return tsk;
 
 free_stack:
index 987d9a9..b951aa1 100644 (file)
@@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        if (!buts->buf_size || !buts->buf_nr)
                return -EINVAL;
 
+       if (!blk_debugfs_root)
+               return -ENOENT;
+
        strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
        buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
 
@@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
        ret = -ENOENT;
 
-       if (!blk_debugfs_root)
-               goto err;
-
        dir = debugfs_lookup(buts->name, blk_debugfs_root);
        if (!dir)
                bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
index 25346bd..a9e1e09 100644 (file)
@@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
                                               vmf->address, page_to_nid(page));
                if (unlikely(!pages[i] ||
-                            mem_cgroup_try_charge(pages[i], vma->vm_mm,
+                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
                                     GFP_KERNEL, &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
@@ -1312,7 +1312,7 @@ alloc:
                goto out;
        }
 
-       if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
                                        huge_gfp, &memcg, true))) {
                put_page(new_page);
                split_huge_pmd(vma, vmf->pmd, vmf->address);
index 8c0280b..473278b 100644 (file)
@@ -5593,6 +5593,19 @@ out:
        return ret;
 }
 
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
+{
+       struct mem_cgroup *memcg;
+       int ret;
+
+       ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+       memcg = *memcgp;
+       mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+       return ret;
+}
+
 /**
  * mem_cgroup_commit_charge - commit a page charge
  * @page: page to charge
index 7206a63..dfe80c5 100644 (file)
@@ -2503,7 +2503,7 @@ static int wp_page_copy(struct vm_fault *vmf)
                cow_user_page(new_page, old_page, vmf->address, vma);
        }
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+       if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_new;
 
        __SetPageUptodate(new_page);
@@ -3003,8 +3003,8 @@ int do_swap_page(struct vm_fault *vmf)
                goto out_page;
        }
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
-                               &memcg, false)) {
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
+                                       &memcg, false)) {
                ret = VM_FAULT_OOM;
                goto out_page;
        }
@@ -3165,7 +3165,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
        if (!page)
                goto oom;
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+                                       false))
                goto oom_free_page;
 
        /*
@@ -3661,7 +3662,7 @@ static int do_cow_fault(struct vm_fault *vmf)
        if (!vmf->cow_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+       if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
                                &vmf->memcg, false)) {
                put_page(vmf->cow_page);
                return VM_FAULT_OOM;
index b41cf96..aafd19e 100644 (file)
@@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
                ret = -ENOMEM;
                goto out;
        }
-       bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+       bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+       bio_associate_blkcg_from_page(bio, page);
        count_swpout_vm_event(page);
        set_page_writeback(page);
        unlock_page(page);
index e273f0d..a59ea70 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/syscalls.h>
 #include <linux/file.h>
 #include <linux/mm_inline.h>
+#include <linux/blk-cgroup.h>
 
 #include "internal.h"
 
@@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
 {
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long max_pages = ra->ra_pages;
+       unsigned long add_pages;
        pgoff_t prev_offset;
 
        /*
@@ -474,10 +476,17 @@ readit:
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulted next readahead window into the current one.
+        * Take care of maximum IO pages as above.
         */
        if (offset == ra->start && ra->size == ra->async_size) {
-               ra->async_size = get_next_ra_size(ra, max_pages);
-               ra->size += ra->async_size;
+               add_pages = get_next_ra_size(ra, max_pages);
+               if (ra->size + add_pages <= max_pages) {
+                       ra->async_size = add_pages;
+                       ra->size += add_pages;
+               } else {
+                       ra->size = max_pages;
+                       ra->async_size = max_pages >> 1;
+               }
        }
 
        return ra_submit(ra, mapping, filp);
@@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping,
        if (!ra->ra_pages)
                return;
 
+       if (blk_cgroup_congested())
+               return;
+
        /* be dumb */
        if (filp && (filp->f_mode & FMODE_RANDOM)) {
                force_page_cache_readahead(mapping, filp, offset, req_size);
@@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping,
        if (inode_read_congested(mapping->host))
                return;
 
+       if (blk_cgroup_congested())
+               return;
+
        /* do read-ahead */
        ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
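
The readahead hunk above keeps the "merge the async window into the current window" path from growing ra->size past the per-file maximum: if adding the next window would overflow, the window is pinned to max_pages and the async marker is placed halfway in. A small sketch of the clamping with assumed window sizes:

    #include <stdio.h>

    struct demo_ra {
            unsigned long size;        /* current window, in pages */
            unsigned long async_size;  /* readahead marker offset  */
    };

    /* Mirror of the clamped merge: never let size exceed max_pages. */
    static void merge_next_window(struct demo_ra *ra, unsigned long add_pages,
                                  unsigned long max_pages)
    {
            if (ra->size + add_pages <= max_pages) {
                    ra->async_size = add_pages;
                    ra->size += add_pages;
            } else {
                    ra->size = max_pages;
                    ra->async_size = max_pages >> 1;
            }
    }

    int main(void)
    {
            struct demo_ra ra = { .size = 192, .async_size = 192 };

            /* The next window (assumed to be 256 pages) would push the
             * current 192-page window past a 256-page readahead cap. */
            merge_next_window(&ra, 256, 256);
            printf("size=%lu async_size=%lu\n", ra.size, ra.async_size);
            /* prints size=256 async_size=128 instead of the old 448/256 */
            return 0;
    }
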
index 2cab844..6206ca3 100644 (file)
@@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
         */
-       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
-                       false);
+       error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+                                           &memcg, false);
        if (error)
                goto out;
        /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1712,7 +1712,7 @@ repeat:
                                goto failed;
                }
 
-               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+               error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                false);
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
@@ -1818,7 +1818,7 @@ alloc_nohuge:             page = shmem_alloc_and_acct_page(gfp, inode,
                if (sgp == SGP_WRITE)
                        __SetPageReferenced(page);
 
-               error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+               error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                PageTransHuge(page));
                if (error)
                        goto unacct;
@@ -2291,7 +2291,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
        __SetPageSwapBacked(page);
        __SetPageUptodate(page);
 
-       ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+       ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
        if (ret)
                goto out_release;
 
index 2cc2972..db4ec8a 100644 (file)
@@ -3731,6 +3731,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
        }
 }
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+                                 gfp_t gfp_mask)
+{
+       struct swap_info_struct *si, *next;
+       if (!(gfp_mask & __GFP_IO) || !memcg)
+               return;
+
+       if (!blk_cgroup_congested())
+               return;
+
+       /*
+        * We've already scheduled a throttle, avoid taking the global swap
+        * lock.
+        */
+       if (current->throttle_queue)
+               return;
+
+       spin_lock(&swap_avail_lock);
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+                                 avail_lists[node]) {
+               if (si->bdev) {
+                       blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+                                               true);
+                       break;
+               }
+       }
+       spin_unlock(&swap_avail_lock);
+}
+#endif
+
 static int __init swapfile_init(void)
 {
        int nid;