nvme-rdma: use dynamic dma mapping per command
authorMax Gurtovoy <maxg@mellanox.com>
Thu, 6 Jun 2019 09:27:36 +0000 (12:27 +0300)
committerSagi Grimberg <sagi@grimberg.me>
Thu, 6 Jun 2019 16:53:19 +0000 (09:53 -0700)
Commit 87fd125344d6 ("nvme-rdma: remove redundant reference between
ib_device and tagset") caused a kernel panic when disconnecting from an
inaccessible controller (disconnect during re-connection).

--
nvme nvme0: Removing ctrl: NQN "testnqn1"
nvme_rdma: nvme_rdma_exit_request: hctx 0 queue_idx 1
BUG: unable to handle kernel paging request at 0000000080000228
PGD 0 P4D 0
Oops: 0000 [#1] SMP PTI
...
Call Trace:
 blk_mq_exit_hctx+0x5c/0xf0
 blk_mq_exit_queue+0xd4/0x100
 blk_cleanup_queue+0x9a/0xc0
 nvme_rdma_destroy_io_queues+0x52/0x60 [nvme_rdma]
 nvme_rdma_shutdown_ctrl+0x3e/0x80 [nvme_rdma]
 nvme_do_delete_ctrl+0x53/0x80 [nvme_core]
 nvme_sysfs_delete+0x45/0x60 [nvme_core]
 kernfs_fop_write+0x105/0x180
 vfs_write+0xad/0x1a0
 ksys_write+0x5a/0xd0
 do_syscall_64+0x55/0x110
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7fa215417154
--

The reason for this crash is accessing an already freed ib_device for
performing dma_unmap during exit_request commands. The root cause for
that is that during re-connection all the queues are destroyed and
re-created (and the ib_device is reference counted by the queues and
freed as well) but the tagset stays alive and all the DMA mappings (that
we perform in init_request) kept in the request context. The original
commit fixed a different bug that was introduced during bonding (aka nic
teaming) tests that for some scenarios change the underlying ib_device
and caused memory leakage and possible segmentation fault. This commit
is a complementary commit that also changes the wrong DMA mappings that
were saved in the request context and making the request sqe dma
mappings dynamic with the command lifetime (i.e. mapped in .queue_rq and
unmapped in .complete). It also fixes the above crash of accessing freed
ib_device during destruction of the tagset.

Fixes: 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset")
Reported-by: Jim Harris <james.r.harris@intel.com>
Suggested-by: Sagi Grimberg <sagi@grimberg.me>
Tested-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
drivers/nvme/host/rdma.c

index 26709a2..97f668a 100644 (file)
@@ -213,6 +213,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
        if (!ring)
                return NULL;
 
+       /*
+        * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+        * lifetime. It's safe, since any chage in the underlying RDMA device
+        * will issue error recovery and queue re-creation.
+        */
        for (i = 0; i < ib_queue_size; i++) {
                if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
                        goto out_free_ring;
@@ -274,14 +279,9 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
                struct request *rq, unsigned int hctx_idx)
 {
-       struct nvme_rdma_ctrl *ctrl = set->driver_data;
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-       int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
-       struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
-       struct nvme_rdma_device *dev = queue->device;
 
-       nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
-                       DMA_TO_DEVICE);
+       kfree(req->sqe.data);
 }
 
 static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -292,15 +292,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
        struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
-       struct nvme_rdma_device *dev = queue->device;
-       struct ib_device *ibdev = dev->dev;
-       int ret;
 
        nvme_req(rq)->ctrl = &ctrl->ctrl;
-       ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
-                       DMA_TO_DEVICE);
-       if (ret)
-               return ret;
+       req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+       if (!req->sqe.data)
+               return -ENOMEM;
 
        req->queue = queue;
 
@@ -779,6 +775,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
        ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
+       /*
+        * Bind the async event SQE DMA mapping to the admin queue lifetime.
+        * It's safe, since any chage in the underlying RDMA device will issue
+        * error recovery and queue re-creation.
+        */
        error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
                        sizeof(struct nvme_command), DMA_TO_DEVICE);
        if (error)
@@ -1719,12 +1720,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
                return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
        dev = queue->device->dev;
+
+       req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
+                                        sizeof(struct nvme_command),
+                                        DMA_TO_DEVICE);
+       err = ib_dma_mapping_error(dev, req->sqe.dma);
+       if (unlikely(err))
+               return BLK_STS_RESOURCE;
+
        ib_dma_sync_single_for_cpu(dev, sqe->dma,
                        sizeof(struct nvme_command), DMA_TO_DEVICE);
 
        ret = nvme_setup_cmd(ns, rq, c);
        if (ret)
-               return ret;
+               goto unmap_qe;
 
        blk_mq_start_request(rq);
 
@@ -1749,10 +1758,16 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
        }
 
        return BLK_STS_OK;
+
 err:
        if (err == -ENOMEM || err == -EAGAIN)
-               return BLK_STS_RESOURCE;
-       return BLK_STS_IOERR;
+               ret = BLK_STS_RESOURCE;
+       else
+               ret = BLK_STS_IOERR;
+unmap_qe:
+       ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
+                           DMA_TO_DEVICE);
+       return ret;
 }
 
 static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
@@ -1765,8 +1780,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
 static void nvme_rdma_complete_rq(struct request *rq)
 {
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+       struct nvme_rdma_queue *queue = req->queue;
+       struct ib_device *ibdev = queue->device->dev;
 
-       nvme_rdma_unmap_data(req->queue, rq);
+       nvme_rdma_unmap_data(queue, rq);
+       ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
+                           DMA_TO_DEVICE);
        nvme_complete_rq(rq);
 }