nvme-rdma: fix timeout handler

author Sagi Grimberg <sagi@grimberg.me>

Wed, 29 Jul 2020 09:36:03 +0000 (02:36 -0700)

committer Sagi Grimberg <sagi@grimberg.me>

Fri, 28 Aug 2020 23:43:57 +0000 (16:43 -0700)
author Sagi Grimberg <sagi@grimberg.me>
Wed, 29 Jul 2020 09:36:03 +0000 (02:36 -0700)
committer Sagi Grimberg <sagi@grimberg.me>
Fri, 28 Aug 2020 23:43:57 +0000 (16:43 -0700)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c

index ed387f6..cb8731f 100644 (file)
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1185,6 +1185,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
                 return;
  
+       dev_warn(ctrl->ctrl.device, "starting error recovery\n");
         queue_work(nvme_reset_wq, &ctrl->err_work);
  }
  
@@ -1951,6 +1952,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
         return 0;
  }
  
+static void nvme_rdma_complete_timed_out(struct request *rq)
+{
+       struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+       struct nvme_rdma_queue *queue = req->queue;
+       struct nvme_rdma_ctrl *ctrl = queue->ctrl;
+
+       /* teardown_lock fences other contexts that may complete the command */
+       mutex_lock(&ctrl->teardown_lock);
+       nvme_rdma_stop_queue(queue);
+       if (!blk_mq_request_completed(rq)) {
+               nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+               blk_mq_complete_request(rq);
+       }
+       mutex_unlock(&ctrl->teardown_lock);
+}
+
  static enum blk_eh_timer_return
  nvme_rdma_timeout(struct request *rq, bool reserved)
  {
@@ -1961,29 +1978,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
         dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
                  rq->tag, nvme_rdma_queue_idx(queue));
  
-       /*
-        * Restart the timer if a controller reset is already scheduled. Any
-        * timed out commands would be handled before entering the connecting
-        * state.
-        */
-       if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
-               return BLK_EH_RESET_TIMER;
-
         if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
                 /*
-                * Teardown immediately if controller times out while starting
-                * or we are already started error recovery. all outstanding
-                * requests are completed on shutdown, so we return BLK_EH_DONE.
+                * If we are resetting, connecting or deleting we should
+                * complete immediately because we may block controller
+                * teardown or setup sequence
+                * - ctrl disable/shutdown fabrics requests
+                * - connect requests
+                * - initialization admin requests
+                * - I/O requests that entered after unquiescing and
+                *   the controller stopped responding
+                *
+                * All other requests should be cancelled by the error
+                * recovery work, so it's fine that we fail it here.
                  */
-               flush_work(&ctrl->err_work);
-               nvme_rdma_teardown_io_queues(ctrl, false);
-               nvme_rdma_teardown_admin_queue(ctrl, false);
+               nvme_rdma_complete_timed_out(rq);
                 return BLK_EH_DONE;
         }
  
-       dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+       /*
+        * LIVE state should trigger the normal error recovery which will
+        * handle completing this request.
+        */
         nvme_rdma_error_recovery(ctrl);
-
         return BLK_EH_RESET_TIMER;
  }
author	Sagi Grimberg <sagi@grimberg.me>
	Wed, 29 Jul 2020 09:36:03 +0000 (02:36 -0700)
committer	Sagi Grimberg <sagi@grimberg.me>
	Fri, 28 Aug 2020 23:43:57 +0000 (16:43 -0700)