Commands get stuck while Host NVMe-oF controller is in reconnect state.
The controller enters into reconnect state when it loses connection with
the target. It tries to reconnect every 10 seconds (default) until
a successful reconnect or until the reconnect time-out is reached.
The default reconnect time out is 10 minutes.
Applications are expecting commands to complete with success or error
within a certain timeout (30 seconds by default). The NVMe host is
enforcing that timeout while it is connected, but during reconnect the
timeout is not enforced and commands may get stuck for a long period or
even forever.
To fix this long delay due to the default timeout, introduce new
"fast_io_fail_tmo" session parameter. The timeout is measured in seconds
from the controller reconnect and any command beyond that timeout is
rejected. The new parameter value may be passed during 'connect'.
The default value of -1 means no timeout (similar to current behavior).
Signed-off-by: Victor Gladkov <victor.gladkov@kioxia.com>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
+static void nvme_failfast_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+ struct nvme_ctrl, failfast_work);
+
+ if (ctrl->state != NVME_CTRL_CONNECTING)
+ return;
+
+ set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+ dev_info(ctrl->device, "failfast expired\n");
+ nvme_kick_requeue_lists(ctrl);
+}
+
+static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
+{
+ if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
+ return;
+
+ schedule_delayed_work(&ctrl->failfast_work,
+ ctrl->opts->fast_io_fail_tmo * HZ);
+}
+
+static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
+{
+ if (!ctrl->opts)
+ return;
+
+ cancel_delayed_work_sync(&ctrl->failfast_work);
+ clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+}
+
+
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
}
spin_unlock_irqrestore(&ctrl->lock, flags);
- if (changed && ctrl->state == NVME_CTRL_LIVE)
+ if (!changed)
+ return false;
+
+ if (ctrl->state == NVME_CTRL_LIVE) {
+ if (old_state == NVME_CTRL_CONNECTING)
+ nvme_stop_failfast_work(ctrl);
nvme_kick_requeue_lists(ctrl);
+ } else if (ctrl->state == NVME_CTRL_CONNECTING &&
+ old_state == NVME_CTRL_RESETTING) {
+ nvme_start_failfast_work(ctrl);
+ }
return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
{
nvme_mpath_stop(ctrl);
nvme_stop_keep_alive(ctrl);
+ nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
}
int ret;
ctrl->state = NVME_CTRL_NEW;
+ clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
init_waitqueue_head(&ctrl->state_wq);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+ INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
{
if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
ctrl->state != NVME_CTRL_DEAD &&
+ !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
{ NVMF_OPT_TOS, "tos=%d" },
+ { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" },
{ NVMF_OPT_ERR, NULL }
};
opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
opts->kato = NVME_DEFAULT_KATO;
opts->duplicate_connect = false;
+ opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO;
opts->hdr_digest = false;
opts->data_digest = false;
opts->tos = -1; /* < 0 == use transport default */
pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
ctrl_loss_tmo = token;
break;
+ case NVMF_OPT_FAIL_FAST_TMO:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (token >= 0)
+ pr_warn("I/O fail on reconnect controller after %d sec\n",
+ token);
+ opts->fast_io_fail_tmo = token;
+ break;
case NVMF_OPT_HOSTNQN:
if (opts->host) {
pr_err("hostnqn already user-assigned: %s\n",
opts->nr_poll_queues = 0;
opts->duplicate_connect = true;
}
- if (ctrl_loss_tmo < 0)
+ if (ctrl_loss_tmo < 0) {
opts->max_reconnects = -1;
- else
+ } else {
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
opts->reconnect_delay);
+ if (ctrl_loss_tmo < opts->fast_io_fail_tmo)
+ pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n",
+ opts->fast_io_fail_tmo, ctrl_loss_tmo);
+ }
if (!opts->host) {
kref_get(&nvmf_default_host->ref);
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
- NVMF_OPT_DISABLE_SQFLOW)
+ NVMF_OPT_DISABLE_SQFLOW |\
+ NVMF_OPT_FAIL_FAST_TMO)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf)
#define NVMF_DEF_RECONNECT_DELAY 10
/* default to 600 seconds of reconnect attempts before giving up */
#define NVMF_DEF_CTRL_LOSS_TMO 600
+/* default is -1: the fail fast mechanism is disabled */
+#define NVMF_DEF_FAIL_FAST_TMO -1
/*
* Define a host as seen by the target. We allocate one at boot, but also
NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
NVMF_OPT_TOS = 1 << 19,
+ NVMF_OPT_FAIL_FAST_TMO = 1 << 20,
};
/**
* @nr_write_queues: number of queues for write I/O
* @nr_poll_queues: number of queues for polling I/O
* @tos: type of service
+ * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
*/
struct nvmf_ctrl_options {
unsigned mask;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
int tos;
+ int fast_io_fail_tmo;
};
/*
struct nvme_ns *ns;
list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
+ continue;
switch (ns->ctrl->state) {
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
struct work_struct scan_work;
struct work_struct async_event_work;
struct delayed_work ka_work;
+ struct delayed_work failfast_work;
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
unsigned long events;
u16 icdoff;
u16 maxcmd;
int nr_reconnects;
+ unsigned long flags;
+#define NVME_CTRL_FAILFAST_EXPIRED 0
struct nvmf_ctrl_options *opts;
struct page *discard_page;