libceph: lower exponential backoff delay
author Ilya Dryomov <idryomov@gmail.com>
Thu, 29 Oct 2020 13:49:10 +0000 (14:49 +0100)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 14 Dec 2020 22:21:48 +0000 (23:21 +0100)
The current setting allows the backoff to climb up to 5 minutes.  This
is too high -- it becomes hard to tell whether the client is stuck on
something or just in backoff.

In userspace, ms_max_backoff defaults to 15 seconds.  Let's do the
same.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
include/linux/ceph/messenger.h
net/ceph/messenger.c

index 60b324e..b47c7cc 100644 (file)
@@ -241,8 +241,8 @@ struct ceph_msg {
 };
 
 /* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL    (HZ/2)
-#define MAX_DELAY_INTERVAL     (5 * 60 * HZ)
+#define BASE_DELAY_INTERVAL    (HZ / 4)
+#define MAX_DELAY_INTERVAL     (15 * HZ)
 
 /*
  * A single connection with another host.
index 214ae2d..f3eb66b 100644 (file)
@@ -2812,6 +2812,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
                return -ENOENT;
        }
 
+       if (delay >= HZ)
+               delay = round_jiffies_relative(delay);
+
        dout("%s %p %lu\n", __func__, con, delay);
        if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
                dout("%s %p - already queued\n", __func__, con);
@@ -2871,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con)
        if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
                return false;
 
-       ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+       ret = queue_con_delay(con, con->delay);
        if (ret) {
                dout("%s: con %p FAILED to back off %lu\n", __func__,
                        con, con->delay);
@@ -3018,10 +3021,13 @@ static void con_fault(struct ceph_connection *con)
        } else {
                /* retry after a delay. */
                con->state = CON_STATE_PREOPEN;
-               if (con->delay == 0)
+               if (!con->delay) {
                        con->delay = BASE_DELAY_INTERVAL;
-               else if (con->delay < MAX_DELAY_INTERVAL)
+               } else if (con->delay < MAX_DELAY_INTERVAL) {
                        con->delay *= 2;
+                       if (con->delay > MAX_DELAY_INTERVAL)
+                               con->delay = MAX_DELAY_INTERVAL;
+               }
                con_flag_set(con, CON_FLAG_BACKOFF);
                queue_con(con);
        }