fs: dlm: reconnect if socket error report occurs
author Alexander Aring <aahringo@redhat.com>
Fri, 21 May 2021 19:08:37 +0000 (15:08 -0400)
committer David Teigland <teigland@redhat.com>
Tue, 25 May 2021 14:22:20 +0000 (09:22 -0500)
This patch changes the reconnect handling so that a reconnect is
triggered whenever the socket error report callback fires. This also
handles reconnects in the non-blocking connect case, which is currently
missing. If ECONNREFUSED is reported, we delay the reconnect by one second.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
fs/dlm/lowcomms.c

index 4944aef..051f22d 100644 (file)
@@ -79,6 +79,8 @@ struct connection {
 #define CF_CLOSING 8
 #define CF_SHUTDOWN 9
 #define CF_CONNECTED 10
+#define CF_RECONNECT 11
+#define CF_DELAY_CONNECT 12
        struct list_head writequeue;  /* List of outgoing writequeue_entries */
        spinlock_t writequeue_lock;
        void (*connect_action) (struct connection *);   /* What to do to connect */
@@ -87,6 +89,7 @@ struct connection {
 #define MAX_CONNECT_RETRIES 3
        struct hlist_node list;
        struct connection *othercon;
+       struct connection *sendcon;
        struct work_struct rwork; /* Receive workqueue */
        struct work_struct swork; /* Send workqueue */
        wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
@@ -585,6 +588,22 @@ static void lowcomms_error_report(struct sock *sk)
                                   dlm_config.ci_tcp_port, sk->sk_err,
                                   sk->sk_err_soft);
        }
+
+       /* below sendcon only handling */
+       if (test_bit(CF_IS_OTHERCON, &con->flags))
+               con = con->sendcon;
+
+       switch (sk->sk_err) {
+       case ECONNREFUSED:
+               set_bit(CF_DELAY_CONNECT, &con->flags);
+               break;
+       default:
+               break;
+       }
+
+       if (!test_and_set_bit(CF_RECONNECT, &con->flags))
+               queue_work(send_workqueue, &con->swork);
+
 out:
        read_unlock_bh(&sk->sk_callback_lock);
        if (orig_report)
@@ -702,6 +721,8 @@ static void close_connection(struct connection *con, bool and_other,
        con->rx_leftover = 0;
        con->retries = 0;
        clear_bit(CF_CONNECTED, &con->flags);
+       clear_bit(CF_DELAY_CONNECT, &con->flags);
+       clear_bit(CF_RECONNECT, &con->flags);
        mutex_unlock(&con->sock_mutex);
        clear_bit(CF_CLOSING, &con->flags);
 }
@@ -840,18 +861,15 @@ out_resched:
 
 out_close:
        mutex_unlock(&con->sock_mutex);
-       if (ret != -EAGAIN) {
-               /* Reconnect when there is something to send */
+       if (ret == 0) {
                close_connection(con, false, true, false);
-               if (ret == 0) {
-                       log_print("connection %p got EOF from %d",
-                                 con, con->nodeid);
-                       /* handling for tcp shutdown */
-                       clear_bit(CF_SHUTDOWN, &con->flags);
-                       wake_up(&con->shutdown_wait);
-                       /* signal to breaking receive worker */
-                       ret = -1;
-               }
+               log_print("connection %p got EOF from %d",
+                         con, con->nodeid);
+               /* handling for tcp shutdown */
+               clear_bit(CF_SHUTDOWN, &con->flags);
+               wake_up(&con->shutdown_wait);
+               /* signal to breaking receive worker */
+               ret = -1;
        }
        return ret;
 }
@@ -940,6 +958,7 @@ static int accept_from_sock(struct listen_connection *con)
                        lockdep_set_subclass(&othercon->sock_mutex, 1);
                        set_bit(CF_IS_OTHERCON, &othercon->flags);
                        newcon->othercon = othercon;
+                       othercon->sendcon = newcon;
                } else {
                        /* close other sock con if we have something new */
                        close_connection(othercon, false, true, false);
@@ -1504,7 +1523,7 @@ static void send_to_sock(struct connection *con)
                                cond_resched();
                                goto out;
                        } else if (ret < 0)
-                               goto send_error;
+                               goto out;
                }
 
                /* Don't starve people filling buffers */
@@ -1521,14 +1540,6 @@ out:
        mutex_unlock(&con->sock_mutex);
        return;
 
-send_error:
-       mutex_unlock(&con->sock_mutex);
-       close_connection(con, false, false, true);
-       /* Requeue the send work. When the work daemon runs again, it will try
-          a new connection, then call this function again. */
-       queue_work(send_workqueue, &con->swork);
-       return;
-
 out_connect:
        mutex_unlock(&con->sock_mutex);
        queue_work(send_workqueue, &con->swork);
@@ -1605,8 +1616,15 @@ static void process_send_sockets(struct work_struct *work)
        WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
 
        clear_bit(CF_WRITE_PENDING, &con->flags);
-       if (con->sock == NULL) /* not mutex protected so check it inside too */
+
+       if (test_and_clear_bit(CF_RECONNECT, &con->flags))
+               close_connection(con, false, false, true);
+
+       if (con->sock == NULL) { /* not mutex protected so check it inside too */
+               if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
+                       msleep(1000);
                con->connect_action(con);
+       }
        if (!list_empty(&con->writequeue))
                send_to_sock(con);
 }