tcp: adjust TSO packet sizes based on min_rtt

author Eric Dumazet <edumazet@google.com>

Wed, 9 Mar 2022 01:57:57 +0000 (17:57 -0800)

committer Jakub Kicinski <kuba@kernel.org>

Thu, 10 Mar 2022 04:05:44 +0000 (20:05 -0800)
author Eric Dumazet <edumazet@google.com>
Wed, 9 Mar 2022 01:57:57 +0000 (17:57 -0800)
committer Jakub Kicinski <kuba@kernel.org>
Thu, 10 Mar 2022 04:05:44 +0000 (20:05 -0800)
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst

index 2572eec..b0024aa 100644 (file)
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -878,6 +878,29 @@ tcp_min_tso_segs - INTEGER
  
         Default: 2
  
+tcp_tso_rtt_log - INTEGER
+       Adjustment of TSO packet sizes based on min_rtt
+
+       Starting from linux-5.18, TCP autosizing can be tweaked
+       for flows having small RTT.
+
+       Old autosizing was splitting the pacing budget to send 1024 TSO
+       packets per second.
+
+       tso_packet_size = sk->sk_pacing_rate / 1024;
+
+       With the new mechanism, we increase this TSO sizing using:
+
+       distance = min_rtt_usec / (2^tcp_tso_rtt_log)
+       tso_packet_size += gso_max_size >> distance;
+
+       This means that flows between very close hosts can use bigger
+       TSO packets, reducing their cpu costs.
+
+       If you want to use the old autosizing, set this sysctl to 0.
+
+       Default: 9  (2^9 = 512 usec)
+
  tcp_pacing_ss_ratio - INTEGER
         sk->sk_pacing_rate is set by TCP stack using a ratio applied
         to current rate. (current_rate = cwnd * mss / srtt)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h

index f068786..ce0cc4e 100644 (file)
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -127,6 +127,7 @@ struct netns_ipv4 {
         u8 sysctl_tcp_synack_retries;
         u8 sysctl_tcp_syncookies;
         u8 sysctl_tcp_migrate_req;
+       u8 sysctl_tcp_comp_sack_nr;
         int sysctl_tcp_reordering;
         u8 sysctl_tcp_retries1;
         u8 sysctl_tcp_retries2;
@@ -160,9 +161,9 @@ struct netns_ipv4 {
         int sysctl_tcp_challenge_ack_limit;
         int sysctl_tcp_min_rtt_wlen;
         u8 sysctl_tcp_min_tso_segs;
+       u8 sysctl_tcp_tso_rtt_log;
         u8 sysctl_tcp_autocorking;
         u8 sysctl_tcp_reflect_tos;
-       u8 sysctl_tcp_comp_sack_nr;
         int sysctl_tcp_invalid_ratelimit;
         int sysctl_tcp_pacing_ss_ratio;
         int sysctl_tcp_pacing_ca_ratio;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 1cae27b..ad80d18 100644 (file)
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1271,6 +1271,13 @@ static struct ctl_table ipv4_net_table[] = {
                 .proc_handler   = proc_dou8vec_minmax,
                 .extra1         = SYSCTL_ONE,
         },
+       {
+               .procname       = "tcp_tso_rtt_log",
+               .data           = &init_net.ipv4.sysctl_tcp_tso_rtt_log,
+               .maxlen         = sizeof(u8),
+               .mode           = 0644,
+               .proc_handler   = proc_dou8vec_minmax,
+       },
         {
                 .procname       = "tcp_min_rtt_wlen",
                 .data           = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index 81694a3..f9cec62 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3137,6 +3137,7 @@ static int __net_init tcp_sk_init(struct net *net)
         /* rfc5961 challenge ack rate limiting */
         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
         net->ipv4.sysctl_tcp_min_tso_segs = 2;
+       net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
         net->ipv4.sysctl_tcp_autocorking = 1;
         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 2319531..81aaa7d 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1951,25 +1951,34 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
  }
  
  /* Return how many segs we'd like on a TSO packet,
- * to send one TSO packet per ms
+ * depending on current pacing rate, and how close the peer is.
+ *
+ * Rationale is:
+ * - For close peers, we rather send bigger packets to reduce
+ *   cpu costs, because occasional losses will be repaired fast.
+ * - For long distance/rtt flows, we would like to get ACK clocking
+ *   with 1 ACK per ms.
+ *
+ * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
   */
  static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
                             int min_tso_segs)
  {
-       u32 bytes, segs;
+       unsigned long bytes;
+       u32 r;
  
-       bytes = min_t(unsigned long,
-                     sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
-                     sk->sk_gso_max_size);
+       bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
  
-       /* Goal is to send at least one packet per ms,
-        * not one big TSO packet every 100 ms.
-        * This preserves ACK clocking and is consistent
-        * with tcp_tso_should_defer() heuristic.
-        */
-       segs = max_t(u32, bytes / mss_now, min_tso_segs);
+       r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log;
+       if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
+               bytes += sk->sk_gso_max_size >> r;
+
+       bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
  
-       return segs;
+       return max_t(u32, bytes / mss_now, min_tso_segs);
  }
  
  /* Return the number of segments we want in the skb we are transmitting.
author	Eric Dumazet <edumazet@google.com>
	Wed, 9 Mar 2022 01:57:57 +0000 (17:57 -0800)
committer	Jakub Kicinski <kuba@kernel.org>
	Thu, 10 Mar 2022 04:05:44 +0000 (20:05 -0800)
Documentation/networking/ip-sysctl.rst		patch \| blob \| history
include/net/netns/ipv4.h		patch \| blob \| history
net/ipv4/sysctl_net_ipv4.c		patch \| blob \| history
net/ipv4/tcp_ipv4.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history