tcp: internal implementation for pacing

author Eric Dumazet <edumazet@google.com>

Tue, 16 May 2017 11:24:36 +0000 (04:24 -0700)

committer David S. Miller <davem@davemloft.net>

Tue, 16 May 2017 19:43:31 +0000 (15:43 -0400)
author Eric Dumazet <edumazet@google.com>
Tue, 16 May 2017 11:24:36 +0000 (04:24 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 16 May 2017 19:43:31 +0000 (15:43 -0400)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h

index b6d5adc..22854f0 100644 (file)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -293,6 +293,8 @@ struct tcp_sock {
         u32     sacked_out;     /* SACK'd packets                       */
         u32     fackets_out;    /* FACK'd packets                       */
  
+       struct hrtimer  pacing_timer;
+
         /* from STCP, retrans queue hinting */
         struct sk_buff* lost_skb_hint;
         struct sk_buff *retransmit_skb_hint;
diff --git a/include/net/sock.h b/include/net/sock.h

index 4226403..3467d9e 100644 (file)
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -253,6 +253,7 @@ struct sock_common {
    *    @sk_ll_usec: usecs to busypoll when there is no data
    *    @sk_allocation: allocation mode
    *    @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
+  *    @sk_pacing_status: Pacing status (requested, handled by sch_fq)
    *    @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
    *    @sk_sndbuf: size of send buffer in bytes
    *    @sk_padding: unused element for alignment
@@ -396,7 +397,7 @@ struct sock {
         __s32                   sk_peek_off;
         int                     sk_write_pending;
         __u32                   sk_dst_pending_confirm;
-       /* Note: 32bit hole on 64bit arches */
+       u32                     sk_pacing_status; /* see enum sk_pacing */
         long                    sk_sndtimeo;
         struct timer_list       sk_timer;
         __u32                   sk_priority;
@@ -475,6 +476,12 @@ struct sock {
         struct rcu_head         sk_rcu;
  };
  
+enum sk_pacing {
+       SK_PACING_NONE          = 0, /* pacing not requested */
+       SK_PACING_NEEDED        = 1, /* requested; TCP paces internally */
+       SK_PACING_FQ            = 2, /* pacing is handled by sch_fq */
+};
+
  #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
  
  #define rcu_dereference_sk_user_data(sk)       rcu_dereference(__sk_user_data((sk)))
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 38a7427..b4dc93d 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -574,6 +574,7 @@ void tcp_fin(struct sock *sk);
  void tcp_init_xmit_timers(struct sock *);
  static inline void tcp_clear_xmit_timers(struct sock *sk)
  {
+       hrtimer_cancel(&tcp_sk(sk)->pacing_timer); /* stop pacing hrtimer too */
         inet_csk_clear_xmit_timers(sk);
  }
  
@@ -1945,4 +1946,6 @@ static inline void tcp_listendrop(const struct sock *sk)
         __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
  }
  
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
+
  #endif /* _TCP_H */
diff --git a/net/core/sock.c b/net/core/sock.c

index e43e71d..93d011e 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1041,6 +1041,10 @@ set_rcvbuf:
  #endif
  
         case SO_MAX_PACING_RATE:
+               if (val != ~0U)
+                       cmpxchg(&sk->sk_pacing_status,
+                               SK_PACING_NONE,
+                               SK_PACING_NEEDED);
                 sk->sk_max_pacing_rate = val;
                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
                                          sk->sk_max_pacing_rate);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c

index b89bce4..92b045c 100644 (file)
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -52,10 +52,9 @@
   * There is a public e-mail list for discussing BBR development and testing:
   *   https://groups.google.com/forum/#!forum/bbr-dev
   *
- * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
- * since pacing is integral to the BBR design and implementation.
- * BBR without pacing would not function properly, and may incur unnecessary
- * high packet loss rates.
+ * NOTE: BBR may be used with the fq qdisc ("man tc-fq") with pacing enabled;
+ * otherwise the TCP stack falls back to internal pacing using one high
+ * resolution timer per TCP socket, which may use more resources.
   */
  #include <linux/module.h>
  #include <net/tcp.h>
@@ -830,6 +829,8 @@ static void bbr_init(struct sock *sk)
         bbr->cycle_idx = 0;
         bbr_reset_lt_bw_sampling(sk);
         bbr_reset_startup_mode(sk);
+
+       cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
  }
  
  static u32 bbr_sndbuf_expand(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 4858e19..a32172d 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -904,6 +904,72 @@ out:
         sk_free(sk);
  }
  
+/* hrtimer callback for the per-socket pacing timer. Called under hard irq,
+ * so we cannot call the TCP stack right away: queue to the TSQ tasklet.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+       struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+       struct sock *sk = (struct sock *)tp;
+       unsigned long nval, oval;
+
+       for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+               struct tsq_tasklet *tsq;
+               bool empty;
+
+               if (oval & TSQF_QUEUED) /* already queued: nothing to do */
+                       break;
+
+               nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+               nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+               if (nval != oval) /* flags changed under us: retry */
+                       continue;
+
+               if (!atomic_inc_not_zero(&sk->sk_wmem_alloc))
+                       break; /* sk_wmem_alloc is already 0: do not queue */
+               /* queue this socket to tasklet queue */
+               tsq = this_cpu_ptr(&tsq_tasklet);
+               empty = list_empty(&tsq->head);
+               list_add(&tp->tsq_node, &tsq->head);
+               if (empty) /* list was empty before our add */
+                       tasklet_schedule(&tsq->tasklet);
+               break;
+       }
+       return HRTIMER_NORESTART; /* one-shot; tcp_internal_pacing() re-arms */
+}
+
+/* BBR congestion control and SO_MAX_PACING_RATE both need pacing.
+ * sch_fq packet scheduler is efficiently handling pacing,
+ * but is not always installed/used.
+ * Return true if TCP stack should pace packets itself: pacing was
+ * requested (SK_PACING_NEEDED) and sch_fq has not taken over (SK_PACING_FQ).
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+       return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+       u64 len_ns;
+       u32 rate;
+
+       if (!tcp_needs_internal_pacing(sk))
+               return;
+       rate = sk->sk_pacing_rate;
+       if (!rate || rate == ~0U) /* no usable rate; also guards do_div() */
+               return;
+
+       /* Arm the pacing timer len_ns from now: the time skb->len bytes take
+        * at 'rate' (assumed bytes/sec). Unlike sch_fq, headers are ignored.
+        */
+       len_ns = (u64)skb->len * NSEC_PER_SEC;
+       do_div(len_ns, rate);
+       hrtimer_start(&tcp_sk(sk)->pacing_timer,
+                     ktime_add_ns(ktime_get(), len_ns),
+                     HRTIMER_MODE_ABS_PINNED);
+}
+
  /* This routine actually transmits TCP packets queued in by
   * tcp_do_sendmsg().  This is used by both the initial
   * transmission and possible later retransmissions.
@@ -1034,6 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         if (skb->len != tcp_header_size) {
                 tcp_event_data_sent(tp, sk);
                 tp->data_segs_out += tcp_skb_pcount(skb);
+               tcp_internal_pacing(sk, skb);
         }
  
         if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -2086,6 +2153,12 @@ static int tcp_mtu_probe(struct sock *sk)
         return -1;
  }
  
+static bool tcp_pacing_check(const struct sock *sk)
+{
+       return tcp_needs_internal_pacing(sk) &&
+              hrtimer_active(&tcp_sk(sk)->pacing_timer); /* armed: hold off */
+}
+
  /* TCP Small Queues :
   * Control number of packets in qdisc/devices to two packets / or ~1 ms.
   * (These limits are doubled for retransmits)
@@ -2210,6 +2283,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
         while ((skb = tcp_send_head(sk))) {
                 unsigned int limit;
  
+               if (tcp_pacing_check(sk))
+                       break;
+
                 tso_segs = tcp_init_tso_segs(skb, mss_now);
                 BUG_ON(!tso_segs);
  
@@ -2878,6 +2954,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
  
                 if (skb == tcp_send_head(sk))
                         break;
+
+               if (tcp_pacing_check(sk))
+                       break;
+
                 /* we could do better than to assign each time */
                 if (!hole)
                         tp->retransmit_skb_hint = skb;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c

index 1467254..86934bc 100644 (file)
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -710,4 +710,7 @@ void tcp_init_xmit_timers(struct sock *sk)
  {
         inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
                                   &tcp_keepalive_timer);
+       hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_ABS_PINNED);
+       tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
  }
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c

index b488721..147fde7 100644 (file)
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -390,9 +390,17 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                 q->stat_tcp_retrans++;
         qdisc_qstats_backlog_inc(sch, skb);
         if (fq_flow_is_detached(f)) {
+               struct sock *sk = skb->sk;
+
                 fq_flow_add_tail(&q->new_flows, f);
                 if (time_after(jiffies, f->age + q->flow_refill_delay))
                         f->credit = max_t(u32, f->credit, q->quantum);
+               if (sk && q->rate_enable) {
+                       if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
+                                    SK_PACING_FQ))
+                               smp_store_release(&sk->sk_pacing_status,
+                                                 SK_PACING_FQ);
+               }
                 q->inactive_flows--;
         }
author	Eric Dumazet <edumazet@google.com>
	Tue, 16 May 2017 11:24:36 +0000 (04:24 -0700)
committer	David S. Miller <davem@davemloft.net>
	Tue, 16 May 2017 19:43:31 +0000 (15:43 -0400)
include/linux/tcp.h		patch \| blob \| history
include/net/sock.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/core/sock.c		patch \| blob \| history
net/ipv4/tcp_bbr.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history
net/ipv4/tcp_timer.c		patch \| blob \| history
net/sched/sch_fq.c		patch \| blob \| history