bpf, sockmap: Allow skipping sk_skb parser program
authorJohn Fastabend <john.fastabend@gmail.com>
Sun, 11 Oct 2020 05:09:38 +0000 (22:09 -0700)
committerAlexei Starovoitov <ast@kernel.org>
Mon, 12 Oct 2020 01:09:44 +0000 (18:09 -0700)
Currently, we often run with a nop parser, namely one that just does
this: 'return skb->len'. This happens when either our verdict program
can handle streaming data or it is only looking at socket data such
as IP addresses and other metadata associated with the flow. The second
case is common for an L3/L4 proxy, for instance.

So let's allow loading programs without the parser; then we can skip
the stream parser logic and avoid having to add a BPF program that
is effectively a nop.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower
include/linux/skmsg.h
net/core/skmsg.c
net/core/sock_map.c

index 3119928..fec0c5a 100644 (file)
@@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node);
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock);
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
 
 int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
                         struct sk_msg *msg);
index 881a5b2..654182e 100644 (file)
@@ -627,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
        rcu_assign_sk_user_data(sk, NULL);
        if (psock->progs.skb_parser)
                sk_psock_stop_strp(sk, psock);
+       else if (psock->progs.skb_verdict)
+               sk_psock_stop_verdict(sk, psock);
        write_unlock_bh(&sk->sk_callback_lock);
        sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 
@@ -871,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk)
        rcu_read_unlock();
 }
 
+static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
+                                unsigned int offset, size_t orig_len)
+{
+       struct sock *sk = (struct sock *)desc->arg.data;
+       struct sk_psock *psock;
+       struct bpf_prog *prog;
+       int ret = __SK_DROP;
+       int len = skb->len;
+
+       /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
+       skb = skb_clone(skb, GFP_ATOMIC);
+       if (!skb) {
+               desc->error = -ENOMEM;
+               return 0;
+       }
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (unlikely(!psock)) {
+               len = 0;
+               kfree_skb(skb);
+               goto out;
+       }
+       skb_set_owner_r(skb, sk);
+       prog = READ_ONCE(psock->progs.skb_verdict);
+       if (likely(prog)) {
+               tcp_skb_bpf_redirect_clear(skb);
+               ret = sk_psock_bpf_run(psock, prog, skb);
+               ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+       }
+       sk_psock_verdict_apply(psock, skb, ret);
+out:
+       rcu_read_unlock();
+       return len;
+}
+
+static void sk_psock_verdict_data_ready(struct sock *sk)
+{
+       struct socket *sock = sk->sk_socket;
+       read_descriptor_t desc;
+
+       if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+               return;
+
+       desc.arg.data = sk;
+       desc.error = 0;
+       desc.count = 1;
+
+       sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
+}
+
 static void sk_psock_write_space(struct sock *sk)
 {
        struct sk_psock *psock;
@@ -900,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
        return strp_init(&psock->parser.strp, sk, &cb);
 }
 
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_parser *parser = &psock->parser;
+
+       if (parser->enabled)
+               return;
+
+       parser->saved_data_ready = sk->sk_data_ready;
+       sk->sk_data_ready = sk_psock_verdict_data_ready;
+       sk->sk_write_space = sk_psock_write_space;
+       parser->enabled = true;
+}
+
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
 {
        struct sk_psock_parser *parser = &psock->parser;
@@ -925,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
        strp_stop(&parser->strp);
        parser->enabled = false;
 }
+
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_parser *parser = &psock->parser;
+
+       if (!parser->enabled)
+               return;
+
+       sk->sk_data_ready = parser->saved_data_ready;
+       parser->saved_data_ready = NULL;
+       parser->enabled = false;
+}
index a2ed5b6..df09c39 100644 (file)
@@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock,
 static void sock_map_del_link(struct sock *sk,
                              struct sk_psock *psock, void *link_raw)
 {
+       bool strp_stop = false, verdict_stop = false;
        struct sk_psock_link *link, *tmp;
-       bool strp_stop = false;
 
        spin_lock_bh(&psock->link_lock);
        list_for_each_entry_safe(link, tmp, &psock->link, list) {
@@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk,
                                                             map);
                        if (psock->parser.enabled && stab->progs.skb_parser)
                                strp_stop = true;
+                       if (psock->parser.enabled && stab->progs.skb_verdict)
+                               verdict_stop = true;
                        list_del(&link->list);
                        sk_psock_free_link(link);
                }
        }
        spin_unlock_bh(&psock->link_lock);
-       if (strp_stop) {
+       if (strp_stop || verdict_stop) {
                write_lock_bh(&sk->sk_callback_lock);
-               sk_psock_stop_strp(sk, psock);
+               if (strp_stop)
+                       sk_psock_stop_strp(sk, psock);
+               else
+                       sk_psock_stop_verdict(sk, psock);
                write_unlock_bh(&sk->sk_callback_lock);
        }
 }
@@ -288,16 +293,19 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
        write_lock_bh(&sk->sk_callback_lock);
        if (skb_parser && skb_verdict && !psock->parser.enabled) {
                ret = sk_psock_init_strp(sk, psock);
-               if (ret) {
-                       write_unlock_bh(&sk->sk_callback_lock);
-                       goto out_drop;
-               }
+               if (ret)
+                       goto out_unlock_drop;
                psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
                psock_set_prog(&psock->progs.skb_parser, skb_parser);
                sk_psock_start_strp(sk, psock);
+       } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
+               psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+               sk_psock_start_verdict(sk,psock);
        }
        write_unlock_bh(&sk->sk_callback_lock);
        return 0;
+out_unlock_drop:
+       write_unlock_bh(&sk->sk_callback_lock);
 out_drop:
        sk_psock_put(sk, psock);
 out_progs: