Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.

The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
    sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
    bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
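
To make the offset mismatch concrete, a tc classifier sees the frame
from L2 onwards (an illustrative sketch only, not part of this patch;
the usual data_end bounds checks are elided):

    /* tc ingress: skb->data points at the Ethernet header */
    struct ethhdr *eth = (void *)(long)skb->data;
    struct iphdr *iph = (void *)(eth + 1);  /* L3 only starts here */
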
Instead of fixing the lookup helpers, we execute the SK_REUSEPORT
program when the assigned socket is pulled out of the skb, further up
the stack. This creates some trickiness with regard to refcounting:
bpf_sk_assign will put both refcounted and RCU-freed sockets in
skb->sk, and reuseport sockets are RCU-freed. We can infer that the
sk_assigned socket is RCU-freed if the reuseport lookup succeeds, but
convincing yourself of this fact isn't straightforward. Therefore we
defensively check refcounting on the sk_assign sock even though it's
probably not required in practice.
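
With the restriction lifted, a tc ingress program can combine socket
lookup with bpf_sk_assign roughly as below. This is a minimal sketch,
not part of this patch: the section name, address and port are made-up
placeholders, and error handling is reduced to the essentials.

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("tc")
    int steer_to_reuseport(struct __sk_buff *skb)
    {
            /* Destination of the steered traffic, e.g. an Envoy
             * listener; both values are placeholders.
             */
            struct bpf_sock_tuple tuple = {
                    .ipv4.daddr = bpf_htonl(0x7f000001),
                    .ipv4.dport = bpf_htons(8000),
            };
            struct bpf_sock *sk;
            long err;

            /* Picks one socket of the reuseport group by hash. Any
             * attached SK_REUSEPORT program now runs later, when the
             * stack steals the assigned socket from the skb.
             */
            sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                                   BPF_F_CURRENT_NETNS, 0);
            if (!sk)
                    return TC_ACT_OK;

            /* Would have failed with -ESOCKTNOSUPPORT for reuseport
             * sockets before this patch.
             */
            err = bpf_sk_assign(skb, sk, 0);
            bpf_sk_release(sk);
            return err ? TC_ACT_SHOT : TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";
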
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
                                     daddr, hnum, dif, sdif);
 }
 
+static inline
+struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
+                             const struct in6_addr *saddr, const __be16 sport,
+                             const struct in6_addr *daddr, const __be16 dport,
+                             bool *refcounted, inet6_ehashfn_t *ehashfn)
+{
+       struct sock *sk, *reuse_sk;
+       bool prefetched;
+
+       sk = skb_steal_sock(skb, refcounted, &prefetched);
+       if (!sk)
+               return NULL;
+
+       if (!prefetched)
+               return sk;
+
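+       /* A reuseport lookup is only meaningful for TCP listeners and
+        * unconnected UDP sockets. UDP reuses TCP's state constants,
+        * so TCP_CLOSE means unconnected here.
+        */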
+       if (sk->sk_protocol == IPPROTO_TCP) {
+               if (sk->sk_state != TCP_LISTEN)
+                       return sk;
+       } else if (sk->sk_protocol == IPPROTO_UDP) {
+               if (sk->sk_state != TCP_CLOSE)
+                       return sk;
+       } else {
+               return sk;
+       }
+
+       reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff,
+                                         saddr, sport, daddr, ntohs(dport),
+                                         ehashfn);
+       if (!reuse_sk)
+               return sk;
+
+       /* We've chosen a new reuseport sock which is never refcounted. This
+        * implies that sk also isn't refcounted.
+        */
+       WARN_ON_ONCE(*refcounted);
+
+       return reuse_sk;
+}
+
 static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
                                              struct sk_buff *skb, int doff,
                                              const __be16 sport,
                                              const __be16 dport,
                                              int iif, int sdif,
                                              bool *refcounted)
 {
-       struct sock *sk = skb_steal_sock(skb, refcounted);
-
+       struct net *net = dev_net(skb_dst(skb)->dev);
+       const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+       struct sock *sk;
+
+       sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport,
+                             refcounted, inet6_ehashfn);
+       if (IS_ERR(sk))
+               return NULL;
        if (sk)
                return sk;
 
-       return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
-                             doff, &ipv6_hdr(skb)->saddr, sport,
-                             &ipv6_hdr(skb)->daddr, ntohs(dport),
+       return __inet6_lookup(net, hashinfo, skb,
+                             doff, &ip6h->saddr, sport,
+                             &ip6h->daddr, ntohs(dport),
                              iif, sdif, refcounted);
 }
 
 
        return sk;
 }
 
+static inline
+struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff,
+                            const __be32 saddr, const __be16 sport,
+                            const __be32 daddr, const __be16 dport,
+                            bool *refcounted, inet_ehashfn_t *ehashfn)
+{
+       struct sock *sk, *reuse_sk;
+       bool prefetched;
+
+       sk = skb_steal_sock(skb, refcounted, &prefetched);
+       if (!sk)
+               return NULL;
+
+       if (!prefetched)
+               return sk;
+
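+       /* A reuseport lookup is only meaningful for TCP listeners and
+        * unconnected UDP sockets. UDP reuses TCP's state constants,
+        * so TCP_CLOSE means unconnected here.
+        */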
+       if (sk->sk_protocol == IPPROTO_TCP) {
+               if (sk->sk_state != TCP_LISTEN)
+                       return sk;
+       } else if (sk->sk_protocol == IPPROTO_UDP) {
+               if (sk->sk_state != TCP_CLOSE)
+                       return sk;
+       } else {
+               return sk;
+       }
+
+       reuse_sk = inet_lookup_reuseport(net, sk, skb, doff,
+                                        saddr, sport, daddr, ntohs(dport),
+                                        ehashfn);
+       if (!reuse_sk)
+               return sk;
+
+       /* We've chosen a new reuseport sock which is never refcounted. This
+        * implies that sk also isn't refcounted.
+        */
+       WARN_ON_ONCE(*refcounted);
+
+       return reuse_sk;
+}
+
 static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
                                             struct sk_buff *skb,
                                             int doff,
                                             const __be16 sport,
                                             const __be16 dport,
                                             const int sdif,
                                             bool *refcounted)
 {
-       struct sock *sk = skb_steal_sock(skb, refcounted);
+       struct net *net = dev_net(skb_dst(skb)->dev);
        const struct iphdr *iph = ip_hdr(skb);
+       struct sock *sk;
 
+       sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport,
+                            refcounted, inet_ehashfn);
+       if (IS_ERR(sk))
+               return NULL;
        if (sk)
                return sk;
 
-       return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
+       return __inet_lookup(net, hashinfo, skb,
                             doff, iph->saddr, sport,
                             iph->daddr, dport, inet_iif(skb), sdif,
                             refcounted);
 
  * skb_steal_sock - steal a socket from an sk_buff
  * @skb: sk_buff to steal the socket from
  * @refcounted: is set to true if the socket is reference-counted
+ * @prefetched: is set to true if the socket was assigned from bpf
  */
 static inline struct sock *
-skb_steal_sock(struct sk_buff *skb, bool *refcounted)
+skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched)
 {
        if (skb->sk) {
                struct sock *sk = skb->sk;
 
                *refcounted = true;
-               if (skb_sk_is_prefetched(skb))
+               *prefetched = skb_sk_is_prefetched(skb);
+               if (*prefetched)
                        *refcounted = sk_is_refcounted(sk);
                skb->destructor = NULL;
                skb->sk = NULL;
                return sk;
        }
+       *prefetched = false;
        *refcounted = false;
        return NULL;
 }
 
  *             **-EOPNOTSUPP** if the operation is not supported, for example
  *             a call from outside of TC ingress.
  *
- *             **-ESOCKTNOSUPPORT** if the socket type is not supported
- *             (reuseport).
- *
  * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
  *     Description
  *             Helper is overloaded depending on BPF program type. This
 
                return -EOPNOTSUPP;
        if (unlikely(dev_net(skb->dev) != sock_net(sk)))
                return -ENETUNREACH;
-       if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
-               return -ESOCKTNOSUPPORT;
        if (sk_unhashed(sk))
                return -EOPNOTSUPP;
        if (sk_is_refcounted(sk) &&
 
        if (udp4_csum_init(skb, uh, proto))
                goto csum_error;
 
-       sk = skb_steal_sock(skb, &refcounted);
+       sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+                            &refcounted, udp_ehashfn);
+       if (IS_ERR(sk))
+               goto no_sk;
+
        if (sk) {
                struct dst_entry *dst = skb_dst(skb);
                int ret;
        sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
        if (sk)
                return udp_unicast_rcv_skb(sk, skb, uh);
-
+no_sk:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto drop;
        nf_reset_ct(skb);
 
                goto csum_error;
 
        /* Check if the socket is already available, e.g. due to early demux */
-       sk = skb_steal_sock(skb, &refcounted);
+       sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+                             &refcounted, udp6_ehashfn);
+       if (IS_ERR(sk))
+               goto no_sk;
+
        if (sk) {
                struct dst_entry *dst = skb_dst(skb);
                int ret;
                        goto report_csum_error;
                return udp6_unicast_rcv_skb(sk, skb, uh);
        }
-
+no_sk:
        reason = SKB_DROP_REASON_NO_SOCKET;
 
        if (!uh->check)
 
  *             **-EOPNOTSUPP** if the operation is not supported, for example
  *             a call from outside of TC ingress.
  *
- *             **-ESOCKTNOSUPPORT** if the socket type is not supported
- *             (reuseport).
- *
  * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
  *     Description
  *             Helper is overloaded depending on BPF program type. This