tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK.
author		Kuniyuki Iwashima <kuniyu@amazon.co.jp>
		Sat, 12 Jun 2021 12:32:20 +0000 (21:32 +0900)
committer	Daniel Borkmann <daniel@iogearbox.net>
		Tue, 15 Jun 2021 16:01:06 +0000 (18:01 +0200)
This patch also changes the code to call reuseport_migrate_sock() and
inet_reqsk_clone(), but unlike the other cases, we do not call
inet_reqsk_clone() right after reuseport_migrate_sock().

Currently, in the receive path for TCP_NEW_SYN_RECV sockets, the listener
has three kinds of refcnt:

  (A) for listener itself
  (B) carried by request_sock
  (C) sock_hold() in tcp_v[46]_rcv()

While the req is being processed, (A) may disappear if the listener is
close()d. Also, once we put the req into the accept queue, (B) can
disappear via accept(listener). So, we have to hold another refcnt (C)
on the listener to prevent use-after-free.
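
To make these lifetime rules concrete, below is a minimal userspace model
of the three refcnts. It is an illustrative sketch only: the struct and
helpers are invented for this example, whereas the kernel uses struct sock,
refcount_t and sock_hold()/sock_put().

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct listener {
	int refcnt;
};

static void hold(struct listener *l, const char *why)
{
	l->refcnt++;
	printf("hold (%s): refcnt=%d\n", why, l->refcnt);
}

static void put(struct listener *l, const char *why)
{
	printf("put  (%s): refcnt=%d\n", why, l->refcnt - 1);
	if (--l->refcnt == 0) {
		free(l);	/* last ref gone: the listener is freed */
		printf("listener freed\n");
	}
}

int main(void)
{
	struct listener *l = malloc(sizeof(*l));

	assert(l);
	l->refcnt = 1;			/* (A) the listener itself */

	hold(l, "(B) request_sock");	/* req->rsk_listener */
	hold(l, "(C) tcp_v4_rcv");	/* sock_hold() in the rcv path */

	/* While the req is processed, (A) and (B) may go away ... */
	put(l, "(A) close(listener)");
	put(l, "(B) accept(listener)");

	/* ... but (C) keeps the listener alive until the rcv path is done. */
	put(l, "(C) end of tcp_v4_rcv");
	return 0;
}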

For socket migration, we call reuseport_migrate_sock() to select a new
listener that still has (A) and to increment that listener's refcnt in
tcp_v[46]_rcv(). This refcnt corresponds to (C) and is cleaned up later
in tcp_v[46]_rcv(). Thus, we have to take another refcnt (B) for the
newly cloned request_sock.

In inet_csk_complete_hashdance(), we hold refcnt (B), clone the req, and
try to put the new req into the accept queue. By migrating the req only
after winning the "own_req" race, we avoid the following worst-case
scenario (a toy model of the race follows the list):

  CPU 1 looks up req1
  CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
  CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
  ...
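
The sketch below is a standalone toy, not the kernel's implementation:
several "CPUs" find the same req, but only the one that wins an atomic
race owns it, so only the winner pays for the migration. In the kernel
the race is decided when the child socket is hashed (own_req from
inet_ehash_nolisten()); all names in this sketch are invented.

/* build: cc -pthread own_req_race.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int owner = -1;	/* -1: nobody owns the req yet */

static void *rcv_path(void *arg)
{
	int cpu = *(int *)arg;
	int expected = -1;

	/* Each "CPU" finds the same req and races to own it. */
	if (atomic_compare_exchange_strong(&owner, &expected, cpu)) {
		/* own_req == true: only now is it safe to clone and
		 * migrate the req, so losers waste no work.
		 */
		printf("CPU %d won the own_req race and migrates the req\n", cpu);
	} else {
		printf("CPU %d lost the own_req race to CPU %d\n", cpu, expected);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	int cpu[NCPUS];

	for (int i = 0; i < NCPUS; i++) {
		cpu[i] = i;
		pthread_create(&tid[i], NULL, rcv_path, &cpu[i]);
	}
	for (int i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

If migration happened before the race instead, every losing CPU would
have cloned a req and taken refcnts only to throw them away, which is
the cascade sketched in the list above.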

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210612123224.12525-8-kuniyu@amazon.co.jp
net/ipv4/inet_connection_sock.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv6/tcp_ipv6.c

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f4b771e45ac11bc86fd6c55e909be3355d071021..0eea878edc304dcceef730654535133d597fdbae 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1114,12 +1114,40 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
                                         struct request_sock *req, bool own_req)
 {
        if (own_req) {
-               inet_csk_reqsk_queue_drop(sk, req);
-               reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-               if (inet_csk_reqsk_queue_add(sk, req, child))
+               inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+               reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+               if (sk != req->rsk_listener) {
+                       /* another listening sk has been selected,
+                        * migrate the req to it.
+                        */
+                       struct request_sock *nreq;
+
+                       /* hold a refcnt for the nreq->rsk_listener
+                        * which is assigned in inet_reqsk_clone()
+                        */
+                       sock_hold(sk);
+                       nreq = inet_reqsk_clone(req, sk);
+                       if (!nreq) {
+                               inet_child_forget(sk, req, child);
+                               goto child_put;
+                       }
+
+                       refcount_set(&nreq->rsk_refcnt, 1);
+                       if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+                               reqsk_migrate_reset(req);
+                               reqsk_put(req);
+                               return child;
+                       }
+
+                       reqsk_migrate_reset(nreq);
+                       __reqsk_free(nreq);
+               } else if (inet_csk_reqsk_queue_add(sk, req, child)) {
                        return child;
+               }
        }
        /* Too bad, another child took ownership of the request, undo. */
+child_put:
        bh_unlock_sock(child);
        sock_put(child);
        return NULL;
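
Note the bookkeeping in this hunk: inet_csk_reqsk_queue_drop() and
reqsk_queue_removed() now operate on req->rsk_listener rather than sk,
because sk may already be the new listener selected by
reuseport_migrate_sock() in tcp_v[46]_rcv() while the req is still charged
to the original one. The cloned req starts with rsk_refcnt 1, which is the
(B) refcnt; on success the old req is released via reqsk_migrate_reset()
and reqsk_put(), and if the clone cannot be queued, it is freed via
reqsk_migrate_reset() and __reqsk_free() and the child is put.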
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f5b68a90be96bdc4be0753007a0fcea50044fd3..6cb8e269f1ab4b7e5e34332546efa1817292aba9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2002,13 +2002,21 @@ process:
                        goto csum_error;
                }
                if (unlikely(sk->sk_state != TCP_LISTEN)) {
-                       inet_csk_reqsk_queue_drop_and_put(sk, req);
-                       goto lookup;
+                       nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+                       if (!nsk) {
+                               inet_csk_reqsk_queue_drop_and_put(sk, req);
+                               goto lookup;
+                       }
+                       sk = nsk;
+                       /* reuseport_migrate_sock() has already held one sk_refcnt
+                        * before returning.
+                        */
+               } else {
+                       /* We own a reference on the listener, increase it again
+                        * as we might lose it too soon.
+                        */
+                       sock_hold(sk);
                }
-               /* We own a reference on the listener, increase it again
-                * as we might lose it too soon.
-                */
-               sock_hold(sk);
                refcounted = true;
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
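
In the receive path, migration replaces the old drop-and-retry behaviour:
when the listener found by the req lookup is no longer TCP_LISTEN (i.e. it
is being closed), reuseport_migrate_sock() tries to pick another listener
from the same reuseport group, and only if that fails do we drop the req
and redo the lookup as before. The refcnt that reuseport_migrate_sock()
already holds on the new listener takes the place of the sock_hold() on
the non-migrated path, so refcounted = true is correct in both branches.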
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7513ba45553dba4de8fc9b85b2c18874f1ecbe22..f258a4c0da716d5f0f5c44664780bd640568ce8b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -775,8 +775,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                goto listen_overflow;
 
        if (own_req && rsk_drop_req(req)) {
-               reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-               inet_csk_reqsk_queue_drop_and_put(sk, req);
+               reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+               inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
                return child;
        }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4435fa342e7aa756bd426ffe051ade7d84ac5523..4d71464094b3111f56854f51d120f596eaf699c3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1664,10 +1664,18 @@ process:
                        goto csum_error;
                }
                if (unlikely(sk->sk_state != TCP_LISTEN)) {
-                       inet_csk_reqsk_queue_drop_and_put(sk, req);
-                       goto lookup;
+                       nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+                       if (!nsk) {
+                               inet_csk_reqsk_queue_drop_and_put(sk, req);
+                               goto lookup;
+                       }
+                       sk = nsk;
+                       /* reuseport_migrate_sock() has already held one sk_refcnt
+                        * before returning.
+                        */
+               } else {
+                       sock_hold(sk);
                }
-               sock_hold(sk);
                refcounted = true;
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
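
The IPv6 receive path gets the same treatment as the IPv4 one above: try
reuseport_migrate_sock() first when the listener is closing, and fall back
to dropping the req and redoing the lookup only if no new listener is
available.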