rxrpc: Fix congestion management
author David Howells <dhowells@redhat.com>
Mon, 3 Oct 2022 17:49:11 +0000 (18:49 +0100)
committer David Howells <dhowells@redhat.com>
Tue, 8 Nov 2022 16:42:28 +0000 (16:42 +0000)
rxrpc has a problem in its congestion management in that it saves the
congestion window size (cwnd) from one call to another, but if this is 0 at
the time it is saved, then the next call may never actually manage to
transmit anything.

To this end:

 (1) Don't save cwnd between calls, but rather reset back down to the
     initial cwnd and re-enter slow-start if data transmission is idle for
     more than an RTT.

 (2) Preserve ssthresh instead, as that is a handy estimate of pipe
     capacity.  Knowing roughly when to stop slow start and enter
     congestion avoidance can reduce the tendency to overshoot and drop
     larger amounts of packets when probing.

In future, cwnd growth also needs to be constrained when the window isn't
being filled due to being application limited.

Reported-by: Simon Wilkinson <sxw@auristor.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org

include/trace/events/rxrpc.h
net/rxrpc/ar-internal.h
net/rxrpc/call_accept.c
net/rxrpc/call_object.c
net/rxrpc/conn_client.c
net/rxrpc/conn_object.c
net/rxrpc/input.c
net/rxrpc/output.c
net/rxrpc/peer_object.c
net/rxrpc/proc.c
net/rxrpc/sendmsg.c

index a11de55c3c14aae16648169d980a9152af38b3b3..b9886d1df825ce2bf26f1b0c0ff742a79460dcb5 100644 (file)
        EM(rxrpc_cong_new_low_nack,             " NewLowN") \
        EM(rxrpc_cong_no_change,                " -") \
        EM(rxrpc_cong_progress,                 " Progres") \
+       EM(rxrpc_cong_idle_reset,               " IdleRes") \
        EM(rxrpc_cong_retransmit_again,         " ReTxAgn") \
        EM(rxrpc_cong_rtt_window_end,           " RttWinE") \
        E_(rxrpc_cong_saw_nack,                 " SawNack")
index 775eb91aabb2d144efd166dbcc5fba0cb0452a5c..6bbe28ecf583a9d3a05a3b072aca71ad55f4e1cf 100644 (file)
@@ -332,7 +332,7 @@ struct rxrpc_peer {
        u32                     rto_j;          /* Retransmission timeout in jiffies */
        u8                      backoff;        /* Backoff timeout */
 
-       u8                      cong_cwnd;      /* Congestion window size */
+       u8                      cong_ssthresh;  /* Congestion slow-start threshold */
 };
 
 /*
@@ -626,6 +626,7 @@ struct rxrpc_call {
        u16                     tx_backoff;     /* Delay to insert due to Tx failure */
        u8                      tx_winsize;     /* Maximum size of Tx window */
 #define RXRPC_TX_MAX_WINDOW    128
+       ktime_t                 tx_last_sent;   /* Last time a transmission occurred */
 
        /* Received data tracking */
        struct sk_buff_head     recvmsg_queue;  /* Queue of packets ready for recvmsg() */
@@ -687,10 +688,10 @@ struct rxrpc_call {
  * Summary of a new ACK and the changes it made to the Tx buffer packet states.
  */
 struct rxrpc_ack_summary {
+       u16                     nr_acks;                /* Number of ACKs in packet */
+       u16                     nr_new_acks;            /* Number of new ACKs in packet */
+       u16                     nr_rot_new_acks;        /* Number of rotated new ACKs */
        u8                      ack_reason;
-       u8                      nr_acks;                /* Number of ACKs in packet */
-       u8                      nr_new_acks;            /* Number of new ACKs in packet */
-       u8                      nr_rot_new_acks;        /* Number of rotated new ACKs */
        bool                    saw_nacks;              /* Saw NACKs in packet */
        bool                    new_low_nack;           /* T if new low NACK found */
        bool                    retrans_timeo;          /* T if reTx due to timeout happened */
index d8db277d5ebec14e8484b6f3183cad3904976a1f..48790ee770192806204288e5475a2cbf27f836d8 100644 (file)
@@ -324,7 +324,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
        call->security = conn->security;
        call->security_ix = conn->security_ix;
        call->peer = rxrpc_get_peer(conn->params.peer);
-       call->cong_cwnd = call->peer->cong_cwnd;
+       call->cong_ssthresh = call->peer->cong_ssthresh;
+       call->tx_last_sent = ktime_get_real();
        return call;
 }
 
index aa19daaa487befeab0037ccfb741ed5130ef99ad..1befe22cd3017245cd5dfcfa268bbc707d7680ba 100644 (file)
@@ -166,7 +166,12 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
        call->rx_winsize = rxrpc_rx_window_size;
        call->tx_winsize = 16;
 
-       call->cong_cwnd = 2;
+       if (RXRPC_TX_SMSS > 2190)
+               call->cong_cwnd = 2;
+       else if (RXRPC_TX_SMSS > 1095)
+               call->cong_cwnd = 3;
+       else
+               call->cong_cwnd = 4;
        call->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
 
        call->rxnet = rxnet;
index 3c9eeb5b750c11262db88034927062c225b1ad60..f020f308ed9ecc18ff9e984d5b6bdf74a41fd1ba 100644 (file)
@@ -363,7 +363,8 @@ static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx,
        if (!cp->peer)
                goto error;
 
-       call->cong_cwnd = cp->peer->cong_cwnd;
+       call->tx_last_sent = ktime_get_real();
+       call->cong_ssthresh = cp->peer->cong_ssthresh;
        if (call->cong_cwnd >= call->cong_ssthresh)
                call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
        else
index f7ea71ae6159d458f15d5e65c7f42a32dd2fdbe2..156bd26daf74cd906f70fa856fd745137f5f6cec 100644 (file)
@@ -207,7 +207,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
 {
        struct rxrpc_connection *conn = call->conn;
 
-       call->peer->cong_cwnd = call->cong_cwnd;
+       call->peer->cong_ssthresh = call->cong_ssthresh;
 
        if (!hlist_unhashed(&call->error_link)) {
                spin_lock_bh(&call->peer->lock);
index 5c17fed4b60f421ad22c1d65d23783033b639ac4..bdf70b81addc497172e55f103d09751ac8a24706 100644 (file)
@@ -58,6 +58,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
        summary->cumulative_acks = cumulative_acks;
        summary->dup_acks = call->cong_dup_acks;
 
+       /* If we haven't transmitted anything for >1RTT, we should reset the
+        * congestion management state.
+        */
+       if ((call->cong_mode == RXRPC_CALL_SLOW_START ||
+            call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) &&
+           ktime_before(ktime_add_us(call->tx_last_sent,
+                                     call->peer->srtt_us >> 3),
+                        ktime_get_real())
+           ) {
+               change = rxrpc_cong_idle_reset;
+               summary->mode = RXRPC_CALL_SLOW_START;
+               if (RXRPC_TX_SMSS > 2190)
+                       summary->cwnd = 2;
+               else if (RXRPC_TX_SMSS > 1095)
+                       summary->cwnd = 3;
+               else
+                       summary->cwnd = 4;
+       }
+
        switch (call->cong_mode) {
        case RXRPC_CALL_SLOW_START:
                if (summary->saw_nacks)
@@ -205,7 +224,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 
        if (call->acks_lowest_nak == call->acks_hard_ack) {
                call->acks_lowest_nak = to;
-       } else if (before_eq(call->acks_lowest_nak, to)) {
+       } else if (after(to, call->acks_lowest_nak)) {
                summary->new_low_nack = true;
                call->acks_lowest_nak = to;
        }
index 2c3f7e4e30d72086d372f17f689b147fef36fd3f..46432e70a16b649fabd295a6a3f1d447de020c13 100644 (file)
@@ -501,6 +501,7 @@ dont_set_request_ack:
 
 done:
        if (ret >= 0) {
+               call->tx_last_sent = txb->last_sent;
                if (txb->wire.flags & RXRPC_REQUEST_ACK) {
                        call->peer->rtt_last_req = txb->last_sent;
                        if (call->peer->rtt_count > 1) {
index 26d2ae9baaf2c5c3ce22d2336c21c90738a377a5..041a51225c5f342ba833692d83a93da58f2eefdc 100644 (file)
@@ -227,12 +227,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
 
                rxrpc_peer_init_rtt(peer);
 
-               if (RXRPC_TX_SMSS > 2190)
-                       peer->cong_cwnd = 2;
-               else if (RXRPC_TX_SMSS > 1095)
-                       peer->cong_cwnd = 3;
-               else
-                       peer->cong_cwnd = 4;
+               peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
                trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here);
        }
 
index 0807753ec2dc40556f7f95a3e025738ba6dfd8ec..fae22a8b38d647f18a20ab9be75d1f1a5b558c45 100644 (file)
@@ -217,7 +217,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
                seq_puts(seq,
                         "Proto Local                                          "
                         " Remote                                         "
-                        " Use  CW   MTU LastUse      RTT      RTO\n"
+                        " Use SST   MTU LastUse      RTT      RTO\n"
                         );
                return 0;
        }
@@ -235,7 +235,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
                   lbuff,
                   rbuff,
                   refcount_read(&peer->ref),
-                  peer->cong_cwnd,
+                  peer->cong_ssthresh,
                   peer->mtu,
                   now - peer->last_tx_at,
                   peer->srtt_us >> 3,
index 9b567aff3e84ec657e26f2eca7acfb84d94eda21..e5fd8a95bf716456d9f9b67883f25652bf84336b 100644 (file)
  */
 static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
 {
-       unsigned int win_size =
-               min_t(unsigned int, call->tx_winsize,
-                     call->cong_cwnd + call->cong_extra);
+       unsigned int win_size;
        rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack);
 
+       /* If we haven't transmitted anything for >1RTT, we should reset the
+        * congestion management state.
+        */
+       if (ktime_before(ktime_add_us(call->tx_last_sent,
+                                     call->peer->srtt_us >> 3),
+                        ktime_get_real())) {
+               if (RXRPC_TX_SMSS > 2190)
+                       win_size = 2;
+               else if (RXRPC_TX_SMSS > 1095)
+                       win_size = 3;
+               else
+                       win_size = 4;
+               win_size += call->cong_extra;
+       } else {
+               win_size = min_t(unsigned int, call->tx_winsize,
+                                call->cong_cwnd + call->cong_extra);
+       }
+
        if (_tx_win)
                *_tx_win = tx_win;
        return call->tx_top - tx_win < win_size;