xsk: Fix race in SKB mode transmit with shared cq
authorMagnus Karlsson <magnus.karlsson@intel.com>
Fri, 18 Dec 2020 13:45:24 +0000 (14:45 +0100)
committerDaniel Borkmann <daniel@iogearbox.net>
Fri, 18 Dec 2020 15:10:21 +0000 (16:10 +0100)
Fix a race when multiple sockets are simultaneously calling sendto()
when the completion ring is shared in the SKB case. This is the case
when you share the same netdev and queue id through the
XDP_SHARED_UMEM bind flag. The problem is that multiple processes can
be in xsk_generic_xmit() and call the backpressure mechanism in
xskq_prod_reserve(xs->pool->cq). As this is a shared resource in this
specific scenario, a race might occur since the rings are
single-producer single-consumer.

Fix this by moving the tx_completion_lock from the socket to the pool,
as the pool is shared between the sockets that share the completion
ring. (When the completion ring is not shared, neither is the pool, so
nothing changes in that case.) Then protect the accesses to
xskq_prod_reserve() with this lock. The tx_completion_lock is renamed
to cq_lock to better reflect that it protects accesses to the
potentially shared completion ring.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Reported-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Link: https://lore.kernel.org/bpf/20201218134525.13119-2-magnus.karlsson@gmail.com
include/net/xdp_sock.h
include/net/xsk_buff_pool.h
net/xdp/xsk.c
net/xdp/xsk_buff_pool.c

index 4f4e93bf814c3e66b392f4bca8dd6e565685f0bc..cc17bc957548257602a5d9a6cdd59bb704f4e278 100644 (file)
@@ -58,10 +58,6 @@ struct xdp_sock {
 
        struct xsk_queue *tx ____cacheline_aligned_in_smp;
        struct list_head tx_list;
-       /* Mutual exclusion of NAPI TX thread and sendmsg error paths
-        * in the SKB destructor callback.
-        */
-       spinlock_t tx_completion_lock;
        /* Protects generic receive. */
        spinlock_t rx_lock;
 
index 01755b838c745079c53b6daefd7a44322ef1d7c3..eaa8386dbc630b3d30c9b8b1f76d4e18a538d9f5 100644 (file)
@@ -73,6 +73,11 @@ struct xsk_buff_pool {
        bool dma_need_sync;
        bool unaligned;
        void *addrs;
+       /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
+        * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when
+        * sockets share a single cq when the same netdev and queue id is shared.
+        */
+       spinlock_t cq_lock;
        struct xdp_buff_xsk *free_heads[];
 };
 
index c6532d77fde76922066d38e3fbd3d5199f8a1135..d531f9cd0de6a163f4d45860c7eaa2e0079c6d72 100644 (file)
@@ -423,9 +423,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
        struct xdp_sock *xs = xdp_sk(skb->sk);
        unsigned long flags;
 
-       spin_lock_irqsave(&xs->tx_completion_lock, flags);
+       spin_lock_irqsave(&xs->pool->cq_lock, flags);
        xskq_prod_submit_addr(xs->pool->cq, addr);
-       spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
+       spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
        sock_wfree(skb);
 }
@@ -437,6 +437,7 @@ static int xsk_generic_xmit(struct sock *sk)
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
+       unsigned long flags;
        int err = 0;
 
        mutex_lock(&xs->mutex);
@@ -468,10 +469,13 @@ static int xsk_generic_xmit(struct sock *sk)
                 * if there is space in it. This avoids having to implement
                 * any buffering in the Tx path.
                 */
+               spin_lock_irqsave(&xs->pool->cq_lock, flags);
                if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+                       spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
                        kfree_skb(skb);
                        goto out;
                }
+               spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 
                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
@@ -1303,7 +1307,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
        xs->state = XSK_READY;
        mutex_init(&xs->mutex);
        spin_lock_init(&xs->rx_lock);
-       spin_lock_init(&xs->tx_completion_lock);
 
        INIT_LIST_HEAD(&xs->map_list);
        spin_lock_init(&xs->map_list_lock);
index 818b750609220e42560adc26cd8b6601309c4d9c..20598eea658c472fbea46f8365f7ca369c4b435a 100644 (file)
@@ -71,6 +71,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
        INIT_LIST_HEAD(&pool->free_list);
        INIT_LIST_HEAD(&pool->xsk_tx_list);
        spin_lock_init(&pool->xsk_tx_list_lock);
+       spin_lock_init(&pool->cq_lock);
        refcount_set(&pool->users, 1);
 
        pool->fq = xs->fq_tmp;