net/mlx4_en: Handle TX error CQE
authorMoshe Shemesh <moshe@mellanox.com>
Wed, 9 Dec 2020 13:03:39 +0000 (15:03 +0200)
committerDavid S. Miller <davem@davemloft.net>
Thu, 10 Dec 2020 00:44:35 +0000 (16:44 -0800)
In case an error CQE is found while polling the TX CQ, the QP is in
error state and all posted WQEs will generate error CQEs without any
data transmitted. Fix it by reopening the channels, using the same
method as TX timeout handling.

In addition, add more information on the error CQE and WQE for debugging.

Fixes: bd2f631d7c60 ("net/mlx4_en: Notify user when TX ring in error state")
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h

index 1a2b0bd64aa9687b0aebfceb2bb4d782c76e476a..6f290319b617874f05cda708aa2d2cb32bc3bdc2 100644 (file)
@@ -1735,6 +1735,7 @@ int mlx4_en_start_port(struct net_device *dev)
                                mlx4_en_deactivate_cq(priv, cq);
                                goto tx_err;
                        }
+                       clear_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &tx_ring->state);
                        if (t != TX_XDP) {
                                tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
                                tx_ring->recycle_ring = NULL;
index 3ddb7268e415f871dbdbd424ef1dfa86eb2ae0e9..59b097cda3278611e2bba55ad7192b5cde57260e 100644 (file)
@@ -392,6 +392,35 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
        return cnt;
 }
 
+static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
+                                  u16 cqe_index, struct mlx4_en_tx_ring *ring)
+{
+       struct mlx4_en_dev *mdev = priv->mdev;
+       struct mlx4_en_tx_info *tx_info;
+       struct mlx4_en_tx_desc *tx_desc;
+       u16 wqe_index;
+       int desc_size;
+
+       en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
+              ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
+       print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
+                      false);
+
+       wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
+       tx_info = &ring->tx_info[wqe_index];
+       desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
+       en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
+              wqe_index, desc_size);
+       tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
+       print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);
+
+       if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
+               return;
+
+       en_err(priv, "Scheduling port restart\n");
+       queue_work(mdev->workqueue, &priv->restart_task);
+}
+
 int mlx4_en_process_tx_cq(struct net_device *dev,
                          struct mlx4_en_cq *cq, int napi_budget)
 {
@@ -438,13 +467,10 @@ int mlx4_en_process_tx_cq(struct net_device *dev,
                dma_rmb();
 
                if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
-                            MLX4_CQE_OPCODE_ERROR)) {
-                       struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;
-
-                       en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
-                              cqe_err->vendor_err_syndrome,
-                              cqe_err->syndrome);
-               }
+                            MLX4_CQE_OPCODE_ERROR))
+                       if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))
+                               mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
+                                                      ring);
 
                /* Skip over last polled CQE */
                new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
index fd9535bde1b89271375430486b99e92e415b74b4..30378e4c90b5b147a7f67b6b47588dd8a6212b9a 100644 (file)
@@ -271,6 +271,10 @@ struct mlx4_en_page_cache {
        } buf[MLX4_EN_CACHE_SIZE];
 };
 
+enum {
+       MLX4_EN_TX_RING_STATE_RECOVERING,
+};
+
 struct mlx4_en_priv;
 
 struct mlx4_en_tx_ring {
@@ -317,6 +321,7 @@ struct mlx4_en_tx_ring {
         * Only queue_stopped might be used if BQL is not properly working.
         */
        unsigned long           queue_stopped;
+       unsigned long           state;
        struct mlx4_hwq_resources sp_wqres;
        struct mlx4_qp          sp_qp;
        struct mlx4_qp_context  sp_context;