struct hnae3_queue {
        void __iomem *io_base;
+       void __iomem *mem_base;
        struct hnae3_ae_algo *ae_algo;
        struct hnae3_handle *handle;
        int tqp_index;          /* index in a handle */
 
        return bd_num;
 }
 
+/* Push the most recently filled TX descriptors straight to the queue's
+ * device memory instead of ringing a doorbell.
+ *
+ * @ring: TX ring whose descriptors are pushed
+ * @num:  how many descriptors, counted back from ring->next_to_use, to push
+ *
+ * The descriptors are first copied into a zero-initialised on-stack array of
+ * HNS3_MAX_PUSH_BD_NUM entries, and that whole array is then written to
+ * ring->tqp->mem_base in one __iowrite64_copy() call.
+ *
+ * The copy loop is a do/while, so it always runs at least once; the caller
+ * visible here only gets to this function with a non-zero @num that is not
+ * above HNS3_MAX_PUSH_BD_NUM.
+ */
+static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num)
+{
+#define HNS3_BYTES_PER_64BIT           8
+
+       struct hns3_desc desc[HNS3_MAX_PUSH_BD_NUM] = {};
+       int offset = 0;
+
+       /* make sure everything is visible to device before
+        * executing tx push or updating doorbell
+        */
+       dma_wmb();
+
+       do {
+               /* num counts down, so idx walks forward from
+                * next_to_use - num up to next_to_use - 1; desc_num is added
+                * before the modulo to cover next_to_use < num, i.e. the
+                * descriptors wrapping past slot 0.
+                */
+               int idx = (ring->next_to_use - num + ring->desc_num) %
+                         ring->desc_num;
+
+               /* tx_push is incremented once per pushed descriptor */
+               u64_stats_update_begin(&ring->syncp);
+               ring->stats.tx_push++;
+               u64_stats_update_end(&ring->syncp);
+               memcpy(&desc[offset], &ring->desc[idx],
+                      sizeof(struct hns3_desc));
+               offset++;
+       } while (--num);
+
+       /* the count argument is in 64-bit units, hence the division by
+        * HNS3_BYTES_PER_64BIT; the full array is always written, so slots
+        * that were not filled above go out as zeroes.
+        */
+       __iowrite64_copy(ring->tqp->mem_base, desc,
+                        (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) /
+                        HNS3_BYTES_PER_64BIT);
+
+       /* NOTE(review): presumably ends the write-combining sequence so the
+        * push is not held back in a WC buffer - confirm against the
+        * architecture's io_stop_wc() definition.
+        */
+       io_stop_wc();
+}
+
+/* Ring the TX doorbell through the queue's device memory.
+ *
+ * @ring: TX ring whose pending descriptor count is reported
+ *
+ * Writes ring->pending_buf, converted to a little-endian 64-bit value, at
+ * ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET and adds the same count to
+ * the tx_mem_doorbell statistic. pending_buf itself is not modified here; the
+ * caller visible in this patch clears it after the call.
+ */
+static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring)
+{
+#define HNS3_MEM_DOORBELL_OFFSET       64
+
+       __le64 bd_num = cpu_to_le64((u64)ring->pending_buf);
+
+       /* make sure everything is visible to device before
+        * executing tx push or updating doorbell
+        */
+       dma_wmb();
+
+       /* a single 64-bit word (count == 1) carries the BD count */
+       __iowrite64_copy(ring->tqp->mem_base + HNS3_MEM_DOORBELL_OFFSET,
+                        &bd_num, 1);
+       u64_stats_update_begin(&ring->syncp);
+       ring->stats.tx_mem_doorbell += ring->pending_buf;
+       u64_stats_update_end(&ring->syncp);
+
+       /* NOTE(review): presumably ends the write-combining sequence so the
+        * doorbell write is not held back in a WC buffer - confirm against
+        * the architecture's io_stop_wc() definition.
+        */
+       io_stop_wc();
+}
+
+/* Notify the hardware about @num newly filled TX descriptors on @ring.
+ *
+ * @ring:     TX ring that was filled
+ * @num:      number of descriptors added by the caller
+ * @doorbell: true to notify the device now, false to only accumulate
+ *
+ * Push path: when HNS3_NIC_STATE_TX_PUSH_ENABLE is set, nothing is pending on
+ * the ring, @doorbell is true and @num is non-zero and at most
+ * HNS3_MAX_PUSH_BD_NUM, the descriptors are sent with hns3_tx_push_bd() and
+ * ring->pending_buf is not touched.
+ *
+ * Otherwise @num is added to ring->pending_buf. With @doorbell true the
+ * accumulated count is then written out - through hns3_tx_mem_doorbell()
+ * when the queue has a mem_base, else to HNS3_RING_TX_RING_TAIL_REG - and
+ * pending_buf is reset to 0.
+ *
+ * Every path that notifies the device also copies next_to_use into
+ * last_to_use.
+ */
 static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num,
                              bool doorbell)
 {
+       struct net_device *netdev = ring_to_netdev(ring);
+       struct hns3_nic_priv *priv = netdev_priv(netdev);
+
+       /* when tx push is enabled, the packet whose number of BD below
+        * HNS3_MAX_PUSH_BD_NUM can be pushed directly.
+        *
+        * NOTE(review): this test looks only at the state bit, while
+        * hns3_tx_push_bd() writes to ring->tqp->mem_base unconditionally -
+        * confirm mem_base is always valid whenever the bit is set.
+        */
+       if (test_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state) && num &&
+           !ring->pending_buf && num <= HNS3_MAX_PUSH_BD_NUM && doorbell) {
+               hns3_tx_push_bd(ring, num);
+               WRITE_ONCE(ring->last_to_use, ring->next_to_use);
+               return;
+       }
+
+       /* doorbell path: accumulate first, notify only if asked to */
        ring->pending_buf += num;
 
        if (!doorbell) {
                return;
        }
 
-       if (!ring->pending_buf)
-               return;
+       /* NOTE(review): with the "!ring->pending_buf" early return removed,
+        * a call with doorbell set and a zero pending count still performs
+        * the doorbell write (with the value 0) - confirm this is intended.
+        */
+       if (ring->tqp->mem_base)
+               hns3_tx_mem_doorbell(ring);
+       else
+               writel(ring->pending_buf,
+                      ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG);
 
-       writel(ring->pending_buf,
-              ring->tqp->io_base + HNS3_RING_TX_RING_TAIL_REG);
        ring->pending_buf = 0;
        WRITE_ONCE(ring->last_to_use, ring->next_to_use);
 }
                    "seg_pkt_cnt: %llu, tx_more: %llu, restart_queue: %llu, tx_busy: %llu\n",
                    tx_ring->stats.seg_pkt_cnt, tx_ring->stats.tx_more,
                    tx_ring->stats.restart_queue, tx_ring->stats.tx_busy);
+
+       netdev_info(ndev, "tx_push: %llu, tx_mem_doorbell: %llu\n",
+                   tx_ring->stats.tx_push, tx_ring->stats.tx_mem_doorbell);
 }
 
 static void hns3_dump_queue_reg(struct net_device *ndev,
 
        set_bit(HNS3_NIC_STATE_INITED, &priv->state);
 
+       if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps))
+               set_bit(HNS3_NIC_STATE_TX_PUSH_ENABLE, &priv->state);
+
        if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3)
                set_bit(HNAE3_PFLAG_LIMIT_PROMISC, &handle->supported_pflags);
 
 
 #include <linux/dim.h>
 #include <linux/if_vlan.h>
 #include <net/page_pool.h>
+#include <asm/barrier.h>
 
 #include "hnae3.h"
 
        HNS3_NIC_STATE2_RESET_REQUESTED,
        HNS3_NIC_STATE_HW_TX_CSUM_ENABLE,
        HNS3_NIC_STATE_RXD_ADV_LAYOUT_ENABLE,
+       HNS3_NIC_STATE_TX_PUSH_ENABLE,
        HNS3_NIC_STATE_MAX
 };
 
+#define HNS3_MAX_PUSH_BD_NUM           2
+
 #define HNS3_RING_RX_RING_BASEADDR_L_REG       0x00000
 #define HNS3_RING_RX_RING_BASEADDR_H_REG       0x00004
 #define HNS3_RING_RX_RING_BD_NUM_REG           0x00008
                        u64 tx_pkts;
                        u64 tx_bytes;
                        u64 tx_more;
+                       u64 tx_push;
+                       u64 tx_mem_doorbell;
                        u64 restart_queue;
                        u64 tx_busy;
                        u64 tx_copy;
 
        HNS3_TQP_STAT("packets", tx_pkts),
        HNS3_TQP_STAT("bytes", tx_bytes),
        HNS3_TQP_STAT("more", tx_more),
+       HNS3_TQP_STAT("push", tx_push),
+       HNS3_TQP_STAT("mem_doorbell", tx_mem_doorbell),
        HNS3_TQP_STAT("wake", restart_queue),
        HNS3_TQP_STAT("busy", tx_busy),
        HNS3_TQP_STAT("copy", tx_copy),
 
 
 static int hclge_alloc_tqps(struct hclge_dev *hdev)
 {
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
        struct hclge_comm_tqp *tqp;
        int i;
 
                                         (i - HCLGE_TQP_MAX_SIZE_DEV_V2) *
                                         HCLGE_TQP_REG_SIZE;
 
+               /* when device supports tx push and has device memory,
+                * the queue can execute push mode or doorbell mode on
+                * device memory.
+                */
+               if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps))
+                       tqp->q.mem_base = hdev->hw.hw.mem_base +
+                                         HCLGE_TQP_MEM_OFFSET(hdev, i);
+
                tqp++;
        }
 
 
 static int hclge_dev_mem_map(struct hclge_dev *hdev)
 {
-#define HCLGE_MEM_BAR          4
-
        struct pci_dev *pdev = hdev->pdev;
        struct hclge_hw *hw = &hdev->hw;
 
 
 #define HCLGE_VECTOR0_ALL_MSIX_ERR_B   6U
 #define HCLGE_TRIGGER_IMP_RESET_B      7U
 
+/* Layout of the device memory BAR as used for per-queue (TQP) windows.
+ * HCLGE_TQP_MEM_SIZE is the stride between the windows of consecutive TQPs
+ * and HCLGE_MEM_BAR is the index of the PCI BAR that holds them.
+ */
+#define HCLGE_TQP_MEM_SIZE             0x10000
+#define HCLGE_MEM_BAR                  4
+/* BAR 4 is split in half: the first half is for RoCE and the second half is
+ * for the NIC, so the NIC region starts at half of the BAR's length.
+ */
+#define HCLGE_NIC_MEM_OFFSET(hdev)     \
+       (pci_resource_len((hdev)->pdev, HCLGE_MEM_BAR) >> 1)
+/* offset of TQP i's window from the start of the BAR: the start of the NIC
+ * half plus i strides of HCLGE_TQP_MEM_SIZE
+ */
+#define HCLGE_TQP_MEM_OFFSET(hdev, i)  \
+       (HCLGE_NIC_MEM_OFFSET(hdev) + HCLGE_TQP_MEM_SIZE * (i))
+
 #define HCLGE_MAC_DEFAULT_FRAME \
        (ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)
 #define HCLGE_MAC_MIN_FRAME            64
 
 
 static int hclgevf_alloc_tqps(struct hclgevf_dev *hdev)
 {
+       struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
        struct hclge_comm_tqp *tqp;
        int i;
 
                                         (i - HCLGEVF_TQP_MAX_SIZE_DEV_V2) *
                                         HCLGEVF_TQP_REG_SIZE;
 
+               /* when device supports tx push and has device memory,
+                * the queue can execute push mode or doorbell mode on
+                * device memory.
+                */
+               if (test_bit(HNAE3_DEV_SUPPORT_TX_PUSH_B, ae_dev->caps))
+                       tqp->q.mem_base = hdev->hw.hw.mem_base +
+                                         HCLGEVF_TQP_MEM_OFFSET(hdev, i);
+
                tqp++;
        }
 
 
 static int hclgevf_dev_mem_map(struct hclgevf_dev *hdev)
 {
-#define HCLGEVF_MEM_BAR                4
-
        struct pci_dev *pdev = hdev->pdev;
        struct hclgevf_hw *hw = &hdev->hw;
 
 
 
 #define HCLGEVF_RSS_IND_TBL_SIZE               512
 
+/* Layout of the device memory BAR as used for per-queue (TQP) windows.
+ * HCLGEVF_TQP_MEM_SIZE is the stride between the windows of consecutive TQPs
+ * and HCLGEVF_MEM_BAR is the index of the PCI BAR that holds them.
+ */
+#define HCLGEVF_TQP_MEM_SIZE           0x10000
+#define HCLGEVF_MEM_BAR                        4
+/* BAR 4 is split in half: the first half is for RoCE and the second half is
+ * for the NIC, so the NIC region starts at half of the BAR's length.
+ */
+#define HCLGEVF_NIC_MEM_OFFSET(hdev)   \
+       (pci_resource_len((hdev)->pdev, HCLGEVF_MEM_BAR) >> 1)
+/* offset of TQP i's window from the start of the BAR: the start of the NIC
+ * half plus i strides of HCLGEVF_TQP_MEM_SIZE
+ */
+#define HCLGEVF_TQP_MEM_OFFSET(hdev, i)                \
+       (HCLGEVF_NIC_MEM_OFFSET(hdev) + HCLGEVF_TQP_MEM_SIZE * (i))
+
 #define HCLGEVF_MAC_MAX_FRAME          9728
 
 #define HCLGEVF_STATS_TIMER_INTERVAL   36U