tcl_data and reo_dst rings are currently being allocated using
dma_allocate_coherent() which is non cacheable.
Allocating ring memory from cacheable memory area allows cached descriptor
access and prefetch next descriptors to optimize CPU usage during
descriptor processing on NAPI. Based on the hardware param we can enable
or disable this feature for the corresponding platform.
Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.4.0.1.r2-00012-QCAHKSWPL_SILICONZ-1
Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01695-QCAHKSWPL_SILICONZ-1
Co-developed-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Co-developed-by: Sriram R <srirrama@codeaurora.org>
Signed-off-by: Sriram R <srirrama@codeaurora.org>
Signed-off-by: Jouni Malinen <jouni@codeaurora.org>
Signed-off-by: P Praneesh <ppranees@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/1630560820-21905-3-git-send-email-ppranees@codeaurora.org
                .max_tx_ring = DP_TCL_NUM_RING_MAX,
                .hal_params = &ath11k_hw_hal_params_ipq8074,
                .supports_dynamic_smps_6ghz = false,
+               .alloc_cacheable_memory = true,
        },
        {
                .hw_rev = ATH11K_HW_IPQ6018_HW10,
                .max_tx_ring = DP_TCL_NUM_RING_MAX,
                .hal_params = &ath11k_hw_hal_params_ipq8074,
                .supports_dynamic_smps_6ghz = false,
+               .alloc_cacheable_memory = true,
        },
        {
                .name = "qca6390 hw2.0",
                .max_tx_ring = DP_TCL_NUM_RING_MAX_QCA6390,
                .hal_params = &ath11k_hw_hal_params_qca6390,
                .supports_dynamic_smps_6ghz = false,
+               .alloc_cacheable_memory = false,
        },
        {
                .name = "qcn9074 hw1.0",
                .max_tx_ring = DP_TCL_NUM_RING_MAX,
                .hal_params = &ath11k_hw_hal_params_ipq8074,
                .supports_dynamic_smps_6ghz = true,
+               .alloc_cacheable_memory = true,
        },
        {
                .name = "wcn6855 hw2.0",
                .max_tx_ring = DP_TCL_NUM_RING_MAX_QCA6390,
                .hal_params = &ath11k_hw_hal_params_qca6390,
                .supports_dynamic_smps_6ghz = false,
+               .alloc_cacheable_memory = false,
        },
 };
 
 
        if (!ring->vaddr_unaligned)
                return;
 
-       dma_free_coherent(ab->dev, ring->size, ring->vaddr_unaligned,
-                         ring->paddr_unaligned);
+       if (ring->cached)
+               kfree(ring->vaddr_unaligned);
+       else
+               dma_free_coherent(ab->dev, ring->size, ring->vaddr_unaligned,
+                                 ring->paddr_unaligned);
 
        ring->vaddr_unaligned = NULL;
 }
        int entry_sz = ath11k_hal_srng_get_entrysize(ab, type);
        int max_entries = ath11k_hal_srng_get_max_entries(ab, type);
        int ret;
+       bool cached = false;
 
        if (max_entries < 0 || entry_sz < 0)
                return -EINVAL;
                num_entries = max_entries;
 
        ring->size = (num_entries * entry_sz) + HAL_RING_BASE_ALIGN - 1;
-       ring->vaddr_unaligned = dma_alloc_coherent(ab->dev, ring->size,
-                                                  &ring->paddr_unaligned,
-                                                  GFP_KERNEL);
+
+       if (ab->hw_params.alloc_cacheable_memory) {
+               /* Allocate the reo dst and tx completion rings from cacheable memory */
+               switch (type) {
+               case HAL_REO_DST:
+                       cached = true;
+                       break;
+               default:
+                       cached = false;
+               }
+
+               if (cached) {
+                       ring->vaddr_unaligned = kzalloc(ring->size, GFP_KERNEL);
+                       ring->paddr_unaligned = virt_to_phys(ring->vaddr_unaligned);
+               }
+       }
+
+       if (!cached)
+               ring->vaddr_unaligned = dma_alloc_coherent(ab->dev, ring->size,
+                                                          &ring->paddr_unaligned,
+                                                          GFP_KERNEL);
+
        if (!ring->vaddr_unaligned)
                return -ENOMEM;
 
                return -EINVAL;
        }
 
+       if (cached) {
+               params.flags |= HAL_SRNG_FLAGS_CACHED;
+               ring->cached = 1;
+       }
+
        ret = ath11k_hal_srng_setup(ab, type, ring_num, mac_id, ¶ms);
        if (ret < 0) {
                ath11k_warn(ab, "failed to setup srng: %d ring_id %d\n",
 
        dma_addr_t paddr;
        int size;
        u32 ring_id;
+       u8 cached;
 };
 
 struct dp_rxdma_ring {
 
        return NULL;
 }
 
+static void ath11k_hal_srng_prefetch_desc(struct ath11k_base *ab,
+                                         struct hal_srng *srng)
+{
+       u32 *desc;
+
+       /* prefetch only if desc is available */
+       desc = ath11k_hal_srng_dst_peek(ab, srng);
+       if (likely(desc)) {
+               dma_sync_single_for_cpu(ab->dev, virt_to_phys(desc),
+                                       (srng->entry_size * sizeof(u32)),
+                                       DMA_FROM_DEVICE);
+               prefetch(desc);
+       }
+}
+
 u32 *ath11k_hal_srng_dst_get_next_entry(struct ath11k_base *ab,
                                        struct hal_srng *srng)
 {
        srng->u.dst_ring.tp = (srng->u.dst_ring.tp + srng->entry_size) %
                              srng->ring_size;
 
+       /* Try to prefetch the next descriptor in the ring */
+       if (srng->flags & HAL_SRNG_FLAGS_CACHED)
+               ath11k_hal_srng_prefetch_desc(ab, srng);
+
        return desc;
 }
 
 {
        lockdep_assert_held(&srng->lock);
 
-       if (srng->ring_dir == HAL_SRNG_DIR_SRC)
+       if (srng->ring_dir == HAL_SRNG_DIR_SRC) {
                srng->u.src_ring.cached_tp =
                        *(volatile u32 *)srng->u.src_ring.tp_addr;
-       else
+       } else {
                srng->u.dst_ring.cached_hp = *srng->u.dst_ring.hp_addr;
+
+               /* Try to prefetch the next descriptor in the ring */
+               if (srng->flags & HAL_SRNG_FLAGS_CACHED)
+                       ath11k_hal_srng_prefetch_desc(ab, srng);
+       }
 }
 
 /* Update cached ring head/tail pointers to HW. ath11k_hal_srng_access_begin()
 
 #define HAL_SRNG_FLAGS_DATA_TLV_SWAP           0x00000020
 #define HAL_SRNG_FLAGS_LOW_THRESH_INTR_EN      0x00010000
 #define HAL_SRNG_FLAGS_MSI_INTR                        0x00020000
+#define HAL_SRNG_FLAGS_CACHED                   0x20000000
 #define HAL_SRNG_FLAGS_LMAC_RING               0x80000000
 
 #define HAL_SRNG_TLV_HDR_TAG           GENMASK(9, 1)
 
        u8 max_tx_ring;
        const struct ath11k_hw_hal_params *hal_params;
        bool supports_dynamic_smps_6ghz;
+       bool alloc_cacheable_memory;
 };
 
 struct ath11k_hw_ops {