#define HSSPI_MAX_PREPEND_LEN                  15
 
-#define HSSPI_MAX_SYNC_CLOCK                   30000000
+/*
+ * Some chips require the clock to be limited to 30MHz while others require
+ * 25MHz. Use the smaller value to cover both cases.
+ */
+#define HSSPI_MAX_SYNC_CLOCK                   25000000
 
 #define HSSPI_SPI_MAX_CS                       8
 #define HSSPI_BUS_NUM                          1 /* 0 is legacy SPI */
 #define HSSPI_WAIT_MODE_INTR           1
 #define HSSPI_WAIT_MODE_MAX                    HSSPI_WAIT_MODE_INTR
 
+/*
+ * The default transfer mode is auto. If the msg is prependable, use the
+ * prepend mode. If not, fall back to the dummy cs workaround mode but limit
+ * the clock to 25MHz to make sure it works in all board designs.
+ */
+#define HSSPI_XFER_MODE_AUTO           0
+#define HSSPI_XFER_MODE_PREPEND                1
+#define HSSPI_XFER_MODE_DUMMYCS                2
+#define HSSPI_XFER_MODE_MAX                    HSSPI_XFER_MODE_DUMMYCS
+
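+/*
+ * A failed prepend check is only informational in auto mode, where the driver
+ * silently falls back to the dummy cs workaround, but it is a hard error when
+ * the user has forced the prepend mode, so log at the matching level.
+ */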
+#define bcm63xx_prepend_printk_on_checkfail(bs, fmt, ...)      \
+do {                                                                           \
+       if (bs->xfer_mode == HSSPI_XFER_MODE_AUTO)                              \
+               dev_dbg(&bs->pdev->dev, fmt, ##__VA_ARGS__);            \
+       else if (bs->xfer_mode == HSSPI_XFER_MODE_PREPEND)              \
+               dev_err(&bs->pdev->dev, fmt, ##__VA_ARGS__);            \
+} while (0)
+
 struct bcm63xx_hsspi {
        struct completion done;
        struct mutex bus_mutex;
        u32 speed_hz;
        u8 cs_polarity;
        u32 wait_mode;
+       u32 xfer_mode;
+       u32 prepend_cnt;
+       u8 *prepend_buf;
 };
 
 static ssize_t wait_mode_show(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR_RW(wait_mode);
 
+static ssize_t xfer_mode_show(struct device *dev, struct device_attribute *attr,
+                        char *buf)
+{
+       struct spi_controller *ctrl = dev_get_drvdata(dev);
+       struct bcm63xx_hsspi *bs = spi_master_get_devdata(ctrl);
+
+       return sprintf(buf, "%d\n", bs->xfer_mode);
+}
+
+static ssize_t xfer_mode_store(struct device *dev, struct device_attribute *attr,
+                         const char *buf, size_t count)
+{
+       struct spi_controller *ctrl = dev_get_drvdata(dev);
+       struct bcm63xx_hsspi *bs = spi_master_get_devdata(ctrl);
+       u32 val;
+
+       if (kstrtou32(buf, 10, &val))
+               return -EINVAL;
+
+       if (val > HSSPI_XFER_MODE_MAX) {
+               dev_warn(dev, "invalid xfer mode %u\n", val);
+               return -EINVAL;
+       }
+
+       mutex_lock(&bs->msg_mutex);
+       bs->xfer_mode = val;
+       mutex_unlock(&bs->msg_mutex);
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(xfer_mode);
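+/*
+ * The transfer mode can be changed at run time through sysfs, for example
+ * (the exact path depends on the platform device name):
+ *   echo 2 > /sys/devices/platform/.../xfer_mode   # force dummy cs mode
+ *   cat /sys/devices/platform/.../xfer_mode
+ */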
+
 static struct attribute *bcm63xx_hsspi_attrs[] = {
        &dev_attr_wait_mode.attr,
+       &dev_attr_xfer_mode.attr,
        NULL,
 };
 
        .attrs = bcm63xx_hsspi_attrs,
 };
 
+static void bcm63xx_hsspi_set_clk(struct bcm63xx_hsspi *bs,
+                                 struct spi_device *spi, int hz);
+
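+/*
+ * A message has to fit in the controller FIFO together with the opcode word,
+ * so cap both the transfer and the message size accordingly.
+ */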
+static size_t bcm63xx_hsspi_max_message_size(struct spi_device *spi)
+{
+       return HSSPI_BUFFER_LEN - HSSPI_OPCODE_LEN;
+}
+
+static int bcm63xx_hsspi_wait_cmd(struct bcm63xx_hsspi *bs)
+{
+       unsigned long limit;
+       u32 reg = 0;
+       int rc = 0;
+
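+       /*
+        * Wait for the queued ping-pong command to finish, either via the
+        * CMD_DONE interrupt or by polling the ping-pong busy status bit.
+        */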
+       if (bs->wait_mode == HSSPI_WAIT_MODE_INTR) {
+               if (wait_for_completion_timeout(&bs->done, HZ) == 0)
+                       rc = 1;
+       } else {
+               /* polling mode checks for status busy bit */
+               limit = jiffies + msecs_to_jiffies(HSSPI_POLL_STATUS_TIMEOUT_MS);
+
+               while (!time_after(jiffies, limit)) {
+                       reg = __raw_readl(bs->regs + HSSPI_PINGPONG_STATUS_REG(0));
+                       if (reg & HSSPI_PINGPONG_STATUS_SRC_BUSY)
+                               cpu_relax();
+                       else
+                               break;
+               }
+               if (reg & HSSPI_PINGPONG_STATUS_SRC_BUSY)
+                       rc = 1;
+       }
+
+       if (rc)
+               dev_err(&bs->pdev->dev, "transfer timed out!\n");
+
+       return rc;
+}
+
+static bool bcm63xx_prepare_prepend_transfer(struct spi_master *master,
+                                         struct spi_message *msg,
+                                         struct spi_transfer *t_prepend)
+{
+       struct bcm63xx_hsspi *bs = spi_master_get_devdata(master);
+       bool tx_only = false;
+       struct spi_transfer *t;
+
+       /*
+        * Multiple transfers within a message may be combined into one transfer
+        * to the controller using its prepend feature. A SPI message is prependable
+        * only if the following are all true:
+        *   1. One or more half duplex write transfers in single bit mode
+        *   2. Optional full duplex read/write at the end
+        *   3. No delay or cs_change between transfers
+        */
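+       /*
+        * For example, a typical register read built as a short single bit tx
+        * transfer (command plus address) followed by an rx transfer satisfies
+        * all three rules: the command/address bytes become prepend bytes and
+        * the read runs as the main transfer, so the whole message goes out as
+        * one controller operation.
+        */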
+       bs->prepend_cnt = 0;
+       list_for_each_entry(t, &msg->transfers, transfer_list) {
+               if ((spi_delay_to_ns(&t->delay, t) > 0) || t->cs_change) {
+                       bcm63xx_prepend_printk_on_checkfail(bs,
+                                "Delay or cs change not supported in prepend mode!\n");
+                       return false;
+               }
+
+               tx_only = false;
+               if (t->tx_buf && !t->rx_buf) {
+                       tx_only = true;
+                       if (bs->prepend_cnt + t->len >
+                               (HSSPI_BUFFER_LEN - HSSPI_OPCODE_LEN)) {
+                               bcm63xx_prepend_printk_on_checkfail(bs,
+                                        "exceed max buf len, abort prepending transfers!\n");
+                               return false;
+                       }
+
+                       if (t->tx_nbits > SPI_NBITS_SINGLE &&
+                               !list_is_last(&t->transfer_list, &msg->transfers)) {
+                               bcm63xx_prepend_printk_on_checkfail(bs,
+                                        "multi-bit prepend buf not supported!\n");
+                               return false;
+                       }
+
+                       if (t->tx_nbits == SPI_NBITS_SINGLE) {
+                               memcpy(bs->prepend_buf + bs->prepend_cnt, t->tx_buf, t->len);
+                               bs->prepend_cnt += t->len;
+                       }
+               } else {
+                       if (!list_is_last(&t->transfer_list, &msg->transfers)) {
+                               bcm63xx_prepend_printk_on_checkfail(bs,
+                                        "rx/tx_rx transfer not supported when it is not the last one!\n");
+                               return false;
+                       }
+               }
+
+               if (list_is_last(&t->transfer_list, &msg->transfers)) {
+                       memcpy(t_prepend, t, sizeof(struct spi_transfer));
+
+                       if (tx_only && t->tx_nbits == SPI_NBITS_SINGLE) {
+                               /*
+                                * if the last one is also a single bit tx only transfer, merge
+                                * all of them into one single tx transfer
+                                */
+                               t_prepend->len = bs->prepend_cnt;
+                               t_prepend->tx_buf = bs->prepend_buf;
+                               bs->prepend_cnt = 0;
+                       } else {
+                               /*
+                                * If the last one is not a single bit tx only transfer (i.e. it
+                                * has rx or uses dual tx), all the previous transfers are sent
+                                * through prepend bytes, so make sure they do not exceed the max
+                                * prepend len.
+                                */
+                               if (bs->prepend_cnt > HSSPI_MAX_PREPEND_LEN) {
+                                       bcm63xx_prepend_printk_on_checkfail(bs,
+                                               "exceed max prepend len, abort prepending transfers!\n");
+                                       return false;
+                               }
+                       }
+               }
+       }
+
+       return true;
+}
+
+static int bcm63xx_hsspi_do_prepend_txrx(struct spi_device *spi,
+                                        struct spi_transfer *t)
+{
+       struct bcm63xx_hsspi *bs = spi_master_get_devdata(spi->master);
+       unsigned int chip_select = spi->chip_select;
+       u16 opcode = 0;
+       const u8 *tx = t->tx_buf;
+       u8 *rx = t->rx_buf;
+       u32 reg = 0;
+
+       /*
+        * This shouldn't happen as we set max_message_size in probe, but check
+        * it again in case some driver does not honor the max size.
+        */
+       if (t->len + bs->prepend_cnt > (HSSPI_BUFFER_LEN - HSSPI_OPCODE_LEN)) {
+               dev_warn(&bs->pdev->dev,
+                        "Prepend message larger than fifo size, len %d prepend %d\n",
+                        t->len, bs->prepend_cnt);
+               return -EINVAL;
+       }
+
+       bcm63xx_hsspi_set_clk(bs, spi, t->speed_hz);
+
+       if (tx && rx)
+               opcode = HSSPI_OP_READ_WRITE;
+       else if (tx)
+               opcode = HSSPI_OP_WRITE;
+       else if (rx)
+               opcode = HSSPI_OP_READ;
+
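+       /*
+        * The prepend bytes are always clocked out in single bit mode; the
+        * MULTIDATA start offsets make the controller switch to dual mode only
+        * after the prepend bytes have been sent.
+        */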
+       if ((opcode == HSSPI_OP_READ && t->rx_nbits == SPI_NBITS_DUAL) ||
+           (opcode == HSSPI_OP_WRITE && t->tx_nbits == SPI_NBITS_DUAL)) {
+               opcode |= HSSPI_OP_MULTIBIT;
+
+               if (t->rx_nbits == SPI_NBITS_DUAL) {
+                       reg |= 1 << MODE_CTRL_MULTIDATA_RD_SIZE_SHIFT;
+                       reg |= bs->prepend_cnt << MODE_CTRL_MULTIDATA_RD_STRT_SHIFT;
+               }
+               if (t->tx_nbits == SPI_NBITS_DUAL) {
+                       reg |= 1 << MODE_CTRL_MULTIDATA_WR_SIZE_SHIFT;
+                       reg |= bs->prepend_cnt << MODE_CTRL_MULTIDATA_WR_STRT_SHIFT;
+               }
+       }
+
+       reg |= bs->prepend_cnt << MODE_CTRL_PREPENDBYTE_CNT_SHIFT;
+       __raw_writel(reg | 0xff,
+                    bs->regs + HSSPI_PROFILE_MODE_CTRL_REG(chip_select));
+
+       reinit_completion(&bs->done);
+       if (bs->prepend_cnt)
+               memcpy_toio(bs->fifo + HSSPI_OPCODE_LEN, bs->prepend_buf,
+                           bs->prepend_cnt);
+       if (tx)
+               memcpy_toio(bs->fifo + HSSPI_OPCODE_LEN + bs->prepend_cnt, tx,
+                           t->len);
+
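+       /*
+        * The first two bytes of the FIFO hold the opcode and transfer length,
+        * written big-endian, ahead of the prepend and payload bytes.
+        */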
+       __raw_writew((u16)cpu_to_be16(opcode | t->len), bs->fifo);
+       /* enable interrupt */
+       if (bs->wait_mode == HSSPI_WAIT_MODE_INTR)
+               __raw_writel(HSSPI_PINGx_CMD_DONE(0), bs->regs + HSSPI_INT_MASK_REG);
+
+       /* start the transfer */
+       reg = chip_select << PINGPONG_CMD_SS_SHIFT |
+           chip_select << PINGPONG_CMD_PROFILE_SHIFT |
+           PINGPONG_COMMAND_START_NOW;
+       __raw_writel(reg, bs->regs + HSSPI_PINGPONG_COMMAND_REG(0));
+
+       if (bcm63xx_hsspi_wait_cmd(bs))
+               return -ETIMEDOUT;
+
+       if (rx)
+               memcpy_fromio(rx, bs->fifo, t->len);
+
+       return 0;
+}
+
 static void bcm63xx_hsspi_set_cs(struct bcm63xx_hsspi *bs, unsigned int cs,
                                 bool active)
 {
        int step_size = HSSPI_BUFFER_LEN;
        const u8 *tx = t->tx_buf;
        u8 *rx = t->rx_buf;
-       u32 val = 0;
-       unsigned long limit;
+       u32 reg = 0;
 
        bcm63xx_hsspi_set_clk(bs, spi, t->speed_hz);
        if (!t->cs_off)
                opcode |= HSSPI_OP_MULTIBIT;
 
                if (t->rx_nbits == SPI_NBITS_DUAL)
-                       val |= 1 << MODE_CTRL_MULTIDATA_RD_SIZE_SHIFT;
+                       reg |= 1 << MODE_CTRL_MULTIDATA_RD_SIZE_SHIFT;
                if (t->tx_nbits == SPI_NBITS_DUAL)
-                       val |= 1 << MODE_CTRL_MULTIDATA_WR_SIZE_SHIFT;
+                       reg |= 1 << MODE_CTRL_MULTIDATA_WR_SIZE_SHIFT;
        }
 
-       __raw_writel(val | 0xff,
+       __raw_writel(reg | 0xff,
                     bs->regs + HSSPI_PROFILE_MODE_CTRL_REG(chip_select));
 
        while (pending > 0) {
                        __raw_writel(HSSPI_PINGx_CMD_DONE(0),
                                     bs->regs + HSSPI_INT_MASK_REG);
 
-               /* start the transfer */
-               __raw_writel(!chip_select << PINGPONG_CMD_SS_SHIFT |
-                            chip_select << PINGPONG_CMD_PROFILE_SHIFT |
-                            PINGPONG_COMMAND_START_NOW,
-                            bs->regs + HSSPI_PINGPONG_COMMAND_REG(0));
+               reg = !chip_select << PINGPONG_CMD_SS_SHIFT |
+                     chip_select << PINGPONG_CMD_PROFILE_SHIFT |
+                     PINGPONG_COMMAND_START_NOW;
+               __raw_writel(reg, bs->regs + HSSPI_PINGPONG_COMMAND_REG(0));
 
-               if (bs->wait_mode == HSSPI_WAIT_MODE_INTR) {
-                       if (wait_for_completion_timeout(&bs->done, HZ) == 0)
-                               goto err_timeout;
-               } else {
-                       /* polling mode checks for status busy bit */
-                       limit = jiffies + msecs_to_jiffies(HSSPI_POLL_STATUS_TIMEOUT_MS);
-                       while (!time_after(jiffies, limit)) {
-                               val = __raw_readl(bs->regs + HSSPI_PINGPONG_STATUS_REG(0));
-                               if (val & HSSPI_PINGPONG_STATUS_SRC_BUSY)
-                                       cpu_relax();
-                               else
-                                       break;
-                       }
-                       if (val & HSSPI_PINGPONG_STATUS_SRC_BUSY)
-                               goto err_timeout;
-               }
+               if (bcm63xx_hsspi_wait_cmd(bs))
+                       return -ETIMEDOUT;
 
                if (rx) {
                        memcpy_fromio(rx, bs->fifo, curr_step);
        }
 
        return 0;
-
-err_timeout:
-       dev_err(&bs->pdev->dev, "transfer timed out!\n");
-       return -ETIMEDOUT;
 }
 
 static int bcm63xx_hsspi_setup(struct spi_device *spi)
        return 0;
 }
 
-static int bcm63xx_hsspi_transfer_one(struct spi_master *master,
+static int bcm63xx_hsspi_do_dummy_cs_txrx(struct spi_device *spi,
                                      struct spi_message *msg)
 {
-       struct bcm63xx_hsspi *bs = spi_master_get_devdata(master);
-       struct spi_transfer *t;
-       struct spi_device *spi = msg->spi;
+       struct bcm63xx_hsspi *bs = spi_master_get_devdata(spi->master);
        int status = -EINVAL;
        int dummy_cs;
        bool keep_cs = false;
+       struct spi_transfer *t;
 
-       mutex_lock(&bs->msg_mutex);
-       /* This controller does not support keeping CS active during idle.
+       /*
+        * This controller does not support keeping CS active during idle.
         * To work around this, we use the following ugly hack:
         *
         * a. Invert the target chip select's polarity so it will be active.
        bcm63xx_hsspi_set_cs(bs, dummy_cs, true);
 
        list_for_each_entry(t, &msg->transfers, transfer_list) {
+               /*
+                * We are here because of one of the reasons below:
+                * a. The message is not prependable and we are in the default auto
+                *    xfer mode. This means we fall back to the dummy cs mode at the
+                *    maximum 25MHz safe clock rate.
+                * b. The user explicitly selected the dummy cs mode.
+                */
+               if (bs->xfer_mode == HSSPI_XFER_MODE_AUTO) {
+                       if (t->speed_hz > HSSPI_MAX_SYNC_CLOCK) {
+                               t->speed_hz = HSSPI_MAX_SYNC_CLOCK;
+                               dev_warn_once(&bs->pdev->dev,
+                                       "Falling back to dummy cs mode, reducing speed to %dHz\n",
+                                       t->speed_hz);
+                       }
+               }
+
                status = bcm63xx_hsspi_do_txrx(spi, t);
                if (status)
                        break;
        if (status || !keep_cs)
                bcm63xx_hsspi_set_cs(bs, spi->chip_select, false);
 
+       return status;
+}
+
+static int bcm63xx_hsspi_transfer_one(struct spi_master *master,
+                                     struct spi_message *msg)
+{
+       struct bcm63xx_hsspi *bs = spi_master_get_devdata(master);
+       struct spi_device *spi = msg->spi;
+       int status = -EINVAL;
+       bool prependable = false;
+       struct spi_transfer t_prepend;
+
+       mutex_lock(&bs->msg_mutex);
+
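+       /*
+        * In auto and prepend modes, first check whether the whole message can
+        * be sent with the prepend feature. In dummy cs mode skip the check and
+        * always use the workaround path.
+        */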
+       if (bs->xfer_mode != HSSPI_XFER_MODE_DUMMYCS)
+               prependable = bcm63xx_prepare_prepend_transfer(master, msg, &t_prepend);
+
+       if (prependable) {
+               status = bcm63xx_hsspi_do_prepend_txrx(spi, &t_prepend);
+               msg->actual_length = (t_prepend.len + bs->prepend_cnt);
+       } else {
+               if (bs->xfer_mode == HSSPI_XFER_MODE_PREPEND) {
+                       dev_err(&bs->pdev->dev,
+                               "User set prepend mode but msg is not prependable! Aborting transfer\n");
+                       status = -EINVAL;
+               } else {
+                       status = bcm63xx_hsspi_do_dummy_cs_txrx(spi, msg);
+               }
+       }
+
        mutex_unlock(&bs->msg_mutex);
        msg->status = status;
        spi_finalize_current_message(master);
        bs->speed_hz = rate;
        bs->fifo = (u8 __iomem *)(bs->regs + HSSPI_FIFO_REG(0));
        bs->wait_mode = HSSPI_WAIT_MODE_POLLING;
+       bs->prepend_buf = devm_kzalloc(dev, HSSPI_BUFFER_LEN, GFP_KERNEL);
+       if (!bs->prepend_buf) {
+               ret = -ENOMEM;
+               goto out_put_master;
+       }
 
        mutex_init(&bs->bus_mutex);
        mutex_init(&bs->msg_mutex);
        master->num_chipselect = num_cs;
        master->setup = bcm63xx_hsspi_setup;
        master->transfer_one_message = bcm63xx_hsspi_transfer_one;
+       master->max_transfer_size = bcm63xx_hsspi_max_message_size;
+       master->max_message_size = bcm63xx_hsspi_max_message_size;
+
        master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH |
                            SPI_RX_DUAL | SPI_TX_DUAL;
        master->bits_per_word_mask = SPI_BPW_MASK(8);