const void                              *tx;
        void                                    *rx;
        u16                                     tx_cmd;
-       u8                                      bits_per_word;
-       u8                                      bytes_per_word;
        const struct fsl_dspi_devtype_data      *devtype_data;
 
        wait_queue_head_t                       waitq;
 
        struct fsl_dspi_dma                     *dma;
 
+       int                                     oper_word_size;
+       int                                     oper_bits_per_word;
+
        int                                     words_in_flight;
+
+       void (*host_to_dev)(struct fsl_dspi *dspi, u32 *txdata);
+       void (*dev_to_host)(struct fsl_dspi *dspi, u32 rxdata);
 };
 
+static void dspi_native_host_to_dev(struct fsl_dspi *dspi, u32 *txdata)
+{ /* Pop one oper_word_size-byte word from the TX buffer into *txdata, unconverted */
+       memcpy(txdata, dspi->tx, dspi->oper_word_size); /* writes only the first oper_word_size bytes of *txdata */
+       dspi->tx += dspi->oper_word_size; /* step past the bytes just consumed */
+}
+
+static void dspi_native_dev_to_host(struct fsl_dspi *dspi, u32 rxdata)
+{ /* Append the first oper_word_size bytes of rxdata to the RX buffer, unconverted */
+       memcpy(dspi->rx, &rxdata, dspi->oper_word_size); /* raw copy of rxdata's in-memory bytes */
+       dspi->rx += dspi->oper_word_size; /* step past the bytes just stored */
+}
+
+static void dspi_8on32_host_to_dev(struct fsl_dspi *dspi, u32 *txdata)
+{ /* Pack four TX buffer bytes into one 32-bit word (used for 8-bit words sent as 32-bit frames) */
+       *txdata = cpu_to_be32(*(u32 *)dspi->tx); /* NOTE(review): cast-based u32 load; confirm a misaligned tx is tolerated */
+       dspi->tx += sizeof(u32); /* four buffer bytes consumed */
+}
+
+static void dspi_8on32_dev_to_host(struct fsl_dspi *dspi, u32 rxdata)
+{ /* Unpack one received 32-bit word into four RX buffer bytes */
+       *(u32 *)dspi->rx = be32_to_cpu(rxdata); /* inverse conversion of the 8on32 TX helper; NOTE(review): u32 store, confirm rx alignment */
+       dspi->rx += sizeof(u32); /* four buffer bytes produced */
+}
+
+static void dspi_8on16_host_to_dev(struct fsl_dspi *dspi, u32 *txdata)
+{ /* Pack two TX buffer bytes into one 16-bit word (used for 8-bit words sent as 16-bit frames) */
+       *txdata = cpu_to_be16(*(u16 *)dspi->tx); /* 16-bit result is widened into the u32 *txdata */
+       dspi->tx += sizeof(u16); /* two buffer bytes consumed */
+}
+
+static void dspi_8on16_dev_to_host(struct fsl_dspi *dspi, u32 rxdata)
+{ /* Unpack one received 16-bit word into two RX buffer bytes */
+       *(u16 *)dspi->rx = be16_to_cpu(rxdata); /* stores a u16, so only 16 bits of rxdata reach the buffer */
+       dspi->rx += sizeof(u16); /* two buffer bytes produced */
+}
+
+static void dspi_16on32_host_to_dev(struct fsl_dspi *dspi, u32 *txdata)
+{ /* Pack two consecutive 16-bit TX words into one 32-bit word (16-bit words sent as 32-bit frames) */
+       u16 hi = *(u16 *)dspi->tx; /* first 16-bit word in the buffer */
+       u16 lo = *(u16 *)(dspi->tx + 2); /* second 16-bit word, two bytes further on */
+
+       *txdata = (u32)hi << 16 | lo; /* first word in bits 31..16, second word in bits 15..0 */
+       dspi->tx += sizeof(u32); /* both 16-bit words consumed */
+}
+
+static void dspi_16on32_dev_to_host(struct fsl_dspi *dspi, u32 rxdata)
+{ /* Split one received 32-bit word into two 16-bit RX words (mirror of dspi_16on32_host_to_dev) */
+       u16 hi = rxdata & 0xffff; /* note: despite the name, this is the LOWER 16 bits of rxdata */
+       u16 lo = rxdata >> 16; /* note: despite the name, this is the UPPER 16 bits of rxdata */
+
+       *(u16 *)dspi->rx = lo; /* upper half of rxdata becomes the first 16-bit word in the buffer */
+       *(u16 *)(dspi->rx + 2) = hi; /* lower half of rxdata becomes the second 16-bit word */
+       dspi->rx += sizeof(u32); /* both 16-bit words produced */
+}
+
 /*
  * Pop one word from the TX buffer for pushing into the
  * PUSHR register (TX FIFO)
 {
        u32 txdata = 0;
 
-       if (dspi->tx) {
-               memcpy(&txdata, dspi->tx, dspi->bytes_per_word);
-               dspi->tx += dspi->bytes_per_word;
-       }
-       dspi->len -= dspi->bytes_per_word;
+       if (dspi->tx)
+               dspi->host_to_dev(dspi, &txdata);
+       dspi->len -= dspi->oper_word_size;
        return txdata;
 }
 
 {
        if (!dspi->rx)
                return;
-
-       memcpy(dspi->rx, &rxdata, dspi->bytes_per_word);
-       dspi->rx += dspi->bytes_per_word;
+       dspi->dev_to_host(dspi, rxdata);
 }
 
 static void dspi_tx_dma_callback(void *arg)
                           dspi->devtype_data->fifo_size;
        while (curr_remaining_bytes) {
                /* Check if current transfer fits the DMA buffer */
-               dma->curr_xfer_len = curr_remaining_bytes
-                       / dspi->bytes_per_word;
+               dma->curr_xfer_len = curr_remaining_bytes /
+                                    dspi->oper_word_size;
                if (dma->curr_xfer_len > bytes_per_buffer)
                        dma->curr_xfer_len = bytes_per_buffer;
 
                        goto exit;
 
                } else {
-                       const int len =
-                               dma->curr_xfer_len * dspi->bytes_per_word;
+                       const int len = dma->curr_xfer_len *
+                                       dspi->oper_word_size;
                        curr_remaining_bytes -= len;
                        message->actual_length += len;
                        if (curr_remaining_bytes < 0)
         * generate a new PUSHR command with the final word that will have PCS
         * deasserted (not continued) here.
         */
-       if (dspi->len > dspi->bytes_per_word)
+       if (dspi->len > dspi->oper_word_size)
                cmd |= SPI_PUSHR_CMD_CONT;
        regmap_write(dspi->regmap_pushr, PUSHR_CMD, cmd);
 }
 
 static void dspi_xspi_write(struct fsl_dspi *dspi, int cnt)
 {
+       /* Update CTARE */
        regmap_write(dspi->regmap, SPI_CTARE(0),
-                    SPI_FRAME_EBITS(dspi->bits_per_word) |
+                    SPI_FRAME_EBITS(dspi->oper_bits_per_word) |
                     SPI_CTARE_DTCP(cnt));
 
        /*
                u32 data = dspi_pop_tx(dspi);
 
                dspi_pushr_txdata_write(dspi, data & 0xFFFF);
-               if (dspi->bits_per_word > 16)
+               if (dspi->oper_bits_per_word > 16)
                        dspi_pushr_txdata_write(dspi, data >> 16);
        }
 }
        int bytes_in_flight;
 
        /* In XSPI mode each 32-bit word occupies 2 TX FIFO entries */
-       if (dspi->bits_per_word > 16)
+       if (dspi->oper_word_size == 4)
                num_fifo_entries /= 2;
 
-       dspi->words_in_flight = dspi->len / dspi->bytes_per_word;
+       /*
+        * Integer division intentionally trims off odd (or non-multiple of 4)
+        * numbers of bytes at the end of the buffer, which will be sent next
+        * time using a smaller oper_word_size.
+        */
+       dspi->words_in_flight = dspi->len / dspi->oper_word_size;
 
        if (dspi->words_in_flight > num_fifo_entries)
                dspi->words_in_flight = num_fifo_entries;
 
-       bytes_in_flight = dspi->words_in_flight * dspi->bytes_per_word;
+       bytes_in_flight = dspi->words_in_flight * dspi->oper_word_size;
 
        /*
         * If the PCS needs to de-assert (i.e. we're at the end of the buffer
        while (dspi->len && num_fifo_entries--) {
                dspi->tx_cmd = xfer_cmd;
                /* Request EOQF for last transfer in FIFO */
-               if (dspi->len == dspi->bytes_per_word || num_fifo_entries == 0)
+               if (dspi->len == dspi->oper_word_size || num_fifo_entries == 0)
                        dspi->tx_cmd |= SPI_PUSHR_CMD_EOQ;
                /* Write combined TX FIFO and CMD FIFO entry */
                dspi_pushr_write(dspi);
                dspi_push_rx(dspi, dspi_popr_read(dspi));
 }
 
+static void dspi_setup_accel(struct fsl_dspi *dspi)
+{ /* Choose oper_bits_per_word and the TX/RX conversion callbacks from dspi->len, then program CTAR */
+       struct spi_transfer *xfer = dspi->cur_transfer;
+
+       /* Start off with maximum supported by hardware */
+       if (dspi->devtype_data->trans_mode == DSPI_XSPI_MODE)
+               dspi->oper_bits_per_word = 32;
+       else
+               dspi->oper_bits_per_word = 16;
+
+       /* And go down only if the buffer can't be sent with words this big */
+       do {
+               if (dspi->len >= DIV_ROUND_UP(dspi->oper_bits_per_word, 8)) /* at least one full word remains */
+                       break;
+
+               dspi->oper_bits_per_word /= 2; /* halve the candidate size; the loop stops once it reaches 8 */
+       } while (dspi->oper_bits_per_word > 8);
+
+       if (xfer->bits_per_word == 8 && dspi->oper_bits_per_word == 32) { /* four 8-bit words per frame */
+               dspi->dev_to_host = dspi_8on32_dev_to_host;
+               dspi->host_to_dev = dspi_8on32_host_to_dev;
+       } else if (xfer->bits_per_word == 8 && dspi->oper_bits_per_word == 16) { /* two 8-bit words per frame */
+               dspi->dev_to_host = dspi_8on16_dev_to_host;
+               dspi->host_to_dev = dspi_8on16_host_to_dev;
+       } else if (xfer->bits_per_word == 16 && dspi->oper_bits_per_word == 32) { /* two 16-bit words per frame */
+               dspi->dev_to_host = dspi_16on32_dev_to_host;
+               dspi->host_to_dev = dspi_16on32_host_to_dev;
+       } else {
+               /* No acceleration needed (8<N<=16 on 16, 16<N<=32 on 32) */
+               dspi->dev_to_host = dspi_native_dev_to_host;
+               dspi->host_to_dev = dspi_native_host_to_dev;
+               dspi->oper_bits_per_word = xfer->bits_per_word; /* every other combination runs at the transfer's own word size */
+       }
+       /* NOTE(review): the DMA code also reads oper_word_size; confirm it is assigned on that path too */
+       dspi->oper_word_size = DIV_ROUND_UP(dspi->oper_bits_per_word, 8); /* bytes per frame, rounded up */
+
+       /*
+        * Update CTAR here (code is common for both EOQ and XSPI modes).
+        * We will update CTARE in the portion specific to XSPI, when we
+        * also know the preload value (DTCP).
+        */
+       regmap_write(dspi->regmap, SPI_CTAR(0),
+                    dspi->cur_chip->ctar_val |
+                    SPI_FRAME_BITS(dspi->oper_bits_per_word));
+}
+
 static void dspi_fifo_write(struct fsl_dspi *dspi)
 {
+       dspi_setup_accel(dspi);
+
        if (dspi->devtype_data->trans_mode == DSPI_EOQ_MODE)
                dspi_eoq_fifo_write(dspi);
        else
        int bytes_sent;
 
        /* Update total number of bytes that were transferred */
-       bytes_sent = dspi->words_in_flight * dspi->bytes_per_word;
+       bytes_sent = dspi->words_in_flight * dspi->oper_word_size;
        msg->actual_length += bytes_sent;
        dspi->progress += bytes_sent / DIV_ROUND_UP(xfer->bits_per_word, 8);
 
                dspi->rx = transfer->rx_buf;
                dspi->len = transfer->len;
                dspi->progress = 0;
-               /* Validated transfer specific frame size (defaults applied) */
-               dspi->bits_per_word = transfer->bits_per_word;
-               dspi->bytes_per_word = DIV_ROUND_UP(dspi->bits_per_word, 8);
 
                regmap_update_bits(dspi->regmap, SPI_MCR,
                                   SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF,
                                   SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF);
+               /*
+                * Static CTAR setup for modes that don't dynamically adjust it
+                * via dspi_setup_accel() (i.e. for DMA mode)
+                */
                regmap_write(dspi->regmap, SPI_CTAR(0),
                             dspi->cur_chip->ctar_val |
                             SPI_FRAME_BITS(transfer->bits_per_word));