--- /dev/null
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail in the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+
+       rounds          .req    x11
+       bskey           .req    x12
+
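+       /*
+        * Input and output linear layers of the bit-sliced S-box: the
+        * forward and inverse S-boxes share the GF(2^8) inversion below and
+        * differ only in these basis change steps.
+        */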
+       .macro          in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+       eor             \b2, \b2, \b1
+       eor             \b5, \b5, \b6
+       eor             \b3, \b3, \b0
+       eor             \b6, \b6, \b2
+       eor             \b5, \b5, \b0
+       eor             \b6, \b6, \b3
+       eor             \b3, \b3, \b7
+       eor             \b7, \b7, \b5
+       eor             \b3, \b3, \b4
+       eor             \b4, \b4, \b5
+       eor             \b2, \b2, \b7
+       eor             \b3, \b3, \b1
+       eor             \b1, \b1, \b5
+       .endm
+
+       .macro          out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+       eor             \b0, \b0, \b6
+       eor             \b1, \b1, \b4
+       eor             \b4, \b4, \b6
+       eor             \b2, \b2, \b0
+       eor             \b6, \b6, \b1
+       eor             \b1, \b1, \b5
+       eor             \b5, \b5, \b3
+       eor             \b3, \b3, \b7
+       eor             \b7, \b7, \b5
+       eor             \b2, \b2, \b5
+       eor             \b4, \b4, \b7
+       .endm
+
+       .macro          inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+       eor             \b1, \b1, \b7
+       eor             \b4, \b4, \b7
+       eor             \b7, \b7, \b5
+       eor             \b1, \b1, \b3
+       eor             \b2, \b2, \b5
+       eor             \b3, \b3, \b7
+       eor             \b6, \b6, \b1
+       eor             \b2, \b2, \b0
+       eor             \b5, \b5, \b3
+       eor             \b4, \b4, \b6
+       eor             \b0, \b0, \b6
+       eor             \b1, \b1, \b4
+       .endm
+
+       .macro          inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+       eor             \b1, \b1, \b5
+       eor             \b2, \b2, \b7
+       eor             \b3, \b3, \b1
+       eor             \b4, \b4, \b5
+       eor             \b7, \b7, \b5
+       eor             \b3, \b3, \b4
+       eor             \b5, \b5, \b0
+       eor             \b3, \b3, \b7
+       eor             \b6, \b6, \b2
+       eor             \b2, \b2, \b1
+       eor             \b6, \b6, \b3
+       eor             \b3, \b3, \b0
+       eor             \b5, \b5, \b6
+       .endm
+
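+       /*
+        * GF(2^4)/GF(2^8) multiplication helpers and the GF(2^8) inversion
+        * at the heart of the S-box, expressed entirely in terms of bitwise
+        * operations on the eight bit slices.
+        */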
+       .macro          mul_gf4, x0, x1, y0, y1, t0, t1
+       eor             \t0, \y0, \y1
+       and             \t0, \t0, \x0
+       eor             \x0, \x0, \x1
+       and             \t1, \x1, \y0
+       and             \x0, \x0, \y1
+       eor             \x1, \t1, \t0
+       eor             \x0, \x0, \t1
+       .endm
+
+       .macro          mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+       eor             \t0, \y0, \y1
+       eor             \t1, \y2, \y3
+       and             \t0, \t0, \x0
+       and             \t1, \t1, \x2
+       eor             \x0, \x0, \x1
+       eor             \x2, \x2, \x3
+       and             \x1, \x1, \y0
+       and             \x3, \x3, \y2
+       and             \x0, \x0, \y1
+       and             \x2, \x2, \y3
+       eor             \x1, \x1, \x0
+       eor             \x2, \x2, \x3
+       eor             \x0, \x0, \t0
+       eor             \x3, \x3, \t1
+       .endm
+
+       .macro          mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+                                   y0, y1, y2, y3, t0, t1, t2, t3
+       eor             \t0, \x0, \x2
+       eor             \t1, \x1, \x3
+       mul_gf4         \x0, \x1, \y0, \y1, \t2, \t3
+       eor             \y0, \y0, \y2
+       eor             \y1, \y1, \y3
+       mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+       eor             \x0, \x0, \t0
+       eor             \x2, \x2, \t0
+       eor             \x1, \x1, \t1
+       eor             \x3, \x3, \t1
+       eor             \t0, \x4, \x6
+       eor             \t1, \x5, \x7
+       mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+       eor             \y0, \y0, \y2
+       eor             \y1, \y1, \y3
+       mul_gf4         \x4, \x5, \y0, \y1, \t2, \t3
+       eor             \x4, \x4, \t0
+       eor             \x6, \x6, \t0
+       eor             \x5, \x5, \t1
+       eor             \x7, \x7, \t1
+       .endm
+
+       .macro          inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+                                  t0, t1, t2, t3, s0, s1, s2, s3
+       eor             \t3, \x4, \x6
+       eor             \t0, \x5, \x7
+       eor             \t1, \x1, \x3
+       eor             \s1, \x7, \x6
+       eor             \s0, \x0, \x2
+       eor             \s3, \t3, \t0
+       orr             \t2, \t0, \t1
+       and             \s2, \t3, \s0
+       orr             \t3, \t3, \s0
+       eor             \s0, \s0, \t1
+       and             \t0, \t0, \t1
+       eor             \t1, \x3, \x2
+       and             \s3, \s3, \s0
+       and             \s1, \s1, \t1
+       eor             \t1, \x4, \x5
+       eor             \s0, \x1, \x0
+       eor             \t3, \t3, \s1
+       eor             \t2, \t2, \s1
+       and             \s1, \t1, \s0
+       orr             \t1, \t1, \s0
+       eor             \t3, \t3, \s3
+       eor             \t0, \t0, \s1
+       eor             \t2, \t2, \s2
+       eor             \t1, \t1, \s3
+       eor             \t0, \t0, \s2
+       and             \s0, \x7, \x3
+       eor             \t1, \t1, \s2
+       and             \s1, \x6, \x2
+       and             \s2, \x5, \x1
+       orr             \s3, \x4, \x0
+       eor             \t3, \t3, \s0
+       eor             \t1, \t1, \s2
+       eor             \s0, \t0, \s3
+       eor             \t2, \t2, \s1
+       and             \s2, \t3, \t1
+       eor             \s1, \t2, \s2
+       eor             \s3, \s0, \s2
+       bsl             \s1, \t1, \s0
+       not             \t0, \s0
+       bsl             \s0, \s1, \s3
+       bsl             \t0, \s1, \s3
+       bsl             \s3, \t3, \t2
+       eor             \t3, \t3, \t2
+       and             \s2, \s0, \s3
+       eor             \t1, \t1, \t0
+       eor             \s2, \s2, \t3
+       mul_gf16_2      \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+                       \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+       .endm
+
+       .macro          sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+                             t0, t1, t2, t3, s0, s1, s2, s3
+       in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+                       \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+       inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
+                       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+                       \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+                       \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+       out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+                       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
+       .endm
+
+       .macro          inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+                                 t0, t1, t2, t3, s0, s1, s2, s3
+       inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+                       \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+       inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
+                       \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+                       \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+                       \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+       inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+                       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
+       .endm
+
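+       /*
+        * Load the next bit-sliced round key (eight 128-bit vectors) into
+        * v16-v23, moving forwards (encryption) or backwards (decryption)
+        * through the converted key schedule.
+        */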
+       .macro          enc_next_rk
+       ldp             q16, q17, [bskey], #128
+       ldp             q18, q19, [bskey, #-96]
+       ldp             q20, q21, [bskey, #-64]
+       ldp             q22, q23, [bskey, #-32]
+       .endm
+
+       .macro          dec_next_rk
+       ldp             q16, q17, [bskey, #-128]!
+       ldp             q18, q19, [bskey, #32]
+       ldp             q20, q21, [bskey, #64]
+       ldp             q22, q23, [bskey, #96]
+       .endm
+
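+       /* XOR the bit-sliced round key in v16-v23 into the state slices */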
+       .macro          add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
+       eor             \x0\().16b, \x0\().16b, v16.16b
+       eor             \x1\().16b, \x1\().16b, v17.16b
+       eor             \x2\().16b, \x2\().16b, v18.16b
+       eor             \x3\().16b, \x3\().16b, v19.16b
+       eor             \x4\().16b, \x4\().16b, v20.16b
+       eor             \x5\().16b, \x5\().16b, v21.16b
+       eor             \x6\().16b, \x6\().16b, v22.16b
+       eor             \x7\().16b, \x7\().16b, v23.16b
+       .endm
+
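+       /* apply (Inv)ShiftRows as a byte permutation using TBL with the given mask */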
+       .macro          shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
+       tbl             \x0\().16b, {\x0\().16b}, \mask\().16b
+       tbl             \x1\().16b, {\x1\().16b}, \mask\().16b
+       tbl             \x2\().16b, {\x2\().16b}, \mask\().16b
+       tbl             \x3\().16b, {\x3\().16b}, \mask\().16b
+       tbl             \x4\().16b, {\x4\().16b}, \mask\().16b
+       tbl             \x5\().16b, {\x5\().16b}, \mask\().16b
+       tbl             \x6\().16b, {\x6\().16b}, \mask\().16b
+       tbl             \x7\().16b, {\x7\().16b}, \mask\().16b
+       .endm
+
+       .macro          mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+                                 t0, t1, t2, t3, t4, t5, t6, t7, inv
+       ext             \t0\().16b, \x0\().16b, \x0\().16b, #12
+       ext             \t1\().16b, \x1\().16b, \x1\().16b, #12
+       eor             \x0\().16b, \x0\().16b, \t0\().16b
+       ext             \t2\().16b, \x2\().16b, \x2\().16b, #12
+       eor             \x1\().16b, \x1\().16b, \t1\().16b
+       ext             \t3\().16b, \x3\().16b, \x3\().16b, #12
+       eor             \x2\().16b, \x2\().16b, \t2\().16b
+       ext             \t4\().16b, \x4\().16b, \x4\().16b, #12
+       eor             \x3\().16b, \x3\().16b, \t3\().16b
+       ext             \t5\().16b, \x5\().16b, \x5\().16b, #12
+       eor             \x4\().16b, \x4\().16b, \t4\().16b
+       ext             \t6\().16b, \x6\().16b, \x6\().16b, #12
+       eor             \x5\().16b, \x5\().16b, \t5\().16b
+       ext             \t7\().16b, \x7\().16b, \x7\().16b, #12
+       eor             \x6\().16b, \x6\().16b, \t6\().16b
+       eor             \t1\().16b, \t1\().16b, \x0\().16b
+       eor             \x7\().16b, \x7\().16b, \t7\().16b
+       ext             \x0\().16b, \x0\().16b, \x0\().16b, #8
+       eor             \t2\().16b, \t2\().16b, \x1\().16b
+       eor             \t0\().16b, \t0\().16b, \x7\().16b
+       eor             \t1\().16b, \t1\().16b, \x7\().16b
+       ext             \x1\().16b, \x1\().16b, \x1\().16b, #8
+       eor             \t5\().16b, \t5\().16b, \x4\().16b
+       eor             \x0\().16b, \x0\().16b, \t0\().16b
+       eor             \t6\().16b, \t6\().16b, \x5\().16b
+       eor             \x1\().16b, \x1\().16b, \t1\().16b
+       ext             \t0\().16b, \x4\().16b, \x4\().16b, #8
+       eor             \t4\().16b, \t4\().16b, \x3\().16b
+       ext             \t1\().16b, \x5\().16b, \x5\().16b, #8
+       eor             \t7\().16b, \t7\().16b, \x6\().16b
+       ext             \x4\().16b, \x3\().16b, \x3\().16b, #8
+       eor             \t3\().16b, \t3\().16b, \x2\().16b
+       ext             \x5\().16b, \x7\().16b, \x7\().16b, #8
+       eor             \t4\().16b, \t4\().16b, \x7\().16b
+       ext             \x3\().16b, \x6\().16b, \x6\().16b, #8
+       eor             \t3\().16b, \t3\().16b, \x7\().16b
+       ext             \x6\().16b, \x2\().16b, \x2\().16b, #8
+       eor             \x7\().16b, \t1\().16b, \t5\().16b
+       .ifb            \inv
+       eor             \x2\().16b, \t0\().16b, \t4\().16b
+       eor             \x4\().16b, \x4\().16b, \t3\().16b
+       eor             \x5\().16b, \x5\().16b, \t7\().16b
+       eor             \x3\().16b, \x3\().16b, \t6\().16b
+       eor             \x6\().16b, \x6\().16b, \t2\().16b
+       .else
+       eor             \t3\().16b, \t3\().16b, \x4\().16b
+       eor             \x5\().16b, \x5\().16b, \t7\().16b
+       eor             \x2\().16b, \x3\().16b, \t6\().16b
+       eor             \x3\().16b, \t0\().16b, \t4\().16b
+       eor             \x4\().16b, \x6\().16b, \t2\().16b
+       mov             \x6\().16b, \t3\().16b
+       .endif
+       .endm
+
+       .macro          inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+                                     t0, t1, t2, t3, t4, t5, t6, t7
+       ext             \t0\().16b, \x0\().16b, \x0\().16b, #8
+       ext             \t6\().16b, \x6\().16b, \x6\().16b, #8
+       ext             \t7\().16b, \x7\().16b, \x7\().16b, #8
+       eor             \t0\().16b, \t0\().16b, \x0\().16b
+       ext             \t1\().16b, \x1\().16b, \x1\().16b, #8
+       eor             \t6\().16b, \t6\().16b, \x6\().16b
+       ext             \t2\().16b, \x2\().16b, \x2\().16b, #8
+       eor             \t7\().16b, \t7\().16b, \x7\().16b
+       ext             \t3\().16b, \x3\().16b, \x3\().16b, #8
+       eor             \t1\().16b, \t1\().16b, \x1\().16b
+       ext             \t4\().16b, \x4\().16b, \x4\().16b, #8
+       eor             \t2\().16b, \t2\().16b, \x2\().16b
+       ext             \t5\().16b, \x5\().16b, \x5\().16b, #8
+       eor             \t3\().16b, \t3\().16b, \x3\().16b
+       eor             \t4\().16b, \t4\().16b, \x4\().16b
+       eor             \t5\().16b, \t5\().16b, \x5\().16b
+       eor             \x0\().16b, \x0\().16b, \t6\().16b
+       eor             \x1\().16b, \x1\().16b, \t6\().16b
+       eor             \x2\().16b, \x2\().16b, \t0\().16b
+       eor             \x4\().16b, \x4\().16b, \t2\().16b
+       eor             \x3\().16b, \x3\().16b, \t1\().16b
+       eor             \x1\().16b, \x1\().16b, \t7\().16b
+       eor             \x2\().16b, \x2\().16b, \t7\().16b
+       eor             \x4\().16b, \x4\().16b, \t6\().16b
+       eor             \x5\().16b, \x5\().16b, \t3\().16b
+       eor             \x3\().16b, \x3\().16b, \t6\().16b
+       eor             \x6\().16b, \x6\().16b, \t4\().16b
+       eor             \x4\().16b, \x4\().16b, \t7\().16b
+       eor             \x5\().16b, \x5\().16b, \t7\().16b
+       eor             \x7\().16b, \x7\().16b, \t5\().16b
+       mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+                       \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+       .endm
+
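+       /*
+        * SWAPMOVE style bit matrix transposition, used both to convert
+        * eight AES blocks into bit-sliced representation and to convert
+        * them back on the way out.
+        */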
+       .macro          swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+       ushr            \t0\().2d, \b0\().2d, #\n
+       ushr            \t1\().2d, \b1\().2d, #\n
+       eor             \t0\().16b, \t0\().16b, \a0\().16b
+       eor             \t1\().16b, \t1\().16b, \a1\().16b
+       and             \t0\().16b, \t0\().16b, \mask\().16b
+       and             \t1\().16b, \t1\().16b, \mask\().16b
+       eor             \a0\().16b, \a0\().16b, \t0\().16b
+       shl             \t0\().2d, \t0\().2d, #\n
+       eor             \a1\().16b, \a1\().16b, \t1\().16b
+       shl             \t1\().2d, \t1\().2d, #\n
+       eor             \b0\().16b, \b0\().16b, \t0\().16b
+       eor             \b1\().16b, \b1\().16b, \t1\().16b
+       .endm
+
+       .macro          bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+       movi            \t0\().16b, #0x55
+       movi            \t1\().16b, #0x33
+       swapmove_2x     \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+       swapmove_2x     \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+       movi            \t0\().16b, #0x0f
+       swapmove_2x     \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+       swapmove_2x     \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+       swapmove_2x     \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+       swapmove_2x     \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+       .endm
+
+
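+       /*
+        * TBL permutation masks: M0 reorders the input bytes for
+        * bit-slicing, SR/ISR implement ShiftRows/InvShiftRows, and
+        * M0SR/SRM0/M0ISR/ISRM0 are the combined permutations applied when
+        * entering and leaving the bit-sliced domain.
+        */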
+       .align          6
+M0:    .octa           0x0004080c0105090d02060a0e03070b0f
+
+M0SR:  .octa           0x0004080c05090d010a0e02060f03070b
+SR:    .octa           0x0f0e0d0c0a09080b0504070600030201
+SRM0:  .octa           0x01060b0c0207080d0304090e00050a0f
+
+M0ISR: .octa           0x0004080c0d0105090a0e0206070b0f03
+ISR:   .octa           0x0f0e0d0c080b0a090504070602010003
+ISRM0: .octa           0x0306090c00070a0d01040b0e0205080f
+
+       /*
+        * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
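+        *
+        * Convert an expanded AES key schedule into the bit-sliced format
+        * used by aesbs_encrypt8/aesbs_decrypt8: the round 0 key is copied
+        * as is, each intermediate round key is expanded into eight bit-mask
+        * vectors (with slices 0, 1, 5 and 6 inverted), and the final round
+        * key is stored with 0x63 xor'ed into each byte.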
+        */
+ENTRY(aesbs_convert_key)
+       ld1             {v7.4s}, [x1], #16              // load round 0 key
+       ld1             {v17.4s}, [x1], #16             // load round 1 key
+
+       movi            v8.16b,  #0x01                  // bit masks
+       movi            v9.16b,  #0x02
+       movi            v10.16b, #0x04
+       movi            v11.16b, #0x08
+       movi            v12.16b, #0x10
+       movi            v13.16b, #0x20
+       movi            v14.16b, #0x40
+       movi            v15.16b, #0x80
+       ldr             q16, M0
+
+       sub             x2, x2, #1
+       str             q7, [x0], #16           // save round 0 key
+
+.Lkey_loop:
+       tbl             v7.16b, {v17.16b}, v16.16b
+       ld1             {v17.4s}, [x1], #16             // load next round key
+
+       cmtst           v0.16b, v7.16b, v8.16b
+       cmtst           v1.16b, v7.16b, v9.16b
+       cmtst           v2.16b, v7.16b, v10.16b
+       cmtst           v3.16b, v7.16b, v11.16b
+       cmtst           v4.16b, v7.16b, v12.16b
+       cmtst           v5.16b, v7.16b, v13.16b
+       cmtst           v6.16b, v7.16b, v14.16b
+       cmtst           v7.16b, v7.16b, v15.16b
+       not             v0.16b, v0.16b
+       not             v1.16b, v1.16b
+       not             v5.16b, v5.16b
+       not             v6.16b, v6.16b
+
+       subs            x2, x2, #1
+       stp             q0, q1, [x0], #128
+       stp             q2, q3, [x0, #-96]
+       stp             q4, q5, [x0, #-64]
+       stp             q6, q7, [x0, #-32]
+       b.ne            .Lkey_loop
+
+       movi            v7.16b, #0x63                   // compose .L63
+       eor             v17.16b, v17.16b, v7.16b
+       str             q17, [x0]
+       ret
+ENDPROC(aesbs_convert_key)
+
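+       /*
+        * aesbs_encrypt8/aesbs_decrypt8: encrypt or decrypt the eight AES
+        * blocks passed in v0-v7 using the bit-sliced key schedule at bskey.
+        * The results are returned in a permuted register order (reflected
+        * in the operand lists of the callers below); v8-v24 are clobbered.
+        */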
+       .align          4
+aesbs_encrypt8:
+       ldr             q9, [bskey], #16                // round 0 key
+       ldr             q8, M0SR
+       ldr             q24, SR
+
+       eor             v10.16b, v0.16b, v9.16b         // xor with round0 key
+       eor             v11.16b, v1.16b, v9.16b
+       tbl             v0.16b, {v10.16b}, v8.16b
+       eor             v12.16b, v2.16b, v9.16b
+       tbl             v1.16b, {v11.16b}, v8.16b
+       eor             v13.16b, v3.16b, v9.16b
+       tbl             v2.16b, {v12.16b}, v8.16b
+       eor             v14.16b, v4.16b, v9.16b
+       tbl             v3.16b, {v13.16b}, v8.16b
+       eor             v15.16b, v5.16b, v9.16b
+       tbl             v4.16b, {v14.16b}, v8.16b
+       eor             v10.16b, v6.16b, v9.16b
+       tbl             v5.16b, {v15.16b}, v8.16b
+       eor             v11.16b, v7.16b, v9.16b
+       tbl             v6.16b, {v10.16b}, v8.16b
+       tbl             v7.16b, {v11.16b}, v8.16b
+
+       bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+       sub             rounds, rounds, #1
+       b               .Lenc_sbox
+
+.Lenc_loop:
+       shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Lenc_sbox:
+       sbox            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+                                                               v13, v14, v15
+       subs            rounds, rounds, #1
+       b.cc            .Lenc_done
+
+       enc_next_rk
+
+       mix_cols        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
+                                                               v13, v14, v15
+
+       add_round_key   v0, v1, v2, v3, v4, v5, v6, v7
+
+       b.ne            .Lenc_loop
+       ldr             q24, SRM0
+       b               .Lenc_loop
+
+.Lenc_done:
+       ldr             q12, [bskey]                    // last round key
+
+       bitslice        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
+
+       eor             v0.16b, v0.16b, v12.16b
+       eor             v1.16b, v1.16b, v12.16b
+       eor             v4.16b, v4.16b, v12.16b
+       eor             v6.16b, v6.16b, v12.16b
+       eor             v3.16b, v3.16b, v12.16b
+       eor             v7.16b, v7.16b, v12.16b
+       eor             v2.16b, v2.16b, v12.16b
+       eor             v5.16b, v5.16b, v12.16b
+       ret
+ENDPROC(aesbs_encrypt8)
+
+       .align          4
+aesbs_decrypt8:
+       lsl             x9, rounds, #7
+       add             bskey, bskey, x9
+
+       ldr             q9, [bskey, #-112]!             // round 0 key
+       ldr             q8, M0ISR
+       ldr             q24, ISR
+
+       eor             v10.16b, v0.16b, v9.16b         // xor with round0 key
+       eor             v11.16b, v1.16b, v9.16b
+       tbl             v0.16b, {v10.16b}, v8.16b
+       eor             v12.16b, v2.16b, v9.16b
+       tbl             v1.16b, {v11.16b}, v8.16b
+       eor             v13.16b, v3.16b, v9.16b
+       tbl             v2.16b, {v12.16b}, v8.16b
+       eor             v14.16b, v4.16b, v9.16b
+       tbl             v3.16b, {v13.16b}, v8.16b
+       eor             v15.16b, v5.16b, v9.16b
+       tbl             v4.16b, {v14.16b}, v8.16b
+       eor             v10.16b, v6.16b, v9.16b
+       tbl             v5.16b, {v15.16b}, v8.16b
+       eor             v11.16b, v7.16b, v9.16b
+       tbl             v6.16b, {v10.16b}, v8.16b
+       tbl             v7.16b, {v11.16b}, v8.16b
+
+       bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+       sub             rounds, rounds, #1
+       b               .Ldec_sbox
+
+.Ldec_loop:
+       shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Ldec_sbox:
+       inv_sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+                                                               v13, v14, v15
+       subs            rounds, rounds, #1
+       b.cc            .Ldec_done
+
+       dec_next_rk
+
+       add_round_key   v0, v1, v6, v4, v2, v7, v3, v5
+
+       inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
+                                                               v13, v14, v15
+
+       b.ne            .Ldec_loop
+       ldr             q24, ISRM0
+       b               .Ldec_loop
+.Ldec_done:
+       ldr             q12, [bskey, #-16]              // last round key
+
+       bitslice        v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
+
+       eor             v0.16b, v0.16b, v12.16b
+       eor             v1.16b, v1.16b, v12.16b
+       eor             v6.16b, v6.16b, v12.16b
+       eor             v4.16b, v4.16b, v12.16b
+       eor             v2.16b, v2.16b, v12.16b
+       eor             v7.16b, v7.16b, v12.16b
+       eor             v3.16b, v3.16b, v12.16b
+       eor             v5.16b, v5.16b, v12.16b
+       ret
+ENDPROC(aesbs_decrypt8)
+
+       /*
+        * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                   int blocks)
+        * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                   int blocks)
+        */
+       .macro          __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+
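+       /*
+        * If fewer than eight blocks remain, set x5 to (1 << remaining) so
+        * that the tbnz instructions below cut the load and store sequences
+        * short; otherwise x5 is cleared and all eight blocks are handled.
+        */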
+99:    mov             x5, #1
+       lsl             x5, x5, x4
+       subs            w4, w4, #8
+       csel            x4, x4, xzr, pl
+       csel            x5, x5, xzr, mi
+
+       ld1             {v0.16b}, [x1], #16
+       tbnz            x5, #1, 0f
+       ld1             {v1.16b}, [x1], #16
+       tbnz            x5, #2, 0f
+       ld1             {v2.16b}, [x1], #16
+       tbnz            x5, #3, 0f
+       ld1             {v3.16b}, [x1], #16
+       tbnz            x5, #4, 0f
+       ld1             {v4.16b}, [x1], #16
+       tbnz            x5, #5, 0f
+       ld1             {v5.16b}, [x1], #16
+       tbnz            x5, #6, 0f
+       ld1             {v6.16b}, [x1], #16
+       tbnz            x5, #7, 0f
+       ld1             {v7.16b}, [x1], #16
+
+0:     mov             bskey, x2
+       mov             rounds, x3
+       bl              \do8
+
+       st1             {\o0\().16b}, [x0], #16
+       tbnz            x5, #1, 1f
+       st1             {\o1\().16b}, [x0], #16
+       tbnz            x5, #2, 1f
+       st1             {\o2\().16b}, [x0], #16
+       tbnz            x5, #3, 1f
+       st1             {\o3\().16b}, [x0], #16
+       tbnz            x5, #4, 1f
+       st1             {\o4\().16b}, [x0], #16
+       tbnz            x5, #5, 1f
+       st1             {\o5\().16b}, [x0], #16
+       tbnz            x5, #6, 1f
+       st1             {\o6\().16b}, [x0], #16
+       tbnz            x5, #7, 1f
+       st1             {\o7\().16b}, [x0], #16
+
+       cbnz            x4, 99b
+
+1:     ldp             x29, x30, [sp], #16
+       ret
+       .endm
+
+       .align          4
+ENTRY(aesbs_ecb_encrypt)
+       __ecb_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_ecb_encrypt)
+
+       .align          4
+ENTRY(aesbs_ecb_decrypt)
+       __ecb_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_ecb_decrypt)
+
+       /*
+        * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                   int blocks, u8 iv[])
+        */
+       .align          4
+ENTRY(aesbs_cbc_decrypt)
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+
+99:    mov             x6, #1
+       lsl             x6, x6, x4
+       subs            w4, w4, #8
+       csel            x4, x4, xzr, pl
+       csel            x6, x6, xzr, mi
+
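+       /*
+        * Keep copies of the ciphertext blocks in v25-v31: each one is the
+        * chaining value XORed into the next block's output, and the last
+        * block processed becomes the new IV.
+        */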
+       ld1             {v0.16b}, [x1], #16
+       mov             v25.16b, v0.16b
+       tbnz            x6, #1, 0f
+       ld1             {v1.16b}, [x1], #16
+       mov             v26.16b, v1.16b
+       tbnz            x6, #2, 0f
+       ld1             {v2.16b}, [x1], #16
+       mov             v27.16b, v2.16b
+       tbnz            x6, #3, 0f
+       ld1             {v3.16b}, [x1], #16
+       mov             v28.16b, v3.16b
+       tbnz            x6, #4, 0f
+       ld1             {v4.16b}, [x1], #16
+       mov             v29.16b, v4.16b
+       tbnz            x6, #5, 0f
+       ld1             {v5.16b}, [x1], #16
+       mov             v30.16b, v5.16b
+       tbnz            x6, #6, 0f
+       ld1             {v6.16b}, [x1], #16
+       mov             v31.16b, v6.16b
+       tbnz            x6, #7, 0f
+       ld1             {v7.16b}, [x1]
+
+0:     mov             bskey, x2
+       mov             rounds, x3
+       bl              aesbs_decrypt8
+
+       ld1             {v24.16b}, [x5]                 // load IV
+
+       eor             v1.16b, v1.16b, v25.16b
+       eor             v6.16b, v6.16b, v26.16b
+       eor             v4.16b, v4.16b, v27.16b
+       eor             v2.16b, v2.16b, v28.16b
+       eor             v7.16b, v7.16b, v29.16b
+       eor             v0.16b, v0.16b, v24.16b
+       eor             v3.16b, v3.16b, v30.16b
+       eor             v5.16b, v5.16b, v31.16b
+
+       st1             {v0.16b}, [x0], #16
+       mov             v24.16b, v25.16b
+       tbnz            x6, #1, 1f
+       st1             {v1.16b}, [x0], #16
+       mov             v24.16b, v26.16b
+       tbnz            x6, #2, 1f
+       st1             {v6.16b}, [x0], #16
+       mov             v24.16b, v27.16b
+       tbnz            x6, #3, 1f
+       st1             {v4.16b}, [x0], #16
+       mov             v24.16b, v28.16b
+       tbnz            x6, #4, 1f
+       st1             {v2.16b}, [x0], #16
+       mov             v24.16b, v29.16b
+       tbnz            x6, #5, 1f
+       st1             {v7.16b}, [x0], #16
+       mov             v24.16b, v30.16b
+       tbnz            x6, #6, 1f
+       st1             {v3.16b}, [x0], #16
+       mov             v24.16b, v31.16b
+       tbnz            x6, #7, 1f
+       ld1             {v24.16b}, [x1], #16
+       st1             {v5.16b}, [x0], #16
+1:     st1             {v24.16b}, [x5]                 // store IV
+
+       cbnz            x4, 99b
+
+       ldp             x29, x30, [sp], #16
+       ret
+ENDPROC(aesbs_cbc_decrypt)
+
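+       /*
+        * Compute the next XTS tweak: multiply the current tweak by x in
+        * GF(2^128), i.e. shift it left by one bit and conditionally XOR in
+        * the reduction constant 0x87.
+        */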
+       .macro          next_tweak, out, in, const, tmp
+       sshr            \tmp\().2d,  \in\().2d,   #63
+       and             \tmp\().16b, \tmp\().16b, \const\().16b
+       add             \out\().2d,  \in\().2d,   \in\().2d
+       ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+       eor             \out\().16b, \out\().16b, \tmp\().16b
+       .endm
+
+       .align          4
+.Lxts_mul_x:
+CPU_LE(        .quad           1, 0x87         )
+CPU_BE(        .quad           0x87, 1         )
+
+       /*
+        * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                   int blocks, u8 iv[])
+        * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+        *                   int blocks, u8 iv[])
+        */
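+       /*
+        * __xts_crypt8: load up to eight blocks, XOR each with its tweak
+        * (tweaks #5 to #8 are spilled to the caller's stack frame) and tail
+        * call the bit-sliced encrypt/decrypt routine whose address is in x7.
+        */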
+__xts_crypt8:
+       mov             x6, #1
+       lsl             x6, x6, x4
+       subs            w4, w4, #8
+       csel            x4, x4, xzr, pl
+       csel            x6, x6, xzr, mi
+
+       ld1             {v0.16b}, [x1], #16
+       next_tweak      v26, v25, v30, v31
+       eor             v0.16b, v0.16b, v25.16b
+       tbnz            x6, #1, 0f
+
+       ld1             {v1.16b}, [x1], #16
+       next_tweak      v27, v26, v30, v31
+       eor             v1.16b, v1.16b, v26.16b
+       tbnz            x6, #2, 0f
+
+       ld1             {v2.16b}, [x1], #16
+       next_tweak      v28, v27, v30, v31
+       eor             v2.16b, v2.16b, v27.16b
+       tbnz            x6, #3, 0f
+
+       ld1             {v3.16b}, [x1], #16
+       next_tweak      v29, v28, v30, v31
+       eor             v3.16b, v3.16b, v28.16b
+       tbnz            x6, #4, 0f
+
+       ld1             {v4.16b}, [x1], #16
+       str             q29, [sp, #16]
+       eor             v4.16b, v4.16b, v29.16b
+       next_tweak      v29, v29, v30, v31
+       tbnz            x6, #5, 0f
+
+       ld1             {v5.16b}, [x1], #16
+       str             q29, [sp, #32]
+       eor             v5.16b, v5.16b, v29.16b
+       next_tweak      v29, v29, v30, v31
+       tbnz            x6, #6, 0f
+
+       ld1             {v6.16b}, [x1], #16
+       str             q29, [sp, #48]
+       eor             v6.16b, v6.16b, v29.16b
+       next_tweak      v29, v29, v30, v31
+       tbnz            x6, #7, 0f
+
+       ld1             {v7.16b}, [x1], #16
+       str             q29, [sp, #64]
+       eor             v7.16b, v7.16b, v29.16b
+       next_tweak      v29, v29, v30, v31
+
+0:     mov             bskey, x2
+       mov             rounds, x3
+       br              x7
+ENDPROC(__xts_crypt8)
+
+       .macro          __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+       stp             x29, x30, [sp, #-80]!
+       mov             x29, sp
+
+       ldr             q30, .Lxts_mul_x
+       ld1             {v25.16b}, [x5]
+
+99:    adr             x7, \do8
+       bl              __xts_crypt8
+
+       ldp             q16, q17, [sp, #16]
+       ldp             q18, q19, [sp, #48]
+
+       eor             \o0\().16b, \o0\().16b, v25.16b
+       eor             \o1\().16b, \o1\().16b, v26.16b
+       eor             \o2\().16b, \o2\().16b, v27.16b
+       eor             \o3\().16b, \o3\().16b, v28.16b
+
+       st1             {\o0\().16b}, [x0], #16
+       mov             v25.16b, v26.16b
+       tbnz            x6, #1, 1f
+       st1             {\o1\().16b}, [x0], #16
+       mov             v25.16b, v27.16b
+       tbnz            x6, #2, 1f
+       st1             {\o2\().16b}, [x0], #16
+       mov             v25.16b, v28.16b
+       tbnz            x6, #3, 1f
+       st1             {\o3\().16b}, [x0], #16
+       mov             v25.16b, v29.16b
+       tbnz            x6, #4, 1f
+
+       eor             \o4\().16b, \o4\().16b, v16.16b
+       eor             \o5\().16b, \o5\().16b, v17.16b
+       eor             \o6\().16b, \o6\().16b, v18.16b
+       eor             \o7\().16b, \o7\().16b, v19.16b
+
+       st1             {\o4\().16b}, [x0], #16
+       tbnz            x6, #5, 1f
+       st1             {\o5\().16b}, [x0], #16
+       tbnz            x6, #6, 1f
+       st1             {\o6\().16b}, [x0], #16
+       tbnz            x6, #7, 1f
+       st1             {\o7\().16b}, [x0], #16
+
+       cbnz            x4, 99b
+
+1:     st1             {v25.16b}, [x5]
+       ldp             x29, x30, [sp], #80
+       ret
+       .endm
+
+ENTRY(aesbs_xts_encrypt)
+       __xts_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+       __xts_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_xts_decrypt)
+
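+       /*
+        * Generate the next counter block: the 128-bit counter is kept
+        * big-endian in the x7 (high) / x8 (low) register pair and converted
+        * to the in-memory byte order on the way into the vector register.
+        */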
+       .macro          next_ctr, v
+       mov             \v\().d[1], x8
+       adds            x8, x8, #1
+       mov             \v\().d[0], x7
+       adc             x7, x7, xzr
+       rev64           \v\().16b, \v\().16b
+       .endm
+
+       /*
+        * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+        *                   int rounds, int blocks, u8 iv[], bool final)
+        */
+ENTRY(aesbs_ctr_encrypt)
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+
+       add             x4, x4, x6              // do one extra block if final
+
+       ldp             x7, x8, [x5]
+       ld1             {v0.16b}, [x5]
+CPU_LE(        rev             x7, x7          )
+CPU_LE(        rev             x8, x8          )
+       adds            x8, x8, #1
+       adc             x7, x7, xzr
+
+99:    mov             x9, #1
+       lsl             x9, x9, x4
+       subs            w4, w4, #8
+       csel            x4, x4, xzr, pl
+       csel            x9, x9, xzr, le
+
+       next_ctr        v1
+       next_ctr        v2
+       next_ctr        v3
+       next_ctr        v4
+       next_ctr        v5
+       next_ctr        v6
+       next_ctr        v7
+
+0:     mov             bskey, x2
+       mov             rounds, x3
+       bl              aesbs_encrypt8
+
+       lsr             x9, x9, x6              // disregard the extra block
+       tbnz            x9, #0, 0f
+
+       ld1             {v8.16b}, [x1], #16
+       eor             v0.16b, v0.16b, v8.16b
+       st1             {v0.16b}, [x0], #16
+       tbnz            x9, #1, 1f
+
+       ld1             {v9.16b}, [x1], #16
+       eor             v1.16b, v1.16b, v9.16b
+       st1             {v1.16b}, [x0], #16
+       tbnz            x9, #2, 2f
+
+       ld1             {v10.16b}, [x1], #16
+       eor             v4.16b, v4.16b, v10.16b
+       st1             {v4.16b}, [x0], #16
+       tbnz            x9, #3, 3f
+
+       ld1             {v11.16b}, [x1], #16
+       eor             v6.16b, v6.16b, v11.16b
+       st1             {v6.16b}, [x0], #16
+       tbnz            x9, #4, 4f
+
+       ld1             {v12.16b}, [x1], #16
+       eor             v3.16b, v3.16b, v12.16b
+       st1             {v3.16b}, [x0], #16
+       tbnz            x9, #5, 5f
+
+       ld1             {v13.16b}, [x1], #16
+       eor             v7.16b, v7.16b, v13.16b
+       st1             {v7.16b}, [x0], #16
+       tbnz            x9, #6, 6f
+
+       ld1             {v14.16b}, [x1], #16
+       eor             v2.16b, v2.16b, v14.16b
+       st1             {v2.16b}, [x0], #16
+       tbnz            x9, #7, 7f
+
+       ld1             {v15.16b}, [x1], #16
+       eor             v5.16b, v5.16b, v15.16b
+       st1             {v5.16b}, [x0], #16
+
+       next_ctr        v0
+       cbnz            x4, 99b
+
+0:     st1             {v0.16b}, [x5]
+8:     ldp             x29, x30, [sp], #16
+       ret
+
+       /*
+        * If we are handling the tail of the input (x6 == 1), return the
+        * final keystream block to the caller via the IV buffer.
+        */
+1:     cbz             x6, 8b
+       st1             {v1.16b}, [x5]
+       b               8b
+2:     cbz             x6, 8b
+       st1             {v4.16b}, [x5]
+       b               8b
+3:     cbz             x6, 8b
+       st1             {v6.16b}, [x5]
+       b               8b
+4:     cbz             x6, 8b
+       st1             {v3.16b}, [x5]
+       b               8b
+5:     cbz             x6, 8b
+       st1             {v7.16b}, [x5]
+       b               8b
+6:     cbz             x6, 8b
+       st1             {v2.16b}, [x5]
+       b               8b
+7:     cbz             x6, 8b
+       st1             {v5.16b}, [x5]
+       b               8b
+ENDPROC(aesbs_ctr_encrypt)
 
--- /dev/null
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/cbc.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks);
+
+asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks, u8 iv[], bool final);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks, u8 iv[]);
+
+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
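+/*
+ * Bit-sliced key schedule: the round 0 key, up to 13 bit-sliced round keys
+ * of 8 * AES_BLOCK_SIZE bytes each, and the final round key.
+ */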
+struct aesbs_ctx {
+       u8      rk[13 * (8 * AES_BLOCK_SIZE) + 32];
+       int     rounds;
+} __aligned(AES_BLOCK_SIZE);
+
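+/*
+ * CBC encryption is inherently sequential, so it is handled by the plain
+ * arm64 AES routine (__aes_arm64_encrypt) using the unsliced key in 'enc';
+ * only CBC decryption uses the 8-way bit-sliced code.
+ */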
+struct aesbs_cbc_ctx {
+       struct aesbs_ctx        key;
+       u32                     enc[AES_MAX_KEYLENGTH_U32];
+};
+
+struct aesbs_xts_ctx {
+       struct aesbs_ctx        key;
+       u32                     twkey[AES_MAX_KEYLENGTH_U32];
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+                       unsigned int key_len)
+{
+       struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct crypto_aes_ctx rk;
+       int err;
+
+       err = crypto_aes_expand_key(&rk, in_key, key_len);
+       if (err)
+               return err;
+
+       ctx->rounds = 6 + key_len / 4;
+
+       kernel_neon_begin();
+       aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
+       kernel_neon_end();
+
+       return 0;
+}
+
+static int __ecb_crypt(struct skcipher_request *req,
+                      void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks))
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       kernel_neon_begin();
+       while (walk.nbytes >= AES_BLOCK_SIZE) {
+               unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
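+               /* only process full 8-block chunks unless this is the final chunk */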
+               if (walk.nbytes < walk.total)
+                       blocks = round_down(blocks,
+                                           walk.stride / AES_BLOCK_SIZE);
+
+               fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
+                  ctx->rounds, blocks);
+               err = skcipher_walk_done(&walk,
+                                        walk.nbytes - blocks * AES_BLOCK_SIZE);
+       }
+       kernel_neon_end();
+
+       return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+       return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+       return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+                           unsigned int key_len)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct crypto_aes_ctx rk;
+       int err;
+
+       err = crypto_aes_expand_key(&rk, in_key, key_len);
+       if (err)
+               return err;
+
+       ctx->key.rounds = 6 + key_len / 4;
+
+       memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
+
+       kernel_neon_begin();
+       aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
+       kernel_neon_end();
+
+       return 0;
+}
+
+static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+{
+       struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+       __aes_arm64_encrypt(ctx->enc, dst, src, ctx->key.rounds);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+       return crypto_cbc_encrypt_walk(req, cbc_encrypt_one);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       kernel_neon_begin();
+       while (walk.nbytes >= AES_BLOCK_SIZE) {
+               unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+               if (walk.nbytes < walk.total)
+                       blocks = round_down(blocks,
+                                           walk.stride / AES_BLOCK_SIZE);
+
+               aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+                                 ctx->key.rk, ctx->key.rounds, blocks,
+                                 walk.iv);
+               err = skcipher_walk_done(&walk,
+                                        walk.nbytes - blocks * AES_BLOCK_SIZE);
+       }
+       kernel_neon_end();
+
+       return err;
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       kernel_neon_begin();
+       while (walk.nbytes > 0) {
+               unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+               bool final = (walk.total % AES_BLOCK_SIZE) != 0;
+
+               if (walk.nbytes < walk.total) {
+                       blocks = round_down(blocks,
+                                           walk.stride / AES_BLOCK_SIZE);
+                       final = false;
+               }
+
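+               /*
+                * When 'final' is set, the asm code emits one extra
+                * keystream block into walk.iv, which is used below to
+                * encrypt the trailing partial block.
+                */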
+               aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+                                 ctx->rk, ctx->rounds, blocks, walk.iv, final);
+
+               if (final) {
+                       u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+                       u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+                       if (dst != src)
+                               memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+                       crypto_xor(dst, walk.iv, walk.total % AES_BLOCK_SIZE);
+
+                       err = skcipher_walk_done(&walk, 0);
+                       break;
+               }
+               err = skcipher_walk_done(&walk,
+                                        walk.nbytes - blocks * AES_BLOCK_SIZE);
+       }
+       kernel_neon_end();
+
+       return err;
+}
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+                           unsigned int key_len)
+{
+       struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct crypto_aes_ctx rk;
+       int err;
+
+       err = xts_verify_key(tfm, in_key, key_len);
+       if (err)
+               return err;
+
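+       /* use the second half of the key to derive the tweak */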
+       key_len /= 2;
+       err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+       if (err)
+               return err;
+
+       memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
+
+       return aesbs_setkey(tfm, in_key, key_len);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+                      void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+                                 int rounds, int blocks, u8 iv[]))
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
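+       /* generate the initial tweak by encrypting the IV with the tweak key */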
+       __aes_arm64_encrypt(ctx->twkey, walk.iv, walk.iv, ctx->key.rounds);
+
+       kernel_neon_begin();
+       while (walk.nbytes >= AES_BLOCK_SIZE) {
+               unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+               if (walk.nbytes < walk.total)
+                       blocks = round_down(blocks,
+                                           walk.stride / AES_BLOCK_SIZE);
+
+               fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
+                  ctx->key.rounds, blocks, walk.iv);
+               err = skcipher_walk_done(&walk,
+                                        walk.nbytes - blocks * AES_BLOCK_SIZE);
+       }
+       kernel_neon_end();
+
+       return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+       return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+       return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static struct skcipher_alg aes_algs[] = { {
+       .base.cra_name          = "__ecb(aes)",
+       .base.cra_driver_name   = "__ecb-aes-neonbs",
+       .base.cra_priority      = 250,
+       .base.cra_blocksize     = AES_BLOCK_SIZE,
+       .base.cra_ctxsize       = sizeof(struct aesbs_ctx),
+       .base.cra_module        = THIS_MODULE,
+       .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+
+       .min_keysize            = AES_MIN_KEY_SIZE,
+       .max_keysize            = AES_MAX_KEY_SIZE,
+       .walksize               = 8 * AES_BLOCK_SIZE,
+       .setkey                 = aesbs_setkey,
+       .encrypt                = ecb_encrypt,
+       .decrypt                = ecb_decrypt,
+}, {
+       .base.cra_name          = "__cbc(aes)",
+       .base.cra_driver_name   = "__cbc-aes-neonbs",
+       .base.cra_priority      = 250,
+       .base.cra_blocksize     = AES_BLOCK_SIZE,
+       .base.cra_ctxsize       = sizeof(struct aesbs_cbc_ctx),
+       .base.cra_module        = THIS_MODULE,
+       .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+
+       .min_keysize            = AES_MIN_KEY_SIZE,
+       .max_keysize            = AES_MAX_KEY_SIZE,
+       .walksize               = 8 * AES_BLOCK_SIZE,
+       .ivsize                 = AES_BLOCK_SIZE,
+       .setkey                 = aesbs_cbc_setkey,
+       .encrypt                = cbc_encrypt,
+       .decrypt                = cbc_decrypt,
+}, {
+       .base.cra_name          = "__ctr(aes)",
+       .base.cra_driver_name   = "__ctr-aes-neonbs",
+       .base.cra_priority      = 250,
+       .base.cra_blocksize     = 1,
+       .base.cra_ctxsize       = sizeof(struct aesbs_ctx),
+       .base.cra_module        = THIS_MODULE,
+       .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+
+       .min_keysize            = AES_MIN_KEY_SIZE,
+       .max_keysize            = AES_MAX_KEY_SIZE,
+       .chunksize              = AES_BLOCK_SIZE,
+       .walksize               = 8 * AES_BLOCK_SIZE,
+       .ivsize                 = AES_BLOCK_SIZE,
+       .setkey                 = aesbs_setkey,
+       .encrypt                = ctr_encrypt,
+       .decrypt                = ctr_encrypt,
+}, {
+       .base.cra_name          = "ctr(aes)",
+       .base.cra_driver_name   = "ctr-aes-neonbs",
+       .base.cra_priority      = 250 - 1,
+       .base.cra_blocksize     = 1,
+       .base.cra_ctxsize       = sizeof(struct aesbs_ctx),
+       .base.cra_module        = THIS_MODULE,
+
+       .min_keysize            = AES_MIN_KEY_SIZE,
+       .max_keysize            = AES_MAX_KEY_SIZE,
+       .chunksize              = AES_BLOCK_SIZE,
+       .walksize               = 8 * AES_BLOCK_SIZE,
+       .ivsize                 = AES_BLOCK_SIZE,
+       .setkey                 = aesbs_setkey,
+       .encrypt                = ctr_encrypt,
+       .decrypt                = ctr_encrypt,
+}, {
+       .base.cra_name          = "__xts(aes)",
+       .base.cra_driver_name   = "__xts-aes-neonbs",
+       .base.cra_priority      = 250,
+       .base.cra_blocksize     = AES_BLOCK_SIZE,
+       .base.cra_ctxsize       = sizeof(struct aesbs_xts_ctx),
+       .base.cra_module        = THIS_MODULE,
+       .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+
+       .min_keysize            = 2 * AES_MIN_KEY_SIZE,
+       .max_keysize            = 2 * AES_MAX_KEY_SIZE,
+       .walksize               = 8 * AES_BLOCK_SIZE,
+       .ivsize                 = AES_BLOCK_SIZE,
+       .setkey                 = aesbs_xts_setkey,
+       .encrypt                = xts_encrypt,
+       .decrypt                = xts_decrypt,
+} };
+
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+static void aes_exit(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+               if (aes_simd_algs[i])
+                       simd_skcipher_free(aes_simd_algs[i]);
+
+       crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static int __init aes_init(void)
+{
+       struct simd_skcipher_alg *simd;
+       const char *basename;
+       const char *algname;
+       const char *drvname;
+       int err;
+       int i;
+
+       if (!(elf_hwcap & HWCAP_ASIMD))
+               return -ENODEV;
+
+       err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+       if (err)
+               return err;
+
+       for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+               if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+                       continue;
+
+               algname = aes_algs[i].base.cra_name + 2;
+               drvname = aes_algs[i].base.cra_driver_name + 2;
+               basename = aes_algs[i].base.cra_driver_name;
+               simd = simd_skcipher_create_compat(algname, drvname, basename);
+               err = PTR_ERR(simd);
+               if (IS_ERR(simd))
+                       goto unregister_simds;
+
+               aes_simd_algs[i] = simd;
+       }
+       return 0;
+
+unregister_simds:
+       aes_exit();
+       return err;
+}
+
+module_init(aes_init);
+module_exit(aes_exit);