From: Claudio Fontana Date: Fri, 17 Feb 2023 20:11:30 +0000 (-0300) Subject: target/arm: move helpers to tcg/ X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=a3ef070ea9b2d9af95422b38b022f11d8c302d2e;p=qemu.git target/arm: move helpers to tcg/ Signed-off-by: Claudio Fontana Signed-off-by: Fabiano Rosas Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Peter Maydell --- diff --git a/target/arm/crypto_helper.c b/target/arm/crypto_helper.c deleted file mode 100644 index d28690321f..0000000000 --- a/target/arm/crypto_helper.c +++ /dev/null @@ -1,778 +0,0 @@ -/* - * crypto_helper.c - emulate v8 Crypto Extensions instructions - * - * Copyright (C) 2013 - 2018 Linaro Ltd - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - */ - -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "crypto/aes.h" -#include "crypto/sm4.h" -#include "vec_internal.h" - -union CRYPTO_STATE { - uint8_t bytes[16]; - uint32_t words[4]; - uint64_t l[2]; -}; - -#if HOST_BIG_ENDIAN -#define CR_ST_BYTE(state, i) ((state).bytes[(15 - (i)) ^ 8]) -#define CR_ST_WORD(state, i) ((state).words[(3 - (i)) ^ 2]) -#else -#define CR_ST_BYTE(state, i) ((state).bytes[i]) -#define CR_ST_WORD(state, i) ((state).words[i]) -#endif - -/* - * The caller has not been converted to full gvec, and so only - * modifies the low 16 bytes of the vector register. - */ -static void clear_tail_16(void *vd, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int max_sz = simd_maxsz(desc); - - assert(opr_sz == 16); - clear_tail(vd, opr_sz, max_sz); -} - -static void do_crypto_aese(uint64_t *rd, uint64_t *rn, - uint64_t *rm, bool decrypt) -{ - static uint8_t const * const sbox[2] = { AES_sbox, AES_isbox }; - static uint8_t const * const shift[2] = { AES_shifts, AES_ishifts }; - union CRYPTO_STATE rk = { .l = { rm[0], rm[1] } }; - union CRYPTO_STATE st = { .l = { rn[0], rn[1] } }; - int i; - - /* xor state vector with round key */ - rk.l[0] ^= st.l[0]; - rk.l[1] ^= st.l[1]; - - /* combine ShiftRows operation and sbox substitution */ - for (i = 0; i < 16; i++) { - CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])]; - } - - rd[0] = st.l[0]; - rd[1] = st.l[1]; -} - -void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - bool decrypt = simd_data(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_aese(vd + i, vn + i, vm + i, decrypt); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt) -{ - static uint32_t const mc[][256] = { { - /* MixColumns lookup table */ - 0x00000000, 0x03010102, 0x06020204, 0x05030306, - 0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e, - 0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16, - 0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e, - 0x30101020, 0x33111122, 0x36121224, 0x35131326, - 0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e, - 0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36, - 0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e, - 0x60202040, 0x63212142, 0x66222244, 0x65232346, - 0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e, - 0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56, - 0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e, - 
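/*
 * Each 32-bit entry packs the GF(2^8) column products for one input
 * byte: the forward table holds {02.x, x, x, 03.x} (least significant
 * byte first) and the inverse table holds {0e.x, 09.x, 0d.x, 0b.x}, so
 * do_crypto_aesmc() below applies (Inv)MixColumns to a column with four
 * table lookups combined by 8/16/24-bit rotations.
 */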
0x50303060, 0x53313162, 0x56323264, 0x55333366, - 0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e, - 0x48383870, 0x4b393972, 0x4e3a3a74, 0x4d3b3b76, - 0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e, - 0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386, - 0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e, - 0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96, - 0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e, - 0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6, - 0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae, - 0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6, - 0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe, - 0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6, - 0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce, - 0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6, - 0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde, - 0x907070e0, 0x937171e2, 0x967272e4, 0x957373e6, - 0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee, - 0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6, - 0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe, - 0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d, - 0x97848413, 0x94858511, 0x91868617, 0x92878715, - 0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d, - 0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05, - 0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d, - 0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735, - 0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d, - 0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25, - 0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d, - 0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755, - 0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d, - 0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45, - 0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d, - 0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775, - 0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d, - 0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65, - 0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d, - 0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795, - 0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d, - 0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85, - 0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd, - 0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5, - 0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad, - 0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5, - 0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd, - 0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5, - 0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd, - 0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5, - 0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd, - 0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5, - 0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed, - 0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5, - }, { - /* Inverse MixColumns lookup table */ - 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12, - 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a, - 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362, - 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a, - 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2, - 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca, - 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382, - 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba, - 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9, - 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1, - 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9, - 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81, - 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029, - 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411, - 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859, - 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61, - 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf, - 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987, - 
0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf, - 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7, - 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f, - 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967, - 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f, - 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117, - 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664, - 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c, - 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14, - 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c, - 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684, - 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc, - 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4, - 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc, - 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753, - 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b, - 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23, - 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b, - 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3, - 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b, - 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3, - 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb, - 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88, - 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0, - 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8, - 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0, - 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68, - 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850, - 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418, - 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020, - 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe, - 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6, - 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e, - 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6, - 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e, - 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526, - 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e, - 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56, - 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25, - 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d, - 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255, - 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d, - 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5, - 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd, - 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5, - 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d, - } }; - - union CRYPTO_STATE st = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 16; i += 4) { - CR_ST_WORD(st, i >> 2) = - mc[decrypt][CR_ST_BYTE(st, i)] ^ - rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^ - rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^ - rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24); - } - - rd[0] = st.l[0]; - rd[1] = st.l[1]; -} - -void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - bool decrypt = simd_data(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_aesmc(vd + i, vm + i, decrypt); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -/* - * SHA-1 logical functions - */ - -static uint32_t cho(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & (y ^ z)) ^ z; -} - -static uint32_t par(uint32_t x, uint32_t y, uint32_t z) -{ - return x ^ y ^ z; -} - -static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) -{ - return (x & y) | ((x | y) & z); -} - -void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t d0, d1; - - d0 = d[1] ^ d[0] ^ m[0]; - d1 = n[0] ^ d[1] ^ m[1]; - d[0] = d0; - d[1] = d1; - - clear_tail_16(vd, desc); -} - -static inline void crypto_sha1_3reg(uint64_t *rd, 
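/*
 * Four SHA-1 rounds: t = fn(b, c, d) + rol(a, 5) + e + W[i], where rd
 * holds {a, b, c, d}, lane 0 of rn supplies the initial e value and rm
 * supplies the four schedule words.  fn selects Choose, Parity or
 * Majority for SHA1C, SHA1P and SHA1M respectively.
 */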
uint64_t *rn, - uint64_t *rm, uint32_t desc, - uint32_t (*fn)(union CRYPTO_STATE *d)) -{ - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 4; i++) { - uint32_t t = fn(&d); - - t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0) - + CR_ST_WORD(m, i); - - CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3); - CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); - CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2); - CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); - CR_ST_WORD(d, 0) = t; - } - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(rd, desc); -} - -static uint32_t do_sha1c(union CRYPTO_STATE *d) -{ - return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); -} - -void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc) -{ - crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c); -} - -static uint32_t do_sha1p(union CRYPTO_STATE *d) -{ - return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); -} - -void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc) -{ - crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p); -} - -static uint32_t do_sha1m(union CRYPTO_STATE *d) -{ - return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); -} - -void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc) -{ - crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m); -} - -void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rm = vm; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2); - CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0; - - rd[0] = m.l[0]; - rd[1] = m.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1); - CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1); - CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1); - CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -/* - * The SHA-256 logical functions, according to - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - */ - -static uint32_t S0(uint32_t x) -{ - return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); -} - -static uint32_t S1(uint32_t x) -{ - return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); -} - -static uint32_t s0(uint32_t x) -{ - return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); -} - -static uint32_t s1(uint32_t x) -{ - return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); -} - -void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 4; i++) { - uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2)) - + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0)) - + CR_ST_WORD(m, i); - - CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2); - CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1); - CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0); - CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t; - - t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) - + S0(CR_ST_WORD(d, 0)); - - 
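        /*
         * Now rotate the {a, b, c, d} half of the working state down one
         * lane and install the newly computed value; the {e, f, g, h}
         * half held in n was rotated above.  Together this is one round
         * of the SHA-256 compression function split across the two
         * source registers.
         */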
CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); - CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); - CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); - CR_ST_WORD(d, 0) = t; - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - int i; - - for (i = 0; i < 4; i++) { - uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) - + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0)) - + CR_ST_WORD(m, i); - - CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); - CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); - CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); - CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t; - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1)); - CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2)); - CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3)); - CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0)); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - - CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1); - CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2); - CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3); - CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -/* - * The SHA-512 logical functions (same as above but using 64-bit operands) - */ - -static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z) -{ - return (x & (y ^ z)) ^ z; -} - -static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z) -{ - return (x & y) | ((x | y) & z); -} - -static uint64_t S0_512(uint64_t x) -{ - return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); -} - -static uint64_t S1_512(uint64_t x) -{ - return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); -} - -static uint64_t s0_512(uint64_t x) -{ - return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); -} - -static uint64_t s1_512(uint64_t x) -{ - return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); -} - -void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - uint64_t d0 = rd[0]; - uint64_t d1 = rd[1]; - - d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]); - d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]); - - rd[0] = d0; - rd[1] = d1; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - uint64_t d0 = rd[0]; - uint64_t d1 = rd[1]; - - d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]); - d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]); - - rd[0] = d0; - rd[1] = d1; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t d0 = rd[0]; - uint64_t d1 = 
rd[1]; - - d0 += s0_512(rd[1]); - d1 += s0_512(rn[0]); - - rd[0] = d0; - rd[1] = d1; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - - rd[0] += s1_512(rn[0]) + rm[0]; - rd[1] += s1_512(rn[1]) + rm[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t t; - - t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17); - CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9); - - t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17); - CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9); - - t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17); - CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9); - - t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17); - CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc) -{ - uint64_t *rd = vd; - uint64_t *rn = vn; - uint64_t *rm = vm; - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25); - - CR_ST_WORD(d, 0) ^= t; - CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25); - CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25); - CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^ - ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26); - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(vd, desc); -} - -static inline void QEMU_ALWAYS_INLINE -crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm, - uint32_t desc, uint32_t opcode) -{ - union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t imm2 = simd_data(desc); - uint32_t t; - - assert(imm2 < 4); - - if (opcode == 0 || opcode == 2) { - /* SM3TT1A, SM3TT2A */ - t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); - } else if (opcode == 1) { - /* SM3TT1B */ - t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); - } else if (opcode == 3) { - /* SM3TT2B */ - t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); - } else { - qemu_build_not_reached(); - } - - t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2); - - CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1); - - if (opcode < 2) { - /* SM3TT1A, SM3TT1B */ - t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20); - - CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23); - } else { - /* SM3TT2A, SM3TT2B */ - t += CR_ST_WORD(n, 3); - t ^= rol32(t, 9) ^ rol32(t, 17); - - CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13); - } - - CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3); - CR_ST_WORD(d, 3) = t; - - rd[0] = d.l[0]; - rd[1] = d.l[1]; - - clear_tail_16(rd, desc); -} - -#define DO_SM3TT(NAME, OPCODE) \ - void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ - { crypto_sm3tt(vd, vn, vm, desc, OPCODE); } - -DO_SM3TT(crypto_sm3tt1a, 0) -DO_SM3TT(crypto_sm3tt1b, 1) -DO_SM3TT(crypto_sm3tt2a, 2) -DO_SM3TT(crypto_sm3tt2b, 3) - -#undef DO_SM3TT - -static 
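/*
 * One SM4 round per output word: t is the XOR of the other three state
 * words and the corresponding round-key word from rm, pushed through
 * the byte-wise S-box and then the linear transform
 * L(t) = t ^ rol(t, 2) ^ rol(t, 10) ^ rol(t, 18) ^ rol(t, 24).
 * do_crypto_sm4ekey() below uses the key-schedule variant
 * L'(t) = t ^ rol(t, 13) ^ rol(t, 23).
 */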
void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm) -{ - union CRYPTO_STATE d = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE n = { .l = { rm[0], rm[1] } }; - uint32_t t, i; - - for (i = 0; i < 4; i++) { - t = CR_ST_WORD(d, (i + 1) % 4) ^ - CR_ST_WORD(d, (i + 2) % 4) ^ - CR_ST_WORD(d, (i + 3) % 4) ^ - CR_ST_WORD(n, i); - - t = sm4_sbox[t & 0xff] | - sm4_sbox[(t >> 8) & 0xff] << 8 | - sm4_sbox[(t >> 16) & 0xff] << 16 | - sm4_sbox[(t >> 24) & 0xff] << 24; - - CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ - rol32(t, 24); - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; -} - -void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_sm4e(vd + i, vn + i, vm + i); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm) -{ - union CRYPTO_STATE d; - union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; - union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; - uint32_t t, i; - - d = n; - for (i = 0; i < 4; i++) { - t = CR_ST_WORD(d, (i + 1) % 4) ^ - CR_ST_WORD(d, (i + 2) % 4) ^ - CR_ST_WORD(d, (i + 3) % 4) ^ - CR_ST_WORD(m, i); - - t = sm4_sbox[t & 0xff] | - sm4_sbox[(t >> 8) & 0xff] << 8 | - sm4_sbox[(t >> 16) & 0xff] << 16 | - sm4_sbox[(t >> 24) & 0xff] << 24; - - CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23); - } - - rd[0] = d.l[0]; - rd[1] = d.l[1]; -} - -void HELPER(crypto_sm4ekey)(void *vd, void *vn, void* vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - - for (i = 0; i < opr_sz; i += 16) { - do_crypto_sm4ekey(vd + i, vn + i, vm + i); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = n[i] ^ rol64(m[i], 1); - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c deleted file mode 100644 index 0972a4bdd0..0000000000 --- a/target/arm/helper-a64.c +++ /dev/null @@ -1,954 +0,0 @@ -/* - * AArch64 specific helpers - * - * Copyright (c) 2013 Alexander Graf - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . 
- */ - -#include "qemu/osdep.h" -#include "qemu/units.h" -#include "cpu.h" -#include "exec/gdbstub.h" -#include "exec/helper-proto.h" -#include "qemu/host-utils.h" -#include "qemu/log.h" -#include "qemu/main-loop.h" -#include "qemu/bitops.h" -#include "internals.h" -#include "qemu/crc32c.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" -#include "qemu/int128.h" -#include "qemu/atomic128.h" -#include "fpu/softfloat.h" -#include /* For crc32 */ - -/* C2.4.7 Multiply and divide */ -/* special cases for 0 and LLONG_MIN are mandated by the standard */ -uint64_t HELPER(udiv64)(uint64_t num, uint64_t den) -{ - if (den == 0) { - return 0; - } - return num / den; -} - -int64_t HELPER(sdiv64)(int64_t num, int64_t den) -{ - if (den == 0) { - return 0; - } - if (num == LLONG_MIN && den == -1) { - return LLONG_MIN; - } - return num / den; -} - -uint64_t HELPER(rbit64)(uint64_t x) -{ - return revbit64(x); -} - -void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm) -{ - update_spsel(env, imm); -} - -static void daif_check(CPUARMState *env, uint32_t op, - uint32_t imm, uintptr_t ra) -{ - /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set. */ - if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) { - raise_exception_ra(env, EXCP_UDEF, - syn_aa64_sysregtrap(0, extract32(op, 0, 3), - extract32(op, 3, 3), 4, - imm, 0x1f, 0), - exception_target_el(env), ra); - } -} - -void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm) -{ - daif_check(env, 0x1e, imm, GETPC()); - env->daif |= (imm << 6) & PSTATE_DAIF; - arm_rebuild_hflags(env); -} - -void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm) -{ - daif_check(env, 0x1f, imm, GETPC()); - env->daif &= ~((imm << 6) & PSTATE_DAIF); - arm_rebuild_hflags(env); -} - -/* Convert a softfloat float_relation_ (as returned by - * the float*_compare functions) to the correct ARM - * NZCV flag state. 
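 * For example, float_relation_equal maps to Z|C, float_relation_less to
 * N, float_relation_greater to C, and an unordered result to C|V,
 * matching the NZCV values defined for the AArch64 FCMP/FCMPE
 * instructions.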
- */ -static inline uint32_t float_rel_to_flags(int res) -{ - uint64_t flags; - switch (res) { - case float_relation_equal: - flags = PSTATE_Z | PSTATE_C; - break; - case float_relation_less: - flags = PSTATE_N; - break; - case float_relation_greater: - flags = PSTATE_C; - break; - case float_relation_unordered: - default: - flags = PSTATE_C | PSTATE_V; - break; - } - return flags; -} - -uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status) -{ - return float_rel_to_flags(float16_compare_quiet(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status) -{ - return float_rel_to_flags(float16_compare(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status) -{ - return float_rel_to_flags(float32_compare_quiet(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status) -{ - return float_rel_to_flags(float32_compare(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status) -{ - return float_rel_to_flags(float64_compare_quiet(x, y, fp_status)); -} - -uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status) -{ - return float_rel_to_flags(float64_compare(x, y, fp_status)); -} - -float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float32_squash_input_denormal(a, fpst); - b = float32_squash_input_denormal(b, fpst); - - if ((float32_is_zero(a) && float32_is_infinity(b)) || - (float32_is_infinity(a) && float32_is_zero(b))) { - /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ - return make_float32((1U << 30) | - ((float32_val(a) ^ float32_val(b)) & (1U << 31))); - } - return float32_mul(a, b, fpst); -} - -float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float64_squash_input_denormal(a, fpst); - b = float64_squash_input_denormal(b, fpst); - - if ((float64_is_zero(a) && float64_is_infinity(b)) || - (float64_is_infinity(a) && float64_is_zero(b))) { - /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ - return make_float64((1ULL << 62) | - ((float64_val(a) ^ float64_val(b)) & (1ULL << 63))); - } - return float64_mul(a, b, fpst); -} - -/* 64bit/double versions of the neon float compare functions */ -uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float64_eq_quiet(a, b, fpst); -} - -uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float64_le(b, a, fpst); -} - -uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float64_lt(b, a, fpst); -} - -/* Reciprocal step and sqrt step. Note that unlike the A32/T32 - * versions, these do a fully fused multiply-add or - * multiply-add-and-halve. 
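 * FRECPS computes 2.0 - a * b and FRSQRTS computes (3.0 - a * b) / 2.0;
 * the negation is folded into the operand with float*_chs(), and the
 * infinity * zero special case returns 2.0 or 1.5 directly as the
 * architecture requires.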
- */ - -uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float16_squash_input_denormal(a, fpst); - b = float16_squash_input_denormal(b, fpst); - - a = float16_chs(a); - if ((float16_is_infinity(a) && float16_is_zero(b)) || - (float16_is_infinity(b) && float16_is_zero(a))) { - return float16_two; - } - return float16_muladd(a, b, float16_two, 0, fpst); -} - -float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float32_squash_input_denormal(a, fpst); - b = float32_squash_input_denormal(b, fpst); - - a = float32_chs(a); - if ((float32_is_infinity(a) && float32_is_zero(b)) || - (float32_is_infinity(b) && float32_is_zero(a))) { - return float32_two; - } - return float32_muladd(a, b, float32_two, 0, fpst); -} - -float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float64_squash_input_denormal(a, fpst); - b = float64_squash_input_denormal(b, fpst); - - a = float64_chs(a); - if ((float64_is_infinity(a) && float64_is_zero(b)) || - (float64_is_infinity(b) && float64_is_zero(a))) { - return float64_two; - } - return float64_muladd(a, b, float64_two, 0, fpst); -} - -uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float16_squash_input_denormal(a, fpst); - b = float16_squash_input_denormal(b, fpst); - - a = float16_chs(a); - if ((float16_is_infinity(a) && float16_is_zero(b)) || - (float16_is_infinity(b) && float16_is_zero(a))) { - return float16_one_point_five; - } - return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst); -} - -float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float32_squash_input_denormal(a, fpst); - b = float32_squash_input_denormal(b, fpst); - - a = float32_chs(a); - if ((float32_is_infinity(a) && float32_is_zero(b)) || - (float32_is_infinity(b) && float32_is_zero(a))) { - return float32_one_point_five; - } - return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst); -} - -float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float64_squash_input_denormal(a, fpst); - b = float64_squash_input_denormal(b, fpst); - - a = float64_chs(a); - if ((float64_is_infinity(a) && float64_is_zero(b)) || - (float64_is_infinity(b) && float64_is_zero(a))) { - return float64_one_point_five; - } - return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst); -} - -/* Pairwise long add: add pairs of adjacent elements into - * double-width elements in the result (eg _s8 is an 8x8->16 op) - */ -uint64_t HELPER(neon_addlp_s8)(uint64_t a) -{ - uint64_t nsignmask = 0x0080008000800080ULL; - uint64_t wsignmask = 0x8000800080008000ULL; - uint64_t elementmask = 0x00ff00ff00ff00ffULL; - uint64_t tmp1, tmp2; - uint64_t res, signres; - - /* Extract odd elements, sign extend each to a 16 bit field */ - tmp1 = a & elementmask; - tmp1 ^= nsignmask; - tmp1 |= wsignmask; - tmp1 = (tmp1 - nsignmask) ^ wsignmask; - /* Ditto for the even elements */ - tmp2 = (a >> 8) & elementmask; - tmp2 ^= nsignmask; - tmp2 |= wsignmask; - tmp2 = (tmp2 - nsignmask) ^ wsignmask; - - /* calculate the result by summing bits 0..14, 16..22, etc, - * and then adjusting the sign bits 15, 23, etc manually. - * This ensures the addition can't overflow the 16 bit field. 
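 * (The masked addition cannot carry out of bit 15 of a lane, so nothing
 * spills into the neighbouring element; each lane's sign bit is then
 * reconstructed by XORing the two operands' sign bits back in.)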
- */ - signres = (tmp1 ^ tmp2) & wsignmask; - res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask); - res ^= signres; - - return res; -} - -uint64_t HELPER(neon_addlp_u8)(uint64_t a) -{ - uint64_t tmp; - - tmp = a & 0x00ff00ff00ff00ffULL; - tmp += (a >> 8) & 0x00ff00ff00ff00ffULL; - return tmp; -} - -uint64_t HELPER(neon_addlp_s16)(uint64_t a) -{ - int32_t reslo, reshi; - - reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16); - reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48); - - return (uint32_t)reslo | (((uint64_t)reshi) << 32); -} - -uint64_t HELPER(neon_addlp_u16)(uint64_t a) -{ - uint64_t tmp; - - tmp = a & 0x0000ffff0000ffffULL; - tmp += (a >> 16) & 0x0000ffff0000ffffULL; - return tmp; -} - -/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */ -uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp) -{ - float_status *fpst = fpstp; - uint16_t val16, sbit; - int16_t exp; - - if (float16_is_any_nan(a)) { - float16 nan = a; - if (float16_is_signaling_nan(a, fpst)) { - float_raise(float_flag_invalid, fpst); - if (!fpst->default_nan_mode) { - nan = float16_silence_nan(a, fpst); - } - } - if (fpst->default_nan_mode) { - nan = float16_default_nan(fpst); - } - return nan; - } - - a = float16_squash_input_denormal(a, fpst); - - val16 = float16_val(a); - sbit = 0x8000 & val16; - exp = extract32(val16, 10, 5); - - if (exp == 0) { - return make_float16(deposit32(sbit, 10, 5, 0x1e)); - } else { - return make_float16(deposit32(sbit, 10, 5, ~exp)); - } -} - -float32 HELPER(frecpx_f32)(float32 a, void *fpstp) -{ - float_status *fpst = fpstp; - uint32_t val32, sbit; - int32_t exp; - - if (float32_is_any_nan(a)) { - float32 nan = a; - if (float32_is_signaling_nan(a, fpst)) { - float_raise(float_flag_invalid, fpst); - if (!fpst->default_nan_mode) { - nan = float32_silence_nan(a, fpst); - } - } - if (fpst->default_nan_mode) { - nan = float32_default_nan(fpst); - } - return nan; - } - - a = float32_squash_input_denormal(a, fpst); - - val32 = float32_val(a); - sbit = 0x80000000ULL & val32; - exp = extract32(val32, 23, 8); - - if (exp == 0) { - return make_float32(sbit | (0xfe << 23)); - } else { - return make_float32(sbit | (~exp & 0xff) << 23); - } -} - -float64 HELPER(frecpx_f64)(float64 a, void *fpstp) -{ - float_status *fpst = fpstp; - uint64_t val64, sbit; - int64_t exp; - - if (float64_is_any_nan(a)) { - float64 nan = a; - if (float64_is_signaling_nan(a, fpst)) { - float_raise(float_flag_invalid, fpst); - if (!fpst->default_nan_mode) { - nan = float64_silence_nan(a, fpst); - } - } - if (fpst->default_nan_mode) { - nan = float64_default_nan(fpst); - } - return nan; - } - - a = float64_squash_input_denormal(a, fpst); - - val64 = float64_val(a); - sbit = 0x8000000000000000ULL & val64; - exp = extract64(float64_val(a), 52, 11); - - if (exp == 0) { - return make_float64(sbit | (0x7feULL << 52)); - } else { - return make_float64(sbit | (~exp & 0x7ffULL) << 52); - } -} - -float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env) -{ - /* Von Neumann rounding is implemented by using round-to-zero - * and then setting the LSB of the result if Inexact was raised. 
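 * (This is "round to odd": forcing the LSB on for inexact results
 * avoids double-rounding error when the float32 value produced by
 * FCVTXN is subsequently rounded again, e.g. down to float16.)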
- */ - float32 r; - float_status *fpst = &env->vfp.fp_status; - float_status tstat = *fpst; - int exflags; - - set_float_rounding_mode(float_round_to_zero, &tstat); - set_float_exception_flags(0, &tstat); - r = float64_to_float32(a, &tstat); - exflags = get_float_exception_flags(&tstat); - if (exflags & float_flag_inexact) { - r = make_float32(float32_val(r) | 1); - } - exflags |= get_float_exception_flags(fpst); - set_float_exception_flags(exflags, fpst); - return r; -} - -/* 64-bit versions of the CRC helpers. Note that although the operation - * (and the prototypes of crc32c() and crc32() mean that only the bottom - * 32 bits of the accumulator and result are used, we pass and return - * uint64_t for convenience of the generated code. Unlike the 32-bit - * instruction set versions, val may genuinely have 64 bits of data in it. - * The upper bytes of val (above the number specified by 'bytes') must have - * been zeroed out by the caller. - */ -uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes) -{ - uint8_t buf[8]; - - stq_le_p(buf, val); - - /* zlib crc32 converts the accumulator and output to one's complement. */ - return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff; -} - -uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes) -{ - uint8_t buf[8]; - - stq_le_p(buf, val); - - /* Linux crc32c converts the output to one's complement. */ - return crc32c(acc, buf, bytes) ^ 0xffffffff; -} - -/* - * AdvSIMD half-precision - */ - -#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix)) - -#define ADVSIMD_HALFOP(name) \ -uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \ -{ \ - float_status *fpst = fpstp; \ - return float16_ ## name(a, b, fpst); \ -} - -ADVSIMD_HALFOP(add) -ADVSIMD_HALFOP(sub) -ADVSIMD_HALFOP(mul) -ADVSIMD_HALFOP(div) -ADVSIMD_HALFOP(min) -ADVSIMD_HALFOP(max) -ADVSIMD_HALFOP(minnum) -ADVSIMD_HALFOP(maxnum) - -#define ADVSIMD_TWOHALFOP(name) \ -uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \ -{ \ - float16 a1, a2, b1, b2; \ - uint32_t r1, r2; \ - float_status *fpst = fpstp; \ - a1 = extract32(two_a, 0, 16); \ - a2 = extract32(two_a, 16, 16); \ - b1 = extract32(two_b, 0, 16); \ - b2 = extract32(two_b, 16, 16); \ - r1 = float16_ ## name(a1, b1, fpst); \ - r2 = float16_ ## name(a2, b2, fpst); \ - return deposit32(r1, 16, 16, r2); \ -} - -ADVSIMD_TWOHALFOP(add) -ADVSIMD_TWOHALFOP(sub) -ADVSIMD_TWOHALFOP(mul) -ADVSIMD_TWOHALFOP(div) -ADVSIMD_TWOHALFOP(min) -ADVSIMD_TWOHALFOP(max) -ADVSIMD_TWOHALFOP(minnum) -ADVSIMD_TWOHALFOP(maxnum) - -/* Data processing - scalar floating-point and advanced SIMD */ -static float16 float16_mulx(float16 a, float16 b, void *fpstp) -{ - float_status *fpst = fpstp; - - a = float16_squash_input_denormal(a, fpst); - b = float16_squash_input_denormal(b, fpst); - - if ((float16_is_zero(a) && float16_is_infinity(b)) || - (float16_is_infinity(a) && float16_is_zero(b))) { - /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ - return make_float16((1U << 14) | - ((float16_val(a) ^ float16_val(b)) & (1U << 15))); - } - return float16_mul(a, b, fpst); -} - -ADVSIMD_HALFOP(mulx) -ADVSIMD_TWOHALFOP(mulx) - -/* fused multiply-accumulate */ -uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c, - void *fpstp) -{ - float_status *fpst = fpstp; - return float16_muladd(a, b, c, 0, fpst); -} - -uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b, - uint32_t two_c, void *fpstp) -{ - float_status *fpst = fpstp; - float16 a1, a2, 
b1, b2, c1, c2; - uint32_t r1, r2; - a1 = extract32(two_a, 0, 16); - a2 = extract32(two_a, 16, 16); - b1 = extract32(two_b, 0, 16); - b2 = extract32(two_b, 16, 16); - c1 = extract32(two_c, 0, 16); - c2 = extract32(two_c, 16, 16); - r1 = float16_muladd(a1, b1, c1, 0, fpst); - r2 = float16_muladd(a2, b2, c2, 0, fpst); - return deposit32(r1, 16, 16, r2); -} - -/* - * Floating point comparisons produce an integer result. Softfloat - * routines return float_relation types which we convert to the 0/-1 - * Neon requires. - */ - -#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0 - -uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - int compare = float16_compare_quiet(a, b, fpst); - return ADVSIMD_CMPRES(compare == float_relation_equal); -} - -uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - int compare = float16_compare(a, b, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater || - compare == float_relation_equal); -} - -uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - int compare = float16_compare(a, b, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater); -} - -uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float16 f0 = float16_abs(a); - float16 f1 = float16_abs(b); - int compare = float16_compare(f0, f1, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater || - compare == float_relation_equal); -} - -uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float16 f0 = float16_abs(a); - float16 f1 = float16_abs(b); - int compare = float16_compare(f0, f1, fpst); - return ADVSIMD_CMPRES(compare == float_relation_greater); -} - -/* round to integral */ -uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status) -{ - return float16_round_to_int(x, fp_status); -} - -uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status) -{ - int old_flags = get_float_exception_flags(fp_status), new_flags; - float16 ret; - - ret = float16_round_to_int(x, fp_status); - - /* Suppress any inexact exceptions the conversion produced */ - if (!(old_flags & float_flag_inexact)) { - new_flags = get_float_exception_flags(fp_status); - set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status); - } - - return ret; -} - -/* - * Half-precision floating point conversion functions - * - * There are a multitude of conversion functions with various - * different rounding modes. This is dealt with by the calling code - * setting the mode appropriately before calling the helper. 
- */ - -uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp) -{ - float_status *fpst = fpstp; - - /* Invalid if we are passed a NaN */ - if (float16_is_any_nan(a)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_int16(a, fpst); -} - -uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp) -{ - float_status *fpst = fpstp; - - /* Invalid if we are passed a NaN */ - if (float16_is_any_nan(a)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_uint16(a, fpst); -} - -static int el_from_spsr(uint32_t spsr) -{ - /* Return the exception level that this SPSR is requesting a return to, - * or -1 if it is invalid (an illegal return) - */ - if (spsr & PSTATE_nRW) { - switch (spsr & CPSR_M) { - case ARM_CPU_MODE_USR: - return 0; - case ARM_CPU_MODE_HYP: - return 2; - case ARM_CPU_MODE_FIQ: - case ARM_CPU_MODE_IRQ: - case ARM_CPU_MODE_SVC: - case ARM_CPU_MODE_ABT: - case ARM_CPU_MODE_UND: - case ARM_CPU_MODE_SYS: - return 1; - case ARM_CPU_MODE_MON: - /* Returning to Mon from AArch64 is never possible, - * so this is an illegal return. - */ - default: - return -1; - } - } else { - if (extract32(spsr, 1, 1)) { - /* Return with reserved M[1] bit set */ - return -1; - } - if (extract32(spsr, 0, 4) == 1) { - /* return to EL0 with M[0] bit set */ - return -1; - } - return extract32(spsr, 2, 2); - } -} - -static void cpsr_write_from_spsr_elx(CPUARMState *env, - uint32_t val) -{ - uint32_t mask; - - /* Save SPSR_ELx.SS into PSTATE. */ - env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); - val &= ~PSTATE_SS; - - /* Move DIT to the correct location for CPSR */ - if (val & PSTATE_DIT) { - val &= ~PSTATE_DIT; - val |= CPSR_DIT; - } - - mask = aarch32_cpsr_valid_mask(env->features, \ - &env_archcpu(env)->isar); - cpsr_write(env, val, mask, CPSRWriteRaw); -} - -void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) -{ - int cur_el = arm_current_el(env); - unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el); - uint32_t spsr = env->banked_spsr[spsr_idx]; - int new_el; - bool return_to_aa64 = (spsr & PSTATE_nRW) == 0; - - aarch64_save_sp(env, cur_el); - - arm_clear_exclusive(env); - - /* We must squash the PSTATE.SS bit to zero unless both of the - * following hold: - * 1. debug exceptions are currently disabled - * 2. singlestep will be active in the EL we return to - * We check 1 here and 2 after we've done the pstate/cpsr write() to - * transition to the EL we're going to. - */ - if (arm_generate_debug_exceptions(env)) { - spsr &= ~PSTATE_SS; - } - - new_el = el_from_spsr(spsr); - if (new_el == -1) { - goto illegal_return; - } - if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) { - /* Disallow return to an EL which is unimplemented or higher - * than the current one. - */ - goto illegal_return; - } - - if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) { - /* Return to an EL which is configured for a different register width */ - goto illegal_return; - } - - if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { - goto illegal_return; - } - - qemu_mutex_lock_iothread(); - arm_call_pre_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); - - if (!return_to_aa64) { - env->aarch64 = false; - /* We do a raw CPSR write because aarch64_sync_64_to_32() - * will sort the register banks out for us, and we've already - * caught all the bad-mode cases in el_from_spsr(). 
- */ - cpsr_write_from_spsr_elx(env, spsr); - if (!arm_singlestep_active(env)) { - env->pstate &= ~PSTATE_SS; - } - aarch64_sync_64_to_32(env); - - if (spsr & CPSR_T) { - env->regs[15] = new_pc & ~0x1; - } else { - env->regs[15] = new_pc & ~0x3; - } - helper_rebuild_hflags_a32(env, new_el); - qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " - "AArch32 EL%d PC 0x%" PRIx32 "\n", - cur_el, new_el, env->regs[15]); - } else { - int tbii; - - env->aarch64 = true; - spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar); - pstate_write(env, spsr); - if (!arm_singlestep_active(env)) { - env->pstate &= ~PSTATE_SS; - } - aarch64_restore_sp(env, new_el); - helper_rebuild_hflags_a64(env, new_el); - - /* - * Apply TBI to the exception return address. We had to delay this - * until after we selected the new EL, so that we could select the - * correct TBI+TBID bits. This is made easier by waiting until after - * the hflags rebuild, since we can pull the composite TBII field - * from there. - */ - tbii = EX_TBFLAG_A64(env->hflags, TBII); - if ((tbii >> extract64(new_pc, 55, 1)) & 1) { - /* TBI is enabled. */ - int core_mmu_idx = cpu_mmu_index(env, false); - if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) { - new_pc = sextract64(new_pc, 0, 56); - } else { - new_pc = extract64(new_pc, 0, 56); - } - } - env->pc = new_pc; - - qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " - "AArch64 EL%d PC 0x%" PRIx64 "\n", - cur_el, new_el, env->pc); - } - - /* - * Note that cur_el can never be 0. If new_el is 0, then - * el0_a64 is return_to_aa64, else el0_a64 is ignored. - */ - aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64); - - qemu_mutex_lock_iothread(); - arm_call_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); - - return; - -illegal_return: - /* Illegal return events of various kinds have architecturally - * mandated behaviour: - * restore NZCV and DAIF from SPSR_ELx - * set PSTATE.IL - * restore PC from ELR_ELx - * no change to exception level, execution state or stack pointer - */ - env->pstate |= PSTATE_IL; - env->pc = new_pc; - spsr &= PSTATE_NZCV | PSTATE_DAIF; - spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF); - pstate_write(env, spsr); - if (!arm_singlestep_active(env)) { - env->pstate &= ~PSTATE_SS; - } - helper_rebuild_hflags_a64(env, cur_el); - qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: " - "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc); -} - -/* - * Square Root and Reciprocal square root - */ - -uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp) -{ - float_status *s = fpstp; - - return float16_sqrt(a, s); -} - -void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) -{ - /* - * Implement DC ZVA, which zeroes a fixed-length block of memory. - * Note that we do not implement the (architecturally mandated) - * alignment fault for attempts to use this on Device memory - * (which matches the usual QEMU behaviour of not implementing either - * alignment faults or any memory attribute handling). - */ - int blocklen = 4 << env_archcpu(env)->dcz_blocksize; - uint64_t vaddr = vaddr_in & ~(blocklen - 1); - int mmu_idx = cpu_mmu_index(env, false); - void *mem; - - /* - * Trapless lookup. In addition to actual invalid page, may - * return NULL for I/O, watchpoints, clean pages, etc. - */ - mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx); - -#ifndef CONFIG_USER_ONLY - if (unlikely(!mem)) { - uintptr_t ra = GETPC(); - - /* - * Trap if accessing an invalid page. 
DC_ZVA requires that we supply - * the original pointer for an invalid page. But watchpoints require - * that we probe the actual space. So do both. - */ - (void) probe_write(env, vaddr_in, 1, mmu_idx, ra); - mem = probe_write(env, vaddr, blocklen, mmu_idx, ra); - - if (unlikely(!mem)) { - /* - * The only remaining reason for mem == NULL is I/O. - * Just do a series of byte writes as the architecture demands. - */ - for (int i = 0; i < blocklen; i++) { - cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra); - } - return; - } - } -#endif - - memset(mem, 0, blocklen); -} diff --git a/target/arm/iwmmxt_helper.c b/target/arm/iwmmxt_helper.c deleted file mode 100644 index 610b1b2103..0000000000 --- a/target/arm/iwmmxt_helper.c +++ /dev/null @@ -1,670 +0,0 @@ -/* - * iwMMXt micro operations for XScale. - * - * Copyright (c) 2007 OpenedHand, Ltd. - * Written by Andrzej Zaborowski - * Copyright (c) 2008 CodeSourcery - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/helper-proto.h" - -/* iwMMXt macros extracted from GNU gdb. */ - -/* Set the SIMD wCASF flags for 8, 16, 32 or 64-bit operations. */ -#define SIMD8_SET(v, n, b) ((v != 0) << ((((b) + 1) * 4) + (n))) -#define SIMD16_SET(v, n, h) ((v != 0) << ((((h) + 1) * 8) + (n))) -#define SIMD32_SET(v, n, w) ((v != 0) << ((((w) + 1) * 16) + (n))) -#define SIMD64_SET(v, n) ((v != 0) << (32 + (n))) -/* Flags to pass as "n" above. */ -#define SIMD_NBIT -1 -#define SIMD_ZBIT -2 -#define SIMD_CBIT -3 -#define SIMD_VBIT -4 -/* Various status bit macros. */ -#define NBIT8(x) ((x) & 0x80) -#define NBIT16(x) ((x) & 0x8000) -#define NBIT32(x) ((x) & 0x80000000) -#define NBIT64(x) ((x) & 0x8000000000000000ULL) -#define ZBIT8(x) (((x) & 0xff) == 0) -#define ZBIT16(x) (((x) & 0xffff) == 0) -#define ZBIT32(x) (((x) & 0xffffffff) == 0) -#define ZBIT64(x) (x == 0) -/* Sign extension macros. */ -#define EXTEND8H(a) ((uint16_t) (int8_t) (a)) -#define EXTEND8(a) ((uint32_t) (int8_t) (a)) -#define EXTEND16(a) ((uint32_t) (int16_t) (a)) -#define EXTEND16S(a) ((int32_t) (int16_t) (a)) -#define EXTEND32(a) ((uint64_t) (int32_t) (a)) - -uint64_t HELPER(iwmmxt_maddsq)(uint64_t a, uint64_t b) -{ - a = (( - EXTEND16S((a >> 0) & 0xffff) * EXTEND16S((b >> 0) & 0xffff) + - EXTEND16S((a >> 16) & 0xffff) * EXTEND16S((b >> 16) & 0xffff) - ) & 0xffffffff) | ((uint64_t) ( - EXTEND16S((a >> 32) & 0xffff) * EXTEND16S((b >> 32) & 0xffff) + - EXTEND16S((a >> 48) & 0xffff) * EXTEND16S((b >> 48) & 0xffff) - ) << 32); - return a; -} - -uint64_t HELPER(iwmmxt_madduq)(uint64_t a, uint64_t b) -{ - a = (( - ((a >> 0) & 0xffff) * ((b >> 0) & 0xffff) + - ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff) - ) & 0xffffffff) | (( - ((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) + - ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff) - ) << 32); - return a; -} - -uint64_t HELPER(iwmmxt_sadb)(uint64_t a, uint64_t b) -{ -#define abs(x) (((x) >= 0) ? 
x : -x) -#define SADB(SHR) abs((int) ((a >> SHR) & 0xff) - (int) ((b >> SHR) & 0xff)) - return - SADB(0) + SADB(8) + SADB(16) + SADB(24) + - SADB(32) + SADB(40) + SADB(48) + SADB(56); -#undef SADB -} - -uint64_t HELPER(iwmmxt_sadw)(uint64_t a, uint64_t b) -{ -#define SADW(SHR) \ - abs((int) ((a >> SHR) & 0xffff) - (int) ((b >> SHR) & 0xffff)) - return SADW(0) + SADW(16) + SADW(32) + SADW(48); -#undef SADW -} - -uint64_t HELPER(iwmmxt_mulslw)(uint64_t a, uint64_t b) -{ -#define MULS(SHR) ((uint64_t) ((( \ - EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ - ) >> 0) & 0xffff) << SHR) - return MULS(0) | MULS(16) | MULS(32) | MULS(48); -#undef MULS -} - -uint64_t HELPER(iwmmxt_mulshw)(uint64_t a, uint64_t b) -{ -#define MULS(SHR) ((uint64_t) ((( \ - EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ - ) >> 16) & 0xffff) << SHR) - return MULS(0) | MULS(16) | MULS(32) | MULS(48); -#undef MULS -} - -uint64_t HELPER(iwmmxt_mululw)(uint64_t a, uint64_t b) -{ -#define MULU(SHR) ((uint64_t) ((( \ - ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ - ) >> 0) & 0xffff) << SHR) - return MULU(0) | MULU(16) | MULU(32) | MULU(48); -#undef MULU -} - -uint64_t HELPER(iwmmxt_muluhw)(uint64_t a, uint64_t b) -{ -#define MULU(SHR) ((uint64_t) ((( \ - ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ - ) >> 16) & 0xffff) << SHR) - return MULU(0) | MULU(16) | MULU(32) | MULU(48); -#undef MULU -} - -uint64_t HELPER(iwmmxt_macsw)(uint64_t a, uint64_t b) -{ -#define MACS(SHR) ( \ - EXTEND16((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff)) - return (int64_t) (MACS(0) + MACS(16) + MACS(32) + MACS(48)); -#undef MACS -} - -uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b) -{ -#define MACU(SHR) ( \ - (uint32_t) ((a >> SHR) & 0xffff) * \ - (uint32_t) ((b >> SHR) & 0xffff)) - return MACU(0) + MACU(16) + MACU(32) + MACU(48); -#undef MACU -} - -#define NZBIT8(x, i) \ - SIMD8_SET(NBIT8((x) & 0xff), SIMD_NBIT, i) | \ - SIMD8_SET(ZBIT8((x) & 0xff), SIMD_ZBIT, i) -#define NZBIT16(x, i) \ - SIMD16_SET(NBIT16((x) & 0xffff), SIMD_NBIT, i) | \ - SIMD16_SET(ZBIT16((x) & 0xffff), SIMD_ZBIT, i) -#define NZBIT32(x, i) \ - SIMD32_SET(NBIT32((x) & 0xffffffff), SIMD_NBIT, i) | \ - SIMD32_SET(ZBIT32((x) & 0xffffffff), SIMD_ZBIT, i) -#define NZBIT64(x) \ - SIMD64_SET(NBIT64(x), SIMD_NBIT) | \ - SIMD64_SET(ZBIT64(x), SIMD_ZBIT) -#define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3) \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) | \ - (((a >> SH1) & 0xff) << 16) | (((b >> SH1) & 0xff) << 24) | \ - (((a >> SH2) & 0xff) << 32) | (((b >> SH2) & 0xff) << 40) | \ - (((a >> SH3) & 0xff) << 48) | (((b >> SH3) & 0xff) << 56); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - (((a >> SH0) & 0xffff) << 0) | \ - (((b >> SH0) & 0xffff) << 16) | \ - (((a >> SH2) & 0xffff) << 32) | \ - (((b >> SH2) & 0xffff) << 48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT8(a >> 0, 0) | NZBIT8(a >> 16, 1) | \ - NZBIT8(a >> 32, 2) | NZBIT8(a >> 48, 3); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - (((a >> SH0) & 0xffffffff) 
<< 0) | \ - (((b >> SH0) & 0xffffffff) << 32); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - (((x >> SH0) & 0xff) << 0) | \ - (((x >> SH1) & 0xff) << 16) | \ - (((x >> SH2) & 0xff) << 32) | \ - (((x >> SH3) & 0xff) << 48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - (((x >> SH0) & 0xffff) << 0) | \ - (((x >> SH2) & 0xffff) << 32); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = (((x >> SH0) & 0xffffffff) << 0); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) | \ - ((uint64_t) EXTEND8H((x >> SH1) & 0xff) << 16) | \ - ((uint64_t) EXTEND8H((x >> SH2) & 0xff) << 32) | \ - ((uint64_t) EXTEND8H((x >> SH3) & 0xff) << 48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = \ - ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) | \ - ((uint64_t) EXTEND16((x >> SH2) & 0xffff) << 32); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ - return x; \ -} \ -uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUARMState *env, \ - uint64_t x) \ -{ \ - x = EXTEND32((x >> SH0) & 0xffffffff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ - return x; \ -} -IWMMXT_OP_UNPACK(l, 0, 8, 16, 24) -IWMMXT_OP_UNPACK(h, 32, 40, 48, 56) - -#define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O) \ -uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = \ - CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) | \ - CMP(16, Tb, O, 0xff) | CMP(24, Tb, O, 0xff) | \ - CMP(32, Tb, O, 0xff) | CMP(40, Tb, O, 0xff) | \ - CMP(48, Tb, O, 0xff) | CMP(56, Tb, O, 0xff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) | \ - CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \ - NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \ - return a; \ -} \ -uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUARMState *env, \ - uint64_t a, uint64_t b) \ -{ \ - a = CMP(0, Tl, O, 0xffffffff) | \ - CMP(32, Tl, O, 0xffffffff); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ - return a; \ -} -#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ - (TYPE) ((b >> SHR) & MASK)) ? 
(uint64_t) MASK : 0) << SHR) -IWMMXT_OP_CMP(cmpeq, uint8_t, uint16_t, uint32_t, ==) -IWMMXT_OP_CMP(cmpgts, int8_t, int16_t, int32_t, >) -IWMMXT_OP_CMP(cmpgtu, uint8_t, uint16_t, uint32_t, >) -#undef CMP -#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ - (TYPE) ((b >> SHR) & MASK)) ? a : b) & ((uint64_t) MASK << SHR)) -IWMMXT_OP_CMP(mins, int8_t, int16_t, int32_t, <) -IWMMXT_OP_CMP(minu, uint8_t, uint16_t, uint32_t, <) -IWMMXT_OP_CMP(maxs, int8_t, int16_t, int32_t, >) -IWMMXT_OP_CMP(maxu, uint8_t, uint16_t, uint32_t, >) -#undef CMP -#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ - OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) -IWMMXT_OP_CMP(subn, uint8_t, uint16_t, uint32_t, -) -IWMMXT_OP_CMP(addn, uint8_t, uint16_t, uint32_t, +) -#undef CMP -/* TODO Signed- and Unsigned-Saturation */ -#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ - OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) -IWMMXT_OP_CMP(subu, uint8_t, uint16_t, uint32_t, -) -IWMMXT_OP_CMP(addu, uint8_t, uint16_t, uint32_t, +) -IWMMXT_OP_CMP(subs, int8_t, int16_t, int32_t, -) -IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +) -#undef CMP -#undef IWMMXT_OP_CMP - -#define AVGB(SHR) ((( \ - ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR) -#define IWMMXT_OP_AVGB(r) \ -uint64_t HELPER(iwmmxt_avgb##r)(CPUARMState *env, uint64_t a, uint64_t b) \ -{ \ - const int round = r; \ - a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) | \ - AVGB(32) | AVGB(40) | AVGB(48) | AVGB(56); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - SIMD8_SET(ZBIT8((a >> 0) & 0xff), SIMD_ZBIT, 0) | \ - SIMD8_SET(ZBIT8((a >> 8) & 0xff), SIMD_ZBIT, 1) | \ - SIMD8_SET(ZBIT8((a >> 16) & 0xff), SIMD_ZBIT, 2) | \ - SIMD8_SET(ZBIT8((a >> 24) & 0xff), SIMD_ZBIT, 3) | \ - SIMD8_SET(ZBIT8((a >> 32) & 0xff), SIMD_ZBIT, 4) | \ - SIMD8_SET(ZBIT8((a >> 40) & 0xff), SIMD_ZBIT, 5) | \ - SIMD8_SET(ZBIT8((a >> 48) & 0xff), SIMD_ZBIT, 6) | \ - SIMD8_SET(ZBIT8((a >> 56) & 0xff), SIMD_ZBIT, 7); \ - return a; \ -} -IWMMXT_OP_AVGB(0) -IWMMXT_OP_AVGB(1) -#undef IWMMXT_OP_AVGB -#undef AVGB - -#define AVGW(SHR) ((( \ - ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR) -#define IWMMXT_OP_AVGW(r) \ -uint64_t HELPER(iwmmxt_avgw##r)(CPUARMState *env, uint64_t a, uint64_t b) \ -{ \ - const int round = r; \ - a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48); \ - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ - SIMD16_SET(ZBIT16((a >> 0) & 0xffff), SIMD_ZBIT, 0) | \ - SIMD16_SET(ZBIT16((a >> 16) & 0xffff), SIMD_ZBIT, 1) | \ - SIMD16_SET(ZBIT16((a >> 32) & 0xffff), SIMD_ZBIT, 2) | \ - SIMD16_SET(ZBIT16((a >> 48) & 0xffff), SIMD_ZBIT, 3); \ - return a; \ -} -IWMMXT_OP_AVGW(0) -IWMMXT_OP_AVGW(1) -#undef IWMMXT_OP_AVGW -#undef AVGW - -uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n) -{ - a >>= n << 3; - a |= b << (64 - (n << 3)); - return a; -} - -uint64_t HELPER(iwmmxt_insr)(uint64_t x, uint32_t a, uint32_t b, uint32_t n) -{ - x &= ~((uint64_t) b << n); - x |= (uint64_t) (a & b) << n; - return x; -} - -uint32_t HELPER(iwmmxt_setpsr_nz)(uint64_t x) -{ - return SIMD64_SET((x == 0), SIMD_ZBIT) | - SIMD64_SET((x & (1ULL << 63)), SIMD_NBIT); -} - -uint64_t HELPER(iwmmxt_bcstb)(uint32_t arg) -{ - arg &= 0xff; - return - ((uint64_t) arg << 0 ) | ((uint64_t) arg << 8 ) | - ((uint64_t) arg << 16) | ((uint64_t) arg << 24) | - ((uint64_t) arg << 32) | ((uint64_t) arg << 40) | - ((uint64_t) arg << 48) | ((uint64_t) arg << 56); -} - -uint64_t HELPER(iwmmxt_bcstw)(uint32_t arg) -{ - arg &= 
0xffff; - return - ((uint64_t) arg << 0 ) | ((uint64_t) arg << 16) | - ((uint64_t) arg << 32) | ((uint64_t) arg << 48); -} - -uint64_t HELPER(iwmmxt_bcstl)(uint32_t arg) -{ - return arg | ((uint64_t) arg << 32); -} - -uint64_t HELPER(iwmmxt_addcb)(uint64_t x) -{ - return - ((x >> 0) & 0xff) + ((x >> 8) & 0xff) + - ((x >> 16) & 0xff) + ((x >> 24) & 0xff) + - ((x >> 32) & 0xff) + ((x >> 40) & 0xff) + - ((x >> 48) & 0xff) + ((x >> 56) & 0xff); -} - -uint64_t HELPER(iwmmxt_addcw)(uint64_t x) -{ - return - ((x >> 0) & 0xffff) + ((x >> 16) & 0xffff) + - ((x >> 32) & 0xffff) + ((x >> 48) & 0xffff); -} - -uint64_t HELPER(iwmmxt_addcl)(uint64_t x) -{ - return (x & 0xffffffff) + (x >> 32); -} - -uint32_t HELPER(iwmmxt_msbb)(uint64_t x) -{ - return - ((x >> 7) & 0x01) | ((x >> 14) & 0x02) | - ((x >> 21) & 0x04) | ((x >> 28) & 0x08) | - ((x >> 35) & 0x10) | ((x >> 42) & 0x20) | - ((x >> 49) & 0x40) | ((x >> 56) & 0x80); -} - -uint32_t HELPER(iwmmxt_msbw)(uint64_t x) -{ - return - ((x >> 15) & 0x01) | ((x >> 30) & 0x02) | - ((x >> 45) & 0x04) | ((x >> 52) & 0x08); -} - -uint32_t HELPER(iwmmxt_msbl)(uint64_t x) -{ - return ((x >> 31) & 0x01) | ((x >> 62) & 0x02); -} - -/* FIXME: Split wCASF setting into a separate op to avoid env use. */ -uint64_t HELPER(iwmmxt_srlw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) | - (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) | - (((x & (0xffffll << 32)) >> n) & (0xffffll << 32)) | - (((x & (0xffffll << 48)) >> n) & (0xffffll << 48)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_srll)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((x & (0xffffffffll << 0)) >> n) | - ((x >> n) & (0xffffffffll << 32)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_srlq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x >>= n; - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_sllw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) | - (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) | - (((x & (0xffffll << 32)) << n) & (0xffffll << 32)) | - (((x & (0xffffll << 48)) << n) & (0xffffll << 48)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_slll)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((x << n) & (0xffffffffll << 0)) | - ((x & (0xffffffffll << 32)) << n); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_sllq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x <<= n; - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_sraw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) | - ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) | - ((uint64_t) ((EXTEND16(x >> 32) >> n) & 0xffff) << 32) | - ((uint64_t) ((EXTEND16(x >> 48) >> n) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_sral)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) | - (((EXTEND32(x >> 
32) >> n) & 0xffffffff) << 32); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_sraq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (int64_t) x >> n; - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_rorw)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((((x & (0xffffll << 0)) >> n) | - ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) | - ((((x & (0xffffll << 16)) >> n) | - ((x & (0xffffll << 16)) << (16 - n))) & (0xffffll << 16)) | - ((((x & (0xffffll << 32)) >> n) | - ((x & (0xffffll << 32)) << (16 - n))) & (0xffffll << 32)) | - ((((x & (0xffffll << 48)) >> n) | - ((x & (0xffffll << 48)) << (16 - n))) & (0xffffll << 48)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -uint64_t HELPER(iwmmxt_rorl)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ((x & (0xffffffffll << 0)) >> n) | - ((x >> n) & (0xffffffffll << 32)) | - ((x << (32 - n)) & (0xffffffffll << 0)) | - ((x & (0xffffffffll << 32)) << (32 - n)); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); - return x; -} - -uint64_t HELPER(iwmmxt_rorq)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = ror64(x, n); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); - return x; -} - -uint64_t HELPER(iwmmxt_shufh)(CPUARMState *env, uint64_t x, uint32_t n) -{ - x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) | - (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) | - (((x >> ((n << 0) & 0x30)) & 0xffff) << 32) | - (((x >> ((n >> 2) & 0x30)) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | - NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); - return x; -} - -/* TODO: Unsigned-Saturation */ -uint64_t HELPER(iwmmxt_packuw)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | - (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | - (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | - (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); - return a; -} - -uint64_t HELPER(iwmmxt_packul)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | - (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | - NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); - return a; -} - -uint64_t HELPER(iwmmxt_packuq)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); - return a; -} - -/* TODO: Signed-Saturation */ -uint64_t HELPER(iwmmxt_packsw)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | - (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | - (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | - (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | - NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | - NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | - NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 
7); - return a; -} - -uint64_t HELPER(iwmmxt_packsl)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | - (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | - NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); - return a; -} - -uint64_t HELPER(iwmmxt_packsq)(CPUARMState *env, uint64_t a, uint64_t b) -{ - a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); - env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = - NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); - return a; -} - -uint64_t HELPER(iwmmxt_muladdsl)(uint64_t c, uint32_t a, uint32_t b) -{ - return c + ((int32_t) EXTEND32(a) * (int32_t) EXTEND32(b)); -} - -uint64_t HELPER(iwmmxt_muladdsw)(uint64_t c, uint32_t a, uint32_t b) -{ - c += EXTEND32(EXTEND16S((a >> 0) & 0xffff) * - EXTEND16S((b >> 0) & 0xffff)); - c += EXTEND32(EXTEND16S((a >> 16) & 0xffff) * - EXTEND16S((b >> 16) & 0xffff)); - return c; -} - -uint64_t HELPER(iwmmxt_muladdswl)(uint64_t c, uint32_t a, uint32_t b) -{ - return c + (EXTEND32(EXTEND16S(a & 0xffff) * - EXTEND16S(b & 0xffff))); -} diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c deleted file mode 100644 index f94e87e728..0000000000 --- a/target/arm/m_helper.c +++ /dev/null @@ -1,2902 +0,0 @@ -/* - * ARM generic helpers. - * - * This code is licensed under the GNU GPL v2 or later. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/helper-proto.h" -#include "qemu/main-loop.h" -#include "qemu/bitops.h" -#include "qemu/log.h" -#include "exec/exec-all.h" -#ifdef CONFIG_TCG -#include "exec/cpu_ldst.h" -#include "semihosting/common-semi.h" -#endif -#if !defined(CONFIG_USER_ONLY) -#include "hw/intc/armv7m_nvic.h" -#endif - -static void v7m_msr_xpsr(CPUARMState *env, uint32_t mask, - uint32_t reg, uint32_t val) -{ - /* Only APSR is actually writable */ - if (!(reg & 4)) { - uint32_t apsrmask = 0; - - if (mask & 8) { - apsrmask |= XPSR_NZCV | XPSR_Q; - } - if ((mask & 4) && arm_feature(env, ARM_FEATURE_THUMB_DSP)) { - apsrmask |= XPSR_GE; - } - xpsr_write(env, val, apsrmask); - } -} - -static uint32_t v7m_mrs_xpsr(CPUARMState *env, uint32_t reg, unsigned el) -{ - uint32_t mask = 0; - - if ((reg & 1) && el) { - mask |= XPSR_EXCP; /* IPSR (unpriv. reads as zero) */ - } - if (!(reg & 4)) { - mask |= XPSR_NZCV | XPSR_Q; /* APSR */ - if (arm_feature(env, ARM_FEATURE_THUMB_DSP)) { - mask |= XPSR_GE; - } - } - /* EPSR reads as zero */ - return xpsr_read(env) & mask; -} - -static uint32_t v7m_mrs_control(CPUARMState *env, uint32_t secure) -{ - uint32_t value = env->v7m.control[secure]; - - if (!secure) { - /* SFPA is RAZ/WI from NS; FPCA is stored in the M_REG_S bank */ - value |= env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK; - } - return value; -} - -#ifdef CONFIG_USER_ONLY - -void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) -{ - uint32_t mask = extract32(maskreg, 8, 4); - uint32_t reg = extract32(maskreg, 0, 8); - - switch (reg) { - case 0 ... 7: /* xPSR sub-fields */ - v7m_msr_xpsr(env, mask, reg, val); - break; - case 20: /* CONTROL */ - /* There are no sub-fields that are actually writable from EL0. */ - break; - default: - /* Unprivileged writes to other registers are ignored */ - break; - } -} - -uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) -{ - switch (reg) { - case 0 ... 
7: /* xPSR sub-fields */ - return v7m_mrs_xpsr(env, reg, 0); - case 20: /* CONTROL */ - return v7m_mrs_control(env, 0); - default: - /* Unprivileged reads others as zero. */ - return 0; - } -} - -void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_preserve_fp_state)(CPUARMState *env) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) -{ - /* translate.c should never generate calls here in user-only mode */ - g_assert_not_reached(); -} - -uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) -{ - /* - * The TT instructions can be used by unprivileged code, but in - * user-only emulation we don't have the MPU. - * Luckily since we know we are NonSecure unprivileged (and that in - * turn means that the A flag wasn't specified), all the bits in the - * register must be zero: - * IREGION: 0 because IRVALID is 0 - * IRVALID: 0 because NS - * S: 0 because NS - * NSRW: 0 because NS - * NSR: 0 because NS - * RW: 0 because unpriv and A flag not set - * R: 0 because unpriv and A flag not set - * SRVALID: 0 because NS - * MRVALID: 0 because unpriv and A flag not set - * SREGION: 0 becaus SRVALID is 0 - * MREGION: 0 because MRVALID is 0 - */ - return 0; -} - -ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) -{ - return ARMMMUIdx_MUser; -} - -#else /* !CONFIG_USER_ONLY */ - -static ARMMMUIdx arm_v7m_mmu_idx_all(CPUARMState *env, - bool secstate, bool priv, bool negpri) -{ - ARMMMUIdx mmu_idx = ARM_MMU_IDX_M; - - if (priv) { - mmu_idx |= ARM_MMU_IDX_M_PRIV; - } - - if (negpri) { - mmu_idx |= ARM_MMU_IDX_M_NEGPRI; - } - - if (secstate) { - mmu_idx |= ARM_MMU_IDX_M_S; - } - - return mmu_idx; -} - -static ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env, - bool secstate, bool priv) -{ - bool negpri = armv7m_nvic_neg_prio_requested(env->nvic, secstate); - - return arm_v7m_mmu_idx_all(env, secstate, priv, negpri); -} - -/* Return the MMU index for a v7M CPU in the specified security state */ -ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate) -{ - bool priv = arm_v7m_is_handler_mode(env) || - !(env->v7m.control[secstate] & 1); - - return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv); -} - -/* - * What kind of stack write are we doing? This affects how exceptions - * generated during the stacking are treated. 
- */ -typedef enum StackingMode { - STACK_NORMAL, - STACK_IGNFAULTS, - STACK_LAZYFP, -} StackingMode; - -static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value, - ARMMMUIdx mmu_idx, StackingMode mode) -{ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult txres; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - bool secure = mmu_idx & ARM_MMU_IDX_M_S; - int exc; - bool exc_secure; - - if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &res, &fi)) { - /* MPU/SAU lookup failed */ - if (fi.type == ARMFault_QEMU_SFault) { - if (mode == STACK_LAZYFP) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault with SFSR.LSPERR " - "during lazy stacking\n"); - env->v7m.sfsr |= R_V7M_SFSR_LSPERR_MASK; - } else { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault with SFSR.AUVIOL " - "during stacking\n"); - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; - } - env->v7m.sfsr |= R_V7M_SFSR_SFARVALID_MASK; - env->v7m.sfar = addr; - exc = ARMV7M_EXCP_SECURE; - exc_secure = false; - } else { - if (mode == STACK_LAZYFP) { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault with CFSR.MLSPERR\n"); - env->v7m.cfsr[secure] |= R_V7M_CFSR_MLSPERR_MASK; - } else { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault with CFSR.MSTKERR\n"); - env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK; - } - exc = ARMV7M_EXCP_MEM; - exc_secure = secure; - } - goto pend_fault; - } - address_space_stl_le(arm_addressspace(cs, res.f.attrs), res.f.phys_addr, - value, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - /* BusFault trying to write the data */ - if (mode == STACK_LAZYFP) { - qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.LSPERR\n"); - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_LSPERR_MASK; - } else { - qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n"); - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK; - } - exc = ARMV7M_EXCP_BUS; - exc_secure = false; - goto pend_fault; - } - return true; - -pend_fault: - /* - * By pending the exception at this point we are making - * the IMPDEF choice "overridden exceptions pended" (see the - * MergeExcInfo() pseudocode). The other choice would be to not - * pend them now and then make a choice about which to throw away - * later if we have two derived exceptions. - * The only case when we must not pend the exception but instead - * throw it away is if we are doing the push of the callee registers - * and we've already generated a derived exception (this is indicated - * by the caller passing STACK_IGNFAULTS). Even in this case we will - * still update the fault status registers. 
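/*
 * For reference, the three stacking modes correspond to the callers
 * later in this file: v7m_push_stack() stacks with STACK_NORMAL,
 * v7m_push_callee_stack() switches to STACK_IGNFAULTS once a derived
 * exception has already been generated, and v7m_preserve_fp_state()
 * uses STACK_LAZYFP.
 */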
- */ - switch (mode) { - case STACK_NORMAL: - armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure); - break; - case STACK_LAZYFP: - armv7m_nvic_set_pending_lazyfp(env->nvic, exc, exc_secure); - break; - case STACK_IGNFAULTS: - break; - } - return false; -} - -static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr, - ARMMMUIdx mmu_idx) -{ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult txres; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - bool secure = mmu_idx & ARM_MMU_IDX_M_S; - int exc; - bool exc_secure; - uint32_t value; - - if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { - /* MPU/SAU lookup failed */ - if (fi.type == ARMFault_QEMU_SFault) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault with SFSR.AUVIOL during unstack\n"); - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; - env->v7m.sfar = addr; - exc = ARMV7M_EXCP_SECURE; - exc_secure = false; - } else { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault with CFSR.MUNSTKERR\n"); - env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK; - exc = ARMV7M_EXCP_MEM; - exc_secure = secure; - } - goto pend_fault; - } - - value = address_space_ldl(arm_addressspace(cs, res.f.attrs), - res.f.phys_addr, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - /* BusFault trying to read the data */ - qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n"); - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK; - exc = ARMV7M_EXCP_BUS; - exc_secure = false; - goto pend_fault; - } - - *dest = value; - return true; - -pend_fault: - /* - * By pending the exception at this point we are making - * the IMPDEF choice "overridden exceptions pended" (see the - * MergeExcInfo() pseudocode). The other choice would be to not - * pend them now and then make a choice about which to throw away - * later if we have two derived exceptions. - */ - armv7m_nvic_set_pending(env->nvic, exc, exc_secure); - return false; -} - -void HELPER(v7m_preserve_fp_state)(CPUARMState *env) -{ - /* - * Preserve FP state (because LSPACT was set and we are about - * to execute an FP instruction). This corresponds to the - * PreserveFPState() pseudocode. - * We may throw an exception if the stacking fails. - */ - ARMCPU *cpu = env_archcpu(env); - bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; - bool negpri = !(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_HFRDY_MASK); - bool is_priv = !(env->v7m.fpccr[is_secure] & R_V7M_FPCCR_USER_MASK); - bool splimviol = env->v7m.fpccr[is_secure] & R_V7M_FPCCR_SPLIMVIOL_MASK; - uint32_t fpcar = env->v7m.fpcar[is_secure]; - bool stacked_ok = true; - bool ts = is_secure && (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); - bool take_exception; - - /* Take the iothread lock as we are going to touch the NVIC */ - qemu_mutex_lock_iothread(); - - /* Check the background context had access to the FPU */ - if (!v7m_cpacr_pass(env, is_secure, is_priv)) { - armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, is_secure); - env->v7m.cfsr[is_secure] |= R_V7M_CFSR_NOCP_MASK; - stacked_ok = false; - } else if (!is_secure && !extract32(env->v7m.nsacr, 10, 1)) { - armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; - stacked_ok = false; - } - - if (!splimviol && stacked_ok) { - /* We only stack if the stack limit wasn't violated */ - int i; - ARMMMUIdx mmu_idx; - - mmu_idx = arm_v7m_mmu_idx_all(env, is_secure, is_priv, negpri); - for (i = 0; i < (ts ? 
32 : 16); i += 2) { - uint64_t dn = *aa32_vfp_dreg(env, i / 2); - uint32_t faddr = fpcar + 4 * i; - uint32_t slo = extract64(dn, 0, 32); - uint32_t shi = extract64(dn, 32, 32); - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR/VPR */ - } - stacked_ok = stacked_ok && - v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) && - v7m_stack_write(cpu, faddr + 4, shi, mmu_idx, STACK_LAZYFP); - } - - stacked_ok = stacked_ok && - v7m_stack_write(cpu, fpcar + 0x40, - vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP); - if (cpu_isar_feature(aa32_mve, cpu)) { - stacked_ok = stacked_ok && - v7m_stack_write(cpu, fpcar + 0x44, - env->v7m.vpr, mmu_idx, STACK_LAZYFP); - } - } - - /* - * We definitely pended an exception, but it's possible that it - * might not be able to be taken now. If its priority permits us - * to take it now, then we must not update the LSPACT or FP regs, - * but instead jump out to take the exception immediately. - * If it's just pending and won't be taken until the current - * handler exits, then we do update LSPACT and the FP regs. - */ - take_exception = !stacked_ok && - armv7m_nvic_can_take_pending_exception(env->nvic); - - qemu_mutex_unlock_iothread(); - - if (take_exception) { - raise_exception_ra(env, EXCP_LAZYFP, 0, 1, GETPC()); - } - - env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; - - if (ts) { - /* Clear s0 to s31 and the FPSCR and VPR */ - int i; - - for (i = 0; i < 32; i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - /* - * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them - * unchanged. - */ -} - -/* - * Write to v7M CONTROL.SPSEL bit for the specified security bank. - * This may change the current stack pointer between Main and Process - * stack pointers if it is done for the CONTROL register for the current - * security state. - */ -static void write_v7m_control_spsel_for_secstate(CPUARMState *env, - bool new_spsel, - bool secstate) -{ - bool old_is_psp = v7m_using_psp(env); - - env->v7m.control[secstate] = - deposit32(env->v7m.control[secstate], - R_V7M_CONTROL_SPSEL_SHIFT, - R_V7M_CONTROL_SPSEL_LENGTH, new_spsel); - - if (secstate == env->v7m.secure) { - bool new_is_psp = v7m_using_psp(env); - uint32_t tmp; - - if (old_is_psp != new_is_psp) { - tmp = env->v7m.other_sp; - env->v7m.other_sp = env->regs[13]; - env->regs[13] = tmp; - } - } -} - -/* - * Write to v7M CONTROL.SPSEL bit. This may change the current - * stack pointer between Main and Process stack pointers. - */ -static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel) -{ - write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure); -} - -void write_v7m_exception(CPUARMState *env, uint32_t new_exc) -{ - /* - * Write a new value to v7m.exception, thus transitioning into or out - * of Handler mode; this may result in a change of active stack pointer. 
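/*
 * Throughout these helpers the four M-profile stack pointers live in:
 *   env->regs[13]           the SP currently in use (MSP or PSP)
 *   env->v7m.other_sp       the inactive SP of the current security state
 *   env->v7m.other_ss_msp   the MSP of the other security state
 *   env->v7m.other_ss_psp   the PSP of the other security state
 * Changes to SPSEL, to Handler/Thread mode or to the security state swap
 * values between these slots so that regs[13] is always the selected SP.
 */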
- */ - bool new_is_psp, old_is_psp = v7m_using_psp(env); - uint32_t tmp; - - env->v7m.exception = new_exc; - - new_is_psp = v7m_using_psp(env); - - if (old_is_psp != new_is_psp) { - tmp = env->v7m.other_sp; - env->v7m.other_sp = env->regs[13]; - env->regs[13] = tmp; - } -} - -/* Switch M profile security state between NS and S */ -static void switch_v7m_security_state(CPUARMState *env, bool new_secstate) -{ - uint32_t new_ss_msp, new_ss_psp; - - if (env->v7m.secure == new_secstate) { - return; - } - - /* - * All the banked state is accessed by looking at env->v7m.secure - * except for the stack pointer; rearrange the SP appropriately. - */ - new_ss_msp = env->v7m.other_ss_msp; - new_ss_psp = env->v7m.other_ss_psp; - - if (v7m_using_psp(env)) { - env->v7m.other_ss_psp = env->regs[13]; - env->v7m.other_ss_msp = env->v7m.other_sp; - } else { - env->v7m.other_ss_msp = env->regs[13]; - env->v7m.other_ss_psp = env->v7m.other_sp; - } - - env->v7m.secure = new_secstate; - - if (v7m_using_psp(env)) { - env->regs[13] = new_ss_psp; - env->v7m.other_sp = new_ss_msp; - } else { - env->regs[13] = new_ss_msp; - env->v7m.other_sp = new_ss_psp; - } -} - -void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) -{ - /* - * Handle v7M BXNS: - * - if the return value is a magic value, do exception return (like BX) - * - otherwise bit 0 of the return value is the target security state - */ - uint32_t min_magic; - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - /* Covers FNC_RETURN and EXC_RETURN magic */ - min_magic = FNC_RETURN_MIN_MAGIC; - } else { - /* EXC_RETURN magic only */ - min_magic = EXC_RETURN_MIN_MAGIC; - } - - if (dest >= min_magic) { - /* - * This is an exception return magic value; put it where - * do_v7m_exception_exit() expects and raise EXCEPTION_EXIT. - * Note that if we ever add gen_ss_advance() singlestep support to - * M profile this should count as an "instruction execution complete" - * event (compare gen_bx_excret_final_code()). - */ - env->regs[15] = dest & ~1; - env->thumb = dest & 1; - HELPER(exception_internal)(env, EXCP_EXCEPTION_EXIT); - /* notreached */ - } - - /* translate.c should have made BXNS UNDEF unless we're secure */ - assert(env->v7m.secure); - - if (!(dest & 1)) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - } - switch_v7m_security_state(env, dest & 1); - env->thumb = true; - env->regs[15] = dest & ~1; - arm_rebuild_hflags(env); -} - -void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) -{ - /* - * Handle v7M BLXNS: - * - bit 0 of the destination address is the target security state - */ - - /* At this point regs[15] is the address just after the BLXNS */ - uint32_t nextinst = env->regs[15] | 1; - uint32_t sp = env->regs[13] - 8; - uint32_t saved_psr; - - /* translate.c will have made BLXNS UNDEF unless we're secure */ - assert(env->v7m.secure); - - if (dest & 1) { - /* - * Target is Secure, so this is just a normal BLX, - * except that the low bit doesn't indicate Thumb/not. 
- */ - env->regs[14] = nextinst; - env->thumb = true; - env->regs[15] = dest & ~1; - return; - } - - /* Target is non-secure: first push a stack frame */ - if (!QEMU_IS_ALIGNED(sp, 8)) { - qemu_log_mask(LOG_GUEST_ERROR, - "BLXNS with misaligned SP is UNPREDICTABLE\n"); - } - - if (sp < v7m_sp_limit(env)) { - raise_exception(env, EXCP_STKOF, 0, 1); - } - - saved_psr = env->v7m.exception; - if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK) { - saved_psr |= XPSR_SFPA; - } - - /* Note that these stores can throw exceptions on MPU faults */ - cpu_stl_data_ra(env, sp, nextinst, GETPC()); - cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC()); - - env->regs[13] = sp; - env->regs[14] = 0xfeffffff; - if (arm_v7m_is_handler_mode(env)) { - /* - * Write a dummy value to IPSR, to avoid leaking the current secure - * exception number to non-secure code. This is guaranteed not - * to cause write_v7m_exception() to actually change stacks. - */ - write_v7m_exception(env, 1); - } - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - switch_v7m_security_state(env, 0); - env->thumb = true; - env->regs[15] = dest; - arm_rebuild_hflags(env); -} - -static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode, - bool spsel) -{ - /* - * Return a pointer to the location where we currently store the - * stack pointer for the requested security state and thread mode. - * This pointer will become invalid if the CPU state is updated - * such that the stack pointers are switched around (eg changing - * the SPSEL control bit). - * Compare the v8M ARM ARM pseudocode LookUpSP_with_security_mode(). - * Unlike that pseudocode, we require the caller to pass us in the - * SPSEL control bit value; this is because we also use this - * function in handling of pushing of the callee-saves registers - * part of the v8M stack frame (pseudocode PushCalleeStack()), - * and in the tailchain codepath the SPSEL bit comes from the exception - * return magic LR value from the previous exception. The pseudocode - * opencodes the stack-selection in PushCalleeStack(), but we prefer - * to make this utility function generic enough to do the job. - */ - bool want_psp = threadmode && spsel; - - if (secure == env->v7m.secure) { - if (want_psp == v7m_using_psp(env)) { - return &env->regs[13]; - } else { - return &env->v7m.other_sp; - } - } else { - if (want_psp) { - return &env->v7m.other_ss_psp; - } else { - return &env->v7m.other_ss_msp; - } - } -} - -static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure, - uint32_t *pvec) -{ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult result; - uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4; - uint32_t vector_entry; - MemTxAttrs attrs = {}; - ARMMMUIdx mmu_idx; - bool exc_secure; - - qemu_log_mask(CPU_LOG_INT, - "...loading from element %d of %s vector table at 0x%x\n", - exc, targets_secure ? "secure" : "non-secure", addr); - - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true); - - /* - * We don't do a get_phys_addr() here because the rules for vector - * loads are special: they always use the default memory map, and - * the default memory map permits reads from all addresses. - * Since there's no easy way to pass through to pmsav8_mpu_lookup() - * that we want this special case which would always say "yes", - * we just do the SAU lookup here followed by a direct physical load. 
- */ - attrs.secure = targets_secure; - attrs.user = false; - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - V8M_SAttributes sattrs = {}; - - v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, - targets_secure, &sattrs); - if (sattrs.ns) { - attrs.secure = false; - } else if (!targets_secure) { - /* - * NS access to S memory: the underlying exception which we escalate - * to HardFault is SecureFault, which always targets Secure. - */ - exc_secure = true; - goto load_fail; - } - } - - vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr, - attrs, &result); - if (result != MEMTX_OK) { - /* - * Underlying exception is BusFault: its target security state - * depends on BFHFNMINS. - */ - exc_secure = !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK); - goto load_fail; - } - *pvec = vector_entry; - qemu_log_mask(CPU_LOG_INT, "...loaded new PC 0x%x\n", *pvec); - return true; - -load_fail: - /* - * All vector table fetch fails are reported as HardFault, with - * HFSR.VECTTBL and .FORCED set. (FORCED is set because - * technically the underlying exception is a SecureFault or BusFault - * that is escalated to HardFault.) This is a terminal exception, - * so we will either take the HardFault immediately or else enter - * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()). - * The HardFault is Secure if BFHFNMINS is 0 (meaning that all HFs are - * secure); otherwise it targets the same security state as the - * underlying exception. - * In v8.1M HardFaults from vector table fetch fails don't set FORCED. - */ - if (!(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK)) { - exc_secure = true; - } - env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK; - if (!arm_feature(env, ARM_FEATURE_V8_1M)) { - env->v7m.hfsr |= R_V7M_HFSR_FORCED_MASK; - } - armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure); - return false; -} - -static uint32_t v7m_integrity_sig(CPUARMState *env, uint32_t lr) -{ - /* - * Return the integrity signature value for the callee-saves - * stack frame section. @lr is the exception return payload/LR value - * whose FType bit forms bit 0 of the signature if FP is present. - */ - uint32_t sig = 0xfefa125a; - - if (!cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) - || (lr & R_V7M_EXCRET_FTYPE_MASK)) { - sig |= 1; - } - return sig; -} - -static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain, - bool ignore_faults) -{ - /* - * For v8M, push the callee-saves register part of the stack frame. - * Compare the v8M pseudocode PushCalleeStack(). - * In the tailchaining case this may not be the current stack. - */ - CPUARMState *env = &cpu->env; - uint32_t *frame_sp_p; - uint32_t frameptr; - ARMMMUIdx mmu_idx; - bool stacked_ok; - uint32_t limit; - bool want_psp; - uint32_t sig; - StackingMode smode = ignore_faults ? 
STACK_IGNFAULTS : STACK_NORMAL; - - if (dotailchain) { - bool mode = lr & R_V7M_EXCRET_MODE_MASK; - bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) || - !mode; - - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv); - frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode, - lr & R_V7M_EXCRET_SPSEL_MASK); - want_psp = mode && (lr & R_V7M_EXCRET_SPSEL_MASK); - if (want_psp) { - limit = env->v7m.psplim[M_REG_S]; - } else { - limit = env->v7m.msplim[M_REG_S]; - } - } else { - mmu_idx = arm_mmu_idx(env); - frame_sp_p = &env->regs[13]; - limit = v7m_sp_limit(env); - } - - frameptr = *frame_sp_p - 0x28; - if (frameptr < limit) { - /* - * Stack limit failure: set SP to the limit value, and generate - * STKOF UsageFault. Stack pushes below the limit must not be - * performed. It is IMPDEF whether pushes above the limit are - * performed; we choose not to. - */ - qemu_log_mask(CPU_LOG_INT, - "...STKOF during callee-saves register stacking\n"); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - *frame_sp_p = limit; - return true; - } - - /* - * Write as much of the stack frame as we can. A write failure may - * cause us to pend a derived exception. - */ - sig = v7m_integrity_sig(env, lr); - stacked_ok = - v7m_stack_write(cpu, frameptr, sig, mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, smode) && - v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, smode); - - /* Update SP regardless of whether any of the stack accesses failed. */ - *frame_sp_p = frameptr; - - return !stacked_ok; -} - -static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain, - bool ignore_stackfaults) -{ - /* - * Do the "take the exception" parts of exception entry, - * but not the pushing of state to the stack. This is - * similar to the pseudocode ExceptionTaken() function. - */ - CPUARMState *env = &cpu->env; - uint32_t addr; - bool targets_secure; - int exc; - bool push_failed = false; - - armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure); - qemu_log_mask(CPU_LOG_INT, "...taking pending %s exception %d\n", - targets_secure ? "secure" : "nonsecure", exc); - - if (dotailchain) { - /* Sanitize LR FType and PREFIX bits */ - if (!cpu_isar_feature(aa32_vfp_simd, cpu)) { - lr |= R_V7M_EXCRET_FTYPE_MASK; - } - lr = deposit32(lr, 24, 8, 0xff); - } - - if (arm_feature(env, ARM_FEATURE_V8)) { - if (arm_feature(env, ARM_FEATURE_M_SECURITY) && - (lr & R_V7M_EXCRET_S_MASK)) { - /* - * The background code (the owner of the registers in the - * exception frame) is Secure. This means it may either already - * have or now needs to push callee-saves registers. - */ - if (targets_secure) { - if (dotailchain && !(lr & R_V7M_EXCRET_ES_MASK)) { - /* - * We took an exception from Secure to NonSecure - * (which means the callee-saved registers got stacked) - * and are now tailchaining to a Secure exception. - * Clear DCRS so eventual return from this Secure - * exception unstacks the callee-saved registers. 
- */ - lr &= ~R_V7M_EXCRET_DCRS_MASK; - } - } else { - /* - * We're going to a non-secure exception; push the - * callee-saves registers to the stack now, if they're - * not already saved. - */ - if (lr & R_V7M_EXCRET_DCRS_MASK && - !(dotailchain && !(lr & R_V7M_EXCRET_ES_MASK))) { - push_failed = v7m_push_callee_stack(cpu, lr, dotailchain, - ignore_stackfaults); - } - lr |= R_V7M_EXCRET_DCRS_MASK; - } - } - - lr &= ~R_V7M_EXCRET_ES_MASK; - if (targets_secure) { - lr |= R_V7M_EXCRET_ES_MASK; - } - lr &= ~R_V7M_EXCRET_SPSEL_MASK; - if (env->v7m.control[targets_secure] & R_V7M_CONTROL_SPSEL_MASK) { - lr |= R_V7M_EXCRET_SPSEL_MASK; - } - - /* - * Clear registers if necessary to prevent non-secure exception - * code being able to see register values from secure code. - * Where register values become architecturally UNKNOWN we leave - * them with their previous values. v8.1M is tighter than v8.0M - * here and always zeroes the caller-saved registers regardless - * of the security state the exception is targeting. - */ - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - if (!targets_secure || arm_feature(env, ARM_FEATURE_V8_1M)) { - /* - * Always clear the caller-saved registers (they have been - * pushed to the stack earlier in v7m_push_stack()). - * Clear callee-saved registers if the background code is - * Secure (in which case these regs were saved in - * v7m_push_callee_stack()). - */ - int i; - /* - * r4..r11 are callee-saves, zero only if background - * state was Secure (EXCRET.S == 1) and exception - * targets Non-secure state - */ - bool zero_callee_saves = !targets_secure && - (lr & R_V7M_EXCRET_S_MASK); - - for (i = 0; i < 13; i++) { - if (i < 4 || i > 11 || zero_callee_saves) { - env->regs[i] = 0; - } - } - /* Clear EAPSR */ - xpsr_write(env, 0, XPSR_NZCV | XPSR_Q | XPSR_GE | XPSR_IT); - } - } - } - - if (push_failed && !ignore_stackfaults) { - /* - * Derived exception on callee-saves register stacking: - * we might now want to take a different exception which - * targets a different security state, so try again from the top. - */ - qemu_log_mask(CPU_LOG_INT, - "...derived exception on callee-saves register stacking"); - v7m_exception_taken(cpu, lr, true, true); - return; - } - - if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) { - /* Vector load failed: derived exception */ - qemu_log_mask(CPU_LOG_INT, "...derived exception on vector table load"); - v7m_exception_taken(cpu, lr, true, true); - return; - } - - /* - * Now we've done everything that might cause a derived exception - * we can go ahead and activate whichever exception we're going to - * take (which might now be the derived exception). - */ - armv7m_nvic_acknowledge_irq(env->nvic); - - /* Switch to target security state -- must do this before writing SPSEL */ - switch_v7m_security_state(env, targets_secure); - write_v7m_control_spsel(env, 0); - arm_clear_exclusive(env); - /* Clear SFPA and FPCA (has no effect if no FPU) */ - env->v7m.control[M_REG_S] &= - ~(R_V7M_CONTROL_FPCA_MASK | R_V7M_CONTROL_SFPA_MASK); - /* Clear IT bits */ - env->condexec_bits = 0; - env->regs[14] = lr; - env->regs[15] = addr & 0xfffffffe; - env->thumb = addr & 1; - arm_rebuild_hflags(env); -} - -static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr, - bool apply_splim) -{ - /* - * Like the pseudocode UpdateFPCCR: save state in FPCAR and FPCCR - * that we will need later in order to do lazy FP reg stacking. 
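/*
 * In outline: FPCAR records the (8-byte aligned) address the FP state
 * would be written to, FPCCR.LSPACT marks lazy state preservation as
 * active, and the S/USER/THREAD/SPLIMVIOL and *RDY bits snapshot the
 * security, privilege and priority context of the FP owner, so that a
 * later v7m_preserve_fp_state() can perform the deferred stacking
 * correctly.
 */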
- */ - bool is_secure = env->v7m.secure; - NVICState *nvic = env->nvic; - /* - * Some bits are unbanked and live always in fpccr[M_REG_S]; some bits - * are banked and we want to update the bit in the bank for the - * current security state; and in one case we want to specifically - * update the NS banked version of a bit even if we are secure. - */ - uint32_t *fpccr_s = &env->v7m.fpccr[M_REG_S]; - uint32_t *fpccr_ns = &env->v7m.fpccr[M_REG_NS]; - uint32_t *fpccr = &env->v7m.fpccr[is_secure]; - bool hfrdy, bfrdy, mmrdy, ns_ufrdy, s_ufrdy, sfrdy, monrdy; - - env->v7m.fpcar[is_secure] = frameptr & ~0x7; - - if (apply_splim && arm_feature(env, ARM_FEATURE_V8)) { - bool splimviol; - uint32_t splim = v7m_sp_limit(env); - bool ign = armv7m_nvic_neg_prio_requested(nvic, is_secure) && - (env->v7m.ccr[is_secure] & R_V7M_CCR_STKOFHFNMIGN_MASK); - - splimviol = !ign && frameptr < splim; - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, SPLIMVIOL, splimviol); - } - - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, LSPACT, 1); - - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, S, is_secure); - - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, USER, arm_current_el(env) == 0); - - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, THREAD, - !arm_v7m_is_handler_mode(env)); - - hfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_HARD, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, HFRDY, hfrdy); - - bfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_BUS, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, BFRDY, bfrdy); - - mmrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_MEM, is_secure); - *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, MMRDY, mmrdy); - - ns_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, false); - *fpccr_ns = FIELD_DP32(*fpccr_ns, V7M_FPCCR, UFRDY, ns_ufrdy); - - monrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_DEBUG, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, MONRDY, monrdy); - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - s_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, true); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, UFRDY, s_ufrdy); - - sfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_SECURE, false); - *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, SFRDY, sfrdy); - } -} - -void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) -{ - /* fptr is the value of Rn, the frame pointer we store the FP regs to */ - ARMCPU *cpu = env_archcpu(env); - bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; - bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK; - uintptr_t ra = GETPC(); - - assert(env->v7m.secure); - - if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { - return; - } - - /* Check access to the coprocessor is permitted */ - if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { - raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); - } - - if (lspact) { - /* LSPACT should not be active when there is active FP state */ - raise_exception_ra(env, EXCP_LSERR, 0, 1, GETPC()); - } - - if (fptr & 7) { - raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); - } - - /* - * Note that we do not use v7m_stack_write() here, because the - * accesses should not set the FSR bits for stacking errors if they - * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK - * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions - * and longjmp out. - */ - if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { - bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; - int i; - - for (i = 0; i < (ts ? 
32 : 16); i += 2) { - uint64_t dn = *aa32_vfp_dreg(env, i / 2); - uint32_t faddr = fptr + 4 * i; - uint32_t slo = extract64(dn, 0, 32); - uint32_t shi = extract64(dn, 32, 32); - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR */ - } - cpu_stl_data_ra(env, faddr, slo, ra); - cpu_stl_data_ra(env, faddr + 4, shi, ra); - } - cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra); - if (cpu_isar_feature(aa32_mve, cpu)) { - cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra); - } - - /* - * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to - * leave them unchanged, matching our choice in v7m_preserve_fp_state. - */ - if (ts) { - for (i = 0; i < 32; i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } else { - v7m_update_fpccr(env, fptr, false); - } - - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; -} - -void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) -{ - ARMCPU *cpu = env_archcpu(env); - uintptr_t ra = GETPC(); - - /* fptr is the value of Rn, the frame pointer we load the FP regs from */ - assert(env->v7m.secure); - - if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { - return; - } - - /* Check access to the coprocessor is permitted */ - if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { - raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); - } - - if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { - /* State in FP is still valid */ - env->v7m.fpccr[M_REG_S] &= ~R_V7M_FPCCR_LSPACT_MASK; - } else { - bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; - int i; - uint32_t fpscr; - - if (fptr & 7) { - raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); - } - - for (i = 0; i < (ts ? 32 : 16); i += 2) { - uint32_t slo, shi; - uint64_t dn; - uint32_t faddr = fptr + 4 * i; - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR and VPR */ - } - - slo = cpu_ldl_data_ra(env, faddr, ra); - shi = cpu_ldl_data_ra(env, faddr + 4, ra); - - dn = (uint64_t) shi << 32 | slo; - *aa32_vfp_dreg(env, i / 2) = dn; - } - fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra); - vfp_set_fpscr(env, fpscr); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra); - } - } - - env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK; -} - -static bool v7m_push_stack(ARMCPU *cpu) -{ - /* - * Do the "set up stack frame" part of exception entry, - * similar to pseudocode PushStack(). - * Return true if we generate a derived exception (and so - * should ignore further stack faults trying to process - * that derived exception.) 
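The frame sizes selected just below grow from the basic 8-word frame when FP
context has to be stacked. A minimal sketch of that selection
(example_v7m_frame_size() is a hypothetical helper, and it is simplified:
the code below additionally checks NSACR.CP10 for Non-secure callers):

static uint32_t example_v7m_frame_size(bool fpca, bool secure_with_ts)
{
    if (!fpca) {
        return 0x20;                      /* r0-r3, r12, lr, pc, xpsr */
    }
    /* add s0-s15, FPSCR and the VPR/reserved word ... */
    return secure_with_ts ? 0xa8 : 0x68;  /* ... plus s16-s31 when TS */
}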
- */ - bool stacked_ok = true, limitviol = false; - CPUARMState *env = &cpu->env; - uint32_t xpsr = xpsr_read(env); - uint32_t frameptr = env->regs[13]; - ARMMMUIdx mmu_idx = arm_mmu_idx(env); - uint32_t framesize; - bool nsacr_cp10 = extract32(env->v7m.nsacr, 10, 1); - - if ((env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) && - (env->v7m.secure || nsacr_cp10)) { - if (env->v7m.secure && - env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK) { - framesize = 0xa8; - } else { - framesize = 0x68; - } - } else { - framesize = 0x20; - } - - /* Align stack pointer if the guest wants that */ - if ((frameptr & 4) && - (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) { - frameptr -= 4; - xpsr |= XPSR_SPREALIGN; - } - - xpsr &= ~XPSR_SFPA; - if (env->v7m.secure && - (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { - xpsr |= XPSR_SFPA; - } - - frameptr -= framesize; - - if (arm_feature(env, ARM_FEATURE_V8)) { - uint32_t limit = v7m_sp_limit(env); - - if (frameptr < limit) { - /* - * Stack limit failure: set SP to the limit value, and generate - * STKOF UsageFault. Stack pushes below the limit must not be - * performed. It is IMPDEF whether pushes above the limit are - * performed; we choose not to. - */ - qemu_log_mask(CPU_LOG_INT, - "...STKOF during stacking\n"); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - env->regs[13] = limit; - /* - * We won't try to perform any further memory accesses but - * we must continue through the following code to check for - * permission faults during FPU state preservation, and we - * must update FPCCR if lazy stacking is enabled. - */ - limitviol = true; - stacked_ok = false; - } - } - - /* - * Write as much of the stack frame as we can. If we fail a stack - * write this will result in a derived exception being pended - * (which may be taken in preference to the one we started with - * if it has higher priority). 
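/*
 * The eight words written below form the standard v7M/v8M exception
 * frame, at these offsets from frameptr:
 *   0x00 r0   0x04 r1   0x08 r2   0x0c r3
 *   0x10 r12  0x14 lr   0x18 pc   0x1c xpsr
 */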
- */ - stacked_ok = stacked_ok && - v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 4, env->regs[1], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 8, env->regs[2], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 12, env->regs[3], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 16, env->regs[12], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 20, env->regs[14], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 24, env->regs[15], - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, STACK_NORMAL); - - if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) { - /* FPU is active, try to save its registers */ - bool fpccr_s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; - bool lspact = env->v7m.fpccr[fpccr_s] & R_V7M_FPCCR_LSPACT_MASK; - - if (lspact && arm_feature(env, ARM_FEATURE_M_SECURITY)) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault because LSPACT and FPCA both set\n"); - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - } else if (!env->v7m.secure && !nsacr_cp10) { - qemu_log_mask(CPU_LOG_INT, - "...Secure UsageFault with CFSR.NOCP because " - "NSACR.CP10 prevents stacking FP regs\n"); - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; - } else { - if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { - /* Lazy stacking disabled, save registers now */ - int i; - bool cpacr_pass = v7m_cpacr_pass(env, env->v7m.secure, - arm_current_el(env) != 0); - - if (stacked_ok && !cpacr_pass) { - /* - * Take UsageFault if CPACR forbids access. The pseudocode - * here does a full CheckCPEnabled() but we know the NSACR - * check can never fail as we have already handled that. - */ - qemu_log_mask(CPU_LOG_INT, - "...UsageFault with CFSR.NOCP because " - "CPACR.CP10 prevents stacking FP regs\n"); - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_NOCP_MASK; - stacked_ok = false; - } - - for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { - uint64_t dn = *aa32_vfp_dreg(env, i / 2); - uint32_t faddr = frameptr + 0x20 + 4 * i; - uint32_t slo = extract64(dn, 0, 32); - uint32_t shi = extract64(dn, 32, 32); - - if (i >= 16) { - faddr += 8; /* skip the slot for the FPSCR and VPR */ - } - stacked_ok = stacked_ok && - v7m_stack_write(cpu, faddr, slo, - mmu_idx, STACK_NORMAL) && - v7m_stack_write(cpu, faddr + 4, shi, - mmu_idx, STACK_NORMAL); - } - stacked_ok = stacked_ok && - v7m_stack_write(cpu, frameptr + 0x60, - vfp_get_fpscr(env), mmu_idx, STACK_NORMAL); - if (cpu_isar_feature(aa32_mve, cpu)) { - stacked_ok = stacked_ok && - v7m_stack_write(cpu, frameptr + 0x64, - env->v7m.vpr, mmu_idx, STACK_NORMAL); - } - if (cpacr_pass) { - for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } else { - /* Lazy stacking enabled, save necessary info to stack later */ - v7m_update_fpccr(env, frameptr + 0x20, true); - } - } - } - - /* - * If we broke a stack limit then SP was already updated earlier; - * otherwise we update SP regardless of whether any of the stack - * accesses failed or we took some other kind of fault. 
- */ - if (!limitviol) { - env->regs[13] = frameptr; - } - - return !stacked_ok; -} - -static void do_v7m_exception_exit(ARMCPU *cpu) -{ - CPUARMState *env = &cpu->env; - uint32_t excret; - uint32_t xpsr, xpsr_mask; - bool ufault = false; - bool sfault = false; - bool return_to_sp_process; - bool return_to_handler; - bool rettobase = false; - bool exc_secure = false; - bool return_to_secure; - bool ftype; - bool restore_s16_s31 = false; - - /* - * If we're not in Handler mode then jumps to magic exception-exit - * addresses don't have magic behaviour. However for the v8M - * security extensions the magic secure-function-return has to - * work in thread mode too, so to avoid doing an extra check in - * the generated code we allow exception-exit magic to also cause the - * internal exception and bring us here in thread mode. Correct code - * will never try to do this (the following insn fetch will always - * fault) so we the overhead of having taken an unnecessary exception - * doesn't matter. - */ - if (!arm_v7m_is_handler_mode(env)) { - return; - } - - /* - * In the spec pseudocode ExceptionReturn() is called directly - * from BXWritePC() and gets the full target PC value including - * bit zero. In QEMU's implementation we treat it as a normal - * jump-to-register (which is then caught later on), and so split - * the target value up between env->regs[15] and env->thumb in - * gen_bx(). Reconstitute it. - */ - excret = env->regs[15]; - if (env->thumb) { - excret |= 1; - } - - qemu_log_mask(CPU_LOG_INT, "Exception return: magic PC %" PRIx32 - " previous exception %d\n", - excret, env->v7m.exception); - - if ((excret & R_V7M_EXCRET_RES1_MASK) != R_V7M_EXCRET_RES1_MASK) { - qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero high bits in exception " - "exit PC value 0x%" PRIx32 " are UNPREDICTABLE\n", - excret); - } - - ftype = excret & R_V7M_EXCRET_FTYPE_MASK; - - if (!ftype && !cpu_isar_feature(aa32_vfp_simd, cpu)) { - qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero FTYPE in exception " - "exit PC value 0x%" PRIx32 " is UNPREDICTABLE " - "if FPU not present\n", - excret); - ftype = true; - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - /* - * EXC_RETURN.ES validation check (R_SMFL). We must do this before - * we pick which FAULTMASK to clear. - */ - if (!env->v7m.secure && - ((excret & R_V7M_EXCRET_ES_MASK) || - !(excret & R_V7M_EXCRET_DCRS_MASK))) { - sfault = 1; - /* For all other purposes, treat ES as 0 (R_HXSR) */ - excret &= ~R_V7M_EXCRET_ES_MASK; - } - exc_secure = excret & R_V7M_EXCRET_ES_MASK; - } - - if (env->v7m.exception != ARMV7M_EXCP_NMI) { - /* - * Auto-clear FAULTMASK on return from other than NMI. - * If the security extension is implemented then this only - * happens if the raw execution priority is >= 0; the - * value of the ES bit in the exception return value indicates - * which security state's faultmask to clear. (v8M ARM ARM R_KBNF.) - */ - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - if (armv7m_nvic_raw_execution_priority(env->nvic) >= 0) { - env->v7m.faultmask[exc_secure] = 0; - } - } else { - env->v7m.faultmask[M_REG_NS] = 0; - } - } - - switch (armv7m_nvic_complete_irq(env->nvic, env->v7m.exception, - exc_secure)) { - case -1: - /* attempt to exit an exception that isn't active */ - ufault = true; - break; - case 0: - /* still an irq active now */ - break; - case 1: - /* - * We returned to base exception level, no nesting. - * (In the pseudocode this is written using "NestedActivation != 1" - * where we have 'rettobase == false'.) 
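The return destination is decoded from EXC_RETURN a few lines further down;
a minimal sketch of that decoding, reusing the R_V7M_EXCRET_* masks used in
this file (example_excret_destination() is a hypothetical helper):

static void example_excret_destination(uint32_t excret, bool *to_handler,
                                       bool *to_process_sp, bool *to_secure)
{
    *to_handler = !(excret & R_V7M_EXCRET_MODE_MASK);   /* MODE clear */
    *to_process_sp = excret & R_V7M_EXCRET_SPSEL_MASK;  /* SPSEL set */
    /* S is only meaningful when the Security Extension is implemented */
    *to_secure = excret & R_V7M_EXCRET_S_MASK;
}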
- */ - rettobase = true; - break; - default: - g_assert_not_reached(); - } - - return_to_handler = !(excret & R_V7M_EXCRET_MODE_MASK); - return_to_sp_process = excret & R_V7M_EXCRET_SPSEL_MASK; - return_to_secure = arm_feature(env, ARM_FEATURE_M_SECURITY) && - (excret & R_V7M_EXCRET_S_MASK); - - if (arm_feature(env, ARM_FEATURE_V8)) { - if (!arm_feature(env, ARM_FEATURE_M_SECURITY)) { - /* - * UNPREDICTABLE if S == 1 or DCRS == 0 or ES == 1 (R_XLCP); - * we choose to take the UsageFault. - */ - if ((excret & R_V7M_EXCRET_S_MASK) || - (excret & R_V7M_EXCRET_ES_MASK) || - !(excret & R_V7M_EXCRET_DCRS_MASK)) { - ufault = true; - } - } - if (excret & R_V7M_EXCRET_RES0_MASK) { - ufault = true; - } - } else { - /* For v7M we only recognize certain combinations of the low bits */ - switch (excret & 0xf) { - case 1: /* Return to Handler */ - break; - case 13: /* Return to Thread using Process stack */ - case 9: /* Return to Thread using Main stack */ - /* - * We only need to check NONBASETHRDENA for v7M, because in - * v8M this bit does not exist (it is RES1). - */ - if (!rettobase && - !(env->v7m.ccr[env->v7m.secure] & - R_V7M_CCR_NONBASETHRDENA_MASK)) { - ufault = true; - } - break; - default: - ufault = true; - } - } - - /* - * Set CONTROL.SPSEL from excret.SPSEL. Since we're still in - * Handler mode (and will be until we write the new XPSR.Interrupt - * field) this does not switch around the current stack pointer. - * We must do this before we do any kind of tailchaining, including - * for the derived exceptions on integrity check failures, or we will - * give the guest an incorrect EXCRET.SPSEL value on exception entry. - */ - write_v7m_control_spsel_for_secstate(env, return_to_sp_process, exc_secure); - - /* - * Clear scratch FP values left in caller saved registers; this - * must happen before any kind of tail chaining. 
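For the v7M switch above, the recognized low-nibble values correspond to the familiar full EXC_RETURN constants, spelled out here for orientation and assuming the usual 0xFFFFFFFx encodings:

    0xFFFFFFF1   (excret & 0xf) == 1    return to Handler mode, main stack
    0xFFFFFFF9   (excret & 0xf) == 9    return to Thread mode, main stack
    0xFFFFFFFD   (excret & 0xf) == 13   return to Thread mode, process stack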
- */ - if ((env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_CLRONRET_MASK) && - (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { - if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " - "stackframe: error during lazy state deactivation\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } else { - if (arm_feature(env, ARM_FEATURE_V8_1M)) { - /* v8.1M adds this NOCP check */ - bool nsacr_pass = exc_secure || - extract32(env->v7m.nsacr, 10, 1); - bool cpacr_pass = v7m_cpacr_pass(env, exc_secure, true); - if (!nsacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: NSACR prevents clearing FPU registers\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } else if (!cpacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - exc_secure); - env->v7m.cfsr[exc_secure] |= R_V7M_CFSR_NOCP_MASK; - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: CPACR prevents clearing FPU registers\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - } - /* Clear s0..s15, FPSCR and VPR */ - int i; - - for (i = 0; i < 16; i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } - - if (sfault) { - env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " - "stackframe: failed EXC_RETURN.ES validity check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - if (ufault) { - /* - * Bad exception return: instead of popping the exception - * stack, directly take a usage fault on the current stack. - */ - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: failed exception return integrity check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - /* - * Tailchaining: if there is currently a pending exception that - * is high enough priority to preempt execution at the level we're - * about to return to, then just directly take that exception now, - * avoiding an unstack-and-then-stack. Note that now we have - * deactivated the previous exception by calling armv7m_nvic_complete_irq() - * our current execution priority is already the execution priority we are - * returning to -- none of the state we would unstack or set based on - * the EXCRET value affects it. - */ - if (armv7m_nvic_can_take_pending_exception(env->nvic)) { - qemu_log_mask(CPU_LOG_INT, "...tailchaining to pending exception\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - switch_v7m_security_state(env, return_to_secure); - - { - /* - * The stack pointer we should be reading the exception frame from - * depends on bits in the magic exception return type value (and - * for v8M isn't necessarily the stack pointer we will eventually - * end up resuming execution with). Get a pointer to the location - * in the CPU state struct where the SP we need is currently being - * stored; we will use and modify it in place. 
- * We use this limited C variable scope so we don't accidentally - * use 'frame_sp_p' after we do something that makes it invalid. - */ - bool spsel = env->v7m.control[return_to_secure] & R_V7M_CONTROL_SPSEL_MASK; - uint32_t *frame_sp_p = get_v7m_sp_ptr(env, - return_to_secure, - !return_to_handler, - spsel); - uint32_t frameptr = *frame_sp_p; - bool pop_ok = true; - ARMMMUIdx mmu_idx; - bool return_to_priv = return_to_handler || - !(env->v7m.control[return_to_secure] & R_V7M_CONTROL_NPRIV_MASK); - - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure, - return_to_priv); - - if (!QEMU_IS_ALIGNED(frameptr, 8) && - arm_feature(env, ARM_FEATURE_V8)) { - qemu_log_mask(LOG_GUEST_ERROR, - "M profile exception return with non-8-aligned SP " - "for destination state is UNPREDICTABLE\n"); - } - - /* Do we need to pop callee-saved registers? */ - if (return_to_secure && - ((excret & R_V7M_EXCRET_ES_MASK) == 0 || - (excret & R_V7M_EXCRET_DCRS_MASK) == 0)) { - uint32_t actual_sig; - - pop_ok = v7m_stack_read(cpu, &actual_sig, frameptr, mmu_idx); - - if (pop_ok && v7m_integrity_sig(env, excret) != actual_sig) { - /* Take a SecureFault on the current stack */ - env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " - "stackframe: failed exception return integrity " - "signature check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - pop_ok = pop_ok && - v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) && - v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) && - v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) && - v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) && - v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) && - v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) && - v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) && - v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx); - - frameptr += 0x28; - } - - /* Pop registers */ - pop_ok = pop_ok && - v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) && - v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) && - v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) && - v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) && - v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) && - v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) && - v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) && - v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx); - - if (!pop_ok) { - /* - * v7m_stack_read() pended a fault, so take it (as a tail - * chained exception on the same stack frame) - */ - qemu_log_mask(CPU_LOG_INT, "...derived exception on unstacking\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - /* - * Returning from an exception with a PC with bit 0 set is defined - * behaviour on v8M (bit 0 is ignored), but for v7M it was specified - * to be UNPREDICTABLE. In practice actual v7M hardware seems to ignore - * the lsbit, and there are several RTOSes out there which incorrectly - * assume the r15 in the stack frame should be a Thumb-style "lsbit - * indicates ARM/Thumb" value, so ignore the bit on v7M as well, but - * complain about the badly behaved guest. 
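A sketch of the callee-saved area popped above when EXCRET.DCRS is clear, reconstructed from the offsets of the reads; the names are illustrative only:

    struct v7m_callee_saved_frame {     /* sits below the basic frame at frameptr      */
        uint32_t integrity_sig;         /* +0x00, checked against v7m_integrity_sig()  */
        uint32_t reserved;              /* +0x04, not read here                        */
        uint32_t r4_r11[8];             /* +0x08 .. +0x24                              */
    };                                  /* 0x28 bytes, hence frameptr += 0x28          */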
- */ - if (env->regs[15] & 1) { - env->regs[15] &= ~1U; - if (!arm_feature(env, ARM_FEATURE_V8)) { - qemu_log_mask(LOG_GUEST_ERROR, - "M profile return from interrupt with misaligned " - "PC is UNPREDICTABLE on v7M\n"); - } - } - - if (arm_feature(env, ARM_FEATURE_V8)) { - /* - * For v8M we have to check whether the xPSR exception field - * matches the EXCRET value for return to handler/thread - * before we commit to changing the SP and xPSR. - */ - bool will_be_handler = (xpsr & XPSR_EXCP) != 0; - if (return_to_handler != will_be_handler) { - /* - * Take an INVPC UsageFault on the current stack. - * By this point we will have switched to the security state - * for the background state, so this UsageFault will target - * that state. - */ - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " - "stackframe: failed exception return integrity " - "check\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - } - - if (!ftype) { - /* FP present and we need to handle it */ - if (!return_to_secure && - (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK)) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - qemu_log_mask(CPU_LOG_INT, - "...taking SecureFault on existing stackframe: " - "Secure LSPACT set but exception return is " - "not to secure state\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - restore_s16_s31 = return_to_secure && - (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); - - if (env->v7m.fpccr[return_to_secure] & R_V7M_FPCCR_LSPACT_MASK) { - /* State in FPU is still valid, just clear LSPACT */ - env->v7m.fpccr[return_to_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; - } else { - int i; - uint32_t fpscr; - bool cpacr_pass, nsacr_pass; - - cpacr_pass = v7m_cpacr_pass(env, return_to_secure, - return_to_priv); - nsacr_pass = return_to_secure || - extract32(env->v7m.nsacr, 10, 1); - - if (!cpacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - return_to_secure); - env->v7m.cfsr[return_to_secure] |= R_V7M_CFSR_NOCP_MASK; - qemu_log_mask(CPU_LOG_INT, - "...taking UsageFault on existing " - "stackframe: CPACR.CP10 prevents unstacking " - "FP regs\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } else if (!nsacr_pass) { - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_INVPC_MASK; - qemu_log_mask(CPU_LOG_INT, - "...taking Secure UsageFault on existing " - "stackframe: NSACR.CP10 prevents unstacking " - "FP regs\n"); - v7m_exception_taken(cpu, excret, true, false); - return; - } - - for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { - uint32_t slo, shi; - uint64_t dn; - uint32_t faddr = frameptr + 0x20 + 4 * i; - - if (i >= 16) { - faddr += 8; /* Skip the slot for the FPSCR and VPR */ - } - - pop_ok = pop_ok && - v7m_stack_read(cpu, &slo, faddr, mmu_idx) && - v7m_stack_read(cpu, &shi, faddr + 4, mmu_idx); - - if (!pop_ok) { - break; - } - - dn = (uint64_t)shi << 32 | slo; - *aa32_vfp_dreg(env, i / 2) = dn; - } - pop_ok = pop_ok && - v7m_stack_read(cpu, &fpscr, frameptr + 0x60, mmu_idx); - if (pop_ok) { - vfp_set_fpscr(env, fpscr); - } - if (cpu_isar_feature(aa32_mve, cpu)) { - pop_ok = pop_ok && - v7m_stack_read(cpu, &env->v7m.vpr, - frameptr + 0x64, mmu_idx); - } - if (!pop_ok) { - /* - * These regs are 0 if security extension present; - * otherwise merely UNKNOWN. 
We zero always. - */ - for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { - *aa32_vfp_dreg(env, i / 2) = 0; - } - vfp_set_fpscr(env, 0); - if (cpu_isar_feature(aa32_mve, cpu)) { - env->v7m.vpr = 0; - } - } - } - } - env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], - V7M_CONTROL, FPCA, !ftype); - - /* Commit to consuming the stack frame */ - frameptr += 0x20; - if (!ftype) { - frameptr += 0x48; - if (restore_s16_s31) { - frameptr += 0x40; - } - } - /* - * Undo stack alignment (the SPREALIGN bit indicates that the original - * pre-exception SP was not 8-aligned and we added a padding word to - * align it, so we undo this by ORing in the bit that increases it - * from the current 8-aligned value to the 8-unaligned value. (Adding 4 - * would work too but a logical OR is how the pseudocode specifies it.) - */ - if (xpsr & XPSR_SPREALIGN) { - frameptr |= 4; - } - *frame_sp_p = frameptr; - } - - xpsr_mask = ~(XPSR_SPREALIGN | XPSR_SFPA); - if (!arm_feature(env, ARM_FEATURE_THUMB_DSP)) { - xpsr_mask &= ~XPSR_GE; - } - /* This xpsr_write() will invalidate frame_sp_p as it may switch stack */ - xpsr_write(env, xpsr, xpsr_mask); - - if (env->v7m.secure) { - bool sfpa = xpsr & XPSR_SFPA; - - env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], - V7M_CONTROL, SFPA, sfpa); - } - - /* - * The restored xPSR exception field will be zero if we're - * resuming in Thread mode. If that doesn't match what the - * exception return excret specified then this is a UsageFault. - * v7M requires we make this check here; v8M did it earlier. - */ - if (return_to_handler != arm_v7m_is_handler_mode(env)) { - /* - * Take an INVPC UsageFault by pushing the stack again; - * we know we're v7M so this is never a Secure UsageFault. - */ - bool ignore_stackfaults; - - assert(!arm_feature(env, ARM_FEATURE_V8)); - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - ignore_stackfaults = v7m_push_stack(cpu); - qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: " - "failed exception return integrity check\n"); - v7m_exception_taken(cpu, excret, false, ignore_stackfaults); - return; - } - - /* Otherwise, we have a successful exception exit. */ - arm_clear_exclusive(env); - arm_rebuild_hflags(env); - qemu_log_mask(CPU_LOG_INT, "...successful exception return\n"); -} - -static bool do_v7m_function_return(ARMCPU *cpu) -{ - /* - * v8M security extensions magic function return. - * We may either: - * (1) throw an exception (longjump) - * (2) return true if we successfully handled the function return - * (3) return false if we failed a consistency check and have - * pended a UsageFault that needs to be taken now - * - * At this point the magic return value is split between env->regs[15] - * and env->thumb. We don't bother to reconstitute it because we don't - * need it (all values are handled the same way). - */ - CPUARMState *env = &cpu->env; - uint32_t newpc, newpsr, newpsr_exc; - - qemu_log_mask(CPU_LOG_INT, "...really v7M secure function return\n"); - - { - bool threadmode, spsel; - MemOpIdx oi; - ARMMMUIdx mmu_idx; - uint32_t *frame_sp_p; - uint32_t frameptr; - - /* Pull the return address and IPSR from the Secure stack */ - threadmode = !arm_v7m_is_handler_mode(env); - spsel = env->v7m.control[M_REG_S] & R_V7M_CONTROL_SPSEL_MASK; - - frame_sp_p = get_v7m_sp_ptr(env, true, threadmode, spsel); - frameptr = *frame_sp_p; - - /* - * These loads may throw an exception (for MPU faults). 
We want to - * do them as secure, so work out what MMU index that is. - */ - mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); - oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx)); - newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0); - newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0); - - /* Consistency checks on new IPSR */ - newpsr_exc = newpsr & XPSR_EXCP; - if (!((env->v7m.exception == 0 && newpsr_exc == 0) || - (env->v7m.exception == 1 && newpsr_exc != 0))) { - /* Pend the fault and tell our caller to take it */ - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - qemu_log_mask(CPU_LOG_INT, - "...taking INVPC UsageFault: " - "IPSR consistency check failed\n"); - return false; - } - - *frame_sp_p = frameptr + 8; - } - - /* This invalidates frame_sp_p */ - switch_v7m_security_state(env, true); - env->v7m.exception = newpsr_exc; - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - if (newpsr & XPSR_SFPA) { - env->v7m.control[M_REG_S] |= R_V7M_CONTROL_SFPA_MASK; - } - xpsr_write(env, 0, XPSR_IT); - env->thumb = newpc & 1; - env->regs[15] = newpc & ~1; - arm_rebuild_hflags(env); - - qemu_log_mask(CPU_LOG_INT, "...function return successful\n"); - return true; -} - -static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, bool secure, - uint32_t addr, uint16_t *insn) -{ - /* - * Load a 16-bit portion of a v7M instruction, returning true on success, - * or false on failure (in which case we will have pended the appropriate - * exception). - * We need to do the instruction fetch's MPU and SAU checks - * like this because there is no MMU index that would allow - * doing the load with a single function call. Instead we must - * first check that the security attributes permit the load - * and that they don't mismatch on the two halves of the instruction, - * and then we do the load as a secure load (ie using the security - * attributes of the address, not the CPU, as architecturally required). - */ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - V8M_SAttributes sattrs = {}; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - MemTxResult txres; - - v8m_security_lookup(env, addr, MMU_INST_FETCH, mmu_idx, secure, &sattrs); - if (!sattrs.nsc || sattrs.ns) { - /* - * This must be the second half of the insn, and it straddles a - * region boundary with the second half not being S&NSC. - */ - env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVEP\n"); - return false; - } - if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &res, &fi)) { - /* the MPU lookup failed */ - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure); - qemu_log_mask(CPU_LOG_INT, "...really MemManage with CFSR.IACCVIOL\n"); - return false; - } - *insn = address_space_lduw_le(arm_addressspace(cs, res.f.attrs), - res.f.phys_addr, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); - qemu_log_mask(CPU_LOG_INT, "...really BusFault with CFSR.IBUSERR\n"); - return false; - } - return true; -} - -static bool v7m_read_sg_stack_word(ARMCPU *cpu, ARMMMUIdx mmu_idx, - uint32_t addr, uint32_t *spdata) -{ - /* - * Read a word of data from the stack for the SG instruction, - * writing the value into *spdata. 
If the load succeeds, return - * true; otherwise pend an appropriate exception and return false. - * (We can't use data load helpers here that throw an exception - * because of the context we're called in, which is halfway through - * arm_v7m_cpu_do_interrupt().) - */ - CPUState *cs = CPU(cpu); - CPUARMState *env = &cpu->env; - MemTxResult txres; - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - uint32_t value; - - if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { - /* MPU/SAU lookup failed */ - if (fi.type == ARMFault_QEMU_SFault) { - qemu_log_mask(CPU_LOG_INT, - "...SecureFault during stack word read\n"); - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; - env->v7m.sfar = addr; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - } else { - qemu_log_mask(CPU_LOG_INT, - "...MemManageFault during stack word read\n"); - env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_DACCVIOL_MASK | - R_V7M_CFSR_MMARVALID_MASK; - env->v7m.mmfar[M_REG_S] = addr; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, false); - } - return false; - } - value = address_space_ldl(arm_addressspace(cs, res.f.attrs), - res.f.phys_addr, res.f.attrs, &txres); - if (txres != MEMTX_OK) { - /* BusFault trying to read the data */ - qemu_log_mask(CPU_LOG_INT, - "...BusFault during stack word read\n"); - env->v7m.cfsr[M_REG_NS] |= - (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); - env->v7m.bfar = addr; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); - return false; - } - - *spdata = value; - return true; -} - -static bool v7m_handle_execute_nsc(ARMCPU *cpu) -{ - /* - * Check whether this attempt to execute code in a Secure & NS-Callable - * memory region is for an SG instruction; if so, then emulate the - * effect of the SG instruction and return true. Otherwise pend - * the correct kind of exception and return false. - */ - CPUARMState *env = &cpu->env; - ARMMMUIdx mmu_idx; - uint16_t insn; - - /* - * We should never get here unless get_phys_addr_pmsav8() caused - * an exception for NS executing in S&NSC memory. - */ - assert(!env->v7m.secure); - assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); - - /* We want to do the MPU lookup as secure; work out what mmu_idx that is */ - mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); - - if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15], &insn)) { - return false; - } - - if (!env->thumb) { - goto gen_invep; - } - - if (insn != 0xe97f) { - /* - * Not an SG instruction first half (we choose the IMPDEF - * early-SG-check option). - */ - goto gen_invep; - } - - if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15] + 2, &insn)) { - return false; - } - - if (insn != 0xe97f) { - /* - * Not an SG instruction second half (yes, both halves of the SG - * insn have the same hex value) - */ - goto gen_invep; - } - - /* - * OK, we have confirmed that we really have an SG instruction. - * We know we're NS in S memory so don't need to repeat those checks. - */ - qemu_log_mask(CPU_LOG_INT, "...really an SG instruction at 0x%08" PRIx32 - ", executing it\n", env->regs[15]); - - if (cpu_isar_feature(aa32_m_sec_state, cpu) && - !arm_v7m_is_handler_mode(env)) { - /* - * v8.1M exception stack frame integrity check. Note that we - * must perform the memory access even if CCR_S.TRD is zero - * and we aren't going to check what the data loaded is. - */ - uint32_t spdata, sp; - - /* - * We know we are currently NS, so the S stack pointers must be - * in other_ss_{psp,msp}, not in regs[13]/other_sp. 
- */ - sp = v7m_using_psp(env) ? env->v7m.other_ss_psp : env->v7m.other_ss_msp; - if (!v7m_read_sg_stack_word(cpu, mmu_idx, sp, &spdata)) { - /* Stack access failed and an exception has been pended */ - return false; - } - - if (env->v7m.ccr[M_REG_S] & R_V7M_CCR_TRD_MASK) { - if (((spdata & ~1) == 0xfefa125a) || - !(env->v7m.control[M_REG_S] & 1)) { - goto gen_invep; - } - } - } - - env->regs[14] &= ~1; - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - switch_v7m_security_state(env, true); - xpsr_write(env, 0, XPSR_IT); - env->regs[15] += 4; - arm_rebuild_hflags(env); - return true; - -gen_invep: - env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVEP\n"); - return false; -} - -void arm_v7m_cpu_do_interrupt(CPUState *cs) -{ - ARMCPU *cpu = ARM_CPU(cs); - CPUARMState *env = &cpu->env; - uint32_t lr; - bool ignore_stackfaults; - - arm_log_exception(cs); - - /* - * For exceptions we just mark as pending on the NVIC, and let that - * handle it. - */ - switch (cs->exception_index) { - case EXCP_UDEF: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNDEFINSTR_MASK; - break; - case EXCP_NOCP: - { - /* - * NOCP might be directed to something other than the current - * security state if this fault is because of NSACR; we indicate - * the target security state using exception.target_el. - */ - int target_secstate; - - if (env->exception.target_el == 3) { - target_secstate = M_REG_S; - } else { - target_secstate = env->v7m.secure; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, target_secstate); - env->v7m.cfsr[target_secstate] |= R_V7M_CFSR_NOCP_MASK; - break; - } - case EXCP_INVSTATE: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVSTATE_MASK; - break; - case EXCP_STKOF: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; - break; - case EXCP_LSERR: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; - break; - case EXCP_UNALIGNED: - /* Unaligned faults reported by M-profile aware code */ - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; - break; - case EXCP_DIVBYZERO: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK; - break; - case EXCP_SWI: - /* The PC already points to the next instruction. */ - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure); - break; - case EXCP_PREFETCH_ABORT: - case EXCP_DATA_ABORT: - /* - * Note that for M profile we don't have a guest facing FSR, but - * the env->exception.fsr will be populated by the code that - * raises the fault, in the A profile short-descriptor format. - * - * Log the exception.vaddress now regardless of subtype, because - * logging below only logs it when it goes into a guest visible - * register. - */ - qemu_log_mask(CPU_LOG_INT, "...at fault address 0x%x\n", - (uint32_t)env->exception.vaddress); - switch (env->exception.fsr & 0xf) { - case M_FAKE_FSR_NSC_EXEC: - /* - * Exception generated when we try to execute code at an address - * which is marked as Secure & Non-Secure Callable and the CPU - * is in the Non-Secure state. 
The only instruction which can - * be executed like this is SG (and that only if both halves of - * the SG instruction have the same security attributes.) - * Everything else must generate an INVEP SecureFault, so we - * emulate the SG instruction here. - */ - if (v7m_handle_execute_nsc(cpu)) { - return; - } - break; - case M_FAKE_FSR_SFAULT: - /* - * Various flavours of SecureFault for attempts to execute or - * access data in the wrong security state. - */ - switch (cs->exception_index) { - case EXCP_PREFETCH_ABORT: - if (env->v7m.secure) { - env->v7m.sfsr |= R_V7M_SFSR_INVTRAN_MASK; - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVTRAN\n"); - } else { - env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.INVEP\n"); - } - break; - case EXCP_DATA_ABORT: - /* This must be an NS access to S memory */ - env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; - qemu_log_mask(CPU_LOG_INT, - "...really SecureFault with SFSR.AUVIOL\n"); - break; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - break; - case 0x8: /* External Abort */ - switch (cs->exception_index) { - case EXCP_PREFETCH_ABORT: - env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; - qemu_log_mask(CPU_LOG_INT, "...with CFSR.IBUSERR\n"); - break; - case EXCP_DATA_ABORT: - env->v7m.cfsr[M_REG_NS] |= - (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); - env->v7m.bfar = env->exception.vaddress; - qemu_log_mask(CPU_LOG_INT, - "...with CFSR.PRECISERR and BFAR 0x%x\n", - env->v7m.bfar); - break; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); - break; - case 0x1: /* Alignment fault reported by generic code */ - qemu_log_mask(CPU_LOG_INT, - "...really UsageFault with UFSR.UNALIGNED\n"); - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, - env->v7m.secure); - break; - default: - /* - * All other FSR values are either MPU faults or "can't happen - * for M profile" cases. - */ - switch (cs->exception_index) { - case EXCP_PREFETCH_ABORT: - env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; - qemu_log_mask(CPU_LOG_INT, "...with CFSR.IACCVIOL\n"); - break; - case EXCP_DATA_ABORT: - env->v7m.cfsr[env->v7m.secure] |= - (R_V7M_CFSR_DACCVIOL_MASK | R_V7M_CFSR_MMARVALID_MASK); - env->v7m.mmfar[env->v7m.secure] = env->exception.vaddress; - qemu_log_mask(CPU_LOG_INT, - "...with CFSR.DACCVIOL and MMFAR 0x%x\n", - env->v7m.mmfar[env->v7m.secure]); - break; - } - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, - env->v7m.secure); - break; - } - break; - case EXCP_SEMIHOST: - qemu_log_mask(CPU_LOG_INT, - "...handling as semihosting call 0x%x\n", - env->regs[0]); -#ifdef CONFIG_TCG - do_common_semihosting(cs); -#else - g_assert_not_reached(); -#endif - env->regs[15] += env->thumb ? 2 : 4; - return; - case EXCP_BKPT: - armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_DEBUG, false); - break; - case EXCP_IRQ: - break; - case EXCP_EXCEPTION_EXIT: - if (env->regs[15] < EXC_RETURN_MIN_MAGIC) { - /* Must be v8M security extension function return */ - assert(env->regs[15] >= FNC_RETURN_MIN_MAGIC); - assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); - if (do_v7m_function_return(cpu)) { - return; - } - } else { - do_v7m_exception_exit(cpu); - return; - } - break; - case EXCP_LAZYFP: - /* - * We already pended the specific exception in the NVIC in the - * v7m_preserve_fp_state() helper function. 
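For orientation, the EXCP_EXCEPTION_EXIT case above distinguishes the two kinds of magic PC with a simple range test; a minimal restatement, with the constants assumed to be 0xfefffffe (FNC_RETURN_MIN_MAGIC) and 0xff000000 (EXC_RETURN_MIN_MAGIC):

    static bool pc_is_function_return(uint32_t pc)
    {
        /* FNC_RETURN magic values sit immediately below the EXC_RETURN range */
        return pc >= FNC_RETURN_MIN_MAGIC && pc < EXC_RETURN_MIN_MAGIC;
    }

Anything at or above EXC_RETURN_MIN_MAGIC is handed to do_v7m_exception_exit() instead.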
- */ - break; - default: - cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index); - return; /* Never happens. Keep compiler happy. */ - } - - if (arm_feature(env, ARM_FEATURE_V8)) { - lr = R_V7M_EXCRET_RES1_MASK | - R_V7M_EXCRET_DCRS_MASK; - /* - * The S bit indicates whether we should return to Secure - * or NonSecure (ie our current state). - * The ES bit indicates whether we're taking this exception - * to Secure or NonSecure (ie our target state). We set it - * later, in v7m_exception_taken(). - * The SPSEL bit is also set in v7m_exception_taken() for v8M. - * This corresponds to the ARM ARM pseudocode for v8M setting - * some LR bits in PushStack() and some in ExceptionTaken(); - * the distinction matters for the tailchain cases where we - * can take an exception without pushing the stack. - */ - if (env->v7m.secure) { - lr |= R_V7M_EXCRET_S_MASK; - } - } else { - lr = R_V7M_EXCRET_RES1_MASK | - R_V7M_EXCRET_S_MASK | - R_V7M_EXCRET_DCRS_MASK | - R_V7M_EXCRET_ES_MASK; - if (env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK) { - lr |= R_V7M_EXCRET_SPSEL_MASK; - } - } - if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { - lr |= R_V7M_EXCRET_FTYPE_MASK; - } - if (!arm_v7m_is_handler_mode(env)) { - lr |= R_V7M_EXCRET_MODE_MASK; - } - - ignore_stackfaults = v7m_push_stack(cpu); - v7m_exception_taken(cpu, lr, false, ignore_stackfaults); -} - -uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) -{ - unsigned el = arm_current_el(env); - - /* First handle registers which unprivileged can read */ - switch (reg) { - case 0 ... 7: /* xPSR sub-fields */ - return v7m_mrs_xpsr(env, reg, el); - case 20: /* CONTROL */ - return v7m_mrs_control(env, env->v7m.secure); - case 0x94: /* CONTROL_NS */ - /* - * We have to handle this here because unprivileged Secure code - * can read the NS CONTROL register. - */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.control[M_REG_NS] | - (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK); - } - - if (el == 0) { - return 0; /* unprivileged reads others as zero */ - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - switch (reg) { - case 0x88: /* MSP_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.other_ss_msp; - case 0x89: /* PSP_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.other_ss_psp; - case 0x8a: /* MSPLIM_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.msplim[M_REG_NS]; - case 0x8b: /* PSPLIM_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.psplim[M_REG_NS]; - case 0x90: /* PRIMASK_NS */ - if (!env->v7m.secure) { - return 0; - } - return env->v7m.primask[M_REG_NS]; - case 0x91: /* BASEPRI_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return 0; - } - return env->v7m.basepri[M_REG_NS]; - case 0x93: /* FAULTMASK_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return 0; - } - return env->v7m.faultmask[M_REG_NS]; - case 0x98: /* SP_NS */ - { - /* - * This gives the non-secure SP selected based on whether we're - * currently in handler mode or not, using the NS CONTROL.SPSEL. - */ - bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; - - if (!env->v7m.secure) { - return 0; - } - if (!arm_v7m_is_handler_mode(env) && spsel) { - return env->v7m.other_ss_psp; - } else { - return env->v7m.other_ss_msp; - } - } - default: - break; - } - } - - switch (reg) { - case 8: /* MSP */ - return v7m_using_psp(env) ? 
env->v7m.other_sp : env->regs[13]; - case 9: /* PSP */ - return v7m_using_psp(env) ? env->regs[13] : env->v7m.other_sp; - case 10: /* MSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - return env->v7m.msplim[env->v7m.secure]; - case 11: /* PSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - return env->v7m.psplim[env->v7m.secure]; - case 16: /* PRIMASK */ - return env->v7m.primask[env->v7m.secure]; - case 17: /* BASEPRI */ - case 18: /* BASEPRI_MAX */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - return env->v7m.basepri[env->v7m.secure]; - case 19: /* FAULTMASK */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - return env->v7m.faultmask[env->v7m.secure]; - default: - bad_reg: - qemu_log_mask(LOG_GUEST_ERROR, "Attempt to read unknown special" - " register %d\n", reg); - return 0; - } -} - -void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) -{ - /* - * We're passed bits [11..0] of the instruction; extract - * SYSm and the mask bits. - * Invalid combinations of SYSm and mask are UNPREDICTABLE; - * we choose to treat them as if the mask bits were valid. - * NB that the pseudocode 'mask' variable is bits [11..10], - * whereas ours is [11..8]. - */ - uint32_t mask = extract32(maskreg, 8, 4); - uint32_t reg = extract32(maskreg, 0, 8); - int cur_el = arm_current_el(env); - - if (cur_el == 0 && reg > 7 && reg != 20) { - /* - * only xPSR sub-fields and CONTROL.SFPA may be written by - * unprivileged code - */ - return; - } - - if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { - switch (reg) { - case 0x88: /* MSP_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.other_ss_msp = val & ~3; - return; - case 0x89: /* PSP_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.other_ss_psp = val & ~3; - return; - case 0x8a: /* MSPLIM_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.msplim[M_REG_NS] = val & ~7; - return; - case 0x8b: /* PSPLIM_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.psplim[M_REG_NS] = val & ~7; - return; - case 0x90: /* PRIMASK_NS */ - if (!env->v7m.secure) { - return; - } - env->v7m.primask[M_REG_NS] = val & 1; - return; - case 0x91: /* BASEPRI_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return; - } - env->v7m.basepri[M_REG_NS] = val & 0xff; - return; - case 0x93: /* FAULTMASK_NS */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - if (!env->v7m.secure) { - return; - } - env->v7m.faultmask[M_REG_NS] = val & 1; - return; - case 0x94: /* CONTROL_NS */ - if (!env->v7m.secure) { - return; - } - write_v7m_control_spsel_for_secstate(env, - val & R_V7M_CONTROL_SPSEL_MASK, - M_REG_NS); - if (arm_feature(env, ARM_FEATURE_M_MAIN)) { - env->v7m.control[M_REG_NS] &= ~R_V7M_CONTROL_NPRIV_MASK; - env->v7m.control[M_REG_NS] |= val & R_V7M_CONTROL_NPRIV_MASK; - } - /* - * SFPA is RAZ/WI from NS. FPCA is RO if NSACR.CP10 == 0, - * RES0 if the FPU is not present, and is stored in the S bank - */ - if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) && - extract32(env->v7m.nsacr, 10, 1)) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; - env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; - } - return; - case 0x98: /* SP_NS */ - { - /* - * This gives the non-secure SP selected based on whether we're - * currently in handler mode or not, using the NS CONTROL.SPSEL. 
- */ - bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; - bool is_psp = !arm_v7m_is_handler_mode(env) && spsel; - uint32_t limit; - - if (!env->v7m.secure) { - return; - } - - limit = is_psp ? env->v7m.psplim[false] : env->v7m.msplim[false]; - - val &= ~0x3; - - if (val < limit) { - raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); - } - - if (is_psp) { - env->v7m.other_ss_psp = val; - } else { - env->v7m.other_ss_msp = val; - } - return; - } - default: - break; - } - } - - switch (reg) { - case 0 ... 7: /* xPSR sub-fields */ - v7m_msr_xpsr(env, mask, reg, val); - break; - case 8: /* MSP */ - if (v7m_using_psp(env)) { - env->v7m.other_sp = val & ~3; - } else { - env->regs[13] = val & ~3; - } - break; - case 9: /* PSP */ - if (v7m_using_psp(env)) { - env->regs[13] = val & ~3; - } else { - env->v7m.other_sp = val & ~3; - } - break; - case 10: /* MSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - env->v7m.msplim[env->v7m.secure] = val & ~7; - break; - case 11: /* PSPLIM */ - if (!arm_feature(env, ARM_FEATURE_V8)) { - goto bad_reg; - } - env->v7m.psplim[env->v7m.secure] = val & ~7; - break; - case 16: /* PRIMASK */ - env->v7m.primask[env->v7m.secure] = val & 1; - break; - case 17: /* BASEPRI */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - env->v7m.basepri[env->v7m.secure] = val & 0xff; - break; - case 18: /* BASEPRI_MAX */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - val &= 0xff; - if (val != 0 && (val < env->v7m.basepri[env->v7m.secure] - || env->v7m.basepri[env->v7m.secure] == 0)) { - env->v7m.basepri[env->v7m.secure] = val; - } - break; - case 19: /* FAULTMASK */ - if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { - goto bad_reg; - } - env->v7m.faultmask[env->v7m.secure] = val & 1; - break; - case 20: /* CONTROL */ - /* - * Writing to the SPSEL bit only has an effect if we are in - * thread mode; other bits can be updated by any privileged code. - * write_v7m_control_spsel() deals with updating the SPSEL bit in - * env->v7m.control, so we only need update the others. - * For v7M, we must just ignore explicit writes to SPSEL in handler - * mode; for v8M the write is permitted but will have no effect. - * All these bits are writes-ignored from non-privileged code, - * except for SFPA. - */ - if (cur_el > 0 && (arm_feature(env, ARM_FEATURE_V8) || - !arm_v7m_is_handler_mode(env))) { - write_v7m_control_spsel(env, (val & R_V7M_CONTROL_SPSEL_MASK) != 0); - } - if (cur_el > 0 && arm_feature(env, ARM_FEATURE_M_MAIN)) { - env->v7m.control[env->v7m.secure] &= ~R_V7M_CONTROL_NPRIV_MASK; - env->v7m.control[env->v7m.secure] |= val & R_V7M_CONTROL_NPRIV_MASK; - } - if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) { - /* - * SFPA is RAZ/WI from NS or if no FPU. - * FPCA is RO if NSACR.CP10 == 0, RES0 if the FPU is not present. - * Both are stored in the S bank. - */ - if (env->v7m.secure) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; - env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_SFPA_MASK; - } - if (cur_el > 0 && - (env->v7m.secure || !arm_feature(env, ARM_FEATURE_M_SECURITY) || - extract32(env->v7m.nsacr, 10, 1))) { - env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; - env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; - } - } - break; - default: - bad_reg: - qemu_log_mask(LOG_GUEST_ERROR, "Attempt to write unknown special" - " register %d\n", reg); - return; - } -} - -uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) -{ - /* Implement the TT instruction. 
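The four encodings TT, TTT, TTA and TTAT differ only in the two op bits decoded below: bit 0 of op forces an unprivileged lookup (TTT, TTAT) and bit 1 inspects the other security state (TTA, TTAT).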
op is bits [7:6] of the insn. */ - bool forceunpriv = op & 1; - bool alt = op & 2; - V8M_SAttributes sattrs = {}; - uint32_t tt_resp; - bool r, rw, nsr, nsrw, mrvalid; - ARMMMUIdx mmu_idx; - uint32_t mregion; - bool targetpriv; - bool targetsec = env->v7m.secure; - - /* - * Work out what the security state and privilege level we're - * interested in is... - */ - if (alt) { - targetsec = !targetsec; - } - - if (forceunpriv) { - targetpriv = false; - } else { - targetpriv = arm_v7m_is_handler_mode(env) || - !(env->v7m.control[targetsec] & R_V7M_CONTROL_NPRIV_MASK); - } - - /* ...and then figure out which MMU index this is */ - mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targetsec, targetpriv); - - /* - * We know that the MPU and SAU don't care about the access type - * for our purposes beyond that we don't want to claim to be - * an insn fetch, so we arbitrarily call this a read. - */ - - /* - * MPU region info only available for privileged or if - * inspecting the other MPU state. - */ - if (arm_current_el(env) != 0 || alt) { - GetPhysAddrResult res = {}; - ARMMMUFaultInfo fi = {}; - - /* We can ignore the return value as prot is always set */ - pmsav8_mpu_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, targetsec, - &res, &fi, &mregion); - if (mregion == -1) { - mrvalid = false; - mregion = 0; - } else { - mrvalid = true; - } - r = res.f.prot & PAGE_READ; - rw = res.f.prot & PAGE_WRITE; - } else { - r = false; - rw = false; - mrvalid = false; - mregion = 0; - } - - if (env->v7m.secure) { - v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, - targetsec, &sattrs); - nsr = sattrs.ns && r; - nsrw = sattrs.ns && rw; - } else { - sattrs.ns = true; - nsr = false; - nsrw = false; - } - - tt_resp = (sattrs.iregion << 24) | - (sattrs.irvalid << 23) | - ((!sattrs.ns) << 22) | - (nsrw << 21) | - (nsr << 20) | - (rw << 19) | - (r << 18) | - (sattrs.srvalid << 17) | - (mrvalid << 16) | - (sattrs.sregion << 8) | - mregion; - - return tt_resp; -} - -#endif /* !CONFIG_USER_ONLY */ diff --git a/target/arm/meson.build b/target/arm/meson.build index b2904b676b..3e2f403005 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -1,17 +1,9 @@ arm_ss = ss.source_set() arm_ss.add(files( 'cpu.c', - 'crypto_helper.c', 'debug_helper.c', 'gdbstub.c', 'helper.c', - 'iwmmxt_helper.c', - 'm_helper.c', - 'mve_helper.c', - 'neon_helper.c', - 'op_helper.c', - 'tlb_helper.c', - 'vec_helper.c', 'vfp_helper.c', 'cpu_tcg.c', )) @@ -22,11 +14,6 @@ arm_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c', 'kvm64.c'), if_false: fil arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'cpu64.c', 'gdbstub64.c', - 'helper-a64.c', - 'mte_helper.c', - 'pauth_helper.c', - 'sve_helper.c', - 'sme_helper.c', )) arm_softmmu_ss = ss.source_set() @@ -43,6 +30,8 @@ subdir('hvf') if 'CONFIG_TCG' in config_all subdir('tcg') +else + arm_ss.add(files('tcg-stubs.c')) endif target_arch += {'arm': arm_ss} diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c deleted file mode 100644 index 98bcf59c22..0000000000 --- a/target/arm/mte_helper.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * ARM v8.5-MemTag Operations - * - * Copyright (c) 2020 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "qemu/log.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/ram_addr.h" -#include "exec/cpu_ldst.h" -#include "exec/helper-proto.h" -#include "qapi/error.h" -#include "qemu/guest-random.h" - - -static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude) -{ - if (exclude == 0xffff) { - return 0; - } - if (offset == 0) { - while (exclude & (1 << tag)) { - tag = (tag + 1) & 15; - } - } else { - do { - do { - tag = (tag + 1) & 15; - } while (exclude & (1 << tag)); - } while (--offset > 0); - } - return tag; -} - -/** - * allocation_tag_mem: - * @env: the cpu environment - * @ptr_mmu_idx: the addressing regime to use for the virtual address - * @ptr: the virtual address for which to look up tag memory - * @ptr_access: the access to use for the virtual address - * @ptr_size: the number of bytes in the normal memory access - * @tag_access: the access to use for the tag memory - * @tag_size: the number of bytes in the tag memory access - * @ra: the return address for exception handling - * - * Our tag memory is formatted as a sequence of little-endian nibbles. - * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two - * tags, with the tag at [3:0] for the lower addr and the tag at [7:4] - * for the higher addr. - * - * Here, resolve the physical address from the virtual address, and return - * a pointer to the corresponding tag byte. Exit with exception if the - * virtual address is not accessible for @ptr_access. - * - * The @ptr_size and @tag_size values may not have an obvious relation - * due to the alignment of @ptr, and the number of tag checks required. - * - * If there is no tag storage corresponding to @ptr, return NULL. - */ -static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx, - uint64_t ptr, MMUAccessType ptr_access, - int ptr_size, MMUAccessType tag_access, - int tag_size, uintptr_t ra) -{ -#ifdef CONFIG_USER_ONLY - uint64_t clean_ptr = useronly_clean_ptr(ptr); - int flags = page_get_flags(clean_ptr); - uint8_t *tags; - uintptr_t index; - - if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE_ORG : PAGE_READ))) { - cpu_loop_exit_sigsegv(env_cpu(env), ptr, ptr_access, - !(flags & PAGE_VALID), ra); - } - - /* Require both MAP_ANON and PROT_MTE for the page. */ - if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) { - return NULL; - } - - tags = page_get_target_data(clean_ptr); - - index = extract32(ptr, LOG2_TAG_GRANULE + 1, - TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1); - return tags + index; -#else - CPUTLBEntryFull *full; - MemTxAttrs attrs; - int in_page, flags; - hwaddr ptr_paddr, tag_paddr, xlat; - MemoryRegion *mr; - ARMASIdx tag_asi; - AddressSpace *tag_as; - void *host; - - /* - * Probe the first byte of the virtual address. This raises an - * exception for inaccessible pages, and resolves the virtual address - * into the softmmu tlb. - * - * When RA == 0, this is for mte_probe. The page is expected to be - * valid. Indicate to probe_access_flags no-fault, then assert that - * we received a valid page. 
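A sketch of the nibble packing described at the top of this function, with LOG2_TAG_GRANULE defined elsewhere in the tree (4, i.e. a 16-byte granule) and a purely illustrative helper name:

    static int tag_from_tag_byte(uint8_t tag_byte, uint64_t ptr)
    {
        /* bit LOG2_TAG_GRANULE of the address selects the nibble within the byte */
        return (ptr & (1 << LOG2_TAG_GRANULE)) ? (tag_byte >> 4) : (tag_byte & 0xf);
    }

This is the same selection that load_tag1() further down performs with extract32().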
- */ - flags = probe_access_full(env, ptr, ptr_access, ptr_mmu_idx, - ra == 0, &host, &full, ra); - assert(!(flags & TLB_INVALID_MASK)); - - /* If the virtual page MemAttr != Tagged, access unchecked. */ - if (full->pte_attrs != 0xf0) { - return NULL; - } - - /* - * If not backed by host ram, there is no tag storage: access unchecked. - * This is probably a guest os bug though, so log it. - */ - if (unlikely(flags & TLB_MMIO)) { - qemu_log_mask(LOG_GUEST_ERROR, - "Page @ 0x%" PRIx64 " indicates Tagged Normal memory " - "but is not backed by host ram\n", ptr); - return NULL; - } - - /* - * Remember these values across the second lookup below, - * which may invalidate this pointer via tlb resize. - */ - ptr_paddr = full->phys_addr | (ptr & ~TARGET_PAGE_MASK); - attrs = full->attrs; - full = NULL; - - /* - * The Normal memory access can extend to the next page. E.g. a single - * 8-byte access to the last byte of a page will check only the last - * tag on the first page. - * Any page access exception has priority over tag check exception. - */ - in_page = -(ptr | TARGET_PAGE_MASK); - if (unlikely(ptr_size > in_page)) { - flags |= probe_access_full(env, ptr + in_page, ptr_access, - ptr_mmu_idx, ra == 0, &host, &full, ra); - assert(!(flags & TLB_INVALID_MASK)); - } - - /* Any debug exception has priority over a tag check exception. */ - if (unlikely(flags & TLB_WATCHPOINT)) { - int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE; - assert(ra != 0); - cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, attrs, wp, ra); - } - - /* Convert to the physical address in tag space. */ - tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1); - - /* Look up the address in tag space. */ - tag_asi = attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS; - tag_as = cpu_get_address_space(env_cpu(env), tag_asi); - mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL, - tag_access == MMU_DATA_STORE, attrs); - - /* - * Note that @mr will never be NULL. If there is nothing in the address - * space at @tag_paddr, the translation will return the unallocated memory - * region. For our purposes, the result must be ram. - */ - if (unlikely(!memory_region_is_ram(mr))) { - /* ??? Failure is a board configuration error. */ - qemu_log_mask(LOG_UNIMP, - "Tag Memory @ 0x%" HWADDR_PRIx " not found for " - "Normal Memory @ 0x%" HWADDR_PRIx "\n", - tag_paddr, ptr_paddr); - return NULL; - } - - /* - * Ensure the tag memory is dirty on write, for migration. - * Tag memory can never contain code or display memory (vga). - */ - if (tag_access == MMU_DATA_STORE) { - ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat; - cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION); - } - - return memory_region_get_ram_ptr(mr) + xlat; -#endif -} - -uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm) -{ - uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16); - int rrnd = extract32(env->cp15.gcr_el1, 16, 1); - int start = extract32(env->cp15.rgsr_el1, 0, 4); - int seed = extract32(env->cp15.rgsr_el1, 8, 16); - int offset, i, rtag; - - /* - * Our IMPDEF choice for GCR_EL1.RRND==1 is to continue to use the - * deterministic algorithm. Except that with RRND==1 the kernel is - * not required to have set RGSR_EL1.SEED != 0, which is required for - * the deterministic algorithm to function. So we force a non-zero - * SEED for that case. 
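A worked example of choose_nonexcluded_tag() above, with made-up inputs: start tag 3, offset 2, exclude 0x00f0 (tags 4..7 excluded). The first step skips tags 4..7 and lands on 8, the second step lands on 9, so the result is tag 9. With offset 0 the function returns the first non-excluded tag at or after the start tag, and an all-ones exclude mask short-circuits to tag 0.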
- */ - if (unlikely(seed == 0) && rrnd) { - do { - Error *err = NULL; - uint16_t two; - - if (qemu_guest_getrandom(&two, sizeof(two), &err) < 0) { - /* - * Failed, for unknown reasons in the crypto subsystem. - * Best we can do is log the reason and use a constant seed. - */ - qemu_log_mask(LOG_UNIMP, "IRG: Crypto failure: %s\n", - error_get_pretty(err)); - error_free(err); - two = 1; - } - seed = two; - } while (seed == 0); - } - - /* RandomTag */ - for (i = offset = 0; i < 4; ++i) { - /* NextRandomTagBit */ - int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^ - extract32(seed, 2, 1) ^ extract32(seed, 0, 1)); - seed = (top << 15) | (seed >> 1); - offset |= top << i; - } - rtag = choose_nonexcluded_tag(start, offset, exclude); - env->cp15.rgsr_el1 = rtag | (seed << 8); - - return address_with_allocation_tag(rn, rtag); -} - -uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr, - int32_t offset, uint32_t tag_offset) -{ - int start_tag = allocation_tag_from_addr(ptr); - uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16); - int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude); - - return address_with_allocation_tag(ptr + offset, rtag); -} - -static int load_tag1(uint64_t ptr, uint8_t *mem) -{ - int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; - return extract32(*mem, ofs, 4); -} - -uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - int mmu_idx = cpu_mmu_index(env, false); - uint8_t *mem; - int rtag = 0; - - /* Trap if accessing an invalid page. */ - mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1, - MMU_DATA_LOAD, 1, GETPC()); - - /* Load if page supports tags. */ - if (mem) { - rtag = load_tag1(ptr, mem); - } - - return address_with_allocation_tag(xt, rtag); -} - -static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra) -{ - if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) { - arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE, - cpu_mmu_index(env, false), ra); - g_assert_not_reached(); - } -} - -/* For use in a non-parallel context, store to the given nibble. */ -static void store_tag1(uint64_t ptr, uint8_t *mem, int tag) -{ - int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; - *mem = deposit32(*mem, ofs, 4, tag); -} - -/* For use in a parallel context, atomically store to the given nibble. */ -static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag) -{ - int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; - uint8_t old = qatomic_read(mem); - - while (1) { - uint8_t new = deposit32(old, ofs, 4, tag); - uint8_t cmp = qatomic_cmpxchg(mem, old, new); - if (likely(cmp == old)) { - return; - } - old = cmp; - } -} - -typedef void stg_store1(uint64_t, uint8_t *, int); - -static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt, - uintptr_t ra, stg_store1 store1) -{ - int mmu_idx = cpu_mmu_index(env, false); - uint8_t *mem; - - check_tag_aligned(env, ptr, ra); - - /* Trap if accessing an invalid page. */ - mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE, - MMU_DATA_STORE, 1, ra); - - /* Store if page supports tags. 
*/ - if (mem) { - store1(ptr, mem, allocation_tag_from_addr(xt)); - } -} - -void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_stg(env, ptr, xt, GETPC(), store_tag1); -} - -void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_stg(env, ptr, xt, GETPC(), store_tag1_parallel); -} - -void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - - check_tag_aligned(env, ptr, ra); - probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); -} - -static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt, - uintptr_t ra, stg_store1 store1) -{ - int mmu_idx = cpu_mmu_index(env, false); - int tag = allocation_tag_from_addr(xt); - uint8_t *mem1, *mem2; - - check_tag_aligned(env, ptr, ra); - - /* - * Trap if accessing an invalid page(s). - * This takes priority over !allocation_tag_access_enabled. - */ - if (ptr & TAG_GRANULE) { - /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */ - mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, - TAG_GRANULE, MMU_DATA_STORE, 1, ra); - mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE, - MMU_DATA_STORE, TAG_GRANULE, - MMU_DATA_STORE, 1, ra); - - /* Store if page(s) support tags. */ - if (mem1) { - store1(TAG_GRANULE, mem1, tag); - } - if (mem2) { - store1(0, mem2, tag); - } - } else { - /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */ - mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, - 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra); - if (mem1) { - tag |= tag << 4; - qatomic_set(mem1, tag); - } - } -} - -void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_st2g(env, ptr, xt, GETPC(), store_tag1); -} - -void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) -{ - do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel); -} - -void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - int in_page = -(ptr | TARGET_PAGE_MASK); - - check_tag_aligned(env, ptr, ra); - - if (likely(in_page >= 2 * TAG_GRANULE)) { - probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra); - } else { - probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); - probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra); - } -} - -#define LDGM_STGM_SIZE (4 << GMID_EL1_BS) - -uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - void *tag_mem; - - ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); - - /* Trap if accessing an invalid page. */ - tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, - LDGM_STGM_SIZE, MMU_DATA_LOAD, - LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); - - /* The tag is squashed to zero if the page does not support tags. */ - if (!tag_mem) { - return 0; - } - - QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); - /* - * We are loading 64-bits worth of tags. The ordering of elements - * within the word corresponds to a 64-bit little-endian operation. - */ - return ldq_le_p(tag_mem); -} - -void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val) -{ - int mmu_idx = cpu_mmu_index(env, false); - uintptr_t ra = GETPC(); - void *tag_mem; - - ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); - - /* Trap if accessing an invalid page. 
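The arithmetic behind the sizes used in LDGM/STGM, assuming the usual 16-byte TAG_GRANULE: the build assertion pins GMID_EL1_BS at 6, so LDGM_STGM_SIZE is 4 << 6 = 256 bytes of normal memory. That covers 256 / 16 = 16 tag granules, i.e. 16 tags packed two per byte into 256 / (2 * 16) = 8 tag bytes, which is exactly the 64 bits that ldq_le_p() and stq_le_p() move in one operation.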
*/ - tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, - LDGM_STGM_SIZE, MMU_DATA_LOAD, - LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); - - /* - * Tag store only happens if the page support tags, - * and if the OS has enabled access to the tags. - */ - if (!tag_mem) { - return; - } - - QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); - /* - * We are storing 64-bits worth of tags. The ordering of elements - * within the word corresponds to a 64-bit little-endian operation. - */ - stq_le_p(tag_mem, val); -} - -void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) -{ - uintptr_t ra = GETPC(); - int mmu_idx = cpu_mmu_index(env, false); - int log2_dcz_bytes, log2_tag_bytes; - intptr_t dcz_bytes, tag_bytes; - uint8_t *mem; - - /* - * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1, - * i.e. 32 bytes, which is an unreasonably small dcz anyway, - * to make sure that we can access one complete tag byte here. - */ - log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; - log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); - dcz_bytes = (intptr_t)1 << log2_dcz_bytes; - tag_bytes = (intptr_t)1 << log2_tag_bytes; - ptr &= -dcz_bytes; - - mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes, - MMU_DATA_STORE, tag_bytes, ra); - if (mem) { - int tag_pair = (val & 0xf) * 0x11; - memset(mem, tag_pair, tag_bytes); - } -} - -static void mte_sync_check_fail(CPUARMState *env, uint32_t desc, - uint64_t dirty_ptr, uintptr_t ra) -{ - int is_write, syn; - - env->exception.vaddress = dirty_ptr; - - is_write = FIELD_EX32(desc, MTEDESC, WRITE); - syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write, - 0x11); - raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra); - g_assert_not_reached(); -} - -static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr, - uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el) -{ - int select; - - if (regime_has_2_ranges(arm_mmu_idx)) { - select = extract64(dirty_ptr, 55, 1); - } else { - select = 0; - } - env->cp15.tfsr_el[el] |= 1 << select; -#ifdef CONFIG_USER_ONLY - /* - * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, - * which then sends a SIGSEGV when the thread is next scheduled. - * This cpu will return to the main loop at the end of the TB, - * which is rather sooner than "normal". But the alternative - * is waiting until the next syscall. - */ - qemu_cpu_kick(env_cpu(env)); -#endif -} - -/* Record a tag check failure. */ -static void mte_check_fail(CPUARMState *env, uint32_t desc, - uint64_t dirty_ptr, uintptr_t ra) -{ - int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); - ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); - int el, reg_el, tcf; - uint64_t sctlr; - - reg_el = regime_el(env, arm_mmu_idx); - sctlr = env->cp15.sctlr_el[reg_el]; - - switch (arm_mmu_idx) { - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E20_0: - el = 0; - tcf = extract64(sctlr, 38, 2); - break; - default: - el = reg_el; - tcf = extract64(sctlr, 40, 2); - } - - switch (tcf) { - case 1: - /* Tag check fail causes a synchronous exception. */ - mte_sync_check_fail(env, desc, dirty_ptr, ra); - break; - - case 0: - /* - * Tag check fail does not affect the PE. - * We eliminate this case by not setting MTE_ACTIVE - * in tb_flags, so that we never make this runtime call. - */ - g_assert_not_reached(); - - case 2: - /* Tag check fail causes asynchronous flag set. 
*/ - mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); - break; - - case 3: - /* - * Tag check fail causes asynchronous flag set for stores, or - * a synchronous exception for loads. - */ - if (FIELD_EX32(desc, MTEDESC, WRITE)) { - mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); - } else { - mte_sync_check_fail(env, desc, dirty_ptr, ra); - } - break; - } -} - -/** - * checkN: - * @tag: tag memory to test - * @odd: true to begin testing at tags at odd nibble - * @cmp: the tag to compare against - * @count: number of tags to test - * - * Return the number of successful tests. - * Thus a return value < @count indicates a failure. - * - * A note about sizes: count is expected to be small. - * - * The most common use will be LDP/STP of two integer registers, - * which means 16 bytes of memory touching at most 2 tags, but - * often the access is aligned and thus just 1 tag. - * - * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory, - * touching at most 5 tags. SVE LDR/STR (vector) with the default - * vector length is also 64 bytes; the maximum architectural length - * is 256 bytes touching at most 9 tags. - * - * The loop below uses 7 logical operations and 1 memory operation - * per tag pair. An implementation that loads an aligned word and - * uses masking to ignore adjacent tags requires 18 logical operations - * and thus does not begin to pay off until 6 tags. - * Which, according to the survey above, is unlikely to be common. - */ -static int checkN(uint8_t *mem, int odd, int cmp, int count) -{ - int n = 0, diff; - - /* Replicate the test tag and compare. */ - cmp *= 0x11; - diff = *mem++ ^ cmp; - - if (odd) { - goto start_odd; - } - - while (1) { - /* Test even tag. */ - if (unlikely((diff) & 0x0f)) { - break; - } - if (++n == count) { - break; - } - - start_odd: - /* Test odd tag. */ - if (unlikely((diff) & 0xf0)) { - break; - } - if (++n == count) { - break; - } - - diff = *mem++ ^ cmp; - } - return n; -} - -/** - * mte_probe_int() - helper for mte_probe and mte_check - * @env: CPU environment - * @desc: MTEDESC descriptor - * @ptr: virtual address of the base of the access - * @fault: return virtual address of the first check failure - * - * Internal routine for both mte_probe and mte_check. - * Return zero on failure, filling in *fault. - * Return negative on trivial success for tbi disabled. - * Return positive on success with tbi enabled. - */ -static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr, - uintptr_t ra, uint64_t *fault) -{ - int mmu_idx, ptr_tag, bit55; - uint64_t ptr_last, prev_page, next_page; - uint64_t tag_first, tag_last; - uint64_t tag_byte_first, tag_byte_last; - uint32_t sizem1, tag_count, tag_size, n, c; - uint8_t *mem1, *mem2; - MMUAccessType type; - - bit55 = extract64(ptr, 55, 1); - *fault = ptr; - - /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ - if (unlikely(!tbi_check(desc, bit55))) { - return -1; - } - - ptr_tag = allocation_tag_from_addr(ptr); - - if (tcma_check(desc, bit55, ptr_tag)) { - return 1; - } - - mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); - type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD; - sizem1 = FIELD_EX32(desc, MTEDESC, SIZEM1); - - /* Find the addr of the end of the access */ - ptr_last = ptr + sizem1; - - /* Round the bounds to the tag granule, and compute the number of tags. 
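 * As a concrete illustration, assuming the 16-byte TAG_GRANULE implied by the "32 bytes" remark elsewhere in this file: a 32-byte access (sizem1 == 31) at a ptr ending in ...07 has ptr_last ending in ...26, so tag_first ends in ...00, tag_last in ...20 and tag_count is 3; rounding to 2*TAG_GRANULE gives tag_byte_first ...00 and tag_byte_last ...20, i.e. two tag bytes to read when the access stays on one page.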
*/ - tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE); - tag_last = QEMU_ALIGN_DOWN(ptr_last, TAG_GRANULE); - tag_count = ((tag_last - tag_first) / TAG_GRANULE) + 1; - - /* Round the bounds to twice the tag granule, and compute the bytes. */ - tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE); - tag_byte_last = QEMU_ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE); - - /* Locate the page boundaries. */ - prev_page = ptr & TARGET_PAGE_MASK; - next_page = prev_page + TARGET_PAGE_SIZE; - - if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) { - /* Memory access stays on one page. */ - tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1; - mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1, - MMU_DATA_LOAD, tag_size, ra); - if (!mem1) { - return 1; - } - /* Perform all of the comparisons. */ - n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count); - } else { - /* Memory access crosses to next page. */ - tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE); - mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr, - MMU_DATA_LOAD, tag_size, ra); - - tag_size = ((tag_byte_last - next_page) / (2 * TAG_GRANULE)) + 1; - mem2 = allocation_tag_mem(env, mmu_idx, next_page, type, - ptr_last - next_page + 1, - MMU_DATA_LOAD, tag_size, ra); - - /* - * Perform all of the comparisons. - * Note the possible but unlikely case of the operation spanning - * two pages that do not both have tagging enabled. - */ - n = c = (next_page - tag_first) / TAG_GRANULE; - if (mem1) { - n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c); - } - if (n == c) { - if (!mem2) { - return 1; - } - n += checkN(mem2, 0, ptr_tag, tag_count - c); - } - } - - if (likely(n == tag_count)) { - return 1; - } - - /* - * If we failed, we know which granule. For the first granule, the - * failure address is @ptr, the first byte accessed. Otherwise the - * failure address is the first byte of the nth granule. - */ - if (n > 0) { - *fault = tag_first + n * TAG_GRANULE; - } - return 0; -} - -uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra) -{ - uint64_t fault; - int ret = mte_probe_int(env, desc, ptr, ra, &fault); - - if (unlikely(ret == 0)) { - mte_check_fail(env, desc, fault, ra); - } else if (ret < 0) { - return ptr; - } - return useronly_clean_ptr(ptr); -} - -uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr) -{ - return mte_check(env, desc, ptr, GETPC()); -} - -/* - * No-fault version of mte_check, to be used by SVE for MemSingleNF. - * Returns false if the access is Checked and the check failed. This - * is only intended to probe the tag -- the validity of the page must - * be checked beforehand. - */ -bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr) -{ - uint64_t fault; - int ret = mte_probe_int(env, desc, ptr, 0, &fault); - - return ret != 0; -} - -/* - * Perform an MTE checked access for DC_ZVA. - */ -uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr) -{ - uintptr_t ra = GETPC(); - int log2_dcz_bytes, log2_tag_bytes; - int mmu_idx, bit55; - intptr_t dcz_bytes, tag_bytes, i; - void *mem; - uint64_t ptr_tag, mem_tag, align_ptr; - - bit55 = extract64(ptr, 55, 1); - - /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ - if (unlikely(!tbi_check(desc, bit55))) { - return ptr; - } - - ptr_tag = allocation_tag_from_addr(ptr); - - if (tcma_check(desc, bit55, ptr_tag)) { - goto done; - } - - /* - * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1, - * i.e. 
32 bytes, which is an unreasonably small dcz anyway, to make - * sure that we can access one complete tag byte here. - */ - log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; - log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); - dcz_bytes = (intptr_t)1 << log2_dcz_bytes; - tag_bytes = (intptr_t)1 << log2_tag_bytes; - align_ptr = ptr & -dcz_bytes; - - /* - * Trap if accessing an invalid page. DC_ZVA requires that we supply - * the original pointer for an invalid page. But watchpoints require - * that we probe the actual space. So do both. - */ - mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); - (void) probe_write(env, ptr, 1, mmu_idx, ra); - mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE, - dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra); - if (!mem) { - goto done; - } - - /* - * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus - * it is quite easy to perform all of the comparisons at once without - * any extra masking. - * - * The most common zva block size is 64; some of the thunderx cpus use - * a block size of 128. For user-only, aarch64_max_initfn will set the - * block size to 512. Fill out the other cases for future-proofing. - * - * In order to be able to find the first miscompare later, we want the - * tag bytes to be in little-endian order. - */ - switch (log2_tag_bytes) { - case 0: /* zva_blocksize 32 */ - mem_tag = *(uint8_t *)mem; - ptr_tag *= 0x11u; - break; - case 1: /* zva_blocksize 64 */ - mem_tag = cpu_to_le16(*(uint16_t *)mem); - ptr_tag *= 0x1111u; - break; - case 2: /* zva_blocksize 128 */ - mem_tag = cpu_to_le32(*(uint32_t *)mem); - ptr_tag *= 0x11111111u; - break; - case 3: /* zva_blocksize 256 */ - mem_tag = cpu_to_le64(*(uint64_t *)mem); - ptr_tag *= 0x1111111111111111ull; - break; - - default: /* zva_blocksize 512, 1024, 2048 */ - ptr_tag *= 0x1111111111111111ull; - i = 0; - do { - mem_tag = cpu_to_le64(*(uint64_t *)(mem + i)); - if (unlikely(mem_tag != ptr_tag)) { - goto fail; - } - i += 8; - align_ptr += 16 * TAG_GRANULE; - } while (i < tag_bytes); - goto done; - } - - if (likely(mem_tag == ptr_tag)) { - goto done; - } - - fail: - /* Locate the first nibble that differs. */ - i = ctz64(mem_tag ^ ptr_tag) >> 4; - mte_check_fail(env, desc, align_ptr + i * TAG_GRANULE, ra); - - done: - return useronly_clean_ptr(ptr); -} diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c deleted file mode 100644 index 403b345ea3..0000000000 --- a/target/arm/mve_helper.c +++ /dev/null @@ -1,3450 +0,0 @@ -/* - * M-profile MVE Operations - * - * Copyright (c) 2021 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . 
- */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "vec_internal.h" -#include "exec/helper-proto.h" -#include "exec/cpu_ldst.h" -#include "exec/exec-all.h" -#include "tcg/tcg.h" -#include "fpu/softfloat.h" - -static uint16_t mve_eci_mask(CPUARMState *env) -{ - /* - * Return the mask of which elements in the MVE vector correspond - * to beats being executed. The mask has 1 bits for executed lanes - * and 0 bits where ECI says this beat was already executed. - */ - int eci; - - if ((env->condexec_bits & 0xf) != 0) { - return 0xffff; - } - - eci = env->condexec_bits >> 4; - switch (eci) { - case ECI_NONE: - return 0xffff; - case ECI_A0: - return 0xfff0; - case ECI_A0A1: - return 0xff00; - case ECI_A0A1A2: - case ECI_A0A1A2B0: - return 0xf000; - default: - g_assert_not_reached(); - } -} - -static uint16_t mve_element_mask(CPUARMState *env) -{ - /* - * Return the mask of which elements in the MVE vector should be - * updated. This is a combination of multiple things: - * (1) by default, we update every lane in the vector - * (2) VPT predication stores its state in the VPR register; - * (3) low-overhead-branch tail predication will mask out part - * the vector on the final iteration of the loop - * (4) if EPSR.ECI is set then we must execute only some beats - * of the insn - * We combine all these into a 16-bit result with the same semantics - * as VPR.P0: 0 to mask the lane, 1 if it is active. - * 8-bit vector ops will look at all bits of the result; - * 16-bit ops will look at bits 0, 2, 4, ...; - * 32-bit ops will look at bits 0, 4, 8 and 12. - * Compare pseudocode GetCurInstrBeat(), though that only returns - * the 4-bit slice of the mask corresponding to a single beat. - */ - uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); - - if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) { - mask |= 0xff; - } - if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) { - mask |= 0xff00; - } - - if (env->v7m.ltpsize < 4 && - env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) { - /* - * Tail predication active, and this is the last loop iteration. - * The element size is (1 << ltpsize), and we only want to process - * loopcount elements, so we want to retain the least significant - * (loopcount * esize) predicate bits and zero out bits above that. - */ - int masklen = env->regs[14] << env->v7m.ltpsize; - assert(masklen <= 16); - uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; - mask &= ltpmask; - } - - /* - * ECI bits indicate which beats are already executed; - * we handle this by effectively predicating them out. - */ - mask &= mve_eci_mask(env); - return mask; -} - -static void mve_advance_vpt(CPUARMState *env) -{ - /* Advance the VPT and ECI state if necessary */ - uint32_t vpr = env->v7m.vpr; - unsigned mask01, mask23; - uint16_t inv_mask; - uint16_t eci_mask = mve_eci_mask(env); - - if ((env->condexec_bits & 0xf) == 0) { - env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ? 
- (ECI_A0 << 4) : (ECI_NONE << 4); - } - - if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) { - /* VPT not enabled, nothing to do */ - return; - } - - /* Invert P0 bits if needed, but only for beats we actually executed */ - mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01); - mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23); - /* Start by assuming we invert all bits corresponding to executed beats */ - inv_mask = eci_mask; - if (mask01 <= 8) { - /* MASK01 says don't invert low half of P0 */ - inv_mask &= ~0xff; - } - if (mask23 <= 8) { - /* MASK23 says don't invert high half of P0 */ - inv_mask &= ~0xff00; - } - vpr ^= inv_mask; - /* Only update MASK01 if beat 1 executed */ - if (eci_mask & 0xf0) { - vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1); - } - /* Beat 3 always executes, so update MASK23 */ - vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1); - env->v7m.vpr = vpr; -} - -/* For loads, predicated lanes are zeroed instead of keeping their old values */ -#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned b, e; \ - /* \ - * R_SXTM allows the dest reg to become UNKNOWN for abandoned \ - * beats so we don't care if we update part of the dest and \ - * then take an exception. \ - */ \ - for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ - if (eci_mask & (1 << b)) { \ - d[H##ESIZE(e)] = (mask & (1 << b)) ? \ - cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ - } \ - addr += MSIZE; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned b, e; \ - for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ - if (mask & (1 << b)) { \ - cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ - } \ - addr += MSIZE; \ - } \ - mve_advance_vpt(env); \ - } - -DO_VLDR(vldrb, 1, ldub, 1, uint8_t) -DO_VLDR(vldrh, 2, lduw, 2, uint16_t) -DO_VLDR(vldrw, 4, ldl, 4, uint32_t) - -DO_VSTR(vstrb, 1, stb, 1, uint8_t) -DO_VSTR(vstrh, 2, stw, 2, uint16_t) -DO_VSTR(vstrw, 4, stl, 4, uint32_t) - -DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) -DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) -DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) -DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) -DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) -DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) - -DO_VSTR(vstrb_h, 1, stb, 2, int16_t) -DO_VSTR(vstrb_w, 1, stb, 4, int32_t) -DO_VSTR(vstrh_w, 2, stw, 4, int32_t) - -#undef DO_VLDR -#undef DO_VSTR - -/* - * Gather loads/scatter stores. Here each element of Qm specifies - * an offset to use from the base register Rm. In the _os_ versions - * that offset is scaled by the element size. - * For loads, predicated lanes are zeroed instead of retaining - * their previous values. - */ -#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - TYPE *d = vd; \ - OFFTYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H##ESIZE(e)]); \ - d[H##ESIZE(e)] = (mask & 1) ? 
\ - cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ - if (WB) { \ - m[H##ESIZE(e)] = addr; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -/* We know here TYPE is unsigned so always the same as the offset type */ -#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - TYPE *d = vd; \ - TYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H##ESIZE(e)]); \ - if (mask & 1) { \ - cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ - } \ - if (WB) { \ - m[H##ESIZE(e)] = addr; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -/* - * 64-bit accesses are slightly different: they are done as two 32-bit - * accesses, controlled by the predicate mask for the relevant beat, - * and with a single 32-bit offset in the first of the two Qm elements. - * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little). - * Address writeback happens on the odd beats and updates the address - * stored in the even-beat element. - */ -#define DO_VLDR64_SG(OP, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - uint32_t *d = vd; \ - uint32_t *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H4(e & ~1)]); \ - addr += 4 * (e & 1); \ - d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \ - if (WB && (e & 1)) { \ - m[H4(e & ~1)] = addr - 4; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSTR64_SG(OP, ADDRFN, WB) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ - uint32_t base) \ - { \ - uint32_t *d = vd; \ - uint32_t *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - unsigned e; \ - uint32_t addr; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ - if (!(eci_mask & 1)) { \ - continue; \ - } \ - addr = ADDRFN(base, m[H4(e & ~1)]); \ - addr += 4 * (e & 1); \ - if (mask & 1) { \ - cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \ - } \ - if (WB && (e & 1)) { \ - m[H4(e & ~1)] = addr - 4; \ - } \ - } \ - mve_advance_vpt(env); \ - } - -#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET)) -#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1)) -#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2)) -#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3)) - -DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false) - -DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false) -DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false) -DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false) - -DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false) 
-DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false) -DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false) -DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false) - -DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false) -DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false) -DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false) -DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false) -DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false) -DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false) - -DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false) -DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false) -DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false) -DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false) - -DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) -DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true) -DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) -DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) - -/* - * Deinterleaving loads/interleaving stores. - * - * For these helpers we are passed the index of the first Qreg - * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3) - * and the value of the base address register Rn. - * The helpers are specialized for pattern and element size, so - * for instance vld42h is VLD4 with pattern 2, element size MO_16. - * - * These insns are beatwise but not predicated, so we must honour ECI, - * but need not look at mve_element_mask(). - * - * The pseudocode implements these insns with multiple memory accesses - * of the element size, but rules R_VVVG and R_FXDM permit us to make - * one 32-bit memory access per beat. 
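 * As a worked example of the byte form below: in vld40b (offsets 0, 1, 10, 11) beat 0 performs one 32-bit load from base + 0 and scatters its four bytes, byte e going to element 0 of Q(n+e), so the four interleaved structure elements land one each in Qn..Qn+3 -- the VLD4 de-interleave that the pseudocode otherwise describes element by element.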
- */ -#define DO_VLD4B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - for (e = 0; e < 4; e++, data >>= 8) { \ - uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ - qd[H1(off[beat])] = data; \ - } \ - } \ - } - -#define DO_VLD4H(OP, O1, O2) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O1, O2, O2 }; \ - uint32_t addr, data; \ - int y; /* y counts 0 2 0 2 */ \ - uint16_t *qd; \ - for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 8 + (beat & 1) * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ - qd[H2(off[beat])] = data; \ - data >>= 16; \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ - qd[H2(off[beat])] = data; \ - } \ - } - -#define DO_VLD4W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - int y; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - y = (beat + (O1 & 2)) & 3; \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ - qd[H4(off[beat] >> 2)] = data; \ - } \ - } - -DO_VLD4B(vld40b, 0, 1, 10, 11) -DO_VLD4B(vld41b, 2, 3, 12, 13) -DO_VLD4B(vld42b, 4, 5, 14, 15) -DO_VLD4B(vld43b, 6, 7, 8, 9) - -DO_VLD4H(vld40h, 0, 5) -DO_VLD4H(vld41h, 1, 6) -DO_VLD4H(vld42h, 2, 7) -DO_VLD4H(vld43h, 3, 4) - -DO_VLD4W(vld40w, 0, 1, 10, 11) -DO_VLD4W(vld41w, 2, 3, 12, 13) -DO_VLD4W(vld42w, 4, 5, 14, 15) -DO_VLD4W(vld43w, 6, 7, 8, 9) - -#define DO_VLD2B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint8_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 2; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - for (e = 0; e < 4; e++, data >>= 8) { \ - qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ - qd[H1(off[beat] + (e >> 1))] = data; \ - } \ - } \ - } - -#define DO_VLD2H(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - int e; \ - uint16_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - for (e = 0; e < 2; e++, data >>= 16) 
{ \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ - qd[H2(off[beat])] = data; \ - } \ - } \ - } - -#define DO_VLD2W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat]; \ - data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ - qd[H4(off[beat] >> 3)] = data; \ - } \ - } - -DO_VLD2B(vld20b, 0, 2, 12, 14) -DO_VLD2B(vld21b, 4, 6, 8, 10) - -DO_VLD2H(vld20h, 0, 1, 6, 7) -DO_VLD2H(vld21h, 2, 3, 4, 5) - -DO_VLD2W(vld20w, 0, 4, 24, 28) -DO_VLD2W(vld21w, 8, 12, 16, 20) - -#define DO_VST4B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = 0; \ - for (e = 3; e >= 0; e--) { \ - uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ - data = (data << 8) | qd[H1(off[beat])]; \ - } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST4H(OP, O1, O2) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O1, O2, O2 }; \ - uint32_t addr, data; \ - int y; /* y counts 0 2 0 2 */ \ - uint16_t *qd; \ - for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 8 + (beat & 1) * 4; \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ - data = qd[H2(off[beat])]; \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ - data |= qd[H2(off[beat])] << 16; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST4W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - int y; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - y = (beat + (O1 & 2)) & 3; \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ - data = qd[H4(off[beat] >> 2)]; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -DO_VST4B(vst40b, 0, 1, 10, 11) -DO_VST4B(vst41b, 2, 3, 12, 13) -DO_VST4B(vst42b, 4, 5, 14, 15) -DO_VST4B(vst43b, 6, 7, 8, 9) - -DO_VST4H(vst40h, 0, 5) -DO_VST4H(vst41h, 1, 6) -DO_VST4H(vst42h, 2, 7) -DO_VST4H(vst43h, 3, 4) - -DO_VST4W(vst40w, 0, 1, 10, 11) -DO_VST4W(vst41w, 2, 3, 12, 13) -DO_VST4W(vst42w, 4, 5, 14, 15) -DO_VST4W(vst43w, 6, 7, 8, 9) - -#define DO_VST2B(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat, e; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint8_t *qd; \ - for (beat = 0; beat 
< 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 2; \ - data = 0; \ - for (e = 3; e >= 0; e--) { \ - qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ - data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \ - } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST2H(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - int e; \ - uint16_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat] * 4; \ - data = 0; \ - for (e = 1; e >= 0; e--) { \ - qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ - data = (data << 16) | qd[H2(off[beat])]; \ - } \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -#define DO_VST2W(OP, O1, O2, O3, O4) \ - void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ - uint32_t base) \ - { \ - int beat; \ - uint16_t mask = mve_eci_mask(env); \ - static const uint8_t off[4] = { O1, O2, O3, O4 }; \ - uint32_t addr, data; \ - uint32_t *qd; \ - for (beat = 0; beat < 4; beat++, mask >>= 4) { \ - if ((mask & 1) == 0) { \ - /* ECI says skip this beat */ \ - continue; \ - } \ - addr = base + off[beat]; \ - qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ - data = qd[H4(off[beat] >> 3)]; \ - cpu_stl_le_data_ra(env, addr, data, GETPC()); \ - } \ - } - -DO_VST2B(vst20b, 0, 2, 12, 14) -DO_VST2B(vst21b, 4, 6, 8, 10) - -DO_VST2H(vst20h, 0, 1, 6, 7) -DO_VST2H(vst21h, 2, 3, 4, 5) - -DO_VST2W(vst20w, 0, 4, 24, 28) -DO_VST2W(vst21w, 8, 12, 16, 20) - -/* - * The mergemask(D, R, M) macro performs the operation "*D = R" but - * storing only the bytes which correspond to 1 bits in M, - * leaving other bytes in *D unchanged. We use _Generic - * to select the correct implementation based on the type of D. 
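 * For instance, when D is a uint32_t pointer, mergemask(&d[H4(e)], r, mask) resolves to mergemask_uw(), which expands the per-byte predicate bits covering that element into a 32-bit byte mask and merges r into *d only under the set bytes.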
- */ - -static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask) -{ - if (mask & 1) { - *d = r; - } -} - -static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask) -{ - mergemask_ub((uint8_t *)d, r, mask); -} - -static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask) -{ - uint16_t bmask = expand_pred_b(mask); - *d = (*d & ~bmask) | (r & bmask); -} - -static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask) -{ - mergemask_uh((uint16_t *)d, r, mask); -} - -static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask) -{ - uint32_t bmask = expand_pred_b(mask); - *d = (*d & ~bmask) | (r & bmask); -} - -static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask) -{ - mergemask_uw((uint32_t *)d, r, mask); -} - -static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask) -{ - uint64_t bmask = expand_pred_b(mask); - *d = (*d & ~bmask) | (r & bmask); -} - -static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) -{ - mergemask_uq((uint64_t *)d, r, mask); -} - -#define mergemask(D, R, M) \ - _Generic(D, \ - uint8_t *: mergemask_ub, \ - int8_t *: mergemask_sb, \ - uint16_t *: mergemask_uh, \ - int16_t *: mergemask_sh, \ - uint32_t *: mergemask_uw, \ - int32_t *: mergemask_sw, \ - uint64_t *: mergemask_uq, \ - int64_t *: mergemask_sq)(D, R, M) - -void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val) -{ - /* - * The generated code already replicated an 8 or 16 bit constant - * into the 32-bit value, so we only need to write the 32-bit - * value to all elements of the Qreg, allowing for predication. - */ - uint32_t *d = vd; - uint16_t mask = mve_element_mask(env); - unsigned e; - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - mergemask(&d[H4(e)], val, mask); - } - mve_advance_vpt(env); -} - -#define DO_1OP(OP, ESIZE, TYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_CLS_B(N) (clrsb32(N) - 24) -#define DO_CLS_H(N) (clrsb32(N) - 16) - -DO_1OP(vclsb, 1, int8_t, DO_CLS_B) -DO_1OP(vclsh, 2, int16_t, DO_CLS_H) -DO_1OP(vclsw, 4, int32_t, clrsb32) - -#define DO_CLZ_B(N) (clz32(N) - 24) -#define DO_CLZ_H(N) (clz32(N) - 16) - -DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) -DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) -DO_1OP(vclzw, 4, uint32_t, clz32) - -DO_1OP(vrev16b, 2, uint16_t, bswap16) -DO_1OP(vrev32b, 4, uint32_t, bswap32) -DO_1OP(vrev32h, 4, uint32_t, hswap32) -DO_1OP(vrev64b, 8, uint64_t, bswap64) -DO_1OP(vrev64h, 8, uint64_t, hswap64) -DO_1OP(vrev64w, 8, uint64_t, wswap64) - -#define DO_NOT(N) (~(N)) - -DO_1OP(vmvn, 8, uint64_t, DO_NOT) - -#define DO_ABS(N) ((N) < 0 ? 
-(N) : (N)) -#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff)) -#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff)) - -DO_1OP(vabsb, 1, int8_t, DO_ABS) -DO_1OP(vabsh, 2, int16_t, DO_ABS) -DO_1OP(vabsw, 4, int32_t, DO_ABS) - -/* We can do these 64 bits at a time */ -DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) -DO_1OP(vfabss, 8, uint64_t, DO_FABSS) - -#define DO_NEG(N) (-(N)) -#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000)) -#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000)) - -DO_1OP(vnegb, 1, int8_t, DO_NEG) -DO_1OP(vnegh, 2, int16_t, DO_NEG) -DO_1OP(vnegw, 4, int32_t, DO_NEG) - -/* We can do these 64 bits at a time */ -DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) -DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) - -/* - * 1 operand immediates: Vda is destination and possibly also one source. - * All these insns work at 64-bit widths. - */ -#define DO_1OP_IMM(OP, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \ - { \ - uint64_t *da = vda; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ - mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_MOVI(N, I) (I) -#define DO_ANDI(N, I) ((N) & (I)) -#define DO_ORRI(N, I) ((N) | (I)) - -DO_1OP_IMM(vmovi, DO_MOVI) -DO_1OP_IMM(vandi, DO_ANDI) -DO_1OP_IMM(vorri, DO_ORRI) - -#define DO_2OP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], \ - FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op helpers for all sizes */ -#define DO_2OP_U(OP, FN) \ - DO_2OP(OP##b, 1, uint8_t, FN) \ - DO_2OP(OP##h, 2, uint16_t, FN) \ - DO_2OP(OP##w, 4, uint32_t, FN) - -/* provide signed 2-op helpers for all sizes */ -#define DO_2OP_S(OP, FN) \ - DO_2OP(OP##b, 1, int8_t, FN) \ - DO_2OP(OP##h, 2, int16_t, FN) \ - DO_2OP(OP##w, 4, int32_t, FN) - -/* - * "Long" operations where two half-sized inputs (taken from either the - * top or the bottom of the input vector) produce a double-width result. - * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output. 
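 * For example, vmullbsh (TOP == 0) reads int16_t lanes 0, 2, 4 and 6 of Qn and Qm and writes the four sign-extended int32_t products to lanes 0..3 of Qd; the top-half 'T' variants (TOP == 1) use lanes 1, 3, 5 and 7 instead.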
- */ -#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ - { \ - LTYPE *d = vd; \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \ - m[H##ESIZE(le * 2 + TOP)]); \ - mergemask(&d[H##LESIZE(le)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op helpers for all sizes */ -#define DO_2OP_SAT_U(OP, FN) \ - DO_2OP_SAT(OP##b, 1, uint8_t, FN) \ - DO_2OP_SAT(OP##h, 2, uint16_t, FN) \ - DO_2OP_SAT(OP##w, 4, uint32_t, FN) - -/* provide signed 2-op helpers for all sizes */ -#define DO_2OP_SAT_S(OP, FN) \ - DO_2OP_SAT(OP##b, 1, int8_t, FN) \ - DO_2OP_SAT(OP##h, 2, int16_t, FN) \ - DO_2OP_SAT(OP##w, 4, int32_t, FN) - -#define DO_AND(N, M) ((N) & (M)) -#define DO_BIC(N, M) ((N) & ~(M)) -#define DO_ORR(N, M) ((N) | (M)) -#define DO_ORN(N, M) ((N) | ~(M)) -#define DO_EOR(N, M) ((N) ^ (M)) - -DO_2OP(vand, 8, uint64_t, DO_AND) -DO_2OP(vbic, 8, uint64_t, DO_BIC) -DO_2OP(vorr, 8, uint64_t, DO_ORR) -DO_2OP(vorn, 8, uint64_t, DO_ORN) -DO_2OP(veor, 8, uint64_t, DO_EOR) - -#define DO_ADD(N, M) ((N) + (M)) -#define DO_SUB(N, M) ((N) - (M)) -#define DO_MUL(N, M) ((N) * (M)) - -DO_2OP_U(vadd, DO_ADD) -DO_2OP_U(vsub, DO_SUB) -DO_2OP_U(vmul, DO_MUL) - -DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL) -DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL) -DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL) -DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL) -DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL) -DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL) - -DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL) -DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL) -DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL) -DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) -DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) -DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) - -/* - * Polynomial multiply. We can always do this generating 64 bits - * of the result at a time, so we don't need to use DO_2OP_L. - */ -#define VMULLPH_MASK 0x00ff00ff00ff00ffULL -#define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) -#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) -#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) -#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) - -DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) -DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) -DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) -DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) - -/* - * Because the computation type is at least twice as large as required, - * these work for both signed and unsigned source types. 
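 * A quick check for the byte case: signed inputs -2 and 3 arrive sign-extended, and (-2 * 3) >> 8 is -1, whose low byte 0xff is the correct signed high half of 0xfffa; unsigned inputs 254 and 3 arrive zero-extended, and (254 * 3) >> 8 is 2, the correct unsigned high half of 0x02fa.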
- */ -static inline uint8_t do_mulh_b(int32_t n, int32_t m) -{ - return (n * m) >> 8; -} - -static inline uint16_t do_mulh_h(int32_t n, int32_t m) -{ - return (n * m) >> 16; -} - -static inline uint32_t do_mulh_w(int64_t n, int64_t m) -{ - return (n * m) >> 32; -} - -static inline uint8_t do_rmulh_b(int32_t n, int32_t m) -{ - return (n * m + (1U << 7)) >> 8; -} - -static inline uint16_t do_rmulh_h(int32_t n, int32_t m) -{ - return (n * m + (1U << 15)) >> 16; -} - -static inline uint32_t do_rmulh_w(int64_t n, int64_t m) -{ - return (n * m + (1U << 31)) >> 32; -} - -DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) -DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) -DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) -DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) -DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) -DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) - -DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b) -DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h) -DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) -DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) -DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) -DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) - -#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) -#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) - -DO_2OP_S(vmaxs, DO_MAX) -DO_2OP_U(vmaxu, DO_MAX) -DO_2OP_S(vmins, DO_MIN) -DO_2OP_U(vminu, DO_MIN) - -#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) - -DO_2OP_S(vabds, DO_ABD) -DO_2OP_U(vabdu, DO_ABD) - -static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m) -{ - return ((uint64_t)n + m) >> 1; -} - -static inline int32_t do_vhadd_s(int32_t n, int32_t m) -{ - return ((int64_t)n + m) >> 1; -} - -static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m) -{ - return ((uint64_t)n - m) >> 1; -} - -static inline int32_t do_vhsub_s(int32_t n, int32_t m) -{ - return ((int64_t)n - m) >> 1; -} - -DO_2OP_S(vhadds, do_vhadd_s) -DO_2OP_U(vhaddu, do_vhadd_u) -DO_2OP_S(vhsubs, do_vhsub_s) -DO_2OP_U(vhsubu, do_vhsub_u) - -#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) -#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) -#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) -#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) - -DO_2OP_S(vshls, DO_VSHLS) -DO_2OP_U(vshlu, DO_VSHLU) -DO_2OP_S(vrshls, DO_VRSHLS) -DO_2OP_U(vrshlu, DO_VRSHLU) - -#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1) -#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1) - -DO_2OP_S(vrhadds, DO_RHADD_S) -DO_2OP_U(vrhaddu, DO_RHADD_U) - -static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m, - uint32_t inv, uint32_t carry_in, bool update_flags) -{ - uint16_t mask = mve_element_mask(env); - unsigned e; - - /* If any additions trigger, we will update flags. */ - if (mask & 0x1111) { - update_flags = true; - } - - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - uint64_t r = carry_in; - r += n[H4(e)]; - r += m[H4(e)] ^ inv; - if (mask & 1) { - carry_in = r >> 32; - } - mergemask(&d[H4(e)], r, mask); - } - - if (update_flags) { - /* Store C, clear NZV. 
*/ - env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK; - env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C; - } - mve_advance_vpt(env); -} - -void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; - do_vadc(env, vd, vn, vm, 0, carry_in, false); -} - -void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; - do_vadc(env, vd, vn, vm, -1, carry_in, false); -} - - -void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - do_vadc(env, vd, vn, vm, 0, 0, true); -} - -void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - do_vadc(env, vd, vn, vm, -1, 1, true); -} - -#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE r[16 / ESIZE]; \ - /* Calculate all results first to avoid overwriting inputs */ \ - for (e = 0; e < 16 / ESIZE; e++) { \ - if (!(e & 1)) { \ - r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \ - } else { \ - r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \ - } \ - } \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], r[e], mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VCADD_ALL(OP, FN0, FN1) \ - DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \ - DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \ - DO_VCADD(OP##w, 4, int32_t, FN0, FN1) - -DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) -DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) -DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s) -DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s) - -static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) -{ - if (val > max) { - *s = true; - return max; - } else if (val < min) { - *s = true; - return min; - } - return val; -} - -#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s) -#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s) -#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s) - -#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s) -#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s) -#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s) - -#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s) -#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s) -#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s) - -#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s) -#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) -#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) - -/* - * For QDMULH and QRDMULH we simplify "double and shift by esize" into - * "shift by esize-1", adjusting the QRDMULH rounding constant to match. 
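 * For the byte case the identity is (2*n*m + (1 << 7)) >> 8 == (n*m + (1 << 6)) >> 7, which is why DO_QRDMULH_B below adds 1 << 6 rather than the architectural 1 << 7 (and DO_QDMULH_B adds nothing at all).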
- */ -#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ - INT8_MIN, INT8_MAX, s) -#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ - INT16_MIN, INT16_MAX, s) -#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ - INT32_MIN, INT32_MAX, s) - -#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ - INT8_MIN, INT8_MAX, s) -#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ - INT16_MIN, INT16_MAX, s) -#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ - INT32_MIN, INT32_MAX, s) - -DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B) -DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H) -DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W) - -DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) -DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) -DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) - -DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B) -DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H) -DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W) -DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B) -DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H) -DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W) - -DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B) -DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H) -DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W) -DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) -DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) -DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) - -/* - * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs() - * and friends wanting a uint32_t* sat and our needing a bool*. - */ -#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \ - ({ \ - uint32_t su32 = 0; \ - typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \ - if (su32) { \ - *satp = true; \ - } \ - r; \ - }) - -#define DO_SQSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) -#define DO_UQSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) -#define DO_SQRSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp) -#define DO_UQRSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp) -#define DO_SUQSHL_OP(N, M, satp) \ - WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp) - -DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) -DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) -DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) -DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) - -/* - * Multiply add dual returning high half - * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of - * whether to add the rounding constant, and the pointer to the - * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant", - * saturate to twice the input size and return the high half; or - * (A * B - C * D) etc for VQDMLSDH. 
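 * Concretely, for vqdmladhh (XCHG == 0) only the even-numbered 16-bit lanes are written: lane e receives the top half of saturate((n[e]*m[e] + n[e+1]*m[e+1]) * 2), with the odd lanes left untouched; the 'X' forms (XCHG == 1) exchange the m operands within each lane pair and write the odd lanes instead.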
- */ -#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - if ((e & 1) == XCHG) { \ - TYPE r = FN(n[H##ESIZE(e)], \ - m[H##ESIZE(e - XCHG)], \ - n[H##ESIZE(e + (1 - 2 * XCHG))], \ - m[H##ESIZE(e + (1 - XCHG))], \ - ROUND, &sat); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7); - return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; -} - -static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15); - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; -} - -static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d, - int round, bool *sat) -{ - int64_t m1 = (int64_t)a * b; - int64_t m2 = (int64_t)c * d; - int64_t r; - /* - * Architecturally we should do the entire add, double, round - * and then check for saturation. We do three saturating adds, - * but we need to be careful about the order. If the first - * m1 + m2 saturates then it's impossible for the *2+rc to - * bring it back into the non-saturated range. However, if - * m1 + m2 is negative then it's possible that doing the doubling - * would take the intermediate result below INT64_MAX and the - * addition of the rounding constant then brings it back in range. - * So we add half the rounding constant before doubling rather - * than adding the rounding constant after the doubling. - */ - if (sadd64_overflow(m1, m2, &r) || - sadd64_overflow(r, (round << 30), &r) || - sadd64_overflow(r, r, &r)) { - *sat = true; - return r < 0 ? INT32_MAX : INT32_MIN; - } - return r >> 32; -} - -static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7); - return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; -} - -static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d, - int round, bool *sat) -{ - int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15); - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; -} - -static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d, - int round, bool *sat) -{ - int64_t m1 = (int64_t)a * b; - int64_t m2 = (int64_t)c * d; - int64_t r; - /* The same ordering issue as in do_vqdmladh_w applies here too */ - if (ssub64_overflow(m1, m2, &r) || - sadd64_overflow(r, (round << 30), &r) || - sadd64_overflow(r, r, &r)) { - *sat = true; - return r < 0 ? 
INT32_MAX : INT32_MIN; - } - return r >> 32; -} - -DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) -DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) -DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) -DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b) -DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h) -DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w) - -DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b) -DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h) -DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w) -DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) -DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) -DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) - -DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w) -DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w) - -DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w) -DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b) -DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h) -DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w) - -#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \ - mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* "accumulating" version where FN takes d as well as n and m */ -#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], \ - FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], \ - FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \ - mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op scalar helpers for all sizes */ -#define DO_2OP_SCALAR_U(OP, 
FN) \ - DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ - DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ - DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) -#define DO_2OP_SCALAR_S(OP, FN) \ - DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \ - DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \ - DO_2OP_SCALAR(OP##w, 4, int32_t, FN) - -#define DO_2OP_ACC_SCALAR_U(OP, FN) \ - DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN) \ - DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN) \ - DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN) - -DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) -DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) -DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) -DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s) -DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) -DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) -DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) - -DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B) -DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H) -DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W) -DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B) -DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H) -DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W) - -DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B) -DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H) -DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W) -DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) -DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) -DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) - -DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B) -DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H) -DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W) -DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) -DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) -DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) - -static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat) -{ - int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7); - return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; -} - -static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c, - int round, bool *sat) -{ - int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15); - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; -} - -static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c, - int round, bool *sat) -{ - /* - * Architecturally we should do the entire add, double, round - * and then check for saturation. We do three saturating adds, - * but we need to be careful about the order. If the first - * m1 + m2 saturates then it's impossible for the *2+rc to - * bring it back into the non-saturated range. However, if - * m1 + m2 is negative then it's possible that doing the doubling - * would take the intermediate result below INT64_MAX and the - * addition of the rounding constant then brings it back in range. - * So we add half the rounding constant and half the "c << esize" - * before doubling rather than adding the rounding constant after - * the doubling. - */ - int64_t m1 = (int64_t)a * b; - int64_t m2 = (int64_t)c << 31; - int64_t r; - if (sadd64_overflow(m1, m2, &r) || - sadd64_overflow(r, (round << 30), &r) || - sadd64_overflow(r, r, &r)) { - *sat = true; - return r < 0 ? 
INT32_MAX : INT32_MIN; - } - return r >> 32; -} - -/* - * The *MLAH insns are vector * scalar + vector; - * the *MLASH insns are vector * vector + scalar - */ -#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S) -#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S) -#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S) -#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S) -#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S) -#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S) - -#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S) -#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S) -#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S) -#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S) -#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S) -#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S) - -DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B) -DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H) -DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W) -DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B) -DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H) -DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W) - -DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B) -DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H) -DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W) -DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B) -DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H) -DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W) - -/* Vector by scalar plus vector */ -#define DO_VMLA(D, N, M) ((N) * (M) + (D)) - -DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA) - -/* Vector by vector plus scalar */ -#define DO_VMLAS(D, N, M) ((N) * (D) + (M)) - -DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS) - -/* - * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the - * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. - * SATMASK specifies which bits of the predicate mask matter for determining - * whether to propagate a saturation indication into FPSCR.QC -- for - * the 16x16->32 case we must check only the bit corresponding to the T or B - * half that we used, but for the 32x32->64 case we propagate if the mask - * bit is set for either half. 
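As a concrete illustration of the SATMASK convention described in the comment above, here is a minimal standalone sketch (plain C, no QEMU headers; sat_propagates() is a name invented for the example, and the constants are the same ones defined further down in this file). It shows, for the first output lane, which predicate bits are allowed to propagate a saturation into FPSCR.QC:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same constants as below: bit 0 covers the bottom 16-bit input half of
 * the first 32-bit output lane, bit 2 the top half; for the 32x32->64
 * case either bit 0 or bit 4 of the first 64-bit lane counts. */
#define SATMASK16B 1
#define SATMASK16T (1 << 2)
#define SATMASK32  ((1 << 4) | 1)

/* A per-lane saturation only reaches FPSCR.QC if the predicate mask
 * enables the input half that was actually used. */
static bool sat_propagates(uint16_t mask, uint16_t satmask, bool sat)
{
    return sat && (mask & satmask);
}

int main(void)
{
    uint16_t mask = 0x000c;  /* only the top 16-bit half of lane 0 is active */
    printf("bottom-half op sets QC: %d\n", sat_propagates(mask, SATMASK16B, true)); /* 0 */
    printf("top-half op sets QC:    %d\n", sat_propagates(mask, SATMASK16T, true)); /* 1 */
    return 0;
}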
- */ -#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - uint32_t rm) \ - { \ - LTYPE *d = vd; \ - TYPE *n = vn; \ - TYPE m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - bool qc = false; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \ - mergemask(&d[H##LESIZE(le)], r, mask); \ - qc |= sat && (mask & SATMASK); \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat) -{ - int64_t r = ((int64_t)n * m) * 2; - return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat); -} - -static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat) -{ - /* The multiply can't overflow, but the doubling might */ - int64_t r = (int64_t)n * m; - if (r > INT64_MAX / 2) { - *sat = true; - return INT64_MAX; - } else if (r < INT64_MIN / 2) { - *sat = true; - return INT64_MIN; - } else { - return r * 2; - } -} - -#define SATMASK16B 1 -#define SATMASK16T (1 << 2) -#define SATMASK32 ((1 << 4) | 1) - -DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \ - do_qdmullh, SATMASK16B) -DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \ - do_qdmullw, SATMASK32) -DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ - do_qdmullh, SATMASK16T) -DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ - do_qdmullw, SATMASK32) - -/* - * Long saturating ops - */ -#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ - void *vm) \ - { \ - LTYPE *d = vd; \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - bool qc = false; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \ - LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \ - mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \ - qc |= sat && (mask & SATMASK); \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B) -DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) -DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T) -DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) - -static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) -{ - m &= 0xff; - if (m == 0) { - return 0; - } - n = revbit8(n); - if (m < 8) { - n >>= 8 - m; - } - return n; -} - -static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m) -{ - m &= 0xff; - if (m == 0) { - return 0; - } - n = revbit16(n); - if (m < 16) { - n >>= 16 - m; - } - return n; -} - -static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m) -{ - m &= 0xff; - if (m == 0) { - return 0; - } - n = revbit32(n); - if (m < 32) { - n >>= 32 - m; - } - return n; -} - -DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb) -DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh) -DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw) - -/* - * Multiply add long dual accumulate ops. 
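A scalar model of what the DO_LDAV macro below expands to may help here (mlaldav_s16() is an invented name; the sketch ignores the predicate mask and beat-wise execution): even elements always accumulate with '+', odd elements with '+' or '-' depending on the insn, and the exchanged (X) forms pair each element of n with its neighbour in m.

#include <stdint.h>
#include <stdio.h>

/* Unpredicated model of VMLALDAV/VMLSLDAV over 16-bit elements. */
static int64_t mlaldav_s16(const int16_t *n, const int16_t *m, int nelem,
                           int xchg, int odd_sub, int64_t a)
{
    for (int e = 0; e < nelem; e++) {
        int64_t prod;
        if (e & 1) {
            prod = (int64_t)n[e - xchg] * m[e];
            a += odd_sub ? -prod : prod;   /* odd elements may subtract */
        } else {
            prod = (int64_t)n[e + xchg] * m[e];
            a += prod;                     /* even elements always add */
        }
    }
    return a;
}

int main(void)
{
    int16_t n[4] = { 1, 2, 3, 4 }, m[4] = { 10, 20, 30, 40 };
    /* plain form: 1*10 + 2*20 + 3*30 + 4*40 = 300 */
    printf("%lld\n", (long long)mlaldav_s16(n, m, 4, 0, 0, 0));
    /* exchanged form: 2*10 + 1*20 + 4*30 + 3*40 = 280 */
    printf("%lld\n", (long long)mlaldav_s16(n, m, 4, 1, 0, 0));
    return 0;
}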
- */ -#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ - uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint64_t a) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *n = vn, *m = vm; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - if (e & 1) { \ - a ODDACC \ - (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ - } else { \ - a EVENACC \ - (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ - } \ - } \ - } \ - mve_advance_vpt(env); \ - return a; \ - } - -DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=) -DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=) -DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=) -DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) - -DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) -DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) - -DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) -DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) -DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) -DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) - -/* - * Multiply add dual accumulate ops - */ -#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint32_t a) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *n = vn, *m = vm; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - if (e & 1) { \ - a ODDACC \ - n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ - } else { \ - a EVENACC \ - n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ - } \ - } \ - } \ - mve_advance_vpt(env); \ - return a; \ - } - -#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC) - -#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \ - DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC) - -DO_DAV_S(vmladavs, false, +=, +=) -DO_DAV_U(vmladavu, false, +=, +=) -DO_DAV_S(vmlsdav, false, +=, -=) -DO_DAV_S(vmladavsx, true, +=, +=) -DO_DAV_S(vmlsdavx, true, +=, -=) - -/* - * Rounding multiply add long dual accumulate high. In the pseudocode - * this is implemented with a 72-bit internal accumulator value of which - * the top 64 bits are returned. We optimize this to avoid having to - * use 128-bit arithmetic -- we can do this because the 74-bit accumulator - * is squashed back into 64-bits after each beat. 
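The key to the DO_LDAVH implementation that follows is that each beat's product is rounded down by 8 bits before it ever touches the 64-bit accumulator. A minimal standalone sketch of one such beat (rmlaldavh_beat() is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* One beat of a rounding accumulate-high: take the 32x32 product, drop
 * the low 8 bits with round-to-nearest, then add it into the 64-bit
 * running accumulator.  Squashing every beat back to 64 bits is what
 * lets the helper avoid 72-bit arithmetic. */
static int64_t rmlaldavh_beat(int64_t acc, int32_t n, int32_t m)
{
    int64_t mul = (int64_t)n * m;
    mul = (mul >> 8) + ((mul >> 7) & 1);   /* discard low byte, rounding */
    return acc + mul;
}

int main(void)
{
    /* 3 * 128 = 0x180: 0x180 >> 8 is 1, but bit 7 is set so it rounds to 2 */
    printf("%lld\n", (long long)rmlaldavh_beat(0, 3, 128));
    return 0;
}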
- */ -#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \ - uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint64_t a) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *n = vn, *m = vm; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ - if (mask & 1) { \ - LTYPE mul; \ - if (e & 1) { \ - mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \ - if (SUB) { \ - mul = -mul; \ - } \ - } else { \ - mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \ - } \ - mul = (mul >> 8) + ((mul >> 7) & 1); \ - a += mul; \ - } \ - } \ - mve_advance_vpt(env); \ - return a; \ - } - -DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false) -DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false) - -DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false) - -DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true) -DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true) - -/* Vector add across vector */ -#define DO_VADDV(OP, ESIZE, TYPE) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint32_t ra) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - ra += m[H##ESIZE(e)]; \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -DO_VADDV(vaddvsb, 1, int8_t) -DO_VADDV(vaddvsh, 2, int16_t) -DO_VADDV(vaddvsw, 4, int32_t) -DO_VADDV(vaddvub, 1, uint8_t) -DO_VADDV(vaddvuh, 2, uint16_t) -DO_VADDV(vaddvuw, 4, uint32_t) - -/* - * Vector max/min across vector. Unlike VADDV, we must - * read ra as the element size, not its full width. - * We work with int64_t internally for simplicity. - */ -#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint32_t ra_in) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - int64_t ra = (RATYPE)ra_in; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - ra = FN(ra, m[H##ESIZE(e)]); \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -#define DO_VMAXMINV_U(INSN, FN) \ - DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \ - DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \ - DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN) -#define DO_VMAXMINV_S(INSN, FN) \ - DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \ - DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \ - DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN) - -/* - * Helpers for max and min of absolute values across vector: - * note that we only take the absolute value of 'm', not 'n' - */ -static int64_t do_maxa(int64_t n, int64_t m) -{ - if (m < 0) { - m = -m; - } - return MAX(n, m); -} - -static int64_t do_mina(int64_t n, int64_t m) -{ - if (m < 0) { - m = -m; - } - return MIN(n, m); -} - -DO_VMAXMINV_S(vmaxvs, DO_MAX) -DO_VMAXMINV_U(vmaxvu, DO_MAX) -DO_VMAXMINV_S(vminvs, DO_MIN) -DO_VMAXMINV_U(vminvu, DO_MIN) -/* - * VMAXAV, VMINAV treat the general purpose input as unsigned - * and the vector elements as signed. 
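A small standalone model of what that signed/unsigned asymmetry means for one byte lane (vmaxav_byte() is a name invented for the example; the real helpers below operate on whole vectors under the predicate mask):

#include <stdint.h>
#include <stdio.h>

/* The general-purpose accumulator is read as unsigned; the vector
 * element is signed and only its absolute value is compared. */
static int64_t do_maxa(int64_t n, int64_t m)
{
    if (m < 0) {
        m = -m;
    }
    return n > m ? n : m;
}

static uint32_t vmaxav_byte(uint32_t ra_in, int8_t elem)
{
    int64_t ra = (uint8_t)ra_in;   /* 200 stays 200, not -56 */
    ra = do_maxa(ra, elem);
    return ra;
}

int main(void)
{
    /* prints 200: |-100| = 100 does not beat the unsigned accumulator */
    printf("%u\n", vmaxav_byte(200, -100));
    return 0;
}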
- */ -DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa) -DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa) -DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa) -DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina) -DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina) -DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina) - -#define DO_VABAV(OP, ESIZE, TYPE) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - void *vm, uint32_t ra) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm, *n = vn; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - int64_t n0 = n[H##ESIZE(e)]; \ - int64_t m0 = m[H##ESIZE(e)]; \ - uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \ - ra += r; \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } - -DO_VABAV(vabavsb, 1, int8_t) -DO_VABAV(vabavsh, 2, int16_t) -DO_VABAV(vabavsw, 4, int32_t) -DO_VABAV(vabavub, 1, uint8_t) -DO_VABAV(vabavuh, 2, uint16_t) -DO_VABAV(vabavuw, 4, uint32_t) - -#define DO_VADDLV(OP, TYPE, LTYPE) \ - uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint64_t ra) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ - if (mask & 1) { \ - ra += (LTYPE)m[H4(e)]; \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -DO_VADDLV(vaddlv_s, int32_t, int64_t) -DO_VADDLV(vaddlv_u, uint32_t, uint64_t) - -/* Shifts by immediate */ -#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], \ - FN(m[H##ESIZE(e)], shift), mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], \ - FN(m[H##ESIZE(e)], shift, &sat), mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -/* provide unsigned 2-op shift helpers for all sizes */ -#define DO_2SHIFT_U(OP, FN) \ - DO_2SHIFT(OP##b, 1, uint8_t, FN) \ - DO_2SHIFT(OP##h, 2, uint16_t, FN) \ - DO_2SHIFT(OP##w, 4, uint32_t, FN) -#define DO_2SHIFT_S(OP, FN) \ - DO_2SHIFT(OP##b, 1, int8_t, FN) \ - DO_2SHIFT(OP##h, 2, int16_t, FN) \ - DO_2SHIFT(OP##w, 4, int32_t, FN) - -#define DO_2SHIFT_SAT_U(OP, FN) \ - DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \ - DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \ - DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN) -#define DO_2SHIFT_SAT_S(OP, FN) \ - DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \ - DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \ - DO_2SHIFT_SAT(OP##w, 4, int32_t, FN) - -DO_2SHIFT_U(vshli_u, DO_VSHLU) -DO_2SHIFT_S(vshli_s, DO_VSHLS) -DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP) -DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP) -DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP) -DO_2SHIFT_U(vrshli_u, DO_VRSHLU) -DO_2SHIFT_S(vrshli_s, DO_VRSHLS) -DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP) -DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP) - -/* Shift-and-insert; we always work with 64 bits at a time */ -#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t 
shift) \ - { \ - uint64_t *d = vd, *m = vm; \ - uint16_t mask; \ - uint64_t shiftmask; \ - unsigned e; \ - if (shift == ESIZE * 8) { \ - /* \ - * Only VSRI can shift by
; it should mean "don't \ - * update the destination". The generic logic can't handle \ - * this because it would try to shift by an out-of-range \ - * amount, so special case it here. \ - */ \ - goto done; \ - } \ - assert(shift < ESIZE * 8); \ - mask = mve_element_mask(env); \ - /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \ - shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \ - for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ - uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \ - (d[H8(e)] & ~shiftmask); \ - mergemask(&d[H8(e)], r, mask); \ - } \ -done: \ - mve_advance_vpt(env); \ - } - -#define DO_SHL(N, SHIFT) ((N) << (SHIFT)) -#define DO_SHR(N, SHIFT) ((N) >> (SHIFT)) -#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT)) -#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT)) - -DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK) -DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK) -DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK) -DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK) -DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK) -DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK) - -/* - * Long shifts taking half-sized inputs from top or bottom of the input - * vector and producing a double-width result. ESIZE, TYPE are for - * the input, and LESIZE, LTYPE for the output. - * Unlike the normal shift helpers, we do not handle negative shift counts, - * because the long shift is strictly left-only. - */ -#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - LTYPE *d = vd; \ - TYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - assert(shift <= 16); \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \ - mergemask(&d[H##LESIZE(le)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSHLL_ALL(OP, TOP) \ - DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \ - DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \ - DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \ - DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \ - -DO_VSHLL_ALL(vshllb, false) -DO_VSHLL_ALL(vshllt, true) - -/* - * Narrowing right shifts, taking a double sized input, shifting it - * and putting the result in either the top or bottom half of the output. - * ESIZE, TYPE are the output, and LESIZE, LTYPE the input. 
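Going back to the shift-and-insert helpers above for a moment, the mask construction is the subtle part; here is a standalone sketch for a single byte element (vsli_byte() is an invented name, and MAKE_64BIT_MASK is a simplified local definition so the snippet compiles on its own):

#include <stdint.h>
#include <stdio.h>

/* Only the destination bits that the shifted source can reach are
 * replaced; the remaining destination bits are preserved. */
#define MAKE_64BIT_MASK(shift, length) \
    (((~0ULL) >> (64 - (length))) << (shift))

#define SHL_MASK(ebits, shift) MAKE_64BIT_MASK((shift), (ebits) - (shift))
#define SHR_MASK(ebits, shift) MAKE_64BIT_MASK(0, (ebits) - (shift))

static uint8_t vsli_byte(uint8_t d, uint8_t m, unsigned shift)
{
    uint8_t mask = SHL_MASK(8, shift);
    return (uint8_t)((m << shift) & mask) | (d & ~mask);
}

int main(void)
{
    /* VSLI #3: bits [7:3] come from m << 3, bits [2:0] of d survive */
    printf("0x%02x\n", vsli_byte(0xff, 0x15, 3)); /* 0xa8 | 0x07 = 0xaf */
    return 0;
}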
- */ -#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - TYPE r = FN(m[H##LESIZE(le)], shift); \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSHRN_ALL(OP, FN) \ - DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \ - DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \ - DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \ - DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN) - -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. */ - return 0; - } -} - -DO_VSHRN_ALL(vshrn, DO_SHR) -DO_VSHRN_ALL(vrshrn, do_urshr) - -static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max, - bool *satp) -{ - if (val > max) { - *satp = true; - return max; - } else if (val < min) { - *satp = true; - return min; - } else { - return val; - } -} - -/* Saturating narrowing right shifts */ -#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ - void *vm, uint32_t shift) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - bool qc = false; \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ - DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) - -#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ - DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) - -#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ - DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) - -#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \ - DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ - DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) - -#define DO_SHRN_SB(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP) -#define DO_SHRN_UB(N, M, SATP) \ - do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP) -#define DO_SHRUN_B(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP) - -#define DO_SHRN_SH(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP) -#define DO_SHRN_UH(N, M, SATP) \ - do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP) -#define DO_SHRUN_H(N, M, SATP) \ - do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP) - -#define DO_RSHRN_SB(N, M, SATP) \ - do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP) -#define DO_RSHRN_UB(N, M, SATP) \ - do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP) -#define DO_RSHRUN_B(N, M, SATP) \ - do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP) - -#define DO_RSHRN_SH(N, M, SATP) \ - 
do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP) -#define DO_RSHRN_UH(N, M, SATP) \ - do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP) -#define DO_RSHRUN_H(N, M, SATP) \ - do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP) - -DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB) -DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH) -DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB) -DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH) -DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B) -DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H) - -DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB) -DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH) -DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB) -DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH) -DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B) -DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H) - -#define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], \ - m[H##LESIZE(le)], mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t) -DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t) -DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t) -DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t) - -#define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - LTYPE *m = vm; \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - bool qc = false; \ - unsigned le; \ - mask >>= ESIZE * TOP; \ - for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ - bool sat = false; \ - TYPE r = FN(m[H##LESIZE(le)], &sat); \ - mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VMOVN_SAT_UB(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ - DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) - -#define DO_VMOVN_SAT_UH(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ - DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) - -#define DO_VMOVN_SAT_SB(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ - DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) - -#define DO_VMOVN_SAT_SH(BOP, TOP, FN) \ - DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ - DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) - -#define DO_VQMOVN_SB(N, SATP) \ - do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP) -#define DO_VQMOVN_UB(N, SATP) \ - do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP) -#define DO_VQMOVUN_B(N, SATP) \ - do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP) - -#define DO_VQMOVN_SH(N, SATP) \ - do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP) -#define DO_VQMOVN_UH(N, SATP) \ - do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP) -#define DO_VQMOVUN_H(N, SATP) \ - do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP) - -DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB) -DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH) -DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB) -DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH) -DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B) -DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H) - -uint32_t HELPER(mve_vshlc)(CPUARMState *env, 
void *vd, uint32_t rdm, - uint32_t shift) -{ - uint32_t *d = vd; - uint16_t mask = mve_element_mask(env); - unsigned e; - uint32_t r; - - /* - * For each 32-bit element, we shift it left, bringing in the - * low 'shift' bits of rdm at the bottom. Bits shifted out at - * the top become the new rdm, if the predicate mask permits. - * The final rdm value is returned to update the register. - * shift == 0 here means "shift by 32 bits". - */ - if (shift == 0) { - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - r = rdm; - if (mask & 1) { - rdm = d[H4(e)]; - } - mergemask(&d[H4(e)], r, mask); - } - } else { - uint32_t shiftmask = MAKE_64BIT_MASK(0, shift); - - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - r = (d[H4(e)] << shift) | (rdm & shiftmask); - if (mask & 1) { - rdm = d[H4(e)] >> (32 - shift); - } - mergemask(&d[H4(e)], r, mask); - } - } - mve_advance_vpt(env); - return rdm; -} - -uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl_d(n, -(int8_t)shift, false, NULL); -} - -uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl_d(n, (int8_t)shift, false, NULL); -} - -uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl_d(n, (int8_t)shift, false, &env->QF); -} - -uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl_d(n, (int8_t)shift, false, &env->QF); -} - -uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF); -} - -uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl_d(n, (int8_t)shift, true, &env->QF); -} - -/* Operate on 64-bit values, but saturate at 48 bits */ -static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - int64_t val, extval; - - if (shift <= -48) { - /* Rounding the sign bit always produces 0. */ - if (round) { - return 0; - } - return src >> 63; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - val = (src >> 1) + (src & 1); - } else { - val = src >> -shift; - } - extval = sextract64(val, 0, 48); - if (!sat || val == extval) { - return extval; - } - } else if (shift < 48) { - int64_t extval = sextract64(src << shift, 0, 48); - if (!sat || src == (extval >> shift)) { - return extval; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return src >= 0 ? 
MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17); -} - -/* Operate on 64-bit values, but saturate at 48 bits */ -static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - uint64_t val, extval; - - if (shift <= -(48 + round)) { - return 0; - } else if (shift < 0) { - if (round) { - val = src >> (-shift - 1); - val = (val >> 1) + (val & 1); - } else { - val = src >> -shift; - } - extval = extract64(val, 0, 48); - if (!sat || val == extval) { - return extval; - } - } else if (shift < 48) { - uint64_t extval = extract64(src << shift, 0, 48); - if (!sat || src == (extval >> shift)) { - return extval; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return MAKE_64BIT_MASK(0, 48); -} - -uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF); -} - -uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift) -{ - return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF); -} - -uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); -} - -uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); -} - -uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF); -} - -uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift) -{ - return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF); -} - -#define DO_VIDUP(OP, ESIZE, TYPE, FN) \ - uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ - uint32_t offset, uint32_t imm) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], offset, mask); \ - offset = FN(offset, imm); \ - } \ - mve_advance_vpt(env); \ - return offset; \ - } - -#define DO_VIWDUP(OP, ESIZE, TYPE, FN) \ - uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ - uint32_t offset, uint32_t wrap, \ - uint32_t imm) \ - { \ - TYPE *d = vd; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], offset, mask); \ - offset = FN(offset, wrap, imm); \ - } \ - mve_advance_vpt(env); \ - return offset; \ - } - -#define DO_VIDUP_ALL(OP, FN) \ - DO_VIDUP(OP##b, 1, int8_t, FN) \ - DO_VIDUP(OP##h, 2, int16_t, FN) \ - DO_VIDUP(OP##w, 4, int32_t, FN) - -#define DO_VIWDUP_ALL(OP, FN) \ - DO_VIWDUP(OP##b, 1, int8_t, FN) \ - DO_VIWDUP(OP##h, 2, int16_t, FN) \ - DO_VIWDUP(OP##w, 4, int32_t, FN) - -static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) -{ - offset += imm; - if (offset == wrap) { - offset = 0; - } - return offset; -} - -static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) -{ - if (offset == 0) { - offset = wrap; - } - offset -= imm; - return offset; -} - -DO_VIDUP_ALL(vidup, DO_ADD) -DO_VIWDUP_ALL(viwdup, do_add_wrap) -DO_VIWDUP_ALL(vdwdup, do_sub_wrap) - -/* - * Vector comparison. - * P0 bits for non-executed beats (where eci_mask is 0) are unchanged. - * P0 bits for predicated lanes in executed beats (where mask is 0) are 0. - * P0 bits otherwise are updated with the results of the comparisons. - * We must also keep unchanged the MASK fields at the top of v7m.vpr. 
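To make the three-way P0 update rule in the comment above concrete, here is a standalone model of the final merge step (vcmp_update_p0() is an invented name; the real macros below also compute beatpred element by element):

#include <stdint.h>
#include <stdio.h>

/* Beats outside eci_mask keep their old P0 bits, predicated-off bytes
 * (mask 0) are cleared, and everything else takes the new comparison
 * result. */
static uint16_t vcmp_update_p0(uint16_t old_p0, uint16_t beatpred,
                               uint16_t mask, uint16_t eci_mask)
{
    beatpred &= mask;                       /* predicated lanes read as 0 */
    return (old_p0 & ~eci_mask) | (beatpred & eci_mask);
}

int main(void)
{
    /* beats 0-1 already executed: eci_mask only covers bytes 8..15 */
    uint16_t p0 = vcmp_update_p0(0x00ff /* old P0 */, 0xffff /* all true */,
                                 0xf0f0 /* predicate */, 0xff00 /* eci */);
    /* 0xf0ff: low byte kept from old P0, high byte from the new
     * (predicate-masked) result */
    printf("0x%04x\n", p0);
    return 0;
}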
- */ -#define DO_VCMP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ - { \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++) { \ - bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - emask <<= ESIZE; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - uint32_t rm) \ - { \ - TYPE *n = vn; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++) { \ - bool r = FN(n[H##ESIZE(e)], (TYPE)rm); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - emask <<= ESIZE; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_S(OP, FN) \ - DO_VCMP(OP##b, 1, int8_t, FN) \ - DO_VCMP(OP##h, 2, int16_t, FN) \ - DO_VCMP(OP##w, 4, int32_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN) - -#define DO_VCMP_U(OP, FN) \ - DO_VCMP(OP##b, 1, uint8_t, FN) \ - DO_VCMP(OP##h, 2, uint16_t, FN) \ - DO_VCMP(OP##w, 4, uint32_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN) \ - DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN) - -#define DO_EQ(N, M) ((N) == (M)) -#define DO_NE(N, M) ((N) != (M)) -#define DO_EQ(N, M) ((N) == (M)) -#define DO_EQ(N, M) ((N) == (M)) -#define DO_GE(N, M) ((N) >= (M)) -#define DO_LT(N, M) ((N) < (M)) -#define DO_GT(N, M) ((N) > (M)) -#define DO_LE(N, M) ((N) <= (M)) - -DO_VCMP_U(vcmpeq, DO_EQ) -DO_VCMP_U(vcmpne, DO_NE) -DO_VCMP_U(vcmpcs, DO_GE) -DO_VCMP_U(vcmphi, DO_GT) -DO_VCMP_S(vcmpge, DO_GE) -DO_VCMP_S(vcmplt, DO_LT) -DO_VCMP_S(vcmpgt, DO_GT) -DO_VCMP_S(vcmple, DO_LE) - -void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm) -{ - /* - * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n] - * but note that whether bytes are written to Qd is still subject - * to (all forms of) predication in the usual way. - */ - uint64_t *d = vd, *n = vn, *m = vm; - uint16_t mask = mve_element_mask(env); - uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); - unsigned e; - for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) { - uint64_t r = m[H8(e)]; - mergemask(&r, n[H8(e)], p0); - mergemask(&d[H8(e)], r, mask); - } - mve_advance_vpt(env); -} - -void HELPER(mve_vpnot)(CPUARMState *env) -{ - /* - * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged. - * P0 bits for predicated lanes in executed bits (where mask is 0) are 0. - * P0 bits otherwise are inverted. - * (This is the same logic as VCMP.) - * This insn is itself subject to predication and to beat-wise execution, - * and after it executes VPT state advances in the usual way. 
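A minimal model of the VPNOT update rule described above (vpnot_p0() is an invented name): executed, unpredicated P0 bits are inverted, predicated-off bits become 0, and beats outside the ECI window keep their old value, using the same merging scheme as the VCMP helpers.

#include <stdint.h>
#include <stdio.h>

static uint16_t vpnot_p0(uint16_t old_p0, uint16_t mask, uint16_t eci_mask)
{
    uint16_t beatpred = (uint16_t)(~old_p0) & mask;   /* inverted predicate */
    return (old_p0 & ~eci_mask) | (beatpred & eci_mask);
}

int main(void)
{
    /* all beats execute; only the low 8 byte lanes are unpredicated */
    printf("0x%04x\n", vpnot_p0(0x0f0f, 0x00ff, 0xffff)); /* 0x00f0 */
    return 0;
}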
- */ - uint16_t mask = mve_element_mask(env); - uint16_t eci_mask = mve_eci_mask(env); - uint16_t beatpred = ~env->v7m.vpr & mask; - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask); - mve_advance_vpt(env); -} - -/* - * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed, - * otherwise set according to value of Rn. The calculation of - * newmask here works in the same way as the calculation of the - * ltpmask in mve_element_mask(), but we have pre-calculated - * the masklen in the generated code. - */ -void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen) -{ - uint16_t mask = mve_element_mask(env); - uint16_t eci_mask = mve_eci_mask(env); - uint16_t newmask; - - assert(masklen <= 16); - newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; - newmask &= mask; - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask); - mve_advance_vpt(env); -} - -#define DO_1OP_SAT(OP, ESIZE, TYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - TYPE *d = vd, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - bool qc = false; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - bool sat = false; \ - mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \ - qc |= sat & mask & 1; \ - } \ - if (qc) { \ - env->vfp.qc[0] = qc; \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VQABS_B(N, SATP) \ - do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP) -#define DO_VQABS_H(N, SATP) \ - do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP) -#define DO_VQABS_W(N, SATP) \ - do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP) - -#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP) -#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP) -#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP) - -DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B) -DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H) -DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W) - -DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B) -DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H) -DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W) - -/* - * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its - * absolute value; we then do an unsigned comparison. - */ -#define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN) \ - void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ - { \ - UTYPE *d = vd; \ - STYPE *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - UTYPE r = DO_ABS(m[H##ESIZE(e)]); \ - r = FN(d[H##ESIZE(e)], r); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX) -DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX) -DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX) -DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN) -DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN) -DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN) - -/* - * 2-operand floating point. Note that if an element is partially - * predicated we must do the FP operation to update the non-predicated - * bytes, but we must be careful to avoid updating the FP exception - * state unless byte 0 of the element was unpredicated. 
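The "compute the result but don't update the flags" trick mentioned above is easier to see in isolation. The sketch below uses toy stand-in types (struct fp_status and toy_div() are NOT the softfloat API, just enough to show the control flow): when byte 0 of an element is predicated out, the operation runs on a throwaway copy of the status word, so any exception flags it raises are discarded.

#include <stdbool.h>
#include <stdio.h>

struct fp_status { unsigned exception_flags; };

static float toy_div(float a, float b, struct fp_status *st)
{
    if (b == 0.0f) {
        st->exception_flags |= 1;   /* pretend divide-by-zero flag */
        return 0.0f;
    }
    return a / b;
}

int main(void)
{
    struct fp_status env_fpst = { 0 };
    struct fp_status scratch;
    bool element_byte0_active = false;   /* byte 0 predicated out */

    struct fp_status *fpst = &env_fpst;
    if (!element_byte0_active) {
        scratch = env_fpst;              /* compute, but discard flags */
        fpst = &scratch;
    }
    (void)toy_div(1.0f, 0.0f, fpst);
    printf("architectural flags: %u\n", env_fpst.exception_flags); /* still 0 */
    return 0;
}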
- */ -#define DO_2OP_FP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_FP_ALL(OP, FN) \ - DO_2OP_FP(OP##h, 2, float16, float16_##FN) \ - DO_2OP_FP(OP##s, 4, float32, float32_##FN) - -DO_2OP_FP_ALL(vfadd, add) -DO_2OP_FP_ALL(vfsub, sub) -DO_2OP_FP_ALL(vfmul, mul) - -static inline float16 float16_abd(float16 a, float16 b, float_status *s) -{ - return float16_abs(float16_sub(a, b, s)); -} - -static inline float32 float32_abd(float32 a, float32 b, float_status *s) -{ - return float32_abs(float32_sub(a, b, s)); -} - -DO_2OP_FP_ALL(vfabd, abd) -DO_2OP_FP_ALL(vmaxnm, maxnum) -DO_2OP_FP_ALL(vminnm, minnum) - -static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s) -{ - return float16_maxnum(float16_abs(a), float16_abs(b), s); -} - -static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s) -{ - return float32_maxnum(float32_abs(a), float32_abs(b), s); -} - -static inline float16 float16_minnuma(float16 a, float16 b, float_status *s) -{ - return float16_minnum(float16_abs(a), float16_abs(b), s); -} - -static inline float32 float32_minnuma(float32 a, float32 b, float_status *s) -{ - return float32_minnum(float32_abs(a), float32_abs(b), s); -} - -DO_2OP_FP_ALL(vmaxnma, maxnuma) -DO_2OP_FP_ALL(vminnma, minnuma) - -#define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r[16 / ESIZE]; \ - uint16_t tm, mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - /* Calculate all results first to avoid overwriting inputs */ \ - for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) { \ - if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - r[e] = 0; \ - continue; \ - } \ - fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(tm & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - if (!(e & 1)) { \ - r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \ - } else { \ - r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \ - } \ - } \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - mergemask(&d[H##ESIZE(e)], r[e], mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add) -DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add) -DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub) -DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub) - -#define DO_VFMA(OP, ESIZE, TYPE, CHS) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = n[H##ESIZE(e)]; \ - if (CHS) { \ - r = TYPE##_chs(r); \ - } \ - r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \ - 0, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VFMA(vfmah, 2, float16, false) -DO_VFMA(vfmas, 4, float32, false) -DO_VFMA(vfmsh, 2, float16, true) -DO_VFMA(vfmss, 4, float32, true) - -#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, void *vm) \ - { \ - TYPE *d = vd, *n = vn, *m = vm; \ - TYPE r0, r1, e1, e2, e3, e4; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst0, *fpst1; \ - float_status scratch_fpst; \ - /* We loop through pairs of elements at a time */ \ - for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) { \ - continue; \ - } \ - fpst0 = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - fpst1 = fpst0; \ - if (!(mask & 1)) { \ - scratch_fpst = *fpst0; \ - fpst0 = &scratch_fpst; \ - } \ - if (!(mask & (1 << ESIZE))) { \ - scratch_fpst = *fpst1; \ - fpst1 = &scratch_fpst; \ - } \ - switch (ROT) { \ - case 0: \ - e1 = m[H##ESIZE(e)]; \ - e2 = n[H##ESIZE(e)]; \ - e3 = m[H##ESIZE(e + 1)]; \ - e4 = n[H##ESIZE(e)]; \ - break; \ - case 1: \ - e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ - e2 = n[H##ESIZE(e + 1)]; \ - e3 = m[H##ESIZE(e)]; \ - e4 = n[H##ESIZE(e + 1)]; \ - break; \ - case 2: \ - e1 = TYPE##_chs(m[H##ESIZE(e)]); \ - e2 = n[H##ESIZE(e)]; \ - e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ - e4 = n[H##ESIZE(e)]; \ - break; \ - case 3: \ - e1 = m[H##ESIZE(e + 1)]; \ - e2 = n[H##ESIZE(e + 1)]; \ - e3 = TYPE##_chs(m[H##ESIZE(e)]); \ - e4 = n[H##ESIZE(e + 1)]; \ - break; \ - default: \ - g_assert_not_reached(); \ - } \ - r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0); \ - r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1); \ - mergemask(&d[H##ESIZE(e)], r0, mask); \ - mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S) -#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S) - -#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S) -#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S) - -DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH) -DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS) -DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH) -DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS) -DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH) -DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS) -DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH) -DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS) - -DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH) -DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS) -DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH) -DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS) -DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH) -DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS) -DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH) -DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS) - -#define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE r, m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -#define DO_2OP_FP_SCALAR_ALL(OP, FN) \ - DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN) \ - DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN) - -DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add) -DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub) -DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul) - -#define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vn, uint32_t rm) \ - { \ - TYPE *d = vd, *n = vn; \ - TYPE r, m = rm; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -/* VFMAS is vector * vector + scalar, so swap op2 and op3 */ -#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S) -#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S) - -/* VFMA is vector * scalar + vector */ -DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd) -DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd) -DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH) -DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS) - -/* Floating point max/min across vector. */ -#define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN) \ - uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ - uint32_t ra_in) \ - { \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - TYPE *m = vm; \ - TYPE ra = (TYPE)ra_in; \ - float_status *fpst = (ESIZE == 2) ? 
\ - &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if (mask & 1) { \ - TYPE v = m[H##ESIZE(e)]; \ - if (TYPE##_is_signaling_nan(ra, fpst)) { \ - ra = TYPE##_silence_nan(ra, fpst); \ - float_raise(float_flag_invalid, fpst); \ - } \ - if (TYPE##_is_signaling_nan(v, fpst)) { \ - v = TYPE##_silence_nan(v, fpst); \ - float_raise(float_flag_invalid, fpst); \ - } \ - if (ABS) { \ - v = TYPE##_abs(v); \ - } \ - ra = FN(ra, v, fpst); \ - } \ - } \ - mve_advance_vpt(env); \ - return ra; \ - } \ - -#define NOP(X) (X) - -DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum) -DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum) -DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum) -DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum) -DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum) -DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum) -DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum) -DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum) - -/* FP compares; note that all comparisons signal InvalidOp for QNaNs */ -#define DO_VCMP_FP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ - { \ - TYPE *n = vn, *m = vm; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - bool r; \ - for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ - if ((mask & emask) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & (1 << (e * ESIZE)))) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ - uint32_t rm) \ - { \ - TYPE *n = vn; \ - uint16_t mask = mve_element_mask(env); \ - uint16_t eci_mask = mve_eci_mask(env); \ - uint16_t beatpred = 0; \ - uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - bool r; \ - for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ - if ((mask & emask) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & (1 << (e * ESIZE)))) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst); \ - /* Comparison sets 0/1 bits for each byte in the element */ \ - beatpred |= r * emask; \ - } \ - beatpred &= mask; \ - env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ - (beatpred & eci_mask); \ - mve_advance_vpt(env); \ - } - -#define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN) \ - DO_VCMP_FP(VOP, ESIZE, TYPE, FN) \ - DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN) - -/* - * Some care is needed here to get the correct result for the unordered case. 
- * Architecturally EQ, GE and GT are defined to be false for unordered, but - * the NE, LT and LE comparisons are defined as simple logical inverses of - * EQ, GE and GT and so they must return true for unordered. The softfloat - * comparison functions float*_{eq,le,lt} all return false for unordered. - */ -#define DO_GE16(X, Y, S) float16_le(Y, X, S) -#define DO_GE32(X, Y, S) float32_le(Y, X, S) -#define DO_GT16(X, Y, S) float16_lt(Y, X, S) -#define DO_GT32(X, Y, S) float32_lt(Y, X, S) - -DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq) -DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq) - -DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq) -DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq) - -DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16) -DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32) - -DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16) -DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32) - -DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16) -DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32) - -DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16) -DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32) - -#define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm, \ - uint32_t shift) \ - { \ - TYPE *d = vd, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(m[H##ESIZE(e)], shift, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh) -DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh) -DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero) -DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero) -DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos) -DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos) -DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero) -DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero) - -/* VCVT with specified rmode */ -#define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, \ - void *vd, void *vm, uint32_t rmode) \ - { \ - TYPE *d = vd, *m = vm; \ - TYPE r; \ - uint16_t mask = mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - float_status *base_fpst = (ESIZE == 2) ? 
\ - &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - uint32_t prev_rmode = get_float_rounding_mode(base_fpst); \ - set_float_rounding_mode(rmode, base_fpst); \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = base_fpst; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(m[H##ESIZE(e)], 0, fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - set_float_rounding_mode(prev_rmode, base_fpst); \ - mve_advance_vpt(env); \ - } - -DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh) -DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh) -DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls) -DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls) - -#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S) -#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S) - -DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H) -DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S) - -/* - * VCVT between halfprec and singleprec. As usual for halfprec - * conversions, FZ16 is ignored and AHP is observed. - */ -static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top) -{ - uint16_t *d = vd; - uint32_t *m = vm; - uint16_t r; - uint16_t mask = mve_element_mask(env); - bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); - unsigned e; - float_status *fpst; - float_status scratch_fpst; - float_status *base_fpst = &env->vfp.standard_fp_status; - bool old_fz = get_flush_to_zero(base_fpst); - set_flush_to_zero(false, base_fpst); - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { - continue; - } - fpst = base_fpst; - if (!(mask & 1)) { - /* We need the result but without updating flags */ - scratch_fpst = *fpst; - fpst = &scratch_fpst; - } - r = float32_to_float16(m[H4(e)], ieee, fpst); - mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2)); - } - set_flush_to_zero(old_fz, base_fpst); - mve_advance_vpt(env); -} - -static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top) -{ - uint32_t *d = vd; - uint16_t *m = vm; - uint32_t r; - uint16_t mask = mve_element_mask(env); - bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); - unsigned e; - float_status *fpst; - float_status scratch_fpst; - float_status *base_fpst = &env->vfp.standard_fp_status; - bool old_fiz = get_flush_inputs_to_zero(base_fpst); - set_flush_inputs_to_zero(false, base_fpst); - for (e = 0; e < 16 / 4; e++, mask >>= 4) { - if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { - continue; - } - fpst = base_fpst; - if (!(mask & (1 << (top * 2)))) { - /* We need the result but without updating flags */ - scratch_fpst = *fpst; - fpst = &scratch_fpst; - } - r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst); - mergemask(&d[H4(e)], r, mask); - } - set_flush_inputs_to_zero(old_fiz, base_fpst); - mve_advance_vpt(env); -} - -void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_sh(env, vd, vm, 0); -} -void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_sh(env, vd, vm, 1); -} -void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_hs(env, vd, vm, 0); -} -void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm) -{ - do_vcvt_hs(env, vd, vm, 1); -} - -#define DO_1OP_FP(OP, ESIZE, TYPE, FN) \ - void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm) \ - { \ - TYPE *d = vd, *m = vm; \ - TYPE r; \ - uint16_t mask = 
mve_element_mask(env); \ - unsigned e; \ - float_status *fpst; \ - float_status scratch_fpst; \ - for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ - if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ - continue; \ - } \ - fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ - &env->vfp.standard_fp_status; \ - if (!(mask & 1)) { \ - /* We need the result but without updating flags */ \ - scratch_fpst = *fpst; \ - fpst = &scratch_fpst; \ - } \ - r = FN(m[H##ESIZE(e)], fpst); \ - mergemask(&d[H##ESIZE(e)], r, mask); \ - } \ - mve_advance_vpt(env); \ - } - -DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int) -DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int) diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c deleted file mode 100644 index bc6c4a54e9..0000000000 --- a/target/arm/neon_helper.c +++ /dev/null @@ -1,1740 +0,0 @@ -/* - * ARM NEON vector operations. - * - * Copyright (c) 2007, 2008 CodeSourcery. - * Written by Paul Brook - * - * This code is licensed under the GNU GPL v2. - */ -#include "qemu/osdep.h" - -#include "cpu.h" -#include "exec/helper-proto.h" -#include "fpu/softfloat.h" -#include "vec_internal.h" - -#define SIGNBIT (uint32_t)0x80000000 -#define SIGNBIT64 ((uint64_t)1 << 63) - -#define SET_QC() env->vfp.qc[0] = 1 - -#define NEON_TYPE1(name, type) \ -typedef struct \ -{ \ - type v1; \ -} neon_##name; -#if HOST_BIG_ENDIAN -#define NEON_TYPE2(name, type) \ -typedef struct \ -{ \ - type v2; \ - type v1; \ -} neon_##name; -#define NEON_TYPE4(name, type) \ -typedef struct \ -{ \ - type v4; \ - type v3; \ - type v2; \ - type v1; \ -} neon_##name; -#else -#define NEON_TYPE2(name, type) \ -typedef struct \ -{ \ - type v1; \ - type v2; \ -} neon_##name; -#define NEON_TYPE4(name, type) \ -typedef struct \ -{ \ - type v1; \ - type v2; \ - type v3; \ - type v4; \ -} neon_##name; -#endif - -NEON_TYPE4(s8, int8_t) -NEON_TYPE4(u8, uint8_t) -NEON_TYPE2(s16, int16_t) -NEON_TYPE2(u16, uint16_t) -NEON_TYPE1(s32, int32_t) -NEON_TYPE1(u32, uint32_t) -#undef NEON_TYPE4 -#undef NEON_TYPE2 -#undef NEON_TYPE1 - -/* Copy from a uint32_t to a vector structure type. */ -#define NEON_UNPACK(vtype, dest, val) do { \ - union { \ - vtype v; \ - uint32_t i; \ - } conv_u; \ - conv_u.i = (val); \ - dest = conv_u.v; \ - } while(0) - -/* Copy from a vector structure type to a uint32_t. */ -#define NEON_PACK(vtype, dest, val) do { \ - union { \ - vtype v; \ - uint32_t i; \ - } conv_u; \ - conv_u.v = (val); \ - dest = conv_u.i; \ - } while(0) - -#define NEON_DO1 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); -#define NEON_DO2 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); -#define NEON_DO4 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); - -#define NEON_VOP_BODY(vtype, n) \ -{ \ - uint32_t res; \ - vtype vsrc1; \ - vtype vsrc2; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, arg1); \ - NEON_UNPACK(vtype, vsrc2, arg2); \ - NEON_DO##n; \ - NEON_PACK(vtype, res, vdest); \ - return res; \ -} - -#define NEON_VOP(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ -NEON_VOP_BODY(vtype, n) - -#define NEON_VOP_ENV(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ -NEON_VOP_BODY(vtype, n) - -/* Pairwise operations. */ -/* For 32-bit elements each segment only contains a single element, so - the elementwise and pairwise operations are the same. 
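For illustration, here is a minimal standalone sketch (not taken from the QEMU sources) of the pack/unpack-via-union pattern the NEON_VOP macros above are built on: four byte lanes carried in one uint32_t, processed independently, then repacked. All names are invented for the example; which named lane holds the low byte depends on host byte order, which is why the original provides both the HOST_BIG_ENDIAN and little-endian layouts.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint8_t v1, v2, v3, v4; } lanes_u8;

static uint32_t per_lane_add(uint32_t a, uint32_t b)
{
    union { lanes_u8 v; uint32_t i; } ua, ub, ud;

    ua.i = a;                        /* unpack */
    ub.i = b;
    ud.v.v1 = ua.v.v1 + ub.v.v1;     /* each lane wraps on its own ...       */
    ud.v.v2 = ua.v.v2 + ub.v.v2;
    ud.v.v3 = ua.v.v3 + ub.v.v3;     /* ... no carry crosses into a neighbour */
    ud.v.v4 = ua.v.v4 + ub.v.v4;
    return ud.i;                     /* repack */
}

int main(void)
{
    uint32_t a = 0x010203ff, b = 0x01010101;

    /* plain add: 0x02030500 (the 0xff lane carries into its neighbour);
     * per-lane add: 0x02030400 (the 0xff lane wraps to 0x00 in isolation) */
    printf("%08x %08x\n", a + b, per_lane_add(a, b));
    return 0;
}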
*/ -#define NEON_PDO2 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); -#define NEON_PDO4 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ - -#define NEON_POP(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ -{ \ - uint32_t res; \ - vtype vsrc1; \ - vtype vsrc2; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, arg1); \ - NEON_UNPACK(vtype, vsrc2, arg2); \ - NEON_PDO##n; \ - NEON_PACK(vtype, res, vdest); \ - return res; \ -} - -/* Unary operators. */ -#define NEON_VOP1(name, vtype, n) \ -uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ -{ \ - vtype vsrc1; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, arg); \ - NEON_DO##n; \ - NEON_PACK(vtype, arg, vdest); \ - return arg; \ -} - - -#define NEON_USAT(dest, src1, src2, type) do { \ - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - dest = ~0; \ - } else { \ - dest = tmp; \ - }} while(0) -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) -NEON_VOP_ENV(qadd_u8, neon_u8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) -NEON_VOP_ENV(qadd_u16, neon_u16, 2) -#undef NEON_FN -#undef NEON_USAT - -uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (res < a) { - SET_QC(); - res = ~0; - } - return res; -} - -uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - res = src1 + src2; - if (res < src1) { - SET_QC(); - res = ~(uint64_t)0; - } - return res; -} - -#define NEON_SSAT(dest, src1, src2, type) do { \ - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - if (src2 > 0) { \ - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ - } else { \ - tmp = 1 << (sizeof(type) * 8 - 1); \ - } \ - } \ - dest = tmp; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) -NEON_VOP_ENV(qadd_s8, neon_s8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) -NEON_VOP_ENV(qadd_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_SSAT - -uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { - SET_QC(); - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - res = src1 + src2; - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { - SET_QC(); - res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; - } - return res; -} - -/* Unsigned saturating accumulate of signed value - * - * Op1/Rn is treated as signed - * Op2/Rd is treated as unsigned - * - * Explicit casting is used to ensure the correct sign extension of - * inputs. The result is treated as a unsigned value and saturated as such. - * - * We use a macro for the 8/16 bit cases which expects signed integers of va, - * vb, and vr for interim calculation and an unsigned 32 bit result value r. 
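As a worked example of the NEON_USAT idiom just above (not QEMU code; names invented, the QC flag reduced to a bool for the sketch): the addition is done in a wider type, and if truncating back to the element type changes the value, the result has overflowed and is pinned to the maximum.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t qadd_u8_sketch(uint8_t a, uint8_t b, bool *sat)
{
    uint32_t tmp = (uint32_t)a + (uint32_t)b;

    if (tmp != (uint8_t)tmp) {    /* truncation changed the value: overflow */
        *sat = true;              /* the real helper sets FPSCR.QC here */
        return 0xff;
    }
    return (uint8_t)tmp;
}

int main(void)
{
    bool sat = false;

    printf("%u\n", qadd_u8_sketch(200, 100, &sat));  /* 255, sat becomes true */
    printf("%u\n", qadd_u8_sketch(20, 30, &sat));    /* 50 */
    return 0;
}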
- */ - -#define USATACC(bits, shift) \ - do { \ - va = sextract32(a, shift, bits); \ - vb = extract32(b, shift, bits); \ - vr = va + vb; \ - if (vr > UINT##bits##_MAX) { \ - SET_QC(); \ - vr = UINT##bits##_MAX; \ - } else if (vr < 0) { \ - SET_QC(); \ - vr = 0; \ - } \ - r = deposit32(r, shift, bits, vr); \ - } while (0) - -uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int16_t va, vb, vr; - uint32_t r = 0; - - USATACC(8, 0); - USATACC(8, 8); - USATACC(8, 16); - USATACC(8, 24); - return r; -} - -uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int32_t va, vb, vr; - uint64_t r = 0; - - USATACC(16, 0); - USATACC(16, 16); - return r; -} - -#undef USATACC - -uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int64_t va = (int32_t)a; - int64_t vb = (uint32_t)b; - int64_t vr = va + vb; - if (vr > UINT32_MAX) { - SET_QC(); - vr = UINT32_MAX; - } else if (vr < 0) { - SET_QC(); - vr = 0; - } - return vr; -} - -uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint64_t res; - res = a + b; - /* We only need to look at the pattern of SIGN bits to detect - * +ve/-ve saturation - */ - if (~a & b & ~res & SIGNBIT64) { - SET_QC(); - res = UINT64_MAX; - } else if (a & ~b & res & SIGNBIT64) { - SET_QC(); - res = 0; - } - return res; -} - -/* Signed saturating accumulate of unsigned value - * - * Op1/Rn is treated as unsigned - * Op2/Rd is treated as signed - * - * The result is treated as a signed value and saturated as such - * - * We use a macro for the 8/16 bit cases which expects signed integers of va, - * vb, and vr for interim calculation and an unsigned 32 bit result value r. - */ - -#define SSATACC(bits, shift) \ - do { \ - va = extract32(a, shift, bits); \ - vb = sextract32(b, shift, bits); \ - vr = va + vb; \ - if (vr > INT##bits##_MAX) { \ - SET_QC(); \ - vr = INT##bits##_MAX; \ - } else if (vr < INT##bits##_MIN) { \ - SET_QC(); \ - vr = INT##bits##_MIN; \ - } \ - r = deposit32(r, shift, bits, vr); \ - } while (0) - -uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int16_t va, vb, vr; - uint32_t r = 0; - - SSATACC(8, 0); - SSATACC(8, 8); - SSATACC(8, 16); - SSATACC(8, 24); - return r; -} - -uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int32_t va, vb, vr; - uint32_t r = 0; - - SSATACC(16, 0); - SSATACC(16, 16); - - return r; -} - -#undef SSATACC - -uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - int64_t res; - int64_t op1 = (uint32_t)a; - int64_t op2 = (int32_t)b; - res = op1 + op2; - if (res > INT32_MAX) { - SET_QC(); - res = INT32_MAX; - } else if (res < INT32_MIN) { - SET_QC(); - res = INT32_MIN; - } - return res; -} - -uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint64_t res; - res = a + b; - /* We only need to look at the pattern of SIGN bits to detect an overflow */ - if (((a & res) - | (~b & res) - | (a & ~b)) & SIGNBIT64) { - SET_QC(); - res = INT64_MAX; - } - return res; -} - - -#define NEON_USAT(dest, src1, src2, type) do { \ - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - dest = 0; \ - } else { \ - dest = tmp; \ - }} while(0) -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) -NEON_VOP_ENV(qsub_u8, neon_u8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) -NEON_VOP_ENV(qsub_u16, neon_u16, 2) -#undef NEON_FN -#undef NEON_USAT - 
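The 64-bit case cannot widen any further, so neon_uqadd_s64 above detects saturation purely from the sign bits of the two operands and of the raw sum. A minimal standalone sketch of that check (names invented, QC flag omitted):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SIGNBIT64 ((uint64_t)1 << 63)

/* a is the signed addend, b the unsigned accumulator */
static uint64_t uqadd_s64_sketch(uint64_t a, uint64_t b)
{
    uint64_t res = a + b;

    if (~a & b & ~res & SIGNBIT64) {        /* climbed past UINT64_MAX */
        res = UINT64_MAX;
    } else if (a & ~b & res & SIGNBIT64) {  /* dropped below zero */
        res = 0;
    }
    return res;
}

int main(void)
{
    /* adding -1 to an accumulator of 0 pins the result at 0 ... */
    printf("%" PRIu64 "\n", uqadd_s64_sketch((uint64_t)-1, 0));
    /* ... and adding +1 to UINT64_MAX pins it at UINT64_MAX */
    printf("%" PRIu64 "\n", uqadd_s64_sketch(1, UINT64_MAX));
    return 0;
}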
-uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (res > a) { - SET_QC(); - res = 0; - } - return res; -} - -uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - if (src1 < src2) { - SET_QC(); - res = 0; - } else { - res = src1 - src2; - } - return res; -} - -#define NEON_SSAT(dest, src1, src2, type) do { \ - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - SET_QC(); \ - if (src2 < 0) { \ - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ - } else { \ - tmp = 1 << (sizeof(type) * 8 - 1); \ - } \ - } \ - dest = tmp; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) -NEON_VOP_ENV(qsub_s8, neon_s8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) -NEON_VOP_ENV(qsub_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_SSAT - -uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { - SET_QC(); - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) -{ - uint64_t res; - - res = src1 - src2; - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { - SET_QC(); - res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; - } - return res; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 -NEON_VOP(hadd_s8, neon_s8, 4) -NEON_VOP(hadd_u8, neon_u8, 4) -NEON_VOP(hadd_s16, neon_s16, 2) -NEON_VOP(hadd_u16, neon_u16, 2) -#undef NEON_FN - -int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) -{ - int32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if (src1 & src2 & 1) - dest++; - return dest; -} - -uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) -{ - uint32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if (src1 & src2 & 1) - dest++; - return dest; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 -NEON_VOP(rhadd_s8, neon_s8, 4) -NEON_VOP(rhadd_u8, neon_u8, 4) -NEON_VOP(rhadd_s16, neon_s16, 2) -NEON_VOP(rhadd_u16, neon_u16, 2) -#undef NEON_FN - -int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) -{ - int32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if ((src1 | src2) & 1) - dest++; - return dest; -} - -uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) -{ - uint32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if ((src1 | src2) & 1) - dest++; - return dest; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 -NEON_VOP(hsub_s8, neon_s8, 4) -NEON_VOP(hsub_u8, neon_u8, 4) -NEON_VOP(hsub_s16, neon_s16, 2) -NEON_VOP(hsub_u16, neon_u16, 2) -#undef NEON_FN - -int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) -{ - int32_t dest; - - dest = (src1 >> 1) - (src2 >> 1); - if ((~src1) & src2 & 1) - dest--; - return dest; -} - -uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) -{ - uint32_t dest; - - dest = (src1 >> 1) - (src2 >> 1); - if ((~src1) & src2 & 1) - dest--; - return dest; -} - -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 -NEON_POP(pmin_s8, neon_s8, 4) -NEON_POP(pmin_u8, neon_u8, 4) -NEON_POP(pmin_s16, neon_s16, 2) -NEON_POP(pmin_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? 
src1 : src2 -NEON_POP(pmax_s8, neon_s8, 4) -NEON_POP(pmax_u8, neon_u8, 4) -NEON_POP(pmax_s16, neon_s16, 2) -NEON_POP(pmax_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) -NEON_VOP(shl_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) -NEON_VOP(shl_s16, neon_s16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) -NEON_VOP(rshl_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) -NEON_VOP(rshl_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) -{ - return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); -} - -uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) -{ - return do_sqrshl_d(val, (int8_t)shift, true, NULL); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) -NEON_VOP(rshl_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) -NEON_VOP(rshl_u16, neon_u16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) -{ - return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); -} - -uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) -{ - return do_uqrshl_d(val, (int8_t)shift, true, NULL); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_u16, neon_u16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); -} - -uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) -NEON_VOP_ENV(qshl_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); -} - -uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) -NEON_VOP_ENV(qshlu_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) -NEON_VOP_ENV(qshlu_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); -} - -uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 
-NEON_VOP_ENV(qrshl_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) -NEON_VOP_ENV(qrshl_u16, neon_u16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); -} - -uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); -} - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) -NEON_VOP_ENV(qrshl_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) \ - (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) -NEON_VOP_ENV(qrshl_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) -{ - return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); -} - -uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) -{ - return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); -} - -uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) -{ - uint32_t mask; - mask = (a ^ b) & 0x80808080u; - a &= ~0x80808080u; - b &= ~0x80808080u; - return (a + b) ^ mask; -} - -uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) -{ - uint32_t mask; - mask = (a ^ b) & 0x80008000u; - a &= ~0x80008000u; - b &= ~0x80008000u; - return (a + b) ^ mask; -} - -#define NEON_FN(dest, src1, src2) dest = src1 + src2 -NEON_POP(padd_u8, neon_u8, 4) -NEON_POP(padd_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = src1 - src2 -NEON_VOP(sub_u8, neon_u8, 4) -NEON_VOP(sub_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = src1 * src2 -NEON_VOP(mul_u8, neon_u8, 4) -NEON_VOP(mul_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 -NEON_VOP(tst_u8, neon_u8, 4) -NEON_VOP(tst_u16, neon_u16, 2) -NEON_VOP(tst_u32, neon_u32, 1) -#undef NEON_FN - -/* Count Leading Sign/Zero Bits. */ -static inline int do_clz8(uint8_t x) -{ - int n; - for (n = 8; x; n--) - x >>= 1; - return n; -} - -static inline int do_clz16(uint16_t x) -{ - int n; - for (n = 16; x; n--) - x >>= 1; - return n; -} - -#define NEON_FN(dest, src, dummy) dest = do_clz8(src) -NEON_VOP1(clz_u8, neon_u8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src, dummy) dest = do_clz16(src) -NEON_VOP1(clz_u16, neon_u16, 2) -#undef NEON_FN - -#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 -NEON_VOP1(cls_s8, neon_s8, 4) -#undef NEON_FN - -#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 -NEON_VOP1(cls_s16, neon_s16, 2) -#undef NEON_FN - -uint32_t HELPER(neon_cls_s32)(uint32_t x) -{ - int count; - if ((int32_t)x < 0) - x = ~x; - for (count = 32; x; count--) - x = x >> 1; - return count - 1; -} - -/* Bit count. 
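neon_add_u8 above performs the same per-lane addition as the union-based macros, but with a single 32-bit add: the top bit of every lane is stripped first so an overflow cannot carry into its neighbour, and the correct top bits are folded back in with an XOR. A small standalone check (not QEMU code):

#include <stdint.h>
#include <stdio.h>

static uint32_t packed_add_u8(uint32_t a, uint32_t b)
{
    uint32_t mask = (a ^ b) & 0x80808080u;  /* a^b of each lane's top bit */

    a &= ~0x80808080u;                      /* clear the top bits so no   */
    b &= ~0x80808080u;                      /* carry escapes its lane     */
    return (a + b) ^ mask;                  /* fold the top bits back in  */
}

int main(void)
{
    /* a plain 32-bit add gives 0x03000002 (the carry leaks between lanes),
     * the lane-safe version gives 0x02000002 */
    printf("%08x %08x\n",
           0x02ff0001u + 0x00010001u, packed_add_u8(0x02ff0001u, 0x00010001u));
    return 0;
}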
*/ -uint32_t HELPER(neon_cnt_u8)(uint32_t x) -{ - x = (x & 0x55555555) + ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); - return x; -} - -/* Reverse bits in each 8 bit word */ -uint32_t HELPER(neon_rbit_u8)(uint32_t x) -{ - x = ((x & 0xf0f0f0f0) >> 4) - | ((x & 0x0f0f0f0f) << 4); - x = ((x & 0x88888888) >> 3) - | ((x & 0x44444444) >> 1) - | ((x & 0x22222222) << 1) - | ((x & 0x11111111) << 3); - return x; -} - -#define NEON_QDMULH16(dest, src1, src2, round) do { \ - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ - SET_QC(); \ - tmp = (tmp >> 31) ^ ~SIGNBIT; \ - } else { \ - tmp <<= 1; \ - } \ - if (round) { \ - int32_t old = tmp; \ - tmp += 1 << 15; \ - if ((int32_t)tmp < old) { \ - SET_QC(); \ - tmp = SIGNBIT - 1; \ - } \ - } \ - dest = tmp >> 16; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) -NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) -NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_QDMULH16 - -#define NEON_QDMULH32(dest, src1, src2, round) do { \ - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ - SET_QC(); \ - tmp = (tmp >> 63) ^ ~SIGNBIT64; \ - } else { \ - tmp <<= 1; \ - } \ - if (round) { \ - int64_t old = tmp; \ - tmp += (int64_t)1 << 31; \ - if ((int64_t)tmp < old) { \ - SET_QC(); \ - tmp = SIGNBIT64 - 1; \ - } \ - } \ - dest = tmp >> 32; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) -NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) -NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) -#undef NEON_FN -#undef NEON_QDMULH32 - -uint32_t HELPER(neon_narrow_u8)(uint64_t x) -{ - return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) - | ((x >> 24) & 0xff000000u); -} - -uint32_t HELPER(neon_narrow_u16)(uint64_t x) -{ - return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); -} - -uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) -{ - return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) - | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); -} - -uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) -{ - return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); -} - -uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) -{ - x &= 0xff80ff80ff80ff80ull; - x += 0x0080008000800080ull; - return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) - | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); -} - -uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) -{ - x &= 0xffff8000ffff8000ull; - x += 0x0000800000008000ull; - return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); -} - -uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) -{ - uint16_t s; - uint8_t d; - uint32_t res = 0; -#define SAT8(n) \ - s = x >> n; \ - if (s & 0x8000) { \ - SET_QC(); \ - } else { \ - if (s > 0xff) { \ - d = 0xff; \ - SET_QC(); \ - } else { \ - d = s; \ - } \ - res |= (uint32_t)d << (n / 2); \ - } - - SAT8(0); - SAT8(16); - SAT8(32); - SAT8(48); -#undef SAT8 - return res; -} - -uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) -{ - uint16_t s; - uint8_t d; - uint32_t res = 0; -#define SAT8(n) \ - s = x >> n; \ - if (s > 0xff) { \ - d = 0xff; \ - SET_QC(); \ - } else { \ - d = s; \ - } \ - res |= (uint32_t)d << (n / 2); - - SAT8(0); - SAT8(16); - SAT8(32); - SAT8(48); -#undef SAT8 
- return res; -} - -uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) -{ - int16_t s; - uint8_t d; - uint32_t res = 0; -#define SAT8(n) \ - s = x >> n; \ - if (s != (int8_t)s) { \ - d = (s >> 15) ^ 0x7f; \ - SET_QC(); \ - } else { \ - d = s; \ - } \ - res |= (uint32_t)d << (n / 2); - - SAT8(0); - SAT8(16); - SAT8(32); - SAT8(48); -#undef SAT8 - return res; -} - -uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) -{ - uint32_t high; - uint32_t low; - low = x; - if (low & 0x80000000) { - low = 0; - SET_QC(); - } else if (low > 0xffff) { - low = 0xffff; - SET_QC(); - } - high = x >> 32; - if (high & 0x80000000) { - high = 0; - SET_QC(); - } else if (high > 0xffff) { - high = 0xffff; - SET_QC(); - } - return low | (high << 16); -} - -uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) -{ - uint32_t high; - uint32_t low; - low = x; - if (low > 0xffff) { - low = 0xffff; - SET_QC(); - } - high = x >> 32; - if (high > 0xffff) { - high = 0xffff; - SET_QC(); - } - return low | (high << 16); -} - -uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) -{ - int32_t low; - int32_t high; - low = x; - if (low != (int16_t)low) { - low = (low >> 31) ^ 0x7fff; - SET_QC(); - } - high = x >> 32; - if (high != (int16_t)high) { - high = (high >> 31) ^ 0x7fff; - SET_QC(); - } - return (uint16_t)low | (high << 16); -} - -uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) -{ - if (x & 0x8000000000000000ull) { - SET_QC(); - return 0; - } - if (x > 0xffffffffu) { - SET_QC(); - return 0xffffffffu; - } - return x; -} - -uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) -{ - if (x > 0xffffffffu) { - SET_QC(); - return 0xffffffffu; - } - return x; -} - -uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) -{ - if ((int64_t)x != (int32_t)x) { - SET_QC(); - return ((int64_t)x >> 63) ^ 0x7fffffff; - } - return x; -} - -uint64_t HELPER(neon_widen_u8)(uint32_t x) -{ - uint64_t tmp; - uint64_t ret; - ret = (uint8_t)x; - tmp = (uint8_t)(x >> 8); - ret |= tmp << 16; - tmp = (uint8_t)(x >> 16); - ret |= tmp << 32; - tmp = (uint8_t)(x >> 24); - ret |= tmp << 48; - return ret; -} - -uint64_t HELPER(neon_widen_s8)(uint32_t x) -{ - uint64_t tmp; - uint64_t ret; - ret = (uint16_t)(int8_t)x; - tmp = (uint16_t)(int8_t)(x >> 8); - ret |= tmp << 16; - tmp = (uint16_t)(int8_t)(x >> 16); - ret |= tmp << 32; - tmp = (uint16_t)(int8_t)(x >> 24); - ret |= tmp << 48; - return ret; -} - -uint64_t HELPER(neon_widen_u16)(uint32_t x) -{ - uint64_t high = (uint16_t)(x >> 16); - return ((uint16_t)x) | (high << 32); -} - -uint64_t HELPER(neon_widen_s16)(uint32_t x) -{ - uint64_t high = (int16_t)(x >> 16); - return ((uint32_t)(int16_t)x) | (high << 32); -} - -uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ b) & 0x8000800080008000ull; - a &= ~0x8000800080008000ull; - b &= ~0x8000800080008000ull; - return (a + b) ^ mask; -} - -uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ b) & 0x8000000080000000ull; - a &= ~0x8000000080000000ull; - b &= ~0x8000000080000000ull; - return (a + b) ^ mask; -} - -uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) -{ - uint64_t tmp; - uint64_t tmp2; - - tmp = a & 0x0000ffff0000ffffull; - tmp += (a >> 16) & 0x0000ffff0000ffffull; - tmp2 = b & 0xffff0000ffff0000ull; - tmp2 += (b << 16) & 0xffff0000ffff0000ull; - return ( tmp & 0xffff) - | ((tmp >> 16) & 0xffff0000ull) - | ((tmp2 << 16) & 0xffff00000000ull) - | ( tmp2 & 
0xffff000000000000ull); -} - -uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) -{ - uint32_t low = a + (a >> 32); - uint32_t high = b + (b >> 32); - return low + ((uint64_t)high << 32); -} - -uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ ~b) & 0x8000800080008000ull; - a |= 0x8000800080008000ull; - b &= ~0x8000800080008000ull; - return (a - b) ^ mask; -} - -uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) -{ - uint64_t mask; - mask = (a ^ ~b) & 0x8000000080000000ull; - a |= 0x8000000080000000ull; - b &= ~0x8000000080000000ull; - return (a - b) ^ mask; -} - -uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint32_t x, y; - uint32_t low, high; - - x = a; - y = b; - low = x + y; - if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { - SET_QC(); - low = ((int32_t)x >> 31) ^ ~SIGNBIT; - } - x = a >> 32; - y = b >> 32; - high = x + y; - if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { - SET_QC(); - high = ((int32_t)x >> 31) ^ ~SIGNBIT; - } - return low | ((uint64_t)high << 32); -} - -uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) -{ - uint64_t result; - - result = a + b; - if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { - SET_QC(); - result = ((int64_t)a >> 63) ^ ~SIGNBIT64; - } - return result; -} - -/* We have to do the arithmetic in a larger type than - * the input type, because for example with a signed 32 bit - * op the absolute difference can overflow a signed 32 bit value. - */ -#define DO_ABD(dest, x, y, intype, arithtype) do { \ - arithtype tmp_x = (intype)(x); \ - arithtype tmp_y = (intype)(y); \ - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ - } while(0) - -uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, uint8_t, uint32_t); - DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); - result |= tmp << 16; - DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); - result |= tmp << 32; - DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, int8_t, int32_t); - DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); - result |= tmp << 16; - DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); - result |= tmp << 32; - DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, uint16_t, uint32_t); - DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - DO_ABD(result, a, b, int16_t, int32_t); - DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) -{ - uint64_t result; - DO_ABD(result, a, b, uint32_t, uint64_t); - return result; -} - -uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) -{ - uint64_t result; - DO_ABD(result, a, b, int32_t, int64_t); - return result; -} -#undef DO_ABD - -/* Widening multiply. Named type is the source type. 
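As the DO_ABD comment above notes, the subtraction has to happen in a wider type: for 32-bit inputs the absolute difference itself may not fit in 32 signed bits. A standalone sketch (names invented):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t abd_s32(int32_t x, int32_t y)
{
    int64_t tx = x, ty = y;   /* widen first, then subtract */

    return (uint64_t)(tx > ty ? tx - ty : ty - tx);
}

int main(void)
{
    /* INT32_MAX - INT32_MIN = 4294967295, which no int32_t can hold */
    printf("%" PRIu64 "\n", abd_s32(INT32_MAX, INT32_MIN));
    return 0;
}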
*/ -#define DO_MULL(dest, x, y, type1, type2) do { \ - type1 tmp_x = x; \ - type1 tmp_y = y; \ - dest = (type2)((type2)tmp_x * (type2)tmp_y); \ - } while(0) - -uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, uint8_t, uint16_t); - DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); - result |= tmp << 16; - DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); - result |= tmp << 32; - DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, int8_t, uint16_t); - DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); - result |= tmp << 16; - DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); - result |= tmp << 32; - DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); - result |= tmp << 48; - return result; -} - -uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, uint16_t, uint32_t); - DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) -{ - uint64_t tmp; - uint64_t result; - - DO_MULL(result, a, b, int16_t, uint32_t); - DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); - return result | (tmp << 32); -} - -uint64_t HELPER(neon_negl_u16)(uint64_t x) -{ - uint16_t tmp; - uint64_t result; - result = (uint16_t)-x; - tmp = -(x >> 16); - result |= (uint64_t)tmp << 16; - tmp = -(x >> 32); - result |= (uint64_t)tmp << 32; - tmp = -(x >> 48); - result |= (uint64_t)tmp << 48; - return result; -} - -uint64_t HELPER(neon_negl_u32)(uint64_t x) -{ - uint32_t low = -x; - uint32_t high = -(x >> 32); - return low | ((uint64_t)high << 32); -} - -/* Saturating sign manipulation. */ -/* ??? 
Make these use NEON_VOP1 */ -#define DO_QABS8(x) do { \ - if (x == (int8_t)0x80) { \ - x = 0x7f; \ - SET_QC(); \ - } else if (x < 0) { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) -{ - neon_s8 vec; - NEON_UNPACK(neon_s8, vec, x); - DO_QABS8(vec.v1); - DO_QABS8(vec.v2); - DO_QABS8(vec.v3); - DO_QABS8(vec.v4); - NEON_PACK(neon_s8, x, vec); - return x; -} -#undef DO_QABS8 - -#define DO_QNEG8(x) do { \ - if (x == (int8_t)0x80) { \ - x = 0x7f; \ - SET_QC(); \ - } else { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) -{ - neon_s8 vec; - NEON_UNPACK(neon_s8, vec, x); - DO_QNEG8(vec.v1); - DO_QNEG8(vec.v2); - DO_QNEG8(vec.v3); - DO_QNEG8(vec.v4); - NEON_PACK(neon_s8, x, vec); - return x; -} -#undef DO_QNEG8 - -#define DO_QABS16(x) do { \ - if (x == (int16_t)0x8000) { \ - x = 0x7fff; \ - SET_QC(); \ - } else if (x < 0) { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) -{ - neon_s16 vec; - NEON_UNPACK(neon_s16, vec, x); - DO_QABS16(vec.v1); - DO_QABS16(vec.v2); - NEON_PACK(neon_s16, x, vec); - return x; -} -#undef DO_QABS16 - -#define DO_QNEG16(x) do { \ - if (x == (int16_t)0x8000) { \ - x = 0x7fff; \ - SET_QC(); \ - } else { \ - x = -x; \ - }} while (0) -uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) -{ - neon_s16 vec; - NEON_UNPACK(neon_s16, vec, x); - DO_QNEG16(vec.v1); - DO_QNEG16(vec.v2); - NEON_PACK(neon_s16, x, vec); - return x; -} -#undef DO_QNEG16 - -uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) -{ - if (x == SIGNBIT) { - SET_QC(); - x = ~SIGNBIT; - } else if ((int32_t)x < 0) { - x = -x; - } - return x; -} - -uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) -{ - if (x == SIGNBIT) { - SET_QC(); - x = ~SIGNBIT; - } else { - x = -x; - } - return x; -} - -uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) -{ - if (x == SIGNBIT64) { - SET_QC(); - x = ~SIGNBIT64; - } else if ((int64_t)x < 0) { - x = -x; - } - return x; -} - -uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) -{ - if (x == SIGNBIT64) { - SET_QC(); - x = ~SIGNBIT64; - } else { - x = -x; - } - return x; -} - -/* NEON Float helpers. */ - -/* Floating point comparisons produce an integer result. - * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. - * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
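A small sketch of the 0/-1 convention described above: negating the 0/1 predicate yields the all-zeros/all-ones element mask Neon wants. This uses a plain host comparison instead of the softfloat routines, so it does not show the quiet-versus-signalling NaN behaviour the comment mentions; the names are invented.

#include <stdint.h>
#include <stdio.h>

static uint32_t cge_mask(float a, float b)
{
    return -(uint32_t)(a >= b);   /* 1 -> 0xffffffff, 0 -> 0x00000000 */
}

int main(void)
{
    printf("%08x %08x\n", cge_mask(2.0f, 1.0f), cge_mask(1.0f, 2.0f));
    return 0;
}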
- */ -uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); -} - -uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float32_le(make_float32(b), make_float32(a), fpst); -} - -uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - return -float32_lt(make_float32(b), make_float32(a), fpst); -} - -uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float32 f0 = float32_abs(make_float32(a)); - float32 f1 = float32_abs(make_float32(b)); - return -float32_le(f1, f0, fpst); -} - -uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float32 f0 = float32_abs(make_float32(a)); - float32 f1 = float32_abs(make_float32(b)); - return -float32_lt(f1, f0, fpst); -} - -uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float64 f0 = float64_abs(make_float64(a)); - float64 f1 = float64_abs(make_float64(b)); - return -float64_le(f1, f0, fpst); -} - -uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) -{ - float_status *fpst = fpstp; - float64 f0 = float64_abs(make_float64(a)); - float64 f1 = float64_abs(make_float64(b)); - return -float64_lt(f1, f0, fpst); -} - -#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) - -void HELPER(neon_qunzip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) - | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) - | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) - | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); - uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) - | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) - | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) - | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); - uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) - | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) - | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) - | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); - uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) - | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) - | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) - | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qunzip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) - | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); - uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) - | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); - uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) - | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); - uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) - | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qunzip32)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); - uint64_t d1 = 
ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); - uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); - uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_unzip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) - | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) - | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) - | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); - uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) - | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) - | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) - | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); - - rm[0] = m0; - rd[0] = d0; -} - -void HELPER(neon_unzip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) - | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); - uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) - | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); - - rm[0] = m0; - rd[0] = d0; -} - -void HELPER(neon_qzip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) - | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) - | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) - | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); - uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) - | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) - | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) - | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); - uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) - | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) - | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) - | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); - uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) - | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) - | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) - | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qzip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) - | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); - uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) - | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); - uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) - | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); - uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) - | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_qzip32)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd0 = rd[0], zd1 = rd[1]; - uint64_t zm0 = rm[0], zm1 = rm[1]; - - uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); - uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); - uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); - uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); - - rm[0] = m0; - rm[1] = m1; - rd[0] = d0; - rd[1] = d1; -} - -void HELPER(neon_zip8)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - 
uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) - | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) - | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) - | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); - uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) - | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) - | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) - | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); - - rm[0] = m0; - rd[0] = d0; -} - -void HELPER(neon_zip16)(void *vd, void *vm) -{ - uint64_t *rd = vd, *rm = vm; - uint64_t zd = rd[0], zm = rm[0]; - - uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) - | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); - uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) - | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); - - rm[0] = m0; - rd[0] = d0; -} diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c deleted file mode 100644 index 3baf8004f6..0000000000 --- a/target/arm/op_helper.c +++ /dev/null @@ -1,1069 +0,0 @@ -/* - * ARM helper routines - * - * Copyright (c) 2005-2007 CodeSourcery, LLC - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ -#include "qemu/osdep.h" -#include "qemu/main-loop.h" -#include "cpu.h" -#include "exec/helper-proto.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" -#include "cpregs.h" - -#define SIGNBIT (uint32_t)0x80000000 -#define SIGNBIT64 ((uint64_t)1 << 63) - -int exception_target_el(CPUARMState *env) -{ - int target_el = MAX(1, arm_current_el(env)); - - /* - * No such thing as secure EL1 if EL3 is aarch32, - * so update the target EL to EL3 in this case. - */ - if (arm_is_secure(env) && !arm_el_is_aa64(env, 3) && target_el == 1) { - target_el = 3; - } - - return target_el; -} - -void raise_exception(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el) -{ - CPUState *cs = env_cpu(env); - - if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { - /* - * Redirect NS EL1 exceptions to NS EL2. These are reported with - * their original syndrome register value, with the exception of - * SIMD/FP access traps, which are reported as uncategorized - * (see DDI0478C.a D1.10.4) - */ - target_el = 2; - if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) { - syndrome = syn_uncategorized(); - } - } - - assert(!excp_is_internal(excp)); - cs->exception_index = excp; - env->exception.syndrome = syndrome; - env->exception.target_el = target_el; - cpu_loop_exit(cs); -} - -void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, - uint32_t target_el, uintptr_t ra) -{ - CPUState *cs = env_cpu(env); - - /* - * restore_state_to_opc() will set env->exception.syndrome, so - * we must restore CPU state here before setting the syndrome - * the caller passed us, and cannot use cpu_loop_exit_restore(). 
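The routing rule of exception_target_el above, shown standalone with the CPU state flattened into plain parameters for the example (not QEMU code): an exception never targets an EL below 1, and when the CPU is secure with an AArch32 EL3 there is no Secure EL1, so EL1 targets are redirected to EL3.

#include <stdbool.h>
#include <stdio.h>

static int target_el_sketch(int cur_el, bool secure, bool el3_is_aa64)
{
    int target_el = cur_el > 1 ? cur_el : 1;   /* never below EL1 */

    if (secure && !el3_is_aa64 && target_el == 1) {
        target_el = 3;   /* no Secure EL1 when EL3 is AArch32 */
    }
    return target_el;
}

int main(void)
{
    printf("%d\n", target_el_sketch(0, false, true));  /* 1 */
    printf("%d\n", target_el_sketch(0, true, false));  /* 3 */
    printf("%d\n", target_el_sketch(2, true, false));  /* 2 */
    return 0;
}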
- */ - cpu_restore_state(cs, ra); - raise_exception(env, excp, syndrome, target_el); -} - -uint64_t HELPER(neon_tbl)(CPUARMState *env, uint32_t desc, - uint64_t ireg, uint64_t def) -{ - uint64_t tmp, val = 0; - uint32_t maxindex = ((desc & 3) + 1) * 8; - uint32_t base_reg = desc >> 2; - uint32_t shift, index, reg; - - for (shift = 0; shift < 64; shift += 8) { - index = (ireg >> shift) & 0xff; - if (index < maxindex) { - reg = base_reg + (index >> 3); - tmp = *aa32_vfp_dreg(env, reg); - tmp = ((tmp >> ((index & 7) << 3)) & 0xff) << shift; - } else { - tmp = def & (0xffull << shift); - } - val |= tmp; - } - return val; -} - -void HELPER(v8m_stackcheck)(CPUARMState *env, uint32_t newvalue) -{ - /* - * Perform the v8M stack limit check for SP updates from translated code, - * raising an exception if the limit is breached. - */ - if (newvalue < v7m_sp_limit(env)) { - /* - * Stack limit exceptions are a rare case, so rather than syncing - * PC/condbits before the call, we use raise_exception_ra() so - * that cpu_restore_state() will sort them out. - */ - raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); - } -} - -uint32_t HELPER(add_setq)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) - env->QF = 1; - return res; -} - -uint32_t HELPER(add_saturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { - env->QF = 1; - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint32_t HELPER(sub_saturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { - env->QF = 1; - res = ~(((int32_t)a >> 31) ^ SIGNBIT); - } - return res; -} - -uint32_t HELPER(add_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a + b; - if (res < a) { - env->QF = 1; - res = ~0; - } - return res; -} - -uint32_t HELPER(sub_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) -{ - uint32_t res = a - b; - if (res > a) { - env->QF = 1; - res = 0; - } - return res; -} - -/* Signed saturation. */ -static inline uint32_t do_ssat(CPUARMState *env, int32_t val, int shift) -{ - int32_t top; - uint32_t mask; - - top = val >> shift; - mask = (1u << shift) - 1; - if (top > 0) { - env->QF = 1; - return mask; - } else if (top < -1) { - env->QF = 1; - return ~mask; - } - return val; -} - -/* Unsigned saturation. */ -static inline uint32_t do_usat(CPUARMState *env, int32_t val, int shift) -{ - uint32_t max; - - max = (1u << shift) - 1; - if (val < 0) { - env->QF = 1; - return 0; - } else if (val > max) { - env->QF = 1; - return max; - } - return val; -} - -/* Signed saturate. */ -uint32_t HELPER(ssat)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - return do_ssat(env, x, shift); -} - -/* Dual halfword signed saturate. */ -uint32_t HELPER(ssat16)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - uint32_t res; - - res = (uint16_t)do_ssat(env, (int16_t)x, shift); - res |= do_ssat(env, ((int32_t)x) >> 16, shift) << 16; - return res; -} - -/* Unsigned saturate. */ -uint32_t HELPER(usat)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - return do_usat(env, x, shift); -} - -/* Dual halfword unsigned saturate. 
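A standalone sketch of the do_ssat test above (names invented, the QF update omitted): shifting the value right by 'shift' reveals whether it already fits in a (shift + 1)-bit signed integer, and if not it is pinned to the nearest end of that range.

#include <stdint.h>
#include <stdio.h>

static int32_t ssat_sketch(int32_t val, int shift)   /* 0 < shift < 31 */
{
    int32_t top = val >> shift;    /* 0 or -1 exactly when val already fits */
    int32_t max = (1 << shift) - 1;
    int32_t min = -(1 << shift);

    if (top > 0) {
        return max;                /* too large; the real helper also sets QF */
    } else if (top < -1) {
        return min;                /* too small; likewise sets QF */
    }
    return val;
}

int main(void)
{
    /* saturate to 8 signed bits (shift = 7): range is [-128, 127] */
    printf("%d %d %d\n", ssat_sketch(300, 7), ssat_sketch(-300, 7),
           ssat_sketch(100, 7));
    return 0;
}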
*/ -uint32_t HELPER(usat16)(CPUARMState *env, uint32_t x, uint32_t shift) -{ - uint32_t res; - - res = (uint16_t)do_usat(env, (int16_t)x, shift); - res |= do_usat(env, ((int32_t)x) >> 16, shift) << 16; - return res; -} - -void HELPER(setend)(CPUARMState *env) -{ - env->uncached_cpsr ^= CPSR_E; - arm_rebuild_hflags(env); -} - -void HELPER(check_bxj_trap)(CPUARMState *env, uint32_t rm) -{ - /* - * Only called if in NS EL0 or EL1 for a BXJ for a v7A CPU; - * check if HSTR.TJDBX means we need to trap to EL2. - */ - if (env->cp15.hstr_el2 & HSTR_TJDBX) { - /* - * We know the condition code check passed, so take the IMPDEF - * choice to always report CV=1 COND 0xe - */ - uint32_t syn = syn_bxjtrap(1, 0xe, rm); - raise_exception_ra(env, EXCP_HYP_TRAP, syn, 2, GETPC()); - } -} - -#ifndef CONFIG_USER_ONLY -/* Function checks whether WFx (WFI/WFE) instructions are set up to be trapped. - * The function returns the target EL (1-3) if the instruction is to be trapped; - * otherwise it returns 0 indicating it is not trapped. - */ -static inline int check_wfx_trap(CPUARMState *env, bool is_wfe) -{ - int cur_el = arm_current_el(env); - uint64_t mask; - - if (arm_feature(env, ARM_FEATURE_M)) { - /* M profile cores can never trap WFI/WFE. */ - return 0; - } - - /* If we are currently in EL0 then we need to check if SCTLR is set up for - * WFx instructions being trapped to EL1. These trap bits don't exist in v7. - */ - if (cur_el < 1 && arm_feature(env, ARM_FEATURE_V8)) { - int target_el; - - mask = is_wfe ? SCTLR_nTWE : SCTLR_nTWI; - if (arm_is_secure_below_el3(env) && !arm_el_is_aa64(env, 3)) { - /* Secure EL0 and Secure PL1 is at EL3 */ - target_el = 3; - } else { - target_el = 1; - } - - if (!(env->cp15.sctlr_el[target_el] & mask)) { - return target_el; - } - } - - /* We are not trapping to EL1; trap to EL2 if HCR_EL2 requires it - * No need for ARM_FEATURE check as if HCR_EL2 doesn't exist the - * bits will be zero indicating no trap. - */ - if (cur_el < 2) { - mask = is_wfe ? HCR_TWE : HCR_TWI; - if (arm_hcr_el2_eff(env) & mask) { - return 2; - } - } - - /* We are not trapping to EL1 or EL2; trap to EL3 if SCR_EL3 requires it */ - if (cur_el < 3) { - mask = (is_wfe) ? SCR_TWE : SCR_TWI; - if (env->cp15.scr_el3 & mask) { - return 3; - } - } - - return 0; -} -#endif - -void HELPER(wfi)(CPUARMState *env, uint32_t insn_len) -{ -#ifdef CONFIG_USER_ONLY - /* - * WFI in the user-mode emulator is technically permitted but not - * something any real-world code would do. AArch64 Linux kernels - * trap it via SCTRL_EL1.nTWI and make it an (expensive) NOP; - * AArch32 kernels don't trap it so it will delay a bit. - * For QEMU, make it NOP here, because trying to raise EXCP_HLT - * would trigger an abort. - */ - return; -#else - CPUState *cs = env_cpu(env); - int target_el = check_wfx_trap(env, false); - - if (cpu_has_work(cs)) { - /* Don't bother to go into our "low power state" if - * we would just wake up immediately. - */ - return; - } - - if (target_el) { - if (env->aarch64) { - env->pc -= insn_len; - } else { - env->regs[15] -= insn_len; - } - - raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2), - target_el); - } - - cs->exception_index = EXCP_HLT; - cs->halted = 1; - cpu_loop_exit(cs); -#endif -} - -void HELPER(wfe)(CPUARMState *env) -{ - /* This is a hint instruction that is semantically different - * from YIELD even though we currently implement it identically. - * Don't actually halt the CPU, just yield back to top - * level loop. 
This is not going into a "low power state" - * (ie halting until some event occurs), so we never take - * a configurable trap to a different exception level. - */ - HELPER(yield)(env); -} - -void HELPER(yield)(CPUARMState *env) -{ - CPUState *cs = env_cpu(env); - - /* This is a non-trappable hint instruction that generally indicates - * that the guest is currently busy-looping. Yield control back to the - * top level loop so that a more deserving VCPU has a chance to run. - */ - cs->exception_index = EXCP_YIELD; - cpu_loop_exit(cs); -} - -/* Raise an internal-to-QEMU exception. This is limited to only - * those EXCP values which are special cases for QEMU to interrupt - * execution and not to be used for exceptions which are passed to - * the guest (those must all have syndrome information and thus should - * use exception_with_syndrome*). - */ -void HELPER(exception_internal)(CPUARMState *env, uint32_t excp) -{ - CPUState *cs = env_cpu(env); - - assert(excp_is_internal(excp)); - cs->exception_index = excp; - cpu_loop_exit(cs); -} - -/* Raise an exception with the specified syndrome register value */ -void HELPER(exception_with_syndrome_el)(CPUARMState *env, uint32_t excp, - uint32_t syndrome, uint32_t target_el) -{ - raise_exception(env, excp, syndrome, target_el); -} - -/* - * Raise an exception with the specified syndrome register value - * to the default target el. - */ -void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp, - uint32_t syndrome) -{ - raise_exception(env, excp, syndrome, exception_target_el(env)); -} - -uint32_t HELPER(cpsr_read)(CPUARMState *env) -{ - return cpsr_read(env) & ~CPSR_EXEC; -} - -void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask) -{ - cpsr_write(env, val, mask, CPSRWriteByInstr); - /* TODO: Not all cpsr bits are relevant to hflags. */ - arm_rebuild_hflags(env); -} - -/* Write the CPSR for a 32-bit exception return */ -void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val) -{ - uint32_t mask; - - qemu_mutex_lock_iothread(); - arm_call_pre_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); - - mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar); - cpsr_write(env, val, mask, CPSRWriteExceptionReturn); - - /* Generated code has already stored the new PC value, but - * without masking out its low bits, because which bits need - * masking depends on whether we're returning to Thumb or ARM - * state. Do the masking now. - */ - env->regs[15] &= (env->thumb ? ~1 : ~3); - arm_rebuild_hflags(env); - - qemu_mutex_lock_iothread(); - arm_call_el_change_hook(env_archcpu(env)); - qemu_mutex_unlock_iothread(); -} - -/* Access to user mode registers from privileged modes. 
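The PC masking in cpsr_write_eret above in concrete form (not QEMU code): the low bits of the restored PC are cleared according to the state being returned to, since Thumb instructions are 2-byte aligned and Arm instructions 4-byte aligned.

#include <stdint.h>
#include <stdio.h>

static uint32_t mask_return_pc(uint32_t pc, int thumb)
{
    return pc & (thumb ? ~1u : ~3u);
}

int main(void)
{
    printf("%08x %08x\n",
           mask_return_pc(0x00008003u, 1),   /* 00008002 for Thumb */
           mask_return_pc(0x00008003u, 0));  /* 00008000 for Arm   */
    return 0;
}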
*/ -uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno) -{ - uint32_t val; - - if (regno == 13) { - val = env->banked_r13[BANK_USRSYS]; - } else if (regno == 14) { - val = env->banked_r14[BANK_USRSYS]; - } else if (regno >= 8 - && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { - val = env->usr_regs[regno - 8]; - } else { - val = env->regs[regno]; - } - return val; -} - -void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val) -{ - if (regno == 13) { - env->banked_r13[BANK_USRSYS] = val; - } else if (regno == 14) { - env->banked_r14[BANK_USRSYS] = val; - } else if (regno >= 8 - && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { - env->usr_regs[regno - 8] = val; - } else { - env->regs[regno] = val; - } -} - -void HELPER(set_r13_banked)(CPUARMState *env, uint32_t mode, uint32_t val) -{ - if ((env->uncached_cpsr & CPSR_M) == mode) { - env->regs[13] = val; - } else { - env->banked_r13[bank_number(mode)] = val; - } -} - -uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode) -{ - if ((env->uncached_cpsr & CPSR_M) == ARM_CPU_MODE_SYS) { - /* SRS instruction is UNPREDICTABLE from System mode; we UNDEF. - * Other UNPREDICTABLE and UNDEF cases were caught at translate time. - */ - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } - - if ((env->uncached_cpsr & CPSR_M) == mode) { - return env->regs[13]; - } else { - return env->banked_r13[bank_number(mode)]; - } -} - -static void msr_mrs_banked_exc_checks(CPUARMState *env, uint32_t tgtmode, - uint32_t regno) -{ - /* Raise an exception if the requested access is one of the UNPREDICTABLE - * cases; otherwise return. This broadly corresponds to the pseudocode - * BankedRegisterAccessValid() and SPSRAccessValid(), - * except that we have already handled some cases at translate time. - */ - int curmode = env->uncached_cpsr & CPSR_M; - - if (regno == 17) { - /* ELR_Hyp: a special case because access from tgtmode is OK */ - if (curmode != ARM_CPU_MODE_HYP && curmode != ARM_CPU_MODE_MON) { - goto undef; - } - return; - } - - if (curmode == tgtmode) { - goto undef; - } - - if (tgtmode == ARM_CPU_MODE_USR) { - switch (regno) { - case 8 ... 12: - if (curmode != ARM_CPU_MODE_FIQ) { - goto undef; - } - break; - case 13: - if (curmode == ARM_CPU_MODE_SYS) { - goto undef; - } - break; - case 14: - if (curmode == ARM_CPU_MODE_HYP || curmode == ARM_CPU_MODE_SYS) { - goto undef; - } - break; - default: - break; - } - } - - if (tgtmode == ARM_CPU_MODE_HYP) { - /* SPSR_Hyp, r13_hyp: accessible from Monitor mode only */ - if (curmode != ARM_CPU_MODE_MON) { - goto undef; - } - } - - return; - -undef: - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); -} - -void HELPER(msr_banked)(CPUARMState *env, uint32_t value, uint32_t tgtmode, - uint32_t regno) -{ - msr_mrs_banked_exc_checks(env, tgtmode, regno); - - switch (regno) { - case 16: /* SPSRs */ - env->banked_spsr[bank_number(tgtmode)] = value; - break; - case 17: /* ELR_Hyp */ - env->elr_el[2] = value; - break; - case 13: - env->banked_r13[bank_number(tgtmode)] = value; - break; - case 14: - env->banked_r14[r14_bank_number(tgtmode)] = value; - break; - case 8 ... 
12: - switch (tgtmode) { - case ARM_CPU_MODE_USR: - env->usr_regs[regno - 8] = value; - break; - case ARM_CPU_MODE_FIQ: - env->fiq_regs[regno - 8] = value; - break; - default: - g_assert_not_reached(); - } - break; - default: - g_assert_not_reached(); - } -} - -uint32_t HELPER(mrs_banked)(CPUARMState *env, uint32_t tgtmode, uint32_t regno) -{ - msr_mrs_banked_exc_checks(env, tgtmode, regno); - - switch (regno) { - case 16: /* SPSRs */ - return env->banked_spsr[bank_number(tgtmode)]; - case 17: /* ELR_Hyp */ - return env->elr_el[2]; - case 13: - return env->banked_r13[bank_number(tgtmode)]; - case 14: - return env->banked_r14[r14_bank_number(tgtmode)]; - case 8 ... 12: - switch (tgtmode) { - case ARM_CPU_MODE_USR: - return env->usr_regs[regno - 8]; - case ARM_CPU_MODE_FIQ: - return env->fiq_regs[regno - 8]; - default: - g_assert_not_reached(); - } - default: - g_assert_not_reached(); - } -} - -const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key, - uint32_t syndrome, uint32_t isread) -{ - ARMCPU *cpu = env_archcpu(env); - const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); - CPAccessResult res = CP_ACCESS_OK; - int target_el; - - assert(ri != NULL); - - if (arm_feature(env, ARM_FEATURE_XSCALE) && ri->cp < 14 - && extract32(env->cp15.c15_cpar, ri->cp, 1) == 0) { - res = CP_ACCESS_TRAP; - goto fail; - } - - if (ri->accessfn) { - res = ri->accessfn(env, ri, isread); - } - - /* - * If the access function indicates a trap from EL0 to EL1 then - * that always takes priority over the HSTR_EL2 trap. (If it indicates - * a trap to EL3, then the HSTR_EL2 trap takes priority; if it indicates - * a trap to EL2, then the syndrome is the same either way so we don't - * care whether technically the architecture says that HSTR_EL2 trap or - * the other trap takes priority. So we take the "check HSTR_EL2" path - * for all of those cases.) - */ - if (res != CP_ACCESS_OK && ((res & CP_ACCESS_EL_MASK) == 0) && - arm_current_el(env) == 0) { - goto fail; - } - - /* - * HSTR_EL2 traps from EL1 are checked earlier, in generated code; - * we only need to check here for traps from EL0. - */ - if (!is_a64(env) && arm_current_el(env) == 0 && ri->cp == 15 && - arm_is_el2_enabled(env) && - (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { - uint32_t mask = 1 << ri->crn; - - if (ri->type & ARM_CP_64BIT) { - mask = 1 << ri->crm; - } - - /* T4 and T14 are RES0 */ - mask &= ~((1 << 4) | (1 << 14)); - - if (env->cp15.hstr_el2 & mask) { - res = CP_ACCESS_TRAP_EL2; - goto fail; - } - } - - /* - * Fine-grained traps also are lower priority than undef-to-EL1, - * higher priority than trap-to-EL3, and we don't care about priority - * order with other EL2 traps because the syndrome value is the same. 
- */ - if (arm_fgt_active(env, arm_current_el(env))) { - uint64_t trapword = 0; - unsigned int idx = FIELD_EX32(ri->fgt, FGT, IDX); - unsigned int bitpos = FIELD_EX32(ri->fgt, FGT, BITPOS); - bool rev = FIELD_EX32(ri->fgt, FGT, REV); - bool trapbit; - - if (ri->fgt & FGT_EXEC) { - assert(idx < ARRAY_SIZE(env->cp15.fgt_exec)); - trapword = env->cp15.fgt_exec[idx]; - } else if (isread && (ri->fgt & FGT_R)) { - assert(idx < ARRAY_SIZE(env->cp15.fgt_read)); - trapword = env->cp15.fgt_read[idx]; - } else if (!isread && (ri->fgt & FGT_W)) { - assert(idx < ARRAY_SIZE(env->cp15.fgt_write)); - trapword = env->cp15.fgt_write[idx]; - } - - trapbit = extract64(trapword, bitpos, 1); - if (trapbit != rev) { - res = CP_ACCESS_TRAP_EL2; - goto fail; - } - } - - if (likely(res == CP_ACCESS_OK)) { - return ri; - } - - fail: - switch (res & ~CP_ACCESS_EL_MASK) { - case CP_ACCESS_TRAP: - break; - case CP_ACCESS_TRAP_UNCATEGORIZED: - /* Only CP_ACCESS_TRAP traps are direct to a specified EL */ - assert((res & CP_ACCESS_EL_MASK) == 0); - if (cpu_isar_feature(aa64_ids, cpu) && isread && - arm_cpreg_in_idspace(ri)) { - /* - * FEAT_IDST says this should be reported as EC_SYSTEMREGISTERTRAP, - * not EC_UNCATEGORIZED - */ - break; - } - syndrome = syn_uncategorized(); - break; - default: - g_assert_not_reached(); - } - - target_el = res & CP_ACCESS_EL_MASK; - switch (target_el) { - case 0: - target_el = exception_target_el(env); - break; - case 2: - assert(arm_current_el(env) != 3); - assert(arm_is_el2_enabled(env)); - break; - case 3: - assert(arm_feature(env, ARM_FEATURE_EL3)); - break; - default: - /* No "direct" traps to EL1 */ - g_assert_not_reached(); - } - - raise_exception(env, EXCP_UDEF, syndrome, target_el); -} - -const void *HELPER(lookup_cp_reg)(CPUARMState *env, uint32_t key) -{ - ARMCPU *cpu = env_archcpu(env); - const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); - - assert(ri != NULL); - return ri; -} - -void HELPER(set_cp_reg)(CPUARMState *env, const void *rip, uint32_t value) -{ - const ARMCPRegInfo *ri = rip; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - ri->writefn(env, ri, value); - qemu_mutex_unlock_iothread(); - } else { - ri->writefn(env, ri, value); - } -} - -uint32_t HELPER(get_cp_reg)(CPUARMState *env, const void *rip) -{ - const ARMCPRegInfo *ri = rip; - uint32_t res; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - res = ri->readfn(env, ri); - qemu_mutex_unlock_iothread(); - } else { - res = ri->readfn(env, ri); - } - - return res; -} - -void HELPER(set_cp_reg64)(CPUARMState *env, const void *rip, uint64_t value) -{ - const ARMCPRegInfo *ri = rip; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - ri->writefn(env, ri, value); - qemu_mutex_unlock_iothread(); - } else { - ri->writefn(env, ri, value); - } -} - -uint64_t HELPER(get_cp_reg64)(CPUARMState *env, const void *rip) -{ - const ARMCPRegInfo *ri = rip; - uint64_t res; - - if (ri->type & ARM_CP_IO) { - qemu_mutex_lock_iothread(); - res = ri->readfn(env, ri); - qemu_mutex_unlock_iothread(); - } else { - res = ri->readfn(env, ri); - } - - return res; -} - -void HELPER(pre_hvc)(CPUARMState *env) -{ - ARMCPU *cpu = env_archcpu(env); - int cur_el = arm_current_el(env); - /* FIXME: Use actual secure state. */ - bool secure = false; - bool undef; - - if (arm_is_psci_call(cpu, EXCP_HVC)) { - /* If PSCI is enabled and this looks like a valid PSCI call then - * that overrides the architecturally mandated HVC behaviour. 
- */ - return; - } - - if (!arm_feature(env, ARM_FEATURE_EL2)) { - /* If EL2 doesn't exist, HVC always UNDEFs */ - undef = true; - } else if (arm_feature(env, ARM_FEATURE_EL3)) { - /* EL3.HCE has priority over EL2.HCD. */ - undef = !(env->cp15.scr_el3 & SCR_HCE); - } else { - undef = env->cp15.hcr_el2 & HCR_HCD; - } - - /* In ARMv7 and ARMv8/AArch32, HVC is undef in secure state. - * For ARMv8/AArch64, HVC is allowed in EL3. - * Note that we've already trapped HVC from EL0 at translation - * time. - */ - if (secure && (!is_a64(env) || cur_el == 1)) { - undef = true; - } - - if (undef) { - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } -} - -void HELPER(pre_smc)(CPUARMState *env, uint32_t syndrome) -{ - ARMCPU *cpu = env_archcpu(env); - int cur_el = arm_current_el(env); - bool secure = arm_is_secure(env); - bool smd_flag = env->cp15.scr_el3 & SCR_SMD; - - /* - * SMC behaviour is summarized in the following table. - * This helper handles the "Trap to EL2" and "Undef insn" cases. - * The "Trap to EL3" and "PSCI call" cases are handled in the exception - * helper. - * - * -> ARM_FEATURE_EL3 and !SMD - * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 - * - * Conduit SMC, valid call Trap to EL2 PSCI Call - * Conduit SMC, inval call Trap to EL2 Trap to EL3 - * Conduit not SMC Trap to EL2 Trap to EL3 - * - * - * -> ARM_FEATURE_EL3 and SMD - * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 - * - * Conduit SMC, valid call Trap to EL2 PSCI Call - * Conduit SMC, inval call Trap to EL2 Undef insn - * Conduit not SMC Trap to EL2 Undef insn - * - * - * -> !ARM_FEATURE_EL3 - * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 - * - * Conduit SMC, valid call Trap to EL2 PSCI Call - * Conduit SMC, inval call Trap to EL2 Undef insn - * Conduit not SMC Undef insn Undef insn - */ - - /* On ARMv8 with EL3 AArch64, SMD applies to both S and NS state. - * On ARMv8 with EL3 AArch32, or ARMv7 with the Virtualization - * extensions, SMD only applies to NS state. - * On ARMv7 without the Virtualization extensions, the SMD bit - * doesn't exist, but we forbid the guest to set it to 1 in scr_write(), - * so we need not special case this here. - */ - bool smd = arm_feature(env, ARM_FEATURE_AARCH64) ? smd_flag - : smd_flag && !secure; - - if (!arm_feature(env, ARM_FEATURE_EL3) && - cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) { - /* If we have no EL3 then SMC always UNDEFs and can't be - * trapped to EL2. PSCI-via-SMC is a sort of ersatz EL3 - * firmware within QEMU, and we want an EL2 guest to be able - * to forbid its EL1 from making PSCI calls into QEMU's - * "firmware" via HCR.TSC, so for these purposes treat - * PSCI-via-SMC as implying an EL3. - * This handles the very last line of the previous table. - */ - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } - - if (cur_el == 1 && (arm_hcr_el2_eff(env) & HCR_TSC)) { - /* In NS EL1, HCR controlled routing to EL2 has priority over SMD. - * We also want an EL2 guest to be able to forbid its EL1 from - * making PSCI calls into QEMU's "firmware" via HCR.TSC. - * This handles all the "Trap to EL2" cases of the previous table. - */ - raise_exception(env, EXCP_HYP_TRAP, syndrome, 2); - } - - /* Catch the two remaining "Undef insn" cases of the previous table: - * - PSCI conduit is SMC but we don't have a valid PCSI call, - * - We don't have EL3 or SMD is set. 
- */ - if (!arm_is_psci_call(cpu, EXCP_SMC) && - (smd || !arm_feature(env, ARM_FEATURE_EL3))) { - raise_exception(env, EXCP_UDEF, syn_uncategorized(), - exception_target_el(env)); - } -} - -/* ??? Flag setting arithmetic is awkward because we need to do comparisons. - The only way to do that in TCG is a conditional branch, which clobbers - all our temporaries. For now implement these as helper functions. */ - -/* Similarly for variable shift instructions. */ - -uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) { - if (shift == 32) - env->CF = x & 1; - else - env->CF = 0; - return 0; - } else if (shift != 0) { - env->CF = (x >> (32 - shift)) & 1; - return x << shift; - } - return x; -} - -uint32_t HELPER(shr_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) { - if (shift == 32) - env->CF = (x >> 31) & 1; - else - env->CF = 0; - return 0; - } else if (shift != 0) { - env->CF = (x >> (shift - 1)) & 1; - return x >> shift; - } - return x; -} - -uint32_t HELPER(sar_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift = i & 0xff; - if (shift >= 32) { - env->CF = (x >> 31) & 1; - return (int32_t)x >> 31; - } else if (shift != 0) { - env->CF = (x >> (shift - 1)) & 1; - return (int32_t)x >> shift; - } - return x; -} - -uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i) -{ - int shift1, shift; - shift1 = i & 0xff; - shift = shift1 & 0x1f; - if (shift == 0) { - if (shift1 != 0) - env->CF = (x >> 31) & 1; - return x; - } else { - env->CF = (x >> (shift - 1)) & 1; - return ((uint32_t)x >> shift) | (x << (32 - shift)); - } -} - -void HELPER(probe_access)(CPUARMState *env, target_ulong ptr, - uint32_t access_type, uint32_t mmu_idx, - uint32_t size) -{ - uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE); - uintptr_t ra = GETPC(); - - if (likely(size <= in_page)) { - probe_access(env, ptr, size, access_type, mmu_idx, ra); - } else { - probe_access(env, ptr, in_page, access_type, mmu_idx, ra); - probe_access(env, ptr + in_page, size - in_page, - access_type, mmu_idx, ra); - } -} - -/* - * This function corresponds to AArch64.vESBOperation(). - * Note that the AArch32 version is not functionally different. - */ -void HELPER(vesb)(CPUARMState *env) -{ - /* - * The EL2Enabled() check is done inside arm_hcr_el2_eff, - * and will return HCR_EL2.VSE == 0, so nothing happens. - */ - uint64_t hcr = arm_hcr_el2_eff(env); - bool enabled = !(hcr & HCR_TGE) && (hcr & HCR_AMO); - bool pending = enabled && (hcr & HCR_VSE); - bool masked = (env->daif & PSTATE_A); - - /* If VSE pending and masked, defer the exception. */ - if (pending && masked) { - uint32_t syndrome; - - if (arm_el_is_aa64(env, 1)) { - /* Copy across IDS and ISS from VSESR. */ - syndrome = env->cp15.vsesr_el2 & 0x1ffffff; - } else { - ARMMMUFaultInfo fi = { .type = ARMFault_AsyncExternal }; - - if (extended_addresses_enabled(env)) { - syndrome = arm_fi_to_lfsc(&fi); - } else { - syndrome = arm_fi_to_sfsc(&fi); - } - /* Copy across AET and ExT from VSESR. */ - syndrome |= env->cp15.vsesr_el2 & 0xd000; - } - - /* Set VDISR_EL2.A along with the syndrome. 
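     * (VDISR_EL2.A is bit 31, hence the 1u << 31 OR-ed in below.)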
*/ - env->cp15.vdisr_el2 = syndrome | (1u << 31); - - /* Clear pending virtual SError */ - env->cp15.hcr_el2 &= ~HCR_VSE; - cpu_reset_interrupt(env_cpu(env), CPU_INTERRUPT_VSERR); - } -} diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c deleted file mode 100644 index d0483bf051..0000000000 --- a/target/arm/pauth_helper.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * ARM v8.3-PAuth Operations - * - * Copyright (c) 2019 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/cpu_ldst.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "qemu/xxhash.h" - - -static uint64_t pac_cell_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= extract64(i, 52, 4); - o |= extract64(i, 24, 4) << 4; - o |= extract64(i, 44, 4) << 8; - o |= extract64(i, 0, 4) << 12; - - o |= extract64(i, 28, 4) << 16; - o |= extract64(i, 48, 4) << 20; - o |= extract64(i, 4, 4) << 24; - o |= extract64(i, 40, 4) << 28; - - o |= extract64(i, 32, 4) << 32; - o |= extract64(i, 12, 4) << 36; - o |= extract64(i, 56, 4) << 40; - o |= extract64(i, 20, 4) << 44; - - o |= extract64(i, 8, 4) << 48; - o |= extract64(i, 36, 4) << 52; - o |= extract64(i, 16, 4) << 56; - o |= extract64(i, 60, 4) << 60; - - return o; -} - -static uint64_t pac_cell_inv_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= extract64(i, 12, 4); - o |= extract64(i, 24, 4) << 4; - o |= extract64(i, 48, 4) << 8; - o |= extract64(i, 36, 4) << 12; - - o |= extract64(i, 56, 4) << 16; - o |= extract64(i, 44, 4) << 20; - o |= extract64(i, 4, 4) << 24; - o |= extract64(i, 16, 4) << 28; - - o |= i & MAKE_64BIT_MASK(32, 4); - o |= extract64(i, 52, 4) << 36; - o |= extract64(i, 28, 4) << 40; - o |= extract64(i, 8, 4) << 44; - - o |= extract64(i, 20, 4) << 48; - o |= extract64(i, 0, 4) << 52; - o |= extract64(i, 40, 4) << 56; - o |= i & MAKE_64BIT_MASK(60, 4); - - return o; -} - -static uint64_t pac_sub(uint64_t i) -{ - static const uint8_t sub[16] = { - 0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe, - 0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa, - }; - uint64_t o = 0; - int b; - - for (b = 0; b < 64; b += 4) { - o |= (uint64_t)sub[(i >> b) & 0xf] << b; - } - return o; -} - -static uint64_t pac_inv_sub(uint64_t i) -{ - static const uint8_t inv_sub[16] = { - 0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9, - 0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3, - }; - uint64_t o = 0; - int b; - - for (b = 0; b < 64; b += 4) { - o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; - } - return o; -} - -static int rot_cell(int cell, int n) -{ - /* 4-bit rotate left by n. 
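     * For example, rot_cell(0b0001, 1) == 0b0010 and rot_cell(0b1000, 1) == 0b0001.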
*/ - cell |= cell << 4; - return extract32(cell, 4 - n, 4); -} - -static uint64_t pac_mult(uint64_t i) -{ - uint64_t o = 0; - int b; - - for (b = 0; b < 4 * 4; b += 4) { - int i0, i4, i8, ic, t0, t1, t2, t3; - - i0 = extract64(i, b, 4); - i4 = extract64(i, b + 4 * 4, 4); - i8 = extract64(i, b + 8 * 4, 4); - ic = extract64(i, b + 12 * 4, 4); - - t0 = rot_cell(i8, 1) ^ rot_cell(i4, 2) ^ rot_cell(i0, 1); - t1 = rot_cell(ic, 1) ^ rot_cell(i4, 1) ^ rot_cell(i0, 2); - t2 = rot_cell(ic, 2) ^ rot_cell(i8, 1) ^ rot_cell(i0, 1); - t3 = rot_cell(ic, 1) ^ rot_cell(i8, 2) ^ rot_cell(i4, 1); - - o |= (uint64_t)t3 << b; - o |= (uint64_t)t2 << (b + 4 * 4); - o |= (uint64_t)t1 << (b + 8 * 4); - o |= (uint64_t)t0 << (b + 12 * 4); - } - return o; -} - -static uint64_t tweak_cell_rot(uint64_t cell) -{ - return (cell >> 1) | (((cell ^ (cell >> 1)) & 1) << 3); -} - -static uint64_t tweak_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= extract64(i, 16, 4) << 0; - o |= extract64(i, 20, 4) << 4; - o |= tweak_cell_rot(extract64(i, 24, 4)) << 8; - o |= extract64(i, 28, 4) << 12; - - o |= tweak_cell_rot(extract64(i, 44, 4)) << 16; - o |= extract64(i, 8, 4) << 20; - o |= extract64(i, 12, 4) << 24; - o |= tweak_cell_rot(extract64(i, 32, 4)) << 28; - - o |= extract64(i, 48, 4) << 32; - o |= extract64(i, 52, 4) << 36; - o |= extract64(i, 56, 4) << 40; - o |= tweak_cell_rot(extract64(i, 60, 4)) << 44; - - o |= tweak_cell_rot(extract64(i, 0, 4)) << 48; - o |= extract64(i, 4, 4) << 52; - o |= tweak_cell_rot(extract64(i, 40, 4)) << 56; - o |= tweak_cell_rot(extract64(i, 36, 4)) << 60; - - return o; -} - -static uint64_t tweak_cell_inv_rot(uint64_t cell) -{ - return ((cell << 1) & 0xf) | ((cell & 1) ^ (cell >> 3)); -} - -static uint64_t tweak_inv_shuffle(uint64_t i) -{ - uint64_t o = 0; - - o |= tweak_cell_inv_rot(extract64(i, 48, 4)); - o |= extract64(i, 52, 4) << 4; - o |= extract64(i, 20, 4) << 8; - o |= extract64(i, 24, 4) << 12; - - o |= extract64(i, 0, 4) << 16; - o |= extract64(i, 4, 4) << 20; - o |= tweak_cell_inv_rot(extract64(i, 8, 4)) << 24; - o |= extract64(i, 12, 4) << 28; - - o |= tweak_cell_inv_rot(extract64(i, 28, 4)) << 32; - o |= tweak_cell_inv_rot(extract64(i, 60, 4)) << 36; - o |= tweak_cell_inv_rot(extract64(i, 56, 4)) << 40; - o |= tweak_cell_inv_rot(extract64(i, 16, 4)) << 44; - - o |= extract64(i, 32, 4) << 48; - o |= extract64(i, 36, 4) << 52; - o |= extract64(i, 40, 4) << 56; - o |= tweak_cell_inv_rot(extract64(i, 44, 4)) << 60; - - return o; -} - -static uint64_t pauth_computepac_architected(uint64_t data, uint64_t modifier, - ARMPACKey key) -{ - static const uint64_t RC[5] = { - 0x0000000000000000ull, - 0x13198A2E03707344ull, - 0xA4093822299F31D0ull, - 0x082EFA98EC4E6C89ull, - 0x452821E638D01377ull, - }; - const uint64_t alpha = 0xC0AC29B7C97C50DDull; - /* - * Note that in the ARM pseudocode, key0 contains bits <127:64> - * and key1 contains bits <63:0> of the 128-bit key. 
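     * The RC[] round constants and alpha above are those of the QARMA block
     * cipher, on which the architected ComputePAC() is based.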
- */ - uint64_t key0 = key.hi, key1 = key.lo; - uint64_t workingval, runningmod, roundkey, modk0; - int i; - - modk0 = (key0 << 63) | ((key0 >> 1) ^ (key0 >> 63)); - runningmod = modifier; - workingval = data ^ key0; - - for (i = 0; i <= 4; ++i) { - roundkey = key1 ^ runningmod; - workingval ^= roundkey; - workingval ^= RC[i]; - if (i > 0) { - workingval = pac_cell_shuffle(workingval); - workingval = pac_mult(workingval); - } - workingval = pac_sub(workingval); - runningmod = tweak_shuffle(runningmod); - } - roundkey = modk0 ^ runningmod; - workingval ^= roundkey; - workingval = pac_cell_shuffle(workingval); - workingval = pac_mult(workingval); - workingval = pac_sub(workingval); - workingval = pac_cell_shuffle(workingval); - workingval = pac_mult(workingval); - workingval ^= key1; - workingval = pac_cell_inv_shuffle(workingval); - workingval = pac_inv_sub(workingval); - workingval = pac_mult(workingval); - workingval = pac_cell_inv_shuffle(workingval); - workingval ^= key0; - workingval ^= runningmod; - for (i = 0; i <= 4; ++i) { - workingval = pac_inv_sub(workingval); - if (i < 4) { - workingval = pac_mult(workingval); - workingval = pac_cell_inv_shuffle(workingval); - } - runningmod = tweak_inv_shuffle(runningmod); - roundkey = key1 ^ runningmod; - workingval ^= RC[4 - i]; - workingval ^= roundkey; - workingval ^= alpha; - } - workingval ^= modk0; - - return workingval; -} - -static uint64_t pauth_computepac_impdef(uint64_t data, uint64_t modifier, - ARMPACKey key) -{ - return qemu_xxhash64_4(data, modifier, key.lo, key.hi); -} - -static uint64_t pauth_computepac(CPUARMState *env, uint64_t data, - uint64_t modifier, ARMPACKey key) -{ - if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) { - return pauth_computepac_architected(data, modifier, key); - } else { - return pauth_computepac_impdef(data, modifier, key); - } -} - -static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier, - ARMPACKey *key, bool data) -{ - ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); - uint64_t pac, ext_ptr, ext, test; - int bot_bit, top_bit; - - /* If tagged pointers are in use, use ptr<55>, otherwise ptr<63>. */ - if (param.tbi) { - ext = sextract64(ptr, 55, 1); - } else { - ext = sextract64(ptr, 63, 1); - } - - /* Build a pointer with known good extension bits. */ - top_bit = 64 - 8 * param.tbi; - bot_bit = 64 - param.tsz; - ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext); - - pac = pauth_computepac(env, ext_ptr, modifier, *key); - - /* - * Check if the ptr has good extension bits and corrupt the - * pointer authentication code if not. - */ - test = sextract64(ptr, bot_bit, top_bit - bot_bit); - if (test != 0 && test != -1) { - /* - * Note that our top_bit is one greater than the pseudocode's - * version, hence "- 2" here. - */ - pac ^= MAKE_64BIT_MASK(top_bit - 2, 1); - } - - /* - * Preserve the determination between upper and lower at bit 55, - * and insert pointer authentication code. - */ - if (param.tbi) { - ptr &= ~MAKE_64BIT_MASK(bot_bit, 55 - bot_bit + 1); - pac &= MAKE_64BIT_MASK(bot_bit, 54 - bot_bit + 1); - } else { - ptr &= MAKE_64BIT_MASK(0, bot_bit); - pac &= ~(MAKE_64BIT_MASK(55, 1) | MAKE_64BIT_MASK(0, bot_bit)); - } - ext &= MAKE_64BIT_MASK(55, 1); - return pac | ext | ptr; -} - -static uint64_t pauth_original_ptr(uint64_t ptr, ARMVAParameters param) -{ - /* Note that bit 55 is used whether or not the regime has 2 ranges. 
*/ - uint64_t extfield = sextract64(ptr, 55, 1); - int bot_pac_bit = 64 - param.tsz; - int top_pac_bit = 64 - 8 * param.tbi; - - return deposit64(ptr, bot_pac_bit, top_pac_bit - bot_pac_bit, extfield); -} - -static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier, - ARMPACKey *key, bool data, int keynumber) -{ - ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); - int bot_bit, top_bit; - uint64_t pac, orig_ptr, test; - - orig_ptr = pauth_original_ptr(ptr, param); - pac = pauth_computepac(env, orig_ptr, modifier, *key); - bot_bit = 64 - param.tsz; - top_bit = 64 - 8 * param.tbi; - - test = (pac ^ ptr) & ~MAKE_64BIT_MASK(55, 1); - if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) { - int error_code = (keynumber << 1) | (keynumber ^ 1); - if (param.tbi) { - return deposit64(orig_ptr, 53, 2, error_code); - } else { - return deposit64(orig_ptr, 61, 2, error_code); - } - } - return orig_ptr; -} - -static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data) -{ - ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); - ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); - - return pauth_original_ptr(ptr, param); -} - -static G_NORETURN -void pauth_trap(CPUARMState *env, int target_el, uintptr_t ra) -{ - raise_exception_ra(env, EXCP_UDEF, syn_pactrap(), target_el, ra); -} - -static void pauth_check_trap(CPUARMState *env, int el, uintptr_t ra) -{ - if (el < 2 && arm_is_el2_enabled(env)) { - uint64_t hcr = arm_hcr_el2_eff(env); - bool trap = !(hcr & HCR_API); - if (el == 0) { - /* Trap only applies to EL1&0 regime. */ - trap &= (hcr & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE); - } - /* FIXME: ARMv8.3-NV: HCR_NV trap takes precedence for ERETA[AB]. 
*/ - if (trap) { - pauth_trap(env, 2, ra); - } - } - if (el < 3 && arm_feature(env, ARM_FEATURE_EL3)) { - if (!(env->cp15.scr_el3 & SCR_API)) { - pauth_trap(env, 3, ra); - } - } -} - -static bool pauth_key_enabled(CPUARMState *env, int el, uint32_t bit) -{ - return (arm_sctlr(env, el) & bit) != 0; -} - -uint64_t HELPER(pacia)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apia, false); -} - -uint64_t HELPER(pacib)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apib, false); -} - -uint64_t HELPER(pacda)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apda, true); -} - -uint64_t HELPER(pacdb)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_addpac(env, x, y, &env->keys.apdb, true); -} - -uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y) -{ - uint64_t pac; - - pauth_check_trap(env, arm_current_el(env), GETPC()); - pac = pauth_computepac(env, x, y, env->keys.apga); - - return pac & 0xffffffff00000000ull; -} - -uint64_t HELPER(autia)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apia, false, 0); -} - -uint64_t HELPER(autib)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apib, false, 1); -} - -uint64_t HELPER(autda)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apda, true, 0); -} - -uint64_t HELPER(autdb)(CPUARMState *env, uint64_t x, uint64_t y) -{ - int el = arm_current_el(env); - if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { - return x; - } - pauth_check_trap(env, el, GETPC()); - return pauth_auth(env, x, y, &env->keys.apdb, true, 1); -} - -uint64_t HELPER(xpaci)(CPUARMState *env, uint64_t a) -{ - return pauth_strip(env, a, false); -} - -uint64_t HELPER(xpacd)(CPUARMState *env, uint64_t a) -{ - return pauth_strip(env, a, true); -} diff --git a/target/arm/sme_helper.c b/target/arm/sme_helper.c deleted file mode 100644 index 1e67fcac30..0000000000 --- a/target/arm/sme_helper.c +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * ARM SME Operations - * - * Copyright (c) 2022 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "tcg/tcg-gvec-desc.h" -#include "exec/helper-proto.h" -#include "exec/cpu_ldst.h" -#include "exec/exec-all.h" -#include "qemu/int128.h" -#include "fpu/softfloat.h" -#include "vec_internal.h" -#include "sve_ldst_internal.h" - -void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask) -{ - aarch64_set_svcr(env, val, mask); -} - -void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl) -{ - uint32_t i; - - /* - * Special case clearing the entire ZA space. - * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any - * parts of the ZA storage outside of SVL. - */ - if (imm == 0xff) { - memset(env->zarray, 0, sizeof(env->zarray)); - return; - } - - /* - * Recall that ZAnH.D[m] is spread across ZA[n+8*m], - * so each row is discontiguous within ZA[]. - */ - for (i = 0; i < svl; i++) { - if (imm & (1 << (i % 8))) { - memset(&env->zarray[i], 0, svl); - } - } -} - - -/* - * When considering the ZA storage as an array of elements of - * type T, the index within that array of the Nth element of - * a vertical slice of a tile can be calculated like this, - * regardless of the size of type T. This is because the tiles - * are interleaved, so if type T is size N bytes then row 1 of - * the tile is N rows away from row 0. The division by N to - * convert a byte offset into an array index and the multiplication - * by N to convert from vslice-index-within-the-tile to - * the index within the ZA storage cancel out. - */ -#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg)) - -/* - * When doing byte arithmetic on the ZA storage, the element - * byteoff bytes away in a tile vertical slice is always this - * many bytes away in the ZA storage, regardless of the - * size of the tile element, assuming that byteoff is a multiple - * of the element size. Again this is because of the interleaving - * of the tiles. For instance if we have 1 byte per element then - * each row of the ZA storage has one byte of the vslice data, - * and (counting from 0) byte 8 goes in row 8 of the storage - * at offset (8 * row-size-in-bytes). - * If we have 8 bytes per element then each row of the ZA storage - * has 8 bytes of the data, but there are 8 interleaved tiles and - * so byte 8 of the data goes into row 1 of the tile, - * which is again row 8 of the storage, so the offset is still - * (8 * row-size-in-bytes). Similarly for other element sizes. - */ -#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg)) - - -/* - * Move Zreg vector to ZArray column. 
- */ -#define DO_MOVA_C(NAME, TYPE, H) \ -void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \ -{ \ - int i, oprsz = simd_oprsz(desc); \ - for (i = 0; i < oprsz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \ - } \ - i += sizeof(TYPE); \ - pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -DO_MOVA_C(sme_mova_cz_b, uint8_t, H1) -DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2) -DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4) - -void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 8; - uint8_t *pg = vg; - uint64_t *n = vn; - uint64_t *a = za; - - for (i = 0; i < oprsz; i++) { - if (pg[H1(i)] & 1) { - a[tile_vslice_index(i)] = n[i]; - } - } -} - -void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 16; - uint16_t *pg = vg; - Int128 *n = vn; - Int128 *a = za; - - /* - * Int128 is used here simply to copy 16 bytes, and to simplify - * the address arithmetic. - */ - for (i = 0; i < oprsz; i++) { - if (pg[H2(i)] & 1) { - a[tile_vslice_index(i)] = n[i]; - } - } -} - -#undef DO_MOVA_C - -/* - * Move ZArray column to Zreg vector. - */ -#define DO_MOVA_Z(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \ -{ \ - int i, oprsz = simd_oprsz(desc); \ - for (i = 0; i < oprsz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \ - } \ - i += sizeof(TYPE); \ - pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1) -DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2) -DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4) - -void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 8; - uint8_t *pg = vg; - uint64_t *d = vd; - uint64_t *a = za; - - for (i = 0; i < oprsz; i++) { - if (pg[H1(i)] & 1) { - d[i] = a[tile_vslice_index(i)]; - } - } -} - -void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc) -{ - int i, oprsz = simd_oprsz(desc) / 16; - uint16_t *pg = vg; - Int128 *d = vd; - Int128 *a = za; - - /* - * Int128 is used here simply to copy 16 bytes, and to simplify - * the address arithmetic. - */ - for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) { - if (pg[H2(i)] & 1) { - d[i] = a[tile_vslice_index(i)]; - } - } -} - -#undef DO_MOVA_Z - -/* - * Clear elements in a tile slice comprising len bytes. 
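 * A horizontal slice is contiguous, so clear_horizontal() can simply memset();
 * a vertical slice touches one element per ZA row, so the clear_vertical_*
 * variants below step by tile_vslice_offset() for each element size.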
- */ - -typedef void ClearFn(void *ptr, size_t off, size_t len); - -static void clear_horizontal(void *ptr, size_t off, size_t len) -{ - memset(ptr + off, 0, len); -} - -static void clear_vertical_b(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; ++i) { - *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_h(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 2) { - *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_s(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 4) { - *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_d(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 8) { - *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0; - } -} - -static void clear_vertical_q(void *vptr, size_t off, size_t len) -{ - for (size_t i = 0; i < len; i += 16) { - memset(vptr + tile_vslice_offset(i + off), 0, 16); - } -} - -/* - * Copy elements from an array into a tile slice comprising len bytes. - */ - -typedef void CopyFn(void *dst, const void *src, size_t len); - -static void copy_horizontal(void *dst, const void *src, size_t len) -{ - memcpy(dst, src, len); -} - -static void copy_vertical_b(void *vdst, const void *vsrc, size_t len) -{ - const uint8_t *src = vsrc; - uint8_t *dst = vdst; - size_t i; - - for (i = 0; i < len; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_h(void *vdst, const void *vsrc, size_t len) -{ - const uint16_t *src = vsrc; - uint16_t *dst = vdst; - size_t i; - - for (i = 0; i < len / 2; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_s(void *vdst, const void *vsrc, size_t len) -{ - const uint32_t *src = vsrc; - uint32_t *dst = vdst; - size_t i; - - for (i = 0; i < len / 4; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_d(void *vdst, const void *vsrc, size_t len) -{ - const uint64_t *src = vsrc; - uint64_t *dst = vdst; - size_t i; - - for (i = 0; i < len / 8; ++i) { - dst[tile_vslice_index(i)] = src[i]; - } -} - -static void copy_vertical_q(void *vdst, const void *vsrc, size_t len) -{ - for (size_t i = 0; i < len; i += 16) { - memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16); - } -} - -/* - * Host and TLB primitives for vertical tile slice addressing. - */ - -#define DO_LD(NAME, TYPE, HOST, TLB) \ -static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - TYPE val = HOST(host); \ - *(TYPE *)(za + tile_vslice_offset(off)) = val; \ -} \ -static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ - intptr_t off, target_ulong addr, uintptr_t ra) \ -{ \ - TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \ - *(TYPE *)(za + tile_vslice_offset(off)) = val; \ -} - -#define DO_ST(NAME, TYPE, HOST, TLB) \ -static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ - HOST(host, val); \ -} \ -static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ - intptr_t off, target_ulong addr, uintptr_t ra) \ -{ \ - TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ - TLB(env, useronly_clean_ptr(addr), val, ra); \ -} - -/* - * The ARMVectorReg elements are stored in host-endian 64-bit units. - * For 128-bit quantities, the sequence defined by the Elem[] pseudocode - * corresponds to storing the two 64-bit pieces in little-endian order. 
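 * Hence the BE parameter of DO_LDQ/DO_STQ below: a big-endian 128-bit access
 * swaps which half of the memory quantity lands in which 64-bit unit.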
- */ -#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \ -static inline void HNAME##_host(void *za, intptr_t off, void *host) \ -{ \ - uint64_t val0 = HOST(host), val1 = HOST(host + 8); \ - uint64_t *ptr = za + off; \ - ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ -} \ -static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - HNAME##_host(za, tile_vslice_offset(off), host); \ -} \ -static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \ - uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \ - uint64_t *ptr = za + off; \ - ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ -} \ -static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ -} - -#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \ -static inline void HNAME##_host(void *za, intptr_t off, void *host) \ -{ \ - uint64_t *ptr = za + off; \ - HOST(host, ptr[BE]); \ - HOST(host + 1, ptr[!BE]); \ -} \ -static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ -{ \ - HNAME##_host(za, tile_vslice_offset(off), host); \ -} \ -static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - uint64_t *ptr = za + off; \ - TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \ - TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \ -} \ -static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ - target_ulong addr, uintptr_t ra) \ -{ \ - HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ -} - -DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra) -DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra) -DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra) -DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra) -DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra) -DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra) -DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra) - -DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra) -DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra) - -DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra) -DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra) -DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra) -DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra) -DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra) -DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra) -DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra) - -DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra) -DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra) - -#undef DO_LD -#undef DO_ST -#undef DO_LDQ -#undef DO_STQ - -/* - * Common helper for all contiguous predicated loads. - */ - -static inline QEMU_ALWAYS_INLINE -void sme_ld1(CPUARMState *env, void *za, uint64_t *vg, - const target_ulong addr, uint32_t desc, const uintptr_t ra, - const int esz, uint32_t mtedesc, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn, - ClearFn *clr_fn, - CopyFn *cpy_fn) -{ - const intptr_t reg_max = simd_oprsz(desc); - const intptr_t esize = 1 << esz; - intptr_t reg_off, reg_last; - SVEContLdSt info; - void *host; - int flags; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { - /* The entire predicate was false; no load occurs. 
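         * Still clear the destination tile slice: inactive elements are
         * written as zero rather than left unchanged.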
*/ - clr_fn(za, 0, reg_max); - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, - BP_MEM_READ, ra); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. - */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, - mtedesc, ra); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. Perform the load - * into scratch memory to preserve register state until the end. - */ - ARMVectorReg scratch = { }; - - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - tlb_fn(env, &scratch, reg_off, addr + reg_off, ra); - } - reg_off += esize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - - cpy_fn(za, &scratch, reg_max); - return; -#endif - } - - /* The entire operation is in RAM, on valid pages. */ - - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - if (!vertical) { - memset(za, 0, reg_max); - } else if (reg_off) { - clr_fn(za, 0, reg_off); - } - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } else if (vertical) { - clr_fn(za, reg_off, esize); - } - reg_off += esize; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. - * But we know this is RAM and cannot trap. - */ - reg_off = info.reg_off_split; - if (unlikely(reg_off >= 0)) { - tlb_fn(env, za, reg_off, addr + reg_off, ra); - } - - reg_off = info.reg_off_first[1]; - if (unlikely(reg_off >= 0)) { - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } else if (vertical) { - clr_fn(za, reg_off, esize); - } - reg_off += esize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, - target_ulong addr, uint32_t desc, uintptr_t ra, - const int esz, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn, - ClearFn *clr_fn, - CopyFn *cpy_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. 
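     * If TBI is disabled for this address, or TCMA ignores the pointer's
     * logical tag, no tag check can fire; clearing mtedesc skips the
     * per-element MTE checks below.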
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical, - host_fn, tlb_fn, clr_fn, cpy_fn); -} - -#define DO_LD(L, END, ESZ) \ -void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ - sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ - clear_horizontal, copy_horizontal); \ -} \ -void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ - sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ - clear_vertical_##L, copy_vertical_##L); \ -} \ -void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ - sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ - clear_horizontal, copy_horizontal); \ -} \ -void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ - sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ - clear_vertical_##L, copy_vertical_##L); \ -} - -DO_LD(b, , MO_8) -DO_LD(h, _be, MO_16) -DO_LD(h, _le, MO_16) -DO_LD(s, _be, MO_32) -DO_LD(s, _le, MO_32) -DO_LD(d, _be, MO_64) -DO_LD(d, _le, MO_64) -DO_LD(q, _be, MO_128) -DO_LD(q, _le, MO_128) - -#undef DO_LD - -/* - * Common helper for all contiguous predicated stores. - */ - -static inline QEMU_ALWAYS_INLINE -void sme_st1(CPUARMState *env, void *za, uint64_t *vg, - const target_ulong addr, uint32_t desc, const uintptr_t ra, - const int esz, uint32_t mtedesc, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const intptr_t reg_max = simd_oprsz(desc); - const intptr_t esize = 1 << esz; - intptr_t reg_off, reg_last; - SVEContLdSt info; - void *host; - int flags; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { - /* The entire predicate was false; no store occurs. */ - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, - BP_MEM_WRITE, ra); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. - */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, - mtedesc, ra); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. We cannot avoid - * this fault and will leave with the store incomplete. 
- */ - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - tlb_fn(env, za, reg_off, addr + reg_off, ra); - } - reg_off += esize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - return; -#endif - } - - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } - reg_off += 1 << esz; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. - * But we know this is RAM and cannot trap. - */ - reg_off = info.reg_off_split; - if (unlikely(reg_off >= 0)) { - tlb_fn(env, za, reg_off, addr + reg_off, ra); - } - - reg_off = info.reg_off_first[1]; - if (unlikely(reg_off >= 0)) { - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - host_fn(za, reg_off, host + reg_off); - } - reg_off += 1 << esz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, - uint32_t desc, uintptr_t ra, int esz, bool vertical, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. 
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc, - vertical, host_fn, tlb_fn); -} - -#define DO_ST(L, END, ESZ) \ -void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ - sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ -} \ -void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ - sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ -} \ -void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ - sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ -} \ -void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ - sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ -} - -DO_ST(b, , MO_8) -DO_ST(h, _be, MO_16) -DO_ST(h, _le, MO_16) -DO_ST(s, _be, MO_32) -DO_ST(s, _le, MO_32) -DO_ST(d, _be, MO_64) -DO_ST(d, _le, MO_64) -DO_ST(q, _be, MO_128) -DO_ST(q, _le, MO_128) - -#undef DO_ST - -void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 4; - uint64_t *pn = vpn, *pm = vpm; - uint32_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ) { - uint64_t pa = pn[row >> 4]; - do { - if (pa & 1) { - for (col = 0; col < oprsz; ) { - uint64_t pb = pm[col >> 4]; - do { - if (pb & 1) { - zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)]; - } - pb >>= 4; - } while (++col & 15); - } - } - pa >>= 4; - } while (++row & 15); - } -} - -void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint8_t *pn = vpn, *pm = vpm; - uint64_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ++row) { - if (pn[H1(row)] & 1) { - for (col = 0; col < oprsz; ++col) { - if (pm[H1(col)] & 1) { - zda[tile_vslice_index(row) + col] += zn[col]; - } - } - } - } -} - -void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 4; - uint64_t *pn = vpn, *pm = vpm; - uint32_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ) { - uint64_t pa = pn[row >> 4]; - do { - if (pa & 1) { - uint32_t zn_row = zn[H4(row)]; - for (col = 0; col < oprsz; ) { - uint64_t pb = pm[col >> 4]; - do { - if (pb & 1) { - zda[tile_vslice_index(row) + H4(col)] += zn_row; - } - pb >>= 4; - } while (++col & 15); - } - } - pa >>= 4; - } while (++row & 15); - } -} - -void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint8_t *pn = vpn, *pm = vpm; - uint64_t *zda = vzda, *zn = vzn; - - for (row = 0; row < oprsz; ++row) { - if (pn[H1(row)] & 1) { - uint64_t zn_row = zn[row]; - for (col = 0; col < oprsz; ++col) { - if (pm[H1(col)] & 1) { - zda[tile_vslice_index(row) + col] += zn_row; - } - } - } - } -} - -void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) << 31; - 
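    /*
     * simd_data(desc) carries the negation flag; placed in bit 31 it can
     * negate each float32 element of Zn with a single XOR of the sign bit.
     */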
uint16_t *pn = vpn, *pm = vpm; - float_status fpst; - - /* - * Make a copy of float_status because this operation does not - * update the cumulative fp exception status. It also produces - * default nans. - */ - fpst = *(float_status *)vst; - set_default_nan_mode(true, &fpst); - - for (row = 0; row < oprsz; ) { - uint16_t pa = pn[H2(row >> 4)]; - do { - if (pa & 1) { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg; - - for (col = 0; col < oprsz; ) { - uint16_t pb = pm[H2(col >> 4)]; - do { - if (pb & 1) { - uint32_t *a = vza_row + H1_4(col); - uint32_t *m = vzm + H1_4(col); - *a = float32_muladd(n, *m, *a, 0, vst); - } - col += 4; - pb >>= 4; - } while (col & 15); - } - } - row += 4; - pa >>= 4; - } while (row & 15); - } -} - -void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - uint64_t neg = (uint64_t)simd_data(desc) << 63; - uint64_t *za = vza, *zn = vzn, *zm = vzm; - uint8_t *pn = vpn, *pm = vpm; - float_status fpst = *(float_status *)vst; - - set_default_nan_mode(true, &fpst); - - for (row = 0; row < oprsz; ++row) { - if (pn[H1(row)] & 1) { - uint64_t *za_row = &za[tile_vslice_index(row)]; - uint64_t n = zn[row] ^ neg; - - for (col = 0; col < oprsz; ++col) { - if (pm[H1(col)] & 1) { - uint64_t *a = &za_row[col]; - *a = float64_muladd(n, zm[col], *a, 0, &fpst); - } - } - } - } -} - -/* - * Alter PAIR as needed for controlling predicates being false, - * and for NEG on an enabled row element. - */ -static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg) -{ - /* - * The pseudocode uses a conditional negate after the conditional zero. - * It is simpler here to unconditionally negate before conditional zero. - */ - pair ^= neg; - if (!(pg & 1)) { - pair &= 0xffff0000u; - } - if (!(pg & 4)) { - pair &= 0x0000ffffu; - } - return pair; -} - -static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, - float_status *s_std, float_status *s_odd) -{ - float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std); - float64 e1c = float16_to_float64(e1 >> 16, true, s_std); - float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std); - float64 e2c = float16_to_float64(e2 >> 16, true, s_std); - float64 t64; - float32 t32; - - /* - * The ARM pseudocode function FPDot performs both multiplies - * and the add with a single rounding operation. Emulate this - * by performing the first multiply in round-to-odd, then doing - * the second multiply as fused multiply-add, and rounding to - * float32 all in one step. - */ - t64 = float64_mul(e1r, e2r, s_odd); - t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std); - - /* This conversion is exact, because we've already rounded. */ - t32 = float64_to_float32(t64, s_std); - - /* The final accumulation step is not fused. */ - return float32_add(sum, t32, s_std); -} - -void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, void *vst, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) * 0x80008000u; - uint16_t *pn = vpn, *pm = vpm; - float_status fpst_odd, fpst_std; - - /* - * Make a copy of float_status because this operation does not - * update the cumulative fp exception status. It also produces - * default nans. Make a second copy with round-to-odd -- see above. 
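     * Rounding the intermediate product to odd avoids double-rounding error
     * when the final result is rounded to float32.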
- */ - fpst_std = *(float_status *)vst; - set_default_nan_mode(true, &fpst_std); - fpst_odd = fpst_std; - set_float_rounding_mode(float_round_to_odd, &fpst_odd); - - for (row = 0; row < oprsz; ) { - uint16_t prow = pn[H2(row >> 4)]; - do { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - - n = f16mop_adj_pair(n, prow, neg); - - for (col = 0; col < oprsz; ) { - uint16_t pcol = pm[H2(col >> 4)]; - do { - if (prow & pcol & 0b0101) { - uint32_t *a = vza_row + H1_4(col); - uint32_t m = *(uint32_t *)(vzm + H1_4(col)); - - m = f16mop_adj_pair(m, pcol, 0); - *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd); - - col += 4; - pcol >>= 4; - } - } while (col & 15); - } - row += 4; - prow >>= 4; - } while (row & 15); - } -} - -void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn, - void *vpm, uint32_t desc) -{ - intptr_t row, col, oprsz = simd_maxsz(desc); - uint32_t neg = simd_data(desc) * 0x80008000u; - uint16_t *pn = vpn, *pm = vpm; - - for (row = 0; row < oprsz; ) { - uint16_t prow = pn[H2(row >> 4)]; - do { - void *vza_row = vza + tile_vslice_offset(row); - uint32_t n = *(uint32_t *)(vzn + H1_4(row)); - - n = f16mop_adj_pair(n, prow, neg); - - for (col = 0; col < oprsz; ) { - uint16_t pcol = pm[H2(col >> 4)]; - do { - if (prow & pcol & 0b0101) { - uint32_t *a = vza_row + H1_4(col); - uint32_t m = *(uint32_t *)(vzm + H1_4(col)); - - m = f16mop_adj_pair(m, pcol, 0); - *a = bfdotadd(*a, n, m); - - col += 4; - pcol >>= 4; - } - } while (col & 15); - } - row += 4; - prow >>= 4; - } while (row & 15); - } -} - -typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool); - -static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm, - uint8_t *pn, uint8_t *pm, - uint32_t desc, IMOPFn *fn) -{ - intptr_t row, col, oprsz = simd_oprsz(desc) / 8; - bool neg = simd_data(desc); - - for (row = 0; row < oprsz; ++row) { - uint8_t pa = pn[H1(row)]; - uint64_t *za_row = &za[tile_vslice_index(row)]; - uint64_t n = zn[row]; - - for (col = 0; col < oprsz; ++col) { - uint8_t pb = pm[H1(col)]; - uint64_t *a = &za_row[col]; - - *a = fn(n, zm[col], *a, pa & pb, neg); - } - } -} - -#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \ -static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ -{ \ - uint32_t sum0 = 0, sum1 = 0; \ - /* Apply P to N as a mask, making the inactive elements 0. */ \ - n &= expand_pred_b(p); \ - sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \ - sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \ - sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \ - sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ - sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \ - if (neg) { \ - sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \ - } else { \ - sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \ - } \ - return ((uint64_t)sum1 << 32) | sum0; \ -} - -#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \ -static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ -{ \ - uint64_t sum = 0; \ - /* Apply P to N as a mask, making the inactive elements 0. */ \ - n &= expand_pred_h(p); \ - sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ - return neg ? 
a - sum : a + sum; \ -} - -DEF_IMOP_32(smopa_s, int8_t, int8_t) -DEF_IMOP_32(umopa_s, uint8_t, uint8_t) -DEF_IMOP_32(sumopa_s, int8_t, uint8_t) -DEF_IMOP_32(usmopa_s, uint8_t, int8_t) - -DEF_IMOP_64(smopa_d, int16_t, int16_t) -DEF_IMOP_64(umopa_d, uint16_t, uint16_t) -DEF_IMOP_64(sumopa_d, int16_t, uint16_t) -DEF_IMOP_64(usmopa_d, uint16_t, int16_t) - -#define DEF_IMOPH(NAME) \ - void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \ - void *vpm, uint32_t desc) \ - { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); } - -DEF_IMOPH(smopa_s) -DEF_IMOPH(umopa_s) -DEF_IMOPH(sumopa_s) -DEF_IMOPH(usmopa_s) -DEF_IMOPH(smopa_d) -DEF_IMOPH(umopa_d) -DEF_IMOPH(sumopa_d) -DEF_IMOPH(usmopa_d) diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c deleted file mode 100644 index 521fc9b969..0000000000 --- a/target/arm/sve_helper.c +++ /dev/null @@ -1,7483 +0,0 @@ -/* - * ARM SVE Operations - * - * Copyright (c) 2018 Linaro, Ltd. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "fpu/softfloat.h" -#include "tcg/tcg.h" -#include "vec_internal.h" -#include "sve_ldst_internal.h" - - -/* Return a value for NZCV as per the ARM PredTest pseudofunction. - * - * The return value has bit 31 set if N is set, bit 1 set if Z is clear, - * and bit 0 set if C is set. Compare the definitions of these variables - * within CPUARMState. - */ - -/* For no G bits set, NZCV = C. */ -#define PREDTEST_INIT 1 - -/* This is an iterative function, called for each Pd and Pg word - * moving forward. - */ -static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) -{ - if (likely(g)) { - /* Compute N from first D & G. - Use bit 2 to signal first G bit seen. */ - if (!(flags & 4)) { - flags |= ((d & (g & -g)) != 0) << 31; - flags |= 4; - } - - /* Accumulate Z from each D & G. */ - flags |= ((d & g) != 0) << 1; - - /* Compute C from last !(D & G). Replace previous. */ - flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); - } - return flags; -} - -/* This is an iterative function, called for each Pd and Pg word - * moving backward. - */ -static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) -{ - if (likely(g)) { - /* Compute C from first (i.e last) !(D & G). - Use bit 2 to signal first G bit seen. */ - if (!(flags & 4)) { - flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ - flags |= (d & pow2floor(g)) == 0; - } - - /* Accumulate Z from each D & G. */ - flags |= ((d & g) != 0) << 1; - - /* Compute N from last (i.e first) D & G. Replace previous. */ - flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); - } - return flags; -} - -/* The same for a single word predicate. 
*/ -uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) -{ - return iter_predtest_fwd(d, g, PREDTEST_INIT); -} - -/* The same for a multi-word predicate. */ -uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) -{ - uint32_t flags = PREDTEST_INIT; - uint64_t *d = vd, *g = vg; - uintptr_t i = 0; - - do { - flags = iter_predtest_fwd(d[i], g[i], flags); - } while (++i < words); - - return flags; -} - -/* Similarly for single word elements. */ -static inline uint64_t expand_pred_s(uint8_t byte) -{ - static const uint64_t word[] = { - [0x01] = 0x00000000ffffffffull, - [0x10] = 0xffffffff00000000ull, - [0x11] = 0xffffffffffffffffull, - }; - return word[byte & 0x11]; -} - -#define LOGICAL_PPPP(NAME, FUNC) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - uintptr_t opr_sz = simd_oprsz(desc); \ - uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ - uintptr_t i; \ - for (i = 0; i < opr_sz / 8; ++i) { \ - d[i] = FUNC(n[i], m[i], g[i]); \ - } \ -} - -#define DO_AND(N, M, G) (((N) & (M)) & (G)) -#define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) -#define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) -#define DO_ORR(N, M, G) (((N) | (M)) & (G)) -#define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) -#define DO_NOR(N, M, G) (~((N) | (M)) & (G)) -#define DO_NAND(N, M, G) (~((N) & (M)) & (G)) -#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) - -LOGICAL_PPPP(sve_and_pppp, DO_AND) -LOGICAL_PPPP(sve_bic_pppp, DO_BIC) -LOGICAL_PPPP(sve_eor_pppp, DO_EOR) -LOGICAL_PPPP(sve_sel_pppp, DO_SEL) -LOGICAL_PPPP(sve_orr_pppp, DO_ORR) -LOGICAL_PPPP(sve_orn_pppp, DO_ORN) -LOGICAL_PPPP(sve_nor_pppp, DO_NOR) -LOGICAL_PPPP(sve_nand_pppp, DO_NAND) - -#undef DO_AND -#undef DO_BIC -#undef DO_EOR -#undef DO_ORR -#undef DO_ORN -#undef DO_NOR -#undef DO_NAND -#undef DO_SEL -#undef LOGICAL_PPPP - -/* Fully general three-operand expander, controlled by a predicate. - * This is complicated by the host-endian storage of the register file. - */ -/* ??? I don't expect the compiler could ever vectorize this itself. - * With some tables we can convert bit masks to byte masks, and with - * extra care wrt byte/word ordering we could use gcc generic vectors - * and do 16 bytes at a time. - */ -#define DO_ZPZZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZZ_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE nn = n[i], mm = m[i]; \ - d[i] = OP(nn, mm); \ - } \ - } \ -} - -#define DO_AND(N, M) (N & M) -#define DO_EOR(N, M) (N ^ M) -#define DO_ORR(N, M) (N | M) -#define DO_BIC(N, M) (N & ~M) -#define DO_ADD(N, M) (N + M) -#define DO_SUB(N, M) (N - M) -#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) -#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) -#define DO_ABD(N, M) ((N) >= (M) ? 
(N) - (M) : (M) - (N)) -#define DO_MUL(N, M) (N * M) - - -/* - * We must avoid the C undefined behaviour cases: division by - * zero and signed division of INT_MIN by -1. Both of these - * have architecturally defined required results for Arm. - * We special case all signed divisions by -1 to avoid having - * to deduce the minimum integer for the type involved. - */ -#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) -#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) - -DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) -DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) -DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) -DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) - -DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) -DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) -DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) -DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) - -DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) -DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) -DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) -DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) - -DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) -DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) -DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) -DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) - -DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) -DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) -DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) -DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) - -DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) -DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) -DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) -DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) - -DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) -DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) -DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) -DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) - -DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) -DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) -DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) -DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) - -DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) -DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) -DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) -DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) - -DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) -DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) -DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) -DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) - -DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) -DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) -DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) -DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) - -DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) -DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) -DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) -DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) - -/* Because the computation type is at least twice as large as required, - these work for both signed and unsigned source types. 
*/ -static inline uint8_t do_mulh_b(int32_t n, int32_t m) -{ - return (n * m) >> 8; -} - -static inline uint16_t do_mulh_h(int32_t n, int32_t m) -{ - return (n * m) >> 16; -} - -static inline uint32_t do_mulh_s(int64_t n, int64_t m) -{ - return (n * m) >> 32; -} - -static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) -{ - uint64_t lo, hi; - muls64(&lo, &hi, n, m); - return hi; -} - -static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) -{ - uint64_t lo, hi; - mulu64(&lo, &hi, n, m); - return hi; -} - -DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) -DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) -DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) -DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) - -DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) -DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) -DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) -DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) - -DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) -DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) -DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) -DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) - -DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) -DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) - -DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) -DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) - -/* Note that all bits of the shift are significant - and not modulo the element size. */ -#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) -#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) -#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) - -DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) -DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) -DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) - -DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) -DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) -DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) - -DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) -DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) -DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) - -DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) -DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) -DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) - -static inline uint16_t do_sadalp_h(int16_t n, int16_t m) -{ - int8_t n1 = n, n2 = n >> 8; - return m + n1 + n2; -} - -static inline uint32_t do_sadalp_s(int32_t n, int32_t m) -{ - int16_t n1 = n, n2 = n >> 16; - return m + n1 + n2; -} - -static inline uint64_t do_sadalp_d(int64_t n, int64_t m) -{ - int32_t n1 = n, n2 = n >> 32; - return m + n1 + n2; -} - -DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) -DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) -DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) - -static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) -{ - uint8_t n1 = n, n2 = n >> 8; - return m + n1 + n2; -} - -static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) -{ - uint16_t n1 = n, n2 = n >> 16; - return m + n1 + n2; -} - -static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) -{ - uint32_t n1 = n, n2 = n >> 32; - return m + n1 + n2; -} - -DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) -DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) -DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) - -#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL) -#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) -#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) -#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) - -DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) -DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, 
H1_2, do_srshl_h) -DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) -DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) - -#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) -#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) -#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) -#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) - -DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) -DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) -DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) -DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) - -/* - * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. - * We pass in a pointer to a dummy saturation field to trigger - * the saturating arithmetic but discard the information about - * whether it has occurred. - */ -#define do_sqshl_b(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) -#define do_sqshl_h(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) -#define do_sqshl_s(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) -#define do_sqshl_d(n, m) \ - ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) - -DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) -DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) -DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) -DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) - -#define do_uqshl_b(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) -#define do_uqshl_h(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) -#define do_uqshl_s(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) -#define do_uqshl_d(n, m) \ - ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) - -DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) -DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) -DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) -DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) - -#define do_sqrshl_b(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) -#define do_sqrshl_h(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) -#define do_sqrshl_s(n, m) \ - ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) -#define do_sqrshl_d(n, m) \ - ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) - -DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) -DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) -DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) -DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) - -#undef do_sqrshl_d - -#define do_uqrshl_b(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) -#define do_uqrshl_h(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) -#define do_uqrshl_s(n, m) \ - ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) -#define do_uqrshl_d(n, m) \ - ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); }) - -DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) -DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) -DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) -DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) - -#undef do_uqrshl_d - -#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) -#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) - -DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) -DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 
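/*
 * Illustrative aside (editorial, not part of the file being moved): a
 * minimal standalone sketch of the halving-add identity relied on just
 * above.  DO_HADD_BHS can simply widen to int64_t, but the 64-bit
 * variant DO_HADD_D has no wider type to widen into, so it uses
 * (n >> 1) + (m >> 1) + (n & m & 1), which equals floor((n + m) / 2)
 * for both signed and unsigned inputs.  The sample values below are
 * arbitrary and chosen only for this sketch.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static int64_t hadd_widened(int32_t n, int32_t m)
{
    /* Widened form, as DO_HADD_BHS uses for 8/16/32-bit elements. */
    return ((int64_t)n + m) >> 1;
}

static int64_t hadd_no_widen(int64_t n, int64_t m)
{
    /* Widen-free form, as DO_HADD_D uses for 64-bit elements. */
    return (n >> 1) + (m >> 1) + (n & m & 1);
}

int main(void)
{
    static const int32_t samples[] = { 0, 1, -1, 3, -3, INT32_MAX, INT32_MIN };
    const size_t count = sizeof(samples) / sizeof(samples[0]);

    for (size_t i = 0; i < count; i++) {
        for (size_t j = 0; j < count; j++) {
            /* Both formulations agree on every pair of sample inputs. */
            assert(hadd_widened(samples[i], samples[j]) ==
                   hadd_no_widen(samples[i], samples[j]));
        }
    }
    return 0;
}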
-DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) -DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) - -DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) -DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) -DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) -DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) - -#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) -#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) - -DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) -DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) -DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) -DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) - -DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) -DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) -DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) -DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) - -#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) -#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) - -DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) -DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) -DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) -DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) - -DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) -DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) -DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) -DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) - -static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) -{ - return val >= max ? max : val <= min ? min : val; -} - -#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) - -static inline int64_t do_sqadd_d(int64_t n, int64_t m) -{ - int64_t r = n + m; - if (((r ^ n) & ~(n ^ m)) < 0) { - /* Signed overflow. */ - return r < 0 ? INT64_MAX : INT64_MIN; - } - return r; -} - -DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) -DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) -DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) -DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) - -#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) -#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) -#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) - -static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) -{ - uint64_t r = n + m; - return r < n ? UINT64_MAX : r; -} - -DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) -DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) -DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) -DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) - -#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) -#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) -#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) - -static inline int64_t do_sqsub_d(int64_t n, int64_t m) -{ - int64_t r = n - m; - if (((r ^ n) & (n ^ m)) < 0) { - /* Signed overflow. */ - return r < 0 ? 
INT64_MAX : INT64_MIN; - } - return r; -} - -DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) -DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) -DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) -DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) - -#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) -#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) -#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) - -static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) -{ - return n > m ? n - m : 0; -} - -DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) -DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) -DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) -DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) - -#define DO_SUQADD_B(n, m) \ - do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) -#define DO_SUQADD_H(n, m) \ - do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) -#define DO_SUQADD_S(n, m) \ - do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) - -static inline int64_t do_suqadd_d(int64_t n, uint64_t m) -{ - uint64_t r = n + m; - - if (n < 0) { - /* Note that m - abs(n) cannot underflow. */ - if (r > INT64_MAX) { - /* Result is either very large positive or negative. */ - if (m > -n) { - /* m > abs(n), so r is a very large positive. */ - return INT64_MAX; - } - /* Result is negative. */ - } - } else { - /* Both inputs are positive: check for overflow. */ - if (r < m || r > INT64_MAX) { - return INT64_MAX; - } - } - return r; -} - -DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) -DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) -DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) -DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) - -#define DO_USQADD_B(n, m) \ - do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) -#define DO_USQADD_H(n, m) \ - do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) -#define DO_USQADD_S(n, m) \ - do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) - -static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) -{ - uint64_t r = n + m; - - if (m < 0) { - return n < -m ? 0 : r; - } - return r < n ? UINT64_MAX : r; -} - -DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) -DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) -DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) -DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) - -#undef DO_ZPZZ -#undef DO_ZPZZ_D - -/* - * Three operand expander, operating on element pairs. - * If the slot I is even, the elements from from VN {I, I+1}. - * If the slot I is odd, the elements from from VM {I-1, I}. - * Load all of the input elements in each pair before overwriting output. - */ -#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPE n0 = *(TYPE *)(vn + H(i)); \ - TYPE m0 = *(TYPE *)(vm + H(i)); \ - TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(n0, n1); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(m0, m1); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. 
*/ -#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn, *m = vm; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 2) { \ - TYPE n0 = n[i], n1 = n[i + 1]; \ - TYPE m0 = m[i], m1 = m[i + 1]; \ - if (pg[H1(i)] & 1) { \ - d[i] = OP(n0, n1); \ - } \ - if (pg[H1(i + 1)] & 1) { \ - d[i + 1] = OP(m0, m1); \ - } \ - } \ -} - -DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) -DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) -DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) -DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) - -DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) -DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) -DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) -DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) - -DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) -DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) -DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) -DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) - -DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) -DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) -DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) -DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) - -DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) -DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) -DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) -DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) - -#undef DO_ZPZZ_PAIR -#undef DO_ZPZZ_PAIR_D - -#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPE n0 = *(TYPE *)(vn + H(i)); \ - TYPE m0 = *(TYPE *)(vm + H(i)); \ - TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - if (pg & 1) { \ - *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) -DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) - -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) -DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) - -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) -DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) - -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) -DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) - -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) -DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) - -#undef DO_ZPZZ_PAIR_FP - -/* Three-operand expander, controlled by a predicate, in which the - * third operand is "wide". 
That is, for D = N op M, the same 64-bit - * value of M is used with all of the narrower values of N. - */ -#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ - TYPEW mm = *(TYPEW *)(vm + i); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 7); \ - } \ -} - -DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) -DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) -DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) - -DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) -DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) -DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) - -DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) -DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) -DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) - -#undef DO_ZPZW - -/* Fully general two-operand expander, controlled by a predicate. - */ -#define DO_ZPZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZ_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE nn = n[i]; \ - d[i] = OP(nn); \ - } \ - } \ -} - -#define DO_CLS_B(N) (clrsb32(N) - 24) -#define DO_CLS_H(N) (clrsb32(N) - 16) - -DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) -DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) -DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) -DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) - -#define DO_CLZ_B(N) (clz32(N) - 24) -#define DO_CLZ_H(N) (clz32(N) - 16) - -DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) -DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) -DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) -DO_ZPZ_D(sve_clz_d, uint64_t, clz64) - -DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) -DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) -DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) -DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) - -#define DO_CNOT(N) (N == 0) - -DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) -DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) -DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) -DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) - -#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) - -DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) -DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) -DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) - -#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) - -DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) -DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) -DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) - -#define DO_NOT(N) (~N) - -DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) -DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) -DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) -DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) - -#define DO_SXTB(N) ((int8_t)N) -#define DO_SXTH(N) ((int16_t)N) -#define DO_SXTS(N) ((int32_t)N) -#define 
DO_UXTB(N) ((uint8_t)N) -#define DO_UXTH(N) ((uint16_t)N) -#define DO_UXTS(N) ((uint32_t)N) - -DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) -DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) -DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) -DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) -DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) -DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) - -DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) -DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) -DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) -DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) -DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) -DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) - -#define DO_ABS(N) (N < 0 ? -N : N) - -DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) -DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) -DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) -DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) - -#define DO_NEG(N) (-N) - -DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) -DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) -DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) -DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) - -DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) -DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) -DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) - -DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) -DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) - -DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) - -void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 2) { - if (pg[H1(i)] & 1) { - uint64_t n0 = n[i + 0]; - uint64_t n1 = n[i + 1]; - d[i + 0] = n1; - d[i + 1] = n0; - } - } -} - -DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) -DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) -DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) -DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) - -#define DO_SQABS(X) \ - ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ - x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) - -DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) -DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) -DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) -DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) - -#define DO_SQNEG(X) \ - ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ - x_ == min_ ? -min_ - 1 : -x_; }) - -DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) -DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) -DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) -DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) - -DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) -DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) - -/* Three-operand expander, unpredicated, in which the third operand is "wide". 
- */ -#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - TYPEW mm = *(TYPEW *)(vm + i); \ - do { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm); \ - i += sizeof(TYPE); \ - } while (i & 7); \ - } \ -} - -DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) -DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) -DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) - -DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) -DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) -DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) - -DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) -DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) -DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) - -#undef DO_ZZW - -#undef DO_CLS_B -#undef DO_CLS_H -#undef DO_CLZ_B -#undef DO_CLZ_H -#undef DO_CNOT -#undef DO_FABS -#undef DO_FNEG -#undef DO_ABS -#undef DO_NEG -#undef DO_ZPZ -#undef DO_ZPZ_D - -/* - * Three-operand expander, unpredicated, in which the two inputs are - * selected from the top or bottom half of the wide column. - */ -#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ - } \ -} - -DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) -DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) -DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) - -DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) -DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) -DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) -DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) -DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) - -DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) -DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) -DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) -DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) - -DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) -DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) - -/* Note that the multiply cannot overflow, but the doubling can. 
*/ -static inline int16_t do_sqdmull_h(int16_t n, int16_t m) -{ - int16_t val = n * m; - return DO_SQADD_H(val, val); -} - -static inline int32_t do_sqdmull_s(int32_t n, int32_t m) -{ - int32_t val = n * m; - return DO_SQADD_S(val, val); -} - -static inline int64_t do_sqdmull_d(int64_t n, int64_t m) -{ - int64_t val = n * m; - return do_sqadd_d(val, val); -} - -DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) -DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) -DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) - -#undef DO_ZZZ_TB - -#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ - } \ -} - -DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) -DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) -DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) - -DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) -DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) -DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) - -DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) -DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) -DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) - -#undef DO_ZZZ_WTB - -#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ - intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ - for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ - TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ - *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ - } \ -} - -DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) -DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) - -#undef DO_ZZZ_NTB - -#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ - TYPEW aa = *(TYPEW *)(va + HW(i)); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ - } \ -} - -DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) -DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) -DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) -DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) -DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) - -DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) -DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 
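/*
 * Illustrative aside (editorial, not part of the file being moved): a
 * scalar model of what one DO_ZZZW_ACC expansion such as
 * sve2_smlal_zzzw_h computes per wide element.  'sel' mirrors the
 * simd_data(desc) bit that selects the bottom (even) or top (odd)
 * narrow element of each wide container; both narrow inputs are
 * widened before the multiply, matching the TYPEW arithmetic in the
 * macro.  The array contents are made-up sample data for this sketch.
 */
#include <stdint.h>
#include <stdio.h>

/* One wide lane: result = a + widen(n[sel]) * widen(m[sel]). */
static int16_t smlal_lane(const int8_t n[2], const int8_t m[2],
                          int16_t a, int sel)
{
    return a + (int16_t)n[sel] * (int16_t)m[sel];
}

int main(void)
{
    const int8_t n[2] = { -3, 100 };   /* { bottom, top } narrow elements */
    const int8_t m[2] = {  7, 100 };
    const int16_t acc = 10;

    /* sel = 0 models the "bottom" form, sel = 1 the "top" form. */
    printf("bottom: %d\n", smlal_lane(n, m, acc, 0)); /* 10 + (-3 * 7)    = -11   */
    printf("top:    %d\n", smlal_lane(n, m, acc, 1)); /* 10 + (100 * 100) = 10010 */
    return 0;
}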
-DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) - -DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) -DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) - -#define DO_NMUL(N, M) -(N * M) - -DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) -DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) -DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) - -DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) -DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) -DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) - -#undef DO_ZZZW_ACC - -#define DO_XTNB(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ - *(TYPE *)(vd + i) = nn; \ - } \ -} - -#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - *(TYPEN *)(vd + i + odd) = OP(nn); \ - } \ -} - -#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) -#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) -#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) - -DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) -DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) -DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) - -DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) -DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) -DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) - -#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) -#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) -#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) - -DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) -DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) -DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) - -DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) - -DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) -DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) -DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) - -DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) -DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) -DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) - -#undef DO_XTNB -#undef DO_XTNT - -void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); - uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t *a = va, *n = vn; - uint64_t *d = vd, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - uint32_t e1 = a[2 * i + H4(0)]; - uint32_t e2 = n[2 * i + sel] ^ inv; - uint64_t c = extract64(m[i], 32, 1); - /* Compute and store the entire 33-bit result at once. 
*/ - d[i] = c + e1 + e2; - } -} - -void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int sel = extract32(desc, SIMD_DATA_SHIFT, 1); - uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint64_t *d = vd, *a = va, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; i += 2) { - Int128 e1 = int128_make64(a[i]); - Int128 e2 = int128_make64(n[i + sel] ^ inv); - Int128 c = int128_make64(m[i + 1] & 1); - Int128 r = int128_add(int128_add(e1, e2), c); - d[i + 0] = int128_getlo(r); - d[i + 1] = int128_gethi(r); - } -} - -#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ - TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ - TYPEW aa = *(TYPEW *)(va + HW(i)); \ - *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ - } \ -} - -DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, - do_sqdmull_h, DO_SQADD_H) -DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, - do_sqdmull_s, DO_SQADD_S) -DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, - do_sqdmull_d, do_sqadd_d) - -DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, - do_sqdmull_h, DO_SQSUB_H) -DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, - do_sqdmull_s, DO_SQSUB_S) -DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, - do_sqdmull_d, do_sqsub_d) - -#undef DO_SQDMLAL - -#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ - int rot = simd_data(desc); \ - int sel_a = rot & 1, sel_b = sel_a ^ 1; \ - bool sub_r = rot == 1 || rot == 2; \ - bool sub_i = rot >= 2; \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - for (i = 0; i < opr_sz; i += 2) { \ - TYPE elt1_a = n[H(i + sel_a)]; \ - TYPE elt2_a = m[H(i + sel_a)]; \ - TYPE elt2_b = m[H(i + sel_b)]; \ - d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ - d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ - } \ -} - -#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) - -DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) -DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) - -#define DO_SQRDMLAH_B(N, M, A, S) \ - do_sqrdmlah_b(N, M, A, S, true) -#define DO_SQRDMLAH_H(N, M, A, S) \ - ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) -#define DO_SQRDMLAH_S(N, M, A, S) \ - ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) -#define DO_SQRDMLAH_D(N, M, A, S) \ - do_sqrdmlah_d(N, M, A, S, true) - -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) -DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) - -#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ - int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ - int sel_a = rot & 1, sel_b = sel_a ^ 1; \ - bool sub_r = rot == 1 || rot == 2; \ - bool sub_i = rot >= 2; \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ - TYPE elt2_a = m[H(i + idx + sel_a)]; \ - TYPE elt2_b = m[H(i + idx + sel_b)]; \ - for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ - TYPE elt1_a = n[H(i + j + sel_a)]; \ - d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ - d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ - } \ - } \ -} - -DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) -DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) - -DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) -DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) - -#undef DO_CMLA -#undef DO_CMLA_FUNC -#undef DO_CMLA_IDX_FUNC -#undef DO_SQRDMLAH_B -#undef DO_SQRDMLAH_H -#undef DO_SQRDMLAH_S -#undef DO_SQRDMLAH_D - -/* Note N and M are 4 elements bundled into one unit. */ -static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, - int sel_a, int sel_b, int sub_i) -{ - for (int i = 0; i <= 1; i++) { - int32_t elt1_r = (int8_t)(n >> (16 * i)); - int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); - int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); - int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); - - a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; - } - return a; -} - -static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, - int sel_a, int sel_b, int sub_i) -{ - for (int i = 0; i <= 1; i++) { - int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); - int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); - int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); - int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); - - a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; - } - return a; -} - -void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int rot = simd_data(desc); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); - uint32_t *d = vd, *n = vn, *m = vm, *a = va; - - for (int e = 0; e < opr_sz / 4; e++) { - d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); - } -} - -void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int rot = simd_data(desc); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? -1 : 1); - uint64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (int e = 0; e < opr_sz / 8; e++) { - d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); - } -} - -void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int opr_sz = simd_oprsz(desc); - int rot = extract32(desc, SIMD_DATA_SHIFT, 2); - int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? -1 : 1); - uint32_t *d = vd, *n = vn, *m = vm, *a = va; - - for (int seg = 0; seg < opr_sz / 4; seg += 4) { - uint32_t seg_m = m[seg + idx]; - for (int e = 0; e < 4; e++) { - d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], - sel_a, sel_b, sub_i); - } - } -} - -void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - int seg, opr_sz = simd_oprsz(desc); - int rot = extract32(desc, SIMD_DATA_SHIFT, 2); - int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - int sel_a = rot & 1; - int sel_b = sel_a ^ 1; - int sub_i = (rot == 0 || rot == 3 ? -1 : 1); - uint64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (seg = 0; seg < opr_sz / 8; seg += 2) { - uint64_t seg_m = m[seg + idx]; - for (int e = 0; e < 2; e++) { - d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], - sel_a, sel_b, sub_i); - } - } -} - -#define DO_ZZXZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ - intptr_t i, j, idx = simd_data(desc); \ - TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[i]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = OP(n[i + j], mm, a[i + j]); \ - } \ - } \ -} - -#define DO_SQRDMLAH_H(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) -#define DO_SQRDMLAH_S(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) -#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) - -DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) -DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) -DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) - -#define DO_SQRDMLSH_H(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) -#define DO_SQRDMLSH_S(N, M, A) \ - ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) -#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) - -DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) -DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) -DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) - -#undef DO_ZZXZ - -#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ - for (i = 0; i < oprsz; i += 16) { \ - TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ - for (j = 0; 
j < 16; j += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ - TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ - *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ - } \ - } \ -} - -#define DO_MLA(N, M, A) (A + N * M) - -DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) -DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) -DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) -DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) - -#define DO_MLS(N, M, A) (A - N * M) - -DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) -DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) -DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) -DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) - -#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) -#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) - -DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) -DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) - -#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) -#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) - -DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) -DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) - -#undef DO_MLA -#undef DO_MLS -#undef DO_ZZXW - -#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ - for (i = 0; i < oprsz; i += 16) { \ - TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ - for (j = 0; j < 16; j += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ - *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ - } \ - } \ -} - -DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) -DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) - -DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) -DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) - -DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) -DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) - -#undef DO_ZZX - -#define DO_BITPERM(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - TYPE mm = *(TYPE *)(vm + i); \ - *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ - } \ -} - -static uint64_t bitextract(uint64_t data, uint64_t mask, int n) -{ - uint64_t res = 0; - int db, rb = 0; - - for (db = 0; db < n; ++db) { - if ((mask >> db) & 1) { - res |= ((data >> db) & 1) << rb; - ++rb; - } - } - return res; -} - -DO_BITPERM(sve2_bext_b, uint8_t, bitextract) -DO_BITPERM(sve2_bext_h, uint16_t, bitextract) -DO_BITPERM(sve2_bext_s, uint32_t, bitextract) -DO_BITPERM(sve2_bext_d, uint64_t, bitextract) - -static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) -{ - uint64_t res = 0; - int rb, db = 0; - - for (rb = 0; rb < n; ++rb) { - if ((mask >> rb) & 1) { - res |= ((data >> db) & 1) << rb; - ++db; - } - } - return res; -} - -DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) -DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) -DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) -DO_BITPERM(sve2_bdep_d, uint64_t, 
bitdeposit) - -static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) -{ - uint64_t resm = 0, resu = 0; - int db, rbm = 0, rbu = 0; - - for (db = 0; db < n; ++db) { - uint64_t val = (data >> db) & 1; - if ((mask >> db) & 1) { - resm |= val << rbm++; - } else { - resu |= val << rbu++; - } - } - - return resm | (resu << rbm); -} - -DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) -DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) -DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) -DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) - -#undef DO_BITPERM - -#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int sub_r = simd_data(desc); \ - if (sub_r) { \ - for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ - TYPE acc_r = *(TYPE *)(vn + H(i)); \ - TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE el2_r = *(TYPE *)(vm + H(i)); \ - TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - acc_r = ADD_OP(acc_r, el2_i); \ - acc_i = SUB_OP(acc_i, el2_r); \ - *(TYPE *)(vd + H(i)) = acc_r; \ - *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ - } \ - } else { \ - for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ - TYPE acc_r = *(TYPE *)(vn + H(i)); \ - TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ - TYPE el2_r = *(TYPE *)(vm + H(i)); \ - TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ - acc_r = SUB_OP(acc_r, el2_i); \ - acc_i = ADD_OP(acc_i, el2_r); \ - *(TYPE *)(vd + H(i)) = acc_r; \ - *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ - } \ - } \ -} - -DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) -DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) - -DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) -DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) -DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) -DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) - -#undef DO_CADD - -#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ - int shift = simd_data(desc) >> 1; \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ - *(TYPEW *)(vd + HW(i)) = nn << shift; \ - } \ -} - -DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) -DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) -DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) - -DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) -DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) -DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) - -#undef DO_ZZI_SHLL - -/* Two-operand reduction expander, controlled by a predicate. - * The difference between TYPERED and TYPERET has to do with - * sign-extension. E.g. for SMAX, TYPERED must be signed, - * but TYPERET must be unsigned so that e.g. a 32-bit value - * is not sign-extended to the ABI uint64_t return type. - */ -/* ??? If we were to vectorize this by hand the reduction ordering - * would change. For integer operands, this is perfectly fine. 
- */ -#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ -uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPERED ret = INIT; \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ - ret = OP(ret, nn); \ - } \ - i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ - } while (i & 15); \ - } \ - return (TYPERET)ret; \ -} - -#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ -uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPEE *n = vn; \ - uint8_t *pg = vg; \ - TYPER ret = INIT; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPEE nn = n[i]; \ - ret = OP(ret, nn); \ - } \ - } \ - return ret; \ -} - -DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) -DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) -DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) -DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) - -DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) -DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) -DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) -DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) - -DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) -DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) -DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) -DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) - -DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) -DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) -DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) - -DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) -DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) -DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) -DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) - -DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) -DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) -DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) -DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) - -DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) -DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) -DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) -DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) - -DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) -DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) -DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) -DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) - -DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) -DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) -DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) -DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) - -#undef DO_VPZ -#undef DO_VPZ_D - -/* Two vector operand, one scalar operand, unpredicated. 
*/ -#define DO_ZZI(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ - TYPE s = s64, *d = vd, *n = vn; \ - for (i = 0; i < opr_sz; ++i) { \ - d[i] = OP(n[i], s); \ - } \ -} - -#define DO_SUBR(X, Y) (Y - X) - -DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) -DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) -DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) -DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) - -DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) -DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) -DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) -DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) - -DO_ZZI(sve_smini_b, int8_t, DO_MIN) -DO_ZZI(sve_smini_h, int16_t, DO_MIN) -DO_ZZI(sve_smini_s, int32_t, DO_MIN) -DO_ZZI(sve_smini_d, int64_t, DO_MIN) - -DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) -DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) -DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) -DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) - -DO_ZZI(sve_umini_b, uint8_t, DO_MIN) -DO_ZZI(sve_umini_h, uint16_t, DO_MIN) -DO_ZZI(sve_umini_s, uint32_t, DO_MIN) -DO_ZZI(sve_umini_d, uint64_t, DO_MIN) - -#undef DO_ZZI - -#undef DO_AND -#undef DO_ORR -#undef DO_EOR -#undef DO_BIC -#undef DO_ADD -#undef DO_SUB -#undef DO_MAX -#undef DO_MIN -#undef DO_ABD -#undef DO_MUL -#undef DO_DIV -#undef DO_ASR -#undef DO_LSR -#undef DO_LSL -#undef DO_SUBR - -/* Similar to the ARM LastActiveElement pseudocode function, except the - result is multiplied by the element size. This includes the not found - indication; e.g. not found for esz=3 is -8. */ -static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) -{ - uint64_t mask = pred_esz_masks[esz]; - intptr_t i = words; - - do { - uint64_t this_g = g[--i] & mask; - if (this_g) { - return i * 64 + (63 - clz64(this_g)); - } - } while (i > 0); - return (intptr_t)-1 << esz; -} - -uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - uint32_t flags = PREDTEST_INIT; - uint64_t *d = vd, *g = vg; - intptr_t i = 0; - - do { - uint64_t this_d = d[i]; - uint64_t this_g = g[i]; - - if (this_g) { - if (!(flags & 4)) { - /* Set in D the first bit of G. */ - this_d |= this_g & -this_g; - d[i] = this_d; - } - flags = iter_predtest_fwd(this_d, this_g, flags); - } - } while (++i < words); - - return flags; -} - -uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint32_t flags = PREDTEST_INIT; - uint64_t *d = vd, *g = vg, esz_mask; - intptr_t i, next; - - next = last_active_element(vd, words, esz) + (1 << esz); - esz_mask = pred_esz_masks[esz]; - - /* Similar to the pseudocode for pnext, but scaled by ESZ - so that we find the correct bit. */ - if (next < words * 64) { - uint64_t mask = -1; - - if (next & 63) { - mask = ~((1ull << (next & 63)) - 1); - next &= -64; - } - do { - uint64_t this_g = g[next / 64] & esz_mask & mask; - if (this_g != 0) { - next = (next & -64) + ctz64(this_g); - break; - } - next += 64; - mask = -1; - } while (next < words * 64); - } - - i = 0; - do { - uint64_t this_d = 0; - if (i == next / 64) { - this_d = 1ull << (next & 63); - } - d[i] = this_d; - flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); - } while (++i < words); - - return flags; -} - -/* - * Copy Zn into Zd, and store zero into inactive elements. - * If inv, store zeros into the active elements. 
- */ -void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t inv = -(uint64_t)(simd_data(desc) & 1); - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); - } -} - -void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t inv = -(uint64_t)(simd_data(desc) & 1); - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); - } -} - -void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t inv = -(uint64_t)(simd_data(desc) & 1); - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); - } -} - -void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - uint8_t inv = simd_data(desc); - - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); - } -} - -/* Three-operand expander, immediate operand, controlled by a predicate. - */ -#define DO_ZPZI(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPE imm = simd_data(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, imm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZI_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *n = vn; \ - TYPE imm = simd_data(desc); \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE nn = n[i]; \ - d[i] = OP(nn, imm); \ - } \ - } \ -} - -#define DO_SHR(N, M) (N >> M) -#define DO_SHL(N, M) (N << M) - -/* Arithmetic shift right for division. This rounds negative numbers - toward zero as per signed division. Therefore before shifting, - when N is negative, add 2**M-1. */ -#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) - -static inline uint64_t do_urshr(uint64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else if (sh == 64) { - return x >> 63; - } else { - return 0; - } -} - -static inline int64_t do_srshr(int64_t x, unsigned sh) -{ - if (likely(sh < 64)) { - return (x >> sh) + ((x >> (sh - 1)) & 1); - } else { - /* Rounding the sign bit always produces 0. 
*/ - return 0; - } -} - -DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) -DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) -DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) -DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) - -DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) -DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) -DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) -DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) - -DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) -DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) -DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) -DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) - -DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) -DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) -DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) -DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) - -/* SVE2 bitwise shift by immediate */ -DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) -DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) -DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) -DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) - -DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) -DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) -DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) -DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) - -DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) -DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) -DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) -DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) - -DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) -DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) -DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) -DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) - -#define do_suqrshl_b(n, m) \ - ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) -#define do_suqrshl_h(n, m) \ - ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) -#define do_suqrshl_s(n, m) \ - ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) -#define do_suqrshl_d(n, m) \ - ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) - -DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) -DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) -DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) -DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) - -#undef DO_ASRD -#undef DO_ZPZI -#undef DO_ZPZI_D - -#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + i); \ - *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ - } \ -} - -#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ - } \ -} - -DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) -DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) -DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) - -DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) -DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) -DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) - -DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) -DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) -DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) - -DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 
-DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) -DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) - -#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) -#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) -#define DO_SQSHRUN_D(x, sh) \ - do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX) - -DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) -DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) -DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) - -DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) -DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) -DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) - -#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) -#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) -#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) - -DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) -DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) -DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) - -DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) -DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) -DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) - -#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) -#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) -#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) - -DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) -DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) -DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) - -DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) -DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) -DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) - -#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) -#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) -#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) - -DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) -DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) -DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) - -DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) -DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) -DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) - -#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) -#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) -#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) - -DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) -DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) -DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) - -DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) -DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) -DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) - -#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) -#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) -#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) - -DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) -DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) -DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) - 
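(Editorial aside, not part of the patch being moved: the UQRSHRN helpers just above combine a rounding right shift, do_urshr(), with unsigned saturation to the narrow element type via MIN(..., UINT8_MAX) and friends. A minimal scalar sketch of what one such element computes for the 16-bit to 8-bit ".h" case may help readers following the moved code; the function name is invented for illustration, and it assumes the shift amount is at least 1, as it is for these narrowing shift immediates, so sh - 1 does not underflow.)

#include <stdint.h>

/* Scalar model of one UQRSHRNB .h element: round, shift, then saturate. */
static uint8_t model_uqrshrn_h(uint16_t x, unsigned sh)
{
    /* Rounding right shift: add back the last bit shifted out (do_urshr). */
    uint64_t r = ((uint64_t)x >> sh) + (((uint64_t)x >> (sh - 1)) & 1);
    /* Unsigned saturation to the narrow result, as in DO_UQRSHRN_H. */
    return r > UINT8_MAX ? UINT8_MAX : (uint8_t)r;
}

(The DO_SHRNT expansions that follow store the same per-element result into the high half of each wide element, at offset sizeof(TYPEN), rather than the low half written by DO_SHRNB; that is how the "bottom"/"top" instruction pairs interleave their narrowed results.)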
-DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) -DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) -DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) - -#undef DO_SHRNB -#undef DO_SHRNT - -#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + i); \ - TYPEW mm = *(TYPEW *)(vm + i); \ - *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ - } \ -} - -#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - TYPEW mm = *(TYPEW *)(vm + HW(i)); \ - *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ - } \ -} - -#define DO_ADDHN(N, M, SH) ((N + M) >> SH) -#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) -#define DO_SUBHN(N, M, SH) ((N - M) >> SH) -#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) - -DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) -DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) -DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) - -DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) -DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) -DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) - -DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) -DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) -DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) - -DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) -DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) -DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) - -DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) -DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) -DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) - -DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) -DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) -DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) - -DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) -DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) -DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) - -DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) -DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) -DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) - -#undef DO_RSUBHN -#undef DO_SUBHN -#undef DO_RADDHN -#undef DO_ADDHN - -#undef DO_BINOPNB - -/* Fully general four-operand expander, controlled by a predicate. 
- */ -#define DO_ZPZZZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ - void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - if (pg & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - TYPE aa = *(TYPE *)(va + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ - } \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ -} - -/* Similarly, specialized for 64-bit operands. */ -#define DO_ZPZZZ_D(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ - void *vg, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ - TYPE *d = vd, *a = va, *n = vn, *m = vm; \ - uint8_t *pg = vg; \ - for (i = 0; i < opr_sz; i += 1) { \ - if (pg[H1(i)] & 1) { \ - TYPE aa = a[i], nn = n[i], mm = m[i]; \ - d[i] = OP(aa, nn, mm); \ - } \ - } \ -} - -#define DO_MLA(A, N, M) (A + N * M) -#define DO_MLS(A, N, M) (A - N * M) - -DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) -DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) - -DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) -DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) - -DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) -DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) - -DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) -DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) - -#undef DO_MLA -#undef DO_MLS -#undef DO_ZPZZZ -#undef DO_ZPZZZ_D - -void HELPER(sve_index_b)(void *vd, uint32_t start, - uint32_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint8_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[H1(i)] = start + i * incr; - } -} - -void HELPER(sve_index_h)(void *vd, uint32_t start, - uint32_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 2; - uint16_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[H2(i)] = start + i * incr; - } -} - -void HELPER(sve_index_s)(void *vd, uint32_t start, - uint32_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[H4(i)] = start + i * incr; - } -} - -void HELPER(sve_index_d)(void *vd, uint64_t start, - uint64_t incr, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - for (i = 0; i < opr_sz; i += 1) { - d[i] = start + i * incr; - } -} - -void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t sh = simd_data(desc); - uint32_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + (m[i] << sh); - } -} - -void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t sh = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + (m[i] << sh); - } -} - -void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t sh = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); - } -} - -void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t sh = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); - } -} - -void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) -{ - /* These 
constants are cut-and-paste directly from the ARM pseudocode. */ - static const uint16_t coeff[] = { - 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, - 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, - 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, - 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / 2; - uint16_t *d = vd, *n = vn; - - for (i = 0; i < opr_sz; i++) { - uint16_t nn = n[i]; - intptr_t idx = extract32(nn, 0, 5); - uint16_t exp = extract32(nn, 5, 5); - d[i] = coeff[idx] | (exp << 10); - } -} - -void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) -{ - /* These constants are cut-and-paste directly from the ARM pseudocode. */ - static const uint32_t coeff[] = { - 0x000000, 0x0164d2, 0x02cd87, 0x043a29, - 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, - 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, - 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, - 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, - 0x1ef532, 0x20b051, 0x227043, 0x243516, - 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, - 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, - 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, - 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, - 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, - 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, - 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, - 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, - 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, - 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd, *n = vn; - - for (i = 0; i < opr_sz; i++) { - uint32_t nn = n[i]; - intptr_t idx = extract32(nn, 0, 6); - uint32_t exp = extract32(nn, 6, 8); - d[i] = coeff[idx] | (exp << 23); - } -} - -void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) -{ - /* These constants are cut-and-paste directly from the ARM pseudocode. 
*/ - static const uint64_t coeff[] = { - 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, - 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, - 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, - 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, - 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, - 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, - 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, - 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, - 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, - 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, - 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, - 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, - 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, - 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, - 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, - 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, - 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, - 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, - 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, - 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, - 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, - 0xFA7C1819E90D8ull, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - - for (i = 0; i < opr_sz; i++) { - uint64_t nn = n[i]; - intptr_t idx = extract32(nn, 0, 6); - uint64_t exp = extract32(nn, 6, 11); - d[i] = coeff[idx] | (exp << 52); - } -} - -void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 2; - uint16_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - uint16_t nn = n[i]; - uint16_t mm = m[i]; - if (mm & 1) { - nn = float16_one; - } - d[i] = nn ^ (mm & 2) << 14; - } -} - -void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - uint32_t nn = n[i]; - uint32_t mm = m[i]; - if (mm & 1) { - nn = float32_one; - } - d[i] = nn ^ (mm & 2) << 30; - } -} - -void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - if (mm & 1) { - nn = float64_one; - } - d[i] = nn ^ (mm & 2) << 62; - } -} - -/* - * Signed saturating addition with scalar operand. 
- */ - -void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int8_t)) { - *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); - } -} - -void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int16_t)) { - *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); - } -} - -void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int32_t)) { - *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); - } -} - -void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(int64_t)) { - *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); - } -} - -/* - * Unsigned saturating addition with scalar operand. - */ - -void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint8_t)) { - *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); - } -} - -void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint16_t)) { - *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); - } -} - -void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint32_t)) { - *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); - } -} - -void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint64_t)) { - *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); - } -} - -void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - - for (i = 0; i < oprsz; i += sizeof(uint64_t)) { - *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); - } -} - -/* Two operand predicated copy immediate with merge. All valid immediates - * can fit within 17 signed bits in the simd_data field. 
- */ -void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - mm = dup_const(MO_8, mm); - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t pp = expand_pred_b(pg[H1(i)]); - d[i] = (mm & pp) | (nn & ~pp); - } -} - -void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - mm = dup_const(MO_16, mm); - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t pp = expand_pred_h(pg[H1(i)]); - d[i] = (mm & pp) | (nn & ~pp); - } -} - -void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - mm = dup_const(MO_32, mm); - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - uint64_t pp = expand_pred_s(pg[H1(i)]); - d[i] = (mm & pp) | (nn & ~pp); - } -} - -void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, - uint64_t mm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i]; - d[i] = (pg[H1(i)] & 1 ? mm : nn); - } -} - -void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - val = dup_const(MO_8, val); - for (i = 0; i < opr_sz; i += 1) { - d[i] = val & expand_pred_b(pg[H1(i)]); - } -} - -void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - val = dup_const(MO_16, val); - for (i = 0; i < opr_sz; i += 1) { - d[i] = val & expand_pred_h(pg[H1(i)]); - } -} - -void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - val = dup_const(MO_32, val); - for (i = 0; i < opr_sz; i += 1) { - d[i] = val & expand_pred_s(pg[H1(i)]); - } -} - -void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = (pg[H1(i)] & 1 ? val : 0); - } -} - -/* Big-endian hosts need to frob the byte indices. If the copy - * happens to be 8-byte aligned, then no frobbing necessary. 
- */ -static void swap_memmove(void *vd, void *vs, size_t n) -{ - uintptr_t d = (uintptr_t)vd; - uintptr_t s = (uintptr_t)vs; - uintptr_t o = (d | s | n) & 7; - size_t i; - -#if !HOST_BIG_ENDIAN - o = 0; -#endif - switch (o) { - case 0: - memmove(vd, vs, n); - break; - - case 4: - if (d < s || d >= s + n) { - for (i = 0; i < n; i += 4) { - *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); - } - } else { - for (i = n; i > 0; ) { - i -= 4; - *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); - } - } - break; - - case 2: - case 6: - if (d < s || d >= s + n) { - for (i = 0; i < n; i += 2) { - *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); - } - } else { - for (i = n; i > 0; ) { - i -= 2; - *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); - } - } - break; - - default: - if (d < s || d >= s + n) { - for (i = 0; i < n; i++) { - *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); - } - } else { - for (i = n; i > 0; ) { - i -= 1; - *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); - } - } - break; - } -} - -/* Similarly for memset of 0. */ -static void swap_memzero(void *vd, size_t n) -{ - uintptr_t d = (uintptr_t)vd; - uintptr_t o = (d | n) & 7; - size_t i; - - /* Usually, the first bit of a predicate is set, so N is 0. */ - if (likely(n == 0)) { - return; - } - -#if !HOST_BIG_ENDIAN - o = 0; -#endif - switch (o) { - case 0: - memset(vd, 0, n); - break; - - case 4: - for (i = 0; i < n; i += 4) { - *(uint32_t *)H1_4(d + i) = 0; - } - break; - - case 2: - case 6: - for (i = 0; i < n; i += 2) { - *(uint16_t *)H1_2(d + i) = 0; - } - break; - - default: - for (i = 0; i < n; i++) { - *(uint8_t *)H1(d + i) = 0; - } - break; - } -} - -void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t opr_sz = simd_oprsz(desc); - size_t n_ofs = simd_data(desc); - size_t n_siz = opr_sz - n_ofs; - - if (vd != vm) { - swap_memmove(vd, vn + n_ofs, n_siz); - swap_memmove(vd + n_siz, vm, n_ofs); - } else if (vd != vn) { - swap_memmove(vd + n_siz, vd, n_ofs); - swap_memmove(vd, vn + n_ofs, n_siz); - } else { - /* vd == vn == vm. Need temp space. 
*/ - ARMVectorReg tmp; - swap_memmove(&tmp, vm, n_ofs); - swap_memmove(vd, vd + n_ofs, n_siz); - memcpy(vd + n_siz, &tmp, n_ofs); - } -} - -#define DO_INSR(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ - *(TYPE *)(vd + H(0)) = val; \ -} - -DO_INSR(sve_insr_b, uint8_t, H1) -DO_INSR(sve_insr_h, uint16_t, H1_2) -DO_INSR(sve_insr_s, uint32_t, H1_4) -DO_INSR(sve_insr_d, uint64_t, H1_8) - -#undef DO_INSR - -void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = bswap64(b); - *(uint64_t *)(vd + j) = bswap64(f); - } -} - -void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = hswap64(b); - *(uint64_t *)(vd + j) = hswap64(f); - } -} - -void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = rol64(b, 32); - *(uint64_t *)(vd + j) = rol64(f, 32); - } -} - -void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { - uint64_t f = *(uint64_t *)(vn + i); - uint64_t b = *(uint64_t *)(vn + j); - *(uint64_t *)(vd + i) = b; - *(uint64_t *)(vd + j) = f; - } -} - -typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); - -static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, - bool is_tbx, tb_impl_fn *fn) -{ - ARMVectorReg scratch; - uintptr_t oprsz = simd_oprsz(desc); - - if (unlikely(vd == vn)) { - vn = memcpy(&scratch, vn, oprsz); - } - - fn(vd, vn, NULL, vm, oprsz, is_tbx); -} - -static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, - uint32_t desc, bool is_tbx, tb_impl_fn *fn) -{ - ARMVectorReg scratch; - uintptr_t oprsz = simd_oprsz(desc); - - if (unlikely(vd == vn0)) { - vn0 = memcpy(&scratch, vn0, oprsz); - if (vd == vn1) { - vn1 = vn0; - } - } else if (unlikely(vd == vn1)) { - vn1 = memcpy(&scratch, vn1, oprsz); - } - - fn(vd, vn0, vn1, vm, oprsz, is_tbx); -} - -#define DO_TB(SUFF, TYPE, H) \ -static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ - void *vm, uintptr_t oprsz, bool is_tbx) \ -{ \ - TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ - uintptr_t i, nelem = oprsz / sizeof(TYPE); \ - for (i = 0; i < nelem; ++i) { \ - TYPE index = indexes[H1(i)], val = 0; \ - if (index < nelem) { \ - val = tbl0[H(index)]; \ - } else { \ - index -= nelem; \ - if (tbl1 && index < nelem) { \ - val = tbl1[H(index)]; \ - } else if (is_tbx) { \ - continue; \ - } \ - } \ - d[H(i)] = val; \ - } \ -} \ -void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ -} \ -void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ - void *vm, uint32_t desc) \ -{ \ - do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ -} \ -void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, 
uint32_t desc) \ -{ \ - do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ -} - -DO_TB(b, uint8_t, H1) -DO_TB(h, uint16_t, H2) -DO_TB(s, uint32_t, H4) -DO_TB(d, uint64_t, H8) - -#undef DO_TB - -#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPED *d = vd; \ - TYPES *n = vn; \ - ARMVectorReg tmp; \ - if (unlikely(vn - vd < opr_sz)) { \ - n = memcpy(&tmp, n, opr_sz / 2); \ - } \ - for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ - d[HD(i)] = n[HS(i)]; \ - } \ -} - -DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) -DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) -DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) - -DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) -DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) -DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) - -#undef DO_UNPK - -/* Mask of bits included in the even numbered predicates of width esz. - * We also use this for expand_bits/compress_bits, and so extend the - * same pattern out to 16-bit units. - */ -static const uint64_t even_bit_esz_masks[5] = { - 0x5555555555555555ull, - 0x3333333333333333ull, - 0x0f0f0f0f0f0f0f0full, - 0x00ff00ff00ff00ffull, - 0x0000ffff0000ffffull, -}; - -/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. - * For N==0, this corresponds to the operation that in qemu/bitops.h - * we call half_shuffle64; this algorithm is from Hacker's Delight, - * section 7-2 Shuffling Bits. - */ -static uint64_t expand_bits(uint64_t x, int n) -{ - int i; - - x &= 0xffffffffu; - for (i = 4; i >= n; i--) { - int sh = 1 << i; - x = ((x << sh) | x) & even_bit_esz_masks[i]; - } - return x; -} - -/* Compress units of 2**(N+1) bits to units of 2**N bits. - * For N==0, this corresponds to the operation that in qemu/bitops.h - * we call half_unshuffle64; this algorithm is from Hacker's Delight, - * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. - */ -static uint64_t compress_bits(uint64_t x, int n) -{ - int i; - - for (i = n; i <= 4; i++) { - int sh = 1 << i; - x &= even_bit_esz_masks[i]; - x = (x >> sh) | x; - } - return x & 0xffffffffu; -} - -void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); - int esize = 1 << esz; - uint64_t *d = vd; - intptr_t i; - - if (oprsz <= 8) { - uint64_t nn = *(uint64_t *)vn; - uint64_t mm = *(uint64_t *)vm; - int half = 4 * oprsz; - - nn = extract64(nn, high * half, half); - mm = extract64(mm, high * half, half); - nn = expand_bits(nn, esz); - mm = expand_bits(mm, esz); - d[0] = nn | (mm << esize); - } else { - ARMPredicateReg tmp; - - /* We produce output faster than we consume input. - Therefore we must be mindful of possible overlap. 
*/ - if (vd == vn) { - vn = memcpy(&tmp, vn, oprsz); - if (vd == vm) { - vm = vn; - } - } else if (vd == vm) { - vm = memcpy(&tmp, vm, oprsz); - } - if (high) { - high = oprsz >> 1; - } - - if ((oprsz & 7) == 0) { - uint32_t *n = vn, *m = vm; - high >>= 2; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[H4(high + i)]; - uint64_t mm = m[H4(high + i)]; - - nn = expand_bits(nn, esz); - mm = expand_bits(mm, esz); - d[i] = nn | (mm << esize); - } - } else { - uint8_t *n = vn, *m = vm; - uint16_t *d16 = vd; - - for (i = 0; i < oprsz / 2; i++) { - uint16_t nn = n[H1(high + i)]; - uint16_t mm = m[H1(high + i)]; - - nn = expand_bits(nn, esz); - mm = expand_bits(mm, esz); - d16[H2(i)] = nn | (mm << esize); - } - } - } -} - -void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t l, h; - intptr_t i; - - if (oprsz <= 8) { - l = compress_bits(n[0] >> odd, esz); - h = compress_bits(m[0] >> odd, esz); - d[0] = l | (h << (4 * oprsz)); - } else { - ARMPredicateReg tmp_m; - intptr_t oprsz_16 = oprsz / 16; - - if ((vm - vd) < (uintptr_t)oprsz) { - m = memcpy(&tmp_m, vm, oprsz); - } - - for (i = 0; i < oprsz_16; i++) { - l = n[2 * i + 0]; - h = n[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - d[i] = l | (h << 32); - } - - /* - * For VL which is not a multiple of 512, the results from M do not - * align nicely with the uint64_t for D. Put the aligned results - * from M into TMP_M and then copy it into place afterward. - */ - if (oprsz & 15) { - int final_shift = (oprsz & 15) * 2; - - l = n[2 * i + 0]; - h = n[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - d[i] = l | (h << final_shift); - - for (i = 0; i < oprsz_16; i++) { - l = m[2 * i + 0]; - h = m[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - tmp_m.p[i] = l | (h << 32); - } - l = m[2 * i + 0]; - h = m[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - tmp_m.p[i] = l | (h << final_shift); - - swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); - } else { - for (i = 0; i < oprsz_16; i++) { - l = m[2 * i + 0]; - h = m[2 * i + 1]; - l = compress_bits(l >> odd, esz); - h = compress_bits(h >> odd, esz); - d[oprsz_16 + i] = l | (h << 32); - } - } - } -} - -void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t mask; - int shr, shl; - intptr_t i; - - shl = 1 << esz; - shr = 0; - mask = even_bit_esz_masks[esz]; - if (odd) { - mask <<= shl; - shr = shl; - shl = 0; - } - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { - uint64_t nn = (n[i] & mask) >> shr; - uint64_t mm = (m[i] & mask) << shl; - d[i] = nn + mm; - } -} - -/* Reverse units of 2**N bits. 
*/ -static uint64_t reverse_bits_64(uint64_t x, int n) -{ - int i, sh; - - x = bswap64(x); - for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { - uint64_t mask = even_bit_esz_masks[i]; - x = ((x & mask) << sh) | ((x >> sh) & mask); - } - return x; -} - -static uint8_t reverse_bits_8(uint8_t x, int n) -{ - static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; - int i, sh; - - for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { - x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); - } - return x; -} - -void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - intptr_t i, oprsz_2 = oprsz / 2; - - if (oprsz <= 8) { - uint64_t l = *(uint64_t *)vn; - l = reverse_bits_64(l << (64 - 8 * oprsz), esz); - *(uint64_t *)vd = l; - } else if ((oprsz & 15) == 0) { - for (i = 0; i < oprsz_2; i += 8) { - intptr_t ih = oprsz - 8 - i; - uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); - uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); - *(uint64_t *)(vd + i) = h; - *(uint64_t *)(vd + ih) = l; - } - } else { - for (i = 0; i < oprsz_2; i += 1) { - intptr_t il = H1(i); - intptr_t ih = H1(oprsz - 1 - i); - uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); - uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); - *(uint8_t *)(vd + il) = h; - *(uint8_t *)(vd + ih) = l; - } - } -} - -void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); - uint64_t *d = vd; - intptr_t i; - - if (oprsz <= 8) { - uint64_t nn = *(uint64_t *)vn; - int half = 4 * oprsz; - - nn = extract64(nn, high * half, half); - nn = expand_bits(nn, 0); - d[0] = nn; - } else { - ARMPredicateReg tmp_n; - - /* We produce output faster than we consume input. - Therefore we must be mindful of possible overlap. */ - if ((vn - vd) < (uintptr_t)oprsz) { - vn = memcpy(&tmp_n, vn, oprsz); - } - if (high) { - high = oprsz >> 1; - } - - if ((oprsz & 7) == 0) { - uint32_t *n = vn; - high >>= 2; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[H4(high + i)]; - d[i] = expand_bits(nn, 0); - } - } else { - uint16_t *d16 = vd; - uint8_t *n = vn; - - for (i = 0; i < oprsz / 2; i++) { - uint16_t nn = n[H1(high + i)]; - d16[H2(i)] = expand_bits(nn, 0); - } - } - } -} - -#define DO_ZIP(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc); \ - intptr_t odd_ofs = simd_data(desc); \ - intptr_t i, oprsz_2 = oprsz / 2; \ - ARMVectorReg tmp_n, tmp_m; \ - /* We produce output faster than we consume input. \ - Therefore we must be mindful of possible overlap. 
*/ \ - if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ - vn = memcpy(&tmp_n, vn, oprsz); \ - } \ - if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ - vm = memcpy(&tmp_m, vm, oprsz); \ - } \ - for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ - *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ - *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ - *(TYPE *)(vm + odd_ofs + H(i)); \ - } \ - if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ - memset(vd + oprsz - 16, 0, 16); \ - } \ -} - -DO_ZIP(sve_zip_b, uint8_t, H1) -DO_ZIP(sve_zip_h, uint16_t, H1_2) -DO_ZIP(sve_zip_s, uint32_t, H1_4) -DO_ZIP(sve_zip_d, uint64_t, H1_8) -DO_ZIP(sve2_zip_q, Int128, ) - -#define DO_UZP(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc); \ - intptr_t odd_ofs = simd_data(desc); \ - intptr_t i, p; \ - ARMVectorReg tmp_m; \ - if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ - vm = memcpy(&tmp_m, vm, oprsz); \ - } \ - i = 0, p = odd_ofs; \ - do { \ - *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ - i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ - } while (p < oprsz); \ - p -= oprsz; \ - do { \ - *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ - i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ - } while (p < oprsz); \ - tcg_debug_assert(i == oprsz); \ -} - -DO_UZP(sve_uzp_b, uint8_t, H1) -DO_UZP(sve_uzp_h, uint16_t, H1_2) -DO_UZP(sve_uzp_s, uint32_t, H1_4) -DO_UZP(sve_uzp_d, uint64_t, H1_8) -DO_UZP(sve2_uzp_q, Int128, ) - -#define DO_TRN(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t oprsz = simd_oprsz(desc); \ - intptr_t odd_ofs = simd_data(desc); \ - intptr_t i; \ - for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ - TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ - TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ - *(TYPE *)(vd + H(i + 0)) = ae; \ - *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ - } \ - if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ - memset(vd + oprsz - 16, 0, 16); \ - } \ -} - -DO_TRN(sve_trn_b, uint8_t, H1) -DO_TRN(sve_trn_h, uint16_t, H1_2) -DO_TRN(sve_trn_s, uint32_t, H1_4) -DO_TRN(sve_trn_d, uint64_t, H1_8) -DO_TRN(sve2_trn_q, Int128, ) - -#undef DO_ZIP -#undef DO_UZP -#undef DO_TRN - -void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; - uint32_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = j = 0; i < opr_sz; i++) { - if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { - d[H4(j)] = n[H4(i)]; - j++; - } - } - for (; j < opr_sz; j++) { - d[H4(j)] = 0; - } -} - -void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn; - uint8_t *pg = vg; - - for (i = j = 0; i < opr_sz; i++) { - if (pg[H1(i)] & 1) { - d[j] = n[i]; - j++; - } - } - for (; j < opr_sz; j++) { - d[j] = 0; - } -} - -/* Similar to the ARM LastActiveElement pseudocode function, except the - * result is multiplied by the element size. This includes the not found - * indication; e.g. not found for esz=3 is -8. 
- */ -int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - - return last_active_element(vg, words, esz); -} - -void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) -{ - intptr_t opr_sz = simd_oprsz(desc) / 8; - int esz = simd_data(desc); - uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; - intptr_t i, first_i, last_i; - ARMVectorReg tmp; - - first_i = last_i = 0; - first_g = last_g = 0; - - /* Find the extent of the active elements within VG. */ - for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { - pg = *(uint64_t *)(vg + i) & mask; - if (pg) { - if (last_g == 0) { - last_g = pg; - last_i = i; - } - first_g = pg; - first_i = i; - } - } - - len = 0; - if (first_g != 0) { - first_i = first_i * 8 + ctz64(first_g); - last_i = last_i * 8 + 63 - clz64(last_g); - len = last_i - first_i + (1 << esz); - if (vd == vm) { - vm = memcpy(&tmp, vm, opr_sz * 8); - } - swap_memmove(vd, vn + first_i, len); - } - swap_memmove(vd + len, vm, opr_sz * 8 - len); -} - -void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - uint64_t pp = expand_pred_b(pg[H1(i)]); - d[i] = (nn & pp) | (mm & ~pp); - } -} - -void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - uint64_t pp = expand_pred_h(pg[H1(i)]); - d[i] = (nn & pp) | (mm & ~pp); - } -} - -void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - uint64_t pp = expand_pred_s(pg[H1(i)]); - d[i] = (nn & pp) | (mm & ~pp); - } -} - -void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - uint64_t nn = n[i], mm = m[i]; - d[i] = (pg[H1(i)] & 1 ? nn : mm); - } -} - -void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, - void *vg, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 16; - Int128 *d = vd, *n = vn, *m = vm; - uint16_t *pg = vg; - - for (i = 0; i < opr_sz; i += 1) { - d[i] = (pg[H2(i)] & 1 ? n : m)[i]; - } -} - -/* Two operand comparison controlled by a predicate. - * ??? It is very tempting to want to be able to expand this inline - * with x86 instructions, e.g. - * - * vcmpeqw zm, zn, %ymm0 - * vpmovmskb %ymm0, %eax - * and $0x5555, %eax - * and pg, %eax - * - * or even aarch64, e.g. - * - * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 - * cmeq v0.8h, zn, zm - * and v0.8h, v0.8h, mask - * addv h0, v0.8h - * and v0.8b, pg - * - * However, coming up with an abstraction that allows vector inputs and - * a scalar output, and also handles the byte-ordering of sub-uint64_t - * scalar outputs, is tricky. 
- */ -#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - uint32_t flags = PREDTEST_INIT; \ - intptr_t i = opr_sz; \ - do { \ - uint64_t out = 0, pg; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - out |= nn OP mm; \ - } while (i & 63); \ - pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ - out &= pg; \ - *(uint64_t *)(vd + (i >> 3)) = out; \ - flags = iter_predtest_bwd(out, pg, flags); \ - } while (i > 0); \ - return flags; \ -} - -#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) -#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) -#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) -#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ - DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) - -DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) -DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) -DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) -DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) - -DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) -DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) -DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) -DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) - -DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) -DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) -DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) -DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) - -DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) -DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) -DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) -DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) - -DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) -DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) -DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) -DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) - -DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) -DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) -DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) -DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) - -#undef DO_CMP_PPZZ_B -#undef DO_CMP_PPZZ_H -#undef DO_CMP_PPZZ_S -#undef DO_CMP_PPZZ_D -#undef DO_CMP_PPZZ - -/* Similar, but the second source is "wide". 
*/ -#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - uint32_t flags = PREDTEST_INIT; \ - intptr_t i = opr_sz; \ - do { \ - uint64_t out = 0, pg; \ - do { \ - TYPEW mm = *(TYPEW *)(vm + i - 8); \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - out |= nn OP mm; \ - } while (i & 7); \ - } while (i & 63); \ - pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ - out &= pg; \ - *(uint64_t *)(vd + (i >> 3)) = out; \ - flags = iter_predtest_bwd(out, pg, flags); \ - } while (i > 0); \ - return flags; \ -} - -#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ - DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) -#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ - DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) -#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ - DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) - -DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) -DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) -DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) - -DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) -DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) -DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) - -DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) -DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) -DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) - -DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) -DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) -DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) - -DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) -DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) -DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) - -DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) -DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) -DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) - -DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) -DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) -DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) - -DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) -DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) -DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) - -DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) -DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) -DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) - -DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) -DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) -DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) - -#undef DO_CMP_PPZW_B -#undef DO_CMP_PPZW_H -#undef DO_CMP_PPZW_S -#undef DO_CMP_PPZW - -/* Similar, but the second source is immediate. 
*/ -#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ -{ \ - intptr_t opr_sz = simd_oprsz(desc); \ - uint32_t flags = PREDTEST_INIT; \ - TYPE mm = simd_data(desc); \ - intptr_t i = opr_sz; \ - do { \ - uint64_t out = 0, pg; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - out |= nn OP mm; \ - } while (i & 63); \ - pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ - out &= pg; \ - *(uint64_t *)(vd + (i >> 3)) = out; \ - flags = iter_predtest_bwd(out, pg, flags); \ - } while (i > 0); \ - return flags; \ -} - -#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) -#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) -#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) -#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ - DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) - -DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) -DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) -DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) -DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) - -DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) -DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) -DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) -DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) - -DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) -DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) -DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) -DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) - -DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) -DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) -DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) -DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) - -DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) -DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) -DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) -DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) - -DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) -DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) -DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) -DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) - -DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) -DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) -DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) -DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) - -DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) -DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) -DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) -DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) - -DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) -DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) -DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) -DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) - -DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) -DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) -DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) -DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) - -#undef DO_CMP_PPZI_B -#undef DO_CMP_PPZI_H -#undef DO_CMP_PPZI_S -#undef DO_CMP_PPZI_D -#undef DO_CMP_PPZI - -/* Similar to the ARM LastActive pseudocode function. */ -static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) -{ - intptr_t i; - - for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { - uint64_t pg = *(uint64_t *)(vg + i); - if (pg) { - return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; - } - } - return 0; -} - -/* Compute a mask into RETB that is true for all G, up to and including - * (if after) or excluding (if !after) the first G & N. - * Return true if BRK found. 
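A sketch of the LastActive test used by last_active_pred above: isolate the highest set bit of the last non-zero guard word (QEMU's pow2floor) and test that bit in the destination predicate. The compiler builtin below merely stands in for pow2floor and assumes GCC/Clang:

#include <stdint.h>

static int last_active_ref(uint64_t d, uint64_t g)
{
    uint64_t msb;

    if (g == 0) {
        return 0;
    }
    msb = 1ull << (63 - __builtin_clzll(g));   /* stands in for pow2floor(g) */
    return (d & msb) != 0;
}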
- */ -static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, - bool brk, bool after) -{ - uint64_t b; - - if (brk) { - b = 0; - } else if ((g & n) == 0) { - /* For all G, no N are set; break not found. */ - b = g; - } else { - /* Break somewhere in N. Locate it. */ - b = g & n; /* guard true, pred true */ - b = b & -b; /* first such */ - if (after) { - b = b | (b - 1); /* break after same */ - } else { - b = b - 1; /* break before same */ - } - brk = true; - } - - *retb = b; - return brk; -} - -/* Compute a zeroing BRK. */ -static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - bool brk = false; - intptr_t i; - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { - uint64_t this_b, this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = this_b & this_g; - } -} - -/* Likewise, but also compute flags. */ -static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - uint32_t flags = PREDTEST_INIT; - bool brk = false; - intptr_t i; - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { - uint64_t this_b, this_d, this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = this_d = this_b & this_g; - flags = iter_predtest_fwd(this_d, this_g, flags); - } - return flags; -} - -/* Compute a merging BRK. */ -static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - bool brk = false; - intptr_t i; - - for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { - uint64_t this_b, this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = (this_b & this_g) | (d[i] & ~this_g); - } -} - -/* Likewise, but also compute flags. */ -static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, - intptr_t oprsz, bool after) -{ - uint32_t flags = PREDTEST_INIT; - bool brk = false; - intptr_t i; - - for (i = 0; i < oprsz / 8; ++i) { - uint64_t this_b, this_d = d[i], this_g = g[i]; - - brk = compute_brk(&this_b, n[i], this_g, brk, after); - d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); - flags = iter_predtest_fwd(this_d, this_g, flags); - } - return flags; -} - -static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) -{ - /* It is quicker to zero the whole predicate than loop on OPRSZ. - * The compiler should turn this into 4 64-bit integer stores. 
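The bit tricks in compute_brk are easiest to see on a worked chunk: b & -b keeps only the lowest guard-and-true bit, b | (b - 1) produces ones up to and including it (break "after"), and b - 1 produces ones strictly before it (break "before"). A small self-contained demonstration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t g = 0x00ff, n = 0x0050;                 /* first break at bit 4 */
    uint64_t b = g & n;                              /* guard true AND pred true */

    b &= -b;                                         /* lowest such bit: 0x10 */
    printf("after:  %#llx\n", (unsigned long long)(b | (b - 1)));   /* 0x1f */
    printf("before: %#llx\n", (unsigned long long)(b - 1));         /* 0xf */
    return 0;
}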
- */ - memset(d, 0, sizeof(ARMPredicateReg)); - return PREDTEST_INIT; -} - -void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - compute_brk_z(vd, vm, vg, oprsz, true); - } else { - do_zero(vd, oprsz); - } -} - -uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - return compute_brks_z(vd, vm, vg, oprsz, true); - } else { - return do_zero(vd, oprsz); - } -} - -void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - compute_brk_z(vd, vm, vg, oprsz, false); - } else { - do_zero(vd, oprsz); - } -} - -uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, - uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - return compute_brks_z(vd, vm, vg, oprsz, false); - } else { - return do_zero(vd, oprsz); - } -} - -void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_z(vd, vn, vg, oprsz, true); -} - -uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_z(vd, vn, vg, oprsz, true); -} - -void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_z(vd, vn, vg, oprsz, false); -} - -uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_z(vd, vn, vg, oprsz, false); -} - -void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_m(vd, vn, vg, oprsz, true); -} - -uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_m(vd, vn, vg, oprsz, true); -} - -void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - compute_brk_m(vd, vn, vg, oprsz, false); -} - -uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - return compute_brks_m(vd, vn, vg, oprsz, false); -} - -void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (!last_active_pred(vn, vg, oprsz)) { - do_zero(vd, oprsz); - } -} - -/* As if PredTest(Ones(PL), D, esz). 
*/ -static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, - uint64_t esz_mask) -{ - uint32_t flags = PREDTEST_INIT; - intptr_t i; - - for (i = 0; i < oprsz / 8; i++) { - flags = iter_predtest_fwd(d->p[i], esz_mask, flags); - } - if (oprsz & 7) { - uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); - flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); - } - return flags; -} - -uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - if (last_active_pred(vn, vg, oprsz)) { - return predtest_ones(vd, oprsz, -1); - } else { - return do_zero(vd, oprsz); - } -} - -uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) -{ - intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; - intptr_t i; - - for (i = 0; i < words; ++i) { - uint64_t t = n[i] & g[i] & mask; - sum += ctpop64(t); - } - return sum; -} - -uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t esz_mask = pred_esz_masks[esz]; - ARMPredicateReg *d = vd; - uint32_t flags; - intptr_t i; - - /* Begin with a zero predicate register. */ - flags = do_zero(d, oprsz); - if (count == 0) { - return flags; - } - - /* Set all of the requested bits. */ - for (i = 0; i < count / 64; ++i) { - d->p[i] = esz_mask; - } - if (count & 63) { - d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; - } - - return predtest_ones(d, oprsz, esz_mask); -} - -uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) -{ - intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); - intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); - uint64_t esz_mask = pred_esz_masks[esz]; - ARMPredicateReg *d = vd; - intptr_t i, invcount, oprbits; - uint64_t bits; - - if (count == 0) { - return do_zero(d, oprsz); - } - - oprbits = oprsz * 8; - tcg_debug_assert(count <= oprbits); - - bits = esz_mask; - if (oprbits & 63) { - bits &= MAKE_64BIT_MASK(0, oprbits & 63); - } - - invcount = oprbits - count; - for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { - d->p[i] = bits; - bits = esz_mask; - } - - d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); - - while (--i >= 0) { - d->p[i] = 0; - } - - return predtest_ones(d, oprsz, esz_mask); -} - -/* Recursive reduction on a function; - * C.f. the ARM ARM function ReducePredicated. - * - * While it would be possible to write this without the DATA temporary, - * it is much simpler to process the predicate register this way. - * The recursion is bounded to depth 7 (128 fp16 elements), so there's - * little to gain with a more complex non-recursive form. 
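A minimal sketch of the pairwise reduction shape that DO_REDUCE expands to, on plain floats and without float_status, just to show the log2(n) recursion depth:

#include <stddef.h>
#include <stdio.h>

static float reduce_add(const float *data, size_t n)
{
    if (n == 1) {
        return data[0];
    }
    size_t half = n / 2;
    return reduce_add(data, half) + reduce_add(data + half, half);
}

int main(void)
{
    float v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

    printf("%g\n", reduce_add(v, 8));    /* 36 */
    return 0;
}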
- */ -#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ -static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ -{ \ - if (n == 1) { \ - return *data; \ - } else { \ - uintptr_t half = n / 2; \ - TYPE lo = NAME##_reduce(data, status, half); \ - TYPE hi = NAME##_reduce(data + half, status, half); \ - return TYPE##_##FUNC(lo, hi, status); \ - } \ -} \ -uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ -{ \ - uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ - TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ - for (i = 0; i < oprsz; ) { \ - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ - do { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ - i += sizeof(TYPE), pg >>= sizeof(TYPE); \ - } while (i & 15); \ - } \ - for (; i < maxsz; i += sizeof(TYPE)) { \ - *(TYPE *)((void *)data + i) = IDENT; \ - } \ - return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ -} - -DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) -DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) -DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) - -/* Identity is floatN_default_nan, without the function call. */ -DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) -DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) -DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) - -DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) -DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) -DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) - -DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) -DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) -DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) - -DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) -DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) -DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) - -#undef DO_REDUCE - -uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, - void *status, uint32_t desc) -{ - intptr_t i = 0, opr_sz = simd_oprsz(desc); - float16 result = nn; - - do { - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); - do { - if (pg & 1) { - float16 mm = *(float16 *)(vm + H1_2(i)); - result = float16_add(result, mm, status); - } - i += sizeof(float16), pg >>= sizeof(float16); - } while (i & 15); - } while (i < opr_sz); - - return result; -} - -uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, - void *status, uint32_t desc) -{ - intptr_t i = 0, opr_sz = simd_oprsz(desc); - float32 result = nn; - - do { - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); - do { - if (pg & 1) { - float32 mm = *(float32 *)(vm + H1_2(i)); - result = float32_add(result, mm, status); - } - i += sizeof(float32), pg >>= sizeof(float32); - } while (i & 15); - } while (i < opr_sz); - - return result; -} - -uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, - void *status, uint32_t desc) -{ - intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; - uint64_t *m = vm; - uint8_t *pg = vg; - - for (i = 0; i < opr_sz; i++) { - if (pg[H1(i)] & 1) { - nn = float64_add(nn, m[i], status); - } - } - - return nn; -} - -/* Fully general three-operand expander, controlled by a predicate, - * With the extra float_status parameter. 
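By contrast, FADDA (sve_fadda_* above) is an ordered accumulation: elements are folded in strictly left-to-right order, so with rounding it is not interchangeable with the pairwise FADDV tree. A minimal unpredicated sketch of the ordering:

#include <stddef.h>

static double fadda_ref(double acc, const double *m, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        acc += m[i];             /* strictly in element order */
    }
    return acc;
}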
- */ -#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) -DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) -DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) - -DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) -DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) -DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) - -DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) -DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) -DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) - -DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) -DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) -DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) - -DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) -DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) -DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) - -DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) -DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) -DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) - -DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) -DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) -DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) - -DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) -DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) -DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) - -static inline float16 abd_h(float16 a, float16 b, float_status *s) -{ - return float16_abs(float16_sub(a, b, s)); -} - -static inline float32 abd_s(float32 a, float32 b, float_status *s) -{ - return float32_abs(float32_sub(a, b, s)); -} - -static inline float64 abd_d(float64 a, float64 b, float_status *s) -{ - return float64_abs(float64_sub(a, b, s)); -} - -DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) -DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) -DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) - -static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) -{ - int b_int = MIN(MAX(b, INT_MIN), INT_MAX); - return float64_scalbn(a, b_int, s); -} - -DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) -DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) -DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) - -DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) -DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) -DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) - -#undef DO_ZPZZ_FP - -/* Three-operand expander, with one scalar operand, controlled by - * a predicate, with the extra float_status parameter. 
- */ -#define DO_ZPZS_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - TYPE mm = scalar; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) -DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) -DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) - -DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) -DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) -DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) - -DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) -DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) -DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) - -static inline float16 subr_h(float16 a, float16 b, float_status *s) -{ - return float16_sub(b, a, s); -} - -static inline float32 subr_s(float32 a, float32 b, float_status *s) -{ - return float32_sub(b, a, s); -} - -static inline float64 subr_d(float64 a, float64 b, float_status *s) -{ - return float64_sub(b, a, s); -} - -DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) -DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) -DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) - -DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) -DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) -DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) - -DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) -DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) -DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) - -DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) -DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) -DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) - -DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) -DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) -DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) - -/* Fully general two-operand expander, controlled by a predicate, - * With the extra float_status parameter. - */ -#define DO_ZPZ_FP(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - *(TYPE *)(vd + H(i)) = OP(nn, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore - * FZ16. When converting from fp16, this affects flushing input denormals; - * when converting to fp16, this affects flushing output denormals. 
- */ -static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) -{ - bool save = get_flush_inputs_to_zero(fpst); - float32 ret; - - set_flush_inputs_to_zero(false, fpst); - ret = float16_to_float32(f, true, fpst); - set_flush_inputs_to_zero(save, fpst); - return ret; -} - -static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) -{ - bool save = get_flush_inputs_to_zero(fpst); - float64 ret; - - set_flush_inputs_to_zero(false, fpst); - ret = float16_to_float64(f, true, fpst); - set_flush_inputs_to_zero(save, fpst); - return ret; -} - -static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) -{ - bool save = get_flush_to_zero(fpst); - float16 ret; - - set_flush_to_zero(false, fpst); - ret = float32_to_float16(f, true, fpst); - set_flush_to_zero(save, fpst); - return ret; -} - -static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) -{ - bool save = get_flush_to_zero(fpst); - float16 ret; - - set_flush_to_zero(false, fpst); - ret = float64_to_float16(f, true, fpst); - set_flush_to_zero(save, fpst); - return ret; -} - -static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_int16_round_to_zero(f, s); -} - -static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_int64_round_to_zero(f, s); -} - -static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) -{ - if (float32_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float32_to_int64_round_to_zero(f, s); -} - -static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) -{ - if (float64_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float64_to_int64_round_to_zero(f, s); -} - -static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_uint16_round_to_zero(f, s); -} - -static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) -{ - if (float16_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float16_to_uint64_round_to_zero(f, s); -} - -static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) -{ - if (float32_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float32_to_uint64_round_to_zero(f, s); -} - -static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) -{ - if (float64_is_any_nan(f)) { - float_raise(float_flag_invalid, s); - return 0; - } - return float64_to_uint64_round_to_zero(f, s); -} - -DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) -DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) -DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) -DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) -DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) -DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) -DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) - -DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) -DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) -DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) -DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) -DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 
-DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) -DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) - -DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) -DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) -DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) -DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) -DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) -DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) -DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) - -DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) -DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) -DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) - -DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) -DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) -DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) - -DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) -DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) -DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) - -DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) -DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) -DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) - -DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) -DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) -DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) -DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) -DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) -DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) -DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) - -DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) -DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) -DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) -DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) -DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) -DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) -DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) - -static int16_t do_float16_logb_as_int(float16 a, float_status *s) -{ - /* Extract frac to the top of the uint32_t. */ - uint32_t frac = (uint32_t)a << (16 + 6); - int16_t exp = extract32(a, 10, 5); - - if (unlikely(exp == 0)) { - if (frac != 0) { - if (!get_flush_inputs_to_zero(s)) { - /* denormal: bias - fractional_zeros */ - return -15 - clz32(frac); - } - /* flush to zero */ - float_raise(float_flag_input_denormal, s); - } - } else if (unlikely(exp == 0x1f)) { - if (frac == 0) { - return INT16_MAX; /* infinity */ - } - } else { - /* normal: exp - bias */ - return exp - 15; - } - /* nan or zero */ - float_raise(float_flag_invalid, s); - return INT16_MIN; -} - -static int32_t do_float32_logb_as_int(float32 a, float_status *s) -{ - /* Extract frac to the top of the uint32_t. 
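The FLOGB helpers return the unbiased exponent as a signed integer of the same width, with denormal, zero, infinity and NaN handled explicitly. A worked float32 sketch of the normal-number case only, using plain bit extraction:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int32_t logb_as_int_ref(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof(bits));
    return (int32_t)((bits >> 23) & 0xff) - 127;     /* normal numbers only */
}

int main(void)
{
    printf("%d %d %d\n", logb_as_int_ref(1.0f),      /* 0 */
           logb_as_int_ref(6.0f),                    /* 2 */
           logb_as_int_ref(0.25f));                  /* -2 */
    return 0;
}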
*/ - uint32_t frac = a << 9; - int32_t exp = extract32(a, 23, 8); - - if (unlikely(exp == 0)) { - if (frac != 0) { - if (!get_flush_inputs_to_zero(s)) { - /* denormal: bias - fractional_zeros */ - return -127 - clz32(frac); - } - /* flush to zero */ - float_raise(float_flag_input_denormal, s); - } - } else if (unlikely(exp == 0xff)) { - if (frac == 0) { - return INT32_MAX; /* infinity */ - } - } else { - /* normal: exp - bias */ - return exp - 127; - } - /* nan or zero */ - float_raise(float_flag_invalid, s); - return INT32_MIN; -} - -static int64_t do_float64_logb_as_int(float64 a, float_status *s) -{ - /* Extract frac to the top of the uint64_t. */ - uint64_t frac = a << 12; - int64_t exp = extract64(a, 52, 11); - - if (unlikely(exp == 0)) { - if (frac != 0) { - if (!get_flush_inputs_to_zero(s)) { - /* denormal: bias - fractional_zeros */ - return -1023 - clz64(frac); - } - /* flush to zero */ - float_raise(float_flag_input_denormal, s); - } - } else if (unlikely(exp == 0x7ff)) { - if (frac == 0) { - return INT64_MAX; /* infinity */ - } - } else { - /* normal: exp - bias */ - return exp - 1023; - } - /* nan or zero */ - float_raise(float_flag_invalid, s); - return INT64_MIN; -} - -DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) -DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) -DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) - -#undef DO_ZPZ_FP - -static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, - float_status *status, uint32_t desc, - uint16_t neg1, uint16_t neg3) -{ - intptr_t i = simd_oprsz(desc); - uint64_t *g = vg; - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - i -= 2; - if (likely((pg >> (i & 63)) & 1)) { - float16 e1, e2, e3, r; - - e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; - e2 = *(uint16_t *)(vm + H1_2(i)); - e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; - r = float16_muladd(e1, e2, e3, 0, status); - *(uint16_t *)(vd + H1_2(i)) = r; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); -} - -void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); -} - -void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); -} - -void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); -} - -static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, - float_status *status, uint32_t desc, - uint32_t neg1, uint32_t neg3) -{ - intptr_t i = simd_oprsz(desc); - uint64_t *g = vg; - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - i -= 4; - if (likely((pg >> (i & 63)) & 1)) { - float32 e1, e2, e3, r; - - e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; - e2 = *(uint32_t *)(vm + H1_4(i)); - e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; - r = float32_muladd(e1, e2, e3, 0, status); - *(uint32_t *)(vd + H1_4(i)) = r; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); -} - -void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - 
void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); -} - -void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); -} - -void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); -} - -static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, - float_status *status, uint32_t desc, - uint64_t neg1, uint64_t neg3) -{ - intptr_t i = simd_oprsz(desc); - uint64_t *g = vg; - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - i -= 8; - if (likely((pg >> (i & 63)) & 1)) { - float64 e1, e2, e3, r; - - e1 = *(uint64_t *)(vn + i) ^ neg1; - e2 = *(uint64_t *)(vm + i); - e3 = *(uint64_t *)(va + i) ^ neg3; - r = float64_muladd(e1, e2, e3, 0, status); - *(uint64_t *)(vd + i) = r; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); -} - -void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); -} - -void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); -} - -void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); -} - -/* Two operand floating-point comparison controlled by a predicate. - * Unlike the integer version, we are not allowed to optimistically - * compare operands, since the comparison may have side effects wrt - * the FPSR. 
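The neg1/neg3 parameters of do_fmla_zpzzz_* are just the sign bit of the format (0x8000, 0x80000000, INT64_MIN): XOR-ing an IEEE value with its sign bit negates it, including zeros and infinities, which is all that FMLS/FNMLA/FNMLS need before the fused multiply-add. A minimal sketch for float64:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    double x = 3.5;
    uint64_t bits;

    memcpy(&bits, &x, sizeof(bits));
    bits ^= 1ull << 63;                 /* same idea as e1 ^ INT64_MIN above */
    memcpy(&x, &bits, sizeof(x));
    printf("%g\n", x);                  /* -3.5 */
    return 0;
}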
- */ -#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ - uint64_t *d = vd, *g = vg; \ - do { \ - uint64_t out = 0, pg = g[j]; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - TYPE mm = *(TYPE *)(vm + H(i)); \ - out |= OP(TYPE, nn, mm, status); \ - } \ - } while (i & 63); \ - d[j--] = out; \ - } while (i > 0); \ -} - -#define DO_FPCMP_PPZZ_H(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) -#define DO_FPCMP_PPZZ_S(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) -#define DO_FPCMP_PPZZ_D(NAME, OP) \ - DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) - -#define DO_FPCMP_PPZZ_ALL(NAME, OP) \ - DO_FPCMP_PPZZ_H(NAME, OP) \ - DO_FPCMP_PPZZ_S(NAME, OP) \ - DO_FPCMP_PPZZ_D(NAME, OP) - -#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 -#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 -#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 -#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 -#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 -#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 -#define DO_FCMUO(TYPE, X, Y, ST) \ - TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered -#define DO_FACGE(TYPE, X, Y, ST) \ - TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 -#define DO_FACGT(TYPE, X, Y, ST) \ - TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 - -DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) -DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) -DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) -DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) -DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) -DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) -DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) - -#undef DO_FPCMP_PPZZ_ALL -#undef DO_FPCMP_PPZZ_D -#undef DO_FPCMP_PPZZ_S -#undef DO_FPCMP_PPZZ_H -#undef DO_FPCMP_PPZZ - -/* One operand floating-point comparison against zero, controlled - * by a predicate. - */ -#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, \ - void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ - uint64_t *d = vd, *g = vg; \ - do { \ - uint64_t out = 0, pg = g[j]; \ - do { \ - i -= sizeof(TYPE), out <<= sizeof(TYPE); \ - if ((pg >> (i & 63)) & 1) { \ - TYPE nn = *(TYPE *)(vn + H(i)); \ - out |= OP(TYPE, nn, 0, status); \ - } \ - } while (i & 63); \ - d[j--] = out; \ - } while (i > 0); \ -} - -#define DO_FPCMP_PPZ0_H(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) -#define DO_FPCMP_PPZ0_S(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) -#define DO_FPCMP_PPZ0_D(NAME, OP) \ - DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) - -#define DO_FPCMP_PPZ0_ALL(NAME, OP) \ - DO_FPCMP_PPZ0_H(NAME, OP) \ - DO_FPCMP_PPZ0_S(NAME, OP) \ - DO_FPCMP_PPZ0_D(NAME, OP) - -DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) -DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) -DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) -DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) -DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) -DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) - -/* FP Trig Multiply-Add. 
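The compare macros above deliberately mix signalling (TYPE##_compare) and quiet (TYPE##_compare_quiet) comparisons, because FCMGE/FCMGT/FACGE/FACGT must raise Invalid Operation on any NaN while FCMEQ/FCMNE/FCMUO must not on quiet NaNs. A similar split exists in ISO C between the relational operators and the is*() macros; the sketch below illustrates the difference via the FE_INVALID flag, with the caveat that the raw operator's behaviour is implementation- and hardware-dependent:

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    volatile double x = NAN, y = 1.0;

    feclearexcept(FE_ALL_EXCEPT);
    (void)(x >= y);                             /* signalling-style compare */
    printf("'>=' set FE_INVALID: %d\n", fetestexcept(FE_INVALID) != 0);

    feclearexcept(FE_ALL_EXCEPT);
    (void)isgreaterequal(x, y);                 /* quiet compare */
    printf("isgreaterequal set FE_INVALID: %d\n", fetestexcept(FE_INVALID) != 0);
    return 0;
}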
*/ - -void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) -{ - static const float16 coeff[16] = { - 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); - intptr_t x = simd_data(desc); - float16 *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i++) { - float16 mm = m[i]; - intptr_t xx = x; - if (float16_is_neg(mm)) { - mm = float16_abs(mm); - xx += 8; - } - d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); - } -} - -void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) -{ - static const float32 coeff[16] = { - 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, - 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, - 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, - 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); - intptr_t x = simd_data(desc); - float32 *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i++) { - float32 mm = m[i]; - intptr_t xx = x; - if (float32_is_neg(mm)) { - mm = float32_abs(mm); - xx += 8; - } - d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); - } -} - -void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) -{ - static const float64 coeff[16] = { - 0x3ff0000000000000ull, 0xbfc5555555555543ull, - 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, - 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, - 0x3de5d8408868552full, 0x0000000000000000ull, - 0x3ff0000000000000ull, 0xbfe0000000000000ull, - 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, - 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, - 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, - }; - intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); - intptr_t x = simd_data(desc); - float64 *d = vd, *n = vn, *m = vm; - for (i = 0; i < opr_sz; i++) { - float64 mm = m[i]; - intptr_t xx = x; - if (float64_is_neg(mm)) { - mm = float64_abs(mm); - xx += 8; - } - d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs); - } -} - -/* - * FP Complex Add - */ - -void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, - void *vs, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - uint64_t *g = vg; - float16 neg_imag = float16_set_sign(0, simd_data(desc)); - float16 neg_real = float16_chs(neg_imag); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float16 e0, e1, e2, e3; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float16); - i -= 2 * sizeof(float16); - - e0 = *(float16 *)(vn + H1_2(i)); - e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; - e2 = *(float16 *)(vn + H1_2(j)); - e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); - } - if (likely((pg >> (j & 63)) & 1)) { - *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, - void *vs, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - uint64_t *g = vg; - float32 neg_imag = float32_set_sign(0, simd_data(desc)); - float32 neg_real = float32_chs(neg_imag); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float32 e0, e1, e2, e3; - - /* I holds the real index; J holds the imag index. 
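Each FCADD iteration handles one real/imaginary pair; the rotation encoded in simd_data decides whether the imaginary operand feeding the real lane or the real operand feeding the imaginary lane is negated. A scalar sketch of the two data flows (which branch corresponds to the #90 versus #270 encoding is deliberately not asserted here):

typedef struct { float re, im; } cpx;

static cpx fcadd_ref(cpx n, cpx m, int rot)
{
    cpx r;

    if (rot) {                   /* one rotation: m effectively multiplied by +i */
        r.re = n.re - m.im;
        r.im = n.im + m.re;
    } else {                     /* the other rotation: m effectively multiplied by -i */
        r.re = n.re + m.im;
        r.im = n.im - m.re;
    }
    return r;
}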
*/ - j = i - sizeof(float32); - i -= 2 * sizeof(float32); - - e0 = *(float32 *)(vn + H1_2(i)); - e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; - e2 = *(float32 *)(vn + H1_2(j)); - e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); - } - if (likely((pg >> (j & 63)) & 1)) { - *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, - void *vs, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - uint64_t *g = vg; - float64 neg_imag = float64_set_sign(0, simd_data(desc)); - float64 neg_real = float64_chs(neg_imag); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float64 e0, e1, e2, e3; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float64); - i -= 2 * sizeof(float64); - - e0 = *(float64 *)(vn + H1_2(i)); - e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; - e2 = *(float64 *)(vn + H1_2(j)); - e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); - } - if (likely((pg >> (j & 63)) & 1)) { - *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); - } - } while (i & 63); - } while (i != 0); -} - -/* - * FP Complex Multiply - */ - -void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float16 neg_imag, neg_real; - uint64_t *g = vg; - - neg_imag = float16_set_sign(0, (rot & 2) != 0); - neg_real = float16_set_sign(0, rot == 1 || rot == 2); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float16 e1, e2, e3, e4, nr, ni, mr, mi, d; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float16); - i -= 2 * sizeof(float16); - - nr = *(float16 *)(vn + H1_2(i)); - ni = *(float16 *)(vn + H1_2(j)); - mr = *(float16 *)(vm + H1_2(i)); - mi = *(float16 *)(vm + H1_2(j)); - - e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; - e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - d = *(float16 *)(va + H1_2(i)); - d = float16_muladd(e2, e1, d, 0, status); - *(float16 *)(vd + H1_2(i)) = d; - } - if (likely((pg >> (j & 63)) & 1)) { - d = *(float16 *)(va + H1_2(j)); - d = float16_muladd(e4, e3, d, 0, status); - *(float16 *)(vd + H1_2(j)) = d; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float32 neg_imag, neg_real; - uint64_t *g = vg; - - neg_imag = float32_set_sign(0, (rot & 2) != 0); - neg_real = float32_set_sign(0, rot == 1 || rot == 2); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float32 e1, e2, e3, e4, nr, ni, mr, mi, d; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float32); - i -= 2 * sizeof(float32); - - nr = *(float32 *)(vn + H1_2(i)); - ni = *(float32 *)(vn + H1_2(j)); - mr = *(float32 *)(vm + H1_2(i)); - mi = *(float32 *)(vm + H1_2(j)); - - e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; - e4 = e2; - e3 = (flip ? 
mr : mi) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - d = *(float32 *)(va + H1_2(i)); - d = float32_muladd(e2, e1, d, 0, status); - *(float32 *)(vd + H1_2(i)) = d; - } - if (likely((pg >> (j & 63)) & 1)) { - d = *(float32 *)(va + H1_2(j)); - d = float32_muladd(e4, e3, d, 0, status); - *(float32 *)(vd + H1_2(j)) = d; - } - } while (i & 63); - } while (i != 0); -} - -void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, - void *vg, void *status, uint32_t desc) -{ - intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float64 neg_imag, neg_real; - uint64_t *g = vg; - - neg_imag = float64_set_sign(0, (rot & 2) != 0); - neg_real = float64_set_sign(0, rot == 1 || rot == 2); - - do { - uint64_t pg = g[(i - 1) >> 6]; - do { - float64 e1, e2, e3, e4, nr, ni, mr, mi, d; - - /* I holds the real index; J holds the imag index. */ - j = i - sizeof(float64); - i -= 2 * sizeof(float64); - - nr = *(float64 *)(vn + H1_2(i)); - ni = *(float64 *)(vn + H1_2(j)); - mr = *(float64 *)(vm + H1_2(i)); - mi = *(float64 *)(vm + H1_2(j)); - - e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; - e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; - - if (likely((pg >> (i & 63)) & 1)) { - d = *(float64 *)(va + H1_2(i)); - d = float64_muladd(e2, e1, d, 0, status); - *(float64 *)(vd + H1_2(i)) = d; - } - if (likely((pg >> (j & 63)) & 1)) { - d = *(float64 *)(va + H1_2(j)); - d = float64_muladd(e4, e3, d, 0, status); - *(float64 *)(vd + H1_2(j)) = d; - } - } while (i & 63); - } while (i != 0); -} - -/* - * Load contiguous data, protected by a governing predicate. - */ - -/* - * Skip through a sequence of inactive elements in the guarding predicate @vg, - * beginning at @reg_off bounded by @reg_max. Return the offset of the active - * element >= @reg_off, or @reg_max if there were no active elements at all. - */ -static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, - intptr_t reg_max, int esz) -{ - uint64_t pg_mask = pred_esz_masks[esz]; - uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); - - /* In normal usage, the first element is active. */ - if (likely(pg & 1)) { - return reg_off; - } - - if (pg == 0) { - reg_off &= -64; - do { - reg_off += 64; - if (unlikely(reg_off >= reg_max)) { - /* The entire predicate was false. */ - return reg_max; - } - pg = vg[reg_off >> 6] & pg_mask; - } while (pg == 0); - } - reg_off += ctz64(pg); - - /* We should never see an out of range predicate bit set. */ - tcg_debug_assert(reg_off < reg_max); - return reg_off; -} - -/* - * Resolve the guest virtual address to info->host and info->flags. - * If @nofault, return false if the page is invalid, otherwise - * exit via page fault exception. - */ - -bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, - target_ulong addr, int mem_off, MMUAccessType access_type, - int mmu_idx, uintptr_t retaddr) -{ - int flags; - - addr += mem_off; - - /* - * User-only currently always issues with TBI. See the comment - * above useronly_clean_ptr. Usually we clean this top byte away - * during translation, but we can't do that for e.g. vector + imm - * addressing modes. - * - * We currently always enable TBI for user-only, and do not provide - * a way to turn it off. So clean the pointer unconditionally here, - * rather than look it up here, or pass it down from above. 
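A standalone sketch of the find_next_active scan above, simplified to byte elements (so the element-size mask drops out) and using a compiler builtin in place of ctz64:

#include <stdint.h>

static intptr_t next_active_ref(const uint64_t *g, intptr_t off, intptr_t max)
{
    uint64_t pg = g[off >> 6] >> (off & 63);

    if (pg & 1) {
        return off;                      /* common case: already active */
    }
    if (pg == 0) {
        off &= -64;
        do {
            off += 64;
            if (off >= max) {
                return max;              /* predicate wholly false */
            }
            pg = g[off >> 6];
        } while (pg == 0);
    }
    return off + __builtin_ctzll(pg);    /* stands in for ctz64() */
}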
- */ - addr = useronly_clean_ptr(addr); - -#ifdef CONFIG_USER_ONLY - flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, - &info->host, retaddr); -#else - CPUTLBEntryFull *full; - flags = probe_access_full(env, addr, access_type, mmu_idx, nofault, - &info->host, &full, retaddr); -#endif - info->flags = flags; - - if (flags & TLB_INVALID_MASK) { - g_assert(nofault); - return false; - } - -#ifdef CONFIG_USER_ONLY - memset(&info->attrs, 0, sizeof(info->attrs)); - /* Require both ANON and MTE; see allocation_tag_mem(). */ - info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); -#else - info->attrs = full->attrs; - info->tagged = full->pte_attrs == 0xf0; -#endif - - /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ - info->host -= mem_off; - return true; -} - -/* - * Find first active element on each page, and a loose bound for the - * final element on each page. Identify any single element that spans - * the page boundary. Return true if there are any active elements. - */ -bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, - intptr_t reg_max, int esz, int msize) -{ - const int esize = 1 << esz; - const uint64_t pg_mask = pred_esz_masks[esz]; - intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; - intptr_t mem_off_last, mem_off_split; - intptr_t page_split, elt_split; - intptr_t i; - - /* Set all of the element indices to -1, and the TLB data to 0. */ - memset(info, -1, offsetof(SVEContLdSt, page)); - memset(info->page, 0, sizeof(info->page)); - - /* Gross scan over the entire predicate to find bounds. */ - i = 0; - do { - uint64_t pg = vg[i] & pg_mask; - if (pg) { - reg_off_last = i * 64 + 63 - clz64(pg); - if (reg_off_first < 0) { - reg_off_first = i * 64 + ctz64(pg); - } - } - } while (++i * 64 < reg_max); - - if (unlikely(reg_off_first < 0)) { - /* No active elements, no pages touched. */ - return false; - } - tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); - - info->reg_off_first[0] = reg_off_first; - info->mem_off_first[0] = (reg_off_first >> esz) * msize; - mem_off_last = (reg_off_last >> esz) * msize; - - page_split = -(addr | TARGET_PAGE_MASK); - if (likely(mem_off_last + msize <= page_split)) { - /* The entire operation fits within a single page. */ - info->reg_off_last[0] = reg_off_last; - return true; - } - - info->page_split = page_split; - elt_split = page_split / msize; - reg_off_split = elt_split << esz; - mem_off_split = elt_split * msize; - - /* - * This is the last full element on the first page, but it is not - * necessarily active. If there is no full element, i.e. the first - * active element is the one that's split, this value remains -1. - * It is useful as iteration bounds. - */ - if (elt_split != 0) { - info->reg_off_last[0] = reg_off_split - esize; - } - - /* Determine if an unaligned element spans the pages. */ - if (page_split % msize != 0) { - /* It is helpful to know if the split element is active. */ - if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { - info->reg_off_split = reg_off_split; - info->mem_off_split = mem_off_split; - - if (reg_off_split == reg_off_last) { - /* The page crossing element is last. */ - return true; - } - } - reg_off_split += esize; - mem_off_split += msize; - } - - /* - * We do want the first active element on the second page, because - * this may affect the address reported in an exception. 
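In sve_cont_ldst_elements above, the number of bytes remaining on the current page is computed as -(addr | TARGET_PAGE_MASK). A worked example, assuming 4KiB pages purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_mask = ~(uint64_t)0xfff;           /* assumed 4KiB pages */
    uint64_t addr = 0x40000f40;
    uint64_t left = -(addr | page_mask);             /* bytes to end of page */

    printf("%llu\n", (unsigned long long)left);      /* 192 == 0x1000 - 0xf40 */
    return 0;
}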
- */ - reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); - tcg_debug_assert(reg_off_split <= reg_off_last); - info->reg_off_first[1] = reg_off_split; - info->mem_off_first[1] = (reg_off_split >> esz) * msize; - info->reg_off_last[1] = reg_off_last; - return true; -} - -/* - * Resolve the guest virtual addresses to info->page[]. - * Control the generation of page faults with @fault. Return false if - * there is no work to do, which can only happen with @fault == FAULT_NO. - */ -bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, - CPUARMState *env, target_ulong addr, - MMUAccessType access_type, uintptr_t retaddr) -{ - int mmu_idx = cpu_mmu_index(env, false); - int mem_off = info->mem_off_first[0]; - bool nofault = fault == FAULT_NO; - bool have_work = true; - - if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, - access_type, mmu_idx, retaddr)) { - /* No work to be done. */ - return false; - } - - if (likely(info->page_split < 0)) { - /* The entire operation was on the one page. */ - return true; - } - - /* - * If the second page is invalid, then we want the fault address to be - * the first byte on that page which is accessed. - */ - if (info->mem_off_split >= 0) { - /* - * There is an element split across the pages. The fault address - * should be the first byte of the second page. - */ - mem_off = info->page_split; - /* - * If the split element is also the first active element - * of the vector, then: For first-fault we should continue - * to generate faults for the second page. For no-fault, - * we have work only if the second page is valid. - */ - if (info->mem_off_first[0] < info->mem_off_split) { - nofault = FAULT_FIRST; - have_work = false; - } - } else { - /* - * There is no element split across the pages. The fault address - * should be the first active element on the second page. - */ - mem_off = info->mem_off_first[1]; - /* - * There must have been one active element on the first page, - * so we're out of first-fault territory. - */ - nofault = fault != FAULT_ALL; - } - - have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, - access_type, mmu_idx, retaddr); - return have_work; -} - -#ifndef CONFIG_USER_ONLY -void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, - uint64_t *vg, target_ulong addr, - int esize, int msize, int wp_access, - uintptr_t retaddr) -{ - intptr_t mem_off, reg_off, reg_last; - int flags0 = info->page[0].flags; - int flags1 = info->page[1].flags; - - if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { - return; - } - - /* Indicate that watchpoints are handled. 
*/ - info->page[0].flags = flags0 & ~TLB_WATCHPOINT; - info->page[1].flags = flags1 & ~TLB_WATCHPOINT; - - if (flags0 & TLB_WATCHPOINT) { - mem_off = info->mem_off_first[0]; - reg_off = info->reg_off_first[0]; - reg_last = info->reg_off_last[0]; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - cpu_check_watchpoint(env_cpu(env), addr + mem_off, - msize, info->page[0].attrs, - wp_access, retaddr); - } - reg_off += esize; - mem_off += msize; - } while (reg_off <= reg_last && (reg_off & 63)); - } - } - - mem_off = info->mem_off_split; - if (mem_off >= 0) { - cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, - info->page[0].attrs, wp_access, retaddr); - } - - mem_off = info->mem_off_first[1]; - if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { - reg_off = info->reg_off_first[1]; - reg_last = info->reg_off_last[1]; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - cpu_check_watchpoint(env_cpu(env), addr + mem_off, - msize, info->page[1].attrs, - wp_access, retaddr); - } - reg_off += esize; - mem_off += msize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} -#endif - -void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, - uint64_t *vg, target_ulong addr, int esize, - int msize, uint32_t mtedesc, uintptr_t ra) -{ - intptr_t mem_off, reg_off, reg_last; - - /* Process the page only if MemAttr == Tagged. */ - if (info->page[0].tagged) { - mem_off = info->mem_off_first[0]; - reg_off = info->reg_off_first[0]; - reg_last = info->reg_off_split; - if (reg_last < 0) { - reg_last = info->reg_off_last[0]; - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - mte_check(env, mtedesc, addr, ra); - } - reg_off += esize; - mem_off += msize; - } while (reg_off <= reg_last && (reg_off & 63)); - } while (reg_off <= reg_last); - } - - mem_off = info->mem_off_first[1]; - if (mem_off >= 0 && info->page[1].tagged) { - reg_off = info->reg_off_first[1]; - reg_last = info->reg_off_last[1]; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - mte_check(env, mtedesc, addr, ra); - } - reg_off += esize; - mem_off += msize; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -/* - * Common helper for all contiguous 1,2,3,4-register predicated stores. - */ -static inline QEMU_ALWAYS_INLINE -void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, - uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const int N, uint32_t mtedesc, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const unsigned rd = simd_data(desc); - const intptr_t reg_max = simd_oprsz(desc); - intptr_t reg_off, reg_last, mem_off; - SVEContLdSt info; - void *host; - int flags, i; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { - /* The entire predicate was false; no load occurs. */ - for (i = 0; i < N; ++i) { - memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); - } - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, - BP_MEM_READ, retaddr); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. 
- */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, - mtedesc, retaddr); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. Perform the load - * into scratch memory to preserve register state until the end. - */ - ARMVectorReg scratch[4] = { }; - - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - tlb_fn(env, &scratch[i], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - - for (i = 0; i < N; ++i) { - memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); - } - return; -#endif - } - - /* The entire operation is in RAM, on valid pages. */ - - for (i = 0; i < N; ++i) { - memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); - } - - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. - * But we know this is RAM and cannot trap. - */ - mem_off = info.mem_off_split; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_split; - for (i = 0; i < N; ++i) { - tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - - mem_off = info.mem_off_first[1]; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_first[1]; - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t ra, - const int esz, const int msz, const int N, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. 
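The gross MTE suppression that follows looks at address bit 55 and at the logical tag in bits [59:56] (allocation_tag_from_addr). A minimal sketch of those two extractions, assuming the usual AArch64 tagged-address layout and a made-up address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t addr = 0x0a80ffff12345678ull;           /* hypothetical tagged address */
    int bit55 = (addr >> 55) & 1;
    int tag   = (addr >> 56) & 0xf;

    printf("bit55=%d tag=%#x\n", bit55, tag);        /* bit55=1 tag=0xa */
    return 0;
}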
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); -} - -#define DO_LD1_1(NAME, ESZ) \ -void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ - sve_##NAME##_host, sve_##NAME##_tlb); \ -} \ -void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ - sve_##NAME##_host, sve_##NAME##_tlb); \ -} - -#define DO_LD1_2(NAME, ESZ, MSZ) \ -void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ - sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ -} \ -void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ - sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ -} \ -void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ - sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ -} \ -void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ - sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ -} - -DO_LD1_1(ld1bb, MO_8) -DO_LD1_1(ld1bhu, MO_16) -DO_LD1_1(ld1bhs, MO_16) -DO_LD1_1(ld1bsu, MO_32) -DO_LD1_1(ld1bss, MO_32) -DO_LD1_1(ld1bdu, MO_64) -DO_LD1_1(ld1bds, MO_64) - -DO_LD1_2(ld1hh, MO_16, MO_16) -DO_LD1_2(ld1hsu, MO_32, MO_16) -DO_LD1_2(ld1hss, MO_32, MO_16) -DO_LD1_2(ld1hdu, MO_64, MO_16) -DO_LD1_2(ld1hds, MO_64, MO_16) - -DO_LD1_2(ld1ss, MO_32, MO_32) -DO_LD1_2(ld1sdu, MO_64, MO_32) -DO_LD1_2(ld1sds, MO_64, MO_32) - -DO_LD1_2(ld1dd, MO_64, MO_64) - -#undef DO_LD1_1 -#undef DO_LD1_2 - -#define DO_LDN_1(N) \ -void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ - sve_ld1bb_host, sve_ld1bb_tlb); \ -} \ -void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ - sve_ld1bb_host, sve_ld1bb_tlb); \ -} - -#define DO_LDN_2(N, SUFF, ESZ) \ -void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ - sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ - sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ - sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ - sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ -} - -DO_LDN_1(2) -DO_LDN_1(3) -DO_LDN_1(4) - -DO_LDN_2(2, hh, MO_16) -DO_LDN_2(3, 
hh, MO_16) -DO_LDN_2(4, hh, MO_16) - -DO_LDN_2(2, ss, MO_32) -DO_LDN_2(3, ss, MO_32) -DO_LDN_2(4, ss, MO_32) - -DO_LDN_2(2, dd, MO_64) -DO_LDN_2(3, dd, MO_64) -DO_LDN_2(4, dd, MO_64) - -#undef DO_LDN_1 -#undef DO_LDN_2 - -/* - * Load contiguous data, first-fault and no-fault. - * - * For user-only, one could argue that we should hold the mmap_lock during - * the operation so that there is no race between page_check_range and the - * load operation. However, unmapping pages out from under a running thread - * is extraordinarily unlikely. This theoretical race condition also affects - * linux-user/ in its get_user/put_user macros. - * - * TODO: Construct some helpers, written in assembly, that interact with - * host_signal_handler to produce memory ops which can properly report errors - * without racing. - */ - -/* Fault on byte I. All bits in FFR from I are cleared. The vector - * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE - * option, which leaves subsequent data unchanged. - */ -static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) -{ - uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; - - if (i & 63) { - ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); - i = ROUND_UP(i, 64); - } - for (; i < oprsz; i += 64) { - ffr[i / 64] = 0; - } -} - -/* - * Common helper for all contiguous no-fault and first-fault loads. - */ -static inline QEMU_ALWAYS_INLINE -void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, - uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, - const int esz, const int msz, const SVEContFault fault, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const unsigned rd = simd_data(desc); - void *vd = &env->vfp.zregs[rd]; - const intptr_t reg_max = simd_oprsz(desc); - intptr_t reg_off, mem_off, reg_last; - SVEContLdSt info; - int flags; - void *host; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { - /* The entire predicate was false; no load occurs. */ - memset(vd, 0, reg_max); - return; - } - reg_off = info.reg_off_first[0]; - - /* Probe the page(s). */ - if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { - /* Fault on first element. */ - tcg_debug_assert(fault == FAULT_NO); - memset(vd, 0, reg_max); - goto do_fault; - } - - mem_off = info.mem_off_first[0]; - flags = info.page[0].flags; - - /* - * Disable MTE checking if the Tagged bit is not set. Since TBI must - * be set within MTEDESC for MTE, !mtedesc => !mte_active. - */ - if (!info.page[0].tagged) { - mtedesc = 0; - } - - if (fault == FAULT_FIRST) { - /* Trapping mte check for the first-fault element. */ - if (mtedesc) { - mte_check(env, mtedesc, addr + mem_off, retaddr); - } - - /* - * Special handling of the first active element, - * if it crosses a page boundary or is MMIO. - */ - bool is_split = mem_off == info.mem_off_split; - if (unlikely(flags != 0) || unlikely(is_split)) { - /* - * Use the slow path for cross-page handling. - * Might trap for MMIO or watchpoints. - */ - tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); - - /* After any fault, zero the other elements. */ - swap_memzero(vd, reg_off); - reg_off += 1 << esz; - mem_off += 1 << msz; - swap_memzero(vd + reg_off, reg_max - reg_off); - - if (is_split) { - goto second_page; - } - } else { - memset(vd, 0, reg_max); - } - } else { - memset(vd, 0, reg_max); - if (unlikely(mem_off == info.mem_off_split)) { - /* The first active element crosses a page boundary. 
*/ - flags |= info.page[1].flags; - if (unlikely(flags & TLB_MMIO)) { - /* Some page is MMIO, see below. */ - goto do_fault; - } - if (unlikely(flags & TLB_WATCHPOINT) && - (cpu_watchpoint_address_matches - (env_cpu(env), addr + mem_off, 1 << msz) - & BP_MEM_READ)) { - /* Watchpoint hit, see below. */ - goto do_fault; - } - if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { - goto do_fault; - } - /* - * Use the slow path for cross-page handling. - * This is RAM, without a watchpoint, and will not trap. - */ - tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); - goto second_page; - } - } - - /* - * From this point on, all memory operations are MemSingleNF. - * - * Per the MemSingleNF pseudocode, a no-fault load from Device memory - * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. - * - * Unfortuately we do not have access to the memory attributes from the - * PTE to tell Device memory from Normal memory. So we make a mostly - * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. - * This gives the right answer for the common cases of "Normal memory, - * backed by host RAM" and "Device memory, backed by MMIO". - * The architecture allows us to suppress an NF load and return - * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner - * case of "Normal memory, backed by MMIO" is permitted. The case we - * get wrong is "Device memory, backed by host RAM", for which we - * should return (UNKNOWN, FAULT) for but do not. - * - * Similarly, CPU_BP breakpoints would raise exceptions, and so - * return (UNKNOWN, FAULT). For simplicity, we consider gdb and - * architectural breakpoints the same. - */ - if (unlikely(flags & TLB_MMIO)) { - goto do_fault; - } - - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - do { - uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); - do { - if ((pg >> (reg_off & 63)) & 1) { - if (unlikely(flags & TLB_WATCHPOINT) && - (cpu_watchpoint_address_matches - (env_cpu(env), addr + mem_off, 1 << msz) - & BP_MEM_READ)) { - goto do_fault; - } - if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { - goto do_fault; - } - host_fn(vd, reg_off, host + mem_off); - } - reg_off += 1 << esz; - mem_off += 1 << msz; - } while (reg_off <= reg_last && (reg_off & 63)); - } while (reg_off <= reg_last); - - /* - * MemSingleNF is allowed to fail for any reason. We have special - * code above to handle the first element crossing a page boundary. - * As an implementation choice, decline to handle a cross-page element - * in any other position. - */ - reg_off = info.reg_off_split; - if (reg_off >= 0) { - goto do_fault; - } - - second_page: - reg_off = info.reg_off_first[1]; - if (likely(reg_off < 0)) { - /* No active elements on the second page. All done. */ - return; - } - - /* - * MemSingleNF is allowed to fail for any reason. As an implementation - * choice, decline to handle elements on the second page. This should - * be low frequency as the guest walks through memory -- the next - * iteration of the guest's loop should be aligned on the page boundary, - * and then all following iterations will stay aligned. 
- */ - - do_fault: - record_fault(env, reg_off, reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, - uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const SVEContFault fault, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, - esz, msz, fault, host_fn, tlb_fn); -} - -#define DO_LDFF1_LDNF1_1(PART, ESZ) \ -void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ - sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ -} - -#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ -void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ -} \ -void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} \ 
-void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ - sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ -} - -DO_LDFF1_LDNF1_1(bb, MO_8) -DO_LDFF1_LDNF1_1(bhu, MO_16) -DO_LDFF1_LDNF1_1(bhs, MO_16) -DO_LDFF1_LDNF1_1(bsu, MO_32) -DO_LDFF1_LDNF1_1(bss, MO_32) -DO_LDFF1_LDNF1_1(bdu, MO_64) -DO_LDFF1_LDNF1_1(bds, MO_64) - -DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) -DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) -DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) -DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) -DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) - -DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) -DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) -DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) - -DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) - -#undef DO_LDFF1_LDNF1_1 -#undef DO_LDFF1_LDNF1_2 - -/* - * Common helper for all contiguous 1,2,3,4-register predicated stores. - */ - -static inline QEMU_ALWAYS_INLINE -void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const int N, uint32_t mtedesc, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const unsigned rd = simd_data(desc); - const intptr_t reg_max = simd_oprsz(desc); - intptr_t reg_off, reg_last, mem_off; - SVEContLdSt info; - void *host; - int i, flags; - - /* Find the active elements. */ - if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { - /* The entire predicate was false; no store occurs. */ - return; - } - - /* Probe the page(s). Exit with exception for any invalid page. */ - sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); - - /* Handle watchpoints for all active elements. */ - sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, - BP_MEM_WRITE, retaddr); - - /* - * Handle mte checks for all active elements. - * Since TBI must be set for MTE, !mtedesc => !mte_active. - */ - if (mtedesc) { - sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, - mtedesc, retaddr); - } - - flags = info.page[0].flags | info.page[1].flags; - if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else - /* - * At least one page includes MMIO. - * Any bus operation can fail with cpu_transaction_failed, - * which for ARM will raise SyncExternal. We cannot avoid - * this fault and will leave with the store incomplete. - */ - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[1]; - if (reg_last < 0) { - reg_last = info.reg_off_split; - if (reg_last < 0) { - reg_last = info.reg_off_last[0]; - } - } - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - return; -#endif - } - - mem_off = info.mem_off_first[0]; - reg_off = info.reg_off_first[0]; - reg_last = info.reg_off_last[0]; - host = info.page[0].host; - - while (reg_off <= reg_last) { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off <= reg_last && (reg_off & 63)); - } - - /* - * Use the slow path to manage the cross-page misalignment. 
- * But we know this is RAM and cannot trap. - */ - mem_off = info.mem_off_split; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_split; - for (i = 0; i < N; ++i) { - tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, - addr + mem_off + (i << msz), retaddr); - } - } - - mem_off = info.mem_off_first[1]; - if (unlikely(mem_off >= 0)) { - reg_off = info.reg_off_first[1]; - reg_last = info.reg_off_last[1]; - host = info.page[1].host; - - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if ((pg >> (reg_off & 63)) & 1) { - for (i = 0; i < N; ++i) { - host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, - host + mem_off + (i << msz)); - } - } - reg_off += 1 << esz; - mem_off += N << msz; - } while (reg_off & 63); - } while (reg_off <= reg_last); - } -} - -static inline QEMU_ALWAYS_INLINE -void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, - uint32_t desc, const uintptr_t ra, - const int esz, const int msz, const int N, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - int bit55 = extract64(addr, 55, 1); - - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { - mtedesc = 0; - } - - sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); -} - -#define DO_STN_1(N, NAME, ESZ) \ -void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ - sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ - sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ -} - -#define DO_STN_2(N, NAME, ESZ, MSZ) \ -void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ - sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ - sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ - sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ - sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ -} - -DO_STN_1(1, bb, MO_8) -DO_STN_1(1, bh, MO_16) -DO_STN_1(1, bs, MO_32) -DO_STN_1(1, bd, MO_64) -DO_STN_1(2, bb, MO_8) -DO_STN_1(3, bb, MO_8) -DO_STN_1(4, bb, MO_8) - -DO_STN_2(1, hh, MO_16, MO_16) -DO_STN_2(1, hs, MO_32, MO_16) -DO_STN_2(1, hd, MO_64, MO_16) -DO_STN_2(2, hh, MO_16, MO_16) -DO_STN_2(3, hh, MO_16, MO_16) -DO_STN_2(4, hh, MO_16, MO_16) - -DO_STN_2(1, ss, MO_32, MO_32) -DO_STN_2(1, sd, MO_64, MO_32) -DO_STN_2(2, ss, MO_32, MO_32) -DO_STN_2(3, ss, MO_32, MO_32) -DO_STN_2(4, ss, MO_32, MO_32) - -DO_STN_2(1, dd, MO_64, MO_64) -DO_STN_2(2, dd, MO_64, MO_64) -DO_STN_2(3, dd, MO_64, MO_64) -DO_STN_2(4, dd, 
MO_64, MO_64) - -#undef DO_STN_1 -#undef DO_STN_2 - -/* - * Loads with a vector index. - */ - -/* - * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. - */ -typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); - -static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) -{ - return *(uint32_t *)(reg + H1_4(reg_ofs)); -} - -static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) -{ - return *(int32_t *)(reg + H1_4(reg_ofs)); -} - -static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) -{ - return (uint32_t)*(uint64_t *)(reg + reg_ofs); -} - -static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) -{ - return (int32_t)*(uint64_t *)(reg + reg_ofs); -} - -static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) -{ - return *(uint64_t *)(reg + reg_ofs); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - uint32_t mtedesc, int esize, int msize, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const int mmu_idx = cpu_mmu_index(env, false); - const intptr_t reg_max = simd_oprsz(desc); - const int scale = simd_data(desc); - ARMVectorReg scratch; - intptr_t reg_off; - SVEHostPage info, info2; - - memset(&scratch, 0, reg_max); - reg_off = 0; - do { - uint64_t pg = vg[reg_off >> 6]; - do { - if (likely(pg & 1)) { - target_ulong addr = base + (off_fn(vm, reg_off) << scale); - target_ulong in_page = -(addr | TARGET_PAGE_MASK); - - sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, - mmu_idx, retaddr); - - if (likely(in_page >= msize)) { - if (unlikely(info.flags & TLB_WATCHPOINT)) { - cpu_check_watchpoint(env_cpu(env), addr, msize, - info.attrs, BP_MEM_READ, retaddr); - } - if (mtedesc && info.tagged) { - mte_check(env, mtedesc, addr, retaddr); - } - if (unlikely(info.flags & TLB_MMIO)) { - tlb_fn(env, &scratch, reg_off, addr, retaddr); - } else { - host_fn(&scratch, reg_off, info.host); - } - } else { - /* Element crosses the page boundary. */ - sve_probe_page(&info2, false, env, addr + in_page, 0, - MMU_DATA_LOAD, mmu_idx, retaddr); - if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { - cpu_check_watchpoint(env_cpu(env), addr, - msize, info.attrs, - BP_MEM_READ, retaddr); - } - if (mtedesc && info.tagged) { - mte_check(env, mtedesc, addr, retaddr); - } - tlb_fn(env, &scratch, reg_off, addr, retaddr); - } - } - reg_off += esize; - pg >>= esize; - } while (reg_off & 63); - } while (reg_off < reg_max); - - /* Wait until all exceptions have been raised to write back. */ - memcpy(vd, &scratch, reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - int esize, int msize, zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* - * ??? TODO: For the 32-bit offset extractions, base + ofs cannot - * offset base entirely over the address space hole to change the - * pointer tag, or change the bit55 selector. So we could here - * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
- */ - sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, - esize, msize, off_fn, host_fn, tlb_fn); -} - -#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -DO_LD1_ZPZ_S(bsu, zsu, MO_8) -DO_LD1_ZPZ_S(bsu, zss, MO_8) -DO_LD1_ZPZ_D(bdu, zsu, MO_8) -DO_LD1_ZPZ_D(bdu, zss, MO_8) -DO_LD1_ZPZ_D(bdu, zd, MO_8) - -DO_LD1_ZPZ_S(bss, zsu, MO_8) -DO_LD1_ZPZ_S(bss, zss, MO_8) -DO_LD1_ZPZ_D(bds, zsu, MO_8) -DO_LD1_ZPZ_D(bds, zss, MO_8) -DO_LD1_ZPZ_D(bds, zd, MO_8) - -DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) -DO_LD1_ZPZ_S(hsu_le, zss, MO_16) -DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) -DO_LD1_ZPZ_D(hdu_le, zss, MO_16) -DO_LD1_ZPZ_D(hdu_le, zd, MO_16) - -DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) -DO_LD1_ZPZ_S(hsu_be, zss, MO_16) -DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) -DO_LD1_ZPZ_D(hdu_be, zss, MO_16) -DO_LD1_ZPZ_D(hdu_be, zd, MO_16) - -DO_LD1_ZPZ_S(hss_le, zsu, MO_16) -DO_LD1_ZPZ_S(hss_le, zss, MO_16) -DO_LD1_ZPZ_D(hds_le, zsu, MO_16) -DO_LD1_ZPZ_D(hds_le, zss, MO_16) -DO_LD1_ZPZ_D(hds_le, zd, MO_16) - -DO_LD1_ZPZ_S(hss_be, zsu, MO_16) -DO_LD1_ZPZ_S(hss_be, zss, MO_16) -DO_LD1_ZPZ_D(hds_be, zsu, MO_16) -DO_LD1_ZPZ_D(hds_be, zss, MO_16) -DO_LD1_ZPZ_D(hds_be, zd, MO_16) - -DO_LD1_ZPZ_S(ss_le, zsu, MO_32) -DO_LD1_ZPZ_S(ss_le, zss, MO_32) -DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) -DO_LD1_ZPZ_D(sdu_le, zss, MO_32) -DO_LD1_ZPZ_D(sdu_le, zd, MO_32) - -DO_LD1_ZPZ_S(ss_be, zsu, MO_32) -DO_LD1_ZPZ_S(ss_be, zss, MO_32) -DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) -DO_LD1_ZPZ_D(sdu_be, zss, MO_32) -DO_LD1_ZPZ_D(sdu_be, zd, MO_32) - -DO_LD1_ZPZ_D(sds_le, zsu, MO_32) -DO_LD1_ZPZ_D(sds_le, zss, MO_32) -DO_LD1_ZPZ_D(sds_le, zd, MO_32) - -DO_LD1_ZPZ_D(sds_be, zsu, MO_32) -DO_LD1_ZPZ_D(sds_be, zss, MO_32) -DO_LD1_ZPZ_D(sds_be, zd, MO_32) - -DO_LD1_ZPZ_D(dd_le, zsu, MO_64) -DO_LD1_ZPZ_D(dd_le, zss, MO_64) -DO_LD1_ZPZ_D(dd_le, zd, MO_64) - -DO_LD1_ZPZ_D(dd_be, zsu, MO_64) -DO_LD1_ZPZ_D(dd_be, zss, MO_64) -DO_LD1_ZPZ_D(dd_be, zd, MO_64) - -#undef DO_LD1_ZPZ_S -#undef DO_LD1_ZPZ_D - -/* First fault loads with a vector index. */ - -/* - * Common helpers for all gather first-faulting loads. 
- */ - -static inline QEMU_ALWAYS_INLINE -void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - uint32_t mtedesc, const int esz, const int msz, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const int mmu_idx = cpu_mmu_index(env, false); - const intptr_t reg_max = simd_oprsz(desc); - const int scale = simd_data(desc); - const int esize = 1 << esz; - const int msize = 1 << msz; - intptr_t reg_off; - SVEHostPage info; - target_ulong addr, in_page; - - /* Skip to the first true predicate. */ - reg_off = find_next_active(vg, 0, reg_max, esz); - if (unlikely(reg_off >= reg_max)) { - /* The entire predicate was false; no load occurs. */ - memset(vd, 0, reg_max); - return; - } - - /* - * Probe the first element, allowing faults. - */ - addr = base + (off_fn(vm, reg_off) << scale); - if (mtedesc) { - mte_check(env, mtedesc, addr, retaddr); - } - tlb_fn(env, vd, reg_off, addr, retaddr); - - /* After any fault, zero the other elements. */ - swap_memzero(vd, reg_off); - reg_off += esize; - swap_memzero(vd + reg_off, reg_max - reg_off); - - /* - * Probe the remaining elements, not allowing faults. - */ - while (reg_off < reg_max) { - uint64_t pg = vg[reg_off >> 6]; - do { - if (likely((pg >> (reg_off & 63)) & 1)) { - addr = base + (off_fn(vm, reg_off) << scale); - in_page = -(addr | TARGET_PAGE_MASK); - - if (unlikely(in_page < msize)) { - /* Stop if the element crosses a page boundary. */ - goto fault; - } - - sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, - mmu_idx, retaddr); - if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { - goto fault; - } - if (unlikely(info.flags & TLB_WATCHPOINT) && - (cpu_watchpoint_address_matches - (env_cpu(env), addr, msize) & BP_MEM_READ)) { - goto fault; - } - if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { - goto fault; - } - - host_fn(vd, reg_off, info.host); - } - reg_off += esize; - } while (reg_off & 63); - } - return; - - fault: - record_fault(env, reg_off, reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - const int esz, const int msz, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* - * ??? TODO: For the 32-bit offset extractions, base + ofs cannot - * offset base entirely over the address space hole to change the - * pointer tag, or change the bit55 selector. So we could here - * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
- */ - sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, - esz, msz, off_fn, host_fn, tlb_fn); -} - -#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_ldff##MEM##_##OFS) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ldff##MEM##_##OFS##_mte) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_ldff##MEM##_##OFS) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} \ -void HELPER(sve_ldff##MEM##_##OFS##_mte) \ - (CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ -} - -DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) -DO_LDFF1_ZPZ_S(bsu, zss, MO_8) -DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) -DO_LDFF1_ZPZ_D(bdu, zss, MO_8) -DO_LDFF1_ZPZ_D(bdu, zd, MO_8) - -DO_LDFF1_ZPZ_S(bss, zsu, MO_8) -DO_LDFF1_ZPZ_S(bss, zss, MO_8) -DO_LDFF1_ZPZ_D(bds, zsu, MO_8) -DO_LDFF1_ZPZ_D(bds, zss, MO_8) -DO_LDFF1_ZPZ_D(bds, zd, MO_8) - -DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) -DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) -DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) - -DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) -DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) -DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) - -DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) -DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) -DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) - -DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) -DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) -DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) -DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) - -DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) -DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) -DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) - -DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) -DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) -DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) -DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) - -DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) -DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) -DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) - -DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) -DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) -DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) - -DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) -DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) -DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) - -DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) -DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) -DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) - -/* Stores with a vector index. 
*/ - -static inline QEMU_ALWAYS_INLINE -void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - uint32_t mtedesc, int esize, int msize, - zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - const int mmu_idx = cpu_mmu_index(env, false); - const intptr_t reg_max = simd_oprsz(desc); - const int scale = simd_data(desc); - void *host[ARM_MAX_VQ * 4]; - intptr_t reg_off, i; - SVEHostPage info, info2; - - /* - * Probe all of the elements for host addresses and flags. - */ - i = reg_off = 0; - do { - uint64_t pg = vg[reg_off >> 6]; - do { - target_ulong addr = base + (off_fn(vm, reg_off) << scale); - target_ulong in_page = -(addr | TARGET_PAGE_MASK); - - host[i] = NULL; - if (likely((pg >> (reg_off & 63)) & 1)) { - if (likely(in_page >= msize)) { - sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, - mmu_idx, retaddr); - if (!(info.flags & TLB_MMIO)) { - host[i] = info.host; - } - } else { - /* - * Element crosses the page boundary. - * Probe both pages, but do not record the host address, - * so that we use the slow path. - */ - sve_probe_page(&info, false, env, addr, 0, - MMU_DATA_STORE, mmu_idx, retaddr); - sve_probe_page(&info2, false, env, addr + in_page, 0, - MMU_DATA_STORE, mmu_idx, retaddr); - info.flags |= info2.flags; - } - - if (unlikely(info.flags & TLB_WATCHPOINT)) { - cpu_check_watchpoint(env_cpu(env), addr, msize, - info.attrs, BP_MEM_WRITE, retaddr); - } - - if (mtedesc && info.tagged) { - mte_check(env, mtedesc, addr, retaddr); - } - } - i += 1; - reg_off += esize; - } while (reg_off & 63); - } while (reg_off < reg_max); - - /* - * Now that we have recognized all exceptions except SyncExternal - * (from TLB_MMIO), which we cannot avoid, perform all of the stores. - * - * Note for the common case of an element in RAM, not crossing a page - * boundary, we have stored the host address in host[]. This doubles - * as a first-level check against the predicate, since only enabled - * elements have non-null host addresses. - */ - i = reg_off = 0; - do { - void *h = host[i]; - if (likely(h != NULL)) { - host_fn(vd, reg_off, h); - } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { - target_ulong addr = base + (off_fn(vm, reg_off) << scale); - tlb_fn(env, vd, reg_off, addr, retaddr); - } - i += 1; - reg_off += esize; - } while (reg_off < reg_max); -} - -static inline QEMU_ALWAYS_INLINE -void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, - target_ulong base, uint32_t desc, uintptr_t retaddr, - int esize, int msize, zreg_off_fn *off_fn, - sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) -{ - uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - /* Remove mtedesc from the normal sve descriptor. */ - desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); - - /* - * ??? TODO: For the 32-bit offset extractions, base + ofs cannot - * offset base entirely over the address space hole to change the - * pointer tag, or change the bit55 selector. So we could here - * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
- */ - sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, - esize, msize, off_fn, host_fn, tlb_fn); -} - -#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ - off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} \ -void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ - off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} - -#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ - off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} \ -void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ - off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ -} - -DO_ST1_ZPZ_S(bs, zsu, MO_8) -DO_ST1_ZPZ_S(hs_le, zsu, MO_16) -DO_ST1_ZPZ_S(hs_be, zsu, MO_16) -DO_ST1_ZPZ_S(ss_le, zsu, MO_32) -DO_ST1_ZPZ_S(ss_be, zsu, MO_32) - -DO_ST1_ZPZ_S(bs, zss, MO_8) -DO_ST1_ZPZ_S(hs_le, zss, MO_16) -DO_ST1_ZPZ_S(hs_be, zss, MO_16) -DO_ST1_ZPZ_S(ss_le, zss, MO_32) -DO_ST1_ZPZ_S(ss_be, zss, MO_32) - -DO_ST1_ZPZ_D(bd, zsu, MO_8) -DO_ST1_ZPZ_D(hd_le, zsu, MO_16) -DO_ST1_ZPZ_D(hd_be, zsu, MO_16) -DO_ST1_ZPZ_D(sd_le, zsu, MO_32) -DO_ST1_ZPZ_D(sd_be, zsu, MO_32) -DO_ST1_ZPZ_D(dd_le, zsu, MO_64) -DO_ST1_ZPZ_D(dd_be, zsu, MO_64) - -DO_ST1_ZPZ_D(bd, zss, MO_8) -DO_ST1_ZPZ_D(hd_le, zss, MO_16) -DO_ST1_ZPZ_D(hd_be, zss, MO_16) -DO_ST1_ZPZ_D(sd_le, zss, MO_32) -DO_ST1_ZPZ_D(sd_be, zss, MO_32) -DO_ST1_ZPZ_D(dd_le, zss, MO_64) -DO_ST1_ZPZ_D(dd_be, zss, MO_64) - -DO_ST1_ZPZ_D(bd, zd, MO_8) -DO_ST1_ZPZ_D(hd_le, zd, MO_16) -DO_ST1_ZPZ_D(hd_be, zd, MO_16) -DO_ST1_ZPZ_D(sd_le, zd, MO_32) -DO_ST1_ZPZ_D(sd_be, zd, MO_32) -DO_ST1_ZPZ_D(dd_le, zd, MO_64) -DO_ST1_ZPZ_D(dd_be, zd, MO_64) - -#undef DO_ST1_ZPZ_S -#undef DO_ST1_ZPZ_D - -void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = n[i] ^ m[i] ^ k[i]; - } -} - -void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = n[i] ^ (m[i] & ~k[i]); - } -} - -void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); - } -} - -void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); - } -} - -void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - uint64_t *d = vd, *n = vn, *m = vm, *k = vk; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ~((n[i] & k[i]) | 
(m[i] & ~k[i])); - } -} - -/* - * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. - * See hasless(v,1) from - * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord - */ -static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) -{ - int bits = 8 << esz; - uint64_t ones = dup_const(esz, 1); - uint64_t signs = ones << (bits - 1); - uint64_t cmp0, cmp1; - - cmp1 = dup_const(esz, n); - cmp0 = cmp1 ^ m0; - cmp1 = cmp1 ^ m1; - cmp0 = (cmp0 - ones) & ~cmp0; - cmp1 = (cmp1 - ones) & ~cmp1; - return (cmp0 | cmp1) & signs; -} - -static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, - uint32_t desc, int esz, bool nmatch) -{ - uint16_t esz_mask = pred_esz_masks[esz]; - intptr_t opr_sz = simd_oprsz(desc); - uint32_t flags = PREDTEST_INIT; - intptr_t i, j, k; - - for (i = 0; i < opr_sz; i += 16) { - uint64_t m0 = *(uint64_t *)(vm + i); - uint64_t m1 = *(uint64_t *)(vm + i + 8); - uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; - uint16_t out = 0; - - for (j = 0; j < 16; j += 8) { - uint64_t n = *(uint64_t *)(vn + i + j); - - for (k = 0; k < 8; k += 1 << esz) { - if (pg & (1 << (j + k))) { - bool o = do_match2(n >> (k * 8), m0, m1, esz); - out |= (o ^ nmatch) << (j + k); - } - } - } - *(uint16_t *)(vd + H1_2(i >> 3)) = out; - flags = iter_predtest_fwd(out, pg, flags); - } - return flags; -} - -#define DO_PPZZ_MATCH(NAME, ESZ, INV) \ -uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ -{ \ - return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ -} - -DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) -DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) - -DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) -DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) - -#undef DO_PPZZ_MATCH - -void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, - uint32_t desc) -{ - ARMVectorReg scratch; - intptr_t i, j; - intptr_t opr_sz = simd_oprsz(desc); - uint32_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - if (d == n) { - n = memcpy(&scratch, n, opr_sz); - if (d == m) { - m = n; - } - } else if (d == m) { - m = memcpy(&scratch, m, opr_sz); - } - - for (i = 0; i < opr_sz; i += 4) { - uint64_t count = 0; - uint8_t pred; - - pred = pg[H1(i >> 3)] >> (i & 7); - if (pred & 1) { - uint32_t nn = n[H4(i >> 2)]; - - for (j = 0; j <= i; j += 4) { - pred = pg[H1(j >> 3)] >> (j & 7); - if ((pred & 1) && nn == m[H4(j >> 2)]) { - ++count; - } - } - } - d[H4(i >> 2)] = count; - } -} - -void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, - uint32_t desc) -{ - ARMVectorReg scratch; - intptr_t i, j; - intptr_t opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint8_t *pg = vg; - - if (d == n) { - n = memcpy(&scratch, n, opr_sz); - if (d == m) { - m = n; - } - } else if (d == m) { - m = memcpy(&scratch, m, opr_sz); - } - - for (i = 0; i < opr_sz / 8; ++i) { - uint64_t count = 0; - if (pg[H1(i)] & 1) { - uint64_t nn = n[i]; - for (j = 0; j <= i; ++j) { - if ((pg[H1(j)] & 1) && nn == m[j]) { - ++count; - } - } - } - d[i] = count; - } -} - -/* - * Returns the number of bytes in m0 and m1 that match n. - * Unlike do_match2 we don't just need true/false, we need an exact count. - * This requires two extra logical operations. 
- */ -static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) -{ - const uint64_t mask = dup_const(MO_8, 0x7f); - uint64_t cmp0, cmp1; - - cmp1 = dup_const(MO_8, n); - cmp0 = cmp1 ^ m0; - cmp1 = cmp1 ^ m1; - - /* - * 1: clear msb of each byte to avoid carry to next byte (& mask) - * 2: carry in to msb if byte != 0 (+ mask) - * 3: set msb if cmp has msb set (| cmp) - * 4: set ~msb to ignore them (| mask) - * We now have 0xff for byte != 0 or 0x7f for byte == 0. - * 5: invert, resulting in 0x80 if and only if byte == 0. - */ - cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); - cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); - - /* - * Combine the two compares in a way that the bits do - * not overlap, and so preserves the count of set bits. - * If the host has an efficient instruction for ctpop, - * then ctpop(x) + ctpop(y) has the same number of - * operations as ctpop(x | (y >> 1)). If the host does - * not have an efficient ctpop, then we only want to - * use it once. - */ - return ctpop64(cmp0 | (cmp1 >> 1)); -} - -void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j; - intptr_t opr_sz = simd_oprsz(desc); - - for (i = 0; i < opr_sz; i += 16) { - uint64_t n0 = *(uint64_t *)(vn + i); - uint64_t m0 = *(uint64_t *)(vm + i); - uint64_t n1 = *(uint64_t *)(vn + i + 8); - uint64_t m1 = *(uint64_t *)(vm + i + 8); - uint64_t out0 = 0; - uint64_t out1 = 0; - - for (j = 0; j < 64; j += 8) { - uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); - uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); - out0 |= cnt0 << j; - out1 |= cnt1 << j; - } - - *(uint64_t *)(vd + i) = out0; - *(uint64_t *)(vd + i + 8) = out1; - } -} - -void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - int shr = simd_data(desc); - int shl = 8 - shr; - uint64_t mask = dup_const(MO_8, 0xff >> shr); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - uint64_t t = n[i] ^ m[i]; - d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); - } -} - -void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - int shr = simd_data(desc); - int shl = 16 - shr; - uint64_t mask = dup_const(MO_16, 0xffff >> shr); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - uint64_t t = n[i] ^ m[i]; - d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); - } -} - -void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 4; - int shr = simd_data(desc); - uint32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ror32(n[i] ^ m[i], shr); - } -} - -void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, - void *status, uint32_t desc) -{ - intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); - - for (s = 0; s < opr_sz; ++s) { - float32 *n = vn + s * sizeof(float32) * 4; - float32 *m = vm + s * sizeof(float32) * 4; - float32 *a = va + s * sizeof(float32) * 4; - float32 *d = vd + s * sizeof(float32) * 4; - float32 n00 = n[H4(0)], n01 = n[H4(1)]; - float32 n10 = n[H4(2)], n11 = n[H4(3)]; - float32 m00 = m[H4(0)], m01 = m[H4(1)]; - float32 m10 = m[H4(2)], m11 = m[H4(3)]; - float32 p0, p1; - - /* i = 0, j = 0 */ - p0 = float32_mul(n00, m00, status); - p1 = float32_mul(n01, m01, status); - d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); - - /* i = 0, j = 1 */ - p0 = float32_mul(n00, m10, status); - p1 = float32_mul(n01, m11, status); - d[H4(1)] 
= float32_add(a[H4(1)], float32_add(p0, p1, status), status); - - /* i = 1, j = 0 */ - p0 = float32_mul(n10, m00, status); - p1 = float32_mul(n11, m01, status); - d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); - - /* i = 1, j = 1 */ - p0 = float32_mul(n10, m10, status); - p1 = float32_mul(n11, m11, status); - d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); - } -} - -void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, - void *status, uint32_t desc) -{ - intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); - - for (s = 0; s < opr_sz; ++s) { - float64 *n = vn + s * sizeof(float64) * 4; - float64 *m = vm + s * sizeof(float64) * 4; - float64 *a = va + s * sizeof(float64) * 4; - float64 *d = vd + s * sizeof(float64) * 4; - float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; - float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; - float64 p0, p1; - - /* i = 0, j = 0 */ - p0 = float64_mul(n00, m00, status); - p1 = float64_mul(n01, m01, status); - d[0] = float64_add(a[0], float64_add(p0, p1, status), status); - - /* i = 0, j = 1 */ - p0 = float64_mul(n00, m10, status); - p1 = float64_mul(n01, m11, status); - d[1] = float64_add(a[1], float64_add(p0, p1, status), status); - - /* i = 1, j = 0 */ - p0 = float64_mul(n10, m00, status); - p1 = float64_mul(n11, m01, status); - d[2] = float64_add(a[2], float64_add(p0, p1, status), status); - - /* i = 1, j = 1 */ - p0 = float64_mul(n10, m10, status); - p1 = float64_mul(n11, m11, status); - d[3] = float64_add(a[3], float64_add(p0, p1, status), status); - } -} - -#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPEW); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPEW nn = *(TYPEW *)(vn + HW(i)); \ - *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) -DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) -DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) - -#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ -void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ -{ \ - intptr_t i = simd_oprsz(desc); \ - uint64_t *g = vg; \ - do { \ - uint64_t pg = g[(i - 1) >> 6]; \ - do { \ - i -= sizeof(TYPEW); \ - if (likely((pg >> (i & 63)) & 1)) { \ - TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ - *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ - } \ - } while (i & 63); \ - } while (i != 0); \ -} - -DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) -DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) - -#undef DO_FCVTLT -#undef DO_FCVTNT diff --git a/target/arm/tcg-stubs.c b/target/arm/tcg-stubs.c new file mode 100644 index 0000000000..1a7ddb3664 --- /dev/null +++ b/target/arm/tcg-stubs.c @@ -0,0 +1,23 @@ +/* + * QEMU ARM stubs for some TCG helper functions + * + * Copyright 2021 SUSE LLC + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" + +void write_v7m_exception(CPUARMState *env, uint32_t new_exc) +{ + g_assert_not_reached(); +} + +void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, + uint32_t target_el, uintptr_t ra) +{ + g_assert_not_reached(); +} diff --git a/target/arm/tcg/crypto_helper.c b/target/arm/tcg/crypto_helper.c new file mode 100644 index 0000000000..d28690321f --- /dev/null +++ b/target/arm/tcg/crypto_helper.c @@ -0,0 +1,778 @@ +/* + * crypto_helper.c - emulate v8 Crypto Extensions instructions + * + * Copyright (C) 2013 - 2018 Linaro Ltd + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "crypto/aes.h" +#include "crypto/sm4.h" +#include "vec_internal.h" + +union CRYPTO_STATE { + uint8_t bytes[16]; + uint32_t words[4]; + uint64_t l[2]; +}; + +#if HOST_BIG_ENDIAN +#define CR_ST_BYTE(state, i) ((state).bytes[(15 - (i)) ^ 8]) +#define CR_ST_WORD(state, i) ((state).words[(3 - (i)) ^ 2]) +#else +#define CR_ST_BYTE(state, i) ((state).bytes[i]) +#define CR_ST_WORD(state, i) ((state).words[i]) +#endif + +/* + * The caller has not been converted to full gvec, and so only + * modifies the low 16 bytes of the vector register. + */ +static void clear_tail_16(void *vd, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int max_sz = simd_maxsz(desc); + + assert(opr_sz == 16); + clear_tail(vd, opr_sz, max_sz); +} + +static void do_crypto_aese(uint64_t *rd, uint64_t *rn, + uint64_t *rm, bool decrypt) +{ + static uint8_t const * const sbox[2] = { AES_sbox, AES_isbox }; + static uint8_t const * const shift[2] = { AES_shifts, AES_ishifts }; + union CRYPTO_STATE rk = { .l = { rm[0], rm[1] } }; + union CRYPTO_STATE st = { .l = { rn[0], rn[1] } }; + int i; + + /* xor state vector with round key */ + rk.l[0] ^= st.l[0]; + rk.l[1] ^= st.l[1]; + + /* combine ShiftRows operation and sbox substitution */ + for (i = 0; i < 16; i++) { + CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])]; + } + + rd[0] = st.l[0]; + rd[1] = st.l[1]; +} + +void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + bool decrypt = simd_data(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_aese(vd + i, vn + i, vm + i, decrypt); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt) +{ + static uint32_t const mc[][256] = { { + /* MixColumns lookup table */ + 0x00000000, 0x03010102, 0x06020204, 0x05030306, + 0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e, + 0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16, + 0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e, + 0x30101020, 0x33111122, 0x36121224, 0x35131326, + 0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e, + 0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36, + 0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e, + 0x60202040, 0x63212142, 0x66222244, 0x65232346, + 0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e, + 0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56, + 0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e, + 0x50303060, 0x53313162, 0x56323264, 0x55333366, + 0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e, + 0x48383870, 0x4b393972, 
0x4e3a3a74, 0x4d3b3b76, + 0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e, + 0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386, + 0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e, + 0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96, + 0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e, + 0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6, + 0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae, + 0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6, + 0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe, + 0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6, + 0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce, + 0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6, + 0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde, + 0x907070e0, 0x937171e2, 0x967272e4, 0x957373e6, + 0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee, + 0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6, + 0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe, + 0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d, + 0x97848413, 0x94858511, 0x91868617, 0x92878715, + 0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d, + 0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05, + 0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d, + 0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735, + 0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d, + 0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25, + 0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d, + 0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755, + 0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d, + 0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45, + 0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d, + 0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775, + 0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d, + 0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65, + 0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d, + 0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795, + 0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d, + 0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85, + 0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd, + 0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5, + 0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad, + 0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5, + 0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd, + 0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5, + 0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd, + 0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5, + 0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd, + 0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5, + 0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed, + 0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5, + }, { + /* Inverse MixColumns lookup table */ + 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12, + 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a, + 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362, + 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a, + 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2, + 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca, + 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382, + 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba, + 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9, + 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1, + 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9, + 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81, + 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029, + 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411, + 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859, + 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61, + 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf, + 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987, + 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf, + 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7, + 0x46bde64d, 0x4db0ef43, 
0x50a7f451, 0x5baafd5f, + 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967, + 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f, + 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117, + 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664, + 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c, + 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14, + 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c, + 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684, + 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc, + 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4, + 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc, + 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753, + 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b, + 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23, + 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b, + 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3, + 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b, + 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3, + 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb, + 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88, + 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0, + 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8, + 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0, + 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68, + 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850, + 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418, + 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020, + 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe, + 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6, + 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e, + 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6, + 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e, + 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526, + 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e, + 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56, + 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25, + 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d, + 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255, + 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d, + 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5, + 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd, + 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5, + 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d, + } }; + + union CRYPTO_STATE st = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 16; i += 4) { + CR_ST_WORD(st, i >> 2) = + mc[decrypt][CR_ST_BYTE(st, i)] ^ + rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^ + rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^ + rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24); + } + + rd[0] = st.l[0]; + rd[1] = st.l[1]; +} + +void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + bool decrypt = simd_data(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_aesmc(vd + i, vm + i, decrypt); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +/* + * SHA-1 logical functions + */ + +static uint32_t cho(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & (y ^ z)) ^ z; +} + +static uint32_t par(uint32_t x, uint32_t y, uint32_t z) +{ + return x ^ y ^ z; +} + +static uint32_t maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | ((x | y) & z); +} + +void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t d0, d1; + + d0 = d[1] ^ d[0] ^ m[0]; + d1 = n[0] ^ d[1] ^ m[1]; + d[0] = d0; + d[1] = d1; + + clear_tail_16(vd, desc); +} + +static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn, + uint64_t *rm, uint32_t desc, + uint32_t (*fn)(union CRYPTO_STATE *d)) +{ + union CRYPTO_STATE d = { .l = { 
rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 4; i++) { + uint32_t t = fn(&d); + + t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0) + + CR_ST_WORD(m, i); + + CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3); + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); + CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2); + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); + CR_ST_WORD(d, 0) = t; + } + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(rd, desc); +} + +static uint32_t do_sha1c(union CRYPTO_STATE *d) +{ + return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); +} + +void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc) +{ + crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c); +} + +static uint32_t do_sha1p(union CRYPTO_STATE *d) +{ + return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); +} + +void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc) +{ + crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p); +} + +static uint32_t do_sha1m(union CRYPTO_STATE *d) +{ + return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3)); +} + +void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc) +{ + crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m); +} + +void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rm = vm; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2); + CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0; + + rd[0] = m.l[0]; + rd[1] = m.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1); + CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1); + CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1); + CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +/* + * The SHA-256 logical functions, according to + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + */ + +static uint32_t S0(uint32_t x) +{ + return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); +} + +static uint32_t S1(uint32_t x) +{ + return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); +} + +static uint32_t s0(uint32_t x) +{ + return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); +} + +static uint32_t s1(uint32_t x) +{ + return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); +} + +void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 4; i++) { + uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2)) + + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0)) + + CR_ST_WORD(m, i); + + CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2); + CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1); + CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0); + CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t; + + t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) + + S0(CR_ST_WORD(d, 0)); + + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); + 
CR_ST_WORD(d, 0) = t; + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + int i; + + for (i = 0; i < 4; i++) { + uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) + + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0)) + + CR_ST_WORD(m, i); + + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); + CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t; + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1)); + CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2)); + CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3)); + CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0)); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + + CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1); + CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2); + CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3); + CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +/* + * The SHA-512 logical functions (same as above but using 64-bit operands) + */ + +static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & (y ^ z)) ^ z; +} + +static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & y) | ((x | y) & z); +} + +static uint64_t S0_512(uint64_t x) +{ + return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); +} + +static uint64_t S1_512(uint64_t x) +{ + return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); +} + +static uint64_t s0_512(uint64_t x) +{ + return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); +} + +static uint64_t s1_512(uint64_t x) +{ + return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); +} + +void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]); + d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]); + + rd[0] = d0; + rd[1] = d1; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]); + d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]); + + rd[0] = d0; + rd[1] = d1; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d0 += s0_512(rd[1]); + d1 += s0_512(rn[0]); + + rd[0] = d0; + rd[1] = d1; + + clear_tail_16(vd, desc); +} 
+ +void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + + rd[0] += s1_512(rn[0]) + rm[0]; + rd[1] += s1_512(rn[1]) + rm[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t; + + t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17); + CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17); + CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17); + CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17); + CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25); + + CR_ST_WORD(d, 0) ^= t; + CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25); + CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25); + CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^ + ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(vd, desc); +} + +static inline void QEMU_ALWAYS_INLINE +crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm, + uint32_t desc, uint32_t opcode) +{ + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t imm2 = simd_data(desc); + uint32_t t; + + assert(imm2 < 4); + + if (opcode == 0 || opcode == 2) { + /* SM3TT1A, SM3TT2A */ + t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else if (opcode == 1) { + /* SM3TT1B */ + t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else if (opcode == 3) { + /* SM3TT2B */ + t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else { + qemu_build_not_reached(); + } + + t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2); + + CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1); + + if (opcode < 2) { + /* SM3TT1A, SM3TT1B */ + t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20); + + CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23); + } else { + /* SM3TT2A, SM3TT2B */ + t += CR_ST_WORD(n, 3); + t ^= rol32(t, 9) ^ rol32(t, 17); + + CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13); + } + + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3); + CR_ST_WORD(d, 3) = t; + + rd[0] = d.l[0]; + rd[1] = d.l[1]; + + clear_tail_16(rd, desc); +} + +#define DO_SM3TT(NAME, OPCODE) \ + void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ + { crypto_sm3tt(vd, vn, vm, desc, OPCODE); } + +DO_SM3TT(crypto_sm3tt1a, 0) +DO_SM3TT(crypto_sm3tt1b, 1) +DO_SM3TT(crypto_sm3tt2a, 2) +DO_SM3TT(crypto_sm3tt2b, 3) + +#undef DO_SM3TT + +static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm) +{ + union CRYPTO_STATE d = { .l = { rn[0], rn[1] } }; + 
union CRYPTO_STATE n = { .l = { rm[0], rm[1] } }; + uint32_t t, i; + + for (i = 0; i < 4; i++) { + t = CR_ST_WORD(d, (i + 1) % 4) ^ + CR_ST_WORD(d, (i + 2) % 4) ^ + CR_ST_WORD(d, (i + 3) % 4) ^ + CR_ST_WORD(n, i); + + t = sm4_sbox[t & 0xff] | + sm4_sbox[(t >> 8) & 0xff] << 8 | + sm4_sbox[(t >> 16) & 0xff] << 16 | + sm4_sbox[(t >> 24) & 0xff] << 24; + + CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ + rol32(t, 24); + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_sm4e(vd + i, vn + i, vm + i); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm) +{ + union CRYPTO_STATE d; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t, i; + + d = n; + for (i = 0; i < 4; i++) { + t = CR_ST_WORD(d, (i + 1) % 4) ^ + CR_ST_WORD(d, (i + 2) % 4) ^ + CR_ST_WORD(d, (i + 3) % 4) ^ + CR_ST_WORD(m, i); + + t = sm4_sbox[t & 0xff] | + sm4_sbox[(t >> 8) & 0xff] << 8 | + sm4_sbox[(t >> 16) & 0xff] << 16 | + sm4_sbox[(t >> 24) & 0xff] << 24; + + CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23); + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm4ekey)(void *vd, void *vn, void* vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + + for (i = 0; i < opr_sz; i += 16) { + do_crypto_sm4ekey(vd + i, vn + i, vm + i); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = n[i] ^ rol64(m[i], 1); + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c new file mode 100644 index 0000000000..0972a4bdd0 --- /dev/null +++ b/target/arm/tcg/helper-a64.c @@ -0,0 +1,954 @@ +/* + * AArch64 specific helpers + * + * Copyright (c) 2013 Alexander Graf + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "cpu.h" +#include "exec/gdbstub.h" +#include "exec/helper-proto.h" +#include "qemu/host-utils.h" +#include "qemu/log.h" +#include "qemu/main-loop.h" +#include "qemu/bitops.h" +#include "internals.h" +#include "qemu/crc32c.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "qemu/int128.h" +#include "qemu/atomic128.h" +#include "fpu/softfloat.h" +#include /* For crc32 */ + +/* C2.4.7 Multiply and divide */ +/* special cases for 0 and LLONG_MIN are mandated by the standard */ +uint64_t HELPER(udiv64)(uint64_t num, uint64_t den) +{ + if (den == 0) { + return 0; + } + return num / den; +} + +int64_t HELPER(sdiv64)(int64_t num, int64_t den) +{ + if (den == 0) { + return 0; + } + if (num == LLONG_MIN && den == -1) { + return LLONG_MIN; + } + return num / den; +} + +uint64_t HELPER(rbit64)(uint64_t x) +{ + return revbit64(x); +} + +void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm) +{ + update_spsel(env, imm); +} + +static void daif_check(CPUARMState *env, uint32_t op, + uint32_t imm, uintptr_t ra) +{ + /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set. */ + if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) { + raise_exception_ra(env, EXCP_UDEF, + syn_aa64_sysregtrap(0, extract32(op, 0, 3), + extract32(op, 3, 3), 4, + imm, 0x1f, 0), + exception_target_el(env), ra); + } +} + +void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm) +{ + daif_check(env, 0x1e, imm, GETPC()); + env->daif |= (imm << 6) & PSTATE_DAIF; + arm_rebuild_hflags(env); +} + +void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm) +{ + daif_check(env, 0x1f, imm, GETPC()); + env->daif &= ~((imm << 6) & PSTATE_DAIF); + arm_rebuild_hflags(env); +} + +/* Convert a softfloat float_relation_ (as returned by + * the float*_compare functions) to the correct ARM + * NZCV flag state. 
+ */ +static inline uint32_t float_rel_to_flags(int res) +{ + uint64_t flags; + switch (res) { + case float_relation_equal: + flags = PSTATE_Z | PSTATE_C; + break; + case float_relation_less: + flags = PSTATE_N; + break; + case float_relation_greater: + flags = PSTATE_C; + break; + case float_relation_unordered: + default: + flags = PSTATE_C | PSTATE_V; + break; + } + return flags; +} + +uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status) +{ + return float_rel_to_flags(float16_compare_quiet(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status) +{ + return float_rel_to_flags(float16_compare(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status) +{ + return float_rel_to_flags(float32_compare_quiet(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status) +{ + return float_rel_to_flags(float32_compare(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status) +{ + return float_rel_to_flags(float64_compare_quiet(x, y, fp_status)); +} + +uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status) +{ + return float_rel_to_flags(float64_compare(x, y, fp_status)); +} + +float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float32_squash_input_denormal(a, fpst); + b = float32_squash_input_denormal(b, fpst); + + if ((float32_is_zero(a) && float32_is_infinity(b)) || + (float32_is_infinity(a) && float32_is_zero(b))) { + /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ + return make_float32((1U << 30) | + ((float32_val(a) ^ float32_val(b)) & (1U << 31))); + } + return float32_mul(a, b, fpst); +} + +float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float64_squash_input_denormal(a, fpst); + b = float64_squash_input_denormal(b, fpst); + + if ((float64_is_zero(a) && float64_is_infinity(b)) || + (float64_is_infinity(a) && float64_is_zero(b))) { + /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ + return make_float64((1ULL << 62) | + ((float64_val(a) ^ float64_val(b)) & (1ULL << 63))); + } + return float64_mul(a, b, fpst); +} + +/* 64bit/double versions of the neon float compare functions */ +uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float64_eq_quiet(a, b, fpst); +} + +uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float64_le(b, a, fpst); +} + +uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float64_lt(b, a, fpst); +} + +/* Reciprocal step and sqrt step. Note that unlike the A32/T32 + * versions, these do a fully fused multiply-add or + * multiply-add-and-halve. 
+ */ + +uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float16_squash_input_denormal(a, fpst); + b = float16_squash_input_denormal(b, fpst); + + a = float16_chs(a); + if ((float16_is_infinity(a) && float16_is_zero(b)) || + (float16_is_infinity(b) && float16_is_zero(a))) { + return float16_two; + } + return float16_muladd(a, b, float16_two, 0, fpst); +} + +float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float32_squash_input_denormal(a, fpst); + b = float32_squash_input_denormal(b, fpst); + + a = float32_chs(a); + if ((float32_is_infinity(a) && float32_is_zero(b)) || + (float32_is_infinity(b) && float32_is_zero(a))) { + return float32_two; + } + return float32_muladd(a, b, float32_two, 0, fpst); +} + +float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float64_squash_input_denormal(a, fpst); + b = float64_squash_input_denormal(b, fpst); + + a = float64_chs(a); + if ((float64_is_infinity(a) && float64_is_zero(b)) || + (float64_is_infinity(b) && float64_is_zero(a))) { + return float64_two; + } + return float64_muladd(a, b, float64_two, 0, fpst); +} + +uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float16_squash_input_denormal(a, fpst); + b = float16_squash_input_denormal(b, fpst); + + a = float16_chs(a); + if ((float16_is_infinity(a) && float16_is_zero(b)) || + (float16_is_infinity(b) && float16_is_zero(a))) { + return float16_one_point_five; + } + return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst); +} + +float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float32_squash_input_denormal(a, fpst); + b = float32_squash_input_denormal(b, fpst); + + a = float32_chs(a); + if ((float32_is_infinity(a) && float32_is_zero(b)) || + (float32_is_infinity(b) && float32_is_zero(a))) { + return float32_one_point_five; + } + return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst); +} + +float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float64_squash_input_denormal(a, fpst); + b = float64_squash_input_denormal(b, fpst); + + a = float64_chs(a); + if ((float64_is_infinity(a) && float64_is_zero(b)) || + (float64_is_infinity(b) && float64_is_zero(a))) { + return float64_one_point_five; + } + return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst); +} + +/* Pairwise long add: add pairs of adjacent elements into + * double-width elements in the result (eg _s8 is an 8x8->16 op) + */ +uint64_t HELPER(neon_addlp_s8)(uint64_t a) +{ + uint64_t nsignmask = 0x0080008000800080ULL; + uint64_t wsignmask = 0x8000800080008000ULL; + uint64_t elementmask = 0x00ff00ff00ff00ffULL; + uint64_t tmp1, tmp2; + uint64_t res, signres; + + /* Extract odd elements, sign extend each to a 16 bit field */ + tmp1 = a & elementmask; + tmp1 ^= nsignmask; + tmp1 |= wsignmask; + tmp1 = (tmp1 - nsignmask) ^ wsignmask; + /* Ditto for the even elements */ + tmp2 = (a >> 8) & elementmask; + tmp2 ^= nsignmask; + tmp2 |= wsignmask; + tmp2 = (tmp2 - nsignmask) ^ wsignmask; + + /* calculate the result by summing bits 0..14, 16..22, etc, + * and then adjusting the sign bits 15, 23, etc manually. + * This ensures the addition can't overflow the 16 bit field. 
+ */ + signres = (tmp1 ^ tmp2) & wsignmask; + res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask); + res ^= signres; + + return res; +} + +uint64_t HELPER(neon_addlp_u8)(uint64_t a) +{ + uint64_t tmp; + + tmp = a & 0x00ff00ff00ff00ffULL; + tmp += (a >> 8) & 0x00ff00ff00ff00ffULL; + return tmp; +} + +uint64_t HELPER(neon_addlp_s16)(uint64_t a) +{ + int32_t reslo, reshi; + + reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16); + reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48); + + return (uint32_t)reslo | (((uint64_t)reshi) << 32); +} + +uint64_t HELPER(neon_addlp_u16)(uint64_t a) +{ + uint64_t tmp; + + tmp = a & 0x0000ffff0000ffffULL; + tmp += (a >> 16) & 0x0000ffff0000ffffULL; + return tmp; +} + +/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */ +uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp) +{ + float_status *fpst = fpstp; + uint16_t val16, sbit; + int16_t exp; + + if (float16_is_any_nan(a)) { + float16 nan = a; + if (float16_is_signaling_nan(a, fpst)) { + float_raise(float_flag_invalid, fpst); + if (!fpst->default_nan_mode) { + nan = float16_silence_nan(a, fpst); + } + } + if (fpst->default_nan_mode) { + nan = float16_default_nan(fpst); + } + return nan; + } + + a = float16_squash_input_denormal(a, fpst); + + val16 = float16_val(a); + sbit = 0x8000 & val16; + exp = extract32(val16, 10, 5); + + if (exp == 0) { + return make_float16(deposit32(sbit, 10, 5, 0x1e)); + } else { + return make_float16(deposit32(sbit, 10, 5, ~exp)); + } +} + +float32 HELPER(frecpx_f32)(float32 a, void *fpstp) +{ + float_status *fpst = fpstp; + uint32_t val32, sbit; + int32_t exp; + + if (float32_is_any_nan(a)) { + float32 nan = a; + if (float32_is_signaling_nan(a, fpst)) { + float_raise(float_flag_invalid, fpst); + if (!fpst->default_nan_mode) { + nan = float32_silence_nan(a, fpst); + } + } + if (fpst->default_nan_mode) { + nan = float32_default_nan(fpst); + } + return nan; + } + + a = float32_squash_input_denormal(a, fpst); + + val32 = float32_val(a); + sbit = 0x80000000ULL & val32; + exp = extract32(val32, 23, 8); + + if (exp == 0) { + return make_float32(sbit | (0xfe << 23)); + } else { + return make_float32(sbit | (~exp & 0xff) << 23); + } +} + +float64 HELPER(frecpx_f64)(float64 a, void *fpstp) +{ + float_status *fpst = fpstp; + uint64_t val64, sbit; + int64_t exp; + + if (float64_is_any_nan(a)) { + float64 nan = a; + if (float64_is_signaling_nan(a, fpst)) { + float_raise(float_flag_invalid, fpst); + if (!fpst->default_nan_mode) { + nan = float64_silence_nan(a, fpst); + } + } + if (fpst->default_nan_mode) { + nan = float64_default_nan(fpst); + } + return nan; + } + + a = float64_squash_input_denormal(a, fpst); + + val64 = float64_val(a); + sbit = 0x8000000000000000ULL & val64; + exp = extract64(float64_val(a), 52, 11); + + if (exp == 0) { + return make_float64(sbit | (0x7feULL << 52)); + } else { + return make_float64(sbit | (~exp & 0x7ffULL) << 52); + } +} + +float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env) +{ + /* Von Neumann rounding is implemented by using round-to-zero + * and then setting the LSB of the result if Inexact was raised. 
+ */ + float32 r; + float_status *fpst = &env->vfp.fp_status; + float_status tstat = *fpst; + int exflags; + + set_float_rounding_mode(float_round_to_zero, &tstat); + set_float_exception_flags(0, &tstat); + r = float64_to_float32(a, &tstat); + exflags = get_float_exception_flags(&tstat); + if (exflags & float_flag_inexact) { + r = make_float32(float32_val(r) | 1); + } + exflags |= get_float_exception_flags(fpst); + set_float_exception_flags(exflags, fpst); + return r; +} + +/* 64-bit versions of the CRC helpers. Note that although the operation + * (and the prototypes of crc32c() and crc32() mean that only the bottom + * 32 bits of the accumulator and result are used, we pass and return + * uint64_t for convenience of the generated code. Unlike the 32-bit + * instruction set versions, val may genuinely have 64 bits of data in it. + * The upper bytes of val (above the number specified by 'bytes') must have + * been zeroed out by the caller. + */ +uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes) +{ + uint8_t buf[8]; + + stq_le_p(buf, val); + + /* zlib crc32 converts the accumulator and output to one's complement. */ + return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff; +} + +uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes) +{ + uint8_t buf[8]; + + stq_le_p(buf, val); + + /* Linux crc32c converts the output to one's complement. */ + return crc32c(acc, buf, bytes) ^ 0xffffffff; +} + +/* + * AdvSIMD half-precision + */ + +#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix)) + +#define ADVSIMD_HALFOP(name) \ +uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \ +{ \ + float_status *fpst = fpstp; \ + return float16_ ## name(a, b, fpst); \ +} + +ADVSIMD_HALFOP(add) +ADVSIMD_HALFOP(sub) +ADVSIMD_HALFOP(mul) +ADVSIMD_HALFOP(div) +ADVSIMD_HALFOP(min) +ADVSIMD_HALFOP(max) +ADVSIMD_HALFOP(minnum) +ADVSIMD_HALFOP(maxnum) + +#define ADVSIMD_TWOHALFOP(name) \ +uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \ +{ \ + float16 a1, a2, b1, b2; \ + uint32_t r1, r2; \ + float_status *fpst = fpstp; \ + a1 = extract32(two_a, 0, 16); \ + a2 = extract32(two_a, 16, 16); \ + b1 = extract32(two_b, 0, 16); \ + b2 = extract32(two_b, 16, 16); \ + r1 = float16_ ## name(a1, b1, fpst); \ + r2 = float16_ ## name(a2, b2, fpst); \ + return deposit32(r1, 16, 16, r2); \ +} + +ADVSIMD_TWOHALFOP(add) +ADVSIMD_TWOHALFOP(sub) +ADVSIMD_TWOHALFOP(mul) +ADVSIMD_TWOHALFOP(div) +ADVSIMD_TWOHALFOP(min) +ADVSIMD_TWOHALFOP(max) +ADVSIMD_TWOHALFOP(minnum) +ADVSIMD_TWOHALFOP(maxnum) + +/* Data processing - scalar floating-point and advanced SIMD */ +static float16 float16_mulx(float16 a, float16 b, void *fpstp) +{ + float_status *fpst = fpstp; + + a = float16_squash_input_denormal(a, fpst); + b = float16_squash_input_denormal(b, fpst); + + if ((float16_is_zero(a) && float16_is_infinity(b)) || + (float16_is_infinity(a) && float16_is_zero(b))) { + /* 2.0 with the sign bit set to sign(A) XOR sign(B) */ + return make_float16((1U << 14) | + ((float16_val(a) ^ float16_val(b)) & (1U << 15))); + } + return float16_mul(a, b, fpst); +} + +ADVSIMD_HALFOP(mulx) +ADVSIMD_TWOHALFOP(mulx) + +/* fused multiply-accumulate */ +uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c, + void *fpstp) +{ + float_status *fpst = fpstp; + return float16_muladd(a, b, c, 0, fpst); +} + +uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b, + uint32_t two_c, void *fpstp) +{ + float_status *fpst = fpstp; + float16 a1, a2, 
b1, b2, c1, c2; + uint32_t r1, r2; + a1 = extract32(two_a, 0, 16); + a2 = extract32(two_a, 16, 16); + b1 = extract32(two_b, 0, 16); + b2 = extract32(two_b, 16, 16); + c1 = extract32(two_c, 0, 16); + c2 = extract32(two_c, 16, 16); + r1 = float16_muladd(a1, b1, c1, 0, fpst); + r2 = float16_muladd(a2, b2, c2, 0, fpst); + return deposit32(r1, 16, 16, r2); +} + +/* + * Floating point comparisons produce an integer result. Softfloat + * routines return float_relation types which we convert to the 0/-1 + * Neon requires. + */ + +#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0 + +uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + int compare = float16_compare_quiet(a, b, fpst); + return ADVSIMD_CMPRES(compare == float_relation_equal); +} + +uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + int compare = float16_compare(a, b, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater || + compare == float_relation_equal); +} + +uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + int compare = float16_compare(a, b, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater); +} + +uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float16 f0 = float16_abs(a); + float16 f1 = float16_abs(b); + int compare = float16_compare(f0, f1, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater || + compare == float_relation_equal); +} + +uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float16 f0 = float16_abs(a); + float16 f1 = float16_abs(b); + int compare = float16_compare(f0, f1, fpst); + return ADVSIMD_CMPRES(compare == float_relation_greater); +} + +/* round to integral */ +uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status) +{ + return float16_round_to_int(x, fp_status); +} + +uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status) +{ + int old_flags = get_float_exception_flags(fp_status), new_flags; + float16 ret; + + ret = float16_round_to_int(x, fp_status); + + /* Suppress any inexact exceptions the conversion produced */ + if (!(old_flags & float_flag_inexact)) { + new_flags = get_float_exception_flags(fp_status); + set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status); + } + + return ret; +} + +/* + * Half-precision floating point conversion functions + * + * There are a multitude of conversion functions with various + * different rounding modes. This is dealt with by the calling code + * setting the mode appropriately before calling the helper. 
+ */ + +uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp) +{ + float_status *fpst = fpstp; + + /* Invalid if we are passed a NaN */ + if (float16_is_any_nan(a)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_int16(a, fpst); +} + +uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp) +{ + float_status *fpst = fpstp; + + /* Invalid if we are passed a NaN */ + if (float16_is_any_nan(a)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_uint16(a, fpst); +} + +static int el_from_spsr(uint32_t spsr) +{ + /* Return the exception level that this SPSR is requesting a return to, + * or -1 if it is invalid (an illegal return) + */ + if (spsr & PSTATE_nRW) { + switch (spsr & CPSR_M) { + case ARM_CPU_MODE_USR: + return 0; + case ARM_CPU_MODE_HYP: + return 2; + case ARM_CPU_MODE_FIQ: + case ARM_CPU_MODE_IRQ: + case ARM_CPU_MODE_SVC: + case ARM_CPU_MODE_ABT: + case ARM_CPU_MODE_UND: + case ARM_CPU_MODE_SYS: + return 1; + case ARM_CPU_MODE_MON: + /* Returning to Mon from AArch64 is never possible, + * so this is an illegal return. + */ + default: + return -1; + } + } else { + if (extract32(spsr, 1, 1)) { + /* Return with reserved M[1] bit set */ + return -1; + } + if (extract32(spsr, 0, 4) == 1) { + /* return to EL0 with M[0] bit set */ + return -1; + } + return extract32(spsr, 2, 2); + } +} + +static void cpsr_write_from_spsr_elx(CPUARMState *env, + uint32_t val) +{ + uint32_t mask; + + /* Save SPSR_ELx.SS into PSTATE. */ + env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); + val &= ~PSTATE_SS; + + /* Move DIT to the correct location for CPSR */ + if (val & PSTATE_DIT) { + val &= ~PSTATE_DIT; + val |= CPSR_DIT; + } + + mask = aarch32_cpsr_valid_mask(env->features, \ + &env_archcpu(env)->isar); + cpsr_write(env, val, mask, CPSRWriteRaw); +} + +void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) +{ + int cur_el = arm_current_el(env); + unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el); + uint32_t spsr = env->banked_spsr[spsr_idx]; + int new_el; + bool return_to_aa64 = (spsr & PSTATE_nRW) == 0; + + aarch64_save_sp(env, cur_el); + + arm_clear_exclusive(env); + + /* We must squash the PSTATE.SS bit to zero unless both of the + * following hold: + * 1. debug exceptions are currently disabled + * 2. singlestep will be active in the EL we return to + * We check 1 here and 2 after we've done the pstate/cpsr write() to + * transition to the EL we're going to. + */ + if (arm_generate_debug_exceptions(env)) { + spsr &= ~PSTATE_SS; + } + + new_el = el_from_spsr(spsr); + if (new_el == -1) { + goto illegal_return; + } + if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) { + /* Disallow return to an EL which is unimplemented or higher + * than the current one. + */ + goto illegal_return; + } + + if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) { + /* Return to an EL which is configured for a different register width */ + goto illegal_return; + } + + if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { + goto illegal_return; + } + + qemu_mutex_lock_iothread(); + arm_call_pre_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); + + if (!return_to_aa64) { + env->aarch64 = false; + /* We do a raw CPSR write because aarch64_sync_64_to_32() + * will sort the register banks out for us, and we've already + * caught all the bad-mode cases in el_from_spsr(). 
+ */ + cpsr_write_from_spsr_elx(env, spsr); + if (!arm_singlestep_active(env)) { + env->pstate &= ~PSTATE_SS; + } + aarch64_sync_64_to_32(env); + + if (spsr & CPSR_T) { + env->regs[15] = new_pc & ~0x1; + } else { + env->regs[15] = new_pc & ~0x3; + } + helper_rebuild_hflags_a32(env, new_el); + qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " + "AArch32 EL%d PC 0x%" PRIx32 "\n", + cur_el, new_el, env->regs[15]); + } else { + int tbii; + + env->aarch64 = true; + spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar); + pstate_write(env, spsr); + if (!arm_singlestep_active(env)) { + env->pstate &= ~PSTATE_SS; + } + aarch64_restore_sp(env, new_el); + helper_rebuild_hflags_a64(env, new_el); + + /* + * Apply TBI to the exception return address. We had to delay this + * until after we selected the new EL, so that we could select the + * correct TBI+TBID bits. This is made easier by waiting until after + * the hflags rebuild, since we can pull the composite TBII field + * from there. + */ + tbii = EX_TBFLAG_A64(env->hflags, TBII); + if ((tbii >> extract64(new_pc, 55, 1)) & 1) { + /* TBI is enabled. */ + int core_mmu_idx = cpu_mmu_index(env, false); + if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) { + new_pc = sextract64(new_pc, 0, 56); + } else { + new_pc = extract64(new_pc, 0, 56); + } + } + env->pc = new_pc; + + qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to " + "AArch64 EL%d PC 0x%" PRIx64 "\n", + cur_el, new_el, env->pc); + } + + /* + * Note that cur_el can never be 0. If new_el is 0, then + * el0_a64 is return_to_aa64, else el0_a64 is ignored. + */ + aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64); + + qemu_mutex_lock_iothread(); + arm_call_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); + + return; + +illegal_return: + /* Illegal return events of various kinds have architecturally + * mandated behaviour: + * restore NZCV and DAIF from SPSR_ELx + * set PSTATE.IL + * restore PC from ELR_ELx + * no change to exception level, execution state or stack pointer + */ + env->pstate |= PSTATE_IL; + env->pc = new_pc; + spsr &= PSTATE_NZCV | PSTATE_DAIF; + spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF); + pstate_write(env, spsr); + if (!arm_singlestep_active(env)) { + env->pstate &= ~PSTATE_SS; + } + helper_rebuild_hflags_a64(env, cur_el); + qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: " + "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc); +} + +/* + * Square Root and Reciprocal square root + */ + +uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp) +{ + float_status *s = fpstp; + + return float16_sqrt(a, s); +} + +void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) +{ + /* + * Implement DC ZVA, which zeroes a fixed-length block of memory. + * Note that we do not implement the (architecturally mandated) + * alignment fault for attempts to use this on Device memory + * (which matches the usual QEMU behaviour of not implementing either + * alignment faults or any memory attribute handling). + */ + int blocklen = 4 << env_archcpu(env)->dcz_blocksize; + uint64_t vaddr = vaddr_in & ~(blocklen - 1); + int mmu_idx = cpu_mmu_index(env, false); + void *mem; + + /* + * Trapless lookup. In addition to actual invalid page, may + * return NULL for I/O, watchpoints, clean pages, etc. + */ + mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx); + +#ifndef CONFIG_USER_ONLY + if (unlikely(!mem)) { + uintptr_t ra = GETPC(); + + /* + * Trap if accessing an invalid page. 
DC_ZVA requires that we supply + * the original pointer for an invalid page. But watchpoints require + * that we probe the actual space. So do both. + */ + (void) probe_write(env, vaddr_in, 1, mmu_idx, ra); + mem = probe_write(env, vaddr, blocklen, mmu_idx, ra); + + if (unlikely(!mem)) { + /* + * The only remaining reason for mem == NULL is I/O. + * Just do a series of byte writes as the architecture demands. + */ + for (int i = 0; i < blocklen; i++) { + cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra); + } + return; + } + } +#endif + + memset(mem, 0, blocklen); +} diff --git a/target/arm/tcg/iwmmxt_helper.c b/target/arm/tcg/iwmmxt_helper.c new file mode 100644 index 0000000000..610b1b2103 --- /dev/null +++ b/target/arm/tcg/iwmmxt_helper.c @@ -0,0 +1,670 @@ +/* + * iwMMXt micro operations for XScale. + * + * Copyright (c) 2007 OpenedHand, Ltd. + * Written by Andrzej Zaborowski + * Copyright (c) 2008 CodeSourcery + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/helper-proto.h" + +/* iwMMXt macros extracted from GNU gdb. */ + +/* Set the SIMD wCASF flags for 8, 16, 32 or 64-bit operations. */ +#define SIMD8_SET(v, n, b) ((v != 0) << ((((b) + 1) * 4) + (n))) +#define SIMD16_SET(v, n, h) ((v != 0) << ((((h) + 1) * 8) + (n))) +#define SIMD32_SET(v, n, w) ((v != 0) << ((((w) + 1) * 16) + (n))) +#define SIMD64_SET(v, n) ((v != 0) << (32 + (n))) +/* Flags to pass as "n" above. */ +#define SIMD_NBIT -1 +#define SIMD_ZBIT -2 +#define SIMD_CBIT -3 +#define SIMD_VBIT -4 +/* Various status bit macros. */ +#define NBIT8(x) ((x) & 0x80) +#define NBIT16(x) ((x) & 0x8000) +#define NBIT32(x) ((x) & 0x80000000) +#define NBIT64(x) ((x) & 0x8000000000000000ULL) +#define ZBIT8(x) (((x) & 0xff) == 0) +#define ZBIT16(x) (((x) & 0xffff) == 0) +#define ZBIT32(x) (((x) & 0xffffffff) == 0) +#define ZBIT64(x) (x == 0) +/* Sign extension macros. */ +#define EXTEND8H(a) ((uint16_t) (int8_t) (a)) +#define EXTEND8(a) ((uint32_t) (int8_t) (a)) +#define EXTEND16(a) ((uint32_t) (int16_t) (a)) +#define EXTEND16S(a) ((int32_t) (int16_t) (a)) +#define EXTEND32(a) ((uint64_t) (int32_t) (a)) + +uint64_t HELPER(iwmmxt_maddsq)(uint64_t a, uint64_t b) +{ + a = (( + EXTEND16S((a >> 0) & 0xffff) * EXTEND16S((b >> 0) & 0xffff) + + EXTEND16S((a >> 16) & 0xffff) * EXTEND16S((b >> 16) & 0xffff) + ) & 0xffffffff) | ((uint64_t) ( + EXTEND16S((a >> 32) & 0xffff) * EXTEND16S((b >> 32) & 0xffff) + + EXTEND16S((a >> 48) & 0xffff) * EXTEND16S((b >> 48) & 0xffff) + ) << 32); + return a; +} + +uint64_t HELPER(iwmmxt_madduq)(uint64_t a, uint64_t b) +{ + a = (( + ((a >> 0) & 0xffff) * ((b >> 0) & 0xffff) + + ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff) + ) & 0xffffffff) | (( + ((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) + + ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff) + ) << 32); + return a; +} + +uint64_t HELPER(iwmmxt_sadb)(uint64_t a, uint64_t b) +{ +#define abs(x) (((x) >= 0) ? 
x : -x) +#define SADB(SHR) abs((int) ((a >> SHR) & 0xff) - (int) ((b >> SHR) & 0xff)) + return + SADB(0) + SADB(8) + SADB(16) + SADB(24) + + SADB(32) + SADB(40) + SADB(48) + SADB(56); +#undef SADB +} + +uint64_t HELPER(iwmmxt_sadw)(uint64_t a, uint64_t b) +{ +#define SADW(SHR) \ + abs((int) ((a >> SHR) & 0xffff) - (int) ((b >> SHR) & 0xffff)) + return SADW(0) + SADW(16) + SADW(32) + SADW(48); +#undef SADW +} + +uint64_t HELPER(iwmmxt_mulslw)(uint64_t a, uint64_t b) +{ +#define MULS(SHR) ((uint64_t) ((( \ + EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ + ) >> 0) & 0xffff) << SHR) + return MULS(0) | MULS(16) | MULS(32) | MULS(48); +#undef MULS +} + +uint64_t HELPER(iwmmxt_mulshw)(uint64_t a, uint64_t b) +{ +#define MULS(SHR) ((uint64_t) ((( \ + EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \ + ) >> 16) & 0xffff) << SHR) + return MULS(0) | MULS(16) | MULS(32) | MULS(48); +#undef MULS +} + +uint64_t HELPER(iwmmxt_mululw)(uint64_t a, uint64_t b) +{ +#define MULU(SHR) ((uint64_t) ((( \ + ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ + ) >> 0) & 0xffff) << SHR) + return MULU(0) | MULU(16) | MULU(32) | MULU(48); +#undef MULU +} + +uint64_t HELPER(iwmmxt_muluhw)(uint64_t a, uint64_t b) +{ +#define MULU(SHR) ((uint64_t) ((( \ + ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \ + ) >> 16) & 0xffff) << SHR) + return MULU(0) | MULU(16) | MULU(32) | MULU(48); +#undef MULU +} + +uint64_t HELPER(iwmmxt_macsw)(uint64_t a, uint64_t b) +{ +#define MACS(SHR) ( \ + EXTEND16((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff)) + return (int64_t) (MACS(0) + MACS(16) + MACS(32) + MACS(48)); +#undef MACS +} + +uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b) +{ +#define MACU(SHR) ( \ + (uint32_t) ((a >> SHR) & 0xffff) * \ + (uint32_t) ((b >> SHR) & 0xffff)) + return MACU(0) + MACU(16) + MACU(32) + MACU(48); +#undef MACU +} + +#define NZBIT8(x, i) \ + SIMD8_SET(NBIT8((x) & 0xff), SIMD_NBIT, i) | \ + SIMD8_SET(ZBIT8((x) & 0xff), SIMD_ZBIT, i) +#define NZBIT16(x, i) \ + SIMD16_SET(NBIT16((x) & 0xffff), SIMD_NBIT, i) | \ + SIMD16_SET(ZBIT16((x) & 0xffff), SIMD_ZBIT, i) +#define NZBIT32(x, i) \ + SIMD32_SET(NBIT32((x) & 0xffffffff), SIMD_NBIT, i) | \ + SIMD32_SET(ZBIT32((x) & 0xffffffff), SIMD_ZBIT, i) +#define NZBIT64(x) \ + SIMD64_SET(NBIT64(x), SIMD_NBIT) | \ + SIMD64_SET(ZBIT64(x), SIMD_ZBIT) +#define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3) \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) | \ + (((a >> SH1) & 0xff) << 16) | (((b >> SH1) & 0xff) << 24) | \ + (((a >> SH2) & 0xff) << 32) | (((b >> SH2) & 0xff) << 40) | \ + (((a >> SH3) & 0xff) << 48) | (((b >> SH3) & 0xff) << 56); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + (((a >> SH0) & 0xffff) << 0) | \ + (((b >> SH0) & 0xffff) << 16) | \ + (((a >> SH2) & 0xffff) << 32) | \ + (((b >> SH2) & 0xffff) << 48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT8(a >> 0, 0) | NZBIT8(a >> 16, 1) | \ + NZBIT8(a >> 32, 2) | NZBIT8(a >> 48, 3); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + (((a >> SH0) & 0xffffffff) 
<< 0) | \ + (((b >> SH0) & 0xffffffff) << 32); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + (((x >> SH0) & 0xff) << 0) | \ + (((x >> SH1) & 0xff) << 16) | \ + (((x >> SH2) & 0xff) << 32) | \ + (((x >> SH3) & 0xff) << 48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + (((x >> SH0) & 0xffff) << 0) | \ + (((x >> SH2) & 0xffff) << 32); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = (((x >> SH0) & 0xffffffff) << 0); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) | \ + ((uint64_t) EXTEND8H((x >> SH1) & 0xff) << 16) | \ + ((uint64_t) EXTEND8H((x >> SH2) & 0xff) << 32) | \ + ((uint64_t) EXTEND8H((x >> SH3) & 0xff) << 48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \ + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = \ + ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) | \ + ((uint64_t) EXTEND16((x >> SH2) & 0xffff) << 32); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \ + return x; \ +} \ +uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUARMState *env, \ + uint64_t x) \ +{ \ + x = EXTEND32((x >> SH0) & 0xffffffff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \ + return x; \ +} +IWMMXT_OP_UNPACK(l, 0, 8, 16, 24) +IWMMXT_OP_UNPACK(h, 32, 40, 48, 56) + +#define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O) \ +uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = \ + CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) | \ + CMP(16, Tb, O, 0xff) | CMP(24, Tb, O, 0xff) | \ + CMP(32, Tb, O, 0xff) | CMP(40, Tb, O, 0xff) | \ + CMP(48, Tb, O, 0xff) | CMP(56, Tb, O, 0xff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \ + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \ + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \ + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) | \ + CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \ + NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \ + return a; \ +} \ +uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUARMState *env, \ + uint64_t a, uint64_t b) \ +{ \ + a = CMP(0, Tl, O, 0xffffffff) | \ + CMP(32, Tl, O, 0xffffffff); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \ + return a; \ +} +#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ + (TYPE) ((b >> SHR) & MASK)) ? 
(uint64_t) MASK : 0) << SHR) +IWMMXT_OP_CMP(cmpeq, uint8_t, uint16_t, uint32_t, ==) +IWMMXT_OP_CMP(cmpgts, int8_t, int16_t, int32_t, >) +IWMMXT_OP_CMP(cmpgtu, uint8_t, uint16_t, uint32_t, >) +#undef CMP +#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \ + (TYPE) ((b >> SHR) & MASK)) ? a : b) & ((uint64_t) MASK << SHR)) +IWMMXT_OP_CMP(mins, int8_t, int16_t, int32_t, <) +IWMMXT_OP_CMP(minu, uint8_t, uint16_t, uint32_t, <) +IWMMXT_OP_CMP(maxs, int8_t, int16_t, int32_t, >) +IWMMXT_OP_CMP(maxu, uint8_t, uint16_t, uint32_t, >) +#undef CMP +#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ + OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) +IWMMXT_OP_CMP(subn, uint8_t, uint16_t, uint32_t, -) +IWMMXT_OP_CMP(addn, uint8_t, uint16_t, uint32_t, +) +#undef CMP +/* TODO Signed- and Unsigned-Saturation */ +#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \ + OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR) +IWMMXT_OP_CMP(subu, uint8_t, uint16_t, uint32_t, -) +IWMMXT_OP_CMP(addu, uint8_t, uint16_t, uint32_t, +) +IWMMXT_OP_CMP(subs, int8_t, int16_t, int32_t, -) +IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +) +#undef CMP +#undef IWMMXT_OP_CMP + +#define AVGB(SHR) ((( \ + ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR) +#define IWMMXT_OP_AVGB(r) \ +uint64_t HELPER(iwmmxt_avgb##r)(CPUARMState *env, uint64_t a, uint64_t b) \ +{ \ + const int round = r; \ + a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) | \ + AVGB(32) | AVGB(40) | AVGB(48) | AVGB(56); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + SIMD8_SET(ZBIT8((a >> 0) & 0xff), SIMD_ZBIT, 0) | \ + SIMD8_SET(ZBIT8((a >> 8) & 0xff), SIMD_ZBIT, 1) | \ + SIMD8_SET(ZBIT8((a >> 16) & 0xff), SIMD_ZBIT, 2) | \ + SIMD8_SET(ZBIT8((a >> 24) & 0xff), SIMD_ZBIT, 3) | \ + SIMD8_SET(ZBIT8((a >> 32) & 0xff), SIMD_ZBIT, 4) | \ + SIMD8_SET(ZBIT8((a >> 40) & 0xff), SIMD_ZBIT, 5) | \ + SIMD8_SET(ZBIT8((a >> 48) & 0xff), SIMD_ZBIT, 6) | \ + SIMD8_SET(ZBIT8((a >> 56) & 0xff), SIMD_ZBIT, 7); \ + return a; \ +} +IWMMXT_OP_AVGB(0) +IWMMXT_OP_AVGB(1) +#undef IWMMXT_OP_AVGB +#undef AVGB + +#define AVGW(SHR) ((( \ + ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR) +#define IWMMXT_OP_AVGW(r) \ +uint64_t HELPER(iwmmxt_avgw##r)(CPUARMState *env, uint64_t a, uint64_t b) \ +{ \ + const int round = r; \ + a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48); \ + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \ + SIMD16_SET(ZBIT16((a >> 0) & 0xffff), SIMD_ZBIT, 0) | \ + SIMD16_SET(ZBIT16((a >> 16) & 0xffff), SIMD_ZBIT, 1) | \ + SIMD16_SET(ZBIT16((a >> 32) & 0xffff), SIMD_ZBIT, 2) | \ + SIMD16_SET(ZBIT16((a >> 48) & 0xffff), SIMD_ZBIT, 3); \ + return a; \ +} +IWMMXT_OP_AVGW(0) +IWMMXT_OP_AVGW(1) +#undef IWMMXT_OP_AVGW +#undef AVGW + +uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n) +{ + a >>= n << 3; + a |= b << (64 - (n << 3)); + return a; +} + +uint64_t HELPER(iwmmxt_insr)(uint64_t x, uint32_t a, uint32_t b, uint32_t n) +{ + x &= ~((uint64_t) b << n); + x |= (uint64_t) (a & b) << n; + return x; +} + +uint32_t HELPER(iwmmxt_setpsr_nz)(uint64_t x) +{ + return SIMD64_SET((x == 0), SIMD_ZBIT) | + SIMD64_SET((x & (1ULL << 63)), SIMD_NBIT); +} + +uint64_t HELPER(iwmmxt_bcstb)(uint32_t arg) +{ + arg &= 0xff; + return + ((uint64_t) arg << 0 ) | ((uint64_t) arg << 8 ) | + ((uint64_t) arg << 16) | ((uint64_t) arg << 24) | + ((uint64_t) arg << 32) | ((uint64_t) arg << 40) | + ((uint64_t) arg << 48) | ((uint64_t) arg << 56); +} + +uint64_t HELPER(iwmmxt_bcstw)(uint32_t arg) +{ + arg &= 
0xffff; + return + ((uint64_t) arg << 0 ) | ((uint64_t) arg << 16) | + ((uint64_t) arg << 32) | ((uint64_t) arg << 48); +} + +uint64_t HELPER(iwmmxt_bcstl)(uint32_t arg) +{ + return arg | ((uint64_t) arg << 32); +} + +uint64_t HELPER(iwmmxt_addcb)(uint64_t x) +{ + return + ((x >> 0) & 0xff) + ((x >> 8) & 0xff) + + ((x >> 16) & 0xff) + ((x >> 24) & 0xff) + + ((x >> 32) & 0xff) + ((x >> 40) & 0xff) + + ((x >> 48) & 0xff) + ((x >> 56) & 0xff); +} + +uint64_t HELPER(iwmmxt_addcw)(uint64_t x) +{ + return + ((x >> 0) & 0xffff) + ((x >> 16) & 0xffff) + + ((x >> 32) & 0xffff) + ((x >> 48) & 0xffff); +} + +uint64_t HELPER(iwmmxt_addcl)(uint64_t x) +{ + return (x & 0xffffffff) + (x >> 32); +} + +uint32_t HELPER(iwmmxt_msbb)(uint64_t x) +{ + return + ((x >> 7) & 0x01) | ((x >> 14) & 0x02) | + ((x >> 21) & 0x04) | ((x >> 28) & 0x08) | + ((x >> 35) & 0x10) | ((x >> 42) & 0x20) | + ((x >> 49) & 0x40) | ((x >> 56) & 0x80); +} + +uint32_t HELPER(iwmmxt_msbw)(uint64_t x) +{ + return + ((x >> 15) & 0x01) | ((x >> 30) & 0x02) | + ((x >> 45) & 0x04) | ((x >> 52) & 0x08); +} + +uint32_t HELPER(iwmmxt_msbl)(uint64_t x) +{ + return ((x >> 31) & 0x01) | ((x >> 62) & 0x02); +} + +/* FIXME: Split wCASF setting into a separate op to avoid env use. */ +uint64_t HELPER(iwmmxt_srlw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) | + (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) | + (((x & (0xffffll << 32)) >> n) & (0xffffll << 32)) | + (((x & (0xffffll << 48)) >> n) & (0xffffll << 48)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_srll)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((x & (0xffffffffll << 0)) >> n) | + ((x >> n) & (0xffffffffll << 32)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_srlq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x >>= n; + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_sllw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) | + (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) | + (((x & (0xffffll << 32)) << n) & (0xffffll << 32)) | + (((x & (0xffffll << 48)) << n) & (0xffffll << 48)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_slll)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((x << n) & (0xffffffffll << 0)) | + ((x & (0xffffffffll << 32)) << n); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_sllq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x <<= n; + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_sraw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) | + ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) | + ((uint64_t) ((EXTEND16(x >> 32) >> n) & 0xffff) << 32) | + ((uint64_t) ((EXTEND16(x >> 48) >> n) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_sral)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) | + (((EXTEND32(x >> 
32) >> n) & 0xffffffff) << 32); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_sraq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (int64_t) x >> n; + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_rorw)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((((x & (0xffffll << 0)) >> n) | + ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) | + ((((x & (0xffffll << 16)) >> n) | + ((x & (0xffffll << 16)) << (16 - n))) & (0xffffll << 16)) | + ((((x & (0xffffll << 32)) >> n) | + ((x & (0xffffll << 32)) << (16 - n))) & (0xffffll << 32)) | + ((((x & (0xffffll << 48)) >> n) | + ((x & (0xffffll << 48)) << (16 - n))) & (0xffffll << 48)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +uint64_t HELPER(iwmmxt_rorl)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ((x & (0xffffffffll << 0)) >> n) | + ((x >> n) & (0xffffffffll << 32)) | + ((x << (32 - n)) & (0xffffffffll << 0)) | + ((x & (0xffffffffll << 32)) << (32 - n)); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); + return x; +} + +uint64_t HELPER(iwmmxt_rorq)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = ror64(x, n); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x); + return x; +} + +uint64_t HELPER(iwmmxt_shufh)(CPUARMState *env, uint64_t x, uint32_t n) +{ + x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) | + (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) | + (((x >> ((n << 0) & 0x30)) & 0xffff) << 32) | + (((x >> ((n >> 2) & 0x30)) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | + NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); + return x; +} + +/* TODO: Unsigned-Saturation */ +uint64_t HELPER(iwmmxt_packuw)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | + (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | + (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | + (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); + return a; +} + +uint64_t HELPER(iwmmxt_packul)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | + (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | + NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); + return a; +} + +uint64_t HELPER(iwmmxt_packuq)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); + return a; +} + +/* TODO: Signed-Saturation */ +uint64_t HELPER(iwmmxt_packsw)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) | + (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) | + (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) | + (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | + NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | + NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | + NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 
7); + return a; +} + +uint64_t HELPER(iwmmxt_packsl)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) | + (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | + NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); + return a; +} + +uint64_t HELPER(iwmmxt_packsq)(CPUARMState *env, uint64_t a, uint64_t b) +{ + a = (a & 0xffffffff) | ((b & 0xffffffff) << 32); + env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = + NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); + return a; +} + +uint64_t HELPER(iwmmxt_muladdsl)(uint64_t c, uint32_t a, uint32_t b) +{ + return c + ((int32_t) EXTEND32(a) * (int32_t) EXTEND32(b)); +} + +uint64_t HELPER(iwmmxt_muladdsw)(uint64_t c, uint32_t a, uint32_t b) +{ + c += EXTEND32(EXTEND16S((a >> 0) & 0xffff) * + EXTEND16S((b >> 0) & 0xffff)); + c += EXTEND32(EXTEND16S((a >> 16) & 0xffff) * + EXTEND16S((b >> 16) & 0xffff)); + return c; +} + +uint64_t HELPER(iwmmxt_muladdswl)(uint64_t c, uint32_t a, uint32_t b) +{ + return c + (EXTEND32(EXTEND16S(a & 0xffff) * + EXTEND16S(b & 0xffff))); +} diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c new file mode 100644 index 0000000000..f94e87e728 --- /dev/null +++ b/target/arm/tcg/m_helper.c @@ -0,0 +1,2902 @@ +/* + * ARM generic helpers. + * + * This code is licensed under the GNU GPL v2 or later. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/helper-proto.h" +#include "qemu/main-loop.h" +#include "qemu/bitops.h" +#include "qemu/log.h" +#include "exec/exec-all.h" +#ifdef CONFIG_TCG +#include "exec/cpu_ldst.h" +#include "semihosting/common-semi.h" +#endif +#if !defined(CONFIG_USER_ONLY) +#include "hw/intc/armv7m_nvic.h" +#endif + +static void v7m_msr_xpsr(CPUARMState *env, uint32_t mask, + uint32_t reg, uint32_t val) +{ + /* Only APSR is actually writable */ + if (!(reg & 4)) { + uint32_t apsrmask = 0; + + if (mask & 8) { + apsrmask |= XPSR_NZCV | XPSR_Q; + } + if ((mask & 4) && arm_feature(env, ARM_FEATURE_THUMB_DSP)) { + apsrmask |= XPSR_GE; + } + xpsr_write(env, val, apsrmask); + } +} + +static uint32_t v7m_mrs_xpsr(CPUARMState *env, uint32_t reg, unsigned el) +{ + uint32_t mask = 0; + + if ((reg & 1) && el) { + mask |= XPSR_EXCP; /* IPSR (unpriv. reads as zero) */ + } + if (!(reg & 4)) { + mask |= XPSR_NZCV | XPSR_Q; /* APSR */ + if (arm_feature(env, ARM_FEATURE_THUMB_DSP)) { + mask |= XPSR_GE; + } + } + /* EPSR reads as zero */ + return xpsr_read(env) & mask; +} + +static uint32_t v7m_mrs_control(CPUARMState *env, uint32_t secure) +{ + uint32_t value = env->v7m.control[secure]; + + if (!secure) { + /* SFPA is RAZ/WI from NS; FPCA is stored in the M_REG_S bank */ + value |= env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK; + } + return value; +} + +#ifdef CONFIG_USER_ONLY + +void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) +{ + uint32_t mask = extract32(maskreg, 8, 4); + uint32_t reg = extract32(maskreg, 0, 8); + + switch (reg) { + case 0 ... 7: /* xPSR sub-fields */ + v7m_msr_xpsr(env, mask, reg, val); + break; + case 20: /* CONTROL */ + /* There are no sub-fields that are actually writable from EL0. */ + break; + default: + /* Unprivileged writes to other registers are ignored */ + break; + } +} + +uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) +{ + switch (reg) { + case 0 ... 
7: /* xPSR sub-fields */
+        return v7m_mrs_xpsr(env, reg, 0);
+    case 20: /* CONTROL */
+        return v7m_mrs_control(env, 0);
+    default:
+        /* Unprivileged reads others as zero. */
+        return 0;
+    }
+}
+
+void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest)
+{
+    /* translate.c should never generate calls here in user-only mode */
+    g_assert_not_reached();
+}
+
+void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest)
+{
+    /* translate.c should never generate calls here in user-only mode */
+    g_assert_not_reached();
+}
+
+void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
+{
+    /* translate.c should never generate calls here in user-only mode */
+    g_assert_not_reached();
+}
+
+void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
+{
+    /* translate.c should never generate calls here in user-only mode */
+    g_assert_not_reached();
+}
+
+void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
+{
+    /* translate.c should never generate calls here in user-only mode */
+    g_assert_not_reached();
+}
+
+uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op)
+{
+    /*
+     * The TT instructions can be used by unprivileged code, but in
+     * user-only emulation we don't have the MPU.
+     * Luckily since we know we are NonSecure unprivileged (and that in
+     * turn means that the A flag wasn't specified), all the bits in the
+     * register must be zero:
+     *  IREGION: 0 because IRVALID is 0
+     *  IRVALID: 0 because NS
+     *  S: 0 because NS
+     *  NSRW: 0 because NS
+     *  NSR: 0 because NS
+     *  RW: 0 because unpriv and A flag not set
+     *  R: 0 because unpriv and A flag not set
+     *  SRVALID: 0 because NS
+     *  MRVALID: 0 because unpriv and A flag not set
+     *  SREGION: 0 because SRVALID is 0
+     *  MREGION: 0 because MRVALID is 0
+     */
+    return 0;
+}
+
+ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
+{
+    return ARMMMUIdx_MUser;
+}
+
+#else /* !CONFIG_USER_ONLY */
+
+static ARMMMUIdx arm_v7m_mmu_idx_all(CPUARMState *env,
+                                     bool secstate, bool priv, bool negpri)
+{
+    ARMMMUIdx mmu_idx = ARM_MMU_IDX_M;
+
+    if (priv) {
+        mmu_idx |= ARM_MMU_IDX_M_PRIV;
+    }
+
+    if (negpri) {
+        mmu_idx |= ARM_MMU_IDX_M_NEGPRI;
+    }
+
+    if (secstate) {
+        mmu_idx |= ARM_MMU_IDX_M_S;
+    }
+
+    return mmu_idx;
+}
+
+static ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env,
+                                                       bool secstate, bool priv)
+{
+    bool negpri = armv7m_nvic_neg_prio_requested(env->nvic, secstate);
+
+    return arm_v7m_mmu_idx_all(env, secstate, priv, negpri);
+}
+
+/* Return the MMU index for a v7M CPU in the specified security state */
+ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
+{
+    bool priv = arm_v7m_is_handler_mode(env) ||
+        !(env->v7m.control[secstate] & 1);
+
+    return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv);
+}
+
+/*
+ * What kind of stack write are we doing? This affects how exceptions
+ * generated during the stacking are treated.
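+ * STACK_NORMAL pends any derived exception via
+ * armv7m_nvic_set_pending_derived(); STACK_LAZYFP reports faults using the
+ * lazy-state-preservation status bits (SFSR.LSPERR, CFSR.MLSPERR/LSPERR)
+ * and pends via armv7m_nvic_set_pending_lazyfp(); STACK_IGNFAULTS updates
+ * the fault status registers but pends nothing.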
+ */ +typedef enum StackingMode { + STACK_NORMAL, + STACK_IGNFAULTS, + STACK_LAZYFP, +} StackingMode; + +static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value, + ARMMMUIdx mmu_idx, StackingMode mode) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult txres; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + bool secure = mmu_idx & ARM_MMU_IDX_M_S; + int exc; + bool exc_secure; + + if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &res, &fi)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + if (mode == STACK_LAZYFP) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.LSPERR " + "during lazy stacking\n"); + env->v7m.sfsr |= R_V7M_SFSR_LSPERR_MASK; + } else { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.AUVIOL " + "during stacking\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; + } + env->v7m.sfsr |= R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + exc = ARMV7M_EXCP_SECURE; + exc_secure = false; + } else { + if (mode == STACK_LAZYFP) { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MLSPERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MLSPERR_MASK; + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MSTKERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK; + } + exc = ARMV7M_EXCP_MEM; + exc_secure = secure; + } + goto pend_fault; + } + address_space_stl_le(arm_addressspace(cs, res.f.attrs), res.f.phys_addr, + value, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to write the data */ + if (mode == STACK_LAZYFP) { + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.LSPERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_LSPERR_MASK; + } else { + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK; + } + exc = ARMV7M_EXCP_BUS; + exc_secure = false; + goto pend_fault; + } + return true; + +pend_fault: + /* + * By pending the exception at this point we are making + * the IMPDEF choice "overridden exceptions pended" (see the + * MergeExcInfo() pseudocode). The other choice would be to not + * pend them now and then make a choice about which to throw away + * later if we have two derived exceptions. + * The only case when we must not pend the exception but instead + * throw it away is if we are doing the push of the callee registers + * and we've already generated a derived exception (this is indicated + * by the caller passing STACK_IGNFAULTS). Even in this case we will + * still update the fault status registers. 
+ */ + switch (mode) { + case STACK_NORMAL: + armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure); + break; + case STACK_LAZYFP: + armv7m_nvic_set_pending_lazyfp(env->nvic, exc, exc_secure); + break; + case STACK_IGNFAULTS: + break; + } + return false; +} + +static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr, + ARMMMUIdx mmu_idx) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult txres; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + bool secure = mmu_idx & ARM_MMU_IDX_M_S; + int exc; + bool exc_secure; + uint32_t value; + + if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.AUVIOL during unstack\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + exc = ARMV7M_EXCP_SECURE; + exc_secure = false; + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MUNSTKERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK; + exc = ARMV7M_EXCP_MEM; + exc_secure = secure; + } + goto pend_fault; + } + + value = address_space_ldl(arm_addressspace(cs, res.f.attrs), + res.f.phys_addr, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to read the data */ + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK; + exc = ARMV7M_EXCP_BUS; + exc_secure = false; + goto pend_fault; + } + + *dest = value; + return true; + +pend_fault: + /* + * By pending the exception at this point we are making + * the IMPDEF choice "overridden exceptions pended" (see the + * MergeExcInfo() pseudocode). The other choice would be to not + * pend them now and then make a choice about which to throw away + * later if we have two derived exceptions. + */ + armv7m_nvic_set_pending(env->nvic, exc, exc_secure); + return false; +} + +void HELPER(v7m_preserve_fp_state)(CPUARMState *env) +{ + /* + * Preserve FP state (because LSPACT was set and we are about + * to execute an FP instruction). This corresponds to the + * PreserveFPState() pseudocode. + * We may throw an exception if the stacking fails. + */ + ARMCPU *cpu = env_archcpu(env); + bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + bool negpri = !(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_HFRDY_MASK); + bool is_priv = !(env->v7m.fpccr[is_secure] & R_V7M_FPCCR_USER_MASK); + bool splimviol = env->v7m.fpccr[is_secure] & R_V7M_FPCCR_SPLIMVIOL_MASK; + uint32_t fpcar = env->v7m.fpcar[is_secure]; + bool stacked_ok = true; + bool ts = is_secure && (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); + bool take_exception; + + /* Take the iothread lock as we are going to touch the NVIC */ + qemu_mutex_lock_iothread(); + + /* Check the background context had access to the FPU */ + if (!v7m_cpacr_pass(env, is_secure, is_priv)) { + armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, is_secure); + env->v7m.cfsr[is_secure] |= R_V7M_CFSR_NOCP_MASK; + stacked_ok = false; + } else if (!is_secure && !extract32(env->v7m.nsacr, 10, 1)) { + armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; + stacked_ok = false; + } + + if (!splimviol && stacked_ok) { + /* We only stack if the stack limit wasn't violated */ + int i; + ARMMMUIdx mmu_idx; + + mmu_idx = arm_v7m_mmu_idx_all(env, is_secure, is_priv, negpri); + for (i = 0; i < (ts ? 
32 : 16); i += 2) { + uint64_t dn = *aa32_vfp_dreg(env, i / 2); + uint32_t faddr = fpcar + 4 * i; + uint32_t slo = extract64(dn, 0, 32); + uint32_t shi = extract64(dn, 32, 32); + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR/VPR */ + } + stacked_ok = stacked_ok && + v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) && + v7m_stack_write(cpu, faddr + 4, shi, mmu_idx, STACK_LAZYFP); + } + + stacked_ok = stacked_ok && + v7m_stack_write(cpu, fpcar + 0x40, + vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP); + if (cpu_isar_feature(aa32_mve, cpu)) { + stacked_ok = stacked_ok && + v7m_stack_write(cpu, fpcar + 0x44, + env->v7m.vpr, mmu_idx, STACK_LAZYFP); + } + } + + /* + * We definitely pended an exception, but it's possible that it + * might not be able to be taken now. If its priority permits us + * to take it now, then we must not update the LSPACT or FP regs, + * but instead jump out to take the exception immediately. + * If it's just pending and won't be taken until the current + * handler exits, then we do update LSPACT and the FP regs. + */ + take_exception = !stacked_ok && + armv7m_nvic_can_take_pending_exception(env->nvic); + + qemu_mutex_unlock_iothread(); + + if (take_exception) { + raise_exception_ra(env, EXCP_LAZYFP, 0, 1, GETPC()); + } + + env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; + + if (ts) { + /* Clear s0 to s31 and the FPSCR and VPR */ + int i; + + for (i = 0; i < 32; i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + /* + * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them + * unchanged. + */ +} + +/* + * Write to v7M CONTROL.SPSEL bit for the specified security bank. + * This may change the current stack pointer between Main and Process + * stack pointers if it is done for the CONTROL register for the current + * security state. + */ +static void write_v7m_control_spsel_for_secstate(CPUARMState *env, + bool new_spsel, + bool secstate) +{ + bool old_is_psp = v7m_using_psp(env); + + env->v7m.control[secstate] = + deposit32(env->v7m.control[secstate], + R_V7M_CONTROL_SPSEL_SHIFT, + R_V7M_CONTROL_SPSEL_LENGTH, new_spsel); + + if (secstate == env->v7m.secure) { + bool new_is_psp = v7m_using_psp(env); + uint32_t tmp; + + if (old_is_psp != new_is_psp) { + tmp = env->v7m.other_sp; + env->v7m.other_sp = env->regs[13]; + env->regs[13] = tmp; + } + } +} + +/* + * Write to v7M CONTROL.SPSEL bit. This may change the current + * stack pointer between Main and Process stack pointers. + */ +static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel) +{ + write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure); +} + +void write_v7m_exception(CPUARMState *env, uint32_t new_exc) +{ + /* + * Write a new value to v7m.exception, thus transitioning into or out + * of Handler mode; this may result in a change of active stack pointer. 
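+ * If the Handler/Thread transition changes which stack pointer (MSP or
+ * PSP) is in use, we swap regs[13] with v7m.other_sp so that regs[13]
+ * always holds the currently active SP.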
+ */ + bool new_is_psp, old_is_psp = v7m_using_psp(env); + uint32_t tmp; + + env->v7m.exception = new_exc; + + new_is_psp = v7m_using_psp(env); + + if (old_is_psp != new_is_psp) { + tmp = env->v7m.other_sp; + env->v7m.other_sp = env->regs[13]; + env->regs[13] = tmp; + } +} + +/* Switch M profile security state between NS and S */ +static void switch_v7m_security_state(CPUARMState *env, bool new_secstate) +{ + uint32_t new_ss_msp, new_ss_psp; + + if (env->v7m.secure == new_secstate) { + return; + } + + /* + * All the banked state is accessed by looking at env->v7m.secure + * except for the stack pointer; rearrange the SP appropriately. + */ + new_ss_msp = env->v7m.other_ss_msp; + new_ss_psp = env->v7m.other_ss_psp; + + if (v7m_using_psp(env)) { + env->v7m.other_ss_psp = env->regs[13]; + env->v7m.other_ss_msp = env->v7m.other_sp; + } else { + env->v7m.other_ss_msp = env->regs[13]; + env->v7m.other_ss_psp = env->v7m.other_sp; + } + + env->v7m.secure = new_secstate; + + if (v7m_using_psp(env)) { + env->regs[13] = new_ss_psp; + env->v7m.other_sp = new_ss_msp; + } else { + env->regs[13] = new_ss_msp; + env->v7m.other_sp = new_ss_psp; + } +} + +void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest) +{ + /* + * Handle v7M BXNS: + * - if the return value is a magic value, do exception return (like BX) + * - otherwise bit 0 of the return value is the target security state + */ + uint32_t min_magic; + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + /* Covers FNC_RETURN and EXC_RETURN magic */ + min_magic = FNC_RETURN_MIN_MAGIC; + } else { + /* EXC_RETURN magic only */ + min_magic = EXC_RETURN_MIN_MAGIC; + } + + if (dest >= min_magic) { + /* + * This is an exception return magic value; put it where + * do_v7m_exception_exit() expects and raise EXCEPTION_EXIT. + * Note that if we ever add gen_ss_advance() singlestep support to + * M profile this should count as an "instruction execution complete" + * event (compare gen_bx_excret_final_code()). + */ + env->regs[15] = dest & ~1; + env->thumb = dest & 1; + HELPER(exception_internal)(env, EXCP_EXCEPTION_EXIT); + /* notreached */ + } + + /* translate.c should have made BXNS UNDEF unless we're secure */ + assert(env->v7m.secure); + + if (!(dest & 1)) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + } + switch_v7m_security_state(env, dest & 1); + env->thumb = true; + env->regs[15] = dest & ~1; + arm_rebuild_hflags(env); +} + +void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest) +{ + /* + * Handle v7M BLXNS: + * - bit 0 of the destination address is the target security state + */ + + /* At this point regs[15] is the address just after the BLXNS */ + uint32_t nextinst = env->regs[15] | 1; + uint32_t sp = env->regs[13] - 8; + uint32_t saved_psr; + + /* translate.c will have made BLXNS UNDEF unless we're secure */ + assert(env->v7m.secure); + + if (dest & 1) { + /* + * Target is Secure, so this is just a normal BLX, + * except that the low bit doesn't indicate Thumb/not. 
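+ * We just set LR to the return address (with bit 0 set), leave the
+ * security state unchanged and branch; no stack frame is pushed.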
+ */ + env->regs[14] = nextinst; + env->thumb = true; + env->regs[15] = dest & ~1; + return; + } + + /* Target is non-secure: first push a stack frame */ + if (!QEMU_IS_ALIGNED(sp, 8)) { + qemu_log_mask(LOG_GUEST_ERROR, + "BLXNS with misaligned SP is UNPREDICTABLE\n"); + } + + if (sp < v7m_sp_limit(env)) { + raise_exception(env, EXCP_STKOF, 0, 1); + } + + saved_psr = env->v7m.exception; + if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK) { + saved_psr |= XPSR_SFPA; + } + + /* Note that these stores can throw exceptions on MPU faults */ + cpu_stl_data_ra(env, sp, nextinst, GETPC()); + cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC()); + + env->regs[13] = sp; + env->regs[14] = 0xfeffffff; + if (arm_v7m_is_handler_mode(env)) { + /* + * Write a dummy value to IPSR, to avoid leaking the current secure + * exception number to non-secure code. This is guaranteed not + * to cause write_v7m_exception() to actually change stacks. + */ + write_v7m_exception(env, 1); + } + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + switch_v7m_security_state(env, 0); + env->thumb = true; + env->regs[15] = dest; + arm_rebuild_hflags(env); +} + +static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode, + bool spsel) +{ + /* + * Return a pointer to the location where we currently store the + * stack pointer for the requested security state and thread mode. + * This pointer will become invalid if the CPU state is updated + * such that the stack pointers are switched around (eg changing + * the SPSEL control bit). + * Compare the v8M ARM ARM pseudocode LookUpSP_with_security_mode(). + * Unlike that pseudocode, we require the caller to pass us in the + * SPSEL control bit value; this is because we also use this + * function in handling of pushing of the callee-saves registers + * part of the v8M stack frame (pseudocode PushCalleeStack()), + * and in the tailchain codepath the SPSEL bit comes from the exception + * return magic LR value from the previous exception. The pseudocode + * opencodes the stack-selection in PushCalleeStack(), but we prefer + * to make this utility function generic enough to do the job. + */ + bool want_psp = threadmode && spsel; + + if (secure == env->v7m.secure) { + if (want_psp == v7m_using_psp(env)) { + return &env->regs[13]; + } else { + return &env->v7m.other_sp; + } + } else { + if (want_psp) { + return &env->v7m.other_ss_psp; + } else { + return &env->v7m.other_ss_msp; + } + } +} + +static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure, + uint32_t *pvec) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult result; + uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4; + uint32_t vector_entry; + MemTxAttrs attrs = {}; + ARMMMUIdx mmu_idx; + bool exc_secure; + + qemu_log_mask(CPU_LOG_INT, + "...loading from element %d of %s vector table at 0x%x\n", + exc, targets_secure ? "secure" : "non-secure", addr); + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true); + + /* + * We don't do a get_phys_addr() here because the rules for vector + * loads are special: they always use the default memory map, and + * the default memory map permits reads from all addresses. + * Since there's no easy way to pass through to pmsav8_mpu_lookup() + * that we want this special case which would always say "yes", + * we just do the SAU lookup here followed by a direct physical load. 
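+ * Any failure (the SAU rejecting a NonSecure access to Secure memory, or
+ * a transaction error on the load itself) branches to load_fail, which
+ * pends the escalated HardFault.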
+ */ + attrs.secure = targets_secure; + attrs.user = false; + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + V8M_SAttributes sattrs = {}; + + v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, + targets_secure, &sattrs); + if (sattrs.ns) { + attrs.secure = false; + } else if (!targets_secure) { + /* + * NS access to S memory: the underlying exception which we escalate + * to HardFault is SecureFault, which always targets Secure. + */ + exc_secure = true; + goto load_fail; + } + } + + vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr, + attrs, &result); + if (result != MEMTX_OK) { + /* + * Underlying exception is BusFault: its target security state + * depends on BFHFNMINS. + */ + exc_secure = !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK); + goto load_fail; + } + *pvec = vector_entry; + qemu_log_mask(CPU_LOG_INT, "...loaded new PC 0x%x\n", *pvec); + return true; + +load_fail: + /* + * All vector table fetch fails are reported as HardFault, with + * HFSR.VECTTBL and .FORCED set. (FORCED is set because + * technically the underlying exception is a SecureFault or BusFault + * that is escalated to HardFault.) This is a terminal exception, + * so we will either take the HardFault immediately or else enter + * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()). + * The HardFault is Secure if BFHFNMINS is 0 (meaning that all HFs are + * secure); otherwise it targets the same security state as the + * underlying exception. + * In v8.1M HardFaults from vector table fetch fails don't set FORCED. + */ + if (!(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK)) { + exc_secure = true; + } + env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK; + if (!arm_feature(env, ARM_FEATURE_V8_1M)) { + env->v7m.hfsr |= R_V7M_HFSR_FORCED_MASK; + } + armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure); + return false; +} + +static uint32_t v7m_integrity_sig(CPUARMState *env, uint32_t lr) +{ + /* + * Return the integrity signature value for the callee-saves + * stack frame section. @lr is the exception return payload/LR value + * whose FType bit forms bit 0 of the signature if FP is present. + */ + uint32_t sig = 0xfefa125a; + + if (!cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) + || (lr & R_V7M_EXCRET_FTYPE_MASK)) { + sig |= 1; + } + return sig; +} + +static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain, + bool ignore_faults) +{ + /* + * For v8M, push the callee-saves register part of the stack frame. + * Compare the v8M pseudocode PushCalleeStack(). + * In the tailchaining case this may not be the current stack. + */ + CPUARMState *env = &cpu->env; + uint32_t *frame_sp_p; + uint32_t frameptr; + ARMMMUIdx mmu_idx; + bool stacked_ok; + uint32_t limit; + bool want_psp; + uint32_t sig; + StackingMode smode = ignore_faults ? 
STACK_IGNFAULTS : STACK_NORMAL; + + if (dotailchain) { + bool mode = lr & R_V7M_EXCRET_MODE_MASK; + bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) || + !mode; + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv); + frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode, + lr & R_V7M_EXCRET_SPSEL_MASK); + want_psp = mode && (lr & R_V7M_EXCRET_SPSEL_MASK); + if (want_psp) { + limit = env->v7m.psplim[M_REG_S]; + } else { + limit = env->v7m.msplim[M_REG_S]; + } + } else { + mmu_idx = arm_mmu_idx(env); + frame_sp_p = &env->regs[13]; + limit = v7m_sp_limit(env); + } + + frameptr = *frame_sp_p - 0x28; + if (frameptr < limit) { + /* + * Stack limit failure: set SP to the limit value, and generate + * STKOF UsageFault. Stack pushes below the limit must not be + * performed. It is IMPDEF whether pushes above the limit are + * performed; we choose not to. + */ + qemu_log_mask(CPU_LOG_INT, + "...STKOF during callee-saves register stacking\n"); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + *frame_sp_p = limit; + return true; + } + + /* + * Write as much of the stack frame as we can. A write failure may + * cause us to pend a derived exception. + */ + sig = v7m_integrity_sig(env, lr); + stacked_ok = + v7m_stack_write(cpu, frameptr, sig, mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, smode) && + v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, smode); + + /* Update SP regardless of whether any of the stack accesses failed. */ + *frame_sp_p = frameptr; + + return !stacked_ok; +} + +static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain, + bool ignore_stackfaults) +{ + /* + * Do the "take the exception" parts of exception entry, + * but not the pushing of state to the stack. This is + * similar to the pseudocode ExceptionTaken() function. + */ + CPUARMState *env = &cpu->env; + uint32_t addr; + bool targets_secure; + int exc; + bool push_failed = false; + + armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure); + qemu_log_mask(CPU_LOG_INT, "...taking pending %s exception %d\n", + targets_secure ? "secure" : "nonsecure", exc); + + if (dotailchain) { + /* Sanitize LR FType and PREFIX bits */ + if (!cpu_isar_feature(aa32_vfp_simd, cpu)) { + lr |= R_V7M_EXCRET_FTYPE_MASK; + } + lr = deposit32(lr, 24, 8, 0xff); + } + + if (arm_feature(env, ARM_FEATURE_V8)) { + if (arm_feature(env, ARM_FEATURE_M_SECURITY) && + (lr & R_V7M_EXCRET_S_MASK)) { + /* + * The background code (the owner of the registers in the + * exception frame) is Secure. This means it may either already + * have or now needs to push callee-saves registers. + */ + if (targets_secure) { + if (dotailchain && !(lr & R_V7M_EXCRET_ES_MASK)) { + /* + * We took an exception from Secure to NonSecure + * (which means the callee-saved registers got stacked) + * and are now tailchaining to a Secure exception. + * Clear DCRS so eventual return from this Secure + * exception unstacks the callee-saved registers. 
+ */ + lr &= ~R_V7M_EXCRET_DCRS_MASK; + } + } else { + /* + * We're going to a non-secure exception; push the + * callee-saves registers to the stack now, if they're + * not already saved. + */ + if (lr & R_V7M_EXCRET_DCRS_MASK && + !(dotailchain && !(lr & R_V7M_EXCRET_ES_MASK))) { + push_failed = v7m_push_callee_stack(cpu, lr, dotailchain, + ignore_stackfaults); + } + lr |= R_V7M_EXCRET_DCRS_MASK; + } + } + + lr &= ~R_V7M_EXCRET_ES_MASK; + if (targets_secure) { + lr |= R_V7M_EXCRET_ES_MASK; + } + lr &= ~R_V7M_EXCRET_SPSEL_MASK; + if (env->v7m.control[targets_secure] & R_V7M_CONTROL_SPSEL_MASK) { + lr |= R_V7M_EXCRET_SPSEL_MASK; + } + + /* + * Clear registers if necessary to prevent non-secure exception + * code being able to see register values from secure code. + * Where register values become architecturally UNKNOWN we leave + * them with their previous values. v8.1M is tighter than v8.0M + * here and always zeroes the caller-saved registers regardless + * of the security state the exception is targeting. + */ + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + if (!targets_secure || arm_feature(env, ARM_FEATURE_V8_1M)) { + /* + * Always clear the caller-saved registers (they have been + * pushed to the stack earlier in v7m_push_stack()). + * Clear callee-saved registers if the background code is + * Secure (in which case these regs were saved in + * v7m_push_callee_stack()). + */ + int i; + /* + * r4..r11 are callee-saves, zero only if background + * state was Secure (EXCRET.S == 1) and exception + * targets Non-secure state + */ + bool zero_callee_saves = !targets_secure && + (lr & R_V7M_EXCRET_S_MASK); + + for (i = 0; i < 13; i++) { + if (i < 4 || i > 11 || zero_callee_saves) { + env->regs[i] = 0; + } + } + /* Clear EAPSR */ + xpsr_write(env, 0, XPSR_NZCV | XPSR_Q | XPSR_GE | XPSR_IT); + } + } + } + + if (push_failed && !ignore_stackfaults) { + /* + * Derived exception on callee-saves register stacking: + * we might now want to take a different exception which + * targets a different security state, so try again from the top. + */ + qemu_log_mask(CPU_LOG_INT, + "...derived exception on callee-saves register stacking"); + v7m_exception_taken(cpu, lr, true, true); + return; + } + + if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) { + /* Vector load failed: derived exception */ + qemu_log_mask(CPU_LOG_INT, "...derived exception on vector table load"); + v7m_exception_taken(cpu, lr, true, true); + return; + } + + /* + * Now we've done everything that might cause a derived exception + * we can go ahead and activate whichever exception we're going to + * take (which might now be the derived exception). + */ + armv7m_nvic_acknowledge_irq(env->nvic); + + /* Switch to target security state -- must do this before writing SPSEL */ + switch_v7m_security_state(env, targets_secure); + write_v7m_control_spsel(env, 0); + arm_clear_exclusive(env); + /* Clear SFPA and FPCA (has no effect if no FPU) */ + env->v7m.control[M_REG_S] &= + ~(R_V7M_CONTROL_FPCA_MASK | R_V7M_CONTROL_SFPA_MASK); + /* Clear IT bits */ + env->condexec_bits = 0; + env->regs[14] = lr; + env->regs[15] = addr & 0xfffffffe; + env->thumb = addr & 1; + arm_rebuild_hflags(env); +} + +static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr, + bool apply_splim) +{ + /* + * Like the pseudocode UpdateFPCCR: save state in FPCAR and FPCCR + * that we will need later in order to do lazy FP reg stacking. 
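+ * FPCAR records the (8-byte-aligned) frame address, LSPACT is set, and
+ * the various *RDY bits snapshot armv7m_nvic_get_ready_status() for the
+ * exceptions that might need to be taken when the lazy stacking is
+ * eventually performed.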
+ */ + bool is_secure = env->v7m.secure; + NVICState *nvic = env->nvic; + /* + * Some bits are unbanked and live always in fpccr[M_REG_S]; some bits + * are banked and we want to update the bit in the bank for the + * current security state; and in one case we want to specifically + * update the NS banked version of a bit even if we are secure. + */ + uint32_t *fpccr_s = &env->v7m.fpccr[M_REG_S]; + uint32_t *fpccr_ns = &env->v7m.fpccr[M_REG_NS]; + uint32_t *fpccr = &env->v7m.fpccr[is_secure]; + bool hfrdy, bfrdy, mmrdy, ns_ufrdy, s_ufrdy, sfrdy, monrdy; + + env->v7m.fpcar[is_secure] = frameptr & ~0x7; + + if (apply_splim && arm_feature(env, ARM_FEATURE_V8)) { + bool splimviol; + uint32_t splim = v7m_sp_limit(env); + bool ign = armv7m_nvic_neg_prio_requested(nvic, is_secure) && + (env->v7m.ccr[is_secure] & R_V7M_CCR_STKOFHFNMIGN_MASK); + + splimviol = !ign && frameptr < splim; + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, SPLIMVIOL, splimviol); + } + + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, LSPACT, 1); + + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, S, is_secure); + + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, USER, arm_current_el(env) == 0); + + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, THREAD, + !arm_v7m_is_handler_mode(env)); + + hfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_HARD, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, HFRDY, hfrdy); + + bfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_BUS, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, BFRDY, bfrdy); + + mmrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_MEM, is_secure); + *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, MMRDY, mmrdy); + + ns_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, false); + *fpccr_ns = FIELD_DP32(*fpccr_ns, V7M_FPCCR, UFRDY, ns_ufrdy); + + monrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_DEBUG, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, MONRDY, monrdy); + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + s_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, true); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, UFRDY, s_ufrdy); + + sfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_SECURE, false); + *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, SFRDY, sfrdy); + } +} + +void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr) +{ + /* fptr is the value of Rn, the frame pointer we store the FP regs to */ + ARMCPU *cpu = env_archcpu(env); + bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK; + uintptr_t ra = GETPC(); + + assert(env->v7m.secure); + + if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { + return; + } + + /* Check access to the coprocessor is permitted */ + if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { + raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); + } + + if (lspact) { + /* LSPACT should not be active when there is active FP state */ + raise_exception_ra(env, EXCP_LSERR, 0, 1, GETPC()); + } + + if (fptr & 7) { + raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); + } + + /* + * Note that we do not use v7m_stack_write() here, because the + * accesses should not set the FSR bits for stacking errors if they + * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK + * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions + * and longjmp out. + */ + if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { + bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; + int i; + + for (i = 0; i < (ts ? 
32 : 16); i += 2) { + uint64_t dn = *aa32_vfp_dreg(env, i / 2); + uint32_t faddr = fptr + 4 * i; + uint32_t slo = extract64(dn, 0, 32); + uint32_t shi = extract64(dn, 32, 32); + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR */ + } + cpu_stl_data_ra(env, faddr, slo, ra); + cpu_stl_data_ra(env, faddr + 4, shi, ra); + } + cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra); + if (cpu_isar_feature(aa32_mve, cpu)) { + cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra); + } + + /* + * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to + * leave them unchanged, matching our choice in v7m_preserve_fp_state. + */ + if (ts) { + for (i = 0; i < 32; i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } else { + v7m_update_fpccr(env, fptr, false); + } + + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; +} + +void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr) +{ + ARMCPU *cpu = env_archcpu(env); + uintptr_t ra = GETPC(); + + /* fptr is the value of Rn, the frame pointer we load the FP regs from */ + assert(env->v7m.secure); + + if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { + return; + } + + /* Check access to the coprocessor is permitted */ + if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) { + raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC()); + } + + if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { + /* State in FP is still valid */ + env->v7m.fpccr[M_REG_S] &= ~R_V7M_FPCCR_LSPACT_MASK; + } else { + bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK; + int i; + uint32_t fpscr; + + if (fptr & 7) { + raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC()); + } + + for (i = 0; i < (ts ? 32 : 16); i += 2) { + uint32_t slo, shi; + uint64_t dn; + uint32_t faddr = fptr + 4 * i; + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR and VPR */ + } + + slo = cpu_ldl_data_ra(env, faddr, ra); + shi = cpu_ldl_data_ra(env, faddr + 4, ra); + + dn = (uint64_t) shi << 32 | slo; + *aa32_vfp_dreg(env, i / 2) = dn; + } + fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra); + vfp_set_fpscr(env, fpscr); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra); + } + } + + env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK; +} + +static bool v7m_push_stack(ARMCPU *cpu) +{ + /* + * Do the "set up stack frame" part of exception entry, + * similar to pseudocode PushStack(). + * Return true if we generate a derived exception (and so + * should ignore further stack faults trying to process + * that derived exception.) 
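+ * The frame is 0x20 bytes for an integer-only frame, 0x68 bytes when the
+ * FP caller-saved state is stacked as well, and 0xa8 bytes when FPCCR.TS
+ * additionally requires S16-S31 to be stacked.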
+ */ + bool stacked_ok = true, limitviol = false; + CPUARMState *env = &cpu->env; + uint32_t xpsr = xpsr_read(env); + uint32_t frameptr = env->regs[13]; + ARMMMUIdx mmu_idx = arm_mmu_idx(env); + uint32_t framesize; + bool nsacr_cp10 = extract32(env->v7m.nsacr, 10, 1); + + if ((env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) && + (env->v7m.secure || nsacr_cp10)) { + if (env->v7m.secure && + env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK) { + framesize = 0xa8; + } else { + framesize = 0x68; + } + } else { + framesize = 0x20; + } + + /* Align stack pointer if the guest wants that */ + if ((frameptr & 4) && + (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) { + frameptr -= 4; + xpsr |= XPSR_SPREALIGN; + } + + xpsr &= ~XPSR_SFPA; + if (env->v7m.secure && + (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) { + xpsr |= XPSR_SFPA; + } + + frameptr -= framesize; + + if (arm_feature(env, ARM_FEATURE_V8)) { + uint32_t limit = v7m_sp_limit(env); + + if (frameptr < limit) { + /* + * Stack limit failure: set SP to the limit value, and generate + * STKOF UsageFault. Stack pushes below the limit must not be + * performed. It is IMPDEF whether pushes above the limit are + * performed; we choose not to. + */ + qemu_log_mask(CPU_LOG_INT, + "...STKOF during stacking\n"); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + env->regs[13] = limit; + /* + * We won't try to perform any further memory accesses but + * we must continue through the following code to check for + * permission faults during FPU state preservation, and we + * must update FPCCR if lazy stacking is enabled. + */ + limitviol = true; + stacked_ok = false; + } + } + + /* + * Write as much of the stack frame as we can. If we fail a stack + * write this will result in a derived exception being pended + * (which may be taken in preference to the one we started with + * if it has higher priority). 
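+ * The integer part of the frame is laid out as: R0 at +0x0, R1 +0x4,
+ * R2 +0x8, R3 +0xc, R12 +0x10, LR +0x14, return address +0x18 and
+ * xPSR +0x1c, with any FP state following from +0x20 (FPSCR at +0x60,
+ * VPR at +0x64 when MVE is present).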
+ */ + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 4, env->regs[1], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 8, env->regs[2], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 12, env->regs[3], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 16, env->regs[12], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 20, env->regs[14], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 24, env->regs[15], + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, STACK_NORMAL); + + if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) { + /* FPU is active, try to save its registers */ + bool fpccr_s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK; + bool lspact = env->v7m.fpccr[fpccr_s] & R_V7M_FPCCR_LSPACT_MASK; + + if (lspact && arm_feature(env, ARM_FEATURE_M_SECURITY)) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault because LSPACT and FPCA both set\n"); + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + } else if (!env->v7m.secure && !nsacr_cp10) { + qemu_log_mask(CPU_LOG_INT, + "...Secure UsageFault with CFSR.NOCP because " + "NSACR.CP10 prevents stacking FP regs\n"); + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; + } else { + if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) { + /* Lazy stacking disabled, save registers now */ + int i; + bool cpacr_pass = v7m_cpacr_pass(env, env->v7m.secure, + arm_current_el(env) != 0); + + if (stacked_ok && !cpacr_pass) { + /* + * Take UsageFault if CPACR forbids access. The pseudocode + * here does a full CheckCPEnabled() but we know the NSACR + * check can never fail as we have already handled that. + */ + qemu_log_mask(CPU_LOG_INT, + "...UsageFault with CFSR.NOCP because " + "CPACR.CP10 prevents stacking FP regs\n"); + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_NOCP_MASK; + stacked_ok = false; + } + + for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { + uint64_t dn = *aa32_vfp_dreg(env, i / 2); + uint32_t faddr = frameptr + 0x20 + 4 * i; + uint32_t slo = extract64(dn, 0, 32); + uint32_t shi = extract64(dn, 32, 32); + + if (i >= 16) { + faddr += 8; /* skip the slot for the FPSCR and VPR */ + } + stacked_ok = stacked_ok && + v7m_stack_write(cpu, faddr, slo, + mmu_idx, STACK_NORMAL) && + v7m_stack_write(cpu, faddr + 4, shi, + mmu_idx, STACK_NORMAL); + } + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr + 0x60, + vfp_get_fpscr(env), mmu_idx, STACK_NORMAL); + if (cpu_isar_feature(aa32_mve, cpu)) { + stacked_ok = stacked_ok && + v7m_stack_write(cpu, frameptr + 0x64, + env->v7m.vpr, mmu_idx, STACK_NORMAL); + } + if (cpacr_pass) { + for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } else { + /* Lazy stacking enabled, save necessary info to stack later */ + v7m_update_fpccr(env, frameptr + 0x20, true); + } + } + } + + /* + * If we broke a stack limit then SP was already updated earlier; + * otherwise we update SP regardless of whether any of the stack + * accesses failed or we took some other kind of fault. 
+ */
+    if (!limitviol) {
+        env->regs[13] = frameptr;
+    }
+
+    return !stacked_ok;
+}
+
+static void do_v7m_exception_exit(ARMCPU *cpu)
+{
+    CPUARMState *env = &cpu->env;
+    uint32_t excret;
+    uint32_t xpsr, xpsr_mask;
+    bool ufault = false;
+    bool sfault = false;
+    bool return_to_sp_process;
+    bool return_to_handler;
+    bool rettobase = false;
+    bool exc_secure = false;
+    bool return_to_secure;
+    bool ftype;
+    bool restore_s16_s31 = false;
+
+    /*
+     * If we're not in Handler mode then jumps to magic exception-exit
+     * addresses don't have magic behaviour. However for the v8M
+     * security extensions the magic secure-function-return has to
+     * work in thread mode too, so to avoid doing an extra check in
+     * the generated code we allow exception-exit magic to also cause the
+     * internal exception and bring us here in thread mode. Correct code
+     * will never try to do this (the following insn fetch will always
+     * fault) so the overhead of having taken an unnecessary exception
+     * doesn't matter.
+     */
+    if (!arm_v7m_is_handler_mode(env)) {
+        return;
+    }
+
+    /*
+     * In the spec pseudocode ExceptionReturn() is called directly
+     * from BXWritePC() and gets the full target PC value including
+     * bit zero. In QEMU's implementation we treat it as a normal
+     * jump-to-register (which is then caught later on), and so split
+     * the target value up between env->regs[15] and env->thumb in
+     * gen_bx(). Reconstitute it.
+     */
+    excret = env->regs[15];
+    if (env->thumb) {
+        excret |= 1;
+    }
+
+    qemu_log_mask(CPU_LOG_INT, "Exception return: magic PC %" PRIx32
+                  " previous exception %d\n",
+                  excret, env->v7m.exception);
+
+    if ((excret & R_V7M_EXCRET_RES1_MASK) != R_V7M_EXCRET_RES1_MASK) {
+        qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero high bits in exception "
+                      "exit PC value 0x%" PRIx32 " are UNPREDICTABLE\n",
+                      excret);
+    }
+
+    ftype = excret & R_V7M_EXCRET_FTYPE_MASK;
+
+    if (!ftype && !cpu_isar_feature(aa32_vfp_simd, cpu)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero FTYPE in exception "
+                      "exit PC value 0x%" PRIx32 " is UNPREDICTABLE "
+                      "if FPU not present\n",
+                      excret);
+        ftype = true;
+    }
+
+    if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+        /*
+         * EXC_RETURN.ES validation check (R_SMFL). We must do this before
+         * we pick which FAULTMASK to clear.
+         */
+        if (!env->v7m.secure &&
+            ((excret & R_V7M_EXCRET_ES_MASK) ||
+             !(excret & R_V7M_EXCRET_DCRS_MASK))) {
+            sfault = 1;
+            /* For all other purposes, treat ES as 0 (R_HXSR) */
+            excret &= ~R_V7M_EXCRET_ES_MASK;
+        }
+        exc_secure = excret & R_V7M_EXCRET_ES_MASK;
+    }
+
+    if (env->v7m.exception != ARMV7M_EXCP_NMI) {
+        /*
+         * Auto-clear FAULTMASK on return from other than NMI.
+         * If the security extension is implemented then this only
+         * happens if the raw execution priority is >= 0; the
+         * value of the ES bit in the exception return value indicates
+         * which security state's faultmask to clear. (v8M ARM ARM R_KBNF.)
+         */
+        if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+            if (armv7m_nvic_raw_execution_priority(env->nvic) >= 0) {
+                env->v7m.faultmask[exc_secure] = 0;
+            }
+        } else {
+            env->v7m.faultmask[M_REG_NS] = 0;
+        }
+    }
+
+    switch (armv7m_nvic_complete_irq(env->nvic, env->v7m.exception,
+                                     exc_secure)) {
+    case -1:
+        /* attempt to exit an exception that isn't active */
+        ufault = true;
+        break;
+    case 0:
+        /* still an irq active now */
+        break;
+    case 1:
+        /*
+         * We returned to base exception level, no nesting.
+         * (In the pseudocode this is written using "NestedActivation != 1"
+         * where we have 'rettobase == false'.)
+ */ + rettobase = true; + break; + default: + g_assert_not_reached(); + } + + return_to_handler = !(excret & R_V7M_EXCRET_MODE_MASK); + return_to_sp_process = excret & R_V7M_EXCRET_SPSEL_MASK; + return_to_secure = arm_feature(env, ARM_FEATURE_M_SECURITY) && + (excret & R_V7M_EXCRET_S_MASK); + + if (arm_feature(env, ARM_FEATURE_V8)) { + if (!arm_feature(env, ARM_FEATURE_M_SECURITY)) { + /* + * UNPREDICTABLE if S == 1 or DCRS == 0 or ES == 1 (R_XLCP); + * we choose to take the UsageFault. + */ + if ((excret & R_V7M_EXCRET_S_MASK) || + (excret & R_V7M_EXCRET_ES_MASK) || + !(excret & R_V7M_EXCRET_DCRS_MASK)) { + ufault = true; + } + } + if (excret & R_V7M_EXCRET_RES0_MASK) { + ufault = true; + } + } else { + /* For v7M we only recognize certain combinations of the low bits */ + switch (excret & 0xf) { + case 1: /* Return to Handler */ + break; + case 13: /* Return to Thread using Process stack */ + case 9: /* Return to Thread using Main stack */ + /* + * We only need to check NONBASETHRDENA for v7M, because in + * v8M this bit does not exist (it is RES1). + */ + if (!rettobase && + !(env->v7m.ccr[env->v7m.secure] & + R_V7M_CCR_NONBASETHRDENA_MASK)) { + ufault = true; + } + break; + default: + ufault = true; + } + } + + /* + * Set CONTROL.SPSEL from excret.SPSEL. Since we're still in + * Handler mode (and will be until we write the new XPSR.Interrupt + * field) this does not switch around the current stack pointer. + * We must do this before we do any kind of tailchaining, including + * for the derived exceptions on integrity check failures, or we will + * give the guest an incorrect EXCRET.SPSEL value on exception entry. + */ + write_v7m_control_spsel_for_secstate(env, return_to_sp_process, exc_secure); + + /* + * Clear scratch FP values left in caller saved registers; this + * must happen before any kind of tail chaining. 
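+ * This implements FPCCR.CLRONRET: if lazy state preservation is still
+ * active we take a SecureFault (SFSR.LSERR); otherwise, after the v8.1M
+ * NSACR/CPACR NOCP checks, s0..s15, FPSCR and (for MVE) VPR are zeroed.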
+ */ + if ((env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_CLRONRET_MASK) && + (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { + if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) { + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " + "stackframe: error during lazy state deactivation\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } else { + if (arm_feature(env, ARM_FEATURE_V8_1M)) { + /* v8.1M adds this NOCP check */ + bool nsacr_pass = exc_secure || + extract32(env->v7m.nsacr, 10, 1); + bool cpacr_pass = v7m_cpacr_pass(env, exc_secure, true); + if (!nsacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK; + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: NSACR prevents clearing FPU registers\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } else if (!cpacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + exc_secure); + env->v7m.cfsr[exc_secure] |= R_V7M_CFSR_NOCP_MASK; + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: CPACR prevents clearing FPU registers\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + } + /* Clear s0..s15, FPSCR and VPR */ + int i; + + for (i = 0; i < 16; i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } + + if (sfault) { + env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " + "stackframe: failed EXC_RETURN.ES validity check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + if (ufault) { + /* + * Bad exception return: instead of popping the exception + * stack, directly take a usage fault on the current stack. + */ + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: failed exception return integrity check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + /* + * Tailchaining: if there is currently a pending exception that + * is high enough priority to preempt execution at the level we're + * about to return to, then just directly take that exception now, + * avoiding an unstack-and-then-stack. Note that now we have + * deactivated the previous exception by calling armv7m_nvic_complete_irq() + * our current execution priority is already the execution priority we are + * returning to -- none of the state we would unstack or set based on + * the EXCRET value affects it. + */ + if (armv7m_nvic_can_take_pending_exception(env->nvic)) { + qemu_log_mask(CPU_LOG_INT, "...tailchaining to pending exception\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + switch_v7m_security_state(env, return_to_secure); + + { + /* + * The stack pointer we should be reading the exception frame from + * depends on bits in the magic exception return type value (and + * for v8M isn't necessarily the stack pointer we will eventually + * end up resuming execution with). Get a pointer to the location + * in the CPU state struct where the SP we need is currently being + * stored; we will use and modify it in place. 
+ * We use this limited C variable scope so we don't accidentally + * use 'frame_sp_p' after we do something that makes it invalid. + */ + bool spsel = env->v7m.control[return_to_secure] & R_V7M_CONTROL_SPSEL_MASK; + uint32_t *frame_sp_p = get_v7m_sp_ptr(env, + return_to_secure, + !return_to_handler, + spsel); + uint32_t frameptr = *frame_sp_p; + bool pop_ok = true; + ARMMMUIdx mmu_idx; + bool return_to_priv = return_to_handler || + !(env->v7m.control[return_to_secure] & R_V7M_CONTROL_NPRIV_MASK); + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure, + return_to_priv); + + if (!QEMU_IS_ALIGNED(frameptr, 8) && + arm_feature(env, ARM_FEATURE_V8)) { + qemu_log_mask(LOG_GUEST_ERROR, + "M profile exception return with non-8-aligned SP " + "for destination state is UNPREDICTABLE\n"); + } + + /* Do we need to pop callee-saved registers? */ + if (return_to_secure && + ((excret & R_V7M_EXCRET_ES_MASK) == 0 || + (excret & R_V7M_EXCRET_DCRS_MASK) == 0)) { + uint32_t actual_sig; + + pop_ok = v7m_stack_read(cpu, &actual_sig, frameptr, mmu_idx); + + if (pop_ok && v7m_integrity_sig(env, excret) != actual_sig) { + /* Take a SecureFault on the current stack */ + env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " + "stackframe: failed exception return integrity " + "signature check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) && + v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) && + v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) && + v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) && + v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) && + v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) && + v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx); + + frameptr += 0x28; + } + + /* Pop registers */ + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) && + v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) && + v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) && + v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) && + v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) && + v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) && + v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx); + + if (!pop_ok) { + /* + * v7m_stack_read() pended a fault, so take it (as a tail + * chained exception on the same stack frame) + */ + qemu_log_mask(CPU_LOG_INT, "...derived exception on unstacking\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + /* + * Returning from an exception with a PC with bit 0 set is defined + * behaviour on v8M (bit 0 is ignored), but for v7M it was specified + * to be UNPREDICTABLE. In practice actual v7M hardware seems to ignore + * the lsbit, and there are several RTOSes out there which incorrectly + * assume the r15 in the stack frame should be a Thumb-style "lsbit + * indicates ARM/Thumb" value, so ignore the bit on v7M as well, but + * complain about the badly behaved guest. 
+ */ + if (env->regs[15] & 1) { + env->regs[15] &= ~1U; + if (!arm_feature(env, ARM_FEATURE_V8)) { + qemu_log_mask(LOG_GUEST_ERROR, + "M profile return from interrupt with misaligned " + "PC is UNPREDICTABLE on v7M\n"); + } + } + + if (arm_feature(env, ARM_FEATURE_V8)) { + /* + * For v8M we have to check whether the xPSR exception field + * matches the EXCRET value for return to handler/thread + * before we commit to changing the SP and xPSR. + */ + bool will_be_handler = (xpsr & XPSR_EXCP) != 0; + if (return_to_handler != will_be_handler) { + /* + * Take an INVPC UsageFault on the current stack. + * By this point we will have switched to the security state + * for the background state, so this UsageFault will target + * that state. + */ + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " + "stackframe: failed exception return integrity " + "check\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + } + + if (!ftype) { + /* FP present and we need to handle it */ + if (!return_to_secure && + (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK)) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + qemu_log_mask(CPU_LOG_INT, + "...taking SecureFault on existing stackframe: " + "Secure LSPACT set but exception return is " + "not to secure state\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + restore_s16_s31 = return_to_secure && + (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK); + + if (env->v7m.fpccr[return_to_secure] & R_V7M_FPCCR_LSPACT_MASK) { + /* State in FPU is still valid, just clear LSPACT */ + env->v7m.fpccr[return_to_secure] &= ~R_V7M_FPCCR_LSPACT_MASK; + } else { + int i; + uint32_t fpscr; + bool cpacr_pass, nsacr_pass; + + cpacr_pass = v7m_cpacr_pass(env, return_to_secure, + return_to_priv); + nsacr_pass = return_to_secure || + extract32(env->v7m.nsacr, 10, 1); + + if (!cpacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + return_to_secure); + env->v7m.cfsr[return_to_secure] |= R_V7M_CFSR_NOCP_MASK; + qemu_log_mask(CPU_LOG_INT, + "...taking UsageFault on existing " + "stackframe: CPACR.CP10 prevents unstacking " + "FP regs\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } else if (!nsacr_pass) { + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_INVPC_MASK; + qemu_log_mask(CPU_LOG_INT, + "...taking Secure UsageFault on existing " + "stackframe: NSACR.CP10 prevents unstacking " + "FP regs\n"); + v7m_exception_taken(cpu, excret, true, false); + return; + } + + for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { + uint32_t slo, shi; + uint64_t dn; + uint32_t faddr = frameptr + 0x20 + 4 * i; + + if (i >= 16) { + faddr += 8; /* Skip the slot for the FPSCR and VPR */ + } + + pop_ok = pop_ok && + v7m_stack_read(cpu, &slo, faddr, mmu_idx) && + v7m_stack_read(cpu, &shi, faddr + 4, mmu_idx); + + if (!pop_ok) { + break; + } + + dn = (uint64_t)shi << 32 | slo; + *aa32_vfp_dreg(env, i / 2) = dn; + } + pop_ok = pop_ok && + v7m_stack_read(cpu, &fpscr, frameptr + 0x60, mmu_idx); + if (pop_ok) { + vfp_set_fpscr(env, fpscr); + } + if (cpu_isar_feature(aa32_mve, cpu)) { + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->v7m.vpr, + frameptr + 0x64, mmu_idx); + } + if (!pop_ok) { + /* + * These regs are 0 if security extension present; + * otherwise merely UNKNOWN. 
We zero always. + */ + for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) { + *aa32_vfp_dreg(env, i / 2) = 0; + } + vfp_set_fpscr(env, 0); + if (cpu_isar_feature(aa32_mve, cpu)) { + env->v7m.vpr = 0; + } + } + } + } + env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], + V7M_CONTROL, FPCA, !ftype); + + /* Commit to consuming the stack frame */ + frameptr += 0x20; + if (!ftype) { + frameptr += 0x48; + if (restore_s16_s31) { + frameptr += 0x40; + } + } + /* + * Undo stack alignment (the SPREALIGN bit indicates that the original + * pre-exception SP was not 8-aligned and we added a padding word to + * align it, so we undo this by ORing in the bit that increases it + * from the current 8-aligned value to the 8-unaligned value. (Adding 4 + * would work too but a logical OR is how the pseudocode specifies it.) + */ + if (xpsr & XPSR_SPREALIGN) { + frameptr |= 4; + } + *frame_sp_p = frameptr; + } + + xpsr_mask = ~(XPSR_SPREALIGN | XPSR_SFPA); + if (!arm_feature(env, ARM_FEATURE_THUMB_DSP)) { + xpsr_mask &= ~XPSR_GE; + } + /* This xpsr_write() will invalidate frame_sp_p as it may switch stack */ + xpsr_write(env, xpsr, xpsr_mask); + + if (env->v7m.secure) { + bool sfpa = xpsr & XPSR_SFPA; + + env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S], + V7M_CONTROL, SFPA, sfpa); + } + + /* + * The restored xPSR exception field will be zero if we're + * resuming in Thread mode. If that doesn't match what the + * exception return excret specified then this is a UsageFault. + * v7M requires we make this check here; v8M did it earlier. + */ + if (return_to_handler != arm_v7m_is_handler_mode(env)) { + /* + * Take an INVPC UsageFault by pushing the stack again; + * we know we're v7M so this is never a Secure UsageFault. + */ + bool ignore_stackfaults; + + assert(!arm_feature(env, ARM_FEATURE_V8)); + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + ignore_stackfaults = v7m_push_stack(cpu); + qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: " + "failed exception return integrity check\n"); + v7m_exception_taken(cpu, excret, false, ignore_stackfaults); + return; + } + + /* Otherwise, we have a successful exception exit. */ + arm_clear_exclusive(env); + arm_rebuild_hflags(env); + qemu_log_mask(CPU_LOG_INT, "...successful exception return\n"); +} + +static bool do_v7m_function_return(ARMCPU *cpu) +{ + /* + * v8M security extensions magic function return. + * We may either: + * (1) throw an exception (longjump) + * (2) return true if we successfully handled the function return + * (3) return false if we failed a consistency check and have + * pended a UsageFault that needs to be taken now + * + * At this point the magic return value is split between env->regs[15] + * and env->thumb. We don't bother to reconstitute it because we don't + * need it (all values are handled the same way). + */ + CPUARMState *env = &cpu->env; + uint32_t newpc, newpsr, newpsr_exc; + + qemu_log_mask(CPU_LOG_INT, "...really v7M secure function return\n"); + + { + bool threadmode, spsel; + MemOpIdx oi; + ARMMMUIdx mmu_idx; + uint32_t *frame_sp_p; + uint32_t frameptr; + + /* Pull the return address and IPSR from the Secure stack */ + threadmode = !arm_v7m_is_handler_mode(env); + spsel = env->v7m.control[M_REG_S] & R_V7M_CONTROL_SPSEL_MASK; + + frame_sp_p = get_v7m_sp_ptr(env, true, threadmode, spsel); + frameptr = *frame_sp_p; + + /* + * These loads may throw an exception (for MPU faults). 
We want to + * do them as secure, so work out what MMU index that is. + */ + mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); + oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx)); + newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0); + newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0); + + /* Consistency checks on new IPSR */ + newpsr_exc = newpsr & XPSR_EXCP; + if (!((env->v7m.exception == 0 && newpsr_exc == 0) || + (env->v7m.exception == 1 && newpsr_exc != 0))) { + /* Pend the fault and tell our caller to take it */ + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + qemu_log_mask(CPU_LOG_INT, + "...taking INVPC UsageFault: " + "IPSR consistency check failed\n"); + return false; + } + + *frame_sp_p = frameptr + 8; + } + + /* This invalidates frame_sp_p */ + switch_v7m_security_state(env, true); + env->v7m.exception = newpsr_exc; + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + if (newpsr & XPSR_SFPA) { + env->v7m.control[M_REG_S] |= R_V7M_CONTROL_SFPA_MASK; + } + xpsr_write(env, 0, XPSR_IT); + env->thumb = newpc & 1; + env->regs[15] = newpc & ~1; + arm_rebuild_hflags(env); + + qemu_log_mask(CPU_LOG_INT, "...function return successful\n"); + return true; +} + +static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, bool secure, + uint32_t addr, uint16_t *insn) +{ + /* + * Load a 16-bit portion of a v7M instruction, returning true on success, + * or false on failure (in which case we will have pended the appropriate + * exception). + * We need to do the instruction fetch's MPU and SAU checks + * like this because there is no MMU index that would allow + * doing the load with a single function call. Instead we must + * first check that the security attributes permit the load + * and that they don't mismatch on the two halves of the instruction, + * and then we do the load as a secure load (ie using the security + * attributes of the address, not the CPU, as architecturally required). + */ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + V8M_SAttributes sattrs = {}; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + MemTxResult txres; + + v8m_security_lookup(env, addr, MMU_INST_FETCH, mmu_idx, secure, &sattrs); + if (!sattrs.nsc || sattrs.ns) { + /* + * This must be the second half of the insn, and it straddles a + * region boundary with the second half not being S&NSC. + */ + env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVEP\n"); + return false; + } + if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &res, &fi)) { + /* the MPU lookup failed */ + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure); + qemu_log_mask(CPU_LOG_INT, "...really MemManage with CFSR.IACCVIOL\n"); + return false; + } + *insn = address_space_lduw_le(arm_addressspace(cs, res.f.attrs), + res.f.phys_addr, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); + qemu_log_mask(CPU_LOG_INT, "...really BusFault with CFSR.IBUSERR\n"); + return false; + } + return true; +} + +static bool v7m_read_sg_stack_word(ARMCPU *cpu, ARMMMUIdx mmu_idx, + uint32_t addr, uint32_t *spdata) +{ + /* + * Read a word of data from the stack for the SG instruction, + * writing the value into *spdata. 
If the load succeeds, return + * true; otherwise pend an appropriate exception and return false. + * (We can't use data load helpers here that throw an exception + * because of the context we're called in, which is halfway through + * arm_v7m_cpu_do_interrupt().) + */ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxResult txres; + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + uint32_t value; + + if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault during stack word read\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault during stack word read\n"); + env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_DACCVIOL_MASK | + R_V7M_CFSR_MMARVALID_MASK; + env->v7m.mmfar[M_REG_S] = addr; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, false); + } + return false; + } + value = address_space_ldl(arm_addressspace(cs, res.f.attrs), + res.f.phys_addr, res.f.attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to read the data */ + qemu_log_mask(CPU_LOG_INT, + "...BusFault during stack word read\n"); + env->v7m.cfsr[M_REG_NS] |= + (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); + env->v7m.bfar = addr; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); + return false; + } + + *spdata = value; + return true; +} + +static bool v7m_handle_execute_nsc(ARMCPU *cpu) +{ + /* + * Check whether this attempt to execute code in a Secure & NS-Callable + * memory region is for an SG instruction; if so, then emulate the + * effect of the SG instruction and return true. Otherwise pend + * the correct kind of exception and return false. + */ + CPUARMState *env = &cpu->env; + ARMMMUIdx mmu_idx; + uint16_t insn; + + /* + * We should never get here unless get_phys_addr_pmsav8() caused + * an exception for NS executing in S&NSC memory. + */ + assert(!env->v7m.secure); + assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); + + /* We want to do the MPU lookup as secure; work out what mmu_idx that is */ + mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true); + + if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15], &insn)) { + return false; + } + + if (!env->thumb) { + goto gen_invep; + } + + if (insn != 0xe97f) { + /* + * Not an SG instruction first half (we choose the IMPDEF + * early-SG-check option). + */ + goto gen_invep; + } + + if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15] + 2, &insn)) { + return false; + } + + if (insn != 0xe97f) { + /* + * Not an SG instruction second half (yes, both halves of the SG + * insn have the same hex value) + */ + goto gen_invep; + } + + /* + * OK, we have confirmed that we really have an SG instruction. + * We know we're NS in S memory so don't need to repeat those checks. + */ + qemu_log_mask(CPU_LOG_INT, "...really an SG instruction at 0x%08" PRIx32 + ", executing it\n", env->regs[15]); + + if (cpu_isar_feature(aa32_m_sec_state, cpu) && + !arm_v7m_is_handler_mode(env)) { + /* + * v8.1M exception stack frame integrity check. Note that we + * must perform the memory access even if CCR_S.TRD is zero + * and we aren't going to check what the data loaded is. + */ + uint32_t spdata, sp; + + /* + * We know we are currently NS, so the S stack pointers must be + * in other_ss_{psp,msp}, not in regs[13]/other_sp. 
+ */ + sp = v7m_using_psp(env) ? env->v7m.other_ss_psp : env->v7m.other_ss_msp; + if (!v7m_read_sg_stack_word(cpu, mmu_idx, sp, &spdata)) { + /* Stack access failed and an exception has been pended */ + return false; + } + + if (env->v7m.ccr[M_REG_S] & R_V7M_CCR_TRD_MASK) { + if (((spdata & ~1) == 0xfefa125a) || + !(env->v7m.control[M_REG_S] & 1)) { + goto gen_invep; + } + } + } + + env->regs[14] &= ~1; + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + switch_v7m_security_state(env, true); + xpsr_write(env, 0, XPSR_IT); + env->regs[15] += 4; + arm_rebuild_hflags(env); + return true; + +gen_invep: + env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVEP\n"); + return false; +} + +void arm_v7m_cpu_do_interrupt(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; + uint32_t lr; + bool ignore_stackfaults; + + arm_log_exception(cs); + + /* + * For exceptions we just mark as pending on the NVIC, and let that + * handle it. + */ + switch (cs->exception_index) { + case EXCP_UDEF: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNDEFINSTR_MASK; + break; + case EXCP_NOCP: + { + /* + * NOCP might be directed to something other than the current + * security state if this fault is because of NSACR; we indicate + * the target security state using exception.target_el. + */ + int target_secstate; + + if (env->exception.target_el == 3) { + target_secstate = M_REG_S; + } else { + target_secstate = env->v7m.secure; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, target_secstate); + env->v7m.cfsr[target_secstate] |= R_V7M_CFSR_NOCP_MASK; + break; + } + case EXCP_INVSTATE: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVSTATE_MASK; + break; + case EXCP_STKOF: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK; + break; + case EXCP_LSERR: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK; + break; + case EXCP_UNALIGNED: + /* Unaligned faults reported by M-profile aware code */ + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; + break; + case EXCP_DIVBYZERO: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK; + break; + case EXCP_SWI: + /* The PC already points to the next instruction. */ + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure); + break; + case EXCP_PREFETCH_ABORT: + case EXCP_DATA_ABORT: + /* + * Note that for M profile we don't have a guest facing FSR, but + * the env->exception.fsr will be populated by the code that + * raises the fault, in the A profile short-descriptor format. + * + * Log the exception.vaddress now regardless of subtype, because + * logging below only logs it when it goes into a guest visible + * register. + */ + qemu_log_mask(CPU_LOG_INT, "...at fault address 0x%x\n", + (uint32_t)env->exception.vaddress); + switch (env->exception.fsr & 0xf) { + case M_FAKE_FSR_NSC_EXEC: + /* + * Exception generated when we try to execute code at an address + * which is marked as Secure & Non-Secure Callable and the CPU + * is in the Non-Secure state. 
The only instruction which can + * be executed like this is SG (and that only if both halves of + * the SG instruction have the same security attributes.) + * Everything else must generate an INVEP SecureFault, so we + * emulate the SG instruction here. + */ + if (v7m_handle_execute_nsc(cpu)) { + return; + } + break; + case M_FAKE_FSR_SFAULT: + /* + * Various flavours of SecureFault for attempts to execute or + * access data in the wrong security state. + */ + switch (cs->exception_index) { + case EXCP_PREFETCH_ABORT: + if (env->v7m.secure) { + env->v7m.sfsr |= R_V7M_SFSR_INVTRAN_MASK; + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVTRAN\n"); + } else { + env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK; + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.INVEP\n"); + } + break; + case EXCP_DATA_ABORT: + /* This must be an NS access to S memory */ + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK; + qemu_log_mask(CPU_LOG_INT, + "...really SecureFault with SFSR.AUVIOL\n"); + break; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); + break; + case 0x8: /* External Abort */ + switch (cs->exception_index) { + case EXCP_PREFETCH_ABORT: + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK; + qemu_log_mask(CPU_LOG_INT, "...with CFSR.IBUSERR\n"); + break; + case EXCP_DATA_ABORT: + env->v7m.cfsr[M_REG_NS] |= + (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK); + env->v7m.bfar = env->exception.vaddress; + qemu_log_mask(CPU_LOG_INT, + "...with CFSR.PRECISERR and BFAR 0x%x\n", + env->v7m.bfar); + break; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false); + break; + case 0x1: /* Alignment fault reported by generic code */ + qemu_log_mask(CPU_LOG_INT, + "...really UsageFault with UFSR.UNALIGNED\n"); + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK; + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, + env->v7m.secure); + break; + default: + /* + * All other FSR values are either MPU faults or "can't happen + * for M profile" cases. + */ + switch (cs->exception_index) { + case EXCP_PREFETCH_ABORT: + env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; + qemu_log_mask(CPU_LOG_INT, "...with CFSR.IACCVIOL\n"); + break; + case EXCP_DATA_ABORT: + env->v7m.cfsr[env->v7m.secure] |= + (R_V7M_CFSR_DACCVIOL_MASK | R_V7M_CFSR_MMARVALID_MASK); + env->v7m.mmfar[env->v7m.secure] = env->exception.vaddress; + qemu_log_mask(CPU_LOG_INT, + "...with CFSR.DACCVIOL and MMFAR 0x%x\n", + env->v7m.mmfar[env->v7m.secure]); + break; + } + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, + env->v7m.secure); + break; + } + break; + case EXCP_SEMIHOST: + qemu_log_mask(CPU_LOG_INT, + "...handling as semihosting call 0x%x\n", + env->regs[0]); +#ifdef CONFIG_TCG + do_common_semihosting(cs); +#else + g_assert_not_reached(); +#endif + env->regs[15] += env->thumb ? 2 : 4; + return; + case EXCP_BKPT: + armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_DEBUG, false); + break; + case EXCP_IRQ: + break; + case EXCP_EXCEPTION_EXIT: + if (env->regs[15] < EXC_RETURN_MIN_MAGIC) { + /* Must be v8M security extension function return */ + assert(env->regs[15] >= FNC_RETURN_MIN_MAGIC); + assert(arm_feature(env, ARM_FEATURE_M_SECURITY)); + if (do_v7m_function_return(cpu)) { + return; + } + } else { + do_v7m_exception_exit(cpu); + return; + } + break; + case EXCP_LAZYFP: + /* + * We already pended the specific exception in the NVIC in the + * v7m_preserve_fp_state() helper function. 
+ */ + break; + default: + cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index); + return; /* Never happens. Keep compiler happy. */ + } + + if (arm_feature(env, ARM_FEATURE_V8)) { + lr = R_V7M_EXCRET_RES1_MASK | + R_V7M_EXCRET_DCRS_MASK; + /* + * The S bit indicates whether we should return to Secure + * or NonSecure (ie our current state). + * The ES bit indicates whether we're taking this exception + * to Secure or NonSecure (ie our target state). We set it + * later, in v7m_exception_taken(). + * The SPSEL bit is also set in v7m_exception_taken() for v8M. + * This corresponds to the ARM ARM pseudocode for v8M setting + * some LR bits in PushStack() and some in ExceptionTaken(); + * the distinction matters for the tailchain cases where we + * can take an exception without pushing the stack. + */ + if (env->v7m.secure) { + lr |= R_V7M_EXCRET_S_MASK; + } + } else { + lr = R_V7M_EXCRET_RES1_MASK | + R_V7M_EXCRET_S_MASK | + R_V7M_EXCRET_DCRS_MASK | + R_V7M_EXCRET_ES_MASK; + if (env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK) { + lr |= R_V7M_EXCRET_SPSEL_MASK; + } + } + if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) { + lr |= R_V7M_EXCRET_FTYPE_MASK; + } + if (!arm_v7m_is_handler_mode(env)) { + lr |= R_V7M_EXCRET_MODE_MASK; + } + + ignore_stackfaults = v7m_push_stack(cpu); + v7m_exception_taken(cpu, lr, false, ignore_stackfaults); +} + +uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg) +{ + unsigned el = arm_current_el(env); + + /* First handle registers which unprivileged can read */ + switch (reg) { + case 0 ... 7: /* xPSR sub-fields */ + return v7m_mrs_xpsr(env, reg, el); + case 20: /* CONTROL */ + return v7m_mrs_control(env, env->v7m.secure); + case 0x94: /* CONTROL_NS */ + /* + * We have to handle this here because unprivileged Secure code + * can read the NS CONTROL register. + */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.control[M_REG_NS] | + (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK); + } + + if (el == 0) { + return 0; /* unprivileged reads others as zero */ + } + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + switch (reg) { + case 0x88: /* MSP_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.other_ss_msp; + case 0x89: /* PSP_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.other_ss_psp; + case 0x8a: /* MSPLIM_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.msplim[M_REG_NS]; + case 0x8b: /* PSPLIM_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.psplim[M_REG_NS]; + case 0x90: /* PRIMASK_NS */ + if (!env->v7m.secure) { + return 0; + } + return env->v7m.primask[M_REG_NS]; + case 0x91: /* BASEPRI_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return 0; + } + return env->v7m.basepri[M_REG_NS]; + case 0x93: /* FAULTMASK_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return 0; + } + return env->v7m.faultmask[M_REG_NS]; + case 0x98: /* SP_NS */ + { + /* + * This gives the non-secure SP selected based on whether we're + * currently in handler mode or not, using the NS CONTROL.SPSEL. + */ + bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; + + if (!env->v7m.secure) { + return 0; + } + if (!arm_v7m_is_handler_mode(env) && spsel) { + return env->v7m.other_ss_psp; + } else { + return env->v7m.other_ss_msp; + } + } + default: + break; + } + } + + switch (reg) { + case 8: /* MSP */ + return v7m_using_psp(env) ? 
env->v7m.other_sp : env->regs[13]; + case 9: /* PSP */ + return v7m_using_psp(env) ? env->regs[13] : env->v7m.other_sp; + case 10: /* MSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + return env->v7m.msplim[env->v7m.secure]; + case 11: /* PSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + return env->v7m.psplim[env->v7m.secure]; + case 16: /* PRIMASK */ + return env->v7m.primask[env->v7m.secure]; + case 17: /* BASEPRI */ + case 18: /* BASEPRI_MAX */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + return env->v7m.basepri[env->v7m.secure]; + case 19: /* FAULTMASK */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + return env->v7m.faultmask[env->v7m.secure]; + default: + bad_reg: + qemu_log_mask(LOG_GUEST_ERROR, "Attempt to read unknown special" + " register %d\n", reg); + return 0; + } +} + +void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val) +{ + /* + * We're passed bits [11..0] of the instruction; extract + * SYSm and the mask bits. + * Invalid combinations of SYSm and mask are UNPREDICTABLE; + * we choose to treat them as if the mask bits were valid. + * NB that the pseudocode 'mask' variable is bits [11..10], + * whereas ours is [11..8]. + */ + uint32_t mask = extract32(maskreg, 8, 4); + uint32_t reg = extract32(maskreg, 0, 8); + int cur_el = arm_current_el(env); + + if (cur_el == 0 && reg > 7 && reg != 20) { + /* + * only xPSR sub-fields and CONTROL.SFPA may be written by + * unprivileged code + */ + return; + } + + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + switch (reg) { + case 0x88: /* MSP_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.other_ss_msp = val & ~3; + return; + case 0x89: /* PSP_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.other_ss_psp = val & ~3; + return; + case 0x8a: /* MSPLIM_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.msplim[M_REG_NS] = val & ~7; + return; + case 0x8b: /* PSPLIM_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.psplim[M_REG_NS] = val & ~7; + return; + case 0x90: /* PRIMASK_NS */ + if (!env->v7m.secure) { + return; + } + env->v7m.primask[M_REG_NS] = val & 1; + return; + case 0x91: /* BASEPRI_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return; + } + env->v7m.basepri[M_REG_NS] = val & 0xff; + return; + case 0x93: /* FAULTMASK_NS */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + if (!env->v7m.secure) { + return; + } + env->v7m.faultmask[M_REG_NS] = val & 1; + return; + case 0x94: /* CONTROL_NS */ + if (!env->v7m.secure) { + return; + } + write_v7m_control_spsel_for_secstate(env, + val & R_V7M_CONTROL_SPSEL_MASK, + M_REG_NS); + if (arm_feature(env, ARM_FEATURE_M_MAIN)) { + env->v7m.control[M_REG_NS] &= ~R_V7M_CONTROL_NPRIV_MASK; + env->v7m.control[M_REG_NS] |= val & R_V7M_CONTROL_NPRIV_MASK; + } + /* + * SFPA is RAZ/WI from NS. FPCA is RO if NSACR.CP10 == 0, + * RES0 if the FPU is not present, and is stored in the S bank + */ + if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) && + extract32(env->v7m.nsacr, 10, 1)) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; + env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; + } + return; + case 0x98: /* SP_NS */ + { + /* + * This gives the non-secure SP selected based on whether we're + * currently in handler mode or not, using the NS CONTROL.SPSEL. 
+ */ + bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK; + bool is_psp = !arm_v7m_is_handler_mode(env) && spsel; + uint32_t limit; + + if (!env->v7m.secure) { + return; + } + + limit = is_psp ? env->v7m.psplim[false] : env->v7m.msplim[false]; + + val &= ~0x3; + + if (val < limit) { + raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); + } + + if (is_psp) { + env->v7m.other_ss_psp = val; + } else { + env->v7m.other_ss_msp = val; + } + return; + } + default: + break; + } + } + + switch (reg) { + case 0 ... 7: /* xPSR sub-fields */ + v7m_msr_xpsr(env, mask, reg, val); + break; + case 8: /* MSP */ + if (v7m_using_psp(env)) { + env->v7m.other_sp = val & ~3; + } else { + env->regs[13] = val & ~3; + } + break; + case 9: /* PSP */ + if (v7m_using_psp(env)) { + env->regs[13] = val & ~3; + } else { + env->v7m.other_sp = val & ~3; + } + break; + case 10: /* MSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + env->v7m.msplim[env->v7m.secure] = val & ~7; + break; + case 11: /* PSPLIM */ + if (!arm_feature(env, ARM_FEATURE_V8)) { + goto bad_reg; + } + env->v7m.psplim[env->v7m.secure] = val & ~7; + break; + case 16: /* PRIMASK */ + env->v7m.primask[env->v7m.secure] = val & 1; + break; + case 17: /* BASEPRI */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + env->v7m.basepri[env->v7m.secure] = val & 0xff; + break; + case 18: /* BASEPRI_MAX */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + val &= 0xff; + if (val != 0 && (val < env->v7m.basepri[env->v7m.secure] + || env->v7m.basepri[env->v7m.secure] == 0)) { + env->v7m.basepri[env->v7m.secure] = val; + } + break; + case 19: /* FAULTMASK */ + if (!arm_feature(env, ARM_FEATURE_M_MAIN)) { + goto bad_reg; + } + env->v7m.faultmask[env->v7m.secure] = val & 1; + break; + case 20: /* CONTROL */ + /* + * Writing to the SPSEL bit only has an effect if we are in + * thread mode; other bits can be updated by any privileged code. + * write_v7m_control_spsel() deals with updating the SPSEL bit in + * env->v7m.control, so we only need update the others. + * For v7M, we must just ignore explicit writes to SPSEL in handler + * mode; for v8M the write is permitted but will have no effect. + * All these bits are writes-ignored from non-privileged code, + * except for SFPA. + */ + if (cur_el > 0 && (arm_feature(env, ARM_FEATURE_V8) || + !arm_v7m_is_handler_mode(env))) { + write_v7m_control_spsel(env, (val & R_V7M_CONTROL_SPSEL_MASK) != 0); + } + if (cur_el > 0 && arm_feature(env, ARM_FEATURE_M_MAIN)) { + env->v7m.control[env->v7m.secure] &= ~R_V7M_CONTROL_NPRIV_MASK; + env->v7m.control[env->v7m.secure] |= val & R_V7M_CONTROL_NPRIV_MASK; + } + if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) { + /* + * SFPA is RAZ/WI from NS or if no FPU. + * FPCA is RO if NSACR.CP10 == 0, RES0 if the FPU is not present. + * Both are stored in the S bank. + */ + if (env->v7m.secure) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK; + env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_SFPA_MASK; + } + if (cur_el > 0 && + (env->v7m.secure || !arm_feature(env, ARM_FEATURE_M_SECURITY) || + extract32(env->v7m.nsacr, 10, 1))) { + env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK; + env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK; + } + } + break; + default: + bad_reg: + qemu_log_mask(LOG_GUEST_ERROR, "Attempt to write unknown special" + " register %d\n", reg); + return; + } +} + +uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op) +{ + /* Implement the TT instruction. 
op is bits [7:6] of the insn. */ + bool forceunpriv = op & 1; + bool alt = op & 2; + V8M_SAttributes sattrs = {}; + uint32_t tt_resp; + bool r, rw, nsr, nsrw, mrvalid; + ARMMMUIdx mmu_idx; + uint32_t mregion; + bool targetpriv; + bool targetsec = env->v7m.secure; + + /* + * Work out what the security state and privilege level we're + * interested in is... + */ + if (alt) { + targetsec = !targetsec; + } + + if (forceunpriv) { + targetpriv = false; + } else { + targetpriv = arm_v7m_is_handler_mode(env) || + !(env->v7m.control[targetsec] & R_V7M_CONTROL_NPRIV_MASK); + } + + /* ...and then figure out which MMU index this is */ + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targetsec, targetpriv); + + /* + * We know that the MPU and SAU don't care about the access type + * for our purposes beyond that we don't want to claim to be + * an insn fetch, so we arbitrarily call this a read. + */ + + /* + * MPU region info only available for privileged or if + * inspecting the other MPU state. + */ + if (arm_current_el(env) != 0 || alt) { + GetPhysAddrResult res = {}; + ARMMMUFaultInfo fi = {}; + + /* We can ignore the return value as prot is always set */ + pmsav8_mpu_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, targetsec, + &res, &fi, &mregion); + if (mregion == -1) { + mrvalid = false; + mregion = 0; + } else { + mrvalid = true; + } + r = res.f.prot & PAGE_READ; + rw = res.f.prot & PAGE_WRITE; + } else { + r = false; + rw = false; + mrvalid = false; + mregion = 0; + } + + if (env->v7m.secure) { + v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, + targetsec, &sattrs); + nsr = sattrs.ns && r; + nsrw = sattrs.ns && rw; + } else { + sattrs.ns = true; + nsr = false; + nsrw = false; + } + + tt_resp = (sattrs.iregion << 24) | + (sattrs.irvalid << 23) | + ((!sattrs.ns) << 22) | + (nsrw << 21) | + (nsr << 20) | + (rw << 19) | + (r << 18) | + (sattrs.srvalid << 17) | + (mrvalid << 16) | + (sattrs.sregion << 8) | + mregion; + + return tt_resp; +} + +#endif /* !CONFIG_USER_ONLY */ diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index 044561bd4d..1f27ba1272 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -23,10 +23,23 @@ arm_ss.add(files( 'translate-mve.c', 'translate-neon.c', 'translate-vfp.c', + 'crypto_helper.c', + 'iwmmxt_helper.c', + 'm_helper.c', + 'mve_helper.c', + 'neon_helper.c', + 'op_helper.c', + 'tlb_helper.c', + 'vec_helper.c', )) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'translate-a64.c', 'translate-sve.c', 'translate-sme.c', + 'helper-a64.c', + 'mte_helper.c', + 'pauth_helper.c', + 'sme_helper.c', + 'sve_helper.c', )) diff --git a/target/arm/tcg/mte_helper.c b/target/arm/tcg/mte_helper.c new file mode 100644 index 0000000000..98bcf59c22 --- /dev/null +++ b/target/arm/tcg/mte_helper.c @@ -0,0 +1,908 @@ +/* + * ARM v8.5-MemTag Operations + * + * Copyright (c) 2020 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/ram_addr.h" +#include "exec/cpu_ldst.h" +#include "exec/helper-proto.h" +#include "qapi/error.h" +#include "qemu/guest-random.h" + + +static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude) +{ + if (exclude == 0xffff) { + return 0; + } + if (offset == 0) { + while (exclude & (1 << tag)) { + tag = (tag + 1) & 15; + } + } else { + do { + do { + tag = (tag + 1) & 15; + } while (exclude & (1 << tag)); + } while (--offset > 0); + } + return tag; +} + +/** + * allocation_tag_mem: + * @env: the cpu environment + * @ptr_mmu_idx: the addressing regime to use for the virtual address + * @ptr: the virtual address for which to look up tag memory + * @ptr_access: the access to use for the virtual address + * @ptr_size: the number of bytes in the normal memory access + * @tag_access: the access to use for the tag memory + * @tag_size: the number of bytes in the tag memory access + * @ra: the return address for exception handling + * + * Our tag memory is formatted as a sequence of little-endian nibbles. + * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two + * tags, with the tag at [3:0] for the lower addr and the tag at [7:4] + * for the higher addr. + * + * Here, resolve the physical address from the virtual address, and return + * a pointer to the corresponding tag byte. Exit with exception if the + * virtual address is not accessible for @ptr_access. + * + * The @ptr_size and @tag_size values may not have an obvious relation + * due to the alignment of @ptr, and the number of tag checks required. + * + * If there is no tag storage corresponding to @ptr, return NULL. + */ +static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx, + uint64_t ptr, MMUAccessType ptr_access, + int ptr_size, MMUAccessType tag_access, + int tag_size, uintptr_t ra) +{ +#ifdef CONFIG_USER_ONLY + uint64_t clean_ptr = useronly_clean_ptr(ptr); + int flags = page_get_flags(clean_ptr); + uint8_t *tags; + uintptr_t index; + + if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE_ORG : PAGE_READ))) { + cpu_loop_exit_sigsegv(env_cpu(env), ptr, ptr_access, + !(flags & PAGE_VALID), ra); + } + + /* Require both MAP_ANON and PROT_MTE for the page. */ + if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) { + return NULL; + } + + tags = page_get_target_data(clean_ptr); + + index = extract32(ptr, LOG2_TAG_GRANULE + 1, + TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1); + return tags + index; +#else + CPUTLBEntryFull *full; + MemTxAttrs attrs; + int in_page, flags; + hwaddr ptr_paddr, tag_paddr, xlat; + MemoryRegion *mr; + ARMASIdx tag_asi; + AddressSpace *tag_as; + void *host; + + /* + * Probe the first byte of the virtual address. This raises an + * exception for inaccessible pages, and resolves the virtual address + * into the softmmu tlb. + * + * When RA == 0, this is for mte_probe. The page is expected to be + * valid. Indicate to probe_access_flags no-fault, then assert that + * we received a valid page. + */ + flags = probe_access_full(env, ptr, ptr_access, ptr_mmu_idx, + ra == 0, &host, &full, ra); + assert(!(flags & TLB_INVALID_MASK)); + + /* If the virtual page MemAttr != Tagged, access unchecked. */ + if (full->pte_attrs != 0xf0) { + return NULL; + } + + /* + * If not backed by host ram, there is no tag storage: access unchecked. + * This is probably a guest os bug though, so log it. 
+ */ + if (unlikely(flags & TLB_MMIO)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Page @ 0x%" PRIx64 " indicates Tagged Normal memory " + "but is not backed by host ram\n", ptr); + return NULL; + } + + /* + * Remember these values across the second lookup below, + * which may invalidate this pointer via tlb resize. + */ + ptr_paddr = full->phys_addr | (ptr & ~TARGET_PAGE_MASK); + attrs = full->attrs; + full = NULL; + + /* + * The Normal memory access can extend to the next page. E.g. a single + * 8-byte access to the last byte of a page will check only the last + * tag on the first page. + * Any page access exception has priority over tag check exception. + */ + in_page = -(ptr | TARGET_PAGE_MASK); + if (unlikely(ptr_size > in_page)) { + flags |= probe_access_full(env, ptr + in_page, ptr_access, + ptr_mmu_idx, ra == 0, &host, &full, ra); + assert(!(flags & TLB_INVALID_MASK)); + } + + /* Any debug exception has priority over a tag check exception. */ + if (unlikely(flags & TLB_WATCHPOINT)) { + int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE; + assert(ra != 0); + cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, attrs, wp, ra); + } + + /* Convert to the physical address in tag space. */ + tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1); + + /* Look up the address in tag space. */ + tag_asi = attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS; + tag_as = cpu_get_address_space(env_cpu(env), tag_asi); + mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL, + tag_access == MMU_DATA_STORE, attrs); + + /* + * Note that @mr will never be NULL. If there is nothing in the address + * space at @tag_paddr, the translation will return the unallocated memory + * region. For our purposes, the result must be ram. + */ + if (unlikely(!memory_region_is_ram(mr))) { + /* ??? Failure is a board configuration error. */ + qemu_log_mask(LOG_UNIMP, + "Tag Memory @ 0x%" HWADDR_PRIx " not found for " + "Normal Memory @ 0x%" HWADDR_PRIx "\n", + tag_paddr, ptr_paddr); + return NULL; + } + + /* + * Ensure the tag memory is dirty on write, for migration. + * Tag memory can never contain code or display memory (vga). + */ + if (tag_access == MMU_DATA_STORE) { + ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat; + cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION); + } + + return memory_region_get_ram_ptr(mr) + xlat; +#endif +} + +uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm) +{ + uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16); + int rrnd = extract32(env->cp15.gcr_el1, 16, 1); + int start = extract32(env->cp15.rgsr_el1, 0, 4); + int seed = extract32(env->cp15.rgsr_el1, 8, 16); + int offset, i, rtag; + + /* + * Our IMPDEF choice for GCR_EL1.RRND==1 is to continue to use the + * deterministic algorithm. Except that with RRND==1 the kernel is + * not required to have set RGSR_EL1.SEED != 0, which is required for + * the deterministic algorithm to function. So we force a non-zero + * SEED for that case. + */ + if (unlikely(seed == 0) && rrnd) { + do { + Error *err = NULL; + uint16_t two; + + if (qemu_guest_getrandom(&two, sizeof(two), &err) < 0) { + /* + * Failed, for unknown reasons in the crypto subsystem. + * Best we can do is log the reason and use a constant seed. 
+ */ + qemu_log_mask(LOG_UNIMP, "IRG: Crypto failure: %s\n", + error_get_pretty(err)); + error_free(err); + two = 1; + } + seed = two; + } while (seed == 0); + } + + /* RandomTag */ + for (i = offset = 0; i < 4; ++i) { + /* NextRandomTagBit */ + int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^ + extract32(seed, 2, 1) ^ extract32(seed, 0, 1)); + seed = (top << 15) | (seed >> 1); + offset |= top << i; + } + rtag = choose_nonexcluded_tag(start, offset, exclude); + env->cp15.rgsr_el1 = rtag | (seed << 8); + + return address_with_allocation_tag(rn, rtag); +} + +uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr, + int32_t offset, uint32_t tag_offset) +{ + int start_tag = allocation_tag_from_addr(ptr); + uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16); + int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude); + + return address_with_allocation_tag(ptr + offset, rtag); +} + +static int load_tag1(uint64_t ptr, uint8_t *mem) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + return extract32(*mem, ofs, 4); +} + +uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + int mmu_idx = cpu_mmu_index(env, false); + uint8_t *mem; + int rtag = 0; + + /* Trap if accessing an invalid page. */ + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1, + MMU_DATA_LOAD, 1, GETPC()); + + /* Load if page supports tags. */ + if (mem) { + rtag = load_tag1(ptr, mem); + } + + return address_with_allocation_tag(xt, rtag); +} + +static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra) +{ + if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) { + arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE, + cpu_mmu_index(env, false), ra); + g_assert_not_reached(); + } +} + +/* For use in a non-parallel context, store to the given nibble. */ +static void store_tag1(uint64_t ptr, uint8_t *mem, int tag) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + *mem = deposit32(*mem, ofs, 4, tag); +} + +/* For use in a parallel context, atomically store to the given nibble. */ +static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + uint8_t old = qatomic_read(mem); + + while (1) { + uint8_t new = deposit32(old, ofs, 4, tag); + uint8_t cmp = qatomic_cmpxchg(mem, old, new); + if (likely(cmp == old)) { + return; + } + old = cmp; + } +} + +typedef void stg_store1(uint64_t, uint8_t *, int); + +static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt, + uintptr_t ra, stg_store1 store1) +{ + int mmu_idx = cpu_mmu_index(env, false); + uint8_t *mem; + + check_tag_aligned(env, ptr, ra); + + /* Trap if accessing an invalid page. */ + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE, + MMU_DATA_STORE, 1, ra); + + /* Store if page supports tags. 
*/ + if (mem) { + store1(ptr, mem, allocation_tag_from_addr(xt)); + } +} + +void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_stg(env, ptr, xt, GETPC(), store_tag1); +} + +void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_stg(env, ptr, xt, GETPC(), store_tag1_parallel); +} + +void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + + check_tag_aligned(env, ptr, ra); + probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); +} + +static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt, + uintptr_t ra, stg_store1 store1) +{ + int mmu_idx = cpu_mmu_index(env, false); + int tag = allocation_tag_from_addr(xt); + uint8_t *mem1, *mem2; + + check_tag_aligned(env, ptr, ra); + + /* + * Trap if accessing an invalid page(s). + * This takes priority over !allocation_tag_access_enabled. + */ + if (ptr & TAG_GRANULE) { + /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */ + mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + TAG_GRANULE, MMU_DATA_STORE, 1, ra); + mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE, + MMU_DATA_STORE, TAG_GRANULE, + MMU_DATA_STORE, 1, ra); + + /* Store if page(s) support tags. */ + if (mem1) { + store1(TAG_GRANULE, mem1, tag); + } + if (mem2) { + store1(0, mem2, tag); + } + } else { + /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */ + mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra); + if (mem1) { + tag |= tag << 4; + qatomic_set(mem1, tag); + } + } +} + +void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_st2g(env, ptr, xt, GETPC(), store_tag1); +} + +void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel); +} + +void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + int in_page = -(ptr | TARGET_PAGE_MASK); + + check_tag_aligned(env, ptr, ra); + + if (likely(in_page >= 2 * TAG_GRANULE)) { + probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra); + } else { + probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); + probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra); + } +} + +#define LDGM_STGM_SIZE (4 << GMID_EL1_BS) + +uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + void *tag_mem; + + ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); + + /* Trap if accessing an invalid page. */ + tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, + LDGM_STGM_SIZE, MMU_DATA_LOAD, + LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); + + /* The tag is squashed to zero if the page does not support tags. */ + if (!tag_mem) { + return 0; + } + + QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); + /* + * We are loading 64-bits worth of tags. The ordering of elements + * within the word corresponds to a 64-bit little-endian operation. + */ + return ldq_le_p(tag_mem); +} + +void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + void *tag_mem; + + ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); + + /* Trap if accessing an invalid page. 
*/ + tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + LDGM_STGM_SIZE, MMU_DATA_LOAD, + LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); + + /* + * Tag store only happens if the page support tags, + * and if the OS has enabled access to the tags. + */ + if (!tag_mem) { + return; + } + + QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); + /* + * We are storing 64-bits worth of tags. The ordering of elements + * within the word corresponds to a 64-bit little-endian operation. + */ + stq_le_p(tag_mem, val); +} + +void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) +{ + uintptr_t ra = GETPC(); + int mmu_idx = cpu_mmu_index(env, false); + int log2_dcz_bytes, log2_tag_bytes; + intptr_t dcz_bytes, tag_bytes; + uint8_t *mem; + + /* + * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1, + * i.e. 32 bytes, which is an unreasonably small dcz anyway, + * to make sure that we can access one complete tag byte here. + */ + log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; + log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); + dcz_bytes = (intptr_t)1 << log2_dcz_bytes; + tag_bytes = (intptr_t)1 << log2_tag_bytes; + ptr &= -dcz_bytes; + + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes, + MMU_DATA_STORE, tag_bytes, ra); + if (mem) { + int tag_pair = (val & 0xf) * 0x11; + memset(mem, tag_pair, tag_bytes); + } +} + +static void mte_sync_check_fail(CPUARMState *env, uint32_t desc, + uint64_t dirty_ptr, uintptr_t ra) +{ + int is_write, syn; + + env->exception.vaddress = dirty_ptr; + + is_write = FIELD_EX32(desc, MTEDESC, WRITE); + syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write, + 0x11); + raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra); + g_assert_not_reached(); +} + +static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr, + uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el) +{ + int select; + + if (regime_has_2_ranges(arm_mmu_idx)) { + select = extract64(dirty_ptr, 55, 1); + } else { + select = 0; + } + env->cp15.tfsr_el[el] |= 1 << select; +#ifdef CONFIG_USER_ONLY + /* + * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT, + * which then sends a SIGSEGV when the thread is next scheduled. + * This cpu will return to the main loop at the end of the TB, + * which is rather sooner than "normal". But the alternative + * is waiting until the next syscall. + */ + qemu_cpu_kick(env_cpu(env)); +#endif +} + +/* Record a tag check failure. */ +static void mte_check_fail(CPUARMState *env, uint32_t desc, + uint64_t dirty_ptr, uintptr_t ra) +{ + int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); + int el, reg_el, tcf; + uint64_t sctlr; + + reg_el = regime_el(env, arm_mmu_idx); + sctlr = env->cp15.sctlr_el[reg_el]; + + switch (arm_mmu_idx) { + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E20_0: + el = 0; + tcf = extract64(sctlr, 38, 2); + break; + default: + el = reg_el; + tcf = extract64(sctlr, 40, 2); + } + + switch (tcf) { + case 1: + /* Tag check fail causes a synchronous exception. */ + mte_sync_check_fail(env, desc, dirty_ptr, ra); + break; + + case 0: + /* + * Tag check fail does not affect the PE. + * We eliminate this case by not setting MTE_ACTIVE + * in tb_flags, so that we never make this runtime call. + */ + g_assert_not_reached(); + + case 2: + /* Tag check fail causes asynchronous flag set. 
*/ + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); + break; + + case 3: + /* + * Tag check fail causes asynchronous flag set for stores, or + * a synchronous exception for loads. + */ + if (FIELD_EX32(desc, MTEDESC, WRITE)) { + mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el); + } else { + mte_sync_check_fail(env, desc, dirty_ptr, ra); + } + break; + } +} + +/** + * checkN: + * @tag: tag memory to test + * @odd: true to begin testing at tags at odd nibble + * @cmp: the tag to compare against + * @count: number of tags to test + * + * Return the number of successful tests. + * Thus a return value < @count indicates a failure. + * + * A note about sizes: count is expected to be small. + * + * The most common use will be LDP/STP of two integer registers, + * which means 16 bytes of memory touching at most 2 tags, but + * often the access is aligned and thus just 1 tag. + * + * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory, + * touching at most 5 tags. SVE LDR/STR (vector) with the default + * vector length is also 64 bytes; the maximum architectural length + * is 256 bytes touching at most 9 tags. + * + * The loop below uses 7 logical operations and 1 memory operation + * per tag pair. An implementation that loads an aligned word and + * uses masking to ignore adjacent tags requires 18 logical operations + * and thus does not begin to pay off until 6 tags. + * Which, according to the survey above, is unlikely to be common. + */ +static int checkN(uint8_t *mem, int odd, int cmp, int count) +{ + int n = 0, diff; + + /* Replicate the test tag and compare. */ + cmp *= 0x11; + diff = *mem++ ^ cmp; + + if (odd) { + goto start_odd; + } + + while (1) { + /* Test even tag. */ + if (unlikely((diff) & 0x0f)) { + break; + } + if (++n == count) { + break; + } + + start_odd: + /* Test odd tag. */ + if (unlikely((diff) & 0xf0)) { + break; + } + if (++n == count) { + break; + } + + diff = *mem++ ^ cmp; + } + return n; +} + +/** + * mte_probe_int() - helper for mte_probe and mte_check + * @env: CPU environment + * @desc: MTEDESC descriptor + * @ptr: virtual address of the base of the access + * @fault: return virtual address of the first check failure + * + * Internal routine for both mte_probe and mte_check. + * Return zero on failure, filling in *fault. + * Return negative on trivial success for tbi disabled. + * Return positive on success with tbi enabled. + */ +static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr, + uintptr_t ra, uint64_t *fault) +{ + int mmu_idx, ptr_tag, bit55; + uint64_t ptr_last, prev_page, next_page; + uint64_t tag_first, tag_last; + uint64_t tag_byte_first, tag_byte_last; + uint32_t sizem1, tag_count, tag_size, n, c; + uint8_t *mem1, *mem2; + MMUAccessType type; + + bit55 = extract64(ptr, 55, 1); + *fault = ptr; + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ + if (unlikely(!tbi_check(desc, bit55))) { + return -1; + } + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + return 1; + } + + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD; + sizem1 = FIELD_EX32(desc, MTEDESC, SIZEM1); + + /* Find the addr of the end of the access */ + ptr_last = ptr + sizem1; + + /* Round the bounds to the tag granule, and compute the number of tags. 
*/ + tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE); + tag_last = QEMU_ALIGN_DOWN(ptr_last, TAG_GRANULE); + tag_count = ((tag_last - tag_first) / TAG_GRANULE) + 1; + + /* Round the bounds to twice the tag granule, and compute the bytes. */ + tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE); + tag_byte_last = QEMU_ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE); + + /* Locate the page boundaries. */ + prev_page = ptr & TARGET_PAGE_MASK; + next_page = prev_page + TARGET_PAGE_SIZE; + + if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) { + /* Memory access stays on one page. */ + tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1; + mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1, + MMU_DATA_LOAD, tag_size, ra); + if (!mem1) { + return 1; + } + /* Perform all of the comparisons. */ + n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count); + } else { + /* Memory access crosses to next page. */ + tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE); + mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr, + MMU_DATA_LOAD, tag_size, ra); + + tag_size = ((tag_byte_last - next_page) / (2 * TAG_GRANULE)) + 1; + mem2 = allocation_tag_mem(env, mmu_idx, next_page, type, + ptr_last - next_page + 1, + MMU_DATA_LOAD, tag_size, ra); + + /* + * Perform all of the comparisons. + * Note the possible but unlikely case of the operation spanning + * two pages that do not both have tagging enabled. + */ + n = c = (next_page - tag_first) / TAG_GRANULE; + if (mem1) { + n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c); + } + if (n == c) { + if (!mem2) { + return 1; + } + n += checkN(mem2, 0, ptr_tag, tag_count - c); + } + } + + if (likely(n == tag_count)) { + return 1; + } + + /* + * If we failed, we know which granule. For the first granule, the + * failure address is @ptr, the first byte accessed. Otherwise the + * failure address is the first byte of the nth granule. + */ + if (n > 0) { + *fault = tag_first + n * TAG_GRANULE; + } + return 0; +} + +uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra) +{ + uint64_t fault; + int ret = mte_probe_int(env, desc, ptr, ra, &fault); + + if (unlikely(ret == 0)) { + mte_check_fail(env, desc, fault, ra); + } else if (ret < 0) { + return ptr; + } + return useronly_clean_ptr(ptr); +} + +uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + return mte_check(env, desc, ptr, GETPC()); +} + +/* + * No-fault version of mte_check, to be used by SVE for MemSingleNF. + * Returns false if the access is Checked and the check failed. This + * is only intended to probe the tag -- the validity of the page must + * be checked beforehand. + */ +bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + uint64_t fault; + int ret = mte_probe_int(env, desc, ptr, 0, &fault); + + return ret != 0; +} + +/* + * Perform an MTE checked access for DC_ZVA. + */ +uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + uintptr_t ra = GETPC(); + int log2_dcz_bytes, log2_tag_bytes; + int mmu_idx, bit55; + intptr_t dcz_bytes, tag_bytes, i; + void *mem; + uint64_t ptr_tag, mem_tag, align_ptr; + + bit55 = extract64(ptr, 55, 1); + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ + if (unlikely(!tbi_check(desc, bit55))) { + return ptr; + } + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + goto done; + } + + /* + * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1, + * i.e. 
32 bytes, which is an unreasonably small dcz anyway, to make + * sure that we can access one complete tag byte here. + */ + log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; + log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); + dcz_bytes = (intptr_t)1 << log2_dcz_bytes; + tag_bytes = (intptr_t)1 << log2_tag_bytes; + align_ptr = ptr & -dcz_bytes; + + /* + * Trap if accessing an invalid page. DC_ZVA requires that we supply + * the original pointer for an invalid page. But watchpoints require + * that we probe the actual space. So do both. + */ + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + (void) probe_write(env, ptr, 1, mmu_idx, ra); + mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE, + dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra); + if (!mem) { + goto done; + } + + /* + * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus + * it is quite easy to perform all of the comparisons at once without + * any extra masking. + * + * The most common zva block size is 64; some of the thunderx cpus use + * a block size of 128. For user-only, aarch64_max_initfn will set the + * block size to 512. Fill out the other cases for future-proofing. + * + * In order to be able to find the first miscompare later, we want the + * tag bytes to be in little-endian order. + */ + switch (log2_tag_bytes) { + case 0: /* zva_blocksize 32 */ + mem_tag = *(uint8_t *)mem; + ptr_tag *= 0x11u; + break; + case 1: /* zva_blocksize 64 */ + mem_tag = cpu_to_le16(*(uint16_t *)mem); + ptr_tag *= 0x1111u; + break; + case 2: /* zva_blocksize 128 */ + mem_tag = cpu_to_le32(*(uint32_t *)mem); + ptr_tag *= 0x11111111u; + break; + case 3: /* zva_blocksize 256 */ + mem_tag = cpu_to_le64(*(uint64_t *)mem); + ptr_tag *= 0x1111111111111111ull; + break; + + default: /* zva_blocksize 512, 1024, 2048 */ + ptr_tag *= 0x1111111111111111ull; + i = 0; + do { + mem_tag = cpu_to_le64(*(uint64_t *)(mem + i)); + if (unlikely(mem_tag != ptr_tag)) { + goto fail; + } + i += 8; + align_ptr += 16 * TAG_GRANULE; + } while (i < tag_bytes); + goto done; + } + + if (likely(mem_tag == ptr_tag)) { + goto done; + } + + fail: + /* Locate the first nibble that differs. */ + i = ctz64(mem_tag ^ ptr_tag) >> 4; + mte_check_fail(env, desc, align_ptr + i * TAG_GRANULE, ra); + + done: + return useronly_clean_ptr(ptr); +} diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c new file mode 100644 index 0000000000..403b345ea3 --- /dev/null +++ b/target/arm/tcg/mve_helper.c @@ -0,0 +1,3450 @@ +/* + * M-profile MVE Operations + * + * Copyright (c) 2021 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "vec_internal.h" +#include "exec/helper-proto.h" +#include "exec/cpu_ldst.h" +#include "exec/exec-all.h" +#include "tcg/tcg.h" +#include "fpu/softfloat.h" + +static uint16_t mve_eci_mask(CPUARMState *env) +{ + /* + * Return the mask of which elements in the MVE vector correspond + * to beats being executed. The mask has 1 bits for executed lanes + * and 0 bits where ECI says this beat was already executed. + */ + int eci; + + if ((env->condexec_bits & 0xf) != 0) { + return 0xffff; + } + + eci = env->condexec_bits >> 4; + switch (eci) { + case ECI_NONE: + return 0xffff; + case ECI_A0: + return 0xfff0; + case ECI_A0A1: + return 0xff00; + case ECI_A0A1A2: + case ECI_A0A1A2B0: + return 0xf000; + default: + g_assert_not_reached(); + } +} + +static uint16_t mve_element_mask(CPUARMState *env) +{ + /* + * Return the mask of which elements in the MVE vector should be + * updated. This is a combination of multiple things: + * (1) by default, we update every lane in the vector + * (2) VPT predication stores its state in the VPR register; + * (3) low-overhead-branch tail predication will mask out part + * the vector on the final iteration of the loop + * (4) if EPSR.ECI is set then we must execute only some beats + * of the insn + * We combine all these into a 16-bit result with the same semantics + * as VPR.P0: 0 to mask the lane, 1 if it is active. + * 8-bit vector ops will look at all bits of the result; + * 16-bit ops will look at bits 0, 2, 4, ...; + * 32-bit ops will look at bits 0, 4, 8 and 12. + * Compare pseudocode GetCurInstrBeat(), though that only returns + * the 4-bit slice of the mask corresponding to a single beat. + */ + uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); + + if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) { + mask |= 0xff; + } + if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) { + mask |= 0xff00; + } + + if (env->v7m.ltpsize < 4 && + env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) { + /* + * Tail predication active, and this is the last loop iteration. + * The element size is (1 << ltpsize), and we only want to process + * loopcount elements, so we want to retain the least significant + * (loopcount * esize) predicate bits and zero out bits above that. + */ + int masklen = env->regs[14] << env->v7m.ltpsize; + assert(masklen <= 16); + uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; + mask &= ltpmask; + } + + /* + * ECI bits indicate which beats are already executed; + * we handle this by effectively predicating them out. + */ + mask &= mve_eci_mask(env); + return mask; +} + +static void mve_advance_vpt(CPUARMState *env) +{ + /* Advance the VPT and ECI state if necessary */ + uint32_t vpr = env->v7m.vpr; + unsigned mask01, mask23; + uint16_t inv_mask; + uint16_t eci_mask = mve_eci_mask(env); + + if ((env->condexec_bits & 0xf) == 0) { + env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ? 
+ (ECI_A0 << 4) : (ECI_NONE << 4); + } + + if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) { + /* VPT not enabled, nothing to do */ + return; + } + + /* Invert P0 bits if needed, but only for beats we actually executed */ + mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01); + mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23); + /* Start by assuming we invert all bits corresponding to executed beats */ + inv_mask = eci_mask; + if (mask01 <= 8) { + /* MASK01 says don't invert low half of P0 */ + inv_mask &= ~0xff; + } + if (mask23 <= 8) { + /* MASK23 says don't invert high half of P0 */ + inv_mask &= ~0xff00; + } + vpr ^= inv_mask; + /* Only update MASK01 if beat 1 executed */ + if (eci_mask & 0xf0) { + vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1); + } + /* Beat 3 always executes, so update MASK23 */ + vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1); + env->v7m.vpr = vpr; +} + +/* For loads, predicated lanes are zeroed instead of keeping their old values */ +#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned b, e; \ + /* \ + * R_SXTM allows the dest reg to become UNKNOWN for abandoned \ + * beats so we don't care if we update part of the dest and \ + * then take an exception. \ + */ \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (eci_mask & (1 << b)) { \ + d[H##ESIZE(e)] = (mask & (1 << b)) ? \ + cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned b, e; \ + for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \ + if (mask & (1 << b)) { \ + cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + } \ + addr += MSIZE; \ + } \ + mve_advance_vpt(env); \ + } + +DO_VLDR(vldrb, 1, ldub, 1, uint8_t) +DO_VLDR(vldrh, 2, lduw, 2, uint16_t) +DO_VLDR(vldrw, 4, ldl, 4, uint32_t) + +DO_VSTR(vstrb, 1, stb, 1, uint8_t) +DO_VSTR(vstrh, 2, stw, 2, uint16_t) +DO_VSTR(vstrw, 4, stl, 4, uint32_t) + +DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t) +DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t) +DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t) +DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t) +DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) +DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t) + +DO_VSTR(vstrb_h, 1, stb, 2, int16_t) +DO_VSTR(vstrb_w, 1, stb, 4, int32_t) +DO_VSTR(vstrh_w, 2, stw, 4, int32_t) + +#undef DO_VLDR +#undef DO_VSTR + +/* + * Gather loads/scatter stores. Here each element of Qm specifies + * an offset to use from the base register Rm. In the _os_ versions + * that offset is scaled by the element size. + * For loads, predicated lanes are zeroed instead of retaining + * their previous values. + */ +#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + TYPE *d = vd; \ + OFFTYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H##ESIZE(e)]); \ + d[H##ESIZE(e)] = (mask & 1) ? 
\ + cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \ + if (WB) { \ + m[H##ESIZE(e)] = addr; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +/* We know here TYPE is unsigned so always the same as the offset type */ +#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + TYPE *d = vd; \ + TYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H##ESIZE(e)]); \ + if (mask & 1) { \ + cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \ + } \ + if (WB) { \ + m[H##ESIZE(e)] = addr; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +/* + * 64-bit accesses are slightly different: they are done as two 32-bit + * accesses, controlled by the predicate mask for the relevant beat, + * and with a single 32-bit offset in the first of the two Qm elements. + * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little). + * Address writeback happens on the odd beats and updates the address + * stored in the even-beat element. + */ +#define DO_VLDR64_SG(OP, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + uint32_t *d = vd; \ + uint32_t *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H4(e & ~1)]); \ + addr += 4 * (e & 1); \ + d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \ + if (WB && (e & 1)) { \ + m[H4(e & ~1)] = addr - 4; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSTR64_SG(OP, ADDRFN, WB) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \ + uint32_t base) \ + { \ + uint32_t *d = vd; \ + uint32_t *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + unsigned e; \ + uint32_t addr; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \ + if (!(eci_mask & 1)) { \ + continue; \ + } \ + addr = ADDRFN(base, m[H4(e & ~1)]); \ + addr += 4 * (e & 1); \ + if (mask & 1) { \ + cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \ + } \ + if (WB && (e & 1)) { \ + m[H4(e & ~1)] = addr - 4; \ + } \ + } \ + mve_advance_vpt(env); \ + } + +#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET)) +#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1)) +#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2)) +#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3)) + +DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false) + +DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false) +DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false) +DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false) + +DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false) 
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false) +DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false) +DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false) + +DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false) +DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false) +DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false) +DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false) + +DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false) +DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false) +DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false) + +DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true) +DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true) +DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true) +DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true) + +/* + * Deinterleaving loads/interleaving stores. + * + * For these helpers we are passed the index of the first Qreg + * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3) + * and the value of the base address register Rn. + * The helpers are specialized for pattern and element size, so + * for instance vld42h is VLD4 with pattern 2, element size MO_16. + * + * These insns are beatwise but not predicated, so we must honour ECI, + * but need not look at mve_element_mask(). + * + * The pseudocode implements these insns with multiple memory accesses + * of the element size, but rules R_VVVG and R_FXDM permit us to make + * one 32-bit memory access per beat. 
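+ *
+ * For example, for beat 0 of vld40b the offset is 0, so that beat
+ * loads one 32-bit word from the base address and writes its four
+ * bytes, least significant first, to byte 0 of each of Qn .. Qn+3.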
+ */ +#define DO_VLD4B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 4; e++, data >>= 8) { \ + uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ + qd[H1(off[beat])] = data; \ + } \ + } \ + } + +#define DO_VLD4H(OP, O1, O2) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O1, O2, O2 }; \ + uint32_t addr, data; \ + int y; /* y counts 0 2 0 2 */ \ + uint16_t *qd; \ + for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 8 + (beat & 1) * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ + qd[H2(off[beat])] = data; \ + data >>= 16; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ + qd[H2(off[beat])] = data; \ + } \ + } + +#define DO_VLD4W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + int y; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + y = (beat + (O1 & 2)) & 3; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ + qd[H4(off[beat] >> 2)] = data; \ + } \ + } + +DO_VLD4B(vld40b, 0, 1, 10, 11) +DO_VLD4B(vld41b, 2, 3, 12, 13) +DO_VLD4B(vld42b, 4, 5, 14, 15) +DO_VLD4B(vld43b, 6, 7, 8, 9) + +DO_VLD4H(vld40h, 0, 5) +DO_VLD4H(vld41h, 1, 6) +DO_VLD4H(vld42h, 2, 7) +DO_VLD4H(vld43h, 3, 4) + +DO_VLD4W(vld40w, 0, 1, 10, 11) +DO_VLD4W(vld41w, 2, 3, 12, 13) +DO_VLD4W(vld42w, 4, 5, 14, 15) +DO_VLD4W(vld43w, 6, 7, 8, 9) + +#define DO_VLD2B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint8_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 2; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 4; e++, data >>= 8) { \ + qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ + qd[H1(off[beat] + (e >> 1))] = data; \ + } \ + } \ + } + +#define DO_VLD2H(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + int e; \ + uint16_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + for (e = 0; e < 2; e++, data >>= 16) 
{ \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ + qd[H2(off[beat])] = data; \ + } \ + } \ + } + +#define DO_VLD2W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat]; \ + data = cpu_ldl_le_data_ra(env, addr, GETPC()); \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ + qd[H4(off[beat] >> 3)] = data; \ + } \ + } + +DO_VLD2B(vld20b, 0, 2, 12, 14) +DO_VLD2B(vld21b, 4, 6, 8, 10) + +DO_VLD2H(vld20h, 0, 1, 6, 7) +DO_VLD2H(vld21h, 2, 3, 4, 5) + +DO_VLD2W(vld20w, 0, 4, 24, 28) +DO_VLD2W(vld21w, 8, 12, 16, 20) + +#define DO_VST4B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = 0; \ + for (e = 3; e >= 0; e--) { \ + uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \ + data = (data << 8) | qd[H1(off[beat])]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST4H(OP, O1, O2) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O1, O2, O2 }; \ + uint32_t addr, data; \ + int y; /* y counts 0 2 0 2 */ \ + uint16_t *qd; \ + for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 8 + (beat & 1) * 4; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \ + data = qd[H2(off[beat])]; \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \ + data |= qd[H2(off[beat])] << 16; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST4W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + int y; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + y = (beat + (O1 & 2)) & 3; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \ + data = qd[H4(off[beat] >> 2)]; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +DO_VST4B(vst40b, 0, 1, 10, 11) +DO_VST4B(vst41b, 2, 3, 12, 13) +DO_VST4B(vst42b, 4, 5, 14, 15) +DO_VST4B(vst43b, 6, 7, 8, 9) + +DO_VST4H(vst40h, 0, 5) +DO_VST4H(vst41h, 1, 6) +DO_VST4H(vst42h, 2, 7) +DO_VST4H(vst43h, 3, 4) + +DO_VST4W(vst40w, 0, 1, 10, 11) +DO_VST4W(vst41w, 2, 3, 12, 13) +DO_VST4W(vst42w, 4, 5, 14, 15) +DO_VST4W(vst43w, 6, 7, 8, 9) + +#define DO_VST2B(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat, e; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint8_t *qd; \ + for (beat = 0; beat 
< 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 2; \ + data = 0; \ + for (e = 3; e >= 0; e--) { \ + qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \ + data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST2H(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + int e; \ + uint16_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat] * 4; \ + data = 0; \ + for (e = 1; e >= 0; e--) { \ + qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \ + data = (data << 16) | qd[H2(off[beat])]; \ + } \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +#define DO_VST2W(OP, O1, O2, O3, O4) \ + void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \ + uint32_t base) \ + { \ + int beat; \ + uint16_t mask = mve_eci_mask(env); \ + static const uint8_t off[4] = { O1, O2, O3, O4 }; \ + uint32_t addr, data; \ + uint32_t *qd; \ + for (beat = 0; beat < 4; beat++, mask >>= 4) { \ + if ((mask & 1) == 0) { \ + /* ECI says skip this beat */ \ + continue; \ + } \ + addr = base + off[beat]; \ + qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \ + data = qd[H4(off[beat] >> 3)]; \ + cpu_stl_le_data_ra(env, addr, data, GETPC()); \ + } \ + } + +DO_VST2B(vst20b, 0, 2, 12, 14) +DO_VST2B(vst21b, 4, 6, 8, 10) + +DO_VST2H(vst20h, 0, 1, 6, 7) +DO_VST2H(vst21h, 2, 3, 4, 5) + +DO_VST2W(vst20w, 0, 4, 24, 28) +DO_VST2W(vst21w, 8, 12, 16, 20) + +/* + * The mergemask(D, R, M) macro performs the operation "*D = R" but + * storing only the bytes which correspond to 1 bits in M, + * leaving other bytes in *D unchanged. We use _Generic + * to select the correct implementation based on the type of D. 
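+ *
+ * As a minimal sketch of the technique (with made-up helper names,
+ * purely for illustration):
+ *
+ *   static void put_u8(uint8_t *d, uint8_t r) { *d = r; }
+ *   static void put_u16(uint16_t *d, uint16_t r) { *d = r; }
+ *   #define put(D, R) \
+ *       _Generic(D, uint8_t *: put_u8, uint16_t *: put_u16)(D, R)
+ *
+ * _Generic picks the helper whose type matches D at compile time;
+ * mergemask() below does the same across all eight element types.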
+ */ + +static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask) +{ + if (mask & 1) { + *d = r; + } +} + +static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask) +{ + mergemask_ub((uint8_t *)d, r, mask); +} + +static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask) +{ + uint16_t bmask = expand_pred_b(mask); + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask) +{ + mergemask_uh((uint16_t *)d, r, mask); +} + +static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask) +{ + uint32_t bmask = expand_pred_b(mask); + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask) +{ + mergemask_uw((uint32_t *)d, r, mask); +} + +static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask) +{ + uint64_t bmask = expand_pred_b(mask); + *d = (*d & ~bmask) | (r & bmask); +} + +static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask) +{ + mergemask_uq((uint64_t *)d, r, mask); +} + +#define mergemask(D, R, M) \ + _Generic(D, \ + uint8_t *: mergemask_ub, \ + int8_t *: mergemask_sb, \ + uint16_t *: mergemask_uh, \ + int16_t *: mergemask_sh, \ + uint32_t *: mergemask_uw, \ + int32_t *: mergemask_sw, \ + uint64_t *: mergemask_uq, \ + int64_t *: mergemask_sq)(D, R, M) + +void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val) +{ + /* + * The generated code already replicated an 8 or 16 bit constant + * into the 32-bit value, so we only need to write the 32-bit + * value to all elements of the Qreg, allowing for predication. + */ + uint32_t *d = vd; + uint16_t mask = mve_element_mask(env); + unsigned e; + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + mergemask(&d[H4(e)], val, mask); + } + mve_advance_vpt(env); +} + +#define DO_1OP(OP, ESIZE, TYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_CLS_B(N) (clrsb32(N) - 24) +#define DO_CLS_H(N) (clrsb32(N) - 16) + +DO_1OP(vclsb, 1, int8_t, DO_CLS_B) +DO_1OP(vclsh, 2, int16_t, DO_CLS_H) +DO_1OP(vclsw, 4, int32_t, clrsb32) + +#define DO_CLZ_B(N) (clz32(N) - 24) +#define DO_CLZ_H(N) (clz32(N) - 16) + +DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B) +DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H) +DO_1OP(vclzw, 4, uint32_t, clz32) + +DO_1OP(vrev16b, 2, uint16_t, bswap16) +DO_1OP(vrev32b, 4, uint32_t, bswap32) +DO_1OP(vrev32h, 4, uint32_t, hswap32) +DO_1OP(vrev64b, 8, uint64_t, bswap64) +DO_1OP(vrev64h, 8, uint64_t, hswap64) +DO_1OP(vrev64w, 8, uint64_t, wswap64) + +#define DO_NOT(N) (~(N)) + +DO_1OP(vmvn, 8, uint64_t, DO_NOT) + +#define DO_ABS(N) ((N) < 0 ? 
-(N) : (N)) +#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff)) +#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff)) + +DO_1OP(vabsb, 1, int8_t, DO_ABS) +DO_1OP(vabsh, 2, int16_t, DO_ABS) +DO_1OP(vabsw, 4, int32_t, DO_ABS) + +/* We can do these 64 bits at a time */ +DO_1OP(vfabsh, 8, uint64_t, DO_FABSH) +DO_1OP(vfabss, 8, uint64_t, DO_FABSS) + +#define DO_NEG(N) (-(N)) +#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000)) +#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000)) + +DO_1OP(vnegb, 1, int8_t, DO_NEG) +DO_1OP(vnegh, 2, int16_t, DO_NEG) +DO_1OP(vnegw, 4, int32_t, DO_NEG) + +/* We can do these 64 bits at a time */ +DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH) +DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS) + +/* + * 1 operand immediates: Vda is destination and possibly also one source. + * All these insns work at 64-bit widths. + */ +#define DO_1OP_IMM(OP, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \ + { \ + uint64_t *da = vda; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ + mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_MOVI(N, I) (I) +#define DO_ANDI(N, I) ((N) & (I)) +#define DO_ORRI(N, I) ((N) | (I)) + +DO_1OP_IMM(vmovi, DO_MOVI) +DO_1OP_IMM(vandi, DO_ANDI) +DO_1OP_IMM(vorri, DO_ORRI) + +#define DO_2OP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_U(OP, FN) \ + DO_2OP(OP##b, 1, uint8_t, FN) \ + DO_2OP(OP##h, 2, uint16_t, FN) \ + DO_2OP(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_S(OP, FN) \ + DO_2OP(OP##b, 1, int8_t, FN) \ + DO_2OP(OP##h, 2, int16_t, FN) \ + DO_2OP(OP##w, 4, int32_t, FN) + +/* + * "Long" operations where two half-sized inputs (taken from either the + * top or the bottom of the input vector) produce a double-width result. + * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output. 
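+ *
+ * For example, vmullbsh is DO_2OP_L with TOP == 0, TYPE == int16_t and
+ * LTYPE == int32_t: result element le is the full 32-bit product of the
+ * even-numbered (bottom) 16-bit elements n[2 * le] and m[2 * le], while
+ * the vmullt* forms use TOP == 1 and multiply the odd-numbered elements.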
+ */ +#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \ + m[H##ESIZE(le * 2 + TOP)]); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op helpers for all sizes */ +#define DO_2OP_SAT_U(OP, FN) \ + DO_2OP_SAT(OP##b, 1, uint8_t, FN) \ + DO_2OP_SAT(OP##h, 2, uint16_t, FN) \ + DO_2OP_SAT(OP##w, 4, uint32_t, FN) + +/* provide signed 2-op helpers for all sizes */ +#define DO_2OP_SAT_S(OP, FN) \ + DO_2OP_SAT(OP##b, 1, int8_t, FN) \ + DO_2OP_SAT(OP##h, 2, int16_t, FN) \ + DO_2OP_SAT(OP##w, 4, int32_t, FN) + +#define DO_AND(N, M) ((N) & (M)) +#define DO_BIC(N, M) ((N) & ~(M)) +#define DO_ORR(N, M) ((N) | (M)) +#define DO_ORN(N, M) ((N) | ~(M)) +#define DO_EOR(N, M) ((N) ^ (M)) + +DO_2OP(vand, 8, uint64_t, DO_AND) +DO_2OP(vbic, 8, uint64_t, DO_BIC) +DO_2OP(vorr, 8, uint64_t, DO_ORR) +DO_2OP(vorn, 8, uint64_t, DO_ORN) +DO_2OP(veor, 8, uint64_t, DO_EOR) + +#define DO_ADD(N, M) ((N) + (M)) +#define DO_SUB(N, M) ((N) - (M)) +#define DO_MUL(N, M) ((N) * (M)) + +DO_2OP_U(vadd, DO_ADD) +DO_2OP_U(vsub, DO_SUB) +DO_2OP_U(vmul, DO_MUL) + +DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL) + +DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL) +DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL) +DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL) +DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL) +DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL) +DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) + +/* + * Polynomial multiply. We can always do this generating 64 bits + * of the result at a time, so we don't need to use DO_2OP_L. + */ +#define VMULLPH_MASK 0x00ff00ff00ff00ffULL +#define VMULLPW_MASK 0x0000ffff0000ffffULL +#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) +#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) +#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) +#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) + +DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) +DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) +DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) +DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) + +/* + * Because the computation type is at least twice as large as required, + * these work for both signed and unsigned source types. 
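+ *
+ * For example a 0xff byte reaches do_mulh_b() as -1 when the caller's
+ * element type is int8_t but as 255 when it is uint8_t; either way the
+ * int32_t product is exact, so ">> 8" returns the correct high half for
+ * that signedness.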
+ */ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_w(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +static inline uint8_t do_rmulh_b(int32_t n, int32_t m) +{ + return (n * m + (1U << 7)) >> 8; +} + +static inline uint16_t do_rmulh_h(int32_t n, int32_t m) +{ + return (n * m + (1U << 15)) >> 16; +} + +static inline uint32_t do_rmulh_w(int64_t n, int64_t m) +{ + return (n * m + (1U << 31)) >> 32; +} + +DO_2OP(vmulhsb, 1, int8_t, do_mulh_b) +DO_2OP(vmulhsh, 2, int16_t, do_mulh_h) +DO_2OP(vmulhsw, 4, int32_t, do_mulh_w) +DO_2OP(vmulhub, 1, uint8_t, do_mulh_b) +DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h) +DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w) + +DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b) +DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h) +DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w) +DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b) +DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h) +DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w) + +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) + +DO_2OP_S(vmaxs, DO_MAX) +DO_2OP_U(vmaxu, DO_MAX) +DO_2OP_S(vmins, DO_MIN) +DO_2OP_U(vminu, DO_MIN) + +#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) + +DO_2OP_S(vabds, DO_ABD) +DO_2OP_U(vabdu, DO_ABD) + +static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n + m) >> 1; +} + +static inline int32_t do_vhadd_s(int32_t n, int32_t m) +{ + return ((int64_t)n + m) >> 1; +} + +static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m) +{ + return ((uint64_t)n - m) >> 1; +} + +static inline int32_t do_vhsub_s(int32_t n, int32_t m) +{ + return ((int64_t)n - m) >> 1; +} + +DO_2OP_S(vhadds, do_vhadd_s) +DO_2OP_U(vhaddu, do_vhadd_u) +DO_2OP_S(vhsubs, do_vhsub_s) +DO_2OP_U(vhsubu, do_vhsub_u) + +#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL) +#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) +#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL) + +DO_2OP_S(vshls, DO_VSHLS) +DO_2OP_U(vshlu, DO_VSHLU) +DO_2OP_S(vrshls, DO_VRSHLS) +DO_2OP_U(vrshlu, DO_VRSHLU) + +#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1) +#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1) + +DO_2OP_S(vrhadds, DO_RHADD_S) +DO_2OP_U(vrhaddu, DO_RHADD_U) + +static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m, + uint32_t inv, uint32_t carry_in, bool update_flags) +{ + uint16_t mask = mve_element_mask(env); + unsigned e; + + /* If any additions trigger, we will update flags. */ + if (mask & 0x1111) { + update_flags = true; + } + + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + uint64_t r = carry_in; + r += n[H4(e)]; + r += m[H4(e)] ^ inv; + if (mask & 1) { + carry_in = r >> 32; + } + mergemask(&d[H4(e)], r, mask); + } + + if (update_flags) { + /* Store C, clear NZV. 
*/ + env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK; + env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C; + } + mve_advance_vpt(env); +} + +void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, 0, carry_in, false); +} + +void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C; + do_vadc(env, vd, vn, vm, -1, carry_in, false); +} + + +void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, 0, 0, true); +} + +void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + do_vadc(env, vd, vn, vm, -1, 1, true); +} + +#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE r[16 / ESIZE]; \ + /* Calculate all results first to avoid overwriting inputs */ \ + for (e = 0; e < 16 / ESIZE; e++) { \ + if (!(e & 1)) { \ + r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \ + } else { \ + r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \ + } \ + } \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], r[e], mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VCADD_ALL(OP, FN0, FN1) \ + DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \ + DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \ + DO_VCADD(OP##w, 4, int32_t, FN0, FN1) + +DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD) +DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB) +DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s) +DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s) + +static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s) +{ + if (val > max) { + *s = true; + return max; + } else if (val < min) { + *s = true; + return min; + } + return val; +} + +#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s) +#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s) +#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s) + +#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s) +#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s) +#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s) + +#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s) +#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s) +#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s) + +#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s) +#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s) +#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s) + +/* + * For QDMULH and QRDMULH we simplify "double and shift by esize" into + * "shift by esize-1", adjusting the QRDMULH rounding constant to match. 
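+ *
+ * For the 16-bit case, the architectural
+ *     (2 * n * m + (1 << 15)) >> 16
+ * is computed below as the equal value
+ *     (n * m + (1 << 14)) >> 15
+ * which is why DO_QRDMULH_H uses 1 << 14 as its rounding constant.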
+ */ +#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ + INT32_MIN, INT32_MAX, s) + +#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ + INT8_MIN, INT8_MAX, s) +#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ + INT16_MIN, INT16_MAX, s) +#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ + INT32_MIN, INT32_MAX, s) + +DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W) + +DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W) + +DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W) + +/* + * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs() + * and friends wanting a uint32_t* sat and our needing a bool*. + */ +#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \ + ({ \ + uint32_t su32 = 0; \ + typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \ + if (su32) { \ + *satp = true; \ + } \ + r; \ + }) + +#define DO_SQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp) +#define DO_UQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp) +#define DO_SQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp) +#define DO_UQRSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp) +#define DO_SUQSHL_OP(N, M, satp) \ + WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp) + +DO_2OP_SAT_S(vqshls, DO_SQSHL_OP) +DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP) +DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP) +DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP) + +/* + * Multiply add dual returning high half + * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of + * whether to add the rounding constant, and the pointer to the + * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant", + * saturate to twice the input size and return the high half; or + * (A * B - C * D) etc for VQDMLSDH. 
+ */ +#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + if ((e & 1) == XCHG) { \ + TYPE r = FN(n[H##ESIZE(e)], \ + m[H##ESIZE(e - XCHG)], \ + n[H##ESIZE(e + (1 - 2 * XCHG))], \ + m[H##ESIZE(e + (1 - XCHG))], \ + ROUND, &sat); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* + * Architecturally we should do the entire add, double, round + * and then check for saturation. We do three saturating adds, + * but we need to be careful about the order. If the first + * m1 + m2 saturates then it's impossible for the *2+rc to + * bring it back into the non-saturated range. However, if + * m1 + m2 is negative then it's possible that doing the doubling + * would take the intermediate result below INT64_MAX and the + * addition of the rounding constant then brings it back in range. + * So we add half the rounding constant before doubling rather + * than adding the rounding constant after the doubling. + */ + if (sadd64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d, + int round, bool *sat) +{ + int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d, + int round, bool *sat) +{ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c * d; + int64_t r; + /* The same ordering issue as in do_vqdmladh_w applies here too */ + if (ssub64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w) +DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b) +DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h) +DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w) +DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b) +DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h) +DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w) + +DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w) + +DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w) +DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b) +DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h) +DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w) + +#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \ + mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* "accumulating" version where FN takes d as well as n and m */ +#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], \ + FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \ + mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op scalar helpers for all sizes */ +#define DO_2OP_SCALAR_U(OP, 
FN) \ + DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, uint32_t, FN) +#define DO_2OP_SCALAR_S(OP, FN) \ + DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \ + DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \ + DO_2OP_SCALAR(OP##w, 4, int32_t, FN) + +#define DO_2OP_ACC_SCALAR_U(OP, FN) \ + DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN) \ + DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN) \ + DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN) + +DO_2OP_SCALAR_U(vadd_scalar, DO_ADD) +DO_2OP_SCALAR_U(vsub_scalar, DO_SUB) +DO_2OP_SCALAR_U(vmul_scalar, DO_MUL) +DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s) +DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u) +DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s) +DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u) + +DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B) +DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H) +DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W) +DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B) +DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H) +DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W) + +DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B) +DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H) +DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W) +DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B) +DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H) +DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W) + +DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B) +DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H) +DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H) +DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W) + +static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat) +{ + int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7); + return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8; +} + +static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c, + int round, bool *sat) +{ + int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15); + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16; +} + +static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c, + int round, bool *sat) +{ + /* + * Architecturally we should do the entire add, double, round + * and then check for saturation. We do three saturating adds, + * but we need to be careful about the order. If the first + * m1 + m2 saturates then it's impossible for the *2+rc to + * bring it back into the non-saturated range. However, if + * m1 + m2 is negative then it's possible that doing the doubling + * would take the intermediate result below INT64_MAX and the + * addition of the rounding constant then brings it back in range. + * So we add half the rounding constant and half the "c << esize" + * before doubling rather than adding the rounding constant after + * the doubling. + */ + int64_t m1 = (int64_t)a * b; + int64_t m2 = (int64_t)c << 31; + int64_t r; + if (sadd64_overflow(m1, m2, &r) || + sadd64_overflow(r, (round << 30), &r) || + sadd64_overflow(r, r, &r)) { + *sat = true; + return r < 0 ? 
INT32_MAX : INT32_MIN; + } + return r >> 32; +} + +/* + * The *MLAH insns are vector * scalar + vector; + * the *MLASH insns are vector * vector + scalar + */ +#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S) +#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S) +#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S) +#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S) +#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S) +#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S) + +#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S) +#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S) +#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S) +#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S) +#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S) +#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S) + +DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B) +DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H) +DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H) +DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W) + +DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B) +DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H) +DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H) +DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W) + +/* Vector by scalar plus vector */ +#define DO_VMLA(D, N, M) ((N) * (M) + (D)) + +DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA) + +/* Vector by vector plus scalar */ +#define DO_VMLAS(D, N, M) ((N) * (D) + (M)) + +DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS) + +/* + * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the + * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type. + * SATMASK specifies which bits of the predicate mask matter for determining + * whether to propagate a saturation indication into FPSCR.QC -- for + * the 16x16->32 case we must check only the bit corresponding to the T or B + * half that we used, but for the 32x32->64 case we propagate if the mask + * bit is set for either half. 
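+ *
+ * Concretely, the predicate mask has one bit per byte of the vector, so
+ * for the 16x16->32 case the B input half of an output lane occupies
+ * bytes 0..1 (SATMASK16B == 1) and the T half bytes 2..3
+ * (SATMASK16T == 1 << 2), while SATMASK32 == (1 << 4) | 1 tests byte 0
+ * and byte 4, i.e. either half of the 64-bit output lane.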
+ */ +#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + uint32_t rm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn; \ + TYPE m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat) +{ + int64_t r = ((int64_t)n * m) * 2; + return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat); +} + +static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat) +{ + /* The multiply can't overflow, but the doubling might */ + int64_t r = (int64_t)n * m; + if (r > INT64_MAX / 2) { + *sat = true; + return INT64_MAX; + } else if (r < INT64_MIN / 2) { + *sat = true; + return INT64_MIN; + } else { + return r * 2; + } +} + +#define SATMASK16B 1 +#define SATMASK16T (1 << 2) +#define SATMASK32 ((1 << 4) | 1) + +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16B) +DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \ + do_qdmullh, SATMASK16T) +DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \ + do_qdmullw, SATMASK32) + +/* + * Long saturating ops + */ +#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \ + void *vm) \ + { \ + LTYPE *d = vd; \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + bool qc = false; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \ + LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \ + mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \ + qc |= sat && (mask & SATMASK); \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B) +DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) +DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T) +DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32) + +static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit8(n); + if (m < 8) { + n >>= 8 - m; + } + return n; +} + +static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit16(n); + if (m < 16) { + n >>= 16 - m; + } + return n; +} + +static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m) +{ + m &= 0xff; + if (m == 0) { + return 0; + } + n = revbit32(n); + if (m < 32) { + n >>= 32 - m; + } + return n; +} + +DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb) +DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh) +DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw) + +/* + * Multiply add long dual accumulate ops. 
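+ *
+ * EVENACC and ODDACC select "+=" or "-=" for the even- and odd-numbered
+ * lanes (so vmlsldavsh adds the even-lane products and subtracts the
+ * odd-lane ones), and XCHG swaps which element of each pair of n is
+ * multiplied with a given element of m, giving the "x" forms.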
+ */ +#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=) +DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=) +DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=) +DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=) + +DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=) +DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=) + +DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=) +DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=) +DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=) +DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=) + +/* + * Multiply add dual accumulate ops + */ +#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint32_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + if (e & 1) { \ + a ODDACC \ + n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \ + } else { \ + a EVENACC \ + n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \ + } \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC) + +#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \ + DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC) + +DO_DAV_S(vmladavs, false, +=, +=) +DO_DAV_U(vmladavu, false, +=, +=) +DO_DAV_S(vmlsdav, false, +=, -=) +DO_DAV_S(vmladavsx, true, +=, +=) +DO_DAV_S(vmlsdavx, true, +=, -=) + +/* + * Rounding multiply add long dual accumulate high. In the pseudocode + * this is implemented with a 72-bit internal accumulator value of which + * the top 64 bits are returned. We optimize this to avoid having to + * use 128-bit arithmetic -- we can do this because the 74-bit accumulator + * is squashed back into 64-bits after each beat. 
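+ * Each beat below therefore adds the product scaled down by 256 and
+ * rounded: e.g. a product of 0x180 contributes
+ * (0x180 >> 8) + ((0x180 >> 7) & 1) == 2 to the 64-bit accumulator.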
+ */ +#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint64_t a) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *n = vn, *m = vm; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ + if (mask & 1) { \ + LTYPE mul; \ + if (e & 1) { \ + mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \ + if (SUB) { \ + mul = -mul; \ + } \ + } else { \ + mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \ + } \ + mul = (mul >> 8) + ((mul >> 7) & 1); \ + a += mul; \ + } \ + } \ + mve_advance_vpt(env); \ + return a; \ + } + +DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false) +DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false) + +DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false) + +DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true) +DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true) + +/* Vector add across vector */ +#define DO_VADDV(OP, ESIZE, TYPE) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + ra += m[H##ESIZE(e)]; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +DO_VADDV(vaddvsb, 1, int8_t) +DO_VADDV(vaddvsh, 2, int16_t) +DO_VADDV(vaddvsw, 4, int32_t) +DO_VADDV(vaddvub, 1, uint8_t) +DO_VADDV(vaddvuh, 2, uint16_t) +DO_VADDV(vaddvuw, 4, uint32_t) + +/* + * Vector max/min across vector. Unlike VADDV, we must + * read ra as the element size, not its full width. + * We work with int64_t internally for simplicity. + */ +#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra_in) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + int64_t ra = (RATYPE)ra_in; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + ra = FN(ra, m[H##ESIZE(e)]); \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +#define DO_VMAXMINV_U(INSN, FN) \ + DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \ + DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \ + DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN) +#define DO_VMAXMINV_S(INSN, FN) \ + DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \ + DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \ + DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN) + +/* + * Helpers for max and min of absolute values across vector: + * note that we only take the absolute value of 'm', not 'n' + */ +static int64_t do_maxa(int64_t n, int64_t m) +{ + if (m < 0) { + m = -m; + } + return MAX(n, m); +} + +static int64_t do_mina(int64_t n, int64_t m) +{ + if (m < 0) { + m = -m; + } + return MIN(n, m); +} + +DO_VMAXMINV_S(vmaxvs, DO_MAX) +DO_VMAXMINV_U(vmaxvu, DO_MAX) +DO_VMAXMINV_S(vminvs, DO_MIN) +DO_VMAXMINV_U(vminvu, DO_MIN) +/* + * VMAXAV, VMINAV treat the general purpose input as unsigned + * and the vector elements as signed. 
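+ * For example, for VMAXAV.S8 an initial Rda of 200 is read as unsigned
+ * 200, while an element of -100 contributes its absolute value 100, so
+ * the running maximum stays 200.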
+ */ +DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa) +DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa) +DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa) +DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina) +DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina) +DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina) + +#define DO_VABAV(OP, ESIZE, TYPE) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + void *vm, uint32_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm, *n = vn; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + int64_t n0 = n[H##ESIZE(e)]; \ + int64_t m0 = m[H##ESIZE(e)]; \ + uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \ + ra += r; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } + +DO_VABAV(vabavsb, 1, int8_t) +DO_VABAV(vabavsh, 2, int16_t) +DO_VABAV(vabavsw, 4, int32_t) +DO_VABAV(vabavub, 1, uint8_t) +DO_VABAV(vabavuh, 2, uint16_t) +DO_VABAV(vabavuw, 4, uint32_t) + +#define DO_VADDLV(OP, TYPE, LTYPE) \ + uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint64_t ra) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + for (e = 0; e < 16 / 4; e++, mask >>= 4) { \ + if (mask & 1) { \ + ra += (LTYPE)m[H4(e)]; \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +DO_VADDLV(vaddlv_s, int32_t, int64_t) +DO_VADDLV(vaddlv_u, uint32_t, uint64_t) + +/* Shifts by immediate */ +#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], \ + FN(m[H##ESIZE(e)], shift), mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], \ + FN(m[H##ESIZE(e)], shift, &sat), mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +/* provide unsigned 2-op shift helpers for all sizes */ +#define DO_2SHIFT_U(OP, FN) \ + DO_2SHIFT(OP##b, 1, uint8_t, FN) \ + DO_2SHIFT(OP##h, 2, uint16_t, FN) \ + DO_2SHIFT(OP##w, 4, uint32_t, FN) +#define DO_2SHIFT_S(OP, FN) \ + DO_2SHIFT(OP##b, 1, int8_t, FN) \ + DO_2SHIFT(OP##h, 2, int16_t, FN) \ + DO_2SHIFT(OP##w, 4, int32_t, FN) + +#define DO_2SHIFT_SAT_U(OP, FN) \ + DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \ + DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \ + DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN) +#define DO_2SHIFT_SAT_S(OP, FN) \ + DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \ + DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \ + DO_2SHIFT_SAT(OP##w, 4, int32_t, FN) + +DO_2SHIFT_U(vshli_u, DO_VSHLU) +DO_2SHIFT_S(vshli_s, DO_VSHLS) +DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP) +DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP) +DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP) +DO_2SHIFT_U(vrshli_u, DO_VRSHLU) +DO_2SHIFT_S(vrshli_s, DO_VRSHLS) +DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP) +DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP) + +/* Shift-and-insert; we always work with 64 bits at a time */ +#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t 
shift) \ + { \ + uint64_t *d = vd, *m = vm; \ + uint16_t mask; \ + uint64_t shiftmask; \ + unsigned e; \ + if (shift == ESIZE * 8) { \ + /* \ + * Only VSRI can shift by
<dt>
; it should mean "don't \ + * update the destination". The generic logic can't handle \ + * this because it would try to shift by an out-of-range \ + * amount, so special case it here. \ + */ \ + goto done; \ + } \ + assert(shift < ESIZE * 8); \ + mask = mve_element_mask(env); \ + /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \ + shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \ + for (e = 0; e < 16 / 8; e++, mask >>= 8) { \ + uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \ + (d[H8(e)] & ~shiftmask); \ + mergemask(&d[H8(e)], r, mask); \ + } \ +done: \ + mve_advance_vpt(env); \ + } + +#define DO_SHL(N, SHIFT) ((N) << (SHIFT)) +#define DO_SHR(N, SHIFT) ((N) >> (SHIFT)) +#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT)) +#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT)) + +DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK) +DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK) +DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK) +DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK) +DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK) +DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK) + +/* + * Long shifts taking half-sized inputs from top or bottom of the input + * vector and producing a double-width result. ESIZE, TYPE are for + * the input, and LESIZE, LTYPE for the output. + * Unlike the normal shift helpers, we do not handle negative shift counts, + * because the long shift is strictly left-only. + */ +#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + LTYPE *d = vd; \ + TYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + assert(shift <= 16); \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \ + mergemask(&d[H##LESIZE(le)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSHLL_ALL(OP, TOP) \ + DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \ + DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \ + DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \ + DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \ + +DO_VSHLL_ALL(vshllb, false) +DO_VSHLL_ALL(vshllt, true) + +/* + * Narrowing right shifts, taking a double sized input, shifting it + * and putting the result in either the top or bottom half of the output. + * ESIZE, TYPE are the output, and LESIZE, LTYPE the input. 
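+ * For example, VSHRNB with 16-bit inputs and a shift of 4 shifts each
+ * 16-bit element right by 4 and writes the low 8 bits of the result to
+ * the even-numbered (bottom) byte of the output; the VSHRNT form writes
+ * the odd-numbered (top) byte instead.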
+ */ +#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + TYPE r = FN(m[H##LESIZE(le)], shift); \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSHRN_ALL(OP, FN) \ + DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \ + DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \ + DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \ + DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN) + +static inline uint64_t do_urshr(uint64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else if (sh == 64) { + return x >> 63; + } else { + return 0; + } +} + +static inline int64_t do_srshr(int64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else { + /* Rounding the sign bit always produces 0. */ + return 0; + } +} + +DO_VSHRN_ALL(vshrn, DO_SHR) +DO_VSHRN_ALL(vrshrn, do_urshr) + +static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max, + bool *satp) +{ + if (val > max) { + *satp = true; + return max; + } else if (val < min) { + *satp = true; + return min; + } else { + return val; + } +} + +/* Saturating narrowing right shifts */ +#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \ + void *vm, uint32_t shift) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + bool qc = false; \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ + DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) + +#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ + DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) + +#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ + DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) + +#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \ + DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ + DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) + +#define DO_SHRN_SB(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP) +#define DO_SHRN_UB(N, M, SATP) \ + do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP) +#define DO_SHRUN_B(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP) + +#define DO_SHRN_SH(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP) +#define DO_SHRN_UH(N, M, SATP) \ + do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP) +#define DO_SHRUN_H(N, M, SATP) \ + do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP) + +#define DO_RSHRN_SB(N, M, SATP) \ + do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP) +#define DO_RSHRN_UB(N, M, SATP) \ + do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP) +#define DO_RSHRUN_B(N, M, SATP) \ + do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP) + +#define DO_RSHRN_SH(N, M, SATP) \ + 
do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP) +#define DO_RSHRN_UH(N, M, SATP) \ + do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP) +#define DO_RSHRUN_H(N, M, SATP) \ + do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP) + +DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB) +DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH) +DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB) +DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH) +DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B) +DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H) + +DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB) +DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH) +DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB) +DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH) +DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B) +DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H) + +#define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], \ + m[H##LESIZE(le)], mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t) +DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t) +DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t) +DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t) + +#define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + LTYPE *m = vm; \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + bool qc = false; \ + unsigned le; \ + mask >>= ESIZE * TOP; \ + for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \ + bool sat = false; \ + TYPE r = FN(m[H##LESIZE(le)], &sat); \ + mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VMOVN_SAT_UB(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \ + DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN) + +#define DO_VMOVN_SAT_UH(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \ + DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN) + +#define DO_VMOVN_SAT_SB(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \ + DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN) + +#define DO_VMOVN_SAT_SH(BOP, TOP, FN) \ + DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \ + DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN) + +#define DO_VQMOVN_SB(N, SATP) \ + do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP) +#define DO_VQMOVN_UB(N, SATP) \ + do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP) +#define DO_VQMOVUN_B(N, SATP) \ + do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP) + +#define DO_VQMOVN_SH(N, SATP) \ + do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP) +#define DO_VQMOVN_UH(N, SATP) \ + do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP) +#define DO_VQMOVUN_H(N, SATP) \ + do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP) + +DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB) +DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH) +DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB) +DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH) +DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B) +DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H) + +uint32_t HELPER(mve_vshlc)(CPUARMState *env, 
void *vd, uint32_t rdm, + uint32_t shift) +{ + uint32_t *d = vd; + uint16_t mask = mve_element_mask(env); + unsigned e; + uint32_t r; + + /* + * For each 32-bit element, we shift it left, bringing in the + * low 'shift' bits of rdm at the bottom. Bits shifted out at + * the top become the new rdm, if the predicate mask permits. + * The final rdm value is returned to update the register. + * shift == 0 here means "shift by 32 bits". + */ + if (shift == 0) { + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + r = rdm; + if (mask & 1) { + rdm = d[H4(e)]; + } + mergemask(&d[H4(e)], r, mask); + } + } else { + uint32_t shiftmask = MAKE_64BIT_MASK(0, shift); + + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + r = (d[H4(e)] << shift) | (rdm & shiftmask); + if (mask & 1) { + rdm = d[H4(e)] >> (32 - shift); + } + mergemask(&d[H4(e)], r, mask); + } + } + mve_advance_vpt(env); + return rdm; +} + +uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl_d(n, -(int8_t)shift, false, NULL); +} + +uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl_d(n, (int8_t)shift, false, NULL); +} + +uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl_d(n, (int8_t)shift, false, &env->QF); +} + +uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl_d(n, (int8_t)shift, false, &env->QF); +} + +uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF); +} + +uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl_d(n, (int8_t)shift, true, &env->QF); +} + +/* Operate on 64-bit values, but saturate at 48 bits */ +static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + int64_t val, extval; + + if (shift <= -48) { + /* Rounding the sign bit always produces 0. */ + if (round) { + return 0; + } + return src >> 63; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + val = (src >> 1) + (src & 1); + } else { + val = src >> -shift; + } + extval = sextract64(val, 0, 48); + if (!sat || val == extval) { + return extval; + } + } else if (shift < 48) { + int64_t extval = sextract64(src << shift, 0, 48); + if (!sat || src == (extval >> shift)) { + return extval; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return src >= 0 ? 
MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17); +} + +/* Operate on 64-bit values, but saturate at 48 bits */ +static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + uint64_t val, extval; + + if (shift <= -(48 + round)) { + return 0; + } else if (shift < 0) { + if (round) { + val = src >> (-shift - 1); + val = (val >> 1) + (val & 1); + } else { + val = src >> -shift; + } + extval = extract64(val, 0, 48); + if (!sat || val == extval) { + return extval; + } + } else if (shift < 48) { + uint64_t extval = extract64(src << shift, 0, 48); + if (!sat || src == (extval >> shift)) { + return extval; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return MAKE_64BIT_MASK(0, 48); +} + +uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF); +} + +uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift) +{ + return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF); +} + +uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); +} + +uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF); +} + +uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF); +} + +uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift) +{ + return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF); +} + +#define DO_VIDUP(OP, ESIZE, TYPE, FN) \ + uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ + uint32_t offset, uint32_t imm) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], offset, mask); \ + offset = FN(offset, imm); \ + } \ + mve_advance_vpt(env); \ + return offset; \ + } + +#define DO_VIWDUP(OP, ESIZE, TYPE, FN) \ + uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \ + uint32_t offset, uint32_t wrap, \ + uint32_t imm) \ + { \ + TYPE *d = vd; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], offset, mask); \ + offset = FN(offset, wrap, imm); \ + } \ + mve_advance_vpt(env); \ + return offset; \ + } + +#define DO_VIDUP_ALL(OP, FN) \ + DO_VIDUP(OP##b, 1, int8_t, FN) \ + DO_VIDUP(OP##h, 2, int16_t, FN) \ + DO_VIDUP(OP##w, 4, int32_t, FN) + +#define DO_VIWDUP_ALL(OP, FN) \ + DO_VIWDUP(OP##b, 1, int8_t, FN) \ + DO_VIWDUP(OP##h, 2, int16_t, FN) \ + DO_VIWDUP(OP##w, 4, int32_t, FN) + +static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) +{ + offset += imm; + if (offset == wrap) { + offset = 0; + } + return offset; +} + +static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm) +{ + if (offset == 0) { + offset = wrap; + } + offset -= imm; + return offset; +} + +DO_VIDUP_ALL(vidup, DO_ADD) +DO_VIWDUP_ALL(viwdup, do_add_wrap) +DO_VIWDUP_ALL(vdwdup, do_sub_wrap) + +/* + * Vector comparison. + * P0 bits for non-executed beats (where eci_mask is 0) are unchanged. + * P0 bits for predicated lanes in executed beats (where mask is 0) are 0. + * P0 bits otherwise are updated with the results of the comparisons. + * We must also keep unchanged the MASK fields at the top of v7m.vpr. 
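+ * Concretely, the update below is
+ *     vpr = (vpr & ~eci_mask) | (beatpred & mask & eci_mask);
+ * and since eci_mask covers only the low 16 P0 bits, the MASK fields in
+ * the upper bits of vpr are untouched.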
+ */ +#define DO_VCMP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ + { \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++) { \ + bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + emask <<= ESIZE; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + uint32_t rm) \ + { \ + TYPE *n = vn; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++) { \ + bool r = FN(n[H##ESIZE(e)], (TYPE)rm); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + emask <<= ESIZE; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_S(OP, FN) \ + DO_VCMP(OP##b, 1, int8_t, FN) \ + DO_VCMP(OP##h, 2, int16_t, FN) \ + DO_VCMP(OP##w, 4, int32_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN) + +#define DO_VCMP_U(OP, FN) \ + DO_VCMP(OP##b, 1, uint8_t, FN) \ + DO_VCMP(OP##h, 2, uint16_t, FN) \ + DO_VCMP(OP##w, 4, uint32_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN) \ + DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN) + +#define DO_EQ(N, M) ((N) == (M)) +#define DO_NE(N, M) ((N) != (M)) +#define DO_EQ(N, M) ((N) == (M)) +#define DO_EQ(N, M) ((N) == (M)) +#define DO_GE(N, M) ((N) >= (M)) +#define DO_LT(N, M) ((N) < (M)) +#define DO_GT(N, M) ((N) > (M)) +#define DO_LE(N, M) ((N) <= (M)) + +DO_VCMP_U(vcmpeq, DO_EQ) +DO_VCMP_U(vcmpne, DO_NE) +DO_VCMP_U(vcmpcs, DO_GE) +DO_VCMP_U(vcmphi, DO_GT) +DO_VCMP_S(vcmpge, DO_GE) +DO_VCMP_S(vcmplt, DO_LT) +DO_VCMP_S(vcmpgt, DO_GT) +DO_VCMP_S(vcmple, DO_LE) + +void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm) +{ + /* + * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n] + * but note that whether bytes are written to Qd is still subject + * to (all forms of) predication in the usual way. + */ + uint64_t *d = vd, *n = vn, *m = vm; + uint16_t mask = mve_element_mask(env); + uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0); + unsigned e; + for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) { + uint64_t r = m[H8(e)]; + mergemask(&r, n[H8(e)], p0); + mergemask(&d[H8(e)], r, mask); + } + mve_advance_vpt(env); +} + +void HELPER(mve_vpnot)(CPUARMState *env) +{ + /* + * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged. + * P0 bits for predicated lanes in executed bits (where mask is 0) are 0. + * P0 bits otherwise are inverted. + * (This is the same logic as VCMP.) + * This insn is itself subject to predication and to beat-wise execution, + * and after it executes VPT state advances in the usual way. 
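+ * Concretely: a P0 bit with mask 1 and eci_mask 1 is inverted, a bit with
+ * mask 0 but eci_mask 1 is cleared, and a bit with eci_mask 0 keeps its
+ * old value.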
+ */ + uint16_t mask = mve_element_mask(env); + uint16_t eci_mask = mve_eci_mask(env); + uint16_t beatpred = ~env->v7m.vpr & mask; + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask); + mve_advance_vpt(env); +} + +/* + * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed, + * otherwise set according to value of Rn. The calculation of + * newmask here works in the same way as the calculation of the + * ltpmask in mve_element_mask(), but we have pre-calculated + * the masklen in the generated code. + */ +void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen) +{ + uint16_t mask = mve_element_mask(env); + uint16_t eci_mask = mve_eci_mask(env); + uint16_t newmask; + + assert(masklen <= 16); + newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0; + newmask &= mask; + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask); + mve_advance_vpt(env); +} + +#define DO_1OP_SAT(OP, ESIZE, TYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + bool qc = false; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + bool sat = false; \ + mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \ + qc |= sat & mask & 1; \ + } \ + if (qc) { \ + env->vfp.qc[0] = qc; \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VQABS_B(N, SATP) \ + do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP) +#define DO_VQABS_H(N, SATP) \ + do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP) +#define DO_VQABS_W(N, SATP) \ + do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP) + +#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP) +#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP) +#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP) + +DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B) +DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H) +DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W) + +DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B) +DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H) +DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W) + +/* + * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its + * absolute value; we then do an unsigned comparison. + */ +#define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN) \ + void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \ + { \ + UTYPE *d = vd; \ + STYPE *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + UTYPE r = DO_ABS(m[H##ESIZE(e)]); \ + r = FN(d[H##ESIZE(e)], r); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX) +DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX) +DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX) +DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN) +DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN) +DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN) + +/* + * 2-operand floating point. Note that if an element is partially + * predicated we must do the FP operation to update the non-predicated + * bytes, but we must be careful to avoid updating the FP exception + * state unless byte 0 of the element was unpredicated. 
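+ * The macros below do this by operating on a local scratch copy of the
+ * float_status whenever byte 0 of the element is predicated, so any
+ * exception flags raised for that element are discarded.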
+ */ +#define DO_2OP_FP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_FP_ALL(OP, FN) \ + DO_2OP_FP(OP##h, 2, float16, float16_##FN) \ + DO_2OP_FP(OP##s, 4, float32, float32_##FN) + +DO_2OP_FP_ALL(vfadd, add) +DO_2OP_FP_ALL(vfsub, sub) +DO_2OP_FP_ALL(vfmul, mul) + +static inline float16 float16_abd(float16 a, float16 b, float_status *s) +{ + return float16_abs(float16_sub(a, b, s)); +} + +static inline float32 float32_abd(float32 a, float32 b, float_status *s) +{ + return float32_abs(float32_sub(a, b, s)); +} + +DO_2OP_FP_ALL(vfabd, abd) +DO_2OP_FP_ALL(vmaxnm, maxnum) +DO_2OP_FP_ALL(vminnm, minnum) + +static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s) +{ + return float16_maxnum(float16_abs(a), float16_abs(b), s); +} + +static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s) +{ + return float32_maxnum(float32_abs(a), float32_abs(b), s); +} + +static inline float16 float16_minnuma(float16 a, float16 b, float_status *s) +{ + return float16_minnum(float16_abs(a), float16_abs(b), s); +} + +static inline float32 float32_minnuma(float32 a, float32 b, float_status *s) +{ + return float32_minnum(float32_abs(a), float32_abs(b), s); +} + +DO_2OP_FP_ALL(vmaxnma, maxnuma) +DO_2OP_FP_ALL(vminnma, minnuma) + +#define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r[16 / ESIZE]; \ + uint16_t tm, mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + /* Calculate all results first to avoid overwriting inputs */ \ + for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) { \ + if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + r[e] = 0; \ + continue; \ + } \ + fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(tm & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + if (!(e & 1)) { \ + r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \ + } else { \ + r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \ + } \ + } \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + mergemask(&d[H##ESIZE(e)], r[e], mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add) +DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add) +DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub) +DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub) + +#define DO_VFMA(OP, ESIZE, TYPE, CHS) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = n[H##ESIZE(e)]; \ + if (CHS) { \ + r = TYPE##_chs(r); \ + } \ + r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \ + 0, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VFMA(vfmah, 2, float16, false) +DO_VFMA(vfmas, 4, float32, false) +DO_VFMA(vfmsh, 2, float16, true) +DO_VFMA(vfmss, 4, float32, true) + +#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, void *vm) \ + { \ + TYPE *d = vd, *n = vn, *m = vm; \ + TYPE r0, r1, e1, e2, e3, e4; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst0, *fpst1; \ + float_status scratch_fpst; \ + /* We loop through pairs of elements at a time */ \ + for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) { \ + continue; \ + } \ + fpst0 = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + fpst1 = fpst0; \ + if (!(mask & 1)) { \ + scratch_fpst = *fpst0; \ + fpst0 = &scratch_fpst; \ + } \ + if (!(mask & (1 << ESIZE))) { \ + scratch_fpst = *fpst1; \ + fpst1 = &scratch_fpst; \ + } \ + switch (ROT) { \ + case 0: \ + e1 = m[H##ESIZE(e)]; \ + e2 = n[H##ESIZE(e)]; \ + e3 = m[H##ESIZE(e + 1)]; \ + e4 = n[H##ESIZE(e)]; \ + break; \ + case 1: \ + e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ + e2 = n[H##ESIZE(e + 1)]; \ + e3 = m[H##ESIZE(e)]; \ + e4 = n[H##ESIZE(e + 1)]; \ + break; \ + case 2: \ + e1 = TYPE##_chs(m[H##ESIZE(e)]); \ + e2 = n[H##ESIZE(e)]; \ + e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \ + e4 = n[H##ESIZE(e)]; \ + break; \ + case 3: \ + e1 = m[H##ESIZE(e + 1)]; \ + e2 = n[H##ESIZE(e + 1)]; \ + e3 = TYPE##_chs(m[H##ESIZE(e)]); \ + e4 = n[H##ESIZE(e + 1)]; \ + break; \ + default: \ + g_assert_not_reached(); \ + } \ + r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0); \ + r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1); \ + mergemask(&d[H##ESIZE(e)], r0, mask); \ + mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S) +#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S) + +#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S) +#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S) + +DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH) +DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS) +DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH) +DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS) +DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH) +DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS) +DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH) +DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS) + +DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH) +DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS) +DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH) +DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS) +DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH) +DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS) +DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH) +DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS) + +#define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE r, m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? 
&env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +#define DO_2OP_FP_SCALAR_ALL(OP, FN) \ + DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN) \ + DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN) + +DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add) +DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub) +DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul) + +#define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vn, uint32_t rm) \ + { \ + TYPE *d = vd, *n = vn; \ + TYPE r, m = rm; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +/* VFMAS is vector * vector + scalar, so swap op2 and op3 */ +#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S) +#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S) + +/* VFMA is vector * scalar + vector */ +DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd) +DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd) +DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH) +DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS) + +/* Floating point max/min across vector. */ +#define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN) \ + uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \ + uint32_t ra_in) \ + { \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + TYPE *m = vm; \ + TYPE ra = (TYPE)ra_in; \ + float_status *fpst = (ESIZE == 2) ? 
\ + &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if (mask & 1) { \ + TYPE v = m[H##ESIZE(e)]; \ + if (TYPE##_is_signaling_nan(ra, fpst)) { \ + ra = TYPE##_silence_nan(ra, fpst); \ + float_raise(float_flag_invalid, fpst); \ + } \ + if (TYPE##_is_signaling_nan(v, fpst)) { \ + v = TYPE##_silence_nan(v, fpst); \ + float_raise(float_flag_invalid, fpst); \ + } \ + if (ABS) { \ + v = TYPE##_abs(v); \ + } \ + ra = FN(ra, v, fpst); \ + } \ + } \ + mve_advance_vpt(env); \ + return ra; \ + } \ + +#define NOP(X) (X) + +DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum) +DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum) +DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum) +DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum) +DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum) +DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum) +DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum) +DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum) + +/* FP compares; note that all comparisons signal InvalidOp for QNaNs */ +#define DO_VCMP_FP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \ + { \ + TYPE *n = vn, *m = vm; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + bool r; \ + for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ + if ((mask & emask) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & (1 << (e * ESIZE)))) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \ + uint32_t rm) \ + { \ + TYPE *n = vn; \ + uint16_t mask = mve_element_mask(env); \ + uint16_t eci_mask = mve_eci_mask(env); \ + uint16_t beatpred = 0; \ + uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + bool r; \ + for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \ + if ((mask & emask) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & (1 << (e * ESIZE)))) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst); \ + /* Comparison sets 0/1 bits for each byte in the element */ \ + beatpred |= r * emask; \ + } \ + beatpred &= mask; \ + env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \ + (beatpred & eci_mask); \ + mve_advance_vpt(env); \ + } + +#define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN) \ + DO_VCMP_FP(VOP, ESIZE, TYPE, FN) \ + DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN) + +/* + * Some care is needed here to get the correct result for the unordered case. 
+ * Architecturally EQ, GE and GT are defined to be false for unordered, but + * the NE, LT and LE comparisons are defined as simple logical inverses of + * EQ, GE and GT and so they must return true for unordered. The softfloat + * comparison functions float*_{eq,le,lt} all return false for unordered. + */ +#define DO_GE16(X, Y, S) float16_le(Y, X, S) +#define DO_GE32(X, Y, S) float32_le(Y, X, S) +#define DO_GT16(X, Y, S) float16_lt(Y, X, S) +#define DO_GT32(X, Y, S) float32_lt(Y, X, S) + +DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq) +DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq) + +DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq) +DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq) + +DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16) +DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32) + +DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16) +DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32) + +DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16) +DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32) + +DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16) +DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32) + +#define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm, \ + uint32_t shift) \ + { \ + TYPE *d = vd, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(m[H##ESIZE(e)], shift, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh) +DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh) +DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero) +DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero) +DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos) +DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos) +DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero) +DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero) + +/* VCVT with specified rmode */ +#define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, \ + void *vd, void *vm, uint32_t rmode) \ + { \ + TYPE *d = vd, *m = vm; \ + TYPE r; \ + uint16_t mask = mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + float_status *base_fpst = (ESIZE == 2) ? 
\ + &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + uint32_t prev_rmode = get_float_rounding_mode(base_fpst); \ + set_float_rounding_mode(rmode, base_fpst); \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = base_fpst; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(m[H##ESIZE(e)], 0, fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + set_float_rounding_mode(prev_rmode, base_fpst); \ + mve_advance_vpt(env); \ + } + +DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh) +DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh) +DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls) +DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls) + +#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S) +#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S) + +DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H) +DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S) + +/* + * VCVT between halfprec and singleprec. As usual for halfprec + * conversions, FZ16 is ignored and AHP is observed. + */ +static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top) +{ + uint16_t *d = vd; + uint32_t *m = vm; + uint16_t r; + uint16_t mask = mve_element_mask(env); + bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); + unsigned e; + float_status *fpst; + float_status scratch_fpst; + float_status *base_fpst = &env->vfp.standard_fp_status; + bool old_fz = get_flush_to_zero(base_fpst); + set_flush_to_zero(false, base_fpst); + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { + continue; + } + fpst = base_fpst; + if (!(mask & 1)) { + /* We need the result but without updating flags */ + scratch_fpst = *fpst; + fpst = &scratch_fpst; + } + r = float32_to_float16(m[H4(e)], ieee, fpst); + mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2)); + } + set_flush_to_zero(old_fz, base_fpst); + mve_advance_vpt(env); +} + +static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top) +{ + uint32_t *d = vd; + uint16_t *m = vm; + uint32_t r; + uint16_t mask = mve_element_mask(env); + bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP); + unsigned e; + float_status *fpst; + float_status scratch_fpst; + float_status *base_fpst = &env->vfp.standard_fp_status; + bool old_fiz = get_flush_inputs_to_zero(base_fpst); + set_flush_inputs_to_zero(false, base_fpst); + for (e = 0; e < 16 / 4; e++, mask >>= 4) { + if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) { + continue; + } + fpst = base_fpst; + if (!(mask & (1 << (top * 2)))) { + /* We need the result but without updating flags */ + scratch_fpst = *fpst; + fpst = &scratch_fpst; + } + r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst); + mergemask(&d[H4(e)], r, mask); + } + set_flush_inputs_to_zero(old_fiz, base_fpst); + mve_advance_vpt(env); +} + +void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_sh(env, vd, vm, 0); +} +void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_sh(env, vd, vm, 1); +} +void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_hs(env, vd, vm, 0); +} +void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm) +{ + do_vcvt_hs(env, vd, vm, 1); +} + +#define DO_1OP_FP(OP, ESIZE, TYPE, FN) \ + void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm) \ + { \ + TYPE *d = vd, *m = vm; \ + TYPE r; \ + uint16_t mask = 
mve_element_mask(env); \ + unsigned e; \ + float_status *fpst; \ + float_status scratch_fpst; \ + for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \ + if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \ + continue; \ + } \ + fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \ + &env->vfp.standard_fp_status; \ + if (!(mask & 1)) { \ + /* We need the result but without updating flags */ \ + scratch_fpst = *fpst; \ + fpst = &scratch_fpst; \ + } \ + r = FN(m[H##ESIZE(e)], fpst); \ + mergemask(&d[H##ESIZE(e)], r, mask); \ + } \ + mve_advance_vpt(env); \ + } + +DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int) +DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int) diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c new file mode 100644 index 0000000000..bc6c4a54e9 --- /dev/null +++ b/target/arm/tcg/neon_helper.c @@ -0,0 +1,1740 @@ +/* + * ARM NEON vector operations. + * + * Copyright (c) 2007, 2008 CodeSourcery. + * Written by Paul Brook + * + * This code is licensed under the GNU GPL v2. + */ +#include "qemu/osdep.h" + +#include "cpu.h" +#include "exec/helper-proto.h" +#include "fpu/softfloat.h" +#include "vec_internal.h" + +#define SIGNBIT (uint32_t)0x80000000 +#define SIGNBIT64 ((uint64_t)1 << 63) + +#define SET_QC() env->vfp.qc[0] = 1 + +#define NEON_TYPE1(name, type) \ +typedef struct \ +{ \ + type v1; \ +} neon_##name; +#if HOST_BIG_ENDIAN +#define NEON_TYPE2(name, type) \ +typedef struct \ +{ \ + type v2; \ + type v1; \ +} neon_##name; +#define NEON_TYPE4(name, type) \ +typedef struct \ +{ \ + type v4; \ + type v3; \ + type v2; \ + type v1; \ +} neon_##name; +#else +#define NEON_TYPE2(name, type) \ +typedef struct \ +{ \ + type v1; \ + type v2; \ +} neon_##name; +#define NEON_TYPE4(name, type) \ +typedef struct \ +{ \ + type v1; \ + type v2; \ + type v3; \ + type v4; \ +} neon_##name; +#endif + +NEON_TYPE4(s8, int8_t) +NEON_TYPE4(u8, uint8_t) +NEON_TYPE2(s16, int16_t) +NEON_TYPE2(u16, uint16_t) +NEON_TYPE1(s32, int32_t) +NEON_TYPE1(u32, uint32_t) +#undef NEON_TYPE4 +#undef NEON_TYPE2 +#undef NEON_TYPE1 + +/* Copy from a uint32_t to a vector structure type. */ +#define NEON_UNPACK(vtype, dest, val) do { \ + union { \ + vtype v; \ + uint32_t i; \ + } conv_u; \ + conv_u.i = (val); \ + dest = conv_u.v; \ + } while(0) + +/* Copy from a vector structure type to a uint32_t. */ +#define NEON_PACK(vtype, dest, val) do { \ + union { \ + vtype v; \ + uint32_t i; \ + } conv_u; \ + conv_u.v = (val); \ + dest = conv_u.i; \ + } while(0) + +#define NEON_DO1 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); +#define NEON_DO2 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); +#define NEON_DO4 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ + NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ + NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ + NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); + +#define NEON_VOP_BODY(vtype, n) \ +{ \ + uint32_t res; \ + vtype vsrc1; \ + vtype vsrc2; \ + vtype vdest; \ + NEON_UNPACK(vtype, vsrc1, arg1); \ + NEON_UNPACK(vtype, vsrc2, arg2); \ + NEON_DO##n; \ + NEON_PACK(vtype, res, vdest); \ + return res; \ +} + +#define NEON_VOP(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ +NEON_VOP_BODY(vtype, n) + +#define NEON_VOP_ENV(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ +NEON_VOP_BODY(vtype, n) + +/* Pairwise operations. */ +/* For 32-bit elements each segment only contains a single element, so + the elementwise and pairwise operations are the same. 
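+   For example, NEON_PDO2 below computes dest.v1 = op(src1.v1, src1.v2)
+   and dest.v2 = op(src2.v1, src2.v2), i.e. each result lane combines a
+   pair of adjacent lanes from one source operand.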
*/ +#define NEON_PDO2 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ + NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); +#define NEON_PDO4 \ + NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ + NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ + NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ + NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ + +#define NEON_POP(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ +{ \ + uint32_t res; \ + vtype vsrc1; \ + vtype vsrc2; \ + vtype vdest; \ + NEON_UNPACK(vtype, vsrc1, arg1); \ + NEON_UNPACK(vtype, vsrc2, arg2); \ + NEON_PDO##n; \ + NEON_PACK(vtype, res, vdest); \ + return res; \ +} + +/* Unary operators. */ +#define NEON_VOP1(name, vtype, n) \ +uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ +{ \ + vtype vsrc1; \ + vtype vdest; \ + NEON_UNPACK(vtype, vsrc1, arg); \ + NEON_DO##n; \ + NEON_PACK(vtype, arg, vdest); \ + return arg; \ +} + + +#define NEON_USAT(dest, src1, src2, type) do { \ + uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + dest = ~0; \ + } else { \ + dest = tmp; \ + }} while(0) +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) +NEON_VOP_ENV(qadd_u8, neon_u8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) +NEON_VOP_ENV(qadd_u16, neon_u16, 2) +#undef NEON_FN +#undef NEON_USAT + +uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (res < a) { + SET_QC(); + res = ~0; + } + return res; +} + +uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + res = src1 + src2; + if (res < src1) { + SET_QC(); + res = ~(uint64_t)0; + } + return res; +} + +#define NEON_SSAT(dest, src1, src2, type) do { \ + int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + if (src2 > 0) { \ + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ + } else { \ + tmp = 1 << (sizeof(type) * 8 - 1); \ + } \ + } \ + dest = tmp; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) +NEON_VOP_ENV(qadd_s8, neon_s8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) +NEON_VOP_ENV(qadd_s16, neon_s16, 2) +#undef NEON_FN +#undef NEON_SSAT + +uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { + SET_QC(); + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + res = src1 + src2; + if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { + SET_QC(); + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; + } + return res; +} + +/* Unsigned saturating accumulate of signed value + * + * Op1/Rn is treated as signed + * Op2/Rd is treated as unsigned + * + * Explicit casting is used to ensure the correct sign extension of + * inputs. The result is treated as a unsigned value and saturated as such. + * + * We use a macro for the 8/16 bit cases which expects signed integers of va, + * vb, and vr for interim calculation and an unsigned 32 bit result value r. 
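+ * For example, USATACC(8, 8) sign-extends bits [15:8] of the first
+ * operand, zero-extends bits [15:8] of the second, adds them, clamps the
+ * sum to [0, 255] (setting QC if it clamps), and deposits the result back
+ * into bits [15:8].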
+ */ + +#define USATACC(bits, shift) \ + do { \ + va = sextract32(a, shift, bits); \ + vb = extract32(b, shift, bits); \ + vr = va + vb; \ + if (vr > UINT##bits##_MAX) { \ + SET_QC(); \ + vr = UINT##bits##_MAX; \ + } else if (vr < 0) { \ + SET_QC(); \ + vr = 0; \ + } \ + r = deposit32(r, shift, bits, vr); \ + } while (0) + +uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int16_t va, vb, vr; + uint32_t r = 0; + + USATACC(8, 0); + USATACC(8, 8); + USATACC(8, 16); + USATACC(8, 24); + return r; +} + +uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int32_t va, vb, vr; + uint64_t r = 0; + + USATACC(16, 0); + USATACC(16, 16); + return r; +} + +#undef USATACC + +uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int64_t va = (int32_t)a; + int64_t vb = (uint32_t)b; + int64_t vr = va + vb; + if (vr > UINT32_MAX) { + SET_QC(); + vr = UINT32_MAX; + } else if (vr < 0) { + SET_QC(); + vr = 0; + } + return vr; +} + +uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint64_t res; + res = a + b; + /* We only need to look at the pattern of SIGN bits to detect + * +ve/-ve saturation + */ + if (~a & b & ~res & SIGNBIT64) { + SET_QC(); + res = UINT64_MAX; + } else if (a & ~b & res & SIGNBIT64) { + SET_QC(); + res = 0; + } + return res; +} + +/* Signed saturating accumulate of unsigned value + * + * Op1/Rn is treated as unsigned + * Op2/Rd is treated as signed + * + * The result is treated as a signed value and saturated as such + * + * We use a macro for the 8/16 bit cases which expects signed integers of va, + * vb, and vr for interim calculation and an unsigned 32 bit result value r. + */ + +#define SSATACC(bits, shift) \ + do { \ + va = extract32(a, shift, bits); \ + vb = sextract32(b, shift, bits); \ + vr = va + vb; \ + if (vr > INT##bits##_MAX) { \ + SET_QC(); \ + vr = INT##bits##_MAX; \ + } else if (vr < INT##bits##_MIN) { \ + SET_QC(); \ + vr = INT##bits##_MIN; \ + } \ + r = deposit32(r, shift, bits, vr); \ + } while (0) + +uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int16_t va, vb, vr; + uint32_t r = 0; + + SSATACC(8, 0); + SSATACC(8, 8); + SSATACC(8, 16); + SSATACC(8, 24); + return r; +} + +uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int32_t va, vb, vr; + uint32_t r = 0; + + SSATACC(16, 0); + SSATACC(16, 16); + + return r; +} + +#undef SSATACC + +uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + int64_t res; + int64_t op1 = (uint32_t)a; + int64_t op2 = (int32_t)b; + res = op1 + op2; + if (res > INT32_MAX) { + SET_QC(); + res = INT32_MAX; + } else if (res < INT32_MIN) { + SET_QC(); + res = INT32_MIN; + } + return res; +} + +uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint64_t res; + res = a + b; + /* We only need to look at the pattern of SIGN bits to detect an overflow */ + if (((a & res) + | (~b & res) + | (a & ~b)) & SIGNBIT64) { + SET_QC(); + res = INT64_MAX; + } + return res; +} + + +#define NEON_USAT(dest, src1, src2, type) do { \ + uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + dest = 0; \ + } else { \ + dest = tmp; \ + }} while(0) +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) +NEON_VOP_ENV(qsub_u8, neon_u8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) +NEON_VOP_ENV(qsub_u16, neon_u16, 2) +#undef NEON_FN +#undef NEON_USAT + 
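As a rough illustration of the saturation test these macros share (widen the operands, do the arithmetic, check whether the value survives truncation back to the lane type, and set the sticky QC flag when it does not), here is a minimal standalone sketch for the 8-bit unsigned add; the function name and the plain int flag are stand-ins for the real helper and env->vfp.qc, not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the NEON_USAT check for the uint8_t lane size. */
static uint8_t qadd_u8_model(uint8_t a, uint8_t b, int *qc)
{
    uint32_t tmp = (uint32_t)a + (uint32_t)b;
    if (tmp != (uint8_t)tmp) {   /* sum does not fit in 8 bits */
        *qc = 1;                 /* the real helpers do SET_QC() here */
        return UINT8_MAX;        /* unsigned saturation: all-ones */
    }
    return (uint8_t)tmp;
}

int main(void)
{
    int qc = 0;
    printf("%d qc=%d\n", qadd_u8_model(200, 100, &qc), qc); /* 255 qc=1 */
    printf("%d qc=%d\n", qadd_u8_model(20, 30, &qc), qc);   /* 50 qc=1, QC stays set */
    return 0;
}

The signed variants clamp to INT8_MAX/INT8_MIN instead, and the USATACC/SSATACC accumulate macros apply the same check lane by lane, rebuilding the packed result with deposit32().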
+uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (res > a) { + SET_QC(); + res = 0; + } + return res; +} + +uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + if (src1 < src2) { + SET_QC(); + res = 0; + } else { + res = src1 - src2; + } + return res; +} + +#define NEON_SSAT(dest, src1, src2, type) do { \ + int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ + if (tmp != (type)tmp) { \ + SET_QC(); \ + if (src2 < 0) { \ + tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ + } else { \ + tmp = 1 << (sizeof(type) * 8 - 1); \ + } \ + } \ + dest = tmp; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) +NEON_VOP_ENV(qsub_s8, neon_s8, 4) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) +NEON_VOP_ENV(qsub_s16, neon_s16, 2) +#undef NEON_FN +#undef NEON_SSAT + +uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { + SET_QC(); + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) +{ + uint64_t res; + + res = src1 - src2; + if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { + SET_QC(); + res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; + } + return res; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 +NEON_VOP(hadd_s8, neon_s8, 4) +NEON_VOP(hadd_u8, neon_u8, 4) +NEON_VOP(hadd_s16, neon_s16, 2) +NEON_VOP(hadd_u16, neon_u16, 2) +#undef NEON_FN + +int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) +{ + int32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if (src1 & src2 & 1) + dest++; + return dest; +} + +uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) +{ + uint32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if (src1 & src2 & 1) + dest++; + return dest; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 +NEON_VOP(rhadd_s8, neon_s8, 4) +NEON_VOP(rhadd_u8, neon_u8, 4) +NEON_VOP(rhadd_s16, neon_s16, 2) +NEON_VOP(rhadd_u16, neon_u16, 2) +#undef NEON_FN + +int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) +{ + int32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if ((src1 | src2) & 1) + dest++; + return dest; +} + +uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) +{ + uint32_t dest; + + dest = (src1 >> 1) + (src2 >> 1); + if ((src1 | src2) & 1) + dest++; + return dest; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 +NEON_VOP(hsub_s8, neon_s8, 4) +NEON_VOP(hsub_u8, neon_u8, 4) +NEON_VOP(hsub_s16, neon_s16, 2) +NEON_VOP(hsub_u16, neon_u16, 2) +#undef NEON_FN + +int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) +{ + int32_t dest; + + dest = (src1 >> 1) - (src2 >> 1); + if ((~src1) & src2 & 1) + dest--; + return dest; +} + +uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) +{ + uint32_t dest; + + dest = (src1 >> 1) - (src2 >> 1); + if ((~src1) & src2 & 1) + dest--; + return dest; +} + +#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 +NEON_POP(pmin_s8, neon_s8, 4) +NEON_POP(pmin_u8, neon_u8, 4) +NEON_POP(pmin_s16, neon_s16, 2) +NEON_POP(pmin_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? 
src1 : src2 +NEON_POP(pmax_s8, neon_s8, 4) +NEON_POP(pmax_u8, neon_u8, 4) +NEON_POP(pmax_s16, neon_s16, 2) +NEON_POP(pmax_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) +NEON_VOP(shl_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) +NEON_VOP(shl_s16, neon_s16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) +NEON_VOP(rshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) +NEON_VOP(rshl_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); +} + +uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, true, NULL); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) +NEON_VOP(rshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) +NEON_VOP(rshl_u16, neon_u16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); +} + +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, true, NULL); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_u16, neon_u16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); +} + +uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshl_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); +} + +uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshlu_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshlu_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); +} + +uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) 
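/*
 * Note on the do_{u,s,su}qrshl_* calls in this block: the arguments are
 * the value, the shift count (a signed byte, so negative counts shift
 * right, as for NEON VSHL by register), the element width in bits, a
 * rounding flag, and either NULL (the plain and rounding shifts, which
 * never saturate) or a pointer to the QC flag (the saturating forms).
 */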
+NEON_VOP_ENV(qrshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) +NEON_VOP_ENV(qrshl_u16, neon_u16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); +} + +uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) +NEON_VOP_ENV(qrshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) +NEON_VOP_ENV(qrshl_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); +} + +uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); +} + +uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) +{ + uint32_t mask; + mask = (a ^ b) & 0x80808080u; + a &= ~0x80808080u; + b &= ~0x80808080u; + return (a + b) ^ mask; +} + +uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) +{ + uint32_t mask; + mask = (a ^ b) & 0x80008000u; + a &= ~0x80008000u; + b &= ~0x80008000u; + return (a + b) ^ mask; +} + +#define NEON_FN(dest, src1, src2) dest = src1 + src2 +NEON_POP(padd_u8, neon_u8, 4) +NEON_POP(padd_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = src1 - src2 +NEON_VOP(sub_u8, neon_u8, 4) +NEON_VOP(sub_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = src1 * src2 +NEON_VOP(mul_u8, neon_u8, 4) +NEON_VOP(mul_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 +NEON_VOP(tst_u8, neon_u8, 4) +NEON_VOP(tst_u16, neon_u16, 2) +NEON_VOP(tst_u32, neon_u32, 1) +#undef NEON_FN + +/* Count Leading Sign/Zero Bits. */ +static inline int do_clz8(uint8_t x) +{ + int n; + for (n = 8; x; n--) + x >>= 1; + return n; +} + +static inline int do_clz16(uint16_t x) +{ + int n; + for (n = 16; x; n--) + x >>= 1; + return n; +} + +#define NEON_FN(dest, src, dummy) dest = do_clz8(src) +NEON_VOP1(clz_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src, dummy) dest = do_clz16(src) +NEON_VOP1(clz_u16, neon_u16, 2) +#undef NEON_FN + +#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 +NEON_VOP1(cls_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 +NEON_VOP1(cls_s16, neon_s16, 2) +#undef NEON_FN + +uint32_t HELPER(neon_cls_s32)(uint32_t x) +{ + int count; + if ((int32_t)x < 0) + x = ~x; + for (count = 32; x; count--) + x = x >> 1; + return count - 1; +} + +/* Bit count. 
*/ +uint32_t HELPER(neon_cnt_u8)(uint32_t x) +{ + x = (x & 0x55555555) + ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); + return x; +} + +/* Reverse bits in each 8 bit word */ +uint32_t HELPER(neon_rbit_u8)(uint32_t x) +{ + x = ((x & 0xf0f0f0f0) >> 4) + | ((x & 0x0f0f0f0f) << 4); + x = ((x & 0x88888888) >> 3) + | ((x & 0x44444444) >> 1) + | ((x & 0x22222222) << 1) + | ((x & 0x11111111) << 3); + return x; +} + +#define NEON_QDMULH16(dest, src1, src2, round) do { \ + uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ + if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ + SET_QC(); \ + tmp = (tmp >> 31) ^ ~SIGNBIT; \ + } else { \ + tmp <<= 1; \ + } \ + if (round) { \ + int32_t old = tmp; \ + tmp += 1 << 15; \ + if ((int32_t)tmp < old) { \ + SET_QC(); \ + tmp = SIGNBIT - 1; \ + } \ + } \ + dest = tmp >> 16; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) +NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) +NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) +#undef NEON_FN +#undef NEON_QDMULH16 + +#define NEON_QDMULH32(dest, src1, src2, round) do { \ + uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ + if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ + SET_QC(); \ + tmp = (tmp >> 63) ^ ~SIGNBIT64; \ + } else { \ + tmp <<= 1; \ + } \ + if (round) { \ + int64_t old = tmp; \ + tmp += (int64_t)1 << 31; \ + if ((int64_t)tmp < old) { \ + SET_QC(); \ + tmp = SIGNBIT64 - 1; \ + } \ + } \ + dest = tmp >> 32; \ + } while(0) +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) +NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) +#undef NEON_FN +#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) +NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) +#undef NEON_FN +#undef NEON_QDMULH32 + +uint32_t HELPER(neon_narrow_u8)(uint64_t x) +{ + return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) + | ((x >> 24) & 0xff000000u); +} + +uint32_t HELPER(neon_narrow_u16)(uint64_t x) +{ + return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); +} + +uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) +{ + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); +} + +uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) +{ + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); +} + +uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) +{ + x &= 0xff80ff80ff80ff80ull; + x += 0x0080008000800080ull; + return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) + | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); +} + +uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) +{ + x &= 0xffff8000ffff8000ull; + x += 0x0000800000008000ull; + return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); +} + +uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) +{ + uint16_t s; + uint8_t d; + uint32_t res = 0; +#define SAT8(n) \ + s = x >> n; \ + if (s & 0x8000) { \ + SET_QC(); \ + } else { \ + if (s > 0xff) { \ + d = 0xff; \ + SET_QC(); \ + } else { \ + d = s; \ + } \ + res |= (uint32_t)d << (n / 2); \ + } + + SAT8(0); + SAT8(16); + SAT8(32); + SAT8(48); +#undef SAT8 + return res; +} + +uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) +{ + uint16_t s; + uint8_t d; + uint32_t res = 0; +#define SAT8(n) \ + s = x >> n; \ + if (s > 0xff) { \ + d = 0xff; \ + SET_QC(); \ + } else { \ + d = s; \ + } \ + res |= (uint32_t)d << (n / 2); + + SAT8(0); + SAT8(16); + SAT8(32); + SAT8(48); +#undef SAT8 
+ return res; +} + +uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) +{ + int16_t s; + uint8_t d; + uint32_t res = 0; +#define SAT8(n) \ + s = x >> n; \ + if (s != (int8_t)s) { \ + d = (s >> 15) ^ 0x7f; \ + SET_QC(); \ + } else { \ + d = s; \ + } \ + res |= (uint32_t)d << (n / 2); + + SAT8(0); + SAT8(16); + SAT8(32); + SAT8(48); +#undef SAT8 + return res; +} + +uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) +{ + uint32_t high; + uint32_t low; + low = x; + if (low & 0x80000000) { + low = 0; + SET_QC(); + } else if (low > 0xffff) { + low = 0xffff; + SET_QC(); + } + high = x >> 32; + if (high & 0x80000000) { + high = 0; + SET_QC(); + } else if (high > 0xffff) { + high = 0xffff; + SET_QC(); + } + return low | (high << 16); +} + +uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) +{ + uint32_t high; + uint32_t low; + low = x; + if (low > 0xffff) { + low = 0xffff; + SET_QC(); + } + high = x >> 32; + if (high > 0xffff) { + high = 0xffff; + SET_QC(); + } + return low | (high << 16); +} + +uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) +{ + int32_t low; + int32_t high; + low = x; + if (low != (int16_t)low) { + low = (low >> 31) ^ 0x7fff; + SET_QC(); + } + high = x >> 32; + if (high != (int16_t)high) { + high = (high >> 31) ^ 0x7fff; + SET_QC(); + } + return (uint16_t)low | (high << 16); +} + +uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) +{ + if (x & 0x8000000000000000ull) { + SET_QC(); + return 0; + } + if (x > 0xffffffffu) { + SET_QC(); + return 0xffffffffu; + } + return x; +} + +uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) +{ + if (x > 0xffffffffu) { + SET_QC(); + return 0xffffffffu; + } + return x; +} + +uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) +{ + if ((int64_t)x != (int32_t)x) { + SET_QC(); + return ((int64_t)x >> 63) ^ 0x7fffffff; + } + return x; +} + +uint64_t HELPER(neon_widen_u8)(uint32_t x) +{ + uint64_t tmp; + uint64_t ret; + ret = (uint8_t)x; + tmp = (uint8_t)(x >> 8); + ret |= tmp << 16; + tmp = (uint8_t)(x >> 16); + ret |= tmp << 32; + tmp = (uint8_t)(x >> 24); + ret |= tmp << 48; + return ret; +} + +uint64_t HELPER(neon_widen_s8)(uint32_t x) +{ + uint64_t tmp; + uint64_t ret; + ret = (uint16_t)(int8_t)x; + tmp = (uint16_t)(int8_t)(x >> 8); + ret |= tmp << 16; + tmp = (uint16_t)(int8_t)(x >> 16); + ret |= tmp << 32; + tmp = (uint16_t)(int8_t)(x >> 24); + ret |= tmp << 48; + return ret; +} + +uint64_t HELPER(neon_widen_u16)(uint32_t x) +{ + uint64_t high = (uint16_t)(x >> 16); + return ((uint16_t)x) | (high << 32); +} + +uint64_t HELPER(neon_widen_s16)(uint32_t x) +{ + uint64_t high = (int16_t)(x >> 16); + return ((uint32_t)(int16_t)x) | (high << 32); +} + +uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ b) & 0x8000800080008000ull; + a &= ~0x8000800080008000ull; + b &= ~0x8000800080008000ull; + return (a + b) ^ mask; +} + +uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ b) & 0x8000000080000000ull; + a &= ~0x8000000080000000ull; + b &= ~0x8000000080000000ull; + return (a + b) ^ mask; +} + +uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) +{ + uint64_t tmp; + uint64_t tmp2; + + tmp = a & 0x0000ffff0000ffffull; + tmp += (a >> 16) & 0x0000ffff0000ffffull; + tmp2 = b & 0xffff0000ffff0000ull; + tmp2 += (b << 16) & 0xffff0000ffff0000ull; + return ( tmp & 0xffff) + | ((tmp >> 16) & 0xffff0000ull) + | ((tmp2 << 16) & 0xffff00000000ull) + | ( tmp2 & 
0xffff000000000000ull); +} + +uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) +{ + uint32_t low = a + (a >> 32); + uint32_t high = b + (b >> 32); + return low + ((uint64_t)high << 32); +} + +uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ ~b) & 0x8000800080008000ull; + a |= 0x8000800080008000ull; + b &= ~0x8000800080008000ull; + return (a - b) ^ mask; +} + +uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) +{ + uint64_t mask; + mask = (a ^ ~b) & 0x8000000080000000ull; + a |= 0x8000000080000000ull; + b &= ~0x8000000080000000ull; + return (a - b) ^ mask; +} + +uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint32_t x, y; + uint32_t low, high; + + x = a; + y = b; + low = x + y; + if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { + SET_QC(); + low = ((int32_t)x >> 31) ^ ~SIGNBIT; + } + x = a >> 32; + y = b >> 32; + high = x + y; + if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { + SET_QC(); + high = ((int32_t)x >> 31) ^ ~SIGNBIT; + } + return low | ((uint64_t)high << 32); +} + +uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) +{ + uint64_t result; + + result = a + b; + if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { + SET_QC(); + result = ((int64_t)a >> 63) ^ ~SIGNBIT64; + } + return result; +} + +/* We have to do the arithmetic in a larger type than + * the input type, because for example with a signed 32 bit + * op the absolute difference can overflow a signed 32 bit value. + */ +#define DO_ABD(dest, x, y, intype, arithtype) do { \ + arithtype tmp_x = (intype)(x); \ + arithtype tmp_y = (intype)(y); \ + dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ + } while(0) + +uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, uint8_t, uint32_t); + DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); + result |= tmp << 16; + DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); + result |= tmp << 32; + DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, int8_t, int32_t); + DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); + result |= tmp << 16; + DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); + result |= tmp << 32; + DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, uint16_t, uint32_t); + DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + DO_ABD(result, a, b, int16_t, int32_t); + DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) +{ + uint64_t result; + DO_ABD(result, a, b, uint32_t, uint64_t); + return result; +} + +uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) +{ + uint64_t result; + DO_ABD(result, a, b, int32_t, int64_t); + return result; +} +#undef DO_ABD + +/* Widening multiply. Named type is the source type. 
*/ +#define DO_MULL(dest, x, y, type1, type2) do { \ + type1 tmp_x = x; \ + type1 tmp_y = y; \ + dest = (type2)((type2)tmp_x * (type2)tmp_y); \ + } while(0) + +uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, uint8_t, uint16_t); + DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); + result |= tmp << 16; + DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); + result |= tmp << 32; + DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, int8_t, uint16_t); + DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); + result |= tmp << 16; + DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); + result |= tmp << 32; + DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); + result |= tmp << 48; + return result; +} + +uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, uint16_t, uint32_t); + DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) +{ + uint64_t tmp; + uint64_t result; + + DO_MULL(result, a, b, int16_t, uint32_t); + DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); + return result | (tmp << 32); +} + +uint64_t HELPER(neon_negl_u16)(uint64_t x) +{ + uint16_t tmp; + uint64_t result; + result = (uint16_t)-x; + tmp = -(x >> 16); + result |= (uint64_t)tmp << 16; + tmp = -(x >> 32); + result |= (uint64_t)tmp << 32; + tmp = -(x >> 48); + result |= (uint64_t)tmp << 48; + return result; +} + +uint64_t HELPER(neon_negl_u32)(uint64_t x) +{ + uint32_t low = -x; + uint32_t high = -(x >> 32); + return low | ((uint64_t)high << 32); +} + +/* Saturating sign manipulation. */ +/* ??? 
Make these use NEON_VOP1 */ +#define DO_QABS8(x) do { \ + if (x == (int8_t)0x80) { \ + x = 0x7f; \ + SET_QC(); \ + } else if (x < 0) { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) +{ + neon_s8 vec; + NEON_UNPACK(neon_s8, vec, x); + DO_QABS8(vec.v1); + DO_QABS8(vec.v2); + DO_QABS8(vec.v3); + DO_QABS8(vec.v4); + NEON_PACK(neon_s8, x, vec); + return x; +} +#undef DO_QABS8 + +#define DO_QNEG8(x) do { \ + if (x == (int8_t)0x80) { \ + x = 0x7f; \ + SET_QC(); \ + } else { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) +{ + neon_s8 vec; + NEON_UNPACK(neon_s8, vec, x); + DO_QNEG8(vec.v1); + DO_QNEG8(vec.v2); + DO_QNEG8(vec.v3); + DO_QNEG8(vec.v4); + NEON_PACK(neon_s8, x, vec); + return x; +} +#undef DO_QNEG8 + +#define DO_QABS16(x) do { \ + if (x == (int16_t)0x8000) { \ + x = 0x7fff; \ + SET_QC(); \ + } else if (x < 0) { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) +{ + neon_s16 vec; + NEON_UNPACK(neon_s16, vec, x); + DO_QABS16(vec.v1); + DO_QABS16(vec.v2); + NEON_PACK(neon_s16, x, vec); + return x; +} +#undef DO_QABS16 + +#define DO_QNEG16(x) do { \ + if (x == (int16_t)0x8000) { \ + x = 0x7fff; \ + SET_QC(); \ + } else { \ + x = -x; \ + }} while (0) +uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) +{ + neon_s16 vec; + NEON_UNPACK(neon_s16, vec, x); + DO_QNEG16(vec.v1); + DO_QNEG16(vec.v2); + NEON_PACK(neon_s16, x, vec); + return x; +} +#undef DO_QNEG16 + +uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) +{ + if (x == SIGNBIT) { + SET_QC(); + x = ~SIGNBIT; + } else if ((int32_t)x < 0) { + x = -x; + } + return x; +} + +uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) +{ + if (x == SIGNBIT) { + SET_QC(); + x = ~SIGNBIT; + } else { + x = -x; + } + return x; +} + +uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x) +{ + if (x == SIGNBIT64) { + SET_QC(); + x = ~SIGNBIT64; + } else if ((int64_t)x < 0) { + x = -x; + } + return x; +} + +uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x) +{ + if (x == SIGNBIT64) { + SET_QC(); + x = ~SIGNBIT64; + } else { + x = -x; + } + return x; +} + +/* NEON Float helpers. */ + +/* Floating point comparisons produce an integer result. + * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. + * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
+ */ +uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); +} + +uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float32_le(make_float32(b), make_float32(a), fpst); +} + +uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + return -float32_lt(make_float32(b), make_float32(a), fpst); +} + +uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float32 f0 = float32_abs(make_float32(a)); + float32 f1 = float32_abs(make_float32(b)); + return -float32_le(f1, f0, fpst); +} + +uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float32 f0 = float32_abs(make_float32(a)); + float32 f1 = float32_abs(make_float32(b)); + return -float32_lt(f1, f0, fpst); +} + +uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float64 f0 = float64_abs(make_float64(a)); + float64 f1 = float64_abs(make_float64(b)); + return -float64_le(f1, f0, fpst); +} + +uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp) +{ + float_status *fpst = fpstp; + float64 f0 = float64_abs(make_float64(a)); + float64 f1 = float64_abs(make_float64(b)); + return -float64_lt(f1, f0, fpst); +} + +#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) + +void HELPER(neon_qunzip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) + | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) + | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) + | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); + uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) + | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) + | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); + uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) + | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) + | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); + uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) + | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) + | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) + | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qunzip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) + | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); + uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); + uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); + uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) + | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qunzip32)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); + uint64_t d1 = 
ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); + uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); + uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_unzip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) + | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) + | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); + uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) + | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) + | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); + + rm[0] = m0; + rd[0] = d0; +} + +void HELPER(neon_unzip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) + | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); + uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) + | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); + + rm[0] = m0; + rd[0] = d0; +} + +void HELPER(neon_qzip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) + | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) + | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) + | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); + uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) + | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) + | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) + | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); + uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) + | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) + | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) + | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); + uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) + | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) + | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) + | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qzip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) + | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); + uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) + | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); + uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) + | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); + uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) + | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_qzip32)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd0 = rd[0], zd1 = rd[1]; + uint64_t zm0 = rm[0], zm1 = rm[1]; + + uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); + uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); + uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); + uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); + + rm[0] = m0; + rm[1] = m1; + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(neon_zip8)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + 
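/*
 * Byte-wise interleave: d0 takes the low four bytes of Vd and Vm
 * alternating (Vd[0], Vm[0], Vd[1], Vm[1], ...), and m0 takes the
 * high four bytes of each in the same pattern.
 */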
uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) + | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) + | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) + | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); + uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) + | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) + | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) + | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); + + rm[0] = m0; + rd[0] = d0; +} + +void HELPER(neon_zip16)(void *vd, void *vm) +{ + uint64_t *rd = vd, *rm = vm; + uint64_t zd = rd[0], zm = rm[0]; + + uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) + | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); + uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) + | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); + + rm[0] = m0; + rd[0] = d0; +} diff --git a/target/arm/tcg/op_helper.c b/target/arm/tcg/op_helper.c new file mode 100644 index 0000000000..3baf8004f6 --- /dev/null +++ b/target/arm/tcg/op_helper.c @@ -0,0 +1,1069 @@ +/* + * ARM helper routines + * + * Copyright (c) 2005-2007 CodeSourcery, LLC + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "cpregs.h" + +#define SIGNBIT (uint32_t)0x80000000 +#define SIGNBIT64 ((uint64_t)1 << 63) + +int exception_target_el(CPUARMState *env) +{ + int target_el = MAX(1, arm_current_el(env)); + + /* + * No such thing as secure EL1 if EL3 is aarch32, + * so update the target EL to EL3 in this case. + */ + if (arm_is_secure(env) && !arm_el_is_aa64(env, 3) && target_el == 1) { + target_el = 3; + } + + return target_el; +} + +void raise_exception(CPUARMState *env, uint32_t excp, + uint32_t syndrome, uint32_t target_el) +{ + CPUState *cs = env_cpu(env); + + if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) { + /* + * Redirect NS EL1 exceptions to NS EL2. These are reported with + * their original syndrome register value, with the exception of + * SIMD/FP access traps, which are reported as uncategorized + * (see DDI0478C.a D1.10.4) + */ + target_el = 2; + if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) { + syndrome = syn_uncategorized(); + } + } + + assert(!excp_is_internal(excp)); + cs->exception_index = excp; + env->exception.syndrome = syndrome; + env->exception.target_el = target_el; + cpu_loop_exit(cs); +} + +void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome, + uint32_t target_el, uintptr_t ra) +{ + CPUState *cs = env_cpu(env); + + /* + * restore_state_to_opc() will set env->exception.syndrome, so + * we must restore CPU state here before setting the syndrome + * the caller passed us, and cannot use cpu_loop_exit_restore(). 
+ */ + cpu_restore_state(cs, ra); + raise_exception(env, excp, syndrome, target_el); +} + +uint64_t HELPER(neon_tbl)(CPUARMState *env, uint32_t desc, + uint64_t ireg, uint64_t def) +{ + uint64_t tmp, val = 0; + uint32_t maxindex = ((desc & 3) + 1) * 8; + uint32_t base_reg = desc >> 2; + uint32_t shift, index, reg; + + for (shift = 0; shift < 64; shift += 8) { + index = (ireg >> shift) & 0xff; + if (index < maxindex) { + reg = base_reg + (index >> 3); + tmp = *aa32_vfp_dreg(env, reg); + tmp = ((tmp >> ((index & 7) << 3)) & 0xff) << shift; + } else { + tmp = def & (0xffull << shift); + } + val |= tmp; + } + return val; +} + +void HELPER(v8m_stackcheck)(CPUARMState *env, uint32_t newvalue) +{ + /* + * Perform the v8M stack limit check for SP updates from translated code, + * raising an exception if the limit is breached. + */ + if (newvalue < v7m_sp_limit(env)) { + /* + * Stack limit exceptions are a rare case, so rather than syncing + * PC/condbits before the call, we use raise_exception_ra() so + * that cpu_restore_state() will sort them out. + */ + raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC()); + } +} + +uint32_t HELPER(add_setq)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) + env->QF = 1; + return res; +} + +uint32_t HELPER(add_saturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { + env->QF = 1; + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint32_t HELPER(sub_saturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { + env->QF = 1; + res = ~(((int32_t)a >> 31) ^ SIGNBIT); + } + return res; +} + +uint32_t HELPER(add_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a + b; + if (res < a) { + env->QF = 1; + res = ~0; + } + return res; +} + +uint32_t HELPER(sub_usaturate)(CPUARMState *env, uint32_t a, uint32_t b) +{ + uint32_t res = a - b; + if (res > a) { + env->QF = 1; + res = 0; + } + return res; +} + +/* Signed saturation. */ +static inline uint32_t do_ssat(CPUARMState *env, int32_t val, int shift) +{ + int32_t top; + uint32_t mask; + + top = val >> shift; + mask = (1u << shift) - 1; + if (top > 0) { + env->QF = 1; + return mask; + } else if (top < -1) { + env->QF = 1; + return ~mask; + } + return val; +} + +/* Unsigned saturation. */ +static inline uint32_t do_usat(CPUARMState *env, int32_t val, int shift) +{ + uint32_t max; + + max = (1u << shift) - 1; + if (val < 0) { + env->QF = 1; + return 0; + } else if (val > max) { + env->QF = 1; + return max; + } + return val; +} + +/* Signed saturate. */ +uint32_t HELPER(ssat)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + return do_ssat(env, x, shift); +} + +/* Dual halfword signed saturate. */ +uint32_t HELPER(ssat16)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + uint32_t res; + + res = (uint16_t)do_ssat(env, (int16_t)x, shift); + res |= do_ssat(env, ((int32_t)x) >> 16, shift) << 16; + return res; +} + +/* Unsigned saturate. */ +uint32_t HELPER(usat)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + return do_usat(env, x, shift); +} + +/* Dual halfword unsigned saturate. 
*/ +uint32_t HELPER(usat16)(CPUARMState *env, uint32_t x, uint32_t shift) +{ + uint32_t res; + + res = (uint16_t)do_usat(env, (int16_t)x, shift); + res |= do_usat(env, ((int32_t)x) >> 16, shift) << 16; + return res; +} + +void HELPER(setend)(CPUARMState *env) +{ + env->uncached_cpsr ^= CPSR_E; + arm_rebuild_hflags(env); +} + +void HELPER(check_bxj_trap)(CPUARMState *env, uint32_t rm) +{ + /* + * Only called if in NS EL0 or EL1 for a BXJ for a v7A CPU; + * check if HSTR.TJDBX means we need to trap to EL2. + */ + if (env->cp15.hstr_el2 & HSTR_TJDBX) { + /* + * We know the condition code check passed, so take the IMPDEF + * choice to always report CV=1 COND 0xe + */ + uint32_t syn = syn_bxjtrap(1, 0xe, rm); + raise_exception_ra(env, EXCP_HYP_TRAP, syn, 2, GETPC()); + } +} + +#ifndef CONFIG_USER_ONLY +/* Function checks whether WFx (WFI/WFE) instructions are set up to be trapped. + * The function returns the target EL (1-3) if the instruction is to be trapped; + * otherwise it returns 0 indicating it is not trapped. + */ +static inline int check_wfx_trap(CPUARMState *env, bool is_wfe) +{ + int cur_el = arm_current_el(env); + uint64_t mask; + + if (arm_feature(env, ARM_FEATURE_M)) { + /* M profile cores can never trap WFI/WFE. */ + return 0; + } + + /* If we are currently in EL0 then we need to check if SCTLR is set up for + * WFx instructions being trapped to EL1. These trap bits don't exist in v7. + */ + if (cur_el < 1 && arm_feature(env, ARM_FEATURE_V8)) { + int target_el; + + mask = is_wfe ? SCTLR_nTWE : SCTLR_nTWI; + if (arm_is_secure_below_el3(env) && !arm_el_is_aa64(env, 3)) { + /* Secure EL0 and Secure PL1 is at EL3 */ + target_el = 3; + } else { + target_el = 1; + } + + if (!(env->cp15.sctlr_el[target_el] & mask)) { + return target_el; + } + } + + /* We are not trapping to EL1; trap to EL2 if HCR_EL2 requires it + * No need for ARM_FEATURE check as if HCR_EL2 doesn't exist the + * bits will be zero indicating no trap. + */ + if (cur_el < 2) { + mask = is_wfe ? HCR_TWE : HCR_TWI; + if (arm_hcr_el2_eff(env) & mask) { + return 2; + } + } + + /* We are not trapping to EL1 or EL2; trap to EL3 if SCR_EL3 requires it */ + if (cur_el < 3) { + mask = (is_wfe) ? SCR_TWE : SCR_TWI; + if (env->cp15.scr_el3 & mask) { + return 3; + } + } + + return 0; +} +#endif + +void HELPER(wfi)(CPUARMState *env, uint32_t insn_len) +{ +#ifdef CONFIG_USER_ONLY + /* + * WFI in the user-mode emulator is technically permitted but not + * something any real-world code would do. AArch64 Linux kernels + * trap it via SCTRL_EL1.nTWI and make it an (expensive) NOP; + * AArch32 kernels don't trap it so it will delay a bit. + * For QEMU, make it NOP here, because trying to raise EXCP_HLT + * would trigger an abort. + */ + return; +#else + CPUState *cs = env_cpu(env); + int target_el = check_wfx_trap(env, false); + + if (cpu_has_work(cs)) { + /* Don't bother to go into our "low power state" if + * we would just wake up immediately. + */ + return; + } + + if (target_el) { + if (env->aarch64) { + env->pc -= insn_len; + } else { + env->regs[15] -= insn_len; + } + + raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2), + target_el); + } + + cs->exception_index = EXCP_HLT; + cs->halted = 1; + cpu_loop_exit(cs); +#endif +} + +void HELPER(wfe)(CPUARMState *env) +{ + /* This is a hint instruction that is semantically different + * from YIELD even though we currently implement it identically. + * Don't actually halt the CPU, just yield back to top + * level loop. 
This is not going into a "low power state" + * (ie halting until some event occurs), so we never take + * a configurable trap to a different exception level. + */ + HELPER(yield)(env); +} + +void HELPER(yield)(CPUARMState *env) +{ + CPUState *cs = env_cpu(env); + + /* This is a non-trappable hint instruction that generally indicates + * that the guest is currently busy-looping. Yield control back to the + * top level loop so that a more deserving VCPU has a chance to run. + */ + cs->exception_index = EXCP_YIELD; + cpu_loop_exit(cs); +} + +/* Raise an internal-to-QEMU exception. This is limited to only + * those EXCP values which are special cases for QEMU to interrupt + * execution and not to be used for exceptions which are passed to + * the guest (those must all have syndrome information and thus should + * use exception_with_syndrome*). + */ +void HELPER(exception_internal)(CPUARMState *env, uint32_t excp) +{ + CPUState *cs = env_cpu(env); + + assert(excp_is_internal(excp)); + cs->exception_index = excp; + cpu_loop_exit(cs); +} + +/* Raise an exception with the specified syndrome register value */ +void HELPER(exception_with_syndrome_el)(CPUARMState *env, uint32_t excp, + uint32_t syndrome, uint32_t target_el) +{ + raise_exception(env, excp, syndrome, target_el); +} + +/* + * Raise an exception with the specified syndrome register value + * to the default target el. + */ +void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp, + uint32_t syndrome) +{ + raise_exception(env, excp, syndrome, exception_target_el(env)); +} + +uint32_t HELPER(cpsr_read)(CPUARMState *env) +{ + return cpsr_read(env) & ~CPSR_EXEC; +} + +void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask) +{ + cpsr_write(env, val, mask, CPSRWriteByInstr); + /* TODO: Not all cpsr bits are relevant to hflags. */ + arm_rebuild_hflags(env); +} + +/* Write the CPSR for a 32-bit exception return */ +void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val) +{ + uint32_t mask; + + qemu_mutex_lock_iothread(); + arm_call_pre_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); + + mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar); + cpsr_write(env, val, mask, CPSRWriteExceptionReturn); + + /* Generated code has already stored the new PC value, but + * without masking out its low bits, because which bits need + * masking depends on whether we're returning to Thumb or ARM + * state. Do the masking now. + */ + env->regs[15] &= (env->thumb ? ~1 : ~3); + arm_rebuild_hflags(env); + + qemu_mutex_lock_iothread(); + arm_call_el_change_hook(env_archcpu(env)); + qemu_mutex_unlock_iothread(); +} + +/* Access to user mode registers from privileged modes. 
*/ +uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno) +{ + uint32_t val; + + if (regno == 13) { + val = env->banked_r13[BANK_USRSYS]; + } else if (regno == 14) { + val = env->banked_r14[BANK_USRSYS]; + } else if (regno >= 8 + && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { + val = env->usr_regs[regno - 8]; + } else { + val = env->regs[regno]; + } + return val; +} + +void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val) +{ + if (regno == 13) { + env->banked_r13[BANK_USRSYS] = val; + } else if (regno == 14) { + env->banked_r14[BANK_USRSYS] = val; + } else if (regno >= 8 + && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) { + env->usr_regs[regno - 8] = val; + } else { + env->regs[regno] = val; + } +} + +void HELPER(set_r13_banked)(CPUARMState *env, uint32_t mode, uint32_t val) +{ + if ((env->uncached_cpsr & CPSR_M) == mode) { + env->regs[13] = val; + } else { + env->banked_r13[bank_number(mode)] = val; + } +} + +uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode) +{ + if ((env->uncached_cpsr & CPSR_M) == ARM_CPU_MODE_SYS) { + /* SRS instruction is UNPREDICTABLE from System mode; we UNDEF. + * Other UNPREDICTABLE and UNDEF cases were caught at translate time. + */ + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } + + if ((env->uncached_cpsr & CPSR_M) == mode) { + return env->regs[13]; + } else { + return env->banked_r13[bank_number(mode)]; + } +} + +static void msr_mrs_banked_exc_checks(CPUARMState *env, uint32_t tgtmode, + uint32_t regno) +{ + /* Raise an exception if the requested access is one of the UNPREDICTABLE + * cases; otherwise return. This broadly corresponds to the pseudocode + * BankedRegisterAccessValid() and SPSRAccessValid(), + * except that we have already handled some cases at translate time. + */ + int curmode = env->uncached_cpsr & CPSR_M; + + if (regno == 17) { + /* ELR_Hyp: a special case because access from tgtmode is OK */ + if (curmode != ARM_CPU_MODE_HYP && curmode != ARM_CPU_MODE_MON) { + goto undef; + } + return; + } + + if (curmode == tgtmode) { + goto undef; + } + + if (tgtmode == ARM_CPU_MODE_USR) { + switch (regno) { + case 8 ... 12: + if (curmode != ARM_CPU_MODE_FIQ) { + goto undef; + } + break; + case 13: + if (curmode == ARM_CPU_MODE_SYS) { + goto undef; + } + break; + case 14: + if (curmode == ARM_CPU_MODE_HYP || curmode == ARM_CPU_MODE_SYS) { + goto undef; + } + break; + default: + break; + } + } + + if (tgtmode == ARM_CPU_MODE_HYP) { + /* SPSR_Hyp, r13_hyp: accessible from Monitor mode only */ + if (curmode != ARM_CPU_MODE_MON) { + goto undef; + } + } + + return; + +undef: + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); +} + +void HELPER(msr_banked)(CPUARMState *env, uint32_t value, uint32_t tgtmode, + uint32_t regno) +{ + msr_mrs_banked_exc_checks(env, tgtmode, regno); + + switch (regno) { + case 16: /* SPSRs */ + env->banked_spsr[bank_number(tgtmode)] = value; + break; + case 17: /* ELR_Hyp */ + env->elr_el[2] = value; + break; + case 13: + env->banked_r13[bank_number(tgtmode)] = value; + break; + case 14: + env->banked_r14[r14_bank_number(tgtmode)] = value; + break; + case 8 ... 
12: + switch (tgtmode) { + case ARM_CPU_MODE_USR: + env->usr_regs[regno - 8] = value; + break; + case ARM_CPU_MODE_FIQ: + env->fiq_regs[regno - 8] = value; + break; + default: + g_assert_not_reached(); + } + break; + default: + g_assert_not_reached(); + } +} + +uint32_t HELPER(mrs_banked)(CPUARMState *env, uint32_t tgtmode, uint32_t regno) +{ + msr_mrs_banked_exc_checks(env, tgtmode, regno); + + switch (regno) { + case 16: /* SPSRs */ + return env->banked_spsr[bank_number(tgtmode)]; + case 17: /* ELR_Hyp */ + return env->elr_el[2]; + case 13: + return env->banked_r13[bank_number(tgtmode)]; + case 14: + return env->banked_r14[r14_bank_number(tgtmode)]; + case 8 ... 12: + switch (tgtmode) { + case ARM_CPU_MODE_USR: + return env->usr_regs[regno - 8]; + case ARM_CPU_MODE_FIQ: + return env->fiq_regs[regno - 8]; + default: + g_assert_not_reached(); + } + default: + g_assert_not_reached(); + } +} + +const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key, + uint32_t syndrome, uint32_t isread) +{ + ARMCPU *cpu = env_archcpu(env); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); + CPAccessResult res = CP_ACCESS_OK; + int target_el; + + assert(ri != NULL); + + if (arm_feature(env, ARM_FEATURE_XSCALE) && ri->cp < 14 + && extract32(env->cp15.c15_cpar, ri->cp, 1) == 0) { + res = CP_ACCESS_TRAP; + goto fail; + } + + if (ri->accessfn) { + res = ri->accessfn(env, ri, isread); + } + + /* + * If the access function indicates a trap from EL0 to EL1 then + * that always takes priority over the HSTR_EL2 trap. (If it indicates + * a trap to EL3, then the HSTR_EL2 trap takes priority; if it indicates + * a trap to EL2, then the syndrome is the same either way so we don't + * care whether technically the architecture says that HSTR_EL2 trap or + * the other trap takes priority. So we take the "check HSTR_EL2" path + * for all of those cases.) + */ + if (res != CP_ACCESS_OK && ((res & CP_ACCESS_EL_MASK) == 0) && + arm_current_el(env) == 0) { + goto fail; + } + + /* + * HSTR_EL2 traps from EL1 are checked earlier, in generated code; + * we only need to check here for traps from EL0. + */ + if (!is_a64(env) && arm_current_el(env) == 0 && ri->cp == 15 && + arm_is_el2_enabled(env) && + (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) { + uint32_t mask = 1 << ri->crn; + + if (ri->type & ARM_CP_64BIT) { + mask = 1 << ri->crm; + } + + /* T4 and T14 are RES0 */ + mask &= ~((1 << 4) | (1 << 14)); + + if (env->cp15.hstr_el2 & mask) { + res = CP_ACCESS_TRAP_EL2; + goto fail; + } + } + + /* + * Fine-grained traps also are lower priority than undef-to-EL1, + * higher priority than trap-to-EL3, and we don't care about priority + * order with other EL2 traps because the syndrome value is the same. 
+ */ + if (arm_fgt_active(env, arm_current_el(env))) { + uint64_t trapword = 0; + unsigned int idx = FIELD_EX32(ri->fgt, FGT, IDX); + unsigned int bitpos = FIELD_EX32(ri->fgt, FGT, BITPOS); + bool rev = FIELD_EX32(ri->fgt, FGT, REV); + bool trapbit; + + if (ri->fgt & FGT_EXEC) { + assert(idx < ARRAY_SIZE(env->cp15.fgt_exec)); + trapword = env->cp15.fgt_exec[idx]; + } else if (isread && (ri->fgt & FGT_R)) { + assert(idx < ARRAY_SIZE(env->cp15.fgt_read)); + trapword = env->cp15.fgt_read[idx]; + } else if (!isread && (ri->fgt & FGT_W)) { + assert(idx < ARRAY_SIZE(env->cp15.fgt_write)); + trapword = env->cp15.fgt_write[idx]; + } + + trapbit = extract64(trapword, bitpos, 1); + if (trapbit != rev) { + res = CP_ACCESS_TRAP_EL2; + goto fail; + } + } + + if (likely(res == CP_ACCESS_OK)) { + return ri; + } + + fail: + switch (res & ~CP_ACCESS_EL_MASK) { + case CP_ACCESS_TRAP: + break; + case CP_ACCESS_TRAP_UNCATEGORIZED: + /* Only CP_ACCESS_TRAP traps are direct to a specified EL */ + assert((res & CP_ACCESS_EL_MASK) == 0); + if (cpu_isar_feature(aa64_ids, cpu) && isread && + arm_cpreg_in_idspace(ri)) { + /* + * FEAT_IDST says this should be reported as EC_SYSTEMREGISTERTRAP, + * not EC_UNCATEGORIZED + */ + break; + } + syndrome = syn_uncategorized(); + break; + default: + g_assert_not_reached(); + } + + target_el = res & CP_ACCESS_EL_MASK; + switch (target_el) { + case 0: + target_el = exception_target_el(env); + break; + case 2: + assert(arm_current_el(env) != 3); + assert(arm_is_el2_enabled(env)); + break; + case 3: + assert(arm_feature(env, ARM_FEATURE_EL3)); + break; + default: + /* No "direct" traps to EL1 */ + g_assert_not_reached(); + } + + raise_exception(env, EXCP_UDEF, syndrome, target_el); +} + +const void *HELPER(lookup_cp_reg)(CPUARMState *env, uint32_t key) +{ + ARMCPU *cpu = env_archcpu(env); + const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key); + + assert(ri != NULL); + return ri; +} + +void HELPER(set_cp_reg)(CPUARMState *env, const void *rip, uint32_t value) +{ + const ARMCPRegInfo *ri = rip; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + ri->writefn(env, ri, value); + qemu_mutex_unlock_iothread(); + } else { + ri->writefn(env, ri, value); + } +} + +uint32_t HELPER(get_cp_reg)(CPUARMState *env, const void *rip) +{ + const ARMCPRegInfo *ri = rip; + uint32_t res; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + res = ri->readfn(env, ri); + qemu_mutex_unlock_iothread(); + } else { + res = ri->readfn(env, ri); + } + + return res; +} + +void HELPER(set_cp_reg64)(CPUARMState *env, const void *rip, uint64_t value) +{ + const ARMCPRegInfo *ri = rip; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + ri->writefn(env, ri, value); + qemu_mutex_unlock_iothread(); + } else { + ri->writefn(env, ri, value); + } +} + +uint64_t HELPER(get_cp_reg64)(CPUARMState *env, const void *rip) +{ + const ARMCPRegInfo *ri = rip; + uint64_t res; + + if (ri->type & ARM_CP_IO) { + qemu_mutex_lock_iothread(); + res = ri->readfn(env, ri); + qemu_mutex_unlock_iothread(); + } else { + res = ri->readfn(env, ri); + } + + return res; +} + +void HELPER(pre_hvc)(CPUARMState *env) +{ + ARMCPU *cpu = env_archcpu(env); + int cur_el = arm_current_el(env); + /* FIXME: Use actual secure state. */ + bool secure = false; + bool undef; + + if (arm_is_psci_call(cpu, EXCP_HVC)) { + /* If PSCI is enabled and this looks like a valid PSCI call then + * that overrides the architecturally mandated HVC behaviour. 
+ */ + return; + } + + if (!arm_feature(env, ARM_FEATURE_EL2)) { + /* If EL2 doesn't exist, HVC always UNDEFs */ + undef = true; + } else if (arm_feature(env, ARM_FEATURE_EL3)) { + /* EL3.HCE has priority over EL2.HCD. */ + undef = !(env->cp15.scr_el3 & SCR_HCE); + } else { + undef = env->cp15.hcr_el2 & HCR_HCD; + } + + /* In ARMv7 and ARMv8/AArch32, HVC is undef in secure state. + * For ARMv8/AArch64, HVC is allowed in EL3. + * Note that we've already trapped HVC from EL0 at translation + * time. + */ + if (secure && (!is_a64(env) || cur_el == 1)) { + undef = true; + } + + if (undef) { + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } +} + +void HELPER(pre_smc)(CPUARMState *env, uint32_t syndrome) +{ + ARMCPU *cpu = env_archcpu(env); + int cur_el = arm_current_el(env); + bool secure = arm_is_secure(env); + bool smd_flag = env->cp15.scr_el3 & SCR_SMD; + + /* + * SMC behaviour is summarized in the following table. + * This helper handles the "Trap to EL2" and "Undef insn" cases. + * The "Trap to EL3" and "PSCI call" cases are handled in the exception + * helper. + * + * -> ARM_FEATURE_EL3 and !SMD + * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 + * + * Conduit SMC, valid call Trap to EL2 PSCI Call + * Conduit SMC, inval call Trap to EL2 Trap to EL3 + * Conduit not SMC Trap to EL2 Trap to EL3 + * + * + * -> ARM_FEATURE_EL3 and SMD + * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 + * + * Conduit SMC, valid call Trap to EL2 PSCI Call + * Conduit SMC, inval call Trap to EL2 Undef insn + * Conduit not SMC Trap to EL2 Undef insn + * + * + * -> !ARM_FEATURE_EL3 + * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1 + * + * Conduit SMC, valid call Trap to EL2 PSCI Call + * Conduit SMC, inval call Trap to EL2 Undef insn + * Conduit not SMC Undef insn Undef insn + */ + + /* On ARMv8 with EL3 AArch64, SMD applies to both S and NS state. + * On ARMv8 with EL3 AArch32, or ARMv7 with the Virtualization + * extensions, SMD only applies to NS state. + * On ARMv7 without the Virtualization extensions, the SMD bit + * doesn't exist, but we forbid the guest to set it to 1 in scr_write(), + * so we need not special case this here. + */ + bool smd = arm_feature(env, ARM_FEATURE_AARCH64) ? smd_flag + : smd_flag && !secure; + + if (!arm_feature(env, ARM_FEATURE_EL3) && + cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) { + /* If we have no EL3 then SMC always UNDEFs and can't be + * trapped to EL2. PSCI-via-SMC is a sort of ersatz EL3 + * firmware within QEMU, and we want an EL2 guest to be able + * to forbid its EL1 from making PSCI calls into QEMU's + * "firmware" via HCR.TSC, so for these purposes treat + * PSCI-via-SMC as implying an EL3. + * This handles the very last line of the previous table. + */ + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } + + if (cur_el == 1 && (arm_hcr_el2_eff(env) & HCR_TSC)) { + /* In NS EL1, HCR controlled routing to EL2 has priority over SMD. + * We also want an EL2 guest to be able to forbid its EL1 from + * making PSCI calls into QEMU's "firmware" via HCR.TSC. + * This handles all the "Trap to EL2" cases of the previous table. + */ + raise_exception(env, EXCP_HYP_TRAP, syndrome, 2); + } + + /* Catch the two remaining "Undef insn" cases of the previous table: + * - PSCI conduit is SMC but we don't have a valid PCSI call, + * - We don't have EL3 or SMD is set. 
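The three routing tables above collapse into one small decision function; the sketch below (not from QEMU, names invented) simply re-encodes them so the cases can be read linearly. Here trap_tsc stands for "HCR_EL2.TSC is set and we are at NS EL1", and smd is the already-computed effective SMD value.

#include <stdbool.h>
#include <stdio.h>

typedef enum { ROUTE_PSCI_CALL, ROUTE_TRAP_EL2, ROUTE_TRAP_EL3, ROUTE_UNDEF } SmcRoute;

static SmcRoute route_smc(bool have_el3, bool smd, bool trap_tsc,
                          bool conduit_smc, bool valid_call)
{
    if (trap_tsc) {
        /* "Trap to EL2" column; with neither EL3 nor an SMC conduit it UNDEFs. */
        return (have_el3 || conduit_smc) ? ROUTE_TRAP_EL2 : ROUTE_UNDEF;
    }
    if (conduit_smc && valid_call) {
        return ROUTE_PSCI_CALL;                  /* "PSCI Call" in every table */
    }
    return (have_el3 && !smd) ? ROUTE_TRAP_EL3 : ROUTE_UNDEF;
}

int main(void)
{
    /* EL3 present, SMD clear, invalid call, no TSC trap: trap to EL3. */
    printf("%d\n", route_smc(true, false, false, true, false) == ROUTE_TRAP_EL3);
    /* No EL3, conduit not SMC, TSC set: undef (last line of the last table). */
    printf("%d\n", route_smc(false, false, true, false, false) == ROUTE_UNDEF);
    return 0;
}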
+ */ + if (!arm_is_psci_call(cpu, EXCP_SMC) && + (smd || !arm_feature(env, ARM_FEATURE_EL3))) { + raise_exception(env, EXCP_UDEF, syn_uncategorized(), + exception_target_el(env)); + } +} + +/* ??? Flag setting arithmetic is awkward because we need to do comparisons. + The only way to do that in TCG is a conditional branch, which clobbers + all our temporaries. For now implement these as helper functions. */ + +/* Similarly for variable shift instructions. */ + +uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift = i & 0xff; + if (shift >= 32) { + if (shift == 32) + env->CF = x & 1; + else + env->CF = 0; + return 0; + } else if (shift != 0) { + env->CF = (x >> (32 - shift)) & 1; + return x << shift; + } + return x; +} + +uint32_t HELPER(shr_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift = i & 0xff; + if (shift >= 32) { + if (shift == 32) + env->CF = (x >> 31) & 1; + else + env->CF = 0; + return 0; + } else if (shift != 0) { + env->CF = (x >> (shift - 1)) & 1; + return x >> shift; + } + return x; +} + +uint32_t HELPER(sar_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift = i & 0xff; + if (shift >= 32) { + env->CF = (x >> 31) & 1; + return (int32_t)x >> 31; + } else if (shift != 0) { + env->CF = (x >> (shift - 1)) & 1; + return (int32_t)x >> shift; + } + return x; +} + +uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i) +{ + int shift1, shift; + shift1 = i & 0xff; + shift = shift1 & 0x1f; + if (shift == 0) { + if (shift1 != 0) + env->CF = (x >> 31) & 1; + return x; + } else { + env->CF = (x >> (shift - 1)) & 1; + return ((uint32_t)x >> shift) | (x << (32 - shift)); + } +} + +void HELPER(probe_access)(CPUARMState *env, target_ulong ptr, + uint32_t access_type, uint32_t mmu_idx, + uint32_t size) +{ + uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE); + uintptr_t ra = GETPC(); + + if (likely(size <= in_page)) { + probe_access(env, ptr, size, access_type, mmu_idx, ra); + } else { + probe_access(env, ptr, in_page, access_type, mmu_idx, ra); + probe_access(env, ptr + in_page, size - in_page, + access_type, mmu_idx, ra); + } +} + +/* + * This function corresponds to AArch64.vESBOperation(). + * Note that the AArch32 version is not functionally different. + */ +void HELPER(vesb)(CPUARMState *env) +{ + /* + * The EL2Enabled() check is done inside arm_hcr_el2_eff, + * and will return HCR_EL2.VSE == 0, so nothing happens. + */ + uint64_t hcr = arm_hcr_el2_eff(env); + bool enabled = !(hcr & HCR_TGE) && (hcr & HCR_AMO); + bool pending = enabled && (hcr & HCR_VSE); + bool masked = (env->daif & PSTATE_A); + + /* If VSE pending and masked, defer the exception. */ + if (pending && masked) { + uint32_t syndrome; + + if (arm_el_is_aa64(env, 1)) { + /* Copy across IDS and ISS from VSESR. */ + syndrome = env->cp15.vsesr_el2 & 0x1ffffff; + } else { + ARMMMUFaultInfo fi = { .type = ARMFault_AsyncExternal }; + + if (extended_addresses_enabled(env)) { + syndrome = arm_fi_to_lfsc(&fi); + } else { + syndrome = arm_fi_to_sfsc(&fi); + } + /* Copy across AET and ExT from VSESR. */ + syndrome |= env->cp15.vsesr_el2 & 0xd000; + } + + /* Set VDISR_EL2.A along with the syndrome. 
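A self-contained sketch of what the flag-setting shift helpers above compute (not from QEMU): the shifted result plus the carry-out, i.e. the last bit shifted past the end of the 32-bit value. Only the LSL case is shown, and the carry is returned through a pointer instead of being written to env->CF.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t lsl_with_carry(uint32_t x, uint32_t amount, bool *carry)
{
    int shift = amount & 0xff;

    if (shift >= 32) {
        *carry = (shift == 32) && (x & 1);
        return 0;
    }
    if (shift != 0) {
        *carry = (x >> (32 - shift)) & 1;
        return x << shift;
    }
    /* Shift of 0: result and the existing carry flag are left unchanged. */
    return x;
}

int main(void)
{
    bool c = false;
    uint32_t r = lsl_with_carry(0x80000001u, 1, &c);

    printf("result=0x%08x carry=%d\n", r, c);   /* result=0x00000002 carry=1 */
    return 0;
}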
*/ + env->cp15.vdisr_el2 = syndrome | (1u << 31); + + /* Clear pending virtual SError */ + env->cp15.hcr_el2 &= ~HCR_VSE; + cpu_reset_interrupt(env_cpu(env), CPU_INTERRUPT_VSERR); + } +} diff --git a/target/arm/tcg/pauth_helper.c b/target/arm/tcg/pauth_helper.c new file mode 100644 index 0000000000..d0483bf051 --- /dev/null +++ b/target/arm/tcg/pauth_helper.c @@ -0,0 +1,515 @@ +/* + * ARM v8.3-PAuth Operations + * + * Copyright (c) 2019 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/cpu_ldst.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "qemu/xxhash.h" + + +static uint64_t pac_cell_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= extract64(i, 52, 4); + o |= extract64(i, 24, 4) << 4; + o |= extract64(i, 44, 4) << 8; + o |= extract64(i, 0, 4) << 12; + + o |= extract64(i, 28, 4) << 16; + o |= extract64(i, 48, 4) << 20; + o |= extract64(i, 4, 4) << 24; + o |= extract64(i, 40, 4) << 28; + + o |= extract64(i, 32, 4) << 32; + o |= extract64(i, 12, 4) << 36; + o |= extract64(i, 56, 4) << 40; + o |= extract64(i, 20, 4) << 44; + + o |= extract64(i, 8, 4) << 48; + o |= extract64(i, 36, 4) << 52; + o |= extract64(i, 16, 4) << 56; + o |= extract64(i, 60, 4) << 60; + + return o; +} + +static uint64_t pac_cell_inv_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= extract64(i, 12, 4); + o |= extract64(i, 24, 4) << 4; + o |= extract64(i, 48, 4) << 8; + o |= extract64(i, 36, 4) << 12; + + o |= extract64(i, 56, 4) << 16; + o |= extract64(i, 44, 4) << 20; + o |= extract64(i, 4, 4) << 24; + o |= extract64(i, 16, 4) << 28; + + o |= i & MAKE_64BIT_MASK(32, 4); + o |= extract64(i, 52, 4) << 36; + o |= extract64(i, 28, 4) << 40; + o |= extract64(i, 8, 4) << 44; + + o |= extract64(i, 20, 4) << 48; + o |= extract64(i, 0, 4) << 52; + o |= extract64(i, 40, 4) << 56; + o |= i & MAKE_64BIT_MASK(60, 4); + + return o; +} + +static uint64_t pac_sub(uint64_t i) +{ + static const uint8_t sub[16] = { + 0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe, + 0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa, + }; + uint64_t o = 0; + int b; + + for (b = 0; b < 64; b += 4) { + o |= (uint64_t)sub[(i >> b) & 0xf] << b; + } + return o; +} + +static uint64_t pac_inv_sub(uint64_t i) +{ + static const uint8_t inv_sub[16] = { + 0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9, + 0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3, + }; + uint64_t o = 0; + int b; + + for (b = 0; b < 64; b += 4) { + o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; + } + return o; +} + +static int rot_cell(int cell, int n) +{ + /* 4-bit rotate left by n. 
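The two 16-entry substitution tables above are exact inverses of each other; this standalone check (not part of the patch) copies them and verifies the round trip for every nibble value.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Same tables as pac_sub() and pac_inv_sub() above. */
    static const uint8_t sub[16] = {
        0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe,
        0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa,
    };
    static const uint8_t inv_sub[16] = {
        0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9,
        0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3,
    };

    for (int x = 0; x < 16; x++) {
        assert(inv_sub[sub[x]] == x);
        assert(sub[inv_sub[x]] == x);
    }
    printf("PAC nibble S-box and its inverse are consistent\n");
    return 0;
}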
*/ + cell |= cell << 4; + return extract32(cell, 4 - n, 4); +} + +static uint64_t pac_mult(uint64_t i) +{ + uint64_t o = 0; + int b; + + for (b = 0; b < 4 * 4; b += 4) { + int i0, i4, i8, ic, t0, t1, t2, t3; + + i0 = extract64(i, b, 4); + i4 = extract64(i, b + 4 * 4, 4); + i8 = extract64(i, b + 8 * 4, 4); + ic = extract64(i, b + 12 * 4, 4); + + t0 = rot_cell(i8, 1) ^ rot_cell(i4, 2) ^ rot_cell(i0, 1); + t1 = rot_cell(ic, 1) ^ rot_cell(i4, 1) ^ rot_cell(i0, 2); + t2 = rot_cell(ic, 2) ^ rot_cell(i8, 1) ^ rot_cell(i0, 1); + t3 = rot_cell(ic, 1) ^ rot_cell(i8, 2) ^ rot_cell(i4, 1); + + o |= (uint64_t)t3 << b; + o |= (uint64_t)t2 << (b + 4 * 4); + o |= (uint64_t)t1 << (b + 8 * 4); + o |= (uint64_t)t0 << (b + 12 * 4); + } + return o; +} + +static uint64_t tweak_cell_rot(uint64_t cell) +{ + return (cell >> 1) | (((cell ^ (cell >> 1)) & 1) << 3); +} + +static uint64_t tweak_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= extract64(i, 16, 4) << 0; + o |= extract64(i, 20, 4) << 4; + o |= tweak_cell_rot(extract64(i, 24, 4)) << 8; + o |= extract64(i, 28, 4) << 12; + + o |= tweak_cell_rot(extract64(i, 44, 4)) << 16; + o |= extract64(i, 8, 4) << 20; + o |= extract64(i, 12, 4) << 24; + o |= tweak_cell_rot(extract64(i, 32, 4)) << 28; + + o |= extract64(i, 48, 4) << 32; + o |= extract64(i, 52, 4) << 36; + o |= extract64(i, 56, 4) << 40; + o |= tweak_cell_rot(extract64(i, 60, 4)) << 44; + + o |= tweak_cell_rot(extract64(i, 0, 4)) << 48; + o |= extract64(i, 4, 4) << 52; + o |= tweak_cell_rot(extract64(i, 40, 4)) << 56; + o |= tweak_cell_rot(extract64(i, 36, 4)) << 60; + + return o; +} + +static uint64_t tweak_cell_inv_rot(uint64_t cell) +{ + return ((cell << 1) & 0xf) | ((cell & 1) ^ (cell >> 3)); +} + +static uint64_t tweak_inv_shuffle(uint64_t i) +{ + uint64_t o = 0; + + o |= tweak_cell_inv_rot(extract64(i, 48, 4)); + o |= extract64(i, 52, 4) << 4; + o |= extract64(i, 20, 4) << 8; + o |= extract64(i, 24, 4) << 12; + + o |= extract64(i, 0, 4) << 16; + o |= extract64(i, 4, 4) << 20; + o |= tweak_cell_inv_rot(extract64(i, 8, 4)) << 24; + o |= extract64(i, 12, 4) << 28; + + o |= tweak_cell_inv_rot(extract64(i, 28, 4)) << 32; + o |= tweak_cell_inv_rot(extract64(i, 60, 4)) << 36; + o |= tweak_cell_inv_rot(extract64(i, 56, 4)) << 40; + o |= tweak_cell_inv_rot(extract64(i, 16, 4)) << 44; + + o |= extract64(i, 32, 4) << 48; + o |= extract64(i, 36, 4) << 52; + o |= extract64(i, 40, 4) << 56; + o |= tweak_cell_inv_rot(extract64(i, 44, 4)) << 60; + + return o; +} + +static uint64_t pauth_computepac_architected(uint64_t data, uint64_t modifier, + ARMPACKey key) +{ + static const uint64_t RC[5] = { + 0x0000000000000000ull, + 0x13198A2E03707344ull, + 0xA4093822299F31D0ull, + 0x082EFA98EC4E6C89ull, + 0x452821E638D01377ull, + }; + const uint64_t alpha = 0xC0AC29B7C97C50DDull; + /* + * Note that in the ARM pseudocode, key0 contains bits <127:64> + * and key1 contains bits <63:0> of the 128-bit key. 
+ */ + uint64_t key0 = key.hi, key1 = key.lo; + uint64_t workingval, runningmod, roundkey, modk0; + int i; + + modk0 = (key0 << 63) | ((key0 >> 1) ^ (key0 >> 63)); + runningmod = modifier; + workingval = data ^ key0; + + for (i = 0; i <= 4; ++i) { + roundkey = key1 ^ runningmod; + workingval ^= roundkey; + workingval ^= RC[i]; + if (i > 0) { + workingval = pac_cell_shuffle(workingval); + workingval = pac_mult(workingval); + } + workingval = pac_sub(workingval); + runningmod = tweak_shuffle(runningmod); + } + roundkey = modk0 ^ runningmod; + workingval ^= roundkey; + workingval = pac_cell_shuffle(workingval); + workingval = pac_mult(workingval); + workingval = pac_sub(workingval); + workingval = pac_cell_shuffle(workingval); + workingval = pac_mult(workingval); + workingval ^= key1; + workingval = pac_cell_inv_shuffle(workingval); + workingval = pac_inv_sub(workingval); + workingval = pac_mult(workingval); + workingval = pac_cell_inv_shuffle(workingval); + workingval ^= key0; + workingval ^= runningmod; + for (i = 0; i <= 4; ++i) { + workingval = pac_inv_sub(workingval); + if (i < 4) { + workingval = pac_mult(workingval); + workingval = pac_cell_inv_shuffle(workingval); + } + runningmod = tweak_inv_shuffle(runningmod); + roundkey = key1 ^ runningmod; + workingval ^= RC[4 - i]; + workingval ^= roundkey; + workingval ^= alpha; + } + workingval ^= modk0; + + return workingval; +} + +static uint64_t pauth_computepac_impdef(uint64_t data, uint64_t modifier, + ARMPACKey key) +{ + return qemu_xxhash64_4(data, modifier, key.lo, key.hi); +} + +static uint64_t pauth_computepac(CPUARMState *env, uint64_t data, + uint64_t modifier, ARMPACKey key) +{ + if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) { + return pauth_computepac_architected(data, modifier, key); + } else { + return pauth_computepac_impdef(data, modifier, key); + } +} + +static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier, + ARMPACKey *key, bool data) +{ + ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + uint64_t pac, ext_ptr, ext, test; + int bot_bit, top_bit; + + /* If tagged pointers are in use, use ptr<55>, otherwise ptr<63>. */ + if (param.tbi) { + ext = sextract64(ptr, 55, 1); + } else { + ext = sextract64(ptr, 63, 1); + } + + /* Build a pointer with known good extension bits. */ + top_bit = 64 - 8 * param.tbi; + bot_bit = 64 - param.tsz; + ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext); + + pac = pauth_computepac(env, ext_ptr, modifier, *key); + + /* + * Check if the ptr has good extension bits and corrupt the + * pointer authentication code if not. + */ + test = sextract64(ptr, bot_bit, top_bit - bot_bit); + if (test != 0 && test != -1) { + /* + * Note that our top_bit is one greater than the pseudocode's + * version, hence "- 2" here. + */ + pac ^= MAKE_64BIT_MASK(top_bit - 2, 1); + } + + /* + * Preserve the determination between upper and lower at bit 55, + * and insert pointer authentication code. + */ + if (param.tbi) { + ptr &= ~MAKE_64BIT_MASK(bot_bit, 55 - bot_bit + 1); + pac &= MAKE_64BIT_MASK(bot_bit, 54 - bot_bit + 1); + } else { + ptr &= MAKE_64BIT_MASK(0, bot_bit); + pac &= ~(MAKE_64BIT_MASK(55, 1) | MAKE_64BIT_MASK(0, bot_bit)); + } + ext &= MAKE_64BIT_MASK(55, 1); + return pac | ext | ptr; +} + +static uint64_t pauth_original_ptr(uint64_t ptr, ARMVAParameters param) +{ + /* Note that bit 55 is used whether or not the regime has 2 ranges. 
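A sketch of where pauth_addpac() above places the PAC bits, assuming a 48-bit VA so that bot_bit is 48 (illustrative only, not from QEMU): with TBI the PAC occupies bits [48,54], bit 55 keeps the upper/lower-address selector and bits 63:56 stay free for the tag byte; without TBI the PAC also covers bits [56,63].

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for QEMU's MAKE_64BIT_MASK(shift, length). */
#define MASK64(shift, length)  ((~0ull >> (64 - (length))) << (shift))

int main(void)
{
    int tsz = 16;                   /* 48-bit VA, so bot_bit == 48 */
    int bot_bit = 64 - tsz;

    /* TBI enabled: PAC in bits [bot_bit, 54], bit 55 is the selector. */
    uint64_t pac_mask_tbi = MASK64(bot_bit, 54 - bot_bit + 1);

    /* TBI disabled: everything above bot_bit except bit 55 holds PAC. */
    uint64_t pac_mask_notbi = ~(MASK64(55, 1) | MASK64(0, bot_bit));

    printf("PAC mask, TBI on : 0x%016llx\n",
           (unsigned long long)pac_mask_tbi);     /* 0x007f000000000000 */
    printf("PAC mask, TBI off: 0x%016llx\n",
           (unsigned long long)pac_mask_notbi);   /* 0xff7f000000000000 */
    return 0;
}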
*/ + uint64_t extfield = sextract64(ptr, 55, 1); + int bot_pac_bit = 64 - param.tsz; + int top_pac_bit = 64 - 8 * param.tbi; + + return deposit64(ptr, bot_pac_bit, top_pac_bit - bot_pac_bit, extfield); +} + +static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier, + ARMPACKey *key, bool data, int keynumber) +{ + ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + int bot_bit, top_bit; + uint64_t pac, orig_ptr, test; + + orig_ptr = pauth_original_ptr(ptr, param); + pac = pauth_computepac(env, orig_ptr, modifier, *key); + bot_bit = 64 - param.tsz; + top_bit = 64 - 8 * param.tbi; + + test = (pac ^ ptr) & ~MAKE_64BIT_MASK(55, 1); + if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) { + int error_code = (keynumber << 1) | (keynumber ^ 1); + if (param.tbi) { + return deposit64(orig_ptr, 53, 2, error_code); + } else { + return deposit64(orig_ptr, 61, 2, error_code); + } + } + return orig_ptr; +} + +static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data) +{ + ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env); + ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data); + + return pauth_original_ptr(ptr, param); +} + +static G_NORETURN +void pauth_trap(CPUARMState *env, int target_el, uintptr_t ra) +{ + raise_exception_ra(env, EXCP_UDEF, syn_pactrap(), target_el, ra); +} + +static void pauth_check_trap(CPUARMState *env, int el, uintptr_t ra) +{ + if (el < 2 && arm_is_el2_enabled(env)) { + uint64_t hcr = arm_hcr_el2_eff(env); + bool trap = !(hcr & HCR_API); + if (el == 0) { + /* Trap only applies to EL1&0 regime. */ + trap &= (hcr & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE); + } + /* FIXME: ARMv8.3-NV: HCR_NV trap takes precedence for ERETA[AB]. 
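When authentication fails, pauth_auth() above deliberately corrupts two bits of the canonical pointer so that any later use of it faults; a minimal sketch of just that bit manipulation (not from QEMU, names invented): key A (keynumber 0) yields code 0b01 and key B (keynumber 1) yields 0b10, deposited at bit 53 with TBI or bit 61 without.

#include <stdint.h>
#include <stdio.h>

static uint64_t poison_pointer(uint64_t canonical, int keynumber, int use_tbi)
{
    /* (keynumber << 1) | (keynumber ^ 1): key A -> 0b01, key B -> 0b10 */
    uint64_t error_code = (keynumber << 1) | (keynumber ^ 1);
    int pos = use_tbi ? 53 : 61;

    /* Equivalent of deposit64(orig_ptr, pos, 2, error_code). */
    return (canonical & ~(3ull << pos)) | (error_code << pos);
}

int main(void)
{
    uint64_t p = 0x00000000deadbeefull;

    printf("A-key failure: 0x%016llx\n",
           (unsigned long long)poison_pointer(p, 0, 1));  /* 0x00200000deadbeef */
    printf("B-key failure: 0x%016llx\n",
           (unsigned long long)poison_pointer(p, 1, 1));  /* 0x00400000deadbeef */
    return 0;
}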
*/ + if (trap) { + pauth_trap(env, 2, ra); + } + } + if (el < 3 && arm_feature(env, ARM_FEATURE_EL3)) { + if (!(env->cp15.scr_el3 & SCR_API)) { + pauth_trap(env, 3, ra); + } + } +} + +static bool pauth_key_enabled(CPUARMState *env, int el, uint32_t bit) +{ + return (arm_sctlr(env, el) & bit) != 0; +} + +uint64_t HELPER(pacia)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apia, false); +} + +uint64_t HELPER(pacib)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apib, false); +} + +uint64_t HELPER(pacda)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apda, true); +} + +uint64_t HELPER(pacdb)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_addpac(env, x, y, &env->keys.apdb, true); +} + +uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y) +{ + uint64_t pac; + + pauth_check_trap(env, arm_current_el(env), GETPC()); + pac = pauth_computepac(env, x, y, env->keys.apga); + + return pac & 0xffffffff00000000ull; +} + +uint64_t HELPER(autia)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apia, false, 0); +} + +uint64_t HELPER(autib)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnIB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apib, false, 1); +} + +uint64_t HELPER(autda)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDA)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apda, true, 0); +} + +uint64_t HELPER(autdb)(CPUARMState *env, uint64_t x, uint64_t y) +{ + int el = arm_current_el(env); + if (!pauth_key_enabled(env, el, SCTLR_EnDB)) { + return x; + } + pauth_check_trap(env, el, GETPC()); + return pauth_auth(env, x, y, &env->keys.apdb, true, 1); +} + +uint64_t HELPER(xpaci)(CPUARMState *env, uint64_t a) +{ + return pauth_strip(env, a, false); +} + +uint64_t HELPER(xpacd)(CPUARMState *env, uint64_t a) +{ + return pauth_strip(env, a, true); +} diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c new file mode 100644 index 0000000000..1e67fcac30 --- /dev/null +++ b/target/arm/tcg/sme_helper.c @@ -0,0 +1,1168 @@ +/* + * ARM SME Operations + * + * Copyright (c) 2022 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "tcg/tcg-gvec-desc.h" +#include "exec/helper-proto.h" +#include "exec/cpu_ldst.h" +#include "exec/exec-all.h" +#include "qemu/int128.h" +#include "fpu/softfloat.h" +#include "vec_internal.h" +#include "sve_ldst_internal.h" + +void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask) +{ + aarch64_set_svcr(env, val, mask); +} + +void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl) +{ + uint32_t i; + + /* + * Special case clearing the entire ZA space. + * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any + * parts of the ZA storage outside of SVL. + */ + if (imm == 0xff) { + memset(env->zarray, 0, sizeof(env->zarray)); + return; + } + + /* + * Recall that ZAnH.D[m] is spread across ZA[n+8*m], + * so each row is discontiguous within ZA[]. + */ + for (i = 0; i < svl; i++) { + if (imm & (1 << (i % 8))) { + memset(&env->zarray[i], 0, svl); + } + } +} + + +/* + * When considering the ZA storage as an array of elements of + * type T, the index within that array of the Nth element of + * a vertical slice of a tile can be calculated like this, + * regardless of the size of type T. This is because the tiles + * are interleaved, so if type T is size N bytes then row 1 of + * the tile is N rows away from row 0. The division by N to + * convert a byte offset into an array index and the multiplication + * by N to convert from vslice-index-within-the-tile to + * the index within the ZA storage cancel out. + */ +#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg)) + +/* + * When doing byte arithmetic on the ZA storage, the element + * byteoff bytes away in a tile vertical slice is always this + * many bytes away in the ZA storage, regardless of the + * size of the tile element, assuming that byteoff is a multiple + * of the element size. Again this is because of the interleaving + * of the tiles. For instance if we have 1 byte per element then + * each row of the ZA storage has one byte of the vslice data, + * and (counting from 0) byte 8 goes in row 8 of the storage + * at offset (8 * row-size-in-bytes). + * If we have 8 bytes per element then each row of the ZA storage + * has 8 bytes of the data, but there are 8 interleaved tiles and + * so byte 8 of the data goes into row 1 of the tile, + * which is again row 8 of the storage, so the offset is still + * (8 * row-size-in-bytes). Similarly for other element sizes. + */ +#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg)) + + +/* + * Move Zreg vector to ZArray column. 
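A toy model of the interleaving described above (not from QEMU; ROW_BYTES stands in for sizeof(ARMVectorReg)): element i of a vertical tile slice of N-byte elements lands in ZA row i * N, which is exactly what tile_vslice_index() and tile_vslice_offset() encode.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ROW_BYTES 32    /* toy stand-in for sizeof(ARMVectorReg) */

/* Same shape as the macros above, with the row size spelled out. */
#define toy_vslice_index(i)     ((i) * ROW_BYTES)
#define toy_vslice_offset(off)  ((off) * ROW_BYTES)

int main(void)
{
    typedef uint32_t T;                       /* 4-byte tile elements */

    for (size_t i = 0; i < 8; i++) {
        size_t byteoff = toy_vslice_offset(i * sizeof(T));
        size_t za_row = byteoff / ROW_BYTES;

        /* Row 1 of the tile is sizeof(T) ZA rows away from row 0, and so on. */
        assert(za_row == i * sizeof(T));
        /* The same element as an index into a T[] view of the ZA storage. */
        assert(byteoff / sizeof(T) == toy_vslice_index(i));
    }
    printf("vertical-slice elements are sizeof(T) ZA rows apart\n");
    return 0;
}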
+ */ +#define DO_MOVA_C(NAME, TYPE, H) \ +void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \ +{ \ + int i, oprsz = simd_oprsz(desc); \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \ + } \ + i += sizeof(TYPE); \ + pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +DO_MOVA_C(sme_mova_cz_b, uint8_t, H1) +DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2) +DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4) + +void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 8; + uint8_t *pg = vg; + uint64_t *n = vn; + uint64_t *a = za; + + for (i = 0; i < oprsz; i++) { + if (pg[H1(i)] & 1) { + a[tile_vslice_index(i)] = n[i]; + } + } +} + +void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 16; + uint16_t *pg = vg; + Int128 *n = vn; + Int128 *a = za; + + /* + * Int128 is used here simply to copy 16 bytes, and to simplify + * the address arithmetic. + */ + for (i = 0; i < oprsz; i++) { + if (pg[H2(i)] & 1) { + a[tile_vslice_index(i)] = n[i]; + } + } +} + +#undef DO_MOVA_C + +/* + * Move ZArray column to Zreg vector. + */ +#define DO_MOVA_Z(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \ +{ \ + int i, oprsz = simd_oprsz(desc); \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \ + } \ + i += sizeof(TYPE); \ + pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1) +DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2) +DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4) + +void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 8; + uint8_t *pg = vg; + uint64_t *d = vd; + uint64_t *a = za; + + for (i = 0; i < oprsz; i++) { + if (pg[H1(i)] & 1) { + d[i] = a[tile_vslice_index(i)]; + } + } +} + +void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc) +{ + int i, oprsz = simd_oprsz(desc) / 16; + uint16_t *pg = vg; + Int128 *d = vd; + Int128 *a = za; + + /* + * Int128 is used here simply to copy 16 bytes, and to simplify + * the address arithmetic. + */ + for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) { + if (pg[H2(i)] & 1) { + d[i] = a[tile_vslice_index(i)]; + } + } +} + +#undef DO_MOVA_Z + +/* + * Clear elements in a tile slice comprising len bytes. 
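The DO_MOVA_* expanders above all rely on the same predicate-walking idiom: an SVE/SME predicate holds one bit per vector byte, so a 16-bit chunk of it governs 16 bytes of vector, and for an N-byte element only the lowest of its N bits is consulted, the rest being discarded by pg >>= sizeof(TYPE). A standalone sketch for 4-byte elements (not from QEMU, and assuming a little-endian host where the H*() swizzling macros would be the identity):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy the active 4-byte elements of src to dst; one predicate bit per byte. */
static void copy_active_u32(uint32_t *dst, const uint32_t *src,
                            const uint8_t *pred, int opr_sz)
{
    for (int i = 0; i < opr_sz; ) {
        uint16_t pg;

        memcpy(&pg, pred + (i >> 3), 2);    /* 16 predicate bits = 16 bytes */
        do {
            if (pg & 1) {
                dst[i / 4] = src[i / 4];
            }
            i += sizeof(uint32_t);
            pg >>= sizeof(uint32_t);        /* skip this element's upper bits */
        } while (i & 15);
    }
}

int main(void)
{
    uint32_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, dst[8] = { 0 };
    /* Elements 0 and 2 active in the first 16 bytes, element 7 in the second. */
    uint8_t pred[4] = { 0x01, 0x01, 0x00, 0x10 };

    copy_active_u32(dst, src, pred, sizeof(src));
    for (int i = 0; i < 8; i++) {
        printf("%u ", dst[i]);              /* 1 0 3 0 0 0 0 8 */
    }
    printf("\n");
    return 0;
}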
+ */ + +typedef void ClearFn(void *ptr, size_t off, size_t len); + +static void clear_horizontal(void *ptr, size_t off, size_t len) +{ + memset(ptr + off, 0, len); +} + +static void clear_vertical_b(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; ++i) { + *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_h(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 2) { + *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_s(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 4) { + *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_d(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 8) { + *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0; + } +} + +static void clear_vertical_q(void *vptr, size_t off, size_t len) +{ + for (size_t i = 0; i < len; i += 16) { + memset(vptr + tile_vslice_offset(i + off), 0, 16); + } +} + +/* + * Copy elements from an array into a tile slice comprising len bytes. + */ + +typedef void CopyFn(void *dst, const void *src, size_t len); + +static void copy_horizontal(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void copy_vertical_b(void *vdst, const void *vsrc, size_t len) +{ + const uint8_t *src = vsrc; + uint8_t *dst = vdst; + size_t i; + + for (i = 0; i < len; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_h(void *vdst, const void *vsrc, size_t len) +{ + const uint16_t *src = vsrc; + uint16_t *dst = vdst; + size_t i; + + for (i = 0; i < len / 2; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_s(void *vdst, const void *vsrc, size_t len) +{ + const uint32_t *src = vsrc; + uint32_t *dst = vdst; + size_t i; + + for (i = 0; i < len / 4; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_d(void *vdst, const void *vsrc, size_t len) +{ + const uint64_t *src = vsrc; + uint64_t *dst = vdst; + size_t i; + + for (i = 0; i < len / 8; ++i) { + dst[tile_vslice_index(i)] = src[i]; + } +} + +static void copy_vertical_q(void *vdst, const void *vsrc, size_t len) +{ + for (size_t i = 0; i < len; i += 16) { + memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16); + } +} + +/* + * Host and TLB primitives for vertical tile slice addressing. + */ + +#define DO_LD(NAME, TYPE, HOST, TLB) \ +static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + TYPE val = HOST(host); \ + *(TYPE *)(za + tile_vslice_offset(off)) = val; \ +} \ +static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ + intptr_t off, target_ulong addr, uintptr_t ra) \ +{ \ + TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \ + *(TYPE *)(za + tile_vslice_offset(off)) = val; \ +} + +#define DO_ST(NAME, TYPE, HOST, TLB) \ +static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ + HOST(host, val); \ +} \ +static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \ + intptr_t off, target_ulong addr, uintptr_t ra) \ +{ \ + TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \ + TLB(env, useronly_clean_ptr(addr), val, ra); \ +} + +/* + * The ARMVectorReg elements are stored in host-endian 64-bit units. + * For 128-bit quantities, the sequence defined by the Elem[] pseudocode + * corresponds to storing the two 64-bit pieces in little-endian order. 
+ */ +#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \ +static inline void HNAME##_host(void *za, intptr_t off, void *host) \ +{ \ + uint64_t val0 = HOST(host), val1 = HOST(host + 8); \ + uint64_t *ptr = za + off; \ + ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ +} \ +static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + HNAME##_host(za, tile_vslice_offset(off), host); \ +} \ +static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \ + uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \ + uint64_t *ptr = za + off; \ + ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \ +} \ +static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ +} + +#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \ +static inline void HNAME##_host(void *za, intptr_t off, void *host) \ +{ \ + uint64_t *ptr = za + off; \ + HOST(host, ptr[BE]); \ + HOST(host + 1, ptr[!BE]); \ +} \ +static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \ +{ \ + HNAME##_host(za, tile_vslice_offset(off), host); \ +} \ +static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + uint64_t *ptr = za + off; \ + TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \ + TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \ +} \ +static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \ + target_ulong addr, uintptr_t ra) \ +{ \ + HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \ +} + +DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra) +DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra) +DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra) +DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra) +DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra) +DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra) +DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra) + +DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra) +DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra) + +DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra) +DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra) +DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra) +DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra) +DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra) +DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra) +DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra) + +DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra) +DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra) + +#undef DO_LD +#undef DO_ST +#undef DO_LDQ +#undef DO_STQ + +/* + * Common helper for all contiguous predicated loads. + */ + +static inline QEMU_ALWAYS_INLINE +void sme_ld1(CPUARMState *env, void *za, uint64_t *vg, + const target_ulong addr, uint32_t desc, const uintptr_t ra, + const int esz, uint32_t mtedesc, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn, + ClearFn *clr_fn, + CopyFn *cpy_fn) +{ + const intptr_t reg_max = simd_oprsz(desc); + const intptr_t esize = 1 << esz; + intptr_t reg_off, reg_last; + SVEContLdSt info; + void *host; + int flags; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { + /* The entire predicate was false; no load occurs. 
*/ + clr_fn(za, 0, reg_max); + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, + BP_MEM_READ, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, + mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + ARMVectorReg scratch = { }; + + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + tlb_fn(env, &scratch, reg_off, addr + reg_off, ra); + } + reg_off += esize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + + cpy_fn(za, &scratch, reg_max); + return; +#endif + } + + /* The entire operation is in RAM, on valid pages. */ + + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + if (!vertical) { + memset(za, 0, reg_max); + } else if (reg_off) { + clr_fn(za, 0, reg_off); + } + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } else if (vertical) { + clr_fn(za, reg_off, esize); + } + reg_off += esize; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + reg_off = info.reg_off_split; + if (unlikely(reg_off >= 0)) { + tlb_fn(env, za, reg_off, addr + reg_off, ra); + } + + reg_off = info.reg_off_first[1]; + if (unlikely(reg_off >= 0)) { + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } else if (vertical) { + clr_fn(za, reg_off, esize); + } + reg_off += esize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, + target_ulong addr, uint32_t desc, uintptr_t ra, + const int esz, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn, + ClearFn *clr_fn, + CopyFn *cpy_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical, + host_fn, tlb_fn, clr_fn, cpy_fn); +} + +#define DO_LD(L, END, ESZ) \ +void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ + sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ + clear_horizontal, copy_horizontal); \ +} \ +void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ + sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ + clear_vertical_##L, copy_vertical_##L); \ +} \ +void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ + sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \ + clear_horizontal, copy_horizontal); \ +} \ +void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ + sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \ + clear_vertical_##L, copy_vertical_##L); \ +} + +DO_LD(b, , MO_8) +DO_LD(h, _be, MO_16) +DO_LD(h, _le, MO_16) +DO_LD(s, _be, MO_32) +DO_LD(s, _le, MO_32) +DO_LD(d, _be, MO_64) +DO_LD(d, _le, MO_64) +DO_LD(q, _be, MO_128) +DO_LD(q, _le, MO_128) + +#undef DO_LD + +/* + * Common helper for all contiguous predicated stores. + */ + +static inline QEMU_ALWAYS_INLINE +void sme_st1(CPUARMState *env, void *za, uint64_t *vg, + const target_ulong addr, uint32_t desc, const uintptr_t ra, + const int esz, uint32_t mtedesc, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const intptr_t reg_max = simd_oprsz(desc); + const intptr_t esize = 1 << esz; + intptr_t reg_off, reg_last; + SVEContLdSt info; + void *host; + int flags; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) { + /* The entire predicate was false; no store occurs. */ + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize, + BP_MEM_WRITE, ra); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize, + mtedesc, ra); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. We cannot avoid + * this fault and will leave with the store incomplete. 
+ */ + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + tlb_fn(env, za, reg_off, addr + reg_off, ra); + } + reg_off += esize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + return; +#endif + } + + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } + reg_off += 1 << esz; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + reg_off = info.reg_off_split; + if (unlikely(reg_off >= 0)) { + tlb_fn(env, za, reg_off, addr + reg_off, ra); + } + + reg_off = info.reg_off_first[1]; + if (unlikely(reg_off >= 0)) { + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + host_fn(za, reg_off, host + reg_off); + } + reg_off += 1 << esz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, + uint32_t desc, uintptr_t ra, int esz, bool vertical, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc, + vertical, host_fn, tlb_fn); +} + +#define DO_ST(L, END, ESZ) \ +void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \ + sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ +} \ +void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \ + sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ +} \ +void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \ + sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \ +} \ +void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \ + sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \ +} + +DO_ST(b, , MO_8) +DO_ST(h, _be, MO_16) +DO_ST(h, _le, MO_16) +DO_ST(s, _be, MO_32) +DO_ST(s, _le, MO_32) +DO_ST(d, _be, MO_64) +DO_ST(d, _le, MO_64) +DO_ST(q, _be, MO_128) +DO_ST(q, _le, MO_128) + +#undef DO_ST + +void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 4; + uint64_t *pn = vpn, *pm = vpm; + uint32_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ) { + uint64_t pa = pn[row >> 4]; + do { + if (pa & 1) { + for (col = 0; col < oprsz; ) { + uint64_t pb = pm[col >> 4]; + do { + if (pb & 1) { + zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)]; + } + pb >>= 4; + } while (++col & 15); + } + } + pa >>= 4; + } while (++row & 15); + } +} + +void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + uint8_t *pn = vpn, *pm = vpm; + uint64_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ++row) { + if (pn[H1(row)] & 1) { + for (col = 0; col < oprsz; ++col) { + if (pm[H1(col)] & 1) { + zda[tile_vslice_index(row) + col] += zn[col]; + } + } + } + } +} + +void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 4; + uint64_t *pn = vpn, *pm = vpm; + uint32_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ) { + uint64_t pa = pn[row >> 4]; + do { + if (pa & 1) { + uint32_t zn_row = zn[H4(row)]; + for (col = 0; col < oprsz; ) { + uint64_t pb = pm[col >> 4]; + do { + if (pb & 1) { + zda[tile_vslice_index(row) + H4(col)] += zn_row; + } + pb >>= 4; + } while (++col & 15); + } + } + pa >>= 4; + } while (++row & 15); + } +} + +void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + uint8_t *pn = vpn, *pm = vpm; + uint64_t *zda = vzda, *zn = vzn; + + for (row = 0; row < oprsz; ++row) { + if (pn[H1(row)] & 1) { + uint64_t zn_row = zn[row]; + for (col = 0; col < oprsz; ++col) { + if (pm[H1(col)] & 1) { + zda[tile_vslice_index(row) + col] += zn_row; + } + } + } + } +} + +void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, void *vst, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + uint32_t neg = simd_data(desc) << 31; + 
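An aside on the neg value initialised just above (sketch, not from QEMU): it is XORed into each float32 input a few lines further down, which flips only the IEEE sign bit and therefore negates the operand exactly, without going through the softfloat machinery.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Negate a float by XORing bit 31, the way the helper applies "neg". */
static float negate_via_xor(float f)
{
    uint32_t bits;

    memcpy(&bits, &f, sizeof(bits));
    bits ^= 1u << 31;
    memcpy(&f, &bits, sizeof(bits));
    return f;
}

int main(void)
{
    printf("%f %f %f\n", negate_via_xor(1.5f), negate_via_xor(-0.25f),
           negate_via_xor(0.0f));    /* -1.500000 0.250000 -0.000000 */
    return 0;
}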
uint16_t *pn = vpn, *pm = vpm; + float_status fpst; + + /* + * Make a copy of float_status because this operation does not + * update the cumulative fp exception status. It also produces + * default nans. + */ + fpst = *(float_status *)vst; + set_default_nan_mode(true, &fpst); + + for (row = 0; row < oprsz; ) { + uint16_t pa = pn[H2(row >> 4)]; + do { + if (pa & 1) { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg; + + for (col = 0; col < oprsz; ) { + uint16_t pb = pm[H2(col >> 4)]; + do { + if (pb & 1) { + uint32_t *a = vza_row + H1_4(col); + uint32_t *m = vzm + H1_4(col); + *a = float32_muladd(n, *m, *a, 0, vst); + } + col += 4; + pb >>= 4; + } while (col & 15); + } + } + row += 4; + pa >>= 4; + } while (row & 15); + } +} + +void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, void *vst, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + uint64_t neg = (uint64_t)simd_data(desc) << 63; + uint64_t *za = vza, *zn = vzn, *zm = vzm; + uint8_t *pn = vpn, *pm = vpm; + float_status fpst = *(float_status *)vst; + + set_default_nan_mode(true, &fpst); + + for (row = 0; row < oprsz; ++row) { + if (pn[H1(row)] & 1) { + uint64_t *za_row = &za[tile_vslice_index(row)]; + uint64_t n = zn[row] ^ neg; + + for (col = 0; col < oprsz; ++col) { + if (pm[H1(col)] & 1) { + uint64_t *a = &za_row[col]; + *a = float64_muladd(n, zm[col], *a, 0, &fpst); + } + } + } + } +} + +/* + * Alter PAIR as needed for controlling predicates being false, + * and for NEG on an enabled row element. + */ +static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg) +{ + /* + * The pseudocode uses a conditional negate after the conditional zero. + * It is simpler here to unconditionally negate before conditional zero. + */ + pair ^= neg; + if (!(pg & 1)) { + pair &= 0xffff0000u; + } + if (!(pg & 4)) { + pair &= 0x0000ffffu; + } + return pair; +} + +static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2, + float_status *s_std, float_status *s_odd) +{ + float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std); + float64 e1c = float16_to_float64(e1 >> 16, true, s_std); + float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std); + float64 e2c = float16_to_float64(e2 >> 16, true, s_std); + float64 t64; + float32 t32; + + /* + * The ARM pseudocode function FPDot performs both multiplies + * and the add with a single rounding operation. Emulate this + * by performing the first multiply in round-to-odd, then doing + * the second multiply as fused multiply-add, and rounding to + * float32 all in one step. + */ + t64 = float64_mul(e1r, e2r, s_odd); + t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std); + + /* This conversion is exact, because we've already rounded. */ + t32 = float64_to_float32(t64, s_std); + + /* The final accumulation step is not fused. */ + return float32_add(sum, t32, s_std); +} + +void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, void *vst, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + uint32_t neg = simd_data(desc) * 0x80008000u; + uint16_t *pn = vpn, *pm = vpm; + float_status fpst_odd, fpst_std; + + /* + * Make a copy of float_status because this operation does not + * update the cumulative fp exception status. It also produces + * default nans. Make a second copy with round-to-odd -- see above. 
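A standalone sketch of f16mop_adj_pair() above (not from QEMU): the 32-bit value carries two fp16 elements, predicate bit 0 governs the low half and bit 2 the high half, and the conditional negate is applied as an unconditional XOR before the conditional zeroing.

#include <stdint.h>
#include <stdio.h>

static uint32_t adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
{
    pair ^= neg;                    /* negate first, unconditionally */
    if (!(pg & 1)) {
        pair &= 0xffff0000u;        /* low element inactive: zero it  */
    }
    if (!(pg & 4)) {
        pair &= 0x0000ffffu;        /* high element inactive: zero it */
    }
    return pair;
}

int main(void)
{
    uint32_t pair = 0x3c00bc00u;    /* fp16 1.0 in the high half, -1.0 low */
    uint32_t neg = 0x80008000u;     /* negate both halves */

    printf("0x%08x\n", adj_pair(pair, 0x5, 0));    /* both active : 0x3c00bc00 */
    printf("0x%08x\n", adj_pair(pair, 0x1, 0));    /* low only    : 0x0000bc00 */
    printf("0x%08x\n", adj_pair(pair, 0x5, neg));  /* negated     : 0xbc003c00 */
    return 0;
}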
+ */ + fpst_std = *(float_status *)vst; + set_default_nan_mode(true, &fpst_std); + fpst_odd = fpst_std; + set_float_rounding_mode(float_round_to_odd, &fpst_odd); + + for (row = 0; row < oprsz; ) { + uint16_t prow = pn[H2(row >> 4)]; + do { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)); + + n = f16mop_adj_pair(n, prow, neg); + + for (col = 0; col < oprsz; ) { + uint16_t pcol = pm[H2(col >> 4)]; + do { + if (prow & pcol & 0b0101) { + uint32_t *a = vza_row + H1_4(col); + uint32_t m = *(uint32_t *)(vzm + H1_4(col)); + + m = f16mop_adj_pair(m, pcol, 0); + *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd); + + col += 4; + pcol >>= 4; + } + } while (col & 15); + } + row += 4; + prow >>= 4; + } while (row & 15); + } +} + +void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn, + void *vpm, uint32_t desc) +{ + intptr_t row, col, oprsz = simd_maxsz(desc); + uint32_t neg = simd_data(desc) * 0x80008000u; + uint16_t *pn = vpn, *pm = vpm; + + for (row = 0; row < oprsz; ) { + uint16_t prow = pn[H2(row >> 4)]; + do { + void *vza_row = vza + tile_vslice_offset(row); + uint32_t n = *(uint32_t *)(vzn + H1_4(row)); + + n = f16mop_adj_pair(n, prow, neg); + + for (col = 0; col < oprsz; ) { + uint16_t pcol = pm[H2(col >> 4)]; + do { + if (prow & pcol & 0b0101) { + uint32_t *a = vza_row + H1_4(col); + uint32_t m = *(uint32_t *)(vzm + H1_4(col)); + + m = f16mop_adj_pair(m, pcol, 0); + *a = bfdotadd(*a, n, m); + + col += 4; + pcol >>= 4; + } + } while (col & 15); + } + row += 4; + prow >>= 4; + } while (row & 15); + } +} + +typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool); + +static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm, + uint8_t *pn, uint8_t *pm, + uint32_t desc, IMOPFn *fn) +{ + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; + bool neg = simd_data(desc); + + for (row = 0; row < oprsz; ++row) { + uint8_t pa = pn[H1(row)]; + uint64_t *za_row = &za[tile_vslice_index(row)]; + uint64_t n = zn[row]; + + for (col = 0; col < oprsz; ++col) { + uint8_t pb = pm[H1(col)]; + uint64_t *a = &za_row[col]; + + *a = fn(n, zm[col], *a, pa & pb, neg); + } + } +} + +#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \ +static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ +{ \ + uint32_t sum0 = 0, sum1 = 0; \ + /* Apply P to N as a mask, making the inactive elements 0. */ \ + n &= expand_pred_b(p); \ + sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \ + sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \ + sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \ + sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \ + if (neg) { \ + sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \ + } else { \ + sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \ + } \ + return ((uint64_t)sum1 << 32) | sum0; \ +} + +#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \ +static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ +{ \ + uint64_t sum = 0; \ + /* Apply P to N as a mask, making the inactive elements 0. */ \ + n &= expand_pred_h(p); \ + sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + return neg ? 
a - sum : a + sum; \ +} + +DEF_IMOP_32(smopa_s, int8_t, int8_t) +DEF_IMOP_32(umopa_s, uint8_t, uint8_t) +DEF_IMOP_32(sumopa_s, int8_t, uint8_t) +DEF_IMOP_32(usmopa_s, uint8_t, int8_t) + +DEF_IMOP_64(smopa_d, int16_t, int16_t) +DEF_IMOP_64(umopa_d, uint16_t, uint16_t) +DEF_IMOP_64(sumopa_d, int16_t, uint16_t) +DEF_IMOP_64(usmopa_d, uint16_t, int16_t) + +#define DEF_IMOPH(NAME) \ + void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \ + void *vpm, uint32_t desc) \ + { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); } + +DEF_IMOPH(smopa_s) +DEF_IMOPH(umopa_s) +DEF_IMOPH(sumopa_s) +DEF_IMOPH(usmopa_s) +DEF_IMOPH(smopa_d) +DEF_IMOPH(umopa_d) +DEF_IMOPH(sumopa_d) +DEF_IMOPH(usmopa_d) diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c new file mode 100644 index 0000000000..521fc9b969 --- /dev/null +++ b/target/arm/tcg/sve_helper.c @@ -0,0 +1,7483 @@ +/* + * ARM SVE Operations + * + * Copyright (c) 2018 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "fpu/softfloat.h" +#include "tcg/tcg.h" +#include "vec_internal.h" +#include "sve_ldst_internal.h" + + +/* Return a value for NZCV as per the ARM PredTest pseudofunction. + * + * The return value has bit 31 set if N is set, bit 1 set if Z is clear, + * and bit 0 set if C is set. Compare the definitions of these variables + * within CPUARMState. + */ + +/* For no G bits set, NZCV = C. */ +#define PREDTEST_INIT 1 + +/* This is an iterative function, called for each Pd and Pg word + * moving forward. + */ +static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) +{ + if (likely(g)) { + /* Compute N from first D & G. + Use bit 2 to signal first G bit seen. */ + if (!(flags & 4)) { + flags |= ((d & (g & -g)) != 0) << 31; + flags |= 4; + } + + /* Accumulate Z from each D & G. */ + flags |= ((d & g) != 0) << 1; + + /* Compute C from last !(D & G). Replace previous. */ + flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); + } + return flags; +} + +/* This is an iterative function, called for each Pd and Pg word + * moving backward. + */ +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) +{ + if (likely(g)) { + /* Compute C from first (i.e last) !(D & G). + Use bit 2 to signal first G bit seen. */ + if (!(flags & 4)) { + flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ + flags |= (d & pow2floor(g)) == 0; + } + + /* Accumulate Z from each D & G. */ + flags |= ((d & g) != 0) << 1; + + /* Compute N from last (i.e first) D & G. Replace previous. */ + flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); + } + return flags; +} + +/* The same for a single word predicate. 
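A standalone check of iter_predtest_fwd() above for a single 64-bit predicate word (not from QEMU; pow2floor() and deposit32() are open-coded): with all eight byte elements governed and only the first one true, PredTest must report N=1 (first active element true), Z=0 (not every active element false) and C=1 (last active element false). Bit 2 of the returned word is only the internal "first G bit seen" marker.

#include <stdint.h>
#include <stdio.h>

#define PREDTEST_INIT 1

static uint64_t pow2floor_local(uint64_t v)     /* highest set bit of v */
{
    while (v & (v - 1)) {
        v &= v - 1;
    }
    return v;
}

/* Same logic as iter_predtest_fwd() above. */
static uint32_t predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (g) {
        if (!(flags & 4)) {
            flags |= (uint32_t)((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }
        flags |= ((d & g) != 0) << 1;
        flags = (flags & ~1u) | ((d & pow2floor_local(g)) == 0);
    }
    return flags;
}

int main(void)
{
    /* Eight governed byte elements, only element 0 true. */
    uint32_t flags = predtest_fwd(0x01, 0xff, PREDTEST_INIT);

    printf("N=%u Z=%u C=%u\n",
           flags >> 31,                    /* 1 */
           (unsigned)!(flags & 2),         /* 0 */
           flags & 1);                     /* 1 */
    return 0;
}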
*/ +uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) +{ + return iter_predtest_fwd(d, g, PREDTEST_INIT); +} + +/* The same for a multi-word predicate. */ +uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) +{ + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg; + uintptr_t i = 0; + + do { + flags = iter_predtest_fwd(d[i], g[i], flags); + } while (++i < words); + + return flags; +} + +/* Similarly for single word elements. */ +static inline uint64_t expand_pred_s(uint8_t byte) +{ + static const uint64_t word[] = { + [0x01] = 0x00000000ffffffffull, + [0x10] = 0xffffffff00000000ull, + [0x11] = 0xffffffffffffffffull, + }; + return word[byte & 0x11]; +} + +#define LOGICAL_PPPP(NAME, FUNC) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + uintptr_t opr_sz = simd_oprsz(desc); \ + uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ + uintptr_t i; \ + for (i = 0; i < opr_sz / 8; ++i) { \ + d[i] = FUNC(n[i], m[i], g[i]); \ + } \ +} + +#define DO_AND(N, M, G) (((N) & (M)) & (G)) +#define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) +#define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) +#define DO_ORR(N, M, G) (((N) | (M)) & (G)) +#define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) +#define DO_NOR(N, M, G) (~((N) | (M)) & (G)) +#define DO_NAND(N, M, G) (~((N) & (M)) & (G)) +#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) + +LOGICAL_PPPP(sve_and_pppp, DO_AND) +LOGICAL_PPPP(sve_bic_pppp, DO_BIC) +LOGICAL_PPPP(sve_eor_pppp, DO_EOR) +LOGICAL_PPPP(sve_sel_pppp, DO_SEL) +LOGICAL_PPPP(sve_orr_pppp, DO_ORR) +LOGICAL_PPPP(sve_orn_pppp, DO_ORN) +LOGICAL_PPPP(sve_nor_pppp, DO_NOR) +LOGICAL_PPPP(sve_nand_pppp, DO_NAND) + +#undef DO_AND +#undef DO_BIC +#undef DO_EOR +#undef DO_ORR +#undef DO_ORN +#undef DO_NOR +#undef DO_NAND +#undef DO_SEL +#undef LOGICAL_PPPP + +/* Fully general three-operand expander, controlled by a predicate. + * This is complicated by the host-endian storage of the register file. + */ +/* ??? I don't expect the compiler could ever vectorize this itself. + * With some tables we can convert bit masks to byte masks, and with + * extra care wrt byte/word ordering we could use gcc generic vectors + * and do 16 bytes at a time. + */ +#define DO_ZPZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i], mm = m[i]; \ + d[i] = OP(nn, mm); \ + } \ + } \ +} + +#define DO_AND(N, M) (N & M) +#define DO_EOR(N, M) (N ^ M) +#define DO_ORR(N, M) (N | M) +#define DO_BIC(N, M) (N & ~M) +#define DO_ADD(N, M) (N + M) +#define DO_SUB(N, M) (N - M) +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) +#define DO_ABD(N, M) ((N) >= (M) ? 
(N) - (M) : (M) - (N)) +#define DO_MUL(N, M) (N * M) + + +/* + * We must avoid the C undefined behaviour cases: division by + * zero and signed division of INT_MIN by -1. Both of these + * have architecturally defined required results for Arm. + * We special case all signed divisions by -1 to avoid having + * to deduce the minimum integer for the type involved. + */ +#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) +#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) + +DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) +DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) +DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) +DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) + +DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) +DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) +DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) +DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) + +DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) +DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) +DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) +DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) + +DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) +DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) +DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) +DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) + +DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) +DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) +DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) +DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) + +DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) +DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) +DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) +DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) + +DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) +DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) +DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) +DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) + +DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) +DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) +DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) +DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) + +DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) +DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) +DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) +DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) + +DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) +DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) +DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) +DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) + +DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) +DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) +DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) +DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) + +DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) +DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) +DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) +DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) + +/* Because the computation type is at least twice as large as required, + these work for both signed and unsigned source types. 
*/ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_s(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) +{ + uint64_t lo, hi; + muls64(&lo, &hi, n, m); + return hi; +} + +static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) +{ + uint64_t lo, hi; + mulu64(&lo, &hi, n, m); + return hi; +} + +DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) +DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) +DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) +DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) + +DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) +DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) +DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) +DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) + +DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) +DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) +DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) +DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) + +DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) +DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) + +DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) +DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) + +/* Note that all bits of the shift are significant + and not modulo the element size. */ +#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) +#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) +#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) + +DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) +DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) +DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) + +DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) +DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) +DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) + +DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) +DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) +DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) + +DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) +DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) +DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) + +static inline uint16_t do_sadalp_h(int16_t n, int16_t m) +{ + int8_t n1 = n, n2 = n >> 8; + return m + n1 + n2; +} + +static inline uint32_t do_sadalp_s(int32_t n, int32_t m) +{ + int16_t n1 = n, n2 = n >> 16; + return m + n1 + n2; +} + +static inline uint64_t do_sadalp_d(int64_t n, int64_t m) +{ + int32_t n1 = n, n2 = n >> 32; + return m + n1 + n2; +} + +DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) +DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) +DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) + +static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) +{ + uint8_t n1 = n, n2 = n >> 8; + return m + n1 + n2; +} + +static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) +{ + uint16_t n1 = n, n2 = n >> 16; + return m + n1 + n2; +} + +static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) +{ + uint32_t n1 = n, n2 = n >> 32; + return m + n1 + n2; +} + +DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) +DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) +DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) + +#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL) +#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) +#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) +#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) + +DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) +DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, 
H1_2, do_srshl_h) +DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) +DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) + +#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) +#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) +#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) +#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) + +DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) +DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) +DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) +DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) + +/* + * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. + * We pass in a pointer to a dummy saturation field to trigger + * the saturating arithmetic but discard the information about + * whether it has occurred. + */ +#define do_sqshl_b(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) +#define do_sqshl_h(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) +#define do_sqshl_s(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) +#define do_sqshl_d(n, m) \ + ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) + +DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) +DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) +DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) +DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) + +#define do_uqshl_b(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) +#define do_uqshl_h(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) +#define do_uqshl_s(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) +#define do_uqshl_d(n, m) \ + ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) + +DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) +DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) +DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) +DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) + +#define do_sqrshl_b(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) +#define do_sqrshl_h(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) +#define do_sqrshl_s(n, m) \ + ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) +#define do_sqrshl_d(n, m) \ + ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) + +DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) +DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) +DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) +DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) + +#undef do_sqrshl_d + +#define do_uqrshl_b(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) +#define do_uqrshl_h(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) +#define do_uqrshl_s(n, m) \ + ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) +#define do_uqrshl_d(n, m) \ + ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); }) + +DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) +DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) +DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) +DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) + +#undef do_uqrshl_d + +#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) +#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) + +DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) +DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 
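/*
 * Illustrative standalone sketch (hypothetical, not part of the helper file
 * above; the name hadd_u64_sketch is invented for this aside): DO_HADD_D
 * computes the halving add without widening to 128 bits by halving each
 * operand first and then re-adding the carry that is lost when both low
 * bits are set.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t hadd_u64_sketch(uint64_t n, uint64_t m)
{
    /* Same shape as DO_HADD_D(n, m). */
    return (n >> 1) + (m >> 1) + (n & m & 1);
}

int main(void)
{
    assert(hadd_u64_sketch(3, 5) == 4);   /* (3 + 5) / 2 */
    assert(hadd_u64_sketch(3, 4) == 3);   /* floor((3 + 4) / 2) */
    /* Still correct where n + m would wrap a 64-bit register. */
    assert(hadd_u64_sketch(UINT64_MAX, UINT64_MAX) == UINT64_MAX);
    return 0;
}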
+DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) +DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) + +DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) +DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) +DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) +DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) + +#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) +#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) + +DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) +DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) +DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) +DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) + +DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) +DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) +DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) +DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) + +#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) +#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) + +DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) +DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) +DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) +DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) + +DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) +DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) +DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) +DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) + +static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) +{ + return val >= max ? max : val <= min ? min : val; +} + +#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) +#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) +#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) + +static inline int64_t do_sqadd_d(int64_t n, int64_t m) +{ + int64_t r = n + m; + if (((r ^ n) & ~(n ^ m)) < 0) { + /* Signed overflow. */ + return r < 0 ? INT64_MAX : INT64_MIN; + } + return r; +} + +DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) +DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) +DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) +DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) + +#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) +#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) +#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) + +static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) +{ + uint64_t r = n + m; + return r < n ? UINT64_MAX : r; +} + +DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) +DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) +DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) +DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) + +#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) +#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) +#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) + +static inline int64_t do_sqsub_d(int64_t n, int64_t m) +{ + int64_t r = n - m; + if (((r ^ n) & (n ^ m)) < 0) { + /* Signed overflow. */ + return r < 0 ? 
INT64_MAX : INT64_MIN; + } + return r; +} + +DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) +DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) +DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) +DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) + +#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) +#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) +#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) + +static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) +{ + return n > m ? n - m : 0; +} + +DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) +DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) +DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) +DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) + +#define DO_SUQADD_B(n, m) \ + do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) +#define DO_SUQADD_H(n, m) \ + do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) +#define DO_SUQADD_S(n, m) \ + do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) + +static inline int64_t do_suqadd_d(int64_t n, uint64_t m) +{ + uint64_t r = n + m; + + if (n < 0) { + /* Note that m - abs(n) cannot underflow. */ + if (r > INT64_MAX) { + /* Result is either very large positive or negative. */ + if (m > -n) { + /* m > abs(n), so r is a very large positive. */ + return INT64_MAX; + } + /* Result is negative. */ + } + } else { + /* Both inputs are positive: check for overflow. */ + if (r < m || r > INT64_MAX) { + return INT64_MAX; + } + } + return r; +} + +DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) +DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) +DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) +DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) + +#define DO_USQADD_B(n, m) \ + do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) +#define DO_USQADD_H(n, m) \ + do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) +#define DO_USQADD_S(n, m) \ + do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) + +static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) +{ + uint64_t r = n + m; + + if (m < 0) { + return n < -m ? 0 : r; + } + return r < n ? UINT64_MAX : r; +} + +DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) +DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) +DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) +DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) + +#undef DO_ZPZZ +#undef DO_ZPZZ_D + +/* + * Three operand expander, operating on element pairs. + * If the slot I is even, the elements from from VN {I, I+1}. + * If the slot I is odd, the elements from from VM {I-1, I}. + * Load all of the input elements in each pair before overwriting output. + */ +#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPE n0 = *(TYPE *)(vn + H(i)); \ + TYPE m0 = *(TYPE *)(vm + H(i)); \ + TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(n0, n1); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(m0, m1); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. 
*/ +#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 2) { \ + TYPE n0 = n[i], n1 = n[i + 1]; \ + TYPE m0 = m[i], m1 = m[i + 1]; \ + if (pg[H1(i)] & 1) { \ + d[i] = OP(n0, n1); \ + } \ + if (pg[H1(i + 1)] & 1) { \ + d[i + 1] = OP(m0, m1); \ + } \ + } \ +} + +DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) +DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) +DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) +DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) + +DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) +DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) +DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) +DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) + +DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) +DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) +DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) +DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) + +DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) +DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) +DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) +DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) + +DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) +DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) +DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) +DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) + +#undef DO_ZPZZ_PAIR +#undef DO_ZPZZ_PAIR_D + +#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPE n0 = *(TYPE *)(vn + H(i)); \ + TYPE m0 = *(TYPE *)(vm + H(i)); \ + TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + if (pg & 1) { \ + *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) +DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) + +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) +DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) + +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) +DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) + +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) +DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) + +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) +DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) + +#undef DO_ZPZZ_PAIR_FP + +/* Three-operand expander, controlled by a predicate, in which the + * third operand is "wide". 
That is, for D = N op M, the same 64-bit + * value of M is used with all of the narrower values of N. + */ +#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ + TYPEW mm = *(TYPEW *)(vm + i); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 7); \ + } \ +} + +DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) +DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) +DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) + +DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) +DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) +DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) + +DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) +DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) +DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) + +#undef DO_ZPZW + +/* Fully general two-operand expander, controlled by a predicate. + */ +#define DO_ZPZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i]; \ + d[i] = OP(nn); \ + } \ + } \ +} + +#define DO_CLS_B(N) (clrsb32(N) - 24) +#define DO_CLS_H(N) (clrsb32(N) - 16) + +DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) +DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) +DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) +DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) + +#define DO_CLZ_B(N) (clz32(N) - 24) +#define DO_CLZ_H(N) (clz32(N) - 16) + +DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) +DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) +DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) +DO_ZPZ_D(sve_clz_d, uint64_t, clz64) + +DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) +DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) +DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) +DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) + +#define DO_CNOT(N) (N == 0) + +DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) +DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) +DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) +DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) + +#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) + +DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) +DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) +DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) + +#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) + +DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) +DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) +DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) + +#define DO_NOT(N) (~N) + +DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) +DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) +DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) +DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) + +#define DO_SXTB(N) ((int8_t)N) +#define DO_SXTH(N) ((int16_t)N) +#define DO_SXTS(N) ((int32_t)N) +#define 
DO_UXTB(N) ((uint8_t)N) +#define DO_UXTH(N) ((uint16_t)N) +#define DO_UXTS(N) ((uint32_t)N) + +DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) +DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) +DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) +DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) +DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) +DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) + +DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) +DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) +DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) +DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) +DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) +DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) + +#define DO_ABS(N) (N < 0 ? -N : N) + +DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) +DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) +DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) +DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) + +#define DO_NEG(N) (-N) + +DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) +DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) +DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) +DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) + +DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) +DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) +DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) + +DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) +DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) + +DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) + +void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 2) { + if (pg[H1(i)] & 1) { + uint64_t n0 = n[i + 0]; + uint64_t n1 = n[i + 1]; + d[i + 0] = n1; + d[i + 1] = n0; + } + } +} + +DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) +DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) +DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) +DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) + +#define DO_SQABS(X) \ + ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ + x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) + +DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) +DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) +DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) +DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) + +#define DO_SQNEG(X) \ + ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ + x_ == min_ ? -min_ - 1 : -x_; }) + +DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) +DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) +DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) +DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) + +DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) +DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) + +/* Three-operand expander, unpredicated, in which the third operand is "wide". 
+ */ +#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + TYPEW mm = *(TYPEW *)(vm + i); \ + do { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + i += sizeof(TYPE); \ + } while (i & 7); \ + } \ +} + +DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) +DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) +DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) + +DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) +DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) +DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) + +DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) +DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) +DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) + +#undef DO_ZZW + +#undef DO_CLS_B +#undef DO_CLS_H +#undef DO_CLZ_B +#undef DO_CLZ_H +#undef DO_CNOT +#undef DO_FABS +#undef DO_FNEG +#undef DO_ABS +#undef DO_NEG +#undef DO_ZPZ +#undef DO_ZPZ_D + +/* + * Three-operand expander, unpredicated, in which the two inputs are + * selected from the top or bottom half of the wide column. + */ +#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ + } \ +} + +DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) +DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) +DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) + +DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) +DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) +DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) +DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) +DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) + +DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) +DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) +DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) +DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) +DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) + +DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) +DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) +DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) + +/* Note that the multiply cannot overflow, but the doubling can. 
*/ +static inline int16_t do_sqdmull_h(int16_t n, int16_t m) +{ + int16_t val = n * m; + return DO_SQADD_H(val, val); +} + +static inline int32_t do_sqdmull_s(int32_t n, int32_t m) +{ + int32_t val = n * m; + return DO_SQADD_S(val, val); +} + +static inline int64_t do_sqdmull_d(int64_t n, int64_t m) +{ + int64_t val = n * m; + return do_sqadd_d(val, val); +} + +DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) +DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) +DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) + +#undef DO_ZZZ_TB + +#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ + } \ +} + +DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) +DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) +DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) + +DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) +DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) +DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) + +DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) +DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) +DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) + +#undef DO_ZZZ_WTB + +#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ + intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ + for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ + TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ + *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ + } \ +} + +DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) +DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) +DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) +DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) + +#undef DO_ZZZ_NTB + +#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ + TYPEW aa = *(TYPEW *)(va + HW(i)); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ + } \ +} + +DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) +DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) +DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) +DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) +DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) + +DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) +DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 
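/*
 * Illustrative standalone sketch (hypothetical, not part of the helper file
 * above; sqdmull_h_sketch is an invented name): for the do_sqdmull_* helpers
 * above, the narrow-by-narrow product always fits the wide type, so only the
 * final doubling can saturate.  For byte inputs the single saturating case is
 * INT8_MIN * INT8_MIN, whose doubled product 32768 exceeds INT16_MAX.
 */
#include <assert.h>
#include <stdint.h>

static int16_t sqdmull_h_sketch(int8_t n, int8_t m)
{
    int32_t doubled = 2 * (int32_t)n * (int32_t)m;

    if (doubled > INT16_MAX) {
        return INT16_MAX;        /* saturate high */
    }
    if (doubled < INT16_MIN) {
        return INT16_MIN;        /* not reachable for 8-bit inputs */
    }
    return doubled;
}

int main(void)
{
    assert(sqdmull_h_sketch(3, 4) == 24);
    assert(sqdmull_h_sketch(-128, -128) == INT16_MAX);
    return 0;
}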
+DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) + +DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) +DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) +DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) + +#define DO_NMUL(N, M) -(N * M) + +DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) +DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) +DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) + +DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) +DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) +DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) + +#undef DO_ZZZW_ACC + +#define DO_XTNB(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ + *(TYPE *)(vd + i) = nn; \ + } \ +} + +#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + *(TYPEN *)(vd + i + odd) = OP(nn); \ + } \ +} + +#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) +#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) +#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) + +DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) +DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) +DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) + +DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) +DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) +DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) + +#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) +#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) +#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) + +DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) +DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) +DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) + +DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) +DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) +DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) + +DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) +DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) +DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) + +DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) +DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) +DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) + +#undef DO_XTNB +#undef DO_XTNT + +void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); + uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t *a = va, *n = vn; + uint64_t *d = vd, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint32_t e1 = a[2 * i + H4(0)]; + uint32_t e2 = n[2 * i + sel] ^ inv; + uint64_t c = extract64(m[i], 32, 1); + /* Compute and store the entire 33-bit result at once. 
*/ + d[i] = c + e1 + e2; + } +} + +void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int sel = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint64_t *d = vd, *a = va, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; i += 2) { + Int128 e1 = int128_make64(a[i]); + Int128 e2 = int128_make64(n[i + sel] ^ inv); + Int128 c = int128_make64(m[i + 1] & 1); + Int128 r = int128_add(int128_add(e1, e2), c); + d[i + 0] = int128_getlo(r); + d[i + 1] = int128_gethi(r); + } +} + +#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ + TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ + TYPEW aa = *(TYPEW *)(va + HW(i)); \ + *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ + } \ +} + +DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, + do_sqdmull_h, DO_SQADD_H) +DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, + do_sqdmull_s, DO_SQADD_S) +DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, + do_sqdmull_d, do_sqadd_d) + +DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, + do_sqdmull_h, DO_SQSUB_H) +DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, + do_sqdmull_s, DO_SQSUB_S) +DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, + do_sqdmull_d, do_sqsub_d) + +#undef DO_SQDMLAL + +#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ + int rot = simd_data(desc); \ + int sel_a = rot & 1, sel_b = sel_a ^ 1; \ + bool sub_r = rot == 1 || rot == 2; \ + bool sub_i = rot >= 2; \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + for (i = 0; i < opr_sz; i += 2) { \ + TYPE elt1_a = n[H(i + sel_a)]; \ + TYPE elt2_a = m[H(i + sel_a)]; \ + TYPE elt2_b = m[H(i + sel_b)]; \ + d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ + d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ + } \ +} + +#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) + +DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) +DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) + +#define DO_SQRDMLAH_B(N, M, A, S) \ + do_sqrdmlah_b(N, M, A, S, true) +#define DO_SQRDMLAH_H(N, M, A, S) \ + ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) +#define DO_SQRDMLAH_S(N, M, A, S) \ + ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) +#define DO_SQRDMLAH_D(N, M, A, S) \ + do_sqrdmlah_d(N, M, A, S, true) + +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) +DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) + +#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ + int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ + int sel_a = rot & 1, sel_b = sel_a ^ 1; \ + bool sub_r = rot == 1 || rot == 2; \ + bool sub_i = rot >= 2; \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ + TYPE elt2_a = m[H(i + idx + sel_a)]; \ + TYPE elt2_b = m[H(i + idx + sel_b)]; \ + for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ + TYPE elt1_a = n[H(i + j + sel_a)]; \ + d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ + d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ + } \ + } \ +} + +DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) +DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) + +DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) +DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) + +#undef DO_CMLA +#undef DO_CMLA_FUNC +#undef DO_CMLA_IDX_FUNC +#undef DO_SQRDMLAH_B +#undef DO_SQRDMLAH_H +#undef DO_SQRDMLAH_S +#undef DO_SQRDMLAH_D + +/* Note N and M are 4 elements bundled into one unit. */ +static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, + int sel_a, int sel_b, int sub_i) +{ + for (int i = 0; i <= 1; i++) { + int32_t elt1_r = (int8_t)(n >> (16 * i)); + int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); + int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); + int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); + + a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; + } + return a; +} + +static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, + int sel_a, int sel_b, int sub_i) +{ + for (int i = 0; i <= 1; i++) { + int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); + int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); + int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); + int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); + + a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; + } + return a; +} + +void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int rot = simd_data(desc); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); + uint32_t *d = vd, *n = vn, *m = vm, *a = va; + + for (int e = 0; e < opr_sz / 4; e++) { + d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); + } +} + +void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int rot = simd_data(desc); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? -1 : 1); + uint64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (int e = 0; e < opr_sz / 8; e++) { + d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); + } +} + +void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int opr_sz = simd_oprsz(desc); + int rot = extract32(desc, SIMD_DATA_SHIFT, 2); + int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? -1 : 1); + uint32_t *d = vd, *n = vn, *m = vm, *a = va; + + for (int seg = 0; seg < opr_sz / 4; seg += 4) { + uint32_t seg_m = m[seg + idx]; + for (int e = 0; e < 4; e++) { + d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], + sel_a, sel_b, sub_i); + } + } +} + +void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + int seg, opr_sz = simd_oprsz(desc); + int rot = extract32(desc, SIMD_DATA_SHIFT, 2); + int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); + int sel_a = rot & 1; + int sel_b = sel_a ^ 1; + int sub_i = (rot == 0 || rot == 3 ? -1 : 1); + uint64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (seg = 0; seg < opr_sz / 8; seg += 2) { + uint64_t seg_m = m[seg + idx]; + for (int e = 0; e < 2; e++) { + d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], + sel_a, sel_b, sub_i); + } + } +} + +#define DO_ZZXZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ + intptr_t i, j, idx = simd_data(desc); \ + TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[i]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = OP(n[i + j], mm, a[i + j]); \ + } \ + } \ +} + +#define DO_SQRDMLAH_H(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) +#define DO_SQRDMLAH_S(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) +#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) + +DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) +DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) +DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) + +#define DO_SQRDMLSH_H(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) +#define DO_SQRDMLSH_S(N, M, A) \ + ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) +#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) + +DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) +DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) +DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) + +#undef DO_ZZXZ + +#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ + for (i = 0; i < oprsz; i += 16) { \ + TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ + for (j = 0; 
j < 16; j += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ + TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ + *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ + } \ + } \ +} + +#define DO_MLA(N, M, A) (A + N * M) + +DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) +DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) +DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) +DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) + +#define DO_MLS(N, M, A) (A - N * M) + +DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) +DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) +DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) +DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) + +#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) +#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) + +DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) +DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) + +#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) +#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) + +DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) +DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) + +#undef DO_MLA +#undef DO_MLS +#undef DO_ZZXW + +#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ + for (i = 0; i < oprsz; i += 16) { \ + TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ + for (j = 0; j < 16; j += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ + *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ + } \ + } \ +} + +DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) +DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) + +DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) +DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) + +DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) +DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) + +#undef DO_ZZX + +#define DO_BITPERM(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + TYPE mm = *(TYPE *)(vm + i); \ + *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ + } \ +} + +static uint64_t bitextract(uint64_t data, uint64_t mask, int n) +{ + uint64_t res = 0; + int db, rb = 0; + + for (db = 0; db < n; ++db) { + if ((mask >> db) & 1) { + res |= ((data >> db) & 1) << rb; + ++rb; + } + } + return res; +} + +DO_BITPERM(sve2_bext_b, uint8_t, bitextract) +DO_BITPERM(sve2_bext_h, uint16_t, bitextract) +DO_BITPERM(sve2_bext_s, uint32_t, bitextract) +DO_BITPERM(sve2_bext_d, uint64_t, bitextract) + +static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) +{ + uint64_t res = 0; + int rb, db = 0; + + for (rb = 0; rb < n; ++rb) { + if ((mask >> rb) & 1) { + res |= ((data >> db) & 1) << rb; + ++db; + } + } + return res; +} + +DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) +DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) +DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) +DO_BITPERM(sve2_bdep_d, uint64_t, 
bitdeposit) + +static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) +{ + uint64_t resm = 0, resu = 0; + int db, rbm = 0, rbu = 0; + + for (db = 0; db < n; ++db) { + uint64_t val = (data >> db) & 1; + if ((mask >> db) & 1) { + resm |= val << rbm++; + } else { + resu |= val << rbu++; + } + } + + return resm | (resu << rbm); +} + +DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) +DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) +DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) +DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) + +#undef DO_BITPERM + +#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int sub_r = simd_data(desc); \ + if (sub_r) { \ + for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ + TYPE acc_r = *(TYPE *)(vn + H(i)); \ + TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE el2_r = *(TYPE *)(vm + H(i)); \ + TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + acc_r = ADD_OP(acc_r, el2_i); \ + acc_i = SUB_OP(acc_i, el2_r); \ + *(TYPE *)(vd + H(i)) = acc_r; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ + } \ + } else { \ + for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ + TYPE acc_r = *(TYPE *)(vn + H(i)); \ + TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ + TYPE el2_r = *(TYPE *)(vm + H(i)); \ + TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ + acc_r = SUB_OP(acc_r, el2_i); \ + acc_i = ADD_OP(acc_i, el2_r); \ + *(TYPE *)(vd + H(i)) = acc_r; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ + } \ + } \ +} + +DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) +DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) + +DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) +DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) +DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) +DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) + +#undef DO_CADD + +#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ + int shift = simd_data(desc) >> 1; \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ + *(TYPEW *)(vd + HW(i)) = nn << shift; \ + } \ +} + +DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) +DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2) +DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) + +DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) +DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) +DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) + +#undef DO_ZZI_SHLL + +/* Two-operand reduction expander, controlled by a predicate. + * The difference between TYPERED and TYPERET has to do with + * sign-extension. E.g. for SMAX, TYPERED must be signed, + * but TYPERET must be unsigned so that e.g. a 32-bit value + * is not sign-extended to the ABI uint64_t return type. + */ +/* ??? If we were to vectorize this by hand the reduction ordering + * would change. For integer operands, this is perfectly fine. 
+ */ +#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ +uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPERED ret = INIT; \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ + ret = OP(ret, nn); \ + } \ + i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ + } while (i & 15); \ + } \ + return (TYPERET)ret; \ +} + +#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ +uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPEE *n = vn; \ + uint8_t *pg = vg; \ + TYPER ret = INIT; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPEE nn = n[i]; \ + ret = OP(ret, nn); \ + } \ + } \ + return ret; \ +} + +DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) +DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) +DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) +DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) + +DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) +DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) +DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) +DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) + +DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) +DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) +DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) +DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) + +DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) +DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) +DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) + +DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) +DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) +DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) +DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) + +DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) +DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) +DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) +DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) + +DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) +DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) +DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) +DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) + +DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) +DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) +DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) +DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) + +DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) +DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) +DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) +DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) + +#undef DO_VPZ +#undef DO_VPZ_D + +/* Two vector operand, one scalar operand, unpredicated. 
*/ +#define DO_ZZI(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ + TYPE s = s64, *d = vd, *n = vn; \ + for (i = 0; i < opr_sz; ++i) { \ + d[i] = OP(n[i], s); \ + } \ +} + +#define DO_SUBR(X, Y) (Y - X) + +DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) +DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) +DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) +DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) + +DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) +DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) +DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) +DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) + +DO_ZZI(sve_smini_b, int8_t, DO_MIN) +DO_ZZI(sve_smini_h, int16_t, DO_MIN) +DO_ZZI(sve_smini_s, int32_t, DO_MIN) +DO_ZZI(sve_smini_d, int64_t, DO_MIN) + +DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) +DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) +DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) +DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) + +DO_ZZI(sve_umini_b, uint8_t, DO_MIN) +DO_ZZI(sve_umini_h, uint16_t, DO_MIN) +DO_ZZI(sve_umini_s, uint32_t, DO_MIN) +DO_ZZI(sve_umini_d, uint64_t, DO_MIN) + +#undef DO_ZZI + +#undef DO_AND +#undef DO_ORR +#undef DO_EOR +#undef DO_BIC +#undef DO_ADD +#undef DO_SUB +#undef DO_MAX +#undef DO_MIN +#undef DO_ABD +#undef DO_MUL +#undef DO_DIV +#undef DO_ASR +#undef DO_LSR +#undef DO_LSL +#undef DO_SUBR + +/* Similar to the ARM LastActiveElement pseudocode function, except the + result is multiplied by the element size. This includes the not found + indication; e.g. not found for esz=3 is -8. */ +static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) +{ + uint64_t mask = pred_esz_masks[esz]; + intptr_t i = words; + + do { + uint64_t this_g = g[--i] & mask; + if (this_g) { + return i * 64 + (63 - clz64(this_g)); + } + } while (i > 0); + return (intptr_t)-1 << esz; +} + +uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg; + intptr_t i = 0; + + do { + uint64_t this_d = d[i]; + uint64_t this_g = g[i]; + + if (this_g) { + if (!(flags & 4)) { + /* Set in D the first bit of G. */ + this_d |= this_g & -this_g; + d[i] = this_d; + } + flags = iter_predtest_fwd(this_d, this_g, flags); + } + } while (++i < words); + + return flags; +} + +uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg, esz_mask; + intptr_t i, next; + + next = last_active_element(vd, words, esz) + (1 << esz); + esz_mask = pred_esz_masks[esz]; + + /* Similar to the pseudocode for pnext, but scaled by ESZ + so that we find the correct bit. */ + if (next < words * 64) { + uint64_t mask = -1; + + if (next & 63) { + mask = ~((1ull << (next & 63)) - 1); + next &= -64; + } + do { + uint64_t this_g = g[next / 64] & esz_mask & mask; + if (this_g != 0) { + next = (next & -64) + ctz64(this_g); + break; + } + next += 64; + mask = -1; + } while (next < words * 64); + } + + i = 0; + do { + uint64_t this_d = 0; + if (i == next / 64) { + this_d = 1ull << (next & 63); + } + d[i] = this_d; + flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); + } while (++i < words); + + return flags; +} + +/* + * Copy Zn into Zd, and store zero into inactive elements. + * If inv, store zeros into the active elements. 
+ */ +void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t inv = -(uint64_t)(simd_data(desc) & 1); + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); + } +} + +void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t inv = -(uint64_t)(simd_data(desc) & 1); + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); + } +} + +void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t inv = -(uint64_t)(simd_data(desc) & 1); + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); + } +} + +void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + uint8_t inv = simd_data(desc); + + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); + } +} + +/* Three-operand expander, immediate operand, controlled by a predicate. + */ +#define DO_ZPZI(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPE imm = simd_data(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, imm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZI_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn; \ + TYPE imm = simd_data(desc); \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i]; \ + d[i] = OP(nn, imm); \ + } \ + } \ +} + +#define DO_SHR(N, M) (N >> M) +#define DO_SHL(N, M) (N << M) + +/* Arithmetic shift right for division. This rounds negative numbers + toward zero as per signed division. Therefore before shifting, + when N is negative, add 2**M-1. */ +#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) + +static inline uint64_t do_urshr(uint64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else if (sh == 64) { + return x >> 63; + } else { + return 0; + } +} + +static inline int64_t do_srshr(int64_t x, unsigned sh) +{ + if (likely(sh < 64)) { + return (x >> sh) + ((x >> (sh - 1)) & 1); + } else { + /* Rounding the sign bit always produces 0. 
*/ + return 0; + } +} + +DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) +DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) +DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) +DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) + +DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) +DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) +DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) +DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) + +DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) +DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) +DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) +DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) + +DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) +DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) +DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) +DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) + +/* SVE2 bitwise shift by immediate */ +DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) +DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) +DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) +DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) + +DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) +DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) +DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) +DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) + +DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) +DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) +DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) +DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) + +DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) +DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) +DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) +DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) + +#define do_suqrshl_b(n, m) \ + ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) +#define do_suqrshl_h(n, m) \ + ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) +#define do_suqrshl_s(n, m) \ + ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) +#define do_suqrshl_d(n, m) \ + ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) + +DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) +DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) +DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) +DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) + +#undef DO_ASRD +#undef DO_ZPZI +#undef DO_ZPZI_D + +#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + i); \ + *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ + } \ +} + +#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ + } \ +} + +DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) +DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) +DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) + +DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) +DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) +DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) + +DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) +DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) +DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) + +DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 
+DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) +DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) + +#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) +#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) +#define DO_SQSHRUN_D(x, sh) \ + do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX) + +DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) +DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) +DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) + +DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) +DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) +DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) + +#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) +#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) +#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) + +DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) +DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) +DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) + +DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) +DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) +DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) + +#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) +#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) +#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) + +DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) +DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) +DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) + +DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) +DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) +DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) + +#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) +#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) +#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) + +DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) +DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) +DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) + +DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) +DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) +DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) + +#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) +#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) +#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) + +DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) +DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) +DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) + +DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) +DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) +DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) + +#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) +#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) +#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) + +DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) +DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) +DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) + 
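[Editor's aside, not part of the patch: the rounding and saturating-narrow behaviour of the helpers above is easy to check in isolation. The sketch below merely mirrors do_urshr(), DO_ASRD and DO_UQRSHRN_H as defined earlier in this file; the names urshr, uqrshrn_h, asrd_b and the printf harness are mine.]

#include <stdint.h>
#include <stdio.h>

/* Mirror of do_urshr(): unsigned right shift, rounding to nearest.
 * As with the helper, sh is assumed to be at least 1. */
static uint64_t urshr(uint64_t x, unsigned sh)
{
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else if (sh == 64) {
        return x >> 63;
    }
    return 0;
}

/* Mirror of DO_UQRSHRN_H: round, then saturate to the narrow element. */
static uint8_t uqrshrn_h(uint16_t x, unsigned sh)
{
    uint64_t r = urshr(x, sh);
    return r < UINT8_MAX ? r : UINT8_MAX;
}

/* Mirror of DO_ASRD: arithmetic shift right rounding toward zero,
 * i.e. a bias of 2**m - 1 is added first for negative inputs. */
static int8_t asrd_b(int8_t n, unsigned m)
{
    return (n + (n < 0 ? (1 << m) - 1 : 0)) >> m;
}

int main(void)
{
    /* 5/4 = 1.25 rounds to 1; 6/4 = 1.5 rounds up to 2. */
    printf("urshr(5, 2) = %llu\n", (unsigned long long)urshr(5, 2));
    printf("urshr(6, 2) = %llu\n", (unsigned long long)urshr(6, 2));
    /* 0x1ff rounded right by 1 is 0x100, which saturates to 0xff. */
    printf("uqrshrn_h(0x1ff, 1) = 0x%x\n", uqrshrn_h(0x1ff, 1));
    /* -7/4 rounds toward zero to -1; a plain arithmetic shift gives -2. */
    printf("asrd_b(-7, 2) = %d, plain shift = %d\n", asrd_b(-7, 2), -7 >> 2);
    return 0;
}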
+DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) +DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) +DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) + +#undef DO_SHRNB +#undef DO_SHRNT + +#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + i); \ + TYPEW mm = *(TYPEW *)(vm + i); \ + *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ + } \ +} + +#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + TYPEW mm = *(TYPEW *)(vm + HW(i)); \ + *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ + } \ +} + +#define DO_ADDHN(N, M, SH) ((N + M) >> SH) +#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) +#define DO_SUBHN(N, M, SH) ((N - M) >> SH) +#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) + +DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) +DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) +DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) + +DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) +DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) +DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) + +DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) +DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) +DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) + +DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) +DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) +DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) + +DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) +DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) +DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) + +DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) +DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) +DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) + +DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) +DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) +DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) + +DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) +DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) +DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) + +#undef DO_RSUBHN +#undef DO_SUBHN +#undef DO_RADDHN +#undef DO_ADDHN + +#undef DO_BINOPNB + +/* Fully general four-operand expander, controlled by a predicate. 
+ */ +#define DO_ZPZZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ + void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + TYPE aa = *(TYPE *)(va + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZZZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ + void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *a = va, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE aa = a[i], nn = n[i], mm = m[i]; \ + d[i] = OP(aa, nn, mm); \ + } \ + } \ +} + +#define DO_MLA(A, N, M) (A + N * M) +#define DO_MLS(A, N, M) (A - N * M) + +DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) +DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) + +DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) +DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) + +DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) +DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) + +DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) +DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) + +#undef DO_MLA +#undef DO_MLS +#undef DO_ZPZZZ +#undef DO_ZPZZZ_D + +void HELPER(sve_index_b)(void *vd, uint32_t start, + uint32_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint8_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[H1(i)] = start + i * incr; + } +} + +void HELPER(sve_index_h)(void *vd, uint32_t start, + uint32_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 2; + uint16_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[H2(i)] = start + i * incr; + } +} + +void HELPER(sve_index_s)(void *vd, uint32_t start, + uint32_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[H4(i)] = start + i * incr; + } +} + +void HELPER(sve_index_d)(void *vd, uint64_t start, + uint64_t incr, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + for (i = 0; i < opr_sz; i += 1) { + d[i] = start + i * incr; + } +} + +void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t sh = simd_data(desc); + uint32_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + (m[i] << sh); + } +} + +void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t sh = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + (m[i] << sh); + } +} + +void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t sh = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); + } +} + +void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t sh = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); + } +} + +void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) +{ + /* These 
constants are cut-and-paste directly from the ARM pseudocode. */ + static const uint16_t coeff[] = { + 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, + 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, + 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, + 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / 2; + uint16_t *d = vd, *n = vn; + + for (i = 0; i < opr_sz; i++) { + uint16_t nn = n[i]; + intptr_t idx = extract32(nn, 0, 5); + uint16_t exp = extract32(nn, 5, 5); + d[i] = coeff[idx] | (exp << 10); + } +} + +void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) +{ + /* These constants are cut-and-paste directly from the ARM pseudocode. */ + static const uint32_t coeff[] = { + 0x000000, 0x0164d2, 0x02cd87, 0x043a29, + 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, + 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, + 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, + 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, + 0x1ef532, 0x20b051, 0x227043, 0x243516, + 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, + 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, + 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, + 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, + 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, + 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, + 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, + 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, + 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, + 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn; + + for (i = 0; i < opr_sz; i++) { + uint32_t nn = n[i]; + intptr_t idx = extract32(nn, 0, 6); + uint32_t exp = extract32(nn, 6, 8); + d[i] = coeff[idx] | (exp << 23); + } +} + +void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) +{ + /* These constants are cut-and-paste directly from the ARM pseudocode. 
*/ + static const uint64_t coeff[] = { + 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, + 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, + 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, + 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, + 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, + 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, + 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, + 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, + 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, + 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, + 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, + 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, + 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, + 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, + 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, + 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, + 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, + 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, + 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, + 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, + 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, + 0xFA7C1819E90D8ull, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + + for (i = 0; i < opr_sz; i++) { + uint64_t nn = n[i]; + intptr_t idx = extract32(nn, 0, 6); + uint64_t exp = extract32(nn, 6, 11); + d[i] = coeff[idx] | (exp << 52); + } +} + +void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 2; + uint16_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + uint16_t nn = n[i]; + uint16_t mm = m[i]; + if (mm & 1) { + nn = float16_one; + } + d[i] = nn ^ (mm & 2) << 14; + } +} + +void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + uint32_t nn = n[i]; + uint32_t mm = m[i]; + if (mm & 1) { + nn = float32_one; + } + d[i] = nn ^ (mm & 2) << 30; + } +} + +void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t mm = m[i]; + if (mm & 1) { + nn = float64_one; + } + d[i] = nn ^ (mm & 2) << 62; + } +} + +/* + * Signed saturating addition with scalar operand. 
+ */ + +void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int8_t)) { + *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); + } +} + +void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int16_t)) { + *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); + } +} + +void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int32_t)) { + *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); + } +} + +void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(int64_t)) { + *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); + } +} + +/* + * Unsigned saturating addition with scalar operand. + */ + +void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); + } +} + +void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { + *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); + } +} + +void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); + } +} + +void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); + } +} + +void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); + } +} + +/* Two operand predicated copy immediate with merge. All valid immediates + * can fit within 17 signed bits in the simd_data field. 
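+ *
+ * For example, a predicate byte of 0x05 expands (via expand_pred_b) to
+ * the mask 0x0000000000ff00ff, so bytes 0 and 2 of the result come from
+ * the replicated immediate and the remaining bytes are copied unchanged
+ * from Zn.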
+ */ +void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + mm = dup_const(MO_8, mm); + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t pp = expand_pred_b(pg[H1(i)]); + d[i] = (mm & pp) | (nn & ~pp); + } +} + +void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + mm = dup_const(MO_16, mm); + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t pp = expand_pred_h(pg[H1(i)]); + d[i] = (mm & pp) | (nn & ~pp); + } +} + +void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + mm = dup_const(MO_32, mm); + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + uint64_t pp = expand_pred_s(pg[H1(i)]); + d[i] = (mm & pp) | (nn & ~pp); + } +} + +void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, + uint64_t mm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i]; + d[i] = (pg[H1(i)] & 1 ? mm : nn); + } +} + +void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + val = dup_const(MO_8, val); + for (i = 0; i < opr_sz; i += 1) { + d[i] = val & expand_pred_b(pg[H1(i)]); + } +} + +void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + val = dup_const(MO_16, val); + for (i = 0; i < opr_sz; i += 1) { + d[i] = val & expand_pred_h(pg[H1(i)]); + } +} + +void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + val = dup_const(MO_32, val); + for (i = 0; i < opr_sz; i += 1) { + d[i] = val & expand_pred_s(pg[H1(i)]); + } +} + +void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = (pg[H1(i)] & 1 ? val : 0); + } +} + +/* Big-endian hosts need to frob the byte indices. If the copy + * happens to be 8-byte aligned, then no frobbing necessary. 
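+ *
+ * The combined low bits of destination, source and length select the
+ * widest safe access size (8-, 4-, 2- or 1-byte units), and the copy
+ * runs backwards when the destination starts inside the source buffer.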
+ */ +static void swap_memmove(void *vd, void *vs, size_t n) +{ + uintptr_t d = (uintptr_t)vd; + uintptr_t s = (uintptr_t)vs; + uintptr_t o = (d | s | n) & 7; + size_t i; + +#if !HOST_BIG_ENDIAN + o = 0; +#endif + switch (o) { + case 0: + memmove(vd, vs, n); + break; + + case 4: + if (d < s || d >= s + n) { + for (i = 0; i < n; i += 4) { + *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); + } + } else { + for (i = n; i > 0; ) { + i -= 4; + *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); + } + } + break; + + case 2: + case 6: + if (d < s || d >= s + n) { + for (i = 0; i < n; i += 2) { + *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); + } + } else { + for (i = n; i > 0; ) { + i -= 2; + *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); + } + } + break; + + default: + if (d < s || d >= s + n) { + for (i = 0; i < n; i++) { + *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); + } + } else { + for (i = n; i > 0; ) { + i -= 1; + *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); + } + } + break; + } +} + +/* Similarly for memset of 0. */ +static void swap_memzero(void *vd, size_t n) +{ + uintptr_t d = (uintptr_t)vd; + uintptr_t o = (d | n) & 7; + size_t i; + + /* Usually, the first bit of a predicate is set, so N is 0. */ + if (likely(n == 0)) { + return; + } + +#if !HOST_BIG_ENDIAN + o = 0; +#endif + switch (o) { + case 0: + memset(vd, 0, n); + break; + + case 4: + for (i = 0; i < n; i += 4) { + *(uint32_t *)H1_4(d + i) = 0; + } + break; + + case 2: + case 6: + for (i = 0; i < n; i += 2) { + *(uint16_t *)H1_2(d + i) = 0; + } + break; + + default: + for (i = 0; i < n; i++) { + *(uint8_t *)H1(d + i) = 0; + } + break; + } +} + +void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t opr_sz = simd_oprsz(desc); + size_t n_ofs = simd_data(desc); + size_t n_siz = opr_sz - n_ofs; + + if (vd != vm) { + swap_memmove(vd, vn + n_ofs, n_siz); + swap_memmove(vd + n_siz, vm, n_ofs); + } else if (vd != vn) { + swap_memmove(vd + n_siz, vd, n_ofs); + swap_memmove(vd, vn + n_ofs, n_siz); + } else { + /* vd == vn == vm. Need temp space. 
*/ + ARMVectorReg tmp; + swap_memmove(&tmp, vm, n_ofs); + swap_memmove(vd, vd + n_ofs, n_siz); + memcpy(vd + n_siz, &tmp, n_ofs); + } +} + +#define DO_INSR(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ + *(TYPE *)(vd + H(0)) = val; \ +} + +DO_INSR(sve_insr_b, uint8_t, H1) +DO_INSR(sve_insr_h, uint16_t, H1_2) +DO_INSR(sve_insr_s, uint32_t, H1_4) +DO_INSR(sve_insr_d, uint64_t, H1_8) + +#undef DO_INSR + +void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = bswap64(b); + *(uint64_t *)(vd + j) = bswap64(f); + } +} + +void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = hswap64(b); + *(uint64_t *)(vd + j) = hswap64(f); + } +} + +void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = rol64(b, 32); + *(uint64_t *)(vd + j) = rol64(f, 32); + } +} + +void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { + uint64_t f = *(uint64_t *)(vn + i); + uint64_t b = *(uint64_t *)(vn + j); + *(uint64_t *)(vd + i) = b; + *(uint64_t *)(vd + j) = f; + } +} + +typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); + +static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, + bool is_tbx, tb_impl_fn *fn) +{ + ARMVectorReg scratch; + uintptr_t oprsz = simd_oprsz(desc); + + if (unlikely(vd == vn)) { + vn = memcpy(&scratch, vn, oprsz); + } + + fn(vd, vn, NULL, vm, oprsz, is_tbx); +} + +static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, + uint32_t desc, bool is_tbx, tb_impl_fn *fn) +{ + ARMVectorReg scratch; + uintptr_t oprsz = simd_oprsz(desc); + + if (unlikely(vd == vn0)) { + vn0 = memcpy(&scratch, vn0, oprsz); + if (vd == vn1) { + vn1 = vn0; + } + } else if (unlikely(vd == vn1)) { + vn1 = memcpy(&scratch, vn1, oprsz); + } + + fn(vd, vn0, vn1, vm, oprsz, is_tbx); +} + +#define DO_TB(SUFF, TYPE, H) \ +static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ + void *vm, uintptr_t oprsz, bool is_tbx) \ +{ \ + TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ + uintptr_t i, nelem = oprsz / sizeof(TYPE); \ + for (i = 0; i < nelem; ++i) { \ + TYPE index = indexes[H1(i)], val = 0; \ + if (index < nelem) { \ + val = tbl0[H(index)]; \ + } else { \ + index -= nelem; \ + if (tbl1 && index < nelem) { \ + val = tbl1[H(index)]; \ + } else if (is_tbx) { \ + continue; \ + } \ + } \ + d[H(i)] = val; \ + } \ +} \ +void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ +} \ +void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ + void *vm, uint32_t desc) \ +{ \ + do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ +} \ +void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, 
uint32_t desc) \ +{ \ + do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ +} + +DO_TB(b, uint8_t, H1) +DO_TB(h, uint16_t, H2) +DO_TB(s, uint32_t, H4) +DO_TB(d, uint64_t, H8) + +#undef DO_TB + +#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd; \ + TYPES *n = vn; \ + ARMVectorReg tmp; \ + if (unlikely(vn - vd < opr_sz)) { \ + n = memcpy(&tmp, n, opr_sz / 2); \ + } \ + for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ + d[HD(i)] = n[HS(i)]; \ + } \ +} + +DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) +DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) +DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) + +DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) +DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) +DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) + +#undef DO_UNPK + +/* Mask of bits included in the even numbered predicates of width esz. + * We also use this for expand_bits/compress_bits, and so extend the + * same pattern out to 16-bit units. + */ +static const uint64_t even_bit_esz_masks[5] = { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0f0f0f0f0f0f0f0full, + 0x00ff00ff00ff00ffull, + 0x0000ffff0000ffffull, +}; + +/* Zero-extend units of 2**N bits to units of 2**(N+1) bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_shuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits. + */ +static uint64_t expand_bits(uint64_t x, int n) +{ + int i; + + x &= 0xffffffffu; + for (i = 4; i >= n; i--) { + int sh = 1 << i; + x = ((x << sh) | x) & even_bit_esz_masks[i]; + } + return x; +} + +/* Compress units of 2**(N+1) bits to units of 2**N bits. + * For N==0, this corresponds to the operation that in qemu/bitops.h + * we call half_unshuffle64; this algorithm is from Hacker's Delight, + * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. + */ +static uint64_t compress_bits(uint64_t x, int n) +{ + int i; + + for (i = n; i <= 4; i++) { + int sh = 1 << i; + x &= even_bit_esz_masks[i]; + x = (x >> sh) | x; + } + return x & 0xffffffffu; +} + +void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); + int esize = 1 << esz; + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + uint64_t mm = *(uint64_t *)vm; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + mm = extract64(mm, high * half, half); + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[0] = nn | (mm << esize); + } else { + ARMPredicateReg tmp; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. 
*/ + if (vd == vn) { + vn = memcpy(&tmp, vn, oprsz); + if (vd == vm) { + vm = vn; + } + } else if (vd == vm) { + vm = memcpy(&tmp, vm, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((oprsz & 7) == 0) { + uint32_t *n = vn, *m = vm; + high >>= 2; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[H4(high + i)]; + uint64_t mm = m[H4(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d[i] = nn | (mm << esize); + } + } else { + uint8_t *n = vn, *m = vm; + uint16_t *d16 = vd; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + uint16_t mm = m[H1(high + i)]; + + nn = expand_bits(nn, esz); + mm = expand_bits(mm, esz); + d16[H2(i)] = nn | (mm << esize); + } + } + } +} + +void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t l, h; + intptr_t i; + + if (oprsz <= 8) { + l = compress_bits(n[0] >> odd, esz); + h = compress_bits(m[0] >> odd, esz); + d[0] = l | (h << (4 * oprsz)); + } else { + ARMPredicateReg tmp_m; + intptr_t oprsz_16 = oprsz / 16; + + if ((vm - vd) < (uintptr_t)oprsz) { + m = memcpy(&tmp_m, vm, oprsz); + } + + for (i = 0; i < oprsz_16; i++) { + l = n[2 * i + 0]; + h = n[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[i] = l | (h << 32); + } + + /* + * For VL which is not a multiple of 512, the results from M do not + * align nicely with the uint64_t for D. Put the aligned results + * from M into TMP_M and then copy it into place afterward. + */ + if (oprsz & 15) { + int final_shift = (oprsz & 15) * 2; + + l = n[2 * i + 0]; + h = n[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[i] = l | (h << final_shift); + + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + tmp_m.p[i] = l | (h << 32); + } + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + tmp_m.p[i] = l | (h << final_shift); + + swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); + } else { + for (i = 0; i < oprsz_16; i++) { + l = m[2 * i + 0]; + h = m[2 * i + 1]; + l = compress_bits(l >> odd, esz); + h = compress_bits(h >> odd, esz); + d[oprsz_16 + i] = l | (h << 32); + } + } + } +} + +void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t mask; + int shr, shl; + intptr_t i; + + shl = 1 << esz; + shr = 0; + mask = even_bit_esz_masks[esz]; + if (odd) { + mask <<= shl; + shr = shl; + shl = 0; + } + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { + uint64_t nn = (n[i] & mask) >> shr; + uint64_t mm = (m[i] & mask) << shl; + d[i] = nn + mm; + } +} + +/* Reverse units of 2**N bits. 
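+ * (In reverse_bits_64 below, N == 3 reduces to a plain bswap64 and
+ * N == 0 is a full 64-bit bit reversal.)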
*/ +static uint64_t reverse_bits_64(uint64_t x, int n) +{ + int i, sh; + + x = bswap64(x); + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + uint64_t mask = even_bit_esz_masks[i]; + x = ((x & mask) << sh) | ((x >> sh) & mask); + } + return x; +} + +static uint8_t reverse_bits_8(uint8_t x, int n) +{ + static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; + int i, sh; + + for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { + x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); + } + return x; +} + +void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + intptr_t i, oprsz_2 = oprsz / 2; + + if (oprsz <= 8) { + uint64_t l = *(uint64_t *)vn; + l = reverse_bits_64(l << (64 - 8 * oprsz), esz); + *(uint64_t *)vd = l; + } else if ((oprsz & 15) == 0) { + for (i = 0; i < oprsz_2; i += 8) { + intptr_t ih = oprsz - 8 - i; + uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); + uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); + *(uint64_t *)(vd + i) = h; + *(uint64_t *)(vd + ih) = l; + } + } else { + for (i = 0; i < oprsz_2; i += 1) { + intptr_t il = H1(i); + intptr_t ih = H1(oprsz - 1 - i); + uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); + uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); + *(uint8_t *)(vd + il) = h; + *(uint8_t *)(vd + ih) = l; + } + } +} + +void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); + uint64_t *d = vd; + intptr_t i; + + if (oprsz <= 8) { + uint64_t nn = *(uint64_t *)vn; + int half = 4 * oprsz; + + nn = extract64(nn, high * half, half); + nn = expand_bits(nn, 0); + d[0] = nn; + } else { + ARMPredicateReg tmp_n; + + /* We produce output faster than we consume input. + Therefore we must be mindful of possible overlap. */ + if ((vn - vd) < (uintptr_t)oprsz) { + vn = memcpy(&tmp_n, vn, oprsz); + } + if (high) { + high = oprsz >> 1; + } + + if ((oprsz & 7) == 0) { + uint32_t *n = vn; + high >>= 2; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[H4(high + i)]; + d[i] = expand_bits(nn, 0); + } + } else { + uint16_t *d16 = vd; + uint8_t *n = vn; + + for (i = 0; i < oprsz / 2; i++) { + uint16_t nn = n[H1(high + i)]; + d16[H2(i)] = expand_bits(nn, 0); + } + } + } +} + +#define DO_ZIP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i, oprsz_2 = oprsz / 2; \ + ARMVectorReg tmp_n, tmp_m; \ + /* We produce output faster than we consume input. \ + Therefore we must be mindful of possible overlap. 
*/ \ + if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ + vn = memcpy(&tmp_n, vn, oprsz); \ + } \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz); \ + } \ + for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ + *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ + *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ + *(TYPE *)(vm + odd_ofs + H(i)); \ + } \ + if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ + memset(vd + oprsz - 16, 0, 16); \ + } \ +} + +DO_ZIP(sve_zip_b, uint8_t, H1) +DO_ZIP(sve_zip_h, uint16_t, H1_2) +DO_ZIP(sve_zip_s, uint32_t, H1_4) +DO_ZIP(sve_zip_d, uint64_t, H1_8) +DO_ZIP(sve2_zip_q, Int128, ) + +#define DO_UZP(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i, p; \ + ARMVectorReg tmp_m; \ + if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ + vm = memcpy(&tmp_m, vm, oprsz); \ + } \ + i = 0, p = odd_ofs; \ + do { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ + i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ + } while (p < oprsz); \ + p -= oprsz; \ + do { \ + *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ + i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ + } while (p < oprsz); \ + tcg_debug_assert(i == oprsz); \ +} + +DO_UZP(sve_uzp_b, uint8_t, H1) +DO_UZP(sve_uzp_h, uint16_t, H1_2) +DO_UZP(sve_uzp_s, uint32_t, H1_4) +DO_UZP(sve_uzp_d, uint64_t, H1_8) +DO_UZP(sve2_uzp_q, Int128, ) + +#define DO_TRN(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t odd_ofs = simd_data(desc); \ + intptr_t i; \ + for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ + TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ + TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ + *(TYPE *)(vd + H(i + 0)) = ae; \ + *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ + } \ + if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ + memset(vd + oprsz - 16, 0, 16); \ + } \ +} + +DO_TRN(sve_trn_b, uint8_t, H1) +DO_TRN(sve_trn_h, uint16_t, H1_2) +DO_TRN(sve_trn_s, uint32_t, H1_4) +DO_TRN(sve_trn_d, uint64_t, H1_8) +DO_TRN(sve2_trn_q, Int128, ) + +#undef DO_ZIP +#undef DO_UZP +#undef DO_TRN + +void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; + uint32_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { + d[H4(j)] = n[H4(i)]; + j++; + } + } + for (; j < opr_sz; j++) { + d[H4(j)] = 0; + } +} + +void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn; + uint8_t *pg = vg; + + for (i = j = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + d[j] = n[i]; + j++; + } + } + for (; j < opr_sz; j++) { + d[j] = 0; + } +} + +/* Similar to the ARM LastActiveElement pseudocode function, except the + * result is multiplied by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. 
+ */ +int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + + return last_active_element(vg, words, esz); +} + +void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) +{ + intptr_t opr_sz = simd_oprsz(desc) / 8; + int esz = simd_data(desc); + uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; + intptr_t i, first_i, last_i; + ARMVectorReg tmp; + + first_i = last_i = 0; + first_g = last_g = 0; + + /* Find the extent of the active elements within VG. */ + for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { + pg = *(uint64_t *)(vg + i) & mask; + if (pg) { + if (last_g == 0) { + last_g = pg; + last_i = i; + } + first_g = pg; + first_i = i; + } + } + + len = 0; + if (first_g != 0) { + first_i = first_i * 8 + ctz64(first_g); + last_i = last_i * 8 + 63 - clz64(last_g); + len = last_i - first_i + (1 << esz); + if (vd == vm) { + vm = memcpy(&tmp, vm, opr_sz * 8); + } + swap_memmove(vd, vn + first_i, len); + } + swap_memmove(vd + len, vm, opr_sz * 8 - len); +} + +void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_b(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_h(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + uint64_t pp = expand_pred_s(pg[H1(i)]); + d[i] = (nn & pp) | (mm & ~pp); + } +} + +void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + uint64_t nn = n[i], mm = m[i]; + d[i] = (pg[H1(i)] & 1 ? nn : mm); + } +} + +void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, + void *vg, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 16; + Int128 *d = vd, *n = vn, *m = vm; + uint16_t *pg = vg; + + for (i = 0; i < opr_sz; i += 1) { + d[i] = (pg[H2(i)] & 1 ? n : m)[i]; + } +} + +/* Two operand comparison controlled by a predicate. + * ??? It is very tempting to want to be able to expand this inline + * with x86 instructions, e.g. + * + * vcmpeqw zm, zn, %ymm0 + * vpmovmskb %ymm0, %eax + * and $0x5555, %eax + * and pg, %eax + * + * or even aarch64, e.g. + * + * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 + * cmeq v0.8h, zn, zm + * and v0.8h, v0.8h, mask + * addv h0, v0.8h + * and v0.8b, pg + * + * However, coming up with an abstraction that allows vector inputs and + * a scalar output, and also handles the byte-ordering of sub-uint64_t + * scalar outputs, is tricky. 
+ */ +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ + DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) + +DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) +DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) +DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) +DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) + +DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) +DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) +DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) +DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) + +DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) +DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) +DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) +DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) + +DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) +DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) +DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) +DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) + +DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) +DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) +DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) +DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) + +DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) +DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) +DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) +DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) + +#undef DO_CMP_PPZZ_B +#undef DO_CMP_PPZZ_H +#undef DO_CMP_PPZZ_S +#undef DO_CMP_PPZZ_D +#undef DO_CMP_PPZZ + +/* Similar, but the second source is "wide". 
*/ +#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + TYPEW mm = *(TYPEW *)(vm + i - 8); \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 7); \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ + DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) + +DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) +DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) +DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) + +DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) +DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) +DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) + +DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) +DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) +DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) + +DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) +DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) +DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) + +DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) +DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) +DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) + +DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) +DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) +DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) + +DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) +DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) +DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) + +DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) +DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) +DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) + +DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) +DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) +DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) + +DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) +DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) +DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) + +#undef DO_CMP_PPZW_B +#undef DO_CMP_PPZW_H +#undef DO_CMP_PPZW_S +#undef DO_CMP_PPZW + +/* Similar, but the second source is immediate. 
*/ +#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ +{ \ + intptr_t opr_sz = simd_oprsz(desc); \ + uint32_t flags = PREDTEST_INIT; \ + TYPE mm = simd_data(desc); \ + intptr_t i = opr_sz; \ + do { \ + uint64_t out = 0, pg; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= nn OP mm; \ + } while (i & 63); \ + pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ + out &= pg; \ + *(uint64_t *)(vd + (i >> 3)) = out; \ + flags = iter_predtest_bwd(out, pg, flags); \ + } while (i > 0); \ + return flags; \ +} + +#define DO_CMP_PPZI_B(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) +#define DO_CMP_PPZI_H(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) +#define DO_CMP_PPZI_S(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) +#define DO_CMP_PPZI_D(NAME, TYPE, OP) \ + DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) + +DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) +DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) +DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) +DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) + +DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) +DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) +DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) +DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) + +DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) +DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) +DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) +DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) + +DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) +DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) +DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) +DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) + +DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) +DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) +DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) +DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) + +DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) +DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) +DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) +DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) + +DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) +DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) +DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) +DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) + +DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) +DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) +DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) +DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) + +DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) +DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) +DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) +DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) + +DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) +DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) +DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) +DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) + +#undef DO_CMP_PPZI_B +#undef DO_CMP_PPZI_H +#undef DO_CMP_PPZI_S +#undef DO_CMP_PPZI_D +#undef DO_CMP_PPZI + +/* Similar to the ARM LastActive pseudocode function. */ +static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) +{ + intptr_t i; + + for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { + uint64_t pg = *(uint64_t *)(vg + i); + if (pg) { + return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; + } + } + return 0; +} + +/* Compute a mask into RETB that is true for all G, up to and including + * (if after) or excluding (if !after) the first G & N. + * Return true if BRK found. 
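+ *
+ * For example, with g = 0b1111 and n = 0b0100 the first active true
+ * element is bit 2, so AFTER yields b = 0b0111 and !AFTER yields
+ * b = 0b0011; once a break has been found, every later word produces
+ * b = 0.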
+ */ +static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, + bool brk, bool after) +{ + uint64_t b; + + if (brk) { + b = 0; + } else if ((g & n) == 0) { + /* For all G, no N are set; break not found. */ + b = g; + } else { + /* Break somewhere in N. Locate it. */ + b = g & n; /* guard true, pred true */ + b = b & -b; /* first such */ + if (after) { + b = b | (b - 1); /* break after same */ + } else { + b = b - 1; /* break before same */ + } + brk = true; + } + + *retb = b; + return brk; +} + +/* Compute a zeroing BRK. */ +static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_b & this_g; + } +} + +/* Likewise, but also compute flags. */ +static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + uint32_t flags = PREDTEST_INIT; + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_d, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_d = this_b & this_g; + flags = iter_predtest_fwd(this_d, this_g, flags); + } + return flags; +} + +/* Compute a merging BRK. */ +static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + bool brk = false; + intptr_t i; + + for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { + uint64_t this_b, this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = (this_b & this_g) | (d[i] & ~this_g); + } +} + +/* Likewise, but also compute flags. */ +static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, + intptr_t oprsz, bool after) +{ + uint32_t flags = PREDTEST_INIT; + bool brk = false; + intptr_t i; + + for (i = 0; i < oprsz / 8; ++i) { + uint64_t this_b, this_d = d[i], this_g = g[i]; + + brk = compute_brk(&this_b, n[i], this_g, brk, after); + d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); + flags = iter_predtest_fwd(this_d, this_g, flags); + } + return flags; +} + +static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) +{ + /* It is quicker to zero the whole predicate than loop on OPRSZ. + * The compiler should turn this into 4 64-bit integer stores. 
+ */ + memset(d, 0, sizeof(ARMPredicateReg)); + return PREDTEST_INIT; +} + +void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, true); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, true); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + compute_brk_z(vd, vm, vg, oprsz, false); + } else { + do_zero(vd, oprsz); + } +} + +uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, + uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + return compute_brks_z(vd, vm, vg, oprsz, false); + } else { + return do_zero(vd, oprsz); + } +} + +void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_z(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_z(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_z(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_z(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_m(vd, vn, vg, oprsz, true); +} + +uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_m(vd, vn, vg, oprsz, true); +} + +void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + compute_brk_m(vd, vn, vg, oprsz, false); +} + +uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + return compute_brks_m(vd, vn, vg, oprsz, false); +} + +void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (!last_active_pred(vn, vg, oprsz)) { + do_zero(vd, oprsz); + } +} + +/* As if PredTest(Ones(PL), D, esz). 
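+ * That is, compute the flags that a PTEST of D governed by an all-true
+ * predicate of the given element size would produce; used by BRKNS and
+ * the WHILE helpers below.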
*/ +static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, + uint64_t esz_mask) +{ + uint32_t flags = PREDTEST_INIT; + intptr_t i; + + for (i = 0; i < oprsz / 8; i++) { + flags = iter_predtest_fwd(d->p[i], esz_mask, flags); + } + if (oprsz & 7) { + uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); + flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); + } + return flags; +} + +uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + if (last_active_pred(vn, vg, oprsz)) { + return predtest_ones(vd, oprsz, -1); + } else { + return do_zero(vd, oprsz); + } +} + +uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) +{ + intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; + intptr_t i; + + for (i = 0; i < words; ++i) { + uint64_t t = n[i] & g[i] & mask; + sum += ctpop64(t); + } + return sum; +} + +uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + uint32_t flags; + intptr_t i; + + /* Begin with a zero predicate register. */ + flags = do_zero(d, oprsz); + if (count == 0) { + return flags; + } + + /* Set all of the requested bits. */ + for (i = 0; i < count / 64; ++i) { + d->p[i] = esz_mask; + } + if (count & 63) { + d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; + } + + return predtest_ones(d, oprsz, esz_mask); +} + +uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) +{ + intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); + intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); + uint64_t esz_mask = pred_esz_masks[esz]; + ARMPredicateReg *d = vd; + intptr_t i, invcount, oprbits; + uint64_t bits; + + if (count == 0) { + return do_zero(d, oprsz); + } + + oprbits = oprsz * 8; + tcg_debug_assert(count <= oprbits); + + bits = esz_mask; + if (oprbits & 63) { + bits &= MAKE_64BIT_MASK(0, oprbits & 63); + } + + invcount = oprbits - count; + for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { + d->p[i] = bits; + bits = esz_mask; + } + + d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); + + while (--i >= 0) { + d->p[i] = 0; + } + + return predtest_ones(d, oprsz, esz_mask); +} + +/* Recursive reduction on a function; + * C.f. the ARM ARM function ReducePredicated. + * + * While it would be possible to write this without the DATA temporary, + * it is much simpler to process the predicate register this way. + * The recursion is bounded to depth 7 (128 fp16 elements), so there's + * little to gain with a more complex non-recursive form. 
+ */ +#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ +static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ +{ \ + if (n == 1) { \ + return *data; \ + } else { \ + uintptr_t half = n / 2; \ + TYPE lo = NAME##_reduce(data, status, half); \ + TYPE hi = NAME##_reduce(data + half, status, half); \ + return TYPE##_##FUNC(lo, hi, status); \ + } \ +} \ +uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \ +{ \ + uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ + TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ + for (; i < maxsz; i += sizeof(TYPE)) { \ + *(TYPE *)((void *)data + i) = IDENT; \ + } \ + return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \ +} + +DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) +DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) +DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) + +/* Identity is floatN_default_nan, without the function call. */ +DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) +DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) +DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) +DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) +DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) +DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) +DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) + +DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) +DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) +DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) + +#undef DO_REDUCE + +uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc); + float16 result = nn; + + do { + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + float16 mm = *(float16 *)(vm + H1_2(i)); + result = float16_add(result, mm, status); + } + i += sizeof(float16), pg >>= sizeof(float16); + } while (i & 15); + } while (i < opr_sz); + + return result; +} + +uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc); + float32 result = nn; + + do { + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + float32 mm = *(float32 *)(vm + H1_2(i)); + result = float32_add(result, mm, status); + } + i += sizeof(float32), pg >>= sizeof(float32); + } while (i & 15); + } while (i < opr_sz); + + return result; +} + +uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; + uint64_t *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + nn = float64_add(nn, m[i], status); + } + } + + return nn; +} + +/* Fully general three-operand expander, controlled by a predicate, + * With the extra float_status parameter. 
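+ * The loop walks the vector from the high end downward, consuming one
+ * 64-bit word of the governing predicate (64 bytes of vector) per
+ * outer iteration; elements whose predicate bit is clear are left
+ * unchanged, i.e. merging predication.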
+ */ +#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) +DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) +DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) + +DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) +DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) +DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) + +DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) +DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) +DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) + +DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) +DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) +DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) + +DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) +DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) +DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) + +DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) +DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) +DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) + +DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) +DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) +DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) + +DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) + +static inline float16 abd_h(float16 a, float16 b, float_status *s) +{ + return float16_abs(float16_sub(a, b, s)); +} + +static inline float32 abd_s(float32 a, float32 b, float_status *s) +{ + return float32_abs(float32_sub(a, b, s)); +} + +static inline float64 abd_d(float64 a, float64 b, float_status *s) +{ + return float64_abs(float64_sub(a, b, s)); +} + +DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) +DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) +DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) + +static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) +{ + int b_int = MIN(MAX(b, INT_MIN), INT_MAX); + return float64_scalbn(a, b_int, s); +} + +DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) +DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) +DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) + +DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) +DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) +DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) + +#undef DO_ZPZZ_FP + +/* Three-operand expander, with one scalar operand, controlled by + * a predicate, with the extra float_status parameter. 
+ */ +#define DO_ZPZS_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + TYPE mm = scalar; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) +DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) +DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) + +DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) +DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) +DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) + +DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) +DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) +DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) + +static inline float16 subr_h(float16 a, float16 b, float_status *s) +{ + return float16_sub(b, a, s); +} + +static inline float32 subr_s(float32 a, float32 b, float_status *s) +{ + return float32_sub(b, a, s); +} + +static inline float64 subr_d(float64 a, float64 b, float_status *s) +{ + return float64_sub(b, a, s); +} + +DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) +DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) +DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) + +DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) +DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) +DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) + +DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) +DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) +DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) + +DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) +DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) +DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) + +DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) +DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) +DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) + +/* Fully general two-operand expander, controlled by a predicate, + * With the extra float_status parameter. + */ +#define DO_ZPZ_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore + * FZ16. When converting from fp16, this affects flushing input denormals; + * when converting to fp16, this affects flushing output denormals. 
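+ * Hence the helpers below save, clear and restore flush_inputs_to_zero
+ * (when converting from fp16) or flush_to_zero (when converting to
+ * fp16) around the conversion proper.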
+ */ +static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) +{ + bool save = get_flush_inputs_to_zero(fpst); + float32 ret; + + set_flush_inputs_to_zero(false, fpst); + ret = float16_to_float32(f, true, fpst); + set_flush_inputs_to_zero(save, fpst); + return ret; +} + +static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) +{ + bool save = get_flush_inputs_to_zero(fpst); + float64 ret; + + set_flush_inputs_to_zero(false, fpst); + ret = float16_to_float64(f, true, fpst); + set_flush_inputs_to_zero(save, fpst); + return ret; +} + +static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) +{ + bool save = get_flush_to_zero(fpst); + float16 ret; + + set_flush_to_zero(false, fpst); + ret = float32_to_float16(f, true, fpst); + set_flush_to_zero(save, fpst); + return ret; +} + +static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) +{ + bool save = get_flush_to_zero(fpst); + float16 ret; + + set_flush_to_zero(false, fpst); + ret = float64_to_float16(f, true, fpst); + set_flush_to_zero(save, fpst); + return ret; +} + +static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_int16_round_to_zero(f, s); +} + +static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_int64_round_to_zero(f, s); +} + +static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) +{ + if (float32_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float32_to_int64_round_to_zero(f, s); +} + +static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) +{ + if (float64_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float64_to_int64_round_to_zero(f, s); +} + +static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_uint16_round_to_zero(f, s); +} + +static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_uint64_round_to_zero(f, s); +} + +static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) +{ + if (float32_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float32_to_uint64_round_to_zero(f, s); +} + +static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) +{ + if (float64_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float64_to_uint64_round_to_zero(f, s); +} + +DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) +DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) +DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) +DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) +DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) +DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) +DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) + +DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) +DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) +DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) +DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 
+DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) +DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) + +DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) +DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) +DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) +DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) +DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) + +DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) +DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) +DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) + +DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) +DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) +DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) + +DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) +DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) +DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) + +DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) +DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) +DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) + +DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) +DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) +DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) +DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) +DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) +DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) +DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) + +DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) +DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) +DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) +DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) +DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) +DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) +DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) + +static int16_t do_float16_logb_as_int(float16 a, float_status *s) +{ + /* Extract frac to the top of the uint32_t. */ + uint32_t frac = (uint32_t)a << (16 + 6); + int16_t exp = extract32(a, 10, 5); + + if (unlikely(exp == 0)) { + if (frac != 0) { + if (!get_flush_inputs_to_zero(s)) { + /* denormal: bias - fractional_zeros */ + return -15 - clz32(frac); + } + /* flush to zero */ + float_raise(float_flag_input_denormal, s); + } + } else if (unlikely(exp == 0x1f)) { + if (frac == 0) { + return INT16_MAX; /* infinity */ + } + } else { + /* normal: exp - bias */ + return exp - 15; + } + /* nan or zero */ + float_raise(float_flag_invalid, s); + return INT16_MIN; +} + +static int32_t do_float32_logb_as_int(float32 a, float_status *s) +{ + /* Extract frac to the top of the uint32_t. 
*/ + uint32_t frac = a << 9; + int32_t exp = extract32(a, 23, 8); + + if (unlikely(exp == 0)) { + if (frac != 0) { + if (!get_flush_inputs_to_zero(s)) { + /* denormal: bias - fractional_zeros */ + return -127 - clz32(frac); + } + /* flush to zero */ + float_raise(float_flag_input_denormal, s); + } + } else if (unlikely(exp == 0xff)) { + if (frac == 0) { + return INT32_MAX; /* infinity */ + } + } else { + /* normal: exp - bias */ + return exp - 127; + } + /* nan or zero */ + float_raise(float_flag_invalid, s); + return INT32_MIN; +} + +static int64_t do_float64_logb_as_int(float64 a, float_status *s) +{ + /* Extract frac to the top of the uint64_t. */ + uint64_t frac = a << 12; + int64_t exp = extract64(a, 52, 11); + + if (unlikely(exp == 0)) { + if (frac != 0) { + if (!get_flush_inputs_to_zero(s)) { + /* denormal: bias - fractional_zeros */ + return -1023 - clz64(frac); + } + /* flush to zero */ + float_raise(float_flag_input_denormal, s); + } + } else if (unlikely(exp == 0x7ff)) { + if (frac == 0) { + return INT64_MAX; /* infinity */ + } + } else { + /* normal: exp - bias */ + return exp - 1023; + } + /* nan or zero */ + float_raise(float_flag_invalid, s); + return INT64_MIN; +} + +DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) +DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) +DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) + +#undef DO_ZPZ_FP + +static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint16_t neg1, uint16_t neg3) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 2; + if (likely((pg >> (i & 63)) & 1)) { + float16 e1, e2, e3, r; + + e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; + e2 = *(uint16_t *)(vm + H1_2(i)); + e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; + r = float16_muladd(e1, e2, e3, 0, status); + *(uint16_t *)(vd + H1_2(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); +} + +void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); +} + +void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); +} + +static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint32_t neg1, uint32_t neg3) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 4; + if (likely((pg >> (i & 63)) & 1)) { + float32 e1, e2, e3, r; + + e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; + e2 = *(uint32_t *)(vm + H1_4(i)); + e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; + r = float32_muladd(e1, e2, e3, 0, status); + *(uint32_t *)(vd + H1_4(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + 
void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); +} + +void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); +} + +void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); +} + +static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, + float_status *status, uint32_t desc, + uint64_t neg1, uint64_t neg3) +{ + intptr_t i = simd_oprsz(desc); + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 8; + if (likely((pg >> (i & 63)) & 1)) { + float64 e1, e2, e3, r; + + e1 = *(uint64_t *)(vn + i) ^ neg1; + e2 = *(uint64_t *)(vm + i); + e3 = *(uint64_t *)(va + i) ^ neg3; + r = float64_muladd(e1, e2, e3, 0, status); + *(uint64_t *)(vd + i) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); +} + +void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); +} + +void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); +} + +/* Two operand floating-point comparison controlled by a predicate. + * Unlike the integer version, we are not allowed to optimistically + * compare operands, since the comparison may have side effects wrt + * the FPSR. 
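+ * For example, a signalling compare against a NaN in an inactive
+ * element would set the cumulative Invalid Operation flag, which must
+ * not happen for inactive elements.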
+ */ +#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ + uint64_t *d = vd, *g = vg; \ + do { \ + uint64_t out = 0, pg = g[j]; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= OP(TYPE, nn, mm, status); \ + } \ + } while (i & 63); \ + d[j--] = out; \ + } while (i > 0); \ +} + +#define DO_FPCMP_PPZZ_H(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) +#define DO_FPCMP_PPZZ_S(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) +#define DO_FPCMP_PPZZ_D(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) + +#define DO_FPCMP_PPZZ_ALL(NAME, OP) \ + DO_FPCMP_PPZZ_H(NAME, OP) \ + DO_FPCMP_PPZZ_S(NAME, OP) \ + DO_FPCMP_PPZZ_D(NAME, OP) + +#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 +#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 +#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 +#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 +#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 +#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 +#define DO_FCMUO(TYPE, X, Y, ST) \ + TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered +#define DO_FACGE(TYPE, X, Y, ST) \ + TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 +#define DO_FACGT(TYPE, X, Y, ST) \ + TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 + +DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) +DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) +DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) +DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) +DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) +DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) +DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) + +#undef DO_FPCMP_PPZZ_ALL +#undef DO_FPCMP_PPZZ_D +#undef DO_FPCMP_PPZZ_S +#undef DO_FPCMP_PPZZ_H +#undef DO_FPCMP_PPZZ + +/* One operand floating-point comparison against zero, controlled + * by a predicate. + */ +#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ + uint64_t *d = vd, *g = vg; \ + do { \ + uint64_t out = 0, pg = g[j]; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + if ((pg >> (i & 63)) & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= OP(TYPE, nn, 0, status); \ + } \ + } while (i & 63); \ + d[j--] = out; \ + } while (i > 0); \ +} + +#define DO_FPCMP_PPZ0_H(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) +#define DO_FPCMP_PPZ0_S(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) +#define DO_FPCMP_PPZ0_D(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) + +#define DO_FPCMP_PPZ0_ALL(NAME, OP) \ + DO_FPCMP_PPZ0_H(NAME, OP) \ + DO_FPCMP_PPZ0_S(NAME, OP) \ + DO_FPCMP_PPZ0_D(NAME, OP) + +DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) +DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) +DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) +DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) +DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) +DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) + +/* FP Trig Multiply-Add. 
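+ * The first eight entries of each coefficient table below are the
+ * Taylor series terms for sin(x) (1, -1/6, 1/120, ...), the second
+ * eight those for cos(x) (1, -1/2, 1/24, ...); a negative element in
+ * the second source operand selects the second half and contributes
+ * its absolute value to the multiply.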
*/ + +void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float16 coeff[16] = { + 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); + intptr_t x = simd_data(desc); + float16 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float16 mm = m[i]; + intptr_t xx = x; + if (float16_is_neg(mm)) { + mm = float16_abs(mm); + xx += 8; + } + d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float32 coeff[16] = { + 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, + 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, + 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, + 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); + intptr_t x = simd_data(desc); + float32 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float32 mm = m[i]; + intptr_t xx = x; + if (float32_is_neg(mm)) { + mm = float32_abs(mm); + xx += 8; + } + d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float64 coeff[16] = { + 0x3ff0000000000000ull, 0xbfc5555555555543ull, + 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, + 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, + 0x3de5d8408868552full, 0x0000000000000000ull, + 0x3ff0000000000000ull, 0xbfe0000000000000ull, + 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, + 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, + 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); + intptr_t x = simd_data(desc); + float64 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float64 mm = m[i]; + intptr_t xx = x; + if (float64_is_neg(mm)) { + mm = float64_abs(mm); + xx += 8; + } + d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +/* + * FP Complex Add + */ + +void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float16 neg_imag = float16_set_sign(0, simd_data(desc)); + float16 neg_real = float16_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float16 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float16); + i -= 2 * sizeof(float16); + + e0 = *(float16 *)(vn + H1_2(i)); + e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float16 *)(vn + H1_2(j)); + e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float32 neg_imag = float32_set_sign(0, simd_data(desc)); + float32 neg_real = float32_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float32 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. 
*/ + j = i - sizeof(float32); + i -= 2 * sizeof(float32); + + e0 = *(float32 *)(vn + H1_2(i)); + e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float32 *)(vn + H1_2(j)); + e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float64 neg_imag = float64_set_sign(0, simd_data(desc)); + float64 neg_real = float64_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float64 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float64); + i -= 2 * sizeof(float64); + + e0 = *(float64 *)(vn + H1_2(i)); + e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float64 *)(vn + H1_2(j)); + e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +/* + * FP Complex Multiply + */ + +void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rot = simd_data(desc); + bool flip = rot & 1; + float16 neg_imag, neg_real; + uint64_t *g = vg; + + neg_imag = float16_set_sign(0, (rot & 2) != 0); + neg_real = float16_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float16 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float16); + i -= 2 * sizeof(float16); + + nr = *(float16 *)(vn + H1_2(i)); + ni = *(float16 *)(vn + H1_2(j)); + mr = *(float16 *)(vm + H1_2(i)); + mi = *(float16 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float16 *)(va + H1_2(i)); + d = float16_muladd(e2, e1, d, 0, status); + *(float16 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float16 *)(va + H1_2(j)); + d = float16_muladd(e4, e3, d, 0, status); + *(float16 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rot = simd_data(desc); + bool flip = rot & 1; + float32 neg_imag, neg_real; + uint64_t *g = vg; + + neg_imag = float32_set_sign(0, (rot & 2) != 0); + neg_real = float32_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float32 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float32); + i -= 2 * sizeof(float32); + + nr = *(float32 *)(vn + H1_2(i)); + ni = *(float32 *)(vn + H1_2(j)); + mr = *(float32 *)(vm + H1_2(i)); + mi = *(float32 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? 
mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float32 *)(va + H1_2(i)); + d = float32_muladd(e2, e1, d, 0, status); + *(float32 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float32 *)(va + H1_2(j)); + d = float32_muladd(e4, e3, d, 0, status); + *(float32 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, void *status, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rot = simd_data(desc); + bool flip = rot & 1; + float64 neg_imag, neg_real; + uint64_t *g = vg; + + neg_imag = float64_set_sign(0, (rot & 2) != 0); + neg_real = float64_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float64 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float64); + i -= 2 * sizeof(float64); + + nr = *(float64 *)(vn + H1_2(i)); + ni = *(float64 *)(vn + H1_2(j)); + mr = *(float64 *)(vm + H1_2(i)); + mi = *(float64 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float64 *)(va + H1_2(i)); + d = float64_muladd(e2, e1, d, 0, status); + *(float64 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float64 *)(va + H1_2(j)); + d = float64_muladd(e4, e3, d, 0, status); + *(float64 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +/* + * Load contiguous data, protected by a governing predicate. + */ + +/* + * Skip through a sequence of inactive elements in the guarding predicate @vg, + * beginning at @reg_off bounded by @reg_max. Return the offset of the active + * element >= @reg_off, or @reg_max if there were no active elements at all. + */ +static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, + intptr_t reg_max, int esz) +{ + uint64_t pg_mask = pred_esz_masks[esz]; + uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); + + /* In normal usage, the first element is active. */ + if (likely(pg & 1)) { + return reg_off; + } + + if (pg == 0) { + reg_off &= -64; + do { + reg_off += 64; + if (unlikely(reg_off >= reg_max)) { + /* The entire predicate was false. */ + return reg_max; + } + pg = vg[reg_off >> 6] & pg_mask; + } while (pg == 0); + } + reg_off += ctz64(pg); + + /* We should never see an out of range predicate bit set. */ + tcg_debug_assert(reg_off < reg_max); + return reg_off; +} + +/* + * Resolve the guest virtual address to info->host and info->flags. + * If @nofault, return false if the page is invalid, otherwise + * exit via page fault exception. + */ + +bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, + target_ulong addr, int mem_off, MMUAccessType access_type, + int mmu_idx, uintptr_t retaddr) +{ + int flags; + + addr += mem_off; + + /* + * User-only currently always issues with TBI. See the comment + * above useronly_clean_ptr. Usually we clean this top byte away + * during translation, but we can't do that for e.g. vector + imm + * addressing modes. + * + * We currently always enable TBI for user-only, and do not provide + * a way to turn it off. So clean the pointer unconditionally here, + * rather than look it up here, or pass it down from above. 
+ */ + addr = useronly_clean_ptr(addr); + +#ifdef CONFIG_USER_ONLY + flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, + &info->host, retaddr); +#else + CPUTLBEntryFull *full; + flags = probe_access_full(env, addr, access_type, mmu_idx, nofault, + &info->host, &full, retaddr); +#endif + info->flags = flags; + + if (flags & TLB_INVALID_MASK) { + g_assert(nofault); + return false; + } + +#ifdef CONFIG_USER_ONLY + memset(&info->attrs, 0, sizeof(info->attrs)); + /* Require both ANON and MTE; see allocation_tag_mem(). */ + info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); +#else + info->attrs = full->attrs; + info->tagged = full->pte_attrs == 0xf0; +#endif + + /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ + info->host -= mem_off; + return true; +} + +/* + * Find first active element on each page, and a loose bound for the + * final element on each page. Identify any single element that spans + * the page boundary. Return true if there are any active elements. + */ +bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, + intptr_t reg_max, int esz, int msize) +{ + const int esize = 1 << esz; + const uint64_t pg_mask = pred_esz_masks[esz]; + intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; + intptr_t mem_off_last, mem_off_split; + intptr_t page_split, elt_split; + intptr_t i; + + /* Set all of the element indices to -1, and the TLB data to 0. */ + memset(info, -1, offsetof(SVEContLdSt, page)); + memset(info->page, 0, sizeof(info->page)); + + /* Gross scan over the entire predicate to find bounds. */ + i = 0; + do { + uint64_t pg = vg[i] & pg_mask; + if (pg) { + reg_off_last = i * 64 + 63 - clz64(pg); + if (reg_off_first < 0) { + reg_off_first = i * 64 + ctz64(pg); + } + } + } while (++i * 64 < reg_max); + + if (unlikely(reg_off_first < 0)) { + /* No active elements, no pages touched. */ + return false; + } + tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); + + info->reg_off_first[0] = reg_off_first; + info->mem_off_first[0] = (reg_off_first >> esz) * msize; + mem_off_last = (reg_off_last >> esz) * msize; + + page_split = -(addr | TARGET_PAGE_MASK); + if (likely(mem_off_last + msize <= page_split)) { + /* The entire operation fits within a single page. */ + info->reg_off_last[0] = reg_off_last; + return true; + } + + info->page_split = page_split; + elt_split = page_split / msize; + reg_off_split = elt_split << esz; + mem_off_split = elt_split * msize; + + /* + * This is the last full element on the first page, but it is not + * necessarily active. If there is no full element, i.e. the first + * active element is the one that's split, this value remains -1. + * It is useful as iteration bounds. + */ + if (elt_split != 0) { + info->reg_off_last[0] = reg_off_split - esize; + } + + /* Determine if an unaligned element spans the pages. */ + if (page_split % msize != 0) { + /* It is helpful to know if the split element is active. */ + if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { + info->reg_off_split = reg_off_split; + info->mem_off_split = mem_off_split; + + if (reg_off_split == reg_off_last) { + /* The page crossing element is last. */ + return true; + } + } + reg_off_split += esize; + mem_off_split += msize; + } + + /* + * We do want the first active element on the second page, because + * this may affect the address reported in an exception. 
+ */ + reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); + tcg_debug_assert(reg_off_split <= reg_off_last); + info->reg_off_first[1] = reg_off_split; + info->mem_off_first[1] = (reg_off_split >> esz) * msize; + info->reg_off_last[1] = reg_off_last; + return true; +} + +/* + * Resolve the guest virtual addresses to info->page[]. + * Control the generation of page faults with @fault. Return false if + * there is no work to do, which can only happen with @fault == FAULT_NO. + */ +bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, + CPUARMState *env, target_ulong addr, + MMUAccessType access_type, uintptr_t retaddr) +{ + int mmu_idx = cpu_mmu_index(env, false); + int mem_off = info->mem_off_first[0]; + bool nofault = fault == FAULT_NO; + bool have_work = true; + + if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, + access_type, mmu_idx, retaddr)) { + /* No work to be done. */ + return false; + } + + if (likely(info->page_split < 0)) { + /* The entire operation was on the one page. */ + return true; + } + + /* + * If the second page is invalid, then we want the fault address to be + * the first byte on that page which is accessed. + */ + if (info->mem_off_split >= 0) { + /* + * There is an element split across the pages. The fault address + * should be the first byte of the second page. + */ + mem_off = info->page_split; + /* + * If the split element is also the first active element + * of the vector, then: For first-fault we should continue + * to generate faults for the second page. For no-fault, + * we have work only if the second page is valid. + */ + if (info->mem_off_first[0] < info->mem_off_split) { + nofault = FAULT_FIRST; + have_work = false; + } + } else { + /* + * There is no element split across the pages. The fault address + * should be the first active element on the second page. + */ + mem_off = info->mem_off_first[1]; + /* + * There must have been one active element on the first page, + * so we're out of first-fault territory. + */ + nofault = fault != FAULT_ALL; + } + + have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, + access_type, mmu_idx, retaddr); + return have_work; +} + +#ifndef CONFIG_USER_ONLY +void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, + int esize, int msize, int wp_access, + uintptr_t retaddr) +{ + intptr_t mem_off, reg_off, reg_last; + int flags0 = info->page[0].flags; + int flags1 = info->page[1].flags; + + if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { + return; + } + + /* Indicate that watchpoints are handled. 
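+ * Clearing TLB_WATCHPOINT here means the callers need not fall back
+ * to their slow path merely because a watchpoint flag was set; any
+ * hits are raised by the cpu_check_watchpoint() calls below.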
*/ + info->page[0].flags = flags0 & ~TLB_WATCHPOINT; + info->page[1].flags = flags1 & ~TLB_WATCHPOINT; + + if (flags0 & TLB_WATCHPOINT) { + mem_off = info->mem_off_first[0]; + reg_off = info->reg_off_first[0]; + reg_last = info->reg_off_last[0]; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + cpu_check_watchpoint(env_cpu(env), addr + mem_off, + msize, info->page[0].attrs, + wp_access, retaddr); + } + reg_off += esize; + mem_off += msize; + } while (reg_off <= reg_last && (reg_off & 63)); + } + } + + mem_off = info->mem_off_split; + if (mem_off >= 0) { + cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, + info->page[0].attrs, wp_access, retaddr); + } + + mem_off = info->mem_off_first[1]; + if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { + reg_off = info->reg_off_first[1]; + reg_last = info->reg_off_last[1]; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + cpu_check_watchpoint(env_cpu(env), addr + mem_off, + msize, info->page[1].attrs, + wp_access, retaddr); + } + reg_off += esize; + mem_off += msize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} +#endif + +void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, int esize, + int msize, uint32_t mtedesc, uintptr_t ra) +{ + intptr_t mem_off, reg_off, reg_last; + + /* Process the page only if MemAttr == Tagged. */ + if (info->page[0].tagged) { + mem_off = info->mem_off_first[0]; + reg_off = info->reg_off_first[0]; + reg_last = info->reg_off_split; + if (reg_last < 0) { + reg_last = info->reg_off_last[0]; + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + mte_check(env, mtedesc, addr, ra); + } + reg_off += esize; + mem_off += msize; + } while (reg_off <= reg_last && (reg_off & 63)); + } while (reg_off <= reg_last); + } + + mem_off = info->mem_off_first[1]; + if (mem_off >= 0 && info->page[1].tagged) { + reg_off = info->reg_off_first[1]; + reg_last = info->reg_off_last[1]; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + mte_check(env, mtedesc, addr, ra); + } + reg_off += esize; + mem_off += msize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +/* + * Common helper for all contiguous 1,2,3,4-register predicated stores. + */ +static inline QEMU_ALWAYS_INLINE +void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const int N, uint32_t mtedesc, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned rd = simd_data(desc); + const intptr_t reg_max = simd_oprsz(desc); + intptr_t reg_off, reg_last, mem_off; + SVEContLdSt info; + void *host; + int flags, i; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { + /* The entire predicate was false; no load occurs. */ + for (i = 0; i < N; ++i) { + memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); + } + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, + BP_MEM_READ, retaddr); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. 
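+ * A zero mtedesc therefore lets us skip the tag checks entirely.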
+ */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, + mtedesc, retaddr); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. Perform the load + * into scratch memory to preserve register state until the end. + */ + ARMVectorReg scratch[4] = { }; + + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + tlb_fn(env, &scratch[i], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + + for (i = 0; i < N; ++i) { + memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); + } + return; +#endif + } + + /* The entire operation is in RAM, on valid pages. */ + + for (i = 0; i < N; ++i) { + memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); + } + + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. + * But we know this is RAM and cannot trap. + */ + mem_off = info.mem_off_split; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_split; + for (i = 0; i < N; ++i) { + tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + + mem_off = info.mem_off_first[1]; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_first[1]; + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t ra, + const int esz, const int msz, const int N, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
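+ * If TBI is not enabled for this address, or the tag lies in the
+ * range that TCMA leaves unchecked, no tag check can fire for this
+ * access, so drop mtedesc and use the non-MTE path.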
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); +} + +#define DO_LD1_1(NAME, ESZ) \ +void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ + sve_##NAME##_host, sve_##NAME##_tlb); \ +} \ +void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ + sve_##NAME##_host, sve_##NAME##_tlb); \ +} + +#define DO_LD1_2(NAME, ESZ, MSZ) \ +void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} \ +void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +} + +DO_LD1_1(ld1bb, MO_8) +DO_LD1_1(ld1bhu, MO_16) +DO_LD1_1(ld1bhs, MO_16) +DO_LD1_1(ld1bsu, MO_32) +DO_LD1_1(ld1bss, MO_32) +DO_LD1_1(ld1bdu, MO_64) +DO_LD1_1(ld1bds, MO_64) + +DO_LD1_2(ld1hh, MO_16, MO_16) +DO_LD1_2(ld1hsu, MO_32, MO_16) +DO_LD1_2(ld1hss, MO_32, MO_16) +DO_LD1_2(ld1hdu, MO_64, MO_16) +DO_LD1_2(ld1hds, MO_64, MO_16) + +DO_LD1_2(ld1ss, MO_32, MO_32) +DO_LD1_2(ld1sdu, MO_64, MO_32) +DO_LD1_2(ld1sds, MO_64, MO_32) + +DO_LD1_2(ld1dd, MO_64, MO_64) + +#undef DO_LD1_1 +#undef DO_LD1_2 + +#define DO_LDN_1(N) \ +void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ + sve_ld1bb_host, sve_ld1bb_tlb); \ +} \ +void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ + sve_ld1bb_host, sve_ld1bb_tlb); \ +} + +#define DO_LDN_2(N, SUFF, ESZ) \ +void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ + sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ + sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ + sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ + sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ +} + +DO_LDN_1(2) +DO_LDN_1(3) +DO_LDN_1(4) + +DO_LDN_2(2, hh, MO_16) +DO_LDN_2(3, 
hh, MO_16) +DO_LDN_2(4, hh, MO_16) + +DO_LDN_2(2, ss, MO_32) +DO_LDN_2(3, ss, MO_32) +DO_LDN_2(4, ss, MO_32) + +DO_LDN_2(2, dd, MO_64) +DO_LDN_2(3, dd, MO_64) +DO_LDN_2(4, dd, MO_64) + +#undef DO_LDN_1 +#undef DO_LDN_2 + +/* + * Load contiguous data, first-fault and no-fault. + * + * For user-only, one could argue that we should hold the mmap_lock during + * the operation so that there is no race between page_check_range and the + * load operation. However, unmapping pages out from under a running thread + * is extraordinarily unlikely. This theoretical race condition also affects + * linux-user/ in its get_user/put_user macros. + * + * TODO: Construct some helpers, written in assembly, that interact with + * host_signal_handler to produce memory ops which can properly report errors + * without racing. + */ + +/* Fault on byte I. All bits in FFR from I are cleared. The vector + * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE + * option, which leaves subsequent data unchanged. + */ +static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) +{ + uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; + + if (i & 63) { + ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); + i = ROUND_UP(i, 64); + } + for (; i < oprsz; i += 64) { + ffr[i / 64] = 0; + } +} + +/* + * Common helper for all contiguous no-fault and first-fault loads. + */ +static inline QEMU_ALWAYS_INLINE +void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, + uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, + const int esz, const int msz, const SVEContFault fault, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned rd = simd_data(desc); + void *vd = &env->vfp.zregs[rd]; + const intptr_t reg_max = simd_oprsz(desc); + intptr_t reg_off, mem_off, reg_last; + SVEContLdSt info; + int flags; + void *host; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { + /* The entire predicate was false; no load occurs. */ + memset(vd, 0, reg_max); + return; + } + reg_off = info.reg_off_first[0]; + + /* Probe the page(s). */ + if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { + /* Fault on first element. */ + tcg_debug_assert(fault == FAULT_NO); + memset(vd, 0, reg_max); + goto do_fault; + } + + mem_off = info.mem_off_first[0]; + flags = info.page[0].flags; + + /* + * Disable MTE checking if the Tagged bit is not set. Since TBI must + * be set within MTEDESC for MTE, !mtedesc => !mte_active. + */ + if (!info.page[0].tagged) { + mtedesc = 0; + } + + if (fault == FAULT_FIRST) { + /* Trapping mte check for the first-fault element. */ + if (mtedesc) { + mte_check(env, mtedesc, addr + mem_off, retaddr); + } + + /* + * Special handling of the first active element, + * if it crosses a page boundary or is MMIO. + */ + bool is_split = mem_off == info.mem_off_split; + if (unlikely(flags != 0) || unlikely(is_split)) { + /* + * Use the slow path for cross-page handling. + * Might trap for MMIO or watchpoints. + */ + tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); + + /* After any fault, zero the other elements. */ + swap_memzero(vd, reg_off); + reg_off += 1 << esz; + mem_off += 1 << msz; + swap_memzero(vd + reg_off, reg_max - reg_off); + + if (is_split) { + goto second_page; + } + } else { + memset(vd, 0, reg_max); + } + } else { + memset(vd, 0, reg_max); + if (unlikely(mem_off == info.mem_off_split)) { + /* The first active element crosses a page boundary. 
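+ * In the no-fault case nothing may trap: MMIO, watchpoints and MTE
+ * are all probed first, and any problem is reported through FFR
+ * (via do_fault) rather than by raising an exception.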
*/ + flags |= info.page[1].flags; + if (unlikely(flags & TLB_MMIO)) { + /* Some page is MMIO, see below. */ + goto do_fault; + } + if (unlikely(flags & TLB_WATCHPOINT) && + (cpu_watchpoint_address_matches + (env_cpu(env), addr + mem_off, 1 << msz) + & BP_MEM_READ)) { + /* Watchpoint hit, see below. */ + goto do_fault; + } + if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { + goto do_fault; + } + /* + * Use the slow path for cross-page handling. + * This is RAM, without a watchpoint, and will not trap. + */ + tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); + goto second_page; + } + } + + /* + * From this point on, all memory operations are MemSingleNF. + * + * Per the MemSingleNF pseudocode, a no-fault load from Device memory + * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. + * + * Unfortuately we do not have access to the memory attributes from the + * PTE to tell Device memory from Normal memory. So we make a mostly + * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. + * This gives the right answer for the common cases of "Normal memory, + * backed by host RAM" and "Device memory, backed by MMIO". + * The architecture allows us to suppress an NF load and return + * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner + * case of "Normal memory, backed by MMIO" is permitted. The case we + * get wrong is "Device memory, backed by host RAM", for which we + * should return (UNKNOWN, FAULT) for but do not. + * + * Similarly, CPU_BP breakpoints would raise exceptions, and so + * return (UNKNOWN, FAULT). For simplicity, we consider gdb and + * architectural breakpoints the same. + */ + if (unlikely(flags & TLB_MMIO)) { + goto do_fault; + } + + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + do { + uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); + do { + if ((pg >> (reg_off & 63)) & 1) { + if (unlikely(flags & TLB_WATCHPOINT) && + (cpu_watchpoint_address_matches + (env_cpu(env), addr + mem_off, 1 << msz) + & BP_MEM_READ)) { + goto do_fault; + } + if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { + goto do_fault; + } + host_fn(vd, reg_off, host + mem_off); + } + reg_off += 1 << esz; + mem_off += 1 << msz; + } while (reg_off <= reg_last && (reg_off & 63)); + } while (reg_off <= reg_last); + + /* + * MemSingleNF is allowed to fail for any reason. We have special + * code above to handle the first element crossing a page boundary. + * As an implementation choice, decline to handle a cross-page element + * in any other position. + */ + reg_off = info.reg_off_split; + if (reg_off >= 0) { + goto do_fault; + } + + second_page: + reg_off = info.reg_off_first[1]; + if (likely(reg_off < 0)) { + /* No active elements on the second page. All done. */ + return; + } + + /* + * MemSingleNF is allowed to fail for any reason. As an implementation + * choice, decline to handle elements on the second page. This should + * be low frequency as the guest walks through memory -- the next + * iteration of the guest's loop should be aligned on the page boundary, + * and then all following iterations will stay aligned. 
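+     * (In the usual first-fault loop the guest re-reads FFR and restarts
+     * from the first element not marked as loaded, so declining here only
+     * shortens a single iteration of that loop.)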
+ */ + + do_fault: + record_fault(env, reg_off, reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const SVEContFault fault, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. */ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, + esz, msz, fault, host_fn, tlb_fn); +} + +#define DO_LDFF1_LDNF1_1(PART, ESZ) \ +void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} + +#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ +void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ 
+void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} + +DO_LDFF1_LDNF1_1(bb, MO_8) +DO_LDFF1_LDNF1_1(bhu, MO_16) +DO_LDFF1_LDNF1_1(bhs, MO_16) +DO_LDFF1_LDNF1_1(bsu, MO_32) +DO_LDFF1_LDNF1_1(bss, MO_32) +DO_LDFF1_LDNF1_1(bdu, MO_64) +DO_LDFF1_LDNF1_1(bds, MO_64) + +DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) +DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) +DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) +DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) +DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) + +DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) +DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) +DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) + +DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) + +#undef DO_LDFF1_LDNF1_1 +#undef DO_LDFF1_LDNF1_2 + +/* + * Common helper for all contiguous 1,2,3,4-register predicated stores. + */ + +static inline QEMU_ALWAYS_INLINE +void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const int N, uint32_t mtedesc, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const unsigned rd = simd_data(desc); + const intptr_t reg_max = simd_oprsz(desc); + intptr_t reg_off, reg_last, mem_off; + SVEContLdSt info; + void *host; + int i, flags; + + /* Find the active elements. */ + if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { + /* The entire predicate was false; no store occurs. */ + return; + } + + /* Probe the page(s). Exit with exception for any invalid page. */ + sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); + + /* Handle watchpoints for all active elements. */ + sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, + BP_MEM_WRITE, retaddr); + + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mtedesc) { + sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, + mtedesc, retaddr); + } + + flags = info.page[0].flags | info.page[1].flags; + if (unlikely(flags != 0)) { +#ifdef CONFIG_USER_ONLY + g_assert_not_reached(); +#else + /* + * At least one page includes MMIO. + * Any bus operation can fail with cpu_transaction_failed, + * which for ARM will raise SyncExternal. We cannot avoid + * this fault and will leave with the store incomplete. + */ + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[1]; + if (reg_last < 0) { + reg_last = info.reg_off_split; + if (reg_last < 0) { + reg_last = info.reg_off_last[0]; + } + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + return; +#endif + } + + mem_off = info.mem_off_first[0]; + reg_off = info.reg_off_first[0]; + reg_last = info.reg_off_last[0]; + host = info.page[0].host; + + while (reg_off <= reg_last) { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off <= reg_last && (reg_off & 63)); + } + + /* + * Use the slow path to manage the cross-page misalignment. 
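+     * (If either page had MMIO or any other TLB flag set, the full
+     * slow-path loop above has already handled the store and returned.)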
+ * But we know this is RAM and cannot trap. + */ + mem_off = info.mem_off_split; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_split; + for (i = 0; i < N; ++i) { + tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, + addr + mem_off + (i << msz), retaddr); + } + } + + mem_off = info.mem_off_first[1]; + if (unlikely(mem_off >= 0)) { + reg_off = info.reg_off_first[1]; + reg_last = info.reg_off_last[1]; + host = info.page[1].host; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + for (i = 0; i < N; ++i) { + host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, + host + mem_off + (i << msz)); + } + } + reg_off += 1 << esz; + mem_off += N << msz; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +static inline QEMU_ALWAYS_INLINE +void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t ra, + const int esz, const int msz, const int N, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. */ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); +} + +#define DO_STN_1(N, NAME, ESZ) \ +void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ + sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ + sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ +} + +#define DO_STN_2(N, NAME, ESZ, MSZ) \ +void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ + sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ + sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ + sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ + sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ +} + +DO_STN_1(1, bb, MO_8) +DO_STN_1(1, bh, MO_16) +DO_STN_1(1, bs, MO_32) +DO_STN_1(1, bd, MO_64) +DO_STN_1(2, bb, MO_8) +DO_STN_1(3, bb, MO_8) +DO_STN_1(4, bb, MO_8) + +DO_STN_2(1, hh, MO_16, MO_16) +DO_STN_2(1, hs, MO_32, MO_16) +DO_STN_2(1, hd, MO_64, MO_16) +DO_STN_2(2, hh, MO_16, MO_16) +DO_STN_2(3, hh, MO_16, MO_16) +DO_STN_2(4, hh, MO_16, MO_16) + +DO_STN_2(1, ss, MO_32, MO_32) +DO_STN_2(1, sd, MO_64, MO_32) +DO_STN_2(2, ss, MO_32, MO_32) +DO_STN_2(3, ss, MO_32, MO_32) +DO_STN_2(4, ss, MO_32, MO_32) + +DO_STN_2(1, dd, MO_64, MO_64) +DO_STN_2(2, dd, MO_64, MO_64) +DO_STN_2(3, dd, MO_64, MO_64) +DO_STN_2(4, dd, 
MO_64, MO_64) + +#undef DO_STN_1 +#undef DO_STN_2 + +/* + * Loads with a vector index. + */ + +/* + * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. + */ +typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); + +static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) +{ + return *(uint32_t *)(reg + H1_4(reg_ofs)); +} + +static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) +{ + return *(int32_t *)(reg + H1_4(reg_ofs)); +} + +static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) +{ + return (uint32_t)*(uint64_t *)(reg + reg_ofs); +} + +static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) +{ + return (int32_t)*(uint64_t *)(reg + reg_ofs); +} + +static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) +{ + return *(uint64_t *)(reg + reg_ofs); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + uint32_t mtedesc, int esize, int msize, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const int mmu_idx = cpu_mmu_index(env, false); + const intptr_t reg_max = simd_oprsz(desc); + const int scale = simd_data(desc); + ARMVectorReg scratch; + intptr_t reg_off; + SVEHostPage info, info2; + + memset(&scratch, 0, reg_max); + reg_off = 0; + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if (likely(pg & 1)) { + target_ulong addr = base + (off_fn(vm, reg_off) << scale); + target_ulong in_page = -(addr | TARGET_PAGE_MASK); + + sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, + mmu_idx, retaddr); + + if (likely(in_page >= msize)) { + if (unlikely(info.flags & TLB_WATCHPOINT)) { + cpu_check_watchpoint(env_cpu(env), addr, msize, + info.attrs, BP_MEM_READ, retaddr); + } + if (mtedesc && info.tagged) { + mte_check(env, mtedesc, addr, retaddr); + } + if (unlikely(info.flags & TLB_MMIO)) { + tlb_fn(env, &scratch, reg_off, addr, retaddr); + } else { + host_fn(&scratch, reg_off, info.host); + } + } else { + /* Element crosses the page boundary. */ + sve_probe_page(&info2, false, env, addr + in_page, 0, + MMU_DATA_LOAD, mmu_idx, retaddr); + if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { + cpu_check_watchpoint(env_cpu(env), addr, + msize, info.attrs, + BP_MEM_READ, retaddr); + } + if (mtedesc && info.tagged) { + mte_check(env, mtedesc, addr, retaddr); + } + tlb_fn(env, &scratch, reg_off, addr, retaddr); + } + } + reg_off += esize; + pg >>= esize; + } while (reg_off & 63); + } while (reg_off < reg_max); + + /* Wait until all exceptions have been raised to write back. */ + memcpy(vd, &scratch, reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + int esize, int msize, zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
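+     * Until then, mtedesc is passed through unchanged and the per-element
+     * mte_check() in sve_ld1_z(), gated on info.tagged, performs the check.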
+ */ + sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esize, msize, off_fn, host_fn, tlb_fn); +} + +#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +DO_LD1_ZPZ_S(bsu, zsu, MO_8) +DO_LD1_ZPZ_S(bsu, zss, MO_8) +DO_LD1_ZPZ_D(bdu, zsu, MO_8) +DO_LD1_ZPZ_D(bdu, zss, MO_8) +DO_LD1_ZPZ_D(bdu, zd, MO_8) + +DO_LD1_ZPZ_S(bss, zsu, MO_8) +DO_LD1_ZPZ_S(bss, zss, MO_8) +DO_LD1_ZPZ_D(bds, zsu, MO_8) +DO_LD1_ZPZ_D(bds, zss, MO_8) +DO_LD1_ZPZ_D(bds, zd, MO_8) + +DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) +DO_LD1_ZPZ_S(hsu_le, zss, MO_16) +DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) +DO_LD1_ZPZ_D(hdu_le, zss, MO_16) +DO_LD1_ZPZ_D(hdu_le, zd, MO_16) + +DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) +DO_LD1_ZPZ_S(hsu_be, zss, MO_16) +DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) +DO_LD1_ZPZ_D(hdu_be, zss, MO_16) +DO_LD1_ZPZ_D(hdu_be, zd, MO_16) + +DO_LD1_ZPZ_S(hss_le, zsu, MO_16) +DO_LD1_ZPZ_S(hss_le, zss, MO_16) +DO_LD1_ZPZ_D(hds_le, zsu, MO_16) +DO_LD1_ZPZ_D(hds_le, zss, MO_16) +DO_LD1_ZPZ_D(hds_le, zd, MO_16) + +DO_LD1_ZPZ_S(hss_be, zsu, MO_16) +DO_LD1_ZPZ_S(hss_be, zss, MO_16) +DO_LD1_ZPZ_D(hds_be, zsu, MO_16) +DO_LD1_ZPZ_D(hds_be, zss, MO_16) +DO_LD1_ZPZ_D(hds_be, zd, MO_16) + +DO_LD1_ZPZ_S(ss_le, zsu, MO_32) +DO_LD1_ZPZ_S(ss_le, zss, MO_32) +DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) +DO_LD1_ZPZ_D(sdu_le, zss, MO_32) +DO_LD1_ZPZ_D(sdu_le, zd, MO_32) + +DO_LD1_ZPZ_S(ss_be, zsu, MO_32) +DO_LD1_ZPZ_S(ss_be, zss, MO_32) +DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) +DO_LD1_ZPZ_D(sdu_be, zss, MO_32) +DO_LD1_ZPZ_D(sdu_be, zd, MO_32) + +DO_LD1_ZPZ_D(sds_le, zsu, MO_32) +DO_LD1_ZPZ_D(sds_le, zss, MO_32) +DO_LD1_ZPZ_D(sds_le, zd, MO_32) + +DO_LD1_ZPZ_D(sds_be, zsu, MO_32) +DO_LD1_ZPZ_D(sds_be, zss, MO_32) +DO_LD1_ZPZ_D(sds_be, zd, MO_32) + +DO_LD1_ZPZ_D(dd_le, zsu, MO_64) +DO_LD1_ZPZ_D(dd_le, zss, MO_64) +DO_LD1_ZPZ_D(dd_le, zd, MO_64) + +DO_LD1_ZPZ_D(dd_be, zsu, MO_64) +DO_LD1_ZPZ_D(dd_be, zss, MO_64) +DO_LD1_ZPZ_D(dd_be, zd, MO_64) + +#undef DO_LD1_ZPZ_S +#undef DO_LD1_ZPZ_D + +/* First fault loads with a vector index. */ + +/* + * Common helpers for all gather first-faulting loads. 
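+ *
+ * The first active element is loaded with faults allowed (via tlb_fn);
+ * the remaining elements use non-faulting probes, and any element that
+ * cannot be loaded without faulting is reported via record_fault().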
+ */ + +static inline QEMU_ALWAYS_INLINE +void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + uint32_t mtedesc, const int esz, const int msz, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const int mmu_idx = cpu_mmu_index(env, false); + const intptr_t reg_max = simd_oprsz(desc); + const int scale = simd_data(desc); + const int esize = 1 << esz; + const int msize = 1 << msz; + intptr_t reg_off; + SVEHostPage info; + target_ulong addr, in_page; + + /* Skip to the first true predicate. */ + reg_off = find_next_active(vg, 0, reg_max, esz); + if (unlikely(reg_off >= reg_max)) { + /* The entire predicate was false; no load occurs. */ + memset(vd, 0, reg_max); + return; + } + + /* + * Probe the first element, allowing faults. + */ + addr = base + (off_fn(vm, reg_off) << scale); + if (mtedesc) { + mte_check(env, mtedesc, addr, retaddr); + } + tlb_fn(env, vd, reg_off, addr, retaddr); + + /* After any fault, zero the other elements. */ + swap_memzero(vd, reg_off); + reg_off += esize; + swap_memzero(vd + reg_off, reg_max - reg_off); + + /* + * Probe the remaining elements, not allowing faults. + */ + while (reg_off < reg_max) { + uint64_t pg = vg[reg_off >> 6]; + do { + if (likely((pg >> (reg_off & 63)) & 1)) { + addr = base + (off_fn(vm, reg_off) << scale); + in_page = -(addr | TARGET_PAGE_MASK); + + if (unlikely(in_page < msize)) { + /* Stop if the element crosses a page boundary. */ + goto fault; + } + + sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, + mmu_idx, retaddr); + if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { + goto fault; + } + if (unlikely(info.flags & TLB_WATCHPOINT) && + (cpu_watchpoint_address_matches + (env_cpu(env), addr, msize) & BP_MEM_READ)) { + goto fault; + } + if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { + goto fault; + } + + host_fn(vd, reg_off, info.host); + } + reg_off += esize; + } while (reg_off & 63); + } + return; + + fault: + record_fault(env, reg_off, reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + const int esz, const int msz, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
+ */ + sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esz, msz, off_fn, host_fn, tlb_fn); +} + +#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_ldff##MEM##_##OFS) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ldff##MEM##_##OFS##_mte) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_ldff##MEM##_##OFS) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ldff##MEM##_##OFS##_mte) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) +DO_LDFF1_ZPZ_S(bsu, zss, MO_8) +DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) +DO_LDFF1_ZPZ_D(bdu, zss, MO_8) +DO_LDFF1_ZPZ_D(bdu, zd, MO_8) + +DO_LDFF1_ZPZ_S(bss, zsu, MO_8) +DO_LDFF1_ZPZ_S(bss, zss, MO_8) +DO_LDFF1_ZPZ_D(bds, zsu, MO_8) +DO_LDFF1_ZPZ_D(bds, zss, MO_8) +DO_LDFF1_ZPZ_D(bds, zd, MO_8) + +DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) +DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) +DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) + +DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) +DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) +DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) + +DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) +DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) +DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) + +DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) +DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) +DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) +DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) + +DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) +DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) +DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) + +DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) +DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) +DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) +DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) + +DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) +DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) +DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) + +DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) +DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) +DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) + +DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) +DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) +DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) + +DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) +DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) +DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) + +/* Stores with a vector index. 
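+ *
+ * sve_st1_z below works in two passes: the first probes every active
+ * element, raising any watchpoint, MTE or translation fault before any
+ * data is written; the second performs the stores, reusing the host
+ * addresses recorded by the first pass for the fast path.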
*/ + +static inline QEMU_ALWAYS_INLINE +void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + uint32_t mtedesc, int esize, int msize, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + const int mmu_idx = cpu_mmu_index(env, false); + const intptr_t reg_max = simd_oprsz(desc); + const int scale = simd_data(desc); + void *host[ARM_MAX_VQ * 4]; + intptr_t reg_off, i; + SVEHostPage info, info2; + + /* + * Probe all of the elements for host addresses and flags. + */ + i = reg_off = 0; + do { + uint64_t pg = vg[reg_off >> 6]; + do { + target_ulong addr = base + (off_fn(vm, reg_off) << scale); + target_ulong in_page = -(addr | TARGET_PAGE_MASK); + + host[i] = NULL; + if (likely((pg >> (reg_off & 63)) & 1)) { + if (likely(in_page >= msize)) { + sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, + mmu_idx, retaddr); + if (!(info.flags & TLB_MMIO)) { + host[i] = info.host; + } + } else { + /* + * Element crosses the page boundary. + * Probe both pages, but do not record the host address, + * so that we use the slow path. + */ + sve_probe_page(&info, false, env, addr, 0, + MMU_DATA_STORE, mmu_idx, retaddr); + sve_probe_page(&info2, false, env, addr + in_page, 0, + MMU_DATA_STORE, mmu_idx, retaddr); + info.flags |= info2.flags; + } + + if (unlikely(info.flags & TLB_WATCHPOINT)) { + cpu_check_watchpoint(env_cpu(env), addr, msize, + info.attrs, BP_MEM_WRITE, retaddr); + } + + if (mtedesc && info.tagged) { + mte_check(env, mtedesc, addr, retaddr); + } + } + i += 1; + reg_off += esize; + } while (reg_off & 63); + } while (reg_off < reg_max); + + /* + * Now that we have recognized all exceptions except SyncExternal + * (from TLB_MMIO), which we cannot avoid, perform all of the stores. + * + * Note for the common case of an element in RAM, not crossing a page + * boundary, we have stored the host address in host[]. This doubles + * as a first-level check against the predicate, since only enabled + * elements have non-null host addresses. + */ + i = reg_off = 0; + do { + void *h = host[i]; + if (likely(h != NULL)) { + host_fn(vd, reg_off, h); + } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { + target_ulong addr = base + (off_fn(vm, reg_off) << scale); + tlb_fn(env, vd, reg_off, addr, retaddr); + } + i += 1; + reg_off += esize; + } while (reg_off < reg_max); +} + +static inline QEMU_ALWAYS_INLINE +void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + int esize, int msize, zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
+ */ + sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esize, msize, off_fn, host_fn, tlb_fn); +} + +#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ + off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} + +#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} + +DO_ST1_ZPZ_S(bs, zsu, MO_8) +DO_ST1_ZPZ_S(hs_le, zsu, MO_16) +DO_ST1_ZPZ_S(hs_be, zsu, MO_16) +DO_ST1_ZPZ_S(ss_le, zsu, MO_32) +DO_ST1_ZPZ_S(ss_be, zsu, MO_32) + +DO_ST1_ZPZ_S(bs, zss, MO_8) +DO_ST1_ZPZ_S(hs_le, zss, MO_16) +DO_ST1_ZPZ_S(hs_be, zss, MO_16) +DO_ST1_ZPZ_S(ss_le, zss, MO_32) +DO_ST1_ZPZ_S(ss_be, zss, MO_32) + +DO_ST1_ZPZ_D(bd, zsu, MO_8) +DO_ST1_ZPZ_D(hd_le, zsu, MO_16) +DO_ST1_ZPZ_D(hd_be, zsu, MO_16) +DO_ST1_ZPZ_D(sd_le, zsu, MO_32) +DO_ST1_ZPZ_D(sd_be, zsu, MO_32) +DO_ST1_ZPZ_D(dd_le, zsu, MO_64) +DO_ST1_ZPZ_D(dd_be, zsu, MO_64) + +DO_ST1_ZPZ_D(bd, zss, MO_8) +DO_ST1_ZPZ_D(hd_le, zss, MO_16) +DO_ST1_ZPZ_D(hd_be, zss, MO_16) +DO_ST1_ZPZ_D(sd_le, zss, MO_32) +DO_ST1_ZPZ_D(sd_be, zss, MO_32) +DO_ST1_ZPZ_D(dd_le, zss, MO_64) +DO_ST1_ZPZ_D(dd_be, zss, MO_64) + +DO_ST1_ZPZ_D(bd, zd, MO_8) +DO_ST1_ZPZ_D(hd_le, zd, MO_16) +DO_ST1_ZPZ_D(hd_be, zd, MO_16) +DO_ST1_ZPZ_D(sd_le, zd, MO_32) +DO_ST1_ZPZ_D(sd_be, zd, MO_32) +DO_ST1_ZPZ_D(dd_le, zd, MO_64) +DO_ST1_ZPZ_D(dd_be, zd, MO_64) + +#undef DO_ST1_ZPZ_S +#undef DO_ST1_ZPZ_D + +void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = n[i] ^ m[i] ^ k[i]; + } +} + +void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = n[i] ^ (m[i] & ~k[i]); + } +} + +void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); + } +} + +void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); + } +} + +void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + uint64_t *d = vd, *n = vn, *m = vm, *k = vk; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ~((n[i] & k[i]) | 
(m[i] & ~k[i])); + } +} + +/* + * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. + * See hasless(v,1) from + * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord + */ +static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) +{ + int bits = 8 << esz; + uint64_t ones = dup_const(esz, 1); + uint64_t signs = ones << (bits - 1); + uint64_t cmp0, cmp1; + + cmp1 = dup_const(esz, n); + cmp0 = cmp1 ^ m0; + cmp1 = cmp1 ^ m1; + cmp0 = (cmp0 - ones) & ~cmp0; + cmp1 = (cmp1 - ones) & ~cmp1; + return (cmp0 | cmp1) & signs; +} + +static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, + uint32_t desc, int esz, bool nmatch) +{ + uint16_t esz_mask = pred_esz_masks[esz]; + intptr_t opr_sz = simd_oprsz(desc); + uint32_t flags = PREDTEST_INIT; + intptr_t i, j, k; + + for (i = 0; i < opr_sz; i += 16) { + uint64_t m0 = *(uint64_t *)(vm + i); + uint64_t m1 = *(uint64_t *)(vm + i + 8); + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; + uint16_t out = 0; + + for (j = 0; j < 16; j += 8) { + uint64_t n = *(uint64_t *)(vn + i + j); + + for (k = 0; k < 8; k += 1 << esz) { + if (pg & (1 << (j + k))) { + bool o = do_match2(n >> (k * 8), m0, m1, esz); + out |= (o ^ nmatch) << (j + k); + } + } + } + *(uint16_t *)(vd + H1_2(i >> 3)) = out; + flags = iter_predtest_fwd(out, pg, flags); + } + return flags; +} + +#define DO_PPZZ_MATCH(NAME, ESZ, INV) \ +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ +} + +DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) +DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) + +DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) +DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) + +#undef DO_PPZZ_MATCH + +void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, + uint32_t desc) +{ + ARMVectorReg scratch; + intptr_t i, j; + intptr_t opr_sz = simd_oprsz(desc); + uint32_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + if (d == n) { + n = memcpy(&scratch, n, opr_sz); + if (d == m) { + m = n; + } + } else if (d == m) { + m = memcpy(&scratch, m, opr_sz); + } + + for (i = 0; i < opr_sz; i += 4) { + uint64_t count = 0; + uint8_t pred; + + pred = pg[H1(i >> 3)] >> (i & 7); + if (pred & 1) { + uint32_t nn = n[H4(i >> 2)]; + + for (j = 0; j <= i; j += 4) { + pred = pg[H1(j >> 3)] >> (j & 7); + if ((pred & 1) && nn == m[H4(j >> 2)]) { + ++count; + } + } + } + d[H4(i >> 2)] = count; + } +} + +void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, + uint32_t desc) +{ + ARMVectorReg scratch; + intptr_t i, j; + intptr_t opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint8_t *pg = vg; + + if (d == n) { + n = memcpy(&scratch, n, opr_sz); + if (d == m) { + m = n; + } + } else if (d == m) { + m = memcpy(&scratch, m, opr_sz); + } + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t count = 0; + if (pg[H1(i)] & 1) { + uint64_t nn = n[i]; + for (j = 0; j <= i; ++j) { + if ((pg[H1(j)] & 1) && nn == m[j]) { + ++count; + } + } + } + d[i] = count; + } +} + +/* + * Returns the number of bytes in m0 and m1 that match n. + * Unlike do_match2 we don't just need true/false, we need an exact count. + * This requires two extra logical operations. 
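+ *
+ * For example, when the low byte of m0 equals n, that byte of cmp0 is
+ * zero; the arithmetic below turns it into 0x80, and the final ctpop64()
+ * then counts it as exactly one match.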
+ */ +static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) +{ + const uint64_t mask = dup_const(MO_8, 0x7f); + uint64_t cmp0, cmp1; + + cmp1 = dup_const(MO_8, n); + cmp0 = cmp1 ^ m0; + cmp1 = cmp1 ^ m1; + + /* + * 1: clear msb of each byte to avoid carry to next byte (& mask) + * 2: carry in to msb if byte != 0 (+ mask) + * 3: set msb if cmp has msb set (| cmp) + * 4: set ~msb to ignore them (| mask) + * We now have 0xff for byte != 0 or 0x7f for byte == 0. + * 5: invert, resulting in 0x80 if and only if byte == 0. + */ + cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); + cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); + + /* + * Combine the two compares in a way that the bits do + * not overlap, and so preserves the count of set bits. + * If the host has an efficient instruction for ctpop, + * then ctpop(x) + ctpop(y) has the same number of + * operations as ctpop(x | (y >> 1)). If the host does + * not have an efficient ctpop, then we only want to + * use it once. + */ + return ctpop64(cmp0 | (cmp1 >> 1)); +} + +void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j; + intptr_t opr_sz = simd_oprsz(desc); + + for (i = 0; i < opr_sz; i += 16) { + uint64_t n0 = *(uint64_t *)(vn + i); + uint64_t m0 = *(uint64_t *)(vm + i); + uint64_t n1 = *(uint64_t *)(vn + i + 8); + uint64_t m1 = *(uint64_t *)(vm + i + 8); + uint64_t out0 = 0; + uint64_t out1 = 0; + + for (j = 0; j < 64; j += 8) { + uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); + uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); + out0 |= cnt0 << j; + out1 |= cnt1 << j; + } + + *(uint64_t *)(vd + i) = out0; + *(uint64_t *)(vd + i + 8) = out1; + } +} + +void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + int shr = simd_data(desc); + int shl = 8 - shr; + uint64_t mask = dup_const(MO_8, 0xff >> shr); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + uint64_t t = n[i] ^ m[i]; + d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); + } +} + +void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + int shr = simd_data(desc); + int shl = 16 - shr; + uint64_t mask = dup_const(MO_16, 0xffff >> shr); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + uint64_t t = n[i] ^ m[i]; + d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); + } +} + +void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 4; + int shr = simd_data(desc); + uint32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ror32(n[i] ^ m[i], shr); + } +} + +void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, + void *status, uint32_t desc) +{ + intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); + + for (s = 0; s < opr_sz; ++s) { + float32 *n = vn + s * sizeof(float32) * 4; + float32 *m = vm + s * sizeof(float32) * 4; + float32 *a = va + s * sizeof(float32) * 4; + float32 *d = vd + s * sizeof(float32) * 4; + float32 n00 = n[H4(0)], n01 = n[H4(1)]; + float32 n10 = n[H4(2)], n11 = n[H4(3)]; + float32 m00 = m[H4(0)], m01 = m[H4(1)]; + float32 m10 = m[H4(2)], m11 = m[H4(3)]; + float32 p0, p1; + + /* i = 0, j = 0 */ + p0 = float32_mul(n00, m00, status); + p1 = float32_mul(n01, m01, status); + d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); + + /* i = 0, j = 1 */ + p0 = float32_mul(n00, m10, status); + p1 = float32_mul(n01, m11, status); + d[H4(1)] 
= float32_add(a[H4(1)], float32_add(p0, p1, status), status); + + /* i = 1, j = 0 */ + p0 = float32_mul(n10, m00, status); + p1 = float32_mul(n11, m01, status); + d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); + + /* i = 1, j = 1 */ + p0 = float32_mul(n10, m10, status); + p1 = float32_mul(n11, m11, status); + d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); + } +} + +void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, + void *status, uint32_t desc) +{ + intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); + + for (s = 0; s < opr_sz; ++s) { + float64 *n = vn + s * sizeof(float64) * 4; + float64 *m = vm + s * sizeof(float64) * 4; + float64 *a = va + s * sizeof(float64) * 4; + float64 *d = vd + s * sizeof(float64) * 4; + float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; + float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; + float64 p0, p1; + + /* i = 0, j = 0 */ + p0 = float64_mul(n00, m00, status); + p1 = float64_mul(n01, m01, status); + d[0] = float64_add(a[0], float64_add(p0, p1, status), status); + + /* i = 0, j = 1 */ + p0 = float64_mul(n00, m10, status); + p1 = float64_mul(n01, m11, status); + d[1] = float64_add(a[1], float64_add(p0, p1, status), status); + + /* i = 1, j = 0 */ + p0 = float64_mul(n10, m00, status); + p1 = float64_mul(n11, m01, status); + d[2] = float64_add(a[2], float64_add(p0, p1, status), status); + + /* i = 1, j = 1 */ + p0 = float64_mul(n10, m10, status); + p1 = float64_mul(n11, m11, status); + d[3] = float64_add(a[3], float64_add(p0, p1, status), status); + } +} + +#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPEW); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPEW nn = *(TYPEW *)(vn + HW(i)); \ + *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) +DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) +DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) + +#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPEW); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ + *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) +DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) + +#undef DO_FCVTLT +#undef DO_FCVTNT diff --git a/target/arm/tcg/tlb_helper.c b/target/arm/tcg/tlb_helper.c new file mode 100644 index 0000000000..60abcbebe6 --- /dev/null +++ b/target/arm/tcg/tlb_helper.c @@ -0,0 +1,287 @@ +/* + * ARM TLB (Translation lookaside buffer) helpers. + * + * This code is licensed under the GNU GPL v2 or later. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/helper-proto.h" + + +/* Return true if the translation regime is using LPAE format page tables */ +bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + int el = regime_el(env, mmu_idx); + if (el == 2 || arm_el_is_aa64(env, el)) { + return true; + } + if (arm_feature(env, ARM_FEATURE_PMSA) && + arm_feature(env, ARM_FEATURE_V8)) { + return true; + } + if (arm_feature(env, ARM_FEATURE_LPAE) + && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) { + return true; + } + return false; +} + +/* + * Returns true if the stage 1 translation regime is using LPAE format page + * tables. Used when raising alignment exceptions, whose FSR changes depending + * on whether the long or short descriptor format is in use. + */ +bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + mmu_idx = stage_1_mmu_idx(mmu_idx); + return regime_using_lpae_format(env, mmu_idx); +} + +static inline uint32_t merge_syn_data_abort(uint32_t template_syn, + unsigned int target_el, + bool same_el, bool ea, + bool s1ptw, bool is_write, + int fsc) +{ + uint32_t syn; + + /* + * ISV is only set for data aborts routed to EL2 and + * never for stage-1 page table walks faulting on stage 2. + * + * Furthermore, ISV is only set for certain kinds of load/stores. + * If the template syndrome does not have ISV set, we should leave + * it cleared. + * + * See ARMv8 specs, D7-1974: + * ISS encoding for an exception from a Data Abort, the + * ISV field. + */ + if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) { + syn = syn_data_abort_no_iss(same_el, 0, + ea, 0, s1ptw, is_write, fsc); + } else { + /* + * Fields: IL, ISV, SAS, SSE, SRT, SF and AR come from the template + * syndrome created at translation time. + * Now we create the runtime syndrome with the remaining fields. + */ + syn = syn_data_abort_with_iss(same_el, + 0, 0, 0, 0, 0, + ea, 0, s1ptw, is_write, fsc, + true); + /* Merge the runtime syndrome with the template syndrome. */ + syn |= template_syn; + } + return syn; +} + +static uint32_t compute_fsr_fsc(CPUARMState *env, ARMMMUFaultInfo *fi, + int target_el, int mmu_idx, uint32_t *ret_fsc) +{ + ARMMMUIdx arm_mmu_idx = core_to_arm_mmu_idx(env, mmu_idx); + uint32_t fsr, fsc; + + if (target_el == 2 || arm_el_is_aa64(env, target_el) || + arm_s1_regime_using_lpae_format(env, arm_mmu_idx)) { + /* + * LPAE format fault status register : bottom 6 bits are + * status code in the same form as needed for syndrome + */ + fsr = arm_fi_to_lfsc(fi); + fsc = extract32(fsr, 0, 6); + } else { + fsr = arm_fi_to_sfsc(fi); + /* + * Short format FSR : this fault will never actually be reported + * to an EL that uses a syndrome register. Use a (currently) + * reserved FSR code in case the constructed syndrome does leak + * into the guest somehow. 
+ */ + fsc = 0x3f; + } + + *ret_fsc = fsc; + return fsr; +} + +static G_NORETURN +void arm_deliver_fault(ARMCPU *cpu, vaddr addr, + MMUAccessType access_type, + int mmu_idx, ARMMMUFaultInfo *fi) +{ + CPUARMState *env = &cpu->env; + int target_el; + bool same_el; + uint32_t syn, exc, fsr, fsc; + + target_el = exception_target_el(env); + if (fi->stage2) { + target_el = 2; + env->cp15.hpfar_el2 = extract64(fi->s2addr, 12, 47) << 4; + if (arm_is_secure_below_el3(env) && fi->s1ns) { + env->cp15.hpfar_el2 |= HPFAR_NS; + } + } + same_el = (arm_current_el(env) == target_el); + + fsr = compute_fsr_fsc(env, fi, target_el, mmu_idx, &fsc); + + if (access_type == MMU_INST_FETCH) { + syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc); + exc = EXCP_PREFETCH_ABORT; + } else { + syn = merge_syn_data_abort(env->exception.syndrome, target_el, + same_el, fi->ea, fi->s1ptw, + access_type == MMU_DATA_STORE, + fsc); + if (access_type == MMU_DATA_STORE + && arm_feature(env, ARM_FEATURE_V6)) { + fsr |= (1 << 11); + } + exc = EXCP_DATA_ABORT; + } + + env->exception.vaddress = addr; + env->exception.fsr = fsr; + raise_exception(env, exc, syn, target_el); +} + +/* Raise a data fault alignment exception for the specified virtual address */ +void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, + MMUAccessType access_type, + int mmu_idx, uintptr_t retaddr) +{ + ARMCPU *cpu = ARM_CPU(cs); + ARMMMUFaultInfo fi = {}; + + /* now we have a real cpu fault */ + cpu_restore_state(cs, retaddr); + + fi.type = ARMFault_Alignment; + arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi); +} + +void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc) +{ + ARMMMUFaultInfo fi = { .type = ARMFault_Alignment }; + int target_el = exception_target_el(env); + int mmu_idx = cpu_mmu_index(env, true); + uint32_t fsc; + + env->exception.vaddress = pc; + + /* + * Note that the fsc is not applicable to this exception, + * since any syndrome is pcalignment not insn_abort. + */ + env->exception.fsr = compute_fsr_fsc(env, &fi, target_el, mmu_idx, &fsc); + raise_exception(env, EXCP_PREFETCH_ABORT, syn_pcalignment(), target_el); +} + +#if !defined(CONFIG_USER_ONLY) + +/* + * arm_cpu_do_transaction_failed: handle a memory system error response + * (eg "no device/memory present at address") by raising an external abort + * exception + */ +void arm_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, + vaddr addr, unsigned size, + MMUAccessType access_type, + int mmu_idx, MemTxAttrs attrs, + MemTxResult response, uintptr_t retaddr) +{ + ARMCPU *cpu = ARM_CPU(cs); + ARMMMUFaultInfo fi = {}; + + /* now we have a real cpu fault */ + cpu_restore_state(cs, retaddr); + + fi.ea = arm_extabort_type(response); + fi.type = ARMFault_SyncExternal; + arm_deliver_fault(cpu, addr, access_type, mmu_idx, &fi); +} + +bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, + MMUAccessType access_type, int mmu_idx, + bool probe, uintptr_t retaddr) +{ + ARMCPU *cpu = ARM_CPU(cs); + GetPhysAddrResult res = {}; + ARMMMUFaultInfo local_fi, *fi; + int ret; + + /* + * Allow S1_ptw_translate to see any fault generated here. + * Since this may recurse, read and clear. + */ + fi = cpu->env.tlb_fi; + if (fi) { + cpu->env.tlb_fi = NULL; + } else { + fi = memset(&local_fi, 0, sizeof(local_fi)); + } + + /* + * Walk the page table and (if the mapping exists) add the page + * to the TLB. On success, return true. Otherwise, if probing, + * return false. Otherwise populate fsr with ARM DFSR/IFSR fault + * register format, and signal the fault. 
+ */ + ret = get_phys_addr(&cpu->env, address, access_type, + core_to_arm_mmu_idx(&cpu->env, mmu_idx), + &res, fi); + if (likely(!ret)) { + /* + * Map a single [sub]page. Regions smaller than our declared + * target page size are handled specially, so for those we + * pass in the exact addresses. + */ + if (res.f.lg_page_size >= TARGET_PAGE_BITS) { + res.f.phys_addr &= TARGET_PAGE_MASK; + address &= TARGET_PAGE_MASK; + } + + res.f.pte_attrs = res.cacheattrs.attrs; + res.f.shareability = res.cacheattrs.shareability; + + tlb_set_page_full(cs, mmu_idx, address, &res.f); + return true; + } else if (probe) { + return false; + } else { + /* now we have a real cpu fault */ + cpu_restore_state(cs, retaddr); + arm_deliver_fault(cpu, address, access_type, mmu_idx, fi); + } +} +#else +void arm_cpu_record_sigsegv(CPUState *cs, vaddr addr, + MMUAccessType access_type, + bool maperr, uintptr_t ra) +{ + ARMMMUFaultInfo fi = { + .type = maperr ? ARMFault_Translation : ARMFault_Permission, + .level = 3, + }; + ARMCPU *cpu = ARM_CPU(cs); + + /* + * We report both ESR and FAR to signal handlers. + * For now, it's easiest to deliver the fault normally. + */ + cpu_restore_state(cs, ra); + arm_deliver_fault(cpu, addr, access_type, MMU_USER_IDX, &fi); +} + +void arm_cpu_record_sigbus(CPUState *cs, vaddr addr, + MMUAccessType access_type, uintptr_t ra) +{ + arm_cpu_do_unaligned_access(cs, addr, access_type, MMU_USER_IDX, ra); +} +#endif /* !defined(CONFIG_USER_ONLY) */ diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c new file mode 100644 index 0000000000..f59d3b26ea --- /dev/null +++ b/target/arm/tcg/vec_helper.c @@ -0,0 +1,2716 @@ +/* + * ARM AdvSIMD / SVE Vector Operations + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "fpu/softfloat.h" +#include "qemu/int128.h" +#include "vec_internal.h" + +/* + * Data for expanding active predicate bits to bytes, for byte elements. 
+ * + * for (i = 0; i < 256; ++i) { + * unsigned long m = 0; + * for (j = 0; j < 8; j++) { + * if ((i >> j) & 1) { + * m |= 0xfful << (j << 3); + * } + * } + * printf("0x%016lx,\n", m); + * } + */ +const uint64_t expand_pred_b_data[256] = { + 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, + 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, + 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, + 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, + 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, + 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, + 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, + 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, + 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, + 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, + 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, + 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, + 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, + 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, + 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, + 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, + 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, + 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, + 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, + 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, + 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, + 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, + 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, + 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, + 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, + 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, + 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, + 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, + 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, + 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, + 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, + 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, + 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, + 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, + 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, + 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, + 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, + 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, + 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, + 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, + 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, + 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, + 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, + 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, + 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, + 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, + 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, + 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, + 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, + 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, + 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, + 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, + 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, + 0xff0000ffffffffff, 
0xff00ff0000000000, 0xff00ff00000000ff, + 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, + 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, + 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, + 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, + 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, + 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, + 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, + 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, + 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, + 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, + 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, + 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, + 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, + 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, + 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, + 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, + 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, + 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, + 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, + 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, + 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, + 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, + 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, + 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, + 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, + 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, + 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, + 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, + 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, + 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, + 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, + 0xffffffffffffffff, +}; + +/* + * Similarly for half-word elements. + * for (i = 0; i < 256; ++i) { + * unsigned long m = 0; + * if (i & 0xaa) { + * continue; + * } + * for (j = 0; j < 8; j += 2) { + * if ((i >> j) & 1) { + * m |= 0xfffful << (j << 3); + * } + * } + * printf("[0x%x] = 0x%016lx,\n", i, m); + * } + */ +const uint64_t expand_pred_h_data[0x55 + 1] = { + [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, + [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, + [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, + [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, + [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, + [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, + [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, + [0x55] = 0xffffffffffffffff, +}; + +/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ +int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, + bool neg, bool round) +{ + /* + * Simplify: + * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 + * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 + */ + int32_t ret = (int32_t)src1 * src2; + if (neg) { + ret = -ret; + } + ret += ((int32_t)src3 << 7) + (round << 6); + ret >>= 7; + + if (ret != (int8_t)ret) { + ret = (ret < 0 ? 
INT8_MIN : INT8_MAX); + } + return ret; +} + +void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); + } +} + +void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); + } +} + +void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); + } +} + +void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ +int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, + bool neg, bool round, uint32_t *sat) +{ + /* Simplify similarly to do_sqrdmlah_b above. */ + int32_t ret = (int32_t)src1 * src2; + if (neg) { + ret = -ret; + } + ret += ((int32_t)src3 << 15) + (round << 14); + ret >>= 15; + + if (ret != (int16_t)ret) { + *sat = 1; + ret = (ret < 0 ? INT16_MIN : INT16_MAX); + } + return ret; +} + +uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, + uint32_t src2, uint32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); + uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, + false, true, sat); + return deposit32(e1, 16, 16, e2); +} + +void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int16_t *d = vd; + int16_t *n = vn; + int16_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, + uint32_t src2, uint32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); + uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, + true, true, sat); + return deposit32(e1, 16, 16, e2); +} + +void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int16_t *d = vd; + int16_t *n = vn; + int16_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); + } + clear_tail(d, opr_sz, 
simd_maxsz(desc)); +} + +void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); + } +} + +void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); + } +} + +void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 2; i += 16 / 2) { + int16_t mm = m[i]; + for (j = 0; j < 16 / 2; ++j) { + d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); + } + } +} + +void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 2; i += 16 / 2) { + int16_t mm = m[i]; + for (j = 0; j < 16 / 2; ++j) { + d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); + } + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ +int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, + bool neg, bool round, uint32_t *sat) +{ + /* Simplify similarly to do_sqrdmlah_b above. */ + int64_t ret = (int64_t)src1 * src2; + if (neg) { + ret = -ret; + } + ret += ((int64_t)src3 << 31) + (round << 30); + ret >>= 31; + + if (ret != (int32_t)ret) { + *sat = 1; + ret = (ret < 0 ? 
INT32_MIN : INT32_MAX); + } + return ret; +} + +uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, + int32_t src2, int32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + return do_sqrdmlah_s(src1, src2, src3, false, true, sat); +} + +void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int32_t *d = vd; + int32_t *n = vn; + int32_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, + int32_t src2, int32_t src3) +{ + uint32_t *sat = &env->vfp.qc[0]; + return do_sqrdmlah_s(src1, src2, src3, true, true, sat); +} + +void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int32_t *d = vd; + int32_t *n = vn; + int32_t *m = vm; + uintptr_t i; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, + void *vq, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); + } +} + +void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm, *a = va; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); + } +} + +void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + uint32_t discard; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); + } +} + +void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 4; i += 16 / 4) { + int32_t mm = m[i]; + for (j = 0; j < 16 / 4; ++j) { + d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); + } + } +} + +void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int32_t *d = 
vd, *n = vn, *m = (int32_t *)vm + H4(idx); + uint32_t discard; + + for (i = 0; i < opr_sz / 4; i += 16 / 4) { + int32_t mm = m[i]; + for (j = 0; j < 16 / 4; ++j) { + d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); + } + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ +static int64_t do_sat128_d(Int128 r) +{ + int64_t ls = int128_getlo(r); + int64_t hs = int128_gethi(r); + + if (unlikely(hs != (ls >> 63))) { + return hs < 0 ? INT64_MIN : INT64_MAX; + } + return ls; +} + +int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) +{ + uint64_t l, h; + Int128 r, t; + + /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ + muls64(&l, &h, m, n); + r = int128_make128(l, h); + if (neg) { + r = int128_neg(r); + } + if (a) { + t = int128_exts64(a); + t = int128_lshift(t, 63); + r = int128_add(r, t); + } + if (round) { + t = int128_exts64(1ll << 62); + r = int128_add(r, t); + } + r = int128_rshift(r, 63); + + return do_sat128_d(r); +} + +void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); + } +} + +void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm, *a = va; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); + } +} + +void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); + } +} + +void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); + } +} + +void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; + + for (i = 0; i < opr_sz / 8; i += 16 / 8) { + int64_t mm = m[i]; + for (j = 0; j < 16 / 8; ++j) { + d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); + } + } +} + +void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + int idx = simd_data(desc); + int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; + + for (i = 0; i < opr_sz / 8; i += 16 / 8) { + int64_t mm = m[i]; + for (j = 0; j < 16 / 8; ++j) { + d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); + } + } +} + +/* Integer 8 and 16-bit dot-product. + * + * Note that for the loops herein, host endianness does not matter + * with respect to the ordering of data within the quad-width lanes. + * All elements are treated equally, no matter where they are. 
+ */ + +#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPED *d = vd, *a = va; \ + TYPEN *n = vn; \ + TYPEM *m = vm; \ + for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ + d[i] = (a[i] + \ + (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ + (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ + (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ + (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) +DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) +DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) +DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) +DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) + +#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i = 0, opr_sz = simd_oprsz(desc); \ + intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ + intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ + intptr_t index = simd_data(desc); \ + TYPED *d = vd, *a = va; \ + TYPEN *n = vn; \ + TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ + do { \ + TYPED m0 = m_indexed[i * 4 + 0]; \ + TYPED m1 = m_indexed[i * 4 + 1]; \ + TYPED m2 = m_indexed[i * 4 + 2]; \ + TYPED m3 = m_indexed[i * 4 + 3]; \ + do { \ + d[i] = (a[i] + \ + n[i * 4 + 0] * m0 + \ + n[i * 4 + 1] * m1 + \ + n[i * 4 + 2] * m2 + \ + n[i * 4 + 3] * m3); \ + } while (++i < segend); \ + segend = i + 4; \ + } while (i < opr_sz_n); \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) +DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) +DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) +DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) +DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) +DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) + +void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd; + float16 *n = vn; + float16 *m = vm; + float_status *fpst = vfpst; + uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e0 = n[H2(i)]; + float16 e1 = m[H2(i + 1)] ^ neg_imag; + float16 e2 = n[H2(i + 1)]; + float16 e3 = m[H2(i)] ^ neg_real; + + d[H2(i)] = float16_add(e0, e1, fpst); + d[H2(i + 1)] = float16_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd; + float32 *n = vn; + float32 *m = vm; + float_status *fpst = vfpst; + uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. 
*/ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < opr_sz / 4; i += 2) { + float32 e0 = n[H4(i)]; + float32 e1 = m[H4(i + 1)] ^ neg_imag; + float32 e2 = n[H4(i + 1)]; + float32 e3 = m[H4(i)] ^ neg_real; + + d[H4(i)] = float32_add(e0, e1, fpst); + d[H4(i + 1)] = float32_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float64 *d = vd; + float64 *n = vn; + float64 *m = vm; + float_status *fpst = vfpst; + uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); + uint64_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 63; + neg_imag <<= 63; + + for (i = 0; i < opr_sz / 8; i += 2) { + float64 e0 = n[i]; + float64 e1 = m[i + 1] ^ neg_imag; + float64 e2 = n[i + 1]; + float64 e3 = m[i] ^ neg_real; + + d[i] = float64_add(e0, e1, fpst); + d[i + 1] = float64_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e2 = n[H2(i + flip)]; + float16 e1 = m[H2(i + flip)] ^ neg_real; + float16 e4 = e2; + float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; + + d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); + d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); + uint32_t neg_real = flip ^ neg_imag; + intptr_t elements = opr_sz / sizeof(float16); + intptr_t eltspersegment = 16 / sizeof(float16); + intptr_t i, j; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < elements; i += eltspersegment) { + float16 mr = m[H2(i + 2 * index + 0)]; + float16 mi = m[H2(i + 2 * index + 1)]; + float16 e1 = neg_real ^ (flip ? mi : mr); + float16 e3 = neg_imag ^ (flip ? mr : mi); + + for (j = i; j < i + eltspersegment; j += 2) { + float16 e2 = n[H2(j + flip)]; + float16 e4 = e2; + + d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); + d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. 
*/ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < opr_sz / 4; i += 2) { + float32 e2 = n[H4(i + flip)]; + float32 e1 = m[H4(i + flip)] ^ neg_real; + float32 e4 = e2; + float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; + + d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); + d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); + uint32_t neg_real = flip ^ neg_imag; + intptr_t elements = opr_sz / sizeof(float32); + intptr_t eltspersegment = 16 / sizeof(float32); + intptr_t i, j; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < elements; i += eltspersegment) { + float32 mr = m[H4(i + 2 * index + 0)]; + float32 mi = m[H4(i + 2 * index + 1)]; + float32 e1 = neg_real ^ (flip ? mi : mr); + float32 e3 = neg_imag ^ (flip ? mr : mi); + + for (j = i; j < i + eltspersegment; j += 2) { + float32 e2 = n[H4(j + flip)]; + float32 e4 = e2; + + d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); + d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float64 *d = vd, *n = vn, *m = vm, *a = va; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint64_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 63; + neg_imag <<= 63; + + for (i = 0; i < opr_sz / 8; i += 2) { + float64 e2 = n[i + flip]; + float64 e1 = m[i + flip] ^ neg_real; + float64 e4 = e2; + float64 e3 = m[i + 1 - flip] ^ neg_imag; + + d[i] = float64_muladd(e2, e1, a[i], 0, fpst); + d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * Floating point comparisons producing an integer result (all 1s or all 0s). + * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. + * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
+ */ +static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) +{ + return -float16_eq_quiet(op1, op2, stat); +} + +static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) +{ + return -float32_eq_quiet(op1, op2, stat); +} + +static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) +{ + return -float16_le(op2, op1, stat); +} + +static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) +{ + return -float32_le(op2, op1, stat); +} + +static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) +{ + return -float16_lt(op2, op1, stat); +} + +static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) +{ + return -float32_lt(op2, op1, stat); +} + +static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) +{ + return -float16_le(float16_abs(op2), float16_abs(op1), stat); +} + +static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) +{ + return -float32_le(float32_abs(op2), float32_abs(op1), stat); +} + +static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) +{ + return -float16_lt(float16_abs(op2), float16_abs(op1), stat); +} + +static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) +{ + return -float32_lt(float32_abs(op2), float32_abs(op1), stat); +} + +static int16_t vfp_tosszh(float16 x, void *fpstp) +{ + float_status *fpst = fpstp; + if (float16_is_any_nan(x)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_int16_round_to_zero(x, fpst); +} + +static uint16_t vfp_touszh(float16 x, void *fpstp) +{ + float_status *fpst = fpstp; + if (float16_is_any_nan(x)) { + float_raise(float_flag_invalid, fpst); + return 0; + } + return float16_to_uint16_round_to_zero(x, fpst); +} + +#define DO_2OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], stat); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) +DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) +DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) + +DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) +DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) +DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) + +DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) +DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) + +DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) +DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) +DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) +DO_2OP(gvec_touizs, helper_vfp_touizs, float32) +DO_2OP(gvec_sstoh, int16_to_float16, int16_t) +DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) +DO_2OP(gvec_tosszh, vfp_tosszh, float16) +DO_2OP(gvec_touszh, vfp_touszh, float16) + +#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ + static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ + { \ + return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ + } + +#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ + static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ + { \ + return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ + } + +#define DO_2OP_CMP0(FN, CMPOP, DIRN) \ + WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ + WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ + DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ + DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) + +DO_2OP_CMP0(cgt, cgt, FWD) +DO_2OP_CMP0(cge, cge, FWD) +DO_2OP_CMP0(ceq, ceq, FWD) 
+DO_2OP_CMP0(clt, cgt, REV) +DO_2OP_CMP0(cle, cge, REV) + +#undef DO_2OP +#undef DO_2OP_CMP0 + +/* Floating-point trigonometric starting value. + * See the ARM ARM pseudocode function FPTrigSMul. + */ +static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) +{ + float16 result = float16_mul(op1, op1, stat); + if (!float16_is_any_nan(result)) { + result = float16_set_sign(result, op2 & 1); + } + return result; +} + +static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) +{ + float32 result = float32_mul(op1, op1, stat); + if (!float32_is_any_nan(result)) { + result = float32_set_sign(result, op2 & 1); + } + return result; +} + +static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) +{ + float64 result = float64_mul(op1, op1, stat); + if (!float64_is_any_nan(result)) { + result = float64_set_sign(result, op2 & 1); + } + return result; +} + +static float16 float16_abd(float16 op1, float16 op2, float_status *stat) +{ + return float16_abs(float16_sub(op1, op2, stat)); +} + +static float32 float32_abd(float32 op1, float32 op2, float_status *stat) +{ + return float32_abs(float32_sub(op1, op2, stat)); +} + +/* + * Reciprocal step. These are the AArch32 version which uses a + * non-fused multiply-and-subtract. + */ +static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) +{ + op1 = float16_squash_input_denormal(op1, stat); + op2 = float16_squash_input_denormal(op2, stat); + + if ((float16_is_infinity(op1) && float16_is_zero(op2)) || + (float16_is_infinity(op2) && float16_is_zero(op1))) { + return float16_two; + } + return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); +} + +static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) +{ + op1 = float32_squash_input_denormal(op1, stat); + op2 = float32_squash_input_denormal(op2, stat); + + if ((float32_is_infinity(op1) && float32_is_zero(op2)) || + (float32_is_infinity(op2) && float32_is_zero(op1))) { + return float32_two; + } + return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); +} + +/* Reciprocal square-root step. AArch32 non-fused semantics. 
*/ +static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) +{ + op1 = float16_squash_input_denormal(op1, stat); + op2 = float16_squash_input_denormal(op2, stat); + + if ((float16_is_infinity(op1) && float16_is_zero(op2)) || + (float16_is_infinity(op2) && float16_is_zero(op1))) { + return float16_one_point_five; + } + op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); + return float16_div(op1, float16_two, stat); +} + +static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) +{ + op1 = float32_squash_input_denormal(op1, stat); + op2 = float32_squash_input_denormal(op2, stat); + + if ((float32_is_infinity(op1) && float32_is_zero(op2)) || + (float32_is_infinity(op2) && float32_is_zero(op1))) { + return float32_one_point_five; + } + op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); + return float32_div(op1, float32_two, stat); +} + +#define DO_3OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], m[i], stat); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_3OP(gvec_fadd_h, float16_add, float16) +DO_3OP(gvec_fadd_s, float32_add, float32) +DO_3OP(gvec_fadd_d, float64_add, float64) + +DO_3OP(gvec_fsub_h, float16_sub, float16) +DO_3OP(gvec_fsub_s, float32_sub, float32) +DO_3OP(gvec_fsub_d, float64_sub, float64) + +DO_3OP(gvec_fmul_h, float16_mul, float16) +DO_3OP(gvec_fmul_s, float32_mul, float32) +DO_3OP(gvec_fmul_d, float64_mul, float64) + +DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) +DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) +DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) + +DO_3OP(gvec_fabd_h, float16_abd, float16) +DO_3OP(gvec_fabd_s, float32_abd, float32) + +DO_3OP(gvec_fceq_h, float16_ceq, float16) +DO_3OP(gvec_fceq_s, float32_ceq, float32) + +DO_3OP(gvec_fcge_h, float16_cge, float16) +DO_3OP(gvec_fcge_s, float32_cge, float32) + +DO_3OP(gvec_fcgt_h, float16_cgt, float16) +DO_3OP(gvec_fcgt_s, float32_cgt, float32) + +DO_3OP(gvec_facge_h, float16_acge, float16) +DO_3OP(gvec_facge_s, float32_acge, float32) + +DO_3OP(gvec_facgt_h, float16_acgt, float16) +DO_3OP(gvec_facgt_s, float32_acgt, float32) + +DO_3OP(gvec_fmax_h, float16_max, float16) +DO_3OP(gvec_fmax_s, float32_max, float32) + +DO_3OP(gvec_fmin_h, float16_min, float16) +DO_3OP(gvec_fmin_s, float32_min, float32) + +DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) +DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) + +DO_3OP(gvec_fminnum_h, float16_minnum, float16) +DO_3OP(gvec_fminnum_s, float32_minnum, float32) + +DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) +DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) + +DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) +DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) + +#ifdef TARGET_AARCH64 + +DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) +DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) +DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) + +DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) +DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) +DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) + +#endif +#undef DO_3OP + +/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ +static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_add(dest, float16_mul(op1, op2, stat), stat); +} + +static float32 
float32_muladd_nf(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_add(dest, float32_mul(op1, op2, stat), stat); +} + +static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_sub(dest, float16_mul(op1, op2, stat), stat); +} + +static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_sub(dest, float32_mul(op1, op2, stat), stat); +} + +/* Fused versions; these have the semantics Neon VFMA/VFMS want */ +static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_muladd(op1, op2, dest, 0, stat); +} + +static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_muladd(op1, op2, dest, 0, stat); +} + +static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_muladd(float16_chs(op1), op2, dest, 0, stat); +} + +static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_muladd(float32_chs(op1), op2, dest, 0, stat); +} + +#define DO_MULADD(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(d[i], n[i], m[i], stat); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) +DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) + +DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) +DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) + +DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) +DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) + +DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) +DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) + +/* For the indexed ops, SVE applies the index per 128-bit vector segment. + * For AdvSIMD, there is of course only one such vector segment. 
+ */ + +#define DO_MUL_IDX(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = n[i + j] * mm; \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) +DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) +DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) + +#undef DO_MUL_IDX + +#define DO_MLA_IDX(NAME, TYPE, OP, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = a[i + j] OP n[i + j] * mm; \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) +DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) +DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) + +DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) +DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) +DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) + +#undef DO_MLA_IDX + +#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = TYPE##_##ADD(d[i + j], \ + TYPE##_mul(n[i + j], mm, stat), stat); \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +#define float16_nop(N, M, S) (M) +#define float32_nop(N, M, S) (M) +#define float64_nop(N, M, S) (M) + +DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) +DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) +DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) + +/* + * Non-fused multiply-accumulate operations, for Neon. NB that unlike + * the fused ops below they assume accumulate both from and into Vd. 
+ */ +DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) +DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) +DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) +DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) + +#undef float16_nop +#undef float32_nop +#undef float64_nop +#undef DO_FMUL_IDX + +#define DO_FMLA_IDX(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ + void *stat, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc); \ + intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ + TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ + intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + op1_neg <<= (8 * sizeof(TYPE) - 1); \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ + mm, a[i + j], 0, stat); \ + } \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) +DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) +DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) + +#undef DO_FMLA_IDX + +#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ +void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ + bool q = false; \ + for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ + WTYPE dd = (WTYPE)n[i] OP m[i]; \ + if (dd < MIN) { \ + dd = MIN; \ + q = true; \ + } else if (dd > MAX) { \ + dd = MAX; \ + q = true; \ + } \ + d[i] = dd; \ + } \ + if (q) { \ + uint32_t *qc = vq; \ + qc[0] = 1; \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) +DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) +DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) + +DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) +DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) +DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) + +DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) +DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) +DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) + +DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) +DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) +DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) + +#undef DO_SAT + +void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[i], mm = m[i], dd = nn + mm; + if (dd < nn) { + dd = UINT64_MAX; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + uint64_t nn = n[i], mm = m[i], dd = nn - mm; + if (nn < mm) { + dd = 0; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = 
vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + int64_t nn = n[i], mm = m[i], dd = nn + mm; + if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { + dd = (nn >> 63) ^ ~INT64_MIN; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, + void *vm, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int64_t *d = vd, *n = vn, *m = vm; + bool q = false; + + for (i = 0; i < oprsz / 8; i++) { + int64_t nn = n[i], mm = m[i], dd = nn - mm; + if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { + dd = (nn >> 63) ^ ~INT64_MIN; + q = true; + } + d[i] = dd; + } + if (q) { + uint32_t *qc = vq; + qc[0] = 1; + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + + +#define DO_SRA(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] += n[i] >> shift; \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_SRA(gvec_ssra_b, int8_t) +DO_SRA(gvec_ssra_h, int16_t) +DO_SRA(gvec_ssra_s, int32_t) +DO_SRA(gvec_ssra_d, int64_t) + +DO_SRA(gvec_usra_b, uint8_t) +DO_SRA(gvec_usra_h, uint16_t) +DO_SRA(gvec_usra_s, uint32_t) +DO_SRA(gvec_usra_d, uint64_t) + +#undef DO_SRA + +#define DO_RSHR(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + TYPE tmp = n[i] >> (shift - 1); \ + d[i] = (tmp >> 1) + (tmp & 1); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_RSHR(gvec_srshr_b, int8_t) +DO_RSHR(gvec_srshr_h, int16_t) +DO_RSHR(gvec_srshr_s, int32_t) +DO_RSHR(gvec_srshr_d, int64_t) + +DO_RSHR(gvec_urshr_b, uint8_t) +DO_RSHR(gvec_urshr_h, uint16_t) +DO_RSHR(gvec_urshr_s, uint32_t) +DO_RSHR(gvec_urshr_d, uint64_t) + +#undef DO_RSHR + +#define DO_RSRA(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + TYPE tmp = n[i] >> (shift - 1); \ + d[i] += (tmp >> 1) + (tmp & 1); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_RSRA(gvec_srsra_b, int8_t) +DO_RSRA(gvec_srsra_h, int16_t) +DO_RSRA(gvec_srsra_s, int32_t) +DO_RSRA(gvec_srsra_d, int64_t) + +DO_RSRA(gvec_ursra_b, uint8_t) +DO_RSRA(gvec_ursra_h, uint16_t) +DO_RSRA(gvec_ursra_s, uint32_t) +DO_RSRA(gvec_ursra_d, uint64_t) + +#undef DO_RSRA + +#define DO_SRI(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + +DO_SRI(gvec_sri_b, uint8_t) +DO_SRI(gvec_sri_h, uint16_t) +DO_SRI(gvec_sri_s, uint32_t) +DO_SRI(gvec_sri_d, uint64_t) + +#undef DO_SRI + +#define DO_SLI(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ +} + 
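+/*
+ * As a worked example of the deposit64() form used by DO_SLI above
+ * (illustrative values only): with TYPE = uint8_t and shift = 3,
+ * deposit64(d[i], 3, 5, n[i]) keeps the low 3 bits of d[i] and inserts
+ * the low 5 bits of n[i] above them, so d[i] = 0xff, n[i] = 0x15
+ * produces 0xaf, i.e. (d[i] & 0x07) | (uint8_t)(n[i] << 3), which is
+ * the architectural shift-left-insert result.
+ */
+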
+DO_SLI(gvec_sli_b, uint8_t) +DO_SLI(gvec_sli_h, uint16_t) +DO_SLI(gvec_sli_s, uint32_t) +DO_SLI(gvec_sli_d, uint64_t) + +#undef DO_SLI + +/* + * Convert float16 to float32, raising no exceptions and + * preserving exceptional values, including SNaN. + * This is effectively an unpack+repack operation. + */ +static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) +{ + const int f16_bias = 15; + const int f32_bias = 127; + uint32_t sign = extract32(f16, 15, 1); + uint32_t exp = extract32(f16, 10, 5); + uint32_t frac = extract32(f16, 0, 10); + + if (exp == 0x1f) { + /* Inf or NaN */ + exp = 0xff; + } else if (exp == 0) { + /* Zero or denormal. */ + if (frac != 0) { + if (fz16) { + frac = 0; + } else { + /* + * Denormal; these are all normal float32. + * Shift the fraction so that the msb is at bit 11, + * then remove bit 11 as the implicit bit of the + * normalized float32. Note that we still go through + * the shift for normal numbers below, to put the + * float32 fraction at the right place. + */ + int shift = clz32(frac) - 21; + frac = (frac << shift) & 0x3ff; + exp = f32_bias - f16_bias - shift + 1; + } + } + } else { + /* Normal number; adjust the bias. */ + exp += f32_bias - f16_bias; + } + sign <<= 31; + exp <<= 23; + frac <<= 23 - 10; + + return sign | exp | frac; +} + +static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) +{ + /* + * Branchless load of u32[0], u64[0], u32[1], or u64[1]. + * Load the 2nd qword iff is_q & is_2. + * Shift to the 2nd dword iff !is_q & is_2. + * For !is_q & !is_2, the upper bits of the result are garbage. + */ + return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); +} + +/* + * Note that FMLAL requires oprsz == 8 or oprsz == 16, + * as there is not yet SVE versions that might use blocking. + */ + +static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, + uint32_t desc, bool fz16) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + int is_q = oprsz == 16; + uint64_t n_4, m_4; + + /* Pre-load all of the f16 data, avoiding overlap issues. */ + n_4 = load4_f16(vn, is_q, is_2); + m_4 = load4_f16(vm, is_q, is_2); + + /* Negate all inputs for FMLSL at once. 
*/ + if (is_s) { + n_4 ^= 0x8000800080008000ull; + } + + for (i = 0; i < oprsz / 4; i++) { + float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); + float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, + void *venv, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc); + uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); + CPUARMState *env = venv; + float_status *status = &env->vfp.fp_status; + bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); + + for (i = 0; i < oprsz; i += sizeof(float32)) { + float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; + float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); + float32 nn = float16_to_float32_by_bits(nn_16, fz16); + float32 mm = float16_to_float32_by_bits(mm_16, fz16); + float32 aa = *(float32 *)(va + H1_4(i)); + + *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); + } +} + +static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, + uint32_t desc, bool fz16) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); + int is_q = oprsz == 16; + uint64_t n_4; + float32 m_1; + + /* Pre-load all of the f16 data, avoiding overlap issues. */ + n_4 = load4_f16(vn, is_q, is_2); + + /* Negate all inputs for FMLSL at once. 
*/ + if (is_s) { + n_4 ^= 0x8000800080008000ull; + } + + m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); + + for (i = 0; i < oprsz / 4; i++) { + float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, + void *venv, uint32_t desc) +{ + intptr_t i, j, oprsz = simd_oprsz(desc); + uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); + intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); + CPUARMState *env = venv; + float_status *status = &env->vfp.fp_status; + bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); + + for (i = 0; i < oprsz; i += 16) { + float16 mm_16 = *(float16 *)(vm + i + idx); + float32 mm = float16_to_float32_by_bits(mm_16, fz16); + + for (j = 0; j < 16; j += sizeof(float32)) { + float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; + float32 nn = float16_to_float32_by_bits(nn_16, fz16); + float32 aa = *(float32 *)(va + H1_4(i + j)); + + *(float32 *)(vd + H1_4(i + j)) = + float32_muladd(nn, mm, aa, 0, status); + } + } +} + +void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + int8_t mm = m[i]; + int8_t nn = n[i]; + int8_t res = 0; + if (mm >= 0) { + if (mm < 8) { + res = nn << mm; + } + } else { + res = nn >> (mm > -8 ? -mm : 7); + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + int8_t mm = m[i]; /* only 8 bits of shift are significant */ + int16_t nn = n[i]; + int16_t res = 0; + if (mm >= 0) { + if (mm < 16) { + res = nn << mm; + } + } else { + res = nn >> (mm > -16 ? 
-mm : 15); + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + int8_t mm = m[i]; + uint8_t nn = n[i]; + uint8_t res = 0; + if (mm >= 0) { + if (mm < 8) { + res = nn << mm; + } + } else { + if (mm > -8) { + res = nn >> -mm; + } + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + int8_t mm = m[i]; /* only 8 bits of shift are significant */ + uint16_t nn = n[i]; + uint16_t res = 0; + if (mm >= 0) { + if (mm < 16) { + res = nn << mm; + } + } else { + if (mm > -16) { + res = nn >> -mm; + } + } + d[i] = res; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * 8x8->8 polynomial multiply. + * + * Polynomial multiplication is like integer multiplication except the + * partial products are XORed, not added. + * + * TODO: expose this as a generic vector operation, as it is a common + * crypto building block. + */ +void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t nn = n[i]; + uint64_t mm = m[i]; + uint64_t rr = 0; + + for (j = 0; j < 8; ++j) { + uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; + rr ^= mm & mask; + mm = (mm << 1) & 0xfefefefefefefefeull; + nn >>= 1; + } + d[i] = rr; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * 64x64->128 polynomial multiply. + * Because of the lanes are not accessed in strict columns, + * this probably cannot be turned into a generic helper. + */ +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t hi = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; i += 2) { + uint64_t nn = n[i + hi]; + uint64_t mm = m[i + hi]; + uint64_t rhi = 0; + uint64_t rlo = 0; + + /* Bit 0 can only influence the low 64-bit result. */ + if (nn & 1) { + rlo = mm; + } + + for (j = 1; j < 64; ++j) { + uint64_t mask = -((nn >> j) & 1); + rlo ^= (mm << j) & mask; + rhi ^= (mm >> (64 - j)) & mask; + } + d[i] = rlo; + d[i + 1] = rhi; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* + * 8x8->16 polynomial multiply. + * + * The byte inputs are expanded to (or extracted from) half-words. + * Note that neon and sve2 get the inputs from different positions. + * This allows 4 bytes to be processed in parallel with uint64_t. 
+ */ + +static uint64_t expand_byte_to_half(uint64_t x) +{ + return (x & 0x000000ff) + | ((x & 0x0000ff00) << 8) + | ((x & 0x00ff0000) << 16) + | ((x & 0xff000000) << 24); +} + +uint64_t pmull_w(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + for (i = 0; i < 16; ++i) { + uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; + result ^= op2 & mask; + op1 >>= 1; + op2 <<= 1; + } + return result; +} + +uint64_t pmull_h(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + for (i = 0; i < 8; ++i) { + uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; + result ^= op2 & mask; + op1 >>= 1; + op2 <<= 1; + } + return result; +} + +void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + int hi = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t nn = n[hi], mm = m[hi]; + + d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + nn >>= 32; + mm >>= 32; + d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + + clear_tail(d, 16, simd_maxsz(desc)); +} + +#ifdef TARGET_AARCH64 +void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + int shift = simd_data(desc) * 8; + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; + uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; + + d[i] = pmull_h(nn, mm); + } +} + +static uint64_t pmull_d(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + + for (i = 0; i < 32; ++i) { + uint64_t mask = -((op1 >> i) & 1); + result ^= (op2 << i) & mask; + } + return result; +} + +void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t sel = H4(simd_data(desc)); + intptr_t i, opr_sz = simd_oprsz(desc); + uint32_t *n = vn, *m = vm; + uint64_t *d = vd; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); + } +} +#endif + +#define DO_CMP0(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE nn = *(TYPE *)(vn + i); \ + *(TYPE *)(vd + i) = -(nn OP 0); \ + } \ + clear_tail(vd, opr_sz, simd_maxsz(desc)); \ +} + +DO_CMP0(gvec_ceq0_b, int8_t, ==) +DO_CMP0(gvec_clt0_b, int8_t, <) +DO_CMP0(gvec_cle0_b, int8_t, <=) +DO_CMP0(gvec_cgt0_b, int8_t, >) +DO_CMP0(gvec_cge0_b, int8_t, >=) + +DO_CMP0(gvec_ceq0_h, int16_t, ==) +DO_CMP0(gvec_clt0_h, int16_t, <) +DO_CMP0(gvec_cle0_h, int16_t, <=) +DO_CMP0(gvec_cgt0_h, int16_t, >) +DO_CMP0(gvec_cge0_h, int16_t, >=) + +#undef DO_CMP0 + +#define DO_ABD(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + \ + for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ + d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_ABD(gvec_sabd_b, int8_t) +DO_ABD(gvec_sabd_h, int16_t) +DO_ABD(gvec_sabd_s, int32_t) +DO_ABD(gvec_sabd_d, int64_t) + +DO_ABD(gvec_uabd_b, uint8_t) +DO_ABD(gvec_uabd_h, uint16_t) +DO_ABD(gvec_uabd_s, uint32_t) +DO_ABD(gvec_uabd_d, uint64_t) + +#undef DO_ABD + +#define DO_ABA(NAME, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + \ + for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ + d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_ABA(gvec_saba_b, int8_t) +DO_ABA(gvec_saba_h, int16_t) +DO_ABA(gvec_saba_s, int32_t) +DO_ABA(gvec_saba_d, int64_t) + +DO_ABA(gvec_uaba_b, uint8_t) +DO_ABA(gvec_uaba_h, uint16_t) +DO_ABA(gvec_uaba_s, uint32_t) +DO_ABA(gvec_uaba_d, uint64_t) + +#undef DO_ABA + +#define DO_NEON_PAIRWISE(NAME, OP) \ + void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ + void *stat, uint32_t oprsz) \ + { \ + float_status *fpst = stat; \ + float32 *d = vd; \ + float32 *n = vn; \ + float32 *m = vm; \ + float32 r0, r1; \ + \ + /* Read all inputs before writing outputs in case vm == vd */ \ + r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ + r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ + \ + d[H4(0)] = r0; \ + d[H4(1)] = r1; \ + } \ + \ + void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ + void *stat, uint32_t oprsz) \ + { \ + float_status *fpst = stat; \ + float16 *d = vd; \ + float16 *n = vn; \ + float16 *m = vm; \ + float16 r0, r1, r2, r3; \ + \ + /* Read all inputs before writing outputs in case vm == vd */ \ + r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ + r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ + r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ + r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ + \ + d[H2(0)] = r0; \ + d[H2(1)] = r1; \ + d[H2(2)] = r2; \ + d[H2(3)] = r3; \ + } + +DO_NEON_PAIRWISE(neon_padd, add) +DO_NEON_PAIRWISE(neon_pmax, max) +DO_NEON_PAIRWISE(neon_pmin, min) + +#undef DO_NEON_PAIRWISE + +#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ + void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ + { \ + intptr_t i, oprsz = simd_oprsz(desc); \ + int shift = simd_data(desc); \ + TYPE *d = vd, *n = vn; \ + float_status *fpst = stat; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], shift, fpst); \ + } \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ + } + +DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) +DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) +DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) +DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) +DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) + +#undef DO_VCVT_FIXED + +#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ + void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ + { \ + float_status *fpst = stat; \ + intptr_t i, oprsz = simd_oprsz(desc); \ + uint32_t rmode = simd_data(desc); \ + uint32_t prev_rmode = get_float_rounding_mode(fpst); \ + TYPE *d = vd, *n = vn; \ + set_float_rounding_mode(rmode, fpst); \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], 0, fpst); \ + } \ + set_float_rounding_mode(prev_rmode, fpst); \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ + } + +DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) +DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) +DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) +DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) + +#undef DO_VCVT_RMODE + +#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ + void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ + { \ + float_status *fpst = stat; \ + intptr_t i, oprsz = simd_oprsz(desc); \ + uint32_t rmode = simd_data(desc); \ + uint32_t prev_rmode = get_float_rounding_mode(fpst); \ + TYPE *d = vd, *n = 
vn; \ + set_float_rounding_mode(rmode, fpst); \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], fpst); \ + } \ + set_float_rounding_mode(prev_rmode, fpst); \ + clear_tail(d, oprsz, simd_maxsz(desc)); \ + } + +DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) +DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) + +#undef DO_VRINT_RMODE + +#ifdef TARGET_AARCH64 +void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) +{ + const uint8_t *indices = vm; + CPUARMState *env = venv; + size_t oprsz = simd_oprsz(desc); + uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); + bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); + uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); + union { + uint8_t b[16]; + uint64_t d[2]; + } result; + + /* + * We must construct the final result in a temp, lest the output + * overlaps the input table. For TBL, begin with zero; for TBX, + * begin with the original register contents. Note that we always + * copy 16 bytes here to avoid an extra branch; clearing the high + * bits of the register for oprsz == 8 is handled below. + */ + if (is_tbx) { + memcpy(&result, vd, 16); + } else { + memset(&result, 0, 16); + } + + for (size_t i = 0; i < oprsz; ++i) { + uint32_t index = indices[H1(i)]; + + if (index < table_len) { + /* + * Convert index (a byte offset into the virtual table + * which is a series of 128-bit vectors concatenated) + * into the correct register element, bearing in mind + * that the table can wrap around from V31 to V0. + */ + const uint8_t *table = (const uint8_t *) + aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); + result.b[H1(i)] = table[H1(index % 16)]; + } + } + + memcpy(vd, &result, 16); + clear_tail(vd, oprsz, simd_maxsz(desc)); +} +#endif + +/* + * NxN -> N highpart multiply + * + * TODO: expose this as a generic vector operation. 
+ */ + +void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ((int32_t)n[i] * m[i]) >> 8; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = ((int32_t)n[i] * m[i]) >> 16; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + int32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = ((int64_t)n[i] * m[i]) >> 32; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t discard; + + for (i = 0; i < opr_sz / 8; ++i) { + muls64(&discard, &d[i], n[i], m[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint8_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ((uint32_t)n[i] * m[i]) >> 8; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint16_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = ((uint32_t)n[i] * m[i]) >> 16; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint32_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = ((uint64_t)n[i] * m[i]) >> 32; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t discard; + + for (i = 0; i < opr_sz / 8; ++i) { + mulu64(&discard, &d[i], n[i], m[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; + int shr = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz; ++i) { + d[i] = ror64(n[i] ^ m[i], shr); + } + clear_tail(d, opr_sz * 8, simd_maxsz(desc)); +} + +/* + * Integer matrix-multiply accumulate + */ + +static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) +{ + int8_t *n = vn, *m = vm; + + for (intptr_t k = 0; k < 8; ++k) { + sum += n[H1(k)] * m[H1(k)]; + } + return sum; +} + +static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) +{ + uint8_t *n = vn, *m = vm; + + for (intptr_t k = 0; k < 8; ++k) { + sum += n[H1(k)] * m[H1(k)]; + } + return sum; +} + +static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) +{ + uint8_t *n = vn; + int8_t *m = vm; + + for (intptr_t k = 0; k < 8; ++k) { + sum += n[H1(k)] * m[H1(k)]; + } + return sum; +} + +static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, + uint32_t (*inner_loop)(uint32_t, void *, void *)) +{ + intptr_t seg, opr_sz = simd_oprsz(desc); + + for (seg = 0; seg < opr_sz; seg += 16) { + uint32_t *d = vd + seg; + uint32_t *a = va + seg; + uint32_t sum0, sum1, sum2, sum3; + + /* + * 
Process the entire segment at once, writing back the + * results only after we've consumed all of the inputs. + * + * Key to indices by column: + * i j i j + */ + sum0 = a[H4(0 + 0)]; + sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); + sum1 = a[H4(0 + 1)]; + sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); + sum2 = a[H4(2 + 0)]; + sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); + sum3 = a[H4(2 + 1)]; + sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); + + d[H4(0)] = sum0; + d[H4(1)] = sum1; + d[H4(2)] = sum2; + d[H4(3)] = sum3; + } + clear_tail(vd, opr_sz, simd_maxsz(desc)); +} + +#define DO_MMLA_B(NAME, INNER) \ + void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ + { do_mmla_b(vd, vn, vm, va, desc, INNER); } + +DO_MMLA_B(gvec_smmla_b, do_smmla_b) +DO_MMLA_B(gvec_ummla_b, do_ummla_b) +DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) + +/* + * BFloat16 Dot Product + */ + +float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) +{ + /* FPCR is ignored for BFDOT and BFMMLA. */ + float_status bf_status = { + .tininess_before_rounding = float_tininess_before_rounding, + .float_rounding_mode = float_round_to_odd_inf, + .flush_to_zero = true, + .flush_inputs_to_zero = true, + .default_nan_mode = true, + }; + float32 t1, t2; + + /* + * Extract each BFloat16 from the element pair, and shift + * them such that they become float32. + */ + t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); + t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); + t1 = float32_add(t1, t2, &bf_status); + t1 = float32_add(sum, t1, &bf_status); + + return t1; +} + +void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = bfdotadd(a[i], n[i], m[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, + void *va, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t index = simd_data(desc); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + for (i = 0; i < elements; i += eltspersegment) { + uint32_t m_idx = m[i + H4(index)]; + + for (j = i; j < i + eltspersegment; j++) { + d[j] = bfdotadd(a[j], n[j], m_idx); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) +{ + intptr_t s, opr_sz = simd_oprsz(desc); + float32 *d = vd, *a = va; + uint32_t *n = vn, *m = vm; + + for (s = 0; s < opr_sz / 4; s += 4) { + float32 sum00, sum01, sum10, sum11; + + /* + * Process the entire segment at once, writing back the + * results only after we've consumed all of the inputs. 
+ * + * Key to indicies by column: + * i j i k j k + */ + sum00 = a[s + H4(0 + 0)]; + sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); + sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); + + sum01 = a[s + H4(0 + 1)]; + sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); + sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); + + sum10 = a[s + H4(2 + 0)]; + sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); + sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); + + sum11 = a[s + H4(2 + 1)]; + sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); + sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); + + d[s + H4(0 + 0)] = sum00; + d[s + H4(0 + 1)] = sum01; + d[s + H4(2 + 0)] = sum10; + d[s + H4(2 + 1)] = sum11; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, + void *stat, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + intptr_t sel = simd_data(desc); + float32 *d = vd, *a = va; + bfloat16 *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + float32 nn = n[H2(i * 2 + sel)] << 16; + float32 mm = m[H2(i * 2 + sel)] << 16; + d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, + void *va, void *stat, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); + intptr_t elements = opr_sz / 4; + intptr_t eltspersegment = MIN(16 / 4, elements); + float32 *d = vd, *a = va; + bfloat16 *n = vn, *m = vm; + + for (i = 0; i < elements; i += eltspersegment) { + float32 m_idx = m[H2(2 * i + index)] << 16; + + for (j = i; j < i + eltspersegment; j++) { + float32 n_j = n[H2(2 * j + sel)] << 16; + d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); + } + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +#define DO_CLAMP(NAME, TYPE) \ +void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ + TYPE aa = *(TYPE *)(a + i); \ + TYPE nn = *(TYPE *)(n + i); \ + TYPE mm = *(TYPE *)(m + i); \ + TYPE dd = MIN(MAX(aa, nn), mm); \ + *(TYPE *)(d + i) = dd; \ + } \ + clear_tail(d, opr_sz, simd_maxsz(desc)); \ +} + +DO_CLAMP(gvec_sclamp_b, int8_t) +DO_CLAMP(gvec_sclamp_h, int16_t) +DO_CLAMP(gvec_sclamp_s, int32_t) +DO_CLAMP(gvec_sclamp_d, int64_t) + +DO_CLAMP(gvec_uclamp_b, uint8_t) +DO_CLAMP(gvec_uclamp_h, uint16_t) +DO_CLAMP(gvec_uclamp_s, uint32_t) +DO_CLAMP(gvec_uclamp_d, uint64_t) diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h new file mode 100644 index 0000000000..1f4ed80ff7 --- /dev/null +++ b/target/arm/tcg/vec_internal.h @@ -0,0 +1,246 @@ +/* + * ARM AdvSIMD / SVE Vector Helpers + * + * Copyright (c) 2020 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#ifndef TARGET_ARM_VEC_INTERNAL_H +#define TARGET_ARM_VEC_INTERNAL_H + +/* + * Note that vector data is stored in host-endian 64-bit chunks, + * so addressing units smaller than that needs a host-endian fixup. + * + * The H macros are used when indexing an array of elements of size N. + * + * The H1_ macros are used when performing byte arithmetic and then + * casting the final pointer to a type of size N. + */ +#if HOST_BIG_ENDIAN +#define H1(x) ((x) ^ 7) +#define H1_2(x) ((x) ^ 6) +#define H1_4(x) ((x) ^ 4) +#define H2(x) ((x) ^ 3) +#define H4(x) ((x) ^ 1) +#else +#define H1(x) (x) +#define H1_2(x) (x) +#define H1_4(x) (x) +#define H2(x) (x) +#define H4(x) (x) +#endif +/* + * Access to 64-bit elements isn't host-endian dependent; we provide H8 + * and H1_8 so that when a function is being generated from a macro we + * can pass these rather than an empty macro argument, for clarity. + */ +#define H8(x) (x) +#define H1_8(x) (x) + +/* + * Expand active predicate bits to bytes, for byte elements. + */ +extern const uint64_t expand_pred_b_data[256]; +static inline uint64_t expand_pred_b(uint8_t byte) +{ + return expand_pred_b_data[byte]; +} + +/* Similarly for half-word elements. */ +extern const uint64_t expand_pred_h_data[0x55 + 1]; +static inline uint64_t expand_pred_h(uint8_t byte) +{ + return expand_pred_h_data[byte & 0x55]; +} + +static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) +{ + uint64_t *d = vd + opr_sz; + uintptr_t i; + + for (i = opr_sz; i < max_sz; i += 8) { + *d++ = 0; + } +} + +static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits, + bool round, uint32_t *sat) +{ + if (shift <= -bits) { + /* Rounding the sign bit always produces 0. */ + if (round) { + return 0; + } + return src >> 31; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < bits) { + int32_t val = src << shift; + if (bits == 32) { + if (!sat || val >> shift == src) { + return val; + } + } else { + int32_t extval = sextract32(val, 0, bits); + if (!sat || val == extval) { + return extval; + } + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return (1u << (bits - 1)) - (src >= 0); +} + +static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits, + bool round, uint32_t *sat) +{ + if (shift <= -(bits + round)) { + return 0; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < bits) { + uint32_t val = src << shift; + if (bits == 32) { + if (!sat || val >> shift == src) { + return val; + } + } else { + uint32_t extval = extract32(val, 0, bits); + if (!sat || val == extval) { + return extval; + } + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return MAKE_64BIT_MASK(0, bits); +} + +static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits, + bool round, uint32_t *sat) +{ + if (sat && src < 0) { + *sat = 1; + return 0; + } + return do_uqrshl_bhs(src, shift, bits, round, sat); +} + +static inline int64_t do_sqrshl_d(int64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + if (shift <= -64) { + /* Rounding the sign bit always produces 0. 
*/ + if (round) { + return 0; + } + return src >> 63; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < 64) { + int64_t val = src << shift; + if (!sat || val >> shift == src) { + return val; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return src < 0 ? INT64_MIN : INT64_MAX; +} + +static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + if (shift <= -(64 + round)) { + return 0; + } else if (shift < 0) { + if (round) { + src >>= -shift - 1; + return (src >> 1) + (src & 1); + } + return src >> -shift; + } else if (shift < 64) { + uint64_t val = src << shift; + if (!sat || val >> shift == src) { + return val; + } + } else if (!sat || src == 0) { + return 0; + } + + *sat = 1; + return UINT64_MAX; +} + +static inline int64_t do_suqrshl_d(int64_t src, int64_t shift, + bool round, uint32_t *sat) +{ + if (sat && src < 0) { + *sat = 1; + return 0; + } + return do_uqrshl_d(src, shift, round, sat); +} + +int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool); +int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); +int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); +int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); + +/* + * 8 x 8 -> 16 vector polynomial multiply where the inputs are + * in the low 8 bits of each 16-bit element +*/ +uint64_t pmull_h(uint64_t op1, uint64_t op2); +/* + * 16 x 16 -> 32 vector polynomial multiply where the inputs are + * in the low 16 bits of each 32-bit element + */ +uint64_t pmull_w(uint64_t op1, uint64_t op2); + +/** + * bfdotadd: + * @sum: addend + * @e1, @e2: multiplicand vectors + * + * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. + * The @e1 and @e2 operands correspond to the 32-bit source vector + * slots and contain two Bfloat16 values each. + * + * Corresponds to the ARM pseudocode function BFDotAdd. + */ +float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2); + +#endif /* TARGET_ARM_VEC_INTERNAL_H */ diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c deleted file mode 100644 index 60abcbebe6..0000000000 --- a/target/arm/tlb_helper.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * ARM TLB (Translation lookaside buffer) helpers. - * - * This code is licensed under the GNU GPL v2 or later. - * - * SPDX-License-Identifier: GPL-2.0-or-later - */ -#include "qemu/osdep.h" -#include "cpu.h" -#include "internals.h" -#include "exec/exec-all.h" -#include "exec/helper-proto.h" - - -/* Return true if the translation regime is using LPAE format page tables */ -bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - int el = regime_el(env, mmu_idx); - if (el == 2 || arm_el_is_aa64(env, el)) { - return true; - } - if (arm_feature(env, ARM_FEATURE_PMSA) && - arm_feature(env, ARM_FEATURE_V8)) { - return true; - } - if (arm_feature(env, ARM_FEATURE_LPAE) - && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) { - return true; - } - return false; -} - -/* - * Returns true if the stage 1 translation regime is using LPAE format page - * tables. Used when raising alignment exceptions, whose FSR changes depending - * on whether the long or short descriptor format is in use. 
- */ -bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - mmu_idx = stage_1_mmu_idx(mmu_idx); - return regime_using_lpae_format(env, mmu_idx); -} - -static inline uint32_t merge_syn_data_abort(uint32_t template_syn, - unsigned int target_el, - bool same_el, bool ea, - bool s1ptw, bool is_write, - int fsc) -{ - uint32_t syn; - - /* - * ISV is only set for data aborts routed to EL2 and - * never for stage-1 page table walks faulting on stage 2. - * - * Furthermore, ISV is only set for certain kinds of load/stores. - * If the template syndrome does not have ISV set, we should leave - * it cleared. - * - * See ARMv8 specs, D7-1974: - * ISS encoding for an exception from a Data Abort, the - * ISV field. - */ - if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) { - syn = syn_data_abort_no_iss(same_el, 0, - ea, 0, s1ptw, is_write, fsc); - } else { - /* - * Fields: IL, ISV, SAS, SSE, SRT, SF and AR come from the template - * syndrome created at translation time. - * Now we create the runtime syndrome with the remaining fields. - */ - syn = syn_data_abort_with_iss(same_el, - 0, 0, 0, 0, 0, - ea, 0, s1ptw, is_write, fsc, - true); - /* Merge the runtime syndrome with the template syndrome. */ - syn |= template_syn; - } - return syn; -} - -static uint32_t compute_fsr_fsc(CPUARMState *env, ARMMMUFaultInfo *fi, - int target_el, int mmu_idx, uint32_t *ret_fsc) -{ - ARMMMUIdx arm_mmu_idx = core_to_arm_mmu_idx(env, mmu_idx); - uint32_t fsr, fsc; - - if (target_el == 2 || arm_el_is_aa64(env, target_el) || - arm_s1_regime_using_lpae_format(env, arm_mmu_idx)) { - /* - * LPAE format fault status register : bottom 6 bits are - * status code in the same form as needed for syndrome - */ - fsr = arm_fi_to_lfsc(fi); - fsc = extract32(fsr, 0, 6); - } else { - fsr = arm_fi_to_sfsc(fi); - /* - * Short format FSR : this fault will never actually be reported - * to an EL that uses a syndrome register. Use a (currently) - * reserved FSR code in case the constructed syndrome does leak - * into the guest somehow. 
- */ - fsc = 0x3f; - } - - *ret_fsc = fsc; - return fsr; -} - -static G_NORETURN -void arm_deliver_fault(ARMCPU *cpu, vaddr addr, - MMUAccessType access_type, - int mmu_idx, ARMMMUFaultInfo *fi) -{ - CPUARMState *env = &cpu->env; - int target_el; - bool same_el; - uint32_t syn, exc, fsr, fsc; - - target_el = exception_target_el(env); - if (fi->stage2) { - target_el = 2; - env->cp15.hpfar_el2 = extract64(fi->s2addr, 12, 47) << 4; - if (arm_is_secure_below_el3(env) && fi->s1ns) { - env->cp15.hpfar_el2 |= HPFAR_NS; - } - } - same_el = (arm_current_el(env) == target_el); - - fsr = compute_fsr_fsc(env, fi, target_el, mmu_idx, &fsc); - - if (access_type == MMU_INST_FETCH) { - syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc); - exc = EXCP_PREFETCH_ABORT; - } else { - syn = merge_syn_data_abort(env->exception.syndrome, target_el, - same_el, fi->ea, fi->s1ptw, - access_type == MMU_DATA_STORE, - fsc); - if (access_type == MMU_DATA_STORE - && arm_feature(env, ARM_FEATURE_V6)) { - fsr |= (1 << 11); - } - exc = EXCP_DATA_ABORT; - } - - env->exception.vaddress = addr; - env->exception.fsr = fsr; - raise_exception(env, exc, syn, target_el); -} - -/* Raise a data fault alignment exception for the specified virtual address */ -void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, - MMUAccessType access_type, - int mmu_idx, uintptr_t retaddr) -{ - ARMCPU *cpu = ARM_CPU(cs); - ARMMMUFaultInfo fi = {}; - - /* now we have a real cpu fault */ - cpu_restore_state(cs, retaddr); - - fi.type = ARMFault_Alignment; - arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi); -} - -void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc) -{ - ARMMMUFaultInfo fi = { .type = ARMFault_Alignment }; - int target_el = exception_target_el(env); - int mmu_idx = cpu_mmu_index(env, true); - uint32_t fsc; - - env->exception.vaddress = pc; - - /* - * Note that the fsc is not applicable to this exception, - * since any syndrome is pcalignment not insn_abort. - */ - env->exception.fsr = compute_fsr_fsc(env, &fi, target_el, mmu_idx, &fsc); - raise_exception(env, EXCP_PREFETCH_ABORT, syn_pcalignment(), target_el); -} - -#if !defined(CONFIG_USER_ONLY) - -/* - * arm_cpu_do_transaction_failed: handle a memory system error response - * (eg "no device/memory present at address") by raising an external abort - * exception - */ -void arm_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr, - vaddr addr, unsigned size, - MMUAccessType access_type, - int mmu_idx, MemTxAttrs attrs, - MemTxResult response, uintptr_t retaddr) -{ - ARMCPU *cpu = ARM_CPU(cs); - ARMMMUFaultInfo fi = {}; - - /* now we have a real cpu fault */ - cpu_restore_state(cs, retaddr); - - fi.ea = arm_extabort_type(response); - fi.type = ARMFault_SyncExternal; - arm_deliver_fault(cpu, addr, access_type, mmu_idx, &fi); -} - -bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, - MMUAccessType access_type, int mmu_idx, - bool probe, uintptr_t retaddr) -{ - ARMCPU *cpu = ARM_CPU(cs); - GetPhysAddrResult res = {}; - ARMMMUFaultInfo local_fi, *fi; - int ret; - - /* - * Allow S1_ptw_translate to see any fault generated here. - * Since this may recurse, read and clear. - */ - fi = cpu->env.tlb_fi; - if (fi) { - cpu->env.tlb_fi = NULL; - } else { - fi = memset(&local_fi, 0, sizeof(local_fi)); - } - - /* - * Walk the page table and (if the mapping exists) add the page - * to the TLB. On success, return true. Otherwise, if probing, - * return false. Otherwise populate fsr with ARM DFSR/IFSR fault - * register format, and signal the fault. 
- */ - ret = get_phys_addr(&cpu->env, address, access_type, - core_to_arm_mmu_idx(&cpu->env, mmu_idx), - &res, fi); - if (likely(!ret)) { - /* - * Map a single [sub]page. Regions smaller than our declared - * target page size are handled specially, so for those we - * pass in the exact addresses. - */ - if (res.f.lg_page_size >= TARGET_PAGE_BITS) { - res.f.phys_addr &= TARGET_PAGE_MASK; - address &= TARGET_PAGE_MASK; - } - - res.f.pte_attrs = res.cacheattrs.attrs; - res.f.shareability = res.cacheattrs.shareability; - - tlb_set_page_full(cs, mmu_idx, address, &res.f); - return true; - } else if (probe) { - return false; - } else { - /* now we have a real cpu fault */ - cpu_restore_state(cs, retaddr); - arm_deliver_fault(cpu, address, access_type, mmu_idx, fi); - } -} -#else -void arm_cpu_record_sigsegv(CPUState *cs, vaddr addr, - MMUAccessType access_type, - bool maperr, uintptr_t ra) -{ - ARMMMUFaultInfo fi = { - .type = maperr ? ARMFault_Translation : ARMFault_Permission, - .level = 3, - }; - ARMCPU *cpu = ARM_CPU(cs); - - /* - * We report both ESR and FAR to signal handlers. - * For now, it's easiest to deliver the fault normally. - */ - cpu_restore_state(cs, ra); - arm_deliver_fault(cpu, addr, access_type, MMU_USER_IDX, &fi); -} - -void arm_cpu_record_sigbus(CPUState *cs, vaddr addr, - MMUAccessType access_type, uintptr_t ra) -{ - arm_cpu_do_unaligned_access(cs, addr, access_type, MMU_USER_IDX, ra); -} -#endif /* !defined(CONFIG_USER_ONLY) */ diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c deleted file mode 100644 index f59d3b26ea..0000000000 --- a/target/arm/vec_helper.c +++ /dev/null @@ -1,2716 +0,0 @@ -/* - * ARM AdvSIMD / SVE Vector Operations - * - * Copyright (c) 2018 Linaro - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#include "qemu/osdep.h" -#include "cpu.h" -#include "exec/helper-proto.h" -#include "tcg/tcg-gvec-desc.h" -#include "fpu/softfloat.h" -#include "qemu/int128.h" -#include "vec_internal.h" - -/* - * Data for expanding active predicate bits to bytes, for byte elements. 
- * - * for (i = 0; i < 256; ++i) { - * unsigned long m = 0; - * for (j = 0; j < 8; j++) { - * if ((i >> j) & 1) { - * m |= 0xfful << (j << 3); - * } - * } - * printf("0x%016lx,\n", m); - * } - */ -const uint64_t expand_pred_b_data[256] = { - 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, - 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, - 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, - 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, - 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, - 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, - 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, - 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, - 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, - 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, - 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, - 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, - 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, - 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, - 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, - 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, - 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, - 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, - 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, - 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, - 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, - 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, - 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, - 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, - 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, - 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, - 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, - 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, - 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, - 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, - 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, - 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, - 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, - 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, - 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, - 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, - 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, - 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, - 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, - 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, - 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, - 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, - 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, - 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, - 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, - 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, - 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, - 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, - 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, - 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, - 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, - 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, - 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, - 0xff0000ffffffffff, 
0xff00ff0000000000, 0xff00ff00000000ff, - 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, - 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, - 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, - 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, - 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, - 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, - 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, - 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, - 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, - 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, - 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, - 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, - 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, - 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, - 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, - 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, - 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, - 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, - 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, - 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, - 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, - 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, - 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, - 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, - 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, - 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, - 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, - 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, - 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, - 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, - 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, - 0xffffffffffffffff, -}; - -/* - * Similarly for half-word elements. - * for (i = 0; i < 256; ++i) { - * unsigned long m = 0; - * if (i & 0xaa) { - * continue; - * } - * for (j = 0; j < 8; j += 2) { - * if ((i >> j) & 1) { - * m |= 0xfffful << (j << 3); - * } - * } - * printf("[0x%x] = 0x%016lx,\n", i, m); - * } - */ -const uint64_t expand_pred_h_data[0x55 + 1] = { - [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, - [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, - [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, - [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, - [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, - [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, - [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, - [0x55] = 0xffffffffffffffff, -}; - -/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ -int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, - bool neg, bool round) -{ - /* - * Simplify: - * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 - * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 - */ - int32_t ret = (int32_t)src1 * src2; - if (neg) { - ret = -ret; - } - ret += ((int32_t)src3 << 7) + (round << 6); - ret >>= 7; - - if (ret != (int8_t)ret) { - ret = (ret < 0 ? 
INT8_MIN : INT8_MAX); - } - return ret; -} - -void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); - } -} - -void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); - } -} - -void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); - } -} - -void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); - } -} - -/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ -int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, - bool neg, bool round, uint32_t *sat) -{ - /* Simplify similarly to do_sqrdmlah_b above. */ - int32_t ret = (int32_t)src1 * src2; - if (neg) { - ret = -ret; - } - ret += ((int32_t)src3 << 15) + (round << 14); - ret >>= 15; - - if (ret != (int16_t)ret) { - *sat = 1; - ret = (ret < 0 ? INT16_MIN : INT16_MAX); - } - return ret; -} - -uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, - uint32_t src2, uint32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); - uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, - false, true, sat); - return deposit32(e1, 16, 16, e2); -} - -void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int16_t *d = vd; - int16_t *n = vn; - int16_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, - uint32_t src2, uint32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); - uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, - true, true, sat); - return deposit32(e1, 16, 16, e2); -} - -void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int16_t *d = vd; - int16_t *n = vn; - int16_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); - } - clear_tail(d, opr_sz, 
simd_maxsz(desc)); -} - -void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); - } -} - -void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); - } -} - -void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 2; i += 16 / 2) { - int16_t mm = m[i]; - for (j = 0; j < 16 / 2; ++j) { - d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); - } - } -} - -void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 2; i += 16 / 2) { - int16_t mm = m[i]; - for (j = 0; j < 16 / 2; ++j) { - d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); - } - } -} - -/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ -int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, - bool neg, bool round, uint32_t *sat) -{ - /* Simplify similarly to do_sqrdmlah_b above. */ - int64_t ret = (int64_t)src1 * src2; - if (neg) { - ret = -ret; - } - ret += ((int64_t)src3 << 31) + (round << 30); - ret >>= 31; - - if (ret != (int32_t)ret) { - *sat = 1; - ret = (ret < 0 ? 
INT32_MIN : INT32_MAX); - } - return ret; -} - -uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, - int32_t src2, int32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - return do_sqrdmlah_s(src1, src2, src3, false, true, sat); -} - -void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int32_t *d = vd; - int32_t *n = vn; - int32_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, - int32_t src2, int32_t src3) -{ - uint32_t *sat = &env->vfp.qc[0]; - return do_sqrdmlah_s(src1, src2, src3, true, true, sat); -} - -void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - int32_t *d = vd; - int32_t *n = vn; - int32_t *m = vm; - uintptr_t i; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, - void *vq, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); - } -} - -void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm, *a = va; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); - } -} - -void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - uint32_t discard; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); - } -} - -void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 4; i += 16 / 4) { - int32_t mm = m[i]; - for (j = 0; j < 16 / 4; ++j) { - d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); - } - } -} - -void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int32_t *d = 
vd, *n = vn, *m = (int32_t *)vm + H4(idx); - uint32_t discard; - - for (i = 0; i < opr_sz / 4; i += 16 / 4) { - int32_t mm = m[i]; - for (j = 0; j < 16 / 4; ++j) { - d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); - } - } -} - -/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ -static int64_t do_sat128_d(Int128 r) -{ - int64_t ls = int128_getlo(r); - int64_t hs = int128_gethi(r); - - if (unlikely(hs != (ls >> 63))) { - return hs < 0 ? INT64_MIN : INT64_MAX; - } - return ls; -} - -int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) -{ - uint64_t l, h; - Int128 r, t; - - /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ - muls64(&l, &h, m, n); - r = int128_make128(l, h); - if (neg) { - r = int128_neg(r); - } - if (a) { - t = int128_exts64(a); - t = int128_lshift(t, 63); - r = int128_add(r, t); - } - if (round) { - t = int128_exts64(1ll << 62); - r = int128_add(r, t); - } - r = int128_rshift(r, 63); - - return do_sat128_d(r); -} - -void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); - } -} - -void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm, *a = va; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); - } -} - -void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); - } -} - -void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); - } -} - -void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; - - for (i = 0; i < opr_sz / 8; i += 16 / 8) { - int64_t mm = m[i]; - for (j = 0; j < 16 / 8; ++j) { - d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); - } - } -} - -void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - int idx = simd_data(desc); - int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; - - for (i = 0; i < opr_sz / 8; i += 16 / 8) { - int64_t mm = m[i]; - for (j = 0; j < 16 / 8; ++j) { - d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); - } - } -} - -/* Integer 8 and 16-bit dot-product. - * - * Note that for the loops herein, host endianness does not matter - * with respect to the ordering of data within the quad-width lanes. - * All elements are treated equally, no matter where they are. 
- */ - -#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPED *d = vd, *a = va; \ - TYPEN *n = vn; \ - TYPEM *m = vm; \ - for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ - d[i] = (a[i] + \ - (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ - (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ - (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ - (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) -DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) -DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) -DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) -DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) - -#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i = 0, opr_sz = simd_oprsz(desc); \ - intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ - intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ - intptr_t index = simd_data(desc); \ - TYPED *d = vd, *a = va; \ - TYPEN *n = vn; \ - TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ - do { \ - TYPED m0 = m_indexed[i * 4 + 0]; \ - TYPED m1 = m_indexed[i * 4 + 1]; \ - TYPED m2 = m_indexed[i * 4 + 2]; \ - TYPED m3 = m_indexed[i * 4 + 3]; \ - do { \ - d[i] = (a[i] + \ - n[i * 4 + 0] * m0 + \ - n[i * 4 + 1] * m1 + \ - n[i * 4 + 2] * m2 + \ - n[i * 4 + 3] * m3); \ - } while (++i < segend); \ - segend = i + 4; \ - } while (i < opr_sz_n); \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) -DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) -DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) -DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) -DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) -DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) - -void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float16 *d = vd; - float16 *n = vn; - float16 *m = vm; - float_status *fpst = vfpst; - uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = neg_real ^ 1; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - - for (i = 0; i < opr_sz / 2; i += 2) { - float16 e0 = n[H2(i)]; - float16 e1 = m[H2(i + 1)] ^ neg_imag; - float16 e2 = n[H2(i + 1)]; - float16 e3 = m[H2(i)] ^ neg_real; - - d[H2(i)] = float16_add(e0, e1, fpst); - d[H2(i + 1)] = float16_add(e2, e3, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float32 *d = vd; - float32 *n = vn; - float32 *m = vm; - float_status *fpst = vfpst; - uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = neg_real ^ 1; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. 
*/ - neg_real <<= 31; - neg_imag <<= 31; - - for (i = 0; i < opr_sz / 4; i += 2) { - float32 e0 = n[H4(i)]; - float32 e1 = m[H4(i + 1)] ^ neg_imag; - float32 e2 = n[H4(i + 1)]; - float32 e3 = m[H4(i)] ^ neg_real; - - d[H4(i)] = float32_add(e0, e1, fpst); - d[H4(i + 1)] = float32_add(e2, e3, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float64 *d = vd; - float64 *n = vn; - float64 *m = vm; - float_status *fpst = vfpst; - uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); - uint64_t neg_imag = neg_real ^ 1; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 63; - neg_imag <<= 63; - - for (i = 0; i < opr_sz / 8; i += 2) { - float64 e0 = n[i]; - float64 e1 = m[i + 1] ^ neg_imag; - float64 e2 = n[i + 1]; - float64 e3 = m[i] ^ neg_real; - - d[i] = float64_add(e0, e1, fpst); - d[i + 1] = float64_add(e2, e3, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float16 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t neg_real = flip ^ neg_imag; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - - for (i = 0; i < opr_sz / 2; i += 2) { - float16 e2 = n[H2(i + flip)]; - float16 e1 = m[H2(i + flip)] ^ neg_real; - float16 e4 = e2; - float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; - - d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); - d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float16 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; - intptr_t elements = opr_sz / sizeof(float16); - intptr_t eltspersegment = 16 / sizeof(float16); - intptr_t i, j; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - - for (i = 0; i < elements; i += eltspersegment) { - float16 mr = m[H2(i + 2 * index + 0)]; - float16 mi = m[H2(i + 2 * index + 1)]; - float16 e1 = neg_real ^ (flip ? mi : mr); - float16 e3 = neg_imag ^ (flip ? mr : mi); - - for (j = i; j < i + eltspersegment; j += 2) { - float16 e2 = n[H2(j + flip)]; - float16 e4 = e2; - - d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); - d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float32 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t neg_real = flip ^ neg_imag; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. 
*/ - neg_real <<= 31; - neg_imag <<= 31; - - for (i = 0; i < opr_sz / 4; i += 2) { - float32 e2 = n[H4(i + flip)]; - float32 e1 = m[H4(i + flip)] ^ neg_real; - float32 e4 = e2; - float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; - - d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); - d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float32 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; - intptr_t elements = opr_sz / sizeof(float32); - intptr_t eltspersegment = 16 / sizeof(float32); - intptr_t i, j; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 31; - neg_imag <<= 31; - - for (i = 0; i < elements; i += eltspersegment) { - float32 mr = m[H4(i + 2 * index + 0)]; - float32 mi = m[H4(i + 2 * index + 1)]; - float32 e1 = neg_real ^ (flip ? mi : mr); - float32 e3 = neg_imag ^ (flip ? mr : mi); - - for (j = i; j < i + eltspersegment; j += 2) { - float32 e2 = n[H4(j + flip)]; - float32 e4 = e2; - - d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); - d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, - void *vfpst, uint32_t desc) -{ - uintptr_t opr_sz = simd_oprsz(desc); - float64 *d = vd, *n = vn, *m = vm, *a = va; - float_status *fpst = vfpst; - intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint64_t neg_real = flip ^ neg_imag; - uintptr_t i; - - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 63; - neg_imag <<= 63; - - for (i = 0; i < opr_sz / 8; i += 2) { - float64 e2 = n[i + flip]; - float64 e1 = m[i + flip] ^ neg_real; - float64 e4 = e2; - float64 e3 = m[i + 1 - flip] ^ neg_imag; - - d[i] = float64_muladd(e2, e1, a[i], 0, fpst); - d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * Floating point comparisons producing an integer result (all 1s or all 0s). - * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. - * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
- */ -static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) -{ - return -float16_eq_quiet(op1, op2, stat); -} - -static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) -{ - return -float32_eq_quiet(op1, op2, stat); -} - -static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) -{ - return -float16_le(op2, op1, stat); -} - -static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) -{ - return -float32_le(op2, op1, stat); -} - -static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) -{ - return -float16_lt(op2, op1, stat); -} - -static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) -{ - return -float32_lt(op2, op1, stat); -} - -static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) -{ - return -float16_le(float16_abs(op2), float16_abs(op1), stat); -} - -static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) -{ - return -float32_le(float32_abs(op2), float32_abs(op1), stat); -} - -static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) -{ - return -float16_lt(float16_abs(op2), float16_abs(op1), stat); -} - -static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) -{ - return -float32_lt(float32_abs(op2), float32_abs(op1), stat); -} - -static int16_t vfp_tosszh(float16 x, void *fpstp) -{ - float_status *fpst = fpstp; - if (float16_is_any_nan(x)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_int16_round_to_zero(x, fpst); -} - -static uint16_t vfp_touszh(float16 x, void *fpstp) -{ - float_status *fpst = fpstp; - if (float16_is_any_nan(x)) { - float_raise(float_flag_invalid, fpst); - return 0; - } - return float16_to_uint16_round_to_zero(x, fpst); -} - -#define DO_2OP(NAME, FUNC, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], stat); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) -DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) -DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) - -DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) -DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) -DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) - -DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) -DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) - -DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) -DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) -DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) -DO_2OP(gvec_touizs, helper_vfp_touizs, float32) -DO_2OP(gvec_sstoh, int16_to_float16, int16_t) -DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) -DO_2OP(gvec_tosszh, vfp_tosszh, float16) -DO_2OP(gvec_touszh, vfp_touszh, float16) - -#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ - static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ - { \ - return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ - } - -#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ - static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ - { \ - return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ - } - -#define DO_2OP_CMP0(FN, CMPOP, DIRN) \ - WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ - WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ - DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ - DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) - -DO_2OP_CMP0(cgt, cgt, FWD) -DO_2OP_CMP0(cge, cge, FWD) -DO_2OP_CMP0(ceq, ceq, FWD) 
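/*
 * Editor's sketch (not part of the patch): for readers unfamiliar with the
 * macro layering above, this is approximately what the float16 half of
 * DO_2OP_CMP0(cgt, cgt, FWD) expands to, given the WRAP_CMP0_FWD and DO_2OP
 * macros defined earlier in this file. The compare-against-zero helpers are
 * simply the two-operand compares with the second operand fixed to zero.
 */
static float16 float16_cgt0(float16 op, float_status *stat)
{
    /* FWD direction: the operand is compared against +0.0 */
    return float16_cgt(op, float16_zero, stat);
}

void HELPER(gvec_fcgt0_h)(void *vd, void *vn, void *stat, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    float16 *d = vd, *n = vn;

    /* Per-element compare; the result is an all-ones/all-zeros mask. */
    for (i = 0; i < oprsz / sizeof(float16); i++) {
        d[i] = float16_cgt0(n[i], stat);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}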
-DO_2OP_CMP0(clt, cgt, REV) -DO_2OP_CMP0(cle, cge, REV) - -#undef DO_2OP -#undef DO_2OP_CMP0 - -/* Floating-point trigonometric starting value. - * See the ARM ARM pseudocode function FPTrigSMul. - */ -static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) -{ - float16 result = float16_mul(op1, op1, stat); - if (!float16_is_any_nan(result)) { - result = float16_set_sign(result, op2 & 1); - } - return result; -} - -static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) -{ - float32 result = float32_mul(op1, op1, stat); - if (!float32_is_any_nan(result)) { - result = float32_set_sign(result, op2 & 1); - } - return result; -} - -static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) -{ - float64 result = float64_mul(op1, op1, stat); - if (!float64_is_any_nan(result)) { - result = float64_set_sign(result, op2 & 1); - } - return result; -} - -static float16 float16_abd(float16 op1, float16 op2, float_status *stat) -{ - return float16_abs(float16_sub(op1, op2, stat)); -} - -static float32 float32_abd(float32 op1, float32 op2, float_status *stat) -{ - return float32_abs(float32_sub(op1, op2, stat)); -} - -/* - * Reciprocal step. These are the AArch32 version which uses a - * non-fused multiply-and-subtract. - */ -static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) -{ - op1 = float16_squash_input_denormal(op1, stat); - op2 = float16_squash_input_denormal(op2, stat); - - if ((float16_is_infinity(op1) && float16_is_zero(op2)) || - (float16_is_infinity(op2) && float16_is_zero(op1))) { - return float16_two; - } - return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); -} - -static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) -{ - op1 = float32_squash_input_denormal(op1, stat); - op2 = float32_squash_input_denormal(op2, stat); - - if ((float32_is_infinity(op1) && float32_is_zero(op2)) || - (float32_is_infinity(op2) && float32_is_zero(op1))) { - return float32_two; - } - return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); -} - -/* Reciprocal square-root step. AArch32 non-fused semantics. 
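 * i.e. (3 - op1 * op2) / 2, computed with a separate multiply, subtract
 * and divide rather than a fused operation, and returning 1.5 directly
 * for the infinity-times-zero special cases.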
*/ -static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) -{ - op1 = float16_squash_input_denormal(op1, stat); - op2 = float16_squash_input_denormal(op2, stat); - - if ((float16_is_infinity(op1) && float16_is_zero(op2)) || - (float16_is_infinity(op2) && float16_is_zero(op1))) { - return float16_one_point_five; - } - op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); - return float16_div(op1, float16_two, stat); -} - -static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) -{ - op1 = float32_squash_input_denormal(op1, stat); - op2 = float32_squash_input_denormal(op2, stat); - - if ((float32_is_infinity(op1) && float32_is_zero(op2)) || - (float32_is_infinity(op2) && float32_is_zero(op1))) { - return float32_one_point_five; - } - op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); - return float32_div(op1, float32_two, stat); -} - -#define DO_3OP(NAME, FUNC, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], m[i], stat); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_3OP(gvec_fadd_h, float16_add, float16) -DO_3OP(gvec_fadd_s, float32_add, float32) -DO_3OP(gvec_fadd_d, float64_add, float64) - -DO_3OP(gvec_fsub_h, float16_sub, float16) -DO_3OP(gvec_fsub_s, float32_sub, float32) -DO_3OP(gvec_fsub_d, float64_sub, float64) - -DO_3OP(gvec_fmul_h, float16_mul, float16) -DO_3OP(gvec_fmul_s, float32_mul, float32) -DO_3OP(gvec_fmul_d, float64_mul, float64) - -DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) -DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) -DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) - -DO_3OP(gvec_fabd_h, float16_abd, float16) -DO_3OP(gvec_fabd_s, float32_abd, float32) - -DO_3OP(gvec_fceq_h, float16_ceq, float16) -DO_3OP(gvec_fceq_s, float32_ceq, float32) - -DO_3OP(gvec_fcge_h, float16_cge, float16) -DO_3OP(gvec_fcge_s, float32_cge, float32) - -DO_3OP(gvec_fcgt_h, float16_cgt, float16) -DO_3OP(gvec_fcgt_s, float32_cgt, float32) - -DO_3OP(gvec_facge_h, float16_acge, float16) -DO_3OP(gvec_facge_s, float32_acge, float32) - -DO_3OP(gvec_facgt_h, float16_acgt, float16) -DO_3OP(gvec_facgt_s, float32_acgt, float32) - -DO_3OP(gvec_fmax_h, float16_max, float16) -DO_3OP(gvec_fmax_s, float32_max, float32) - -DO_3OP(gvec_fmin_h, float16_min, float16) -DO_3OP(gvec_fmin_s, float32_min, float32) - -DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) -DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) - -DO_3OP(gvec_fminnum_h, float16_minnum, float16) -DO_3OP(gvec_fminnum_s, float32_minnum, float32) - -DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) -DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) - -DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) -DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) - -#ifdef TARGET_AARCH64 - -DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) -DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) -DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) - -DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) -DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) -DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) - -#endif -#undef DO_3OP - -/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ -static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_add(dest, float16_mul(op1, op2, stat), stat); -} - -static float32 
float32_muladd_nf(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_add(dest, float32_mul(op1, op2, stat), stat); -} - -static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_sub(dest, float16_mul(op1, op2, stat), stat); -} - -static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_sub(dest, float32_mul(op1, op2, stat), stat); -} - -/* Fused versions; these have the semantics Neon VFMA/VFMS want */ -static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_muladd(op1, op2, dest, 0, stat); -} - -static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_muladd(op1, op2, dest, 0, stat); -} - -static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, - float_status *stat) -{ - return float16_muladd(float16_chs(op1), op2, dest, 0, stat); -} - -static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, - float_status *stat) -{ - return float32_muladd(float32_chs(op1), op2, dest, 0, stat); -} - -#define DO_MULADD(NAME, FUNC, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(d[i], n[i], m[i], stat); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) -DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) - -DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) -DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) - -DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) -DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) - -DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) -DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) - -/* For the indexed ops, SVE applies the index per 128-bit vector segment. - * For AdvSIMD, there is of course only one such vector segment. 
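 * For example, with 32-bit elements and a 32-byte vector, lanes 0..3 are
 * multiplied by m[idx] and lanes 4..7 by m[4 + idx].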
- */ - -#define DO_MUL_IDX(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - intptr_t idx = simd_data(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = n[i + j] * mm; \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) -DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) -DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) - -#undef DO_MUL_IDX - -#define DO_MLA_IDX(NAME, TYPE, OP, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - intptr_t idx = simd_data(desc); \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = a[i + j] OP n[i + j] * mm; \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) -DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) -DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) - -DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) -DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) -DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) - -#undef DO_MLA_IDX - -#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - intptr_t idx = simd_data(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = TYPE##_##ADD(d[i + j], \ - TYPE##_mul(n[i + j], mm, stat), stat); \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -#define float16_nop(N, M, S) (M) -#define float32_nop(N, M, S) (M) -#define float64_nop(N, M, S) (M) - -DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) -DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) -DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) - -/* - * Non-fused multiply-accumulate operations, for Neon. NB that unlike - * the fused ops below they assume accumulate both from and into Vd. 
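 * That is, d[i] = d[i] +/- n[i] * m[idx]; the fused DO_FMLA_IDX helpers
 * further down instead take a separate addend vector va.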
- */ -DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) -DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) -DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) -DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) - -#undef float16_nop -#undef float32_nop -#undef float64_nop -#undef DO_FMUL_IDX - -#define DO_FMLA_IDX(NAME, TYPE, H) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ - void *stat, uint32_t desc) \ -{ \ - intptr_t i, j, oprsz = simd_oprsz(desc); \ - intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ - intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ - TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - op1_neg <<= (8 * sizeof(TYPE) - 1); \ - for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ - TYPE mm = m[H(i + idx)]; \ - for (j = 0; j < segment; j++) { \ - d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ - mm, a[i + j], 0, stat); \ - } \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) -DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) -DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) - -#undef DO_FMLA_IDX - -#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ -void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ - bool q = false; \ - for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ - WTYPE dd = (WTYPE)n[i] OP m[i]; \ - if (dd < MIN) { \ - dd = MIN; \ - q = true; \ - } else if (dd > MAX) { \ - dd = MAX; \ - q = true; \ - } \ - d[i] = dd; \ - } \ - if (q) { \ - uint32_t *qc = vq; \ - qc[0] = 1; \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) -DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) -DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) - -DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) -DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) -DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) - -DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) -DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) -DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) - -DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) -DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) -DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) - -#undef DO_SAT - -void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[i], mm = m[i], dd = nn + mm; - if (dd < nn) { - dd = UINT64_MAX; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - uint64_t nn = n[i], mm = m[i], dd = nn - mm; - if (nn < mm) { - dd = 0; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = 
vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - int64_t nn = n[i], mm = m[i], dd = nn + mm; - if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { - dd = (nn >> 63) ^ ~INT64_MIN; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, - void *vm, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int64_t *d = vd, *n = vn, *m = vm; - bool q = false; - - for (i = 0; i < oprsz / 8; i++) { - int64_t nn = n[i], mm = m[i], dd = nn - mm; - if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { - dd = (nn >> 63) ^ ~INT64_MIN; - q = true; - } - d[i] = dd; - } - if (q) { - uint32_t *qc = vq; - qc[0] = 1; - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - - -#define DO_SRA(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] += n[i] >> shift; \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_SRA(gvec_ssra_b, int8_t) -DO_SRA(gvec_ssra_h, int16_t) -DO_SRA(gvec_ssra_s, int32_t) -DO_SRA(gvec_ssra_d, int64_t) - -DO_SRA(gvec_usra_b, uint8_t) -DO_SRA(gvec_usra_h, uint16_t) -DO_SRA(gvec_usra_s, uint32_t) -DO_SRA(gvec_usra_d, uint64_t) - -#undef DO_SRA - -#define DO_RSHR(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - TYPE tmp = n[i] >> (shift - 1); \ - d[i] = (tmp >> 1) + (tmp & 1); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_RSHR(gvec_srshr_b, int8_t) -DO_RSHR(gvec_srshr_h, int16_t) -DO_RSHR(gvec_srshr_s, int32_t) -DO_RSHR(gvec_srshr_d, int64_t) - -DO_RSHR(gvec_urshr_b, uint8_t) -DO_RSHR(gvec_urshr_h, uint16_t) -DO_RSHR(gvec_urshr_s, uint32_t) -DO_RSHR(gvec_urshr_d, uint64_t) - -#undef DO_RSHR - -#define DO_RSRA(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - TYPE tmp = n[i] >> (shift - 1); \ - d[i] += (tmp >> 1) + (tmp & 1); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_RSRA(gvec_srsra_b, int8_t) -DO_RSRA(gvec_srsra_h, int16_t) -DO_RSRA(gvec_srsra_s, int32_t) -DO_RSRA(gvec_srsra_d, int64_t) - -DO_RSRA(gvec_ursra_b, uint8_t) -DO_RSRA(gvec_ursra_h, uint16_t) -DO_RSRA(gvec_ursra_s, uint32_t) -DO_RSRA(gvec_ursra_d, uint64_t) - -#undef DO_RSRA - -#define DO_SRI(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - -DO_SRI(gvec_sri_b, uint8_t) -DO_SRI(gvec_sri_h, uint16_t) -DO_SRI(gvec_sri_s, uint32_t) -DO_SRI(gvec_sri_d, uint64_t) - -#undef DO_SRI - -#define DO_SLI(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ -} - 
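Both DO_SRI and DO_SLI above build their result with QEMU's deposit64(), which replaces a bitfield of the destination with the low bits of a new value. As a rough, standalone illustration of that bit manipulation (deposit64_local, sri_byte and sli_byte are made-up names for this sketch; only deposit64 itself comes from the QEMU sources), the byte-sized case might look like:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in mirroring deposit64(value, start, length, field):
 * replace bits [start, start + length) of value with the low bits of
 * field.  Assumes 0 < length < 64 for brevity. */
static uint64_t deposit64_local(uint64_t value, int start, int length,
                                uint64_t field)
{
    uint64_t mask = ((UINT64_C(1) << length) - 1) << start;
    return (value & ~mask) | ((field << start) & mask);
}

/* SRI on one byte lane: insert n >> shift into the low (8 - shift) bits
 * of d, preserving the top `shift` bits of d. */
static uint8_t sri_byte(uint8_t d, uint8_t n, int shift)
{
    return deposit64_local(d, 0, 8 - shift, n >> shift);
}

/* SLI on one byte lane: insert the low (8 - shift) bits of n at bit
 * position `shift`, preserving the low `shift` bits of d. */
static uint8_t sli_byte(uint8_t d, uint8_t n, int shift)
{
    return deposit64_local(d, shift, 8 - shift, n);
}

int main(void)
{
    printf("SRI: %#x\n", sri_byte(0xab, 0xf0, 4));  /* 0xaf: keeps 0xa0 of d */
    printf("SLI: %#x\n", sli_byte(0xab, 0x0f, 4));  /* 0xfb: keeps 0x0b of d */
    return 0;
}

The vector helpers simply apply this per element and then clear the tail, as every other gvec routine in this file does.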
-DO_SLI(gvec_sli_b, uint8_t) -DO_SLI(gvec_sli_h, uint16_t) -DO_SLI(gvec_sli_s, uint32_t) -DO_SLI(gvec_sli_d, uint64_t) - -#undef DO_SLI - -/* - * Convert float16 to float32, raising no exceptions and - * preserving exceptional values, including SNaN. - * This is effectively an unpack+repack operation. - */ -static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) -{ - const int f16_bias = 15; - const int f32_bias = 127; - uint32_t sign = extract32(f16, 15, 1); - uint32_t exp = extract32(f16, 10, 5); - uint32_t frac = extract32(f16, 0, 10); - - if (exp == 0x1f) { - /* Inf or NaN */ - exp = 0xff; - } else if (exp == 0) { - /* Zero or denormal. */ - if (frac != 0) { - if (fz16) { - frac = 0; - } else { - /* - * Denormal; these are all normal float32. - * Shift the fraction so that the msb is at bit 11, - * then remove bit 11 as the implicit bit of the - * normalized float32. Note that we still go through - * the shift for normal numbers below, to put the - * float32 fraction at the right place. - */ - int shift = clz32(frac) - 21; - frac = (frac << shift) & 0x3ff; - exp = f32_bias - f16_bias - shift + 1; - } - } - } else { - /* Normal number; adjust the bias. */ - exp += f32_bias - f16_bias; - } - sign <<= 31; - exp <<= 23; - frac <<= 23 - 10; - - return sign | exp | frac; -} - -static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) -{ - /* - * Branchless load of u32[0], u64[0], u32[1], or u64[1]. - * Load the 2nd qword iff is_q & is_2. - * Shift to the 2nd dword iff !is_q & is_2. - * For !is_q & !is_2, the upper bits of the result are garbage. - */ - return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); -} - -/* - * Note that FMLAL requires oprsz == 8 or oprsz == 16, - * as there is not yet SVE versions that might use blocking. - */ - -static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, - uint32_t desc, bool fz16) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); - int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - int is_q = oprsz == 16; - uint64_t n_4, m_4; - - /* Pre-load all of the f16 data, avoiding overlap issues. */ - n_4 = load4_f16(vn, is_q, is_2); - m_4 = load4_f16(vm, is_q, is_2); - - /* Negate all inputs for FMLSL at once. 
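 * (XOR-ing 0x8000 into each 16-bit lane flips the float16 sign bit, so
 * the fused multiply-adds below accumulate -(n * m) instead of n * m.)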
*/ - if (is_s) { - n_4 ^= 0x8000800080008000ull; - } - - for (i = 0; i < oprsz / 4; i++) { - float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); - float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); - d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, - void *venv, uint32_t desc) -{ - intptr_t i, oprsz = simd_oprsz(desc); - uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - CPUARMState *env = venv; - float_status *status = &env->vfp.fp_status; - bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); - - for (i = 0; i < oprsz; i += sizeof(float32)) { - float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; - float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); - float32 nn = float16_to_float32_by_bits(nn_16, fz16); - float32 mm = float16_to_float32_by_bits(mm_16, fz16); - float32 aa = *(float32 *)(va + H1_4(i)); - - *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); - } -} - -static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, - uint32_t desc, bool fz16) -{ - intptr_t i, oprsz = simd_oprsz(desc); - int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); - int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); - int is_q = oprsz == 16; - uint64_t n_4; - float32 m_1; - - /* Pre-load all of the f16 data, avoiding overlap issues. */ - n_4 = load4_f16(vn, is_q, is_2); - - /* Negate all inputs for FMLSL at once. 
*/ - if (is_s) { - n_4 ^= 0x8000800080008000ull; - } - - m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); - - for (i = 0; i < oprsz / 4; i++) { - float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); - d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); - } - clear_tail(d, oprsz, simd_maxsz(desc)); -} - -void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, - void *venv, uint32_t desc) -{ - CPUARMState *env = venv; - do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); -} - -void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, - void *venv, uint32_t desc) -{ - intptr_t i, j, oprsz = simd_oprsz(desc); - uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); - CPUARMState *env = venv; - float_status *status = &env->vfp.fp_status; - bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); - - for (i = 0; i < oprsz; i += 16) { - float16 mm_16 = *(float16 *)(vm + i + idx); - float32 mm = float16_to_float32_by_bits(mm_16, fz16); - - for (j = 0; j < 16; j += sizeof(float32)) { - float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; - float32 nn = float16_to_float32_by_bits(nn_16, fz16); - float32 aa = *(float32 *)(va + H1_4(i + j)); - - *(float32 *)(vd + H1_4(i + j)) = - float32_muladd(nn, mm, aa, 0, status); - } - } -} - -void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - int8_t mm = m[i]; - int8_t nn = n[i]; - int8_t res = 0; - if (mm >= 0) { - if (mm < 8) { - res = nn << mm; - } - } else { - res = nn >> (mm > -8 ? -mm : 7); - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - int8_t mm = m[i]; /* only 8 bits of shift are significant */ - int16_t nn = n[i]; - int16_t res = 0; - if (mm >= 0) { - if (mm < 16) { - res = nn << mm; - } - } else { - res = nn >> (mm > -16 ? 
-mm : 15); - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - int8_t mm = m[i]; - uint8_t nn = n[i]; - uint8_t res = 0; - if (mm >= 0) { - if (mm < 8) { - res = nn << mm; - } - } else { - if (mm > -8) { - res = nn >> -mm; - } - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - int8_t mm = m[i]; /* only 8 bits of shift are significant */ - uint16_t nn = n[i]; - uint16_t res = 0; - if (mm >= 0) { - if (mm < 16) { - res = nn << mm; - } - } else { - if (mm > -16) { - res = nn >> -mm; - } - } - d[i] = res; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * 8x8->8 polynomial multiply. - * - * Polynomial multiplication is like integer multiplication except the - * partial products are XORed, not added. - * - * TODO: expose this as a generic vector operation, as it is a common - * crypto building block. - */ -void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - uint64_t rr = 0; - - for (j = 0; j < 8; ++j) { - uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; - rr ^= mm & mask; - mm = (mm << 1) & 0xfefefefefefefefeull; - nn >>= 1; - } - d[i] = rr; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * 64x64->128 polynomial multiply. - * Because of the lanes are not accessed in strict columns, - * this probably cannot be turned into a generic helper. - */ -void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - intptr_t hi = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; i += 2) { - uint64_t nn = n[i + hi]; - uint64_t mm = m[i + hi]; - uint64_t rhi = 0; - uint64_t rlo = 0; - - /* Bit 0 can only influence the low 64-bit result. */ - if (nn & 1) { - rlo = mm; - } - - for (j = 1; j < 64; ++j) { - uint64_t mask = -((nn >> j) & 1); - rlo ^= (mm << j) & mask; - rhi ^= (mm >> (64 - j)) & mask; - } - d[i] = rlo; - d[i + 1] = rhi; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -/* - * 8x8->16 polynomial multiply. - * - * The byte inputs are expanded to (or extracted from) half-words. - * Note that neon and sve2 get the inputs from different positions. - * This allows 4 bytes to be processed in parallel with uint64_t. 
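 * For example, expand_byte_to_half() below turns 0x44332211 into
 * 0x0044003300220011, leaving 8 bits of headroom above each byte for its
 * 16-bit carry-less product.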
- */ - -static uint64_t expand_byte_to_half(uint64_t x) -{ - return (x & 0x000000ff) - | ((x & 0x0000ff00) << 8) - | ((x & 0x00ff0000) << 16) - | ((x & 0xff000000) << 24); -} - -uint64_t pmull_w(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 16; ++i) { - uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - -uint64_t pmull_h(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 8; ++i) { - uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - -void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - int hi = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t nn = n[hi], mm = m[hi]; - - d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); - nn >>= 32; - mm >>= 32; - d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); - - clear_tail(d, 16, simd_maxsz(desc)); -} - -#ifdef TARGET_AARCH64 -void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - int shift = simd_data(desc) * 8; - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; - uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; - - d[i] = pmull_h(nn, mm); - } -} - -static uint64_t pmull_d(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - - for (i = 0; i < 32; ++i) { - uint64_t mask = -((op1 >> i) & 1); - result ^= (op2 << i) & mask; - } - return result; -} - -void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t sel = H4(simd_data(desc)); - intptr_t i, opr_sz = simd_oprsz(desc); - uint32_t *n = vn, *m = vm; - uint64_t *d = vd; - - for (i = 0; i < opr_sz / 8; ++i) { - d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); - } -} -#endif - -#define DO_CMP0(NAME, TYPE, OP) \ -void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE nn = *(TYPE *)(vn + i); \ - *(TYPE *)(vd + i) = -(nn OP 0); \ - } \ - clear_tail(vd, opr_sz, simd_maxsz(desc)); \ -} - -DO_CMP0(gvec_ceq0_b, int8_t, ==) -DO_CMP0(gvec_clt0_b, int8_t, <) -DO_CMP0(gvec_cle0_b, int8_t, <=) -DO_CMP0(gvec_cgt0_b, int8_t, >) -DO_CMP0(gvec_cge0_b, int8_t, >=) - -DO_CMP0(gvec_ceq0_h, int16_t, ==) -DO_CMP0(gvec_clt0_h, int16_t, <) -DO_CMP0(gvec_cle0_h, int16_t, <=) -DO_CMP0(gvec_cgt0_h, int16_t, >) -DO_CMP0(gvec_cge0_h, int16_t, >=) - -#undef DO_CMP0 - -#define DO_ABD(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - \ - for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ - d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_ABD(gvec_sabd_b, int8_t) -DO_ABD(gvec_sabd_h, int16_t) -DO_ABD(gvec_sabd_s, int32_t) -DO_ABD(gvec_sabd_d, int64_t) - -DO_ABD(gvec_uabd_b, uint8_t) -DO_ABD(gvec_uabd_h, uint16_t) -DO_ABD(gvec_uabd_s, uint32_t) -DO_ABD(gvec_uabd_d, uint64_t) - -#undef DO_ABD - -#define DO_ABA(NAME, TYPE) \ -void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - TYPE *d = vd, *n = vn, *m = vm; \ - \ - for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ - d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_ABA(gvec_saba_b, int8_t) -DO_ABA(gvec_saba_h, int16_t) -DO_ABA(gvec_saba_s, int32_t) -DO_ABA(gvec_saba_d, int64_t) - -DO_ABA(gvec_uaba_b, uint8_t) -DO_ABA(gvec_uaba_h, uint16_t) -DO_ABA(gvec_uaba_s, uint32_t) -DO_ABA(gvec_uaba_d, uint64_t) - -#undef DO_ABA - -#define DO_NEON_PAIRWISE(NAME, OP) \ - void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ - void *stat, uint32_t oprsz) \ - { \ - float_status *fpst = stat; \ - float32 *d = vd; \ - float32 *n = vn; \ - float32 *m = vm; \ - float32 r0, r1; \ - \ - /* Read all inputs before writing outputs in case vm == vd */ \ - r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ - r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ - \ - d[H4(0)] = r0; \ - d[H4(1)] = r1; \ - } \ - \ - void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ - void *stat, uint32_t oprsz) \ - { \ - float_status *fpst = stat; \ - float16 *d = vd; \ - float16 *n = vn; \ - float16 *m = vm; \ - float16 r0, r1, r2, r3; \ - \ - /* Read all inputs before writing outputs in case vm == vd */ \ - r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ - r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ - r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ - r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ - \ - d[H2(0)] = r0; \ - d[H2(1)] = r1; \ - d[H2(2)] = r2; \ - d[H2(3)] = r3; \ - } - -DO_NEON_PAIRWISE(neon_padd, add) -DO_NEON_PAIRWISE(neon_pmax, max) -DO_NEON_PAIRWISE(neon_pmin, min) - -#undef DO_NEON_PAIRWISE - -#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ - void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ - { \ - intptr_t i, oprsz = simd_oprsz(desc); \ - int shift = simd_data(desc); \ - TYPE *d = vd, *n = vn; \ - float_status *fpst = stat; \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], shift, fpst); \ - } \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ - } - -DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) -DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) -DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) -DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) -DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) - -#undef DO_VCVT_FIXED - -#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ - void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ - { \ - float_status *fpst = stat; \ - intptr_t i, oprsz = simd_oprsz(desc); \ - uint32_t rmode = simd_data(desc); \ - uint32_t prev_rmode = get_float_rounding_mode(fpst); \ - TYPE *d = vd, *n = vn; \ - set_float_rounding_mode(rmode, fpst); \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], 0, fpst); \ - } \ - set_float_rounding_mode(prev_rmode, fpst); \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ - } - -DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) -DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) -DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) -DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) - -#undef DO_VCVT_RMODE - -#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ - void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ - { \ - float_status *fpst = stat; \ - intptr_t i, oprsz = simd_oprsz(desc); \ - uint32_t rmode = simd_data(desc); \ - uint32_t prev_rmode = get_float_rounding_mode(fpst); \ - TYPE *d = vd, *n = 
vn; \ - set_float_rounding_mode(rmode, fpst); \ - for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ - d[i] = FUNC(n[i], fpst); \ - } \ - set_float_rounding_mode(prev_rmode, fpst); \ - clear_tail(d, oprsz, simd_maxsz(desc)); \ - } - -DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) -DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) - -#undef DO_VRINT_RMODE - -#ifdef TARGET_AARCH64 -void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) -{ - const uint8_t *indices = vm; - CPUARMState *env = venv; - size_t oprsz = simd_oprsz(desc); - uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); - bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); - uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); - union { - uint8_t b[16]; - uint64_t d[2]; - } result; - - /* - * We must construct the final result in a temp, lest the output - * overlaps the input table. For TBL, begin with zero; for TBX, - * begin with the original register contents. Note that we always - * copy 16 bytes here to avoid an extra branch; clearing the high - * bits of the register for oprsz == 8 is handled below. - */ - if (is_tbx) { - memcpy(&result, vd, 16); - } else { - memset(&result, 0, 16); - } - - for (size_t i = 0; i < oprsz; ++i) { - uint32_t index = indices[H1(i)]; - - if (index < table_len) { - /* - * Convert index (a byte offset into the virtual table - * which is a series of 128-bit vectors concatenated) - * into the correct register element, bearing in mind - * that the table can wrap around from V31 to V0. - */ - const uint8_t *table = (const uint8_t *) - aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); - result.b[H1(i)] = table[H1(index % 16)]; - } - } - - memcpy(vd, &result, 16); - clear_tail(vd, oprsz, simd_maxsz(desc)); -} -#endif - -/* - * NxN -> N highpart multiply - * - * TODO: expose this as a generic vector operation. 
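 * Each result element is the most-significant half of the widened
 * product, e.g. ((int32_t)n[i] * m[i]) >> 8 for signed bytes, with
 * muls64()/mulu64() providing the 64-bit case.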
- */ - -void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ((int32_t)n[i] * m[i]) >> 8; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = ((int32_t)n[i] * m[i]) >> 16; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - int32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = ((int64_t)n[i] * m[i]) >> 32; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t discard; - - for (i = 0; i < opr_sz / 8; ++i) { - muls64(&discard, &d[i], n[i], m[i]); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint8_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ((uint32_t)n[i] * m[i]) >> 8; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint16_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 2; ++i) { - d[i] = ((uint32_t)n[i] * m[i]) >> 16; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint32_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = ((uint64_t)n[i] * m[i]) >> 32; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - uint64_t *d = vd, *n = vn, *m = vm; - uint64_t discard; - - for (i = 0; i < opr_sz / 8; ++i) { - mulu64(&discard, &d[i], n[i], m[i]); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc) / 8; - int shr = simd_data(desc); - uint64_t *d = vd, *n = vn, *m = vm; - - for (i = 0; i < opr_sz; ++i) { - d[i] = ror64(n[i] ^ m[i], shr); - } - clear_tail(d, opr_sz * 8, simd_maxsz(desc)); -} - -/* - * Integer matrix-multiply accumulate - */ - -static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) -{ - int8_t *n = vn, *m = vm; - - for (intptr_t k = 0; k < 8; ++k) { - sum += n[H1(k)] * m[H1(k)]; - } - return sum; -} - -static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) -{ - uint8_t *n = vn, *m = vm; - - for (intptr_t k = 0; k < 8; ++k) { - sum += n[H1(k)] * m[H1(k)]; - } - return sum; -} - -static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) -{ - uint8_t *n = vn; - int8_t *m = vm; - - for (intptr_t k = 0; k < 8; ++k) { - sum += n[H1(k)] * m[H1(k)]; - } - return sum; -} - -static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, - uint32_t (*inner_loop)(uint32_t, void *, void *)) -{ - intptr_t seg, opr_sz = simd_oprsz(desc); - - for (seg = 0; seg < opr_sz; seg += 16) { - uint32_t *d = vd + seg; - uint32_t *a = va + seg; - uint32_t sum0, sum1, sum2, sum3; - - /* - * 
Process the entire segment at once, writing back the - * results only after we've consumed all of the inputs. - * - * Key to indices by column: - * i j i j - */ - sum0 = a[H4(0 + 0)]; - sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); - sum1 = a[H4(0 + 1)]; - sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); - sum2 = a[H4(2 + 0)]; - sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); - sum3 = a[H4(2 + 1)]; - sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); - - d[H4(0)] = sum0; - d[H4(1)] = sum1; - d[H4(2)] = sum2; - d[H4(3)] = sum3; - } - clear_tail(vd, opr_sz, simd_maxsz(desc)); -} - -#define DO_MMLA_B(NAME, INNER) \ - void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ - { do_mmla_b(vd, vn, vm, va, desc, INNER); } - -DO_MMLA_B(gvec_smmla_b, do_smmla_b) -DO_MMLA_B(gvec_ummla_b, do_ummla_b) -DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) - -/* - * BFloat16 Dot Product - */ - -float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) -{ - /* FPCR is ignored for BFDOT and BFMMLA. */ - float_status bf_status = { - .tininess_before_rounding = float_tininess_before_rounding, - .float_rounding_mode = float_round_to_odd_inf, - .flush_to_zero = true, - .flush_inputs_to_zero = true, - .default_nan_mode = true, - }; - float32 t1, t2; - - /* - * Extract each BFloat16 from the element pair, and shift - * them such that they become float32. - */ - t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); - t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); - t1 = float32_add(t1, t2, &bf_status); - t1 = float32_add(sum, t1, &bf_status); - - return t1; -} - -void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - float32 *d = vd, *a = va; - uint32_t *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - d[i] = bfdotadd(a[i], n[i], m[i]); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, - void *va, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - intptr_t index = simd_data(desc); - intptr_t elements = opr_sz / 4; - intptr_t eltspersegment = MIN(16 / 4, elements); - float32 *d = vd, *a = va; - uint32_t *n = vn, *m = vm; - - for (i = 0; i < elements; i += eltspersegment) { - uint32_t m_idx = m[i + H4(index)]; - - for (j = i; j < i + eltspersegment; j++) { - d[j] = bfdotadd(a[j], n[j], m_idx); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) -{ - intptr_t s, opr_sz = simd_oprsz(desc); - float32 *d = vd, *a = va; - uint32_t *n = vn, *m = vm; - - for (s = 0; s < opr_sz / 4; s += 4) { - float32 sum00, sum01, sum10, sum11; - - /* - * Process the entire segment at once, writing back the - * results only after we've consumed all of the inputs. 
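 * Each of the four 32-bit results is an eight-way dot product of one
 * 8-byte row of n with one 8-byte row of m, accumulated on top of a
 * via the inner_loop callback.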
- * - * Key to indicies by column: - * i j i k j k - */ - sum00 = a[s + H4(0 + 0)]; - sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); - sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); - - sum01 = a[s + H4(0 + 1)]; - sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); - sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); - - sum10 = a[s + H4(2 + 0)]; - sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); - sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); - - sum11 = a[s + H4(2 + 1)]; - sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); - sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); - - d[s + H4(0 + 0)] = sum00; - d[s + H4(0 + 1)] = sum01; - d[s + H4(2 + 0)] = sum10; - d[s + H4(2 + 1)] = sum11; - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, - void *stat, uint32_t desc) -{ - intptr_t i, opr_sz = simd_oprsz(desc); - intptr_t sel = simd_data(desc); - float32 *d = vd, *a = va; - bfloat16 *n = vn, *m = vm; - - for (i = 0; i < opr_sz / 4; ++i) { - float32 nn = n[H2(i * 2 + sel)] << 16; - float32 mm = m[H2(i * 2 + sel)] << 16; - d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, - void *va, void *stat, uint32_t desc) -{ - intptr_t i, j, opr_sz = simd_oprsz(desc); - intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); - intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); - intptr_t elements = opr_sz / 4; - intptr_t eltspersegment = MIN(16 / 4, elements); - float32 *d = vd, *a = va; - bfloat16 *n = vn, *m = vm; - - for (i = 0; i < elements; i += eltspersegment) { - float32 m_idx = m[H2(2 * i + index)] << 16; - - for (j = i; j < i + eltspersegment; j++) { - float32 n_j = n[H2(2 * j + sel)] << 16; - d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); - } - } - clear_tail(d, opr_sz, simd_maxsz(desc)); -} - -#define DO_CLAMP(NAME, TYPE) \ -void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ -{ \ - intptr_t i, opr_sz = simd_oprsz(desc); \ - for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ - TYPE aa = *(TYPE *)(a + i); \ - TYPE nn = *(TYPE *)(n + i); \ - TYPE mm = *(TYPE *)(m + i); \ - TYPE dd = MIN(MAX(aa, nn), mm); \ - *(TYPE *)(d + i) = dd; \ - } \ - clear_tail(d, opr_sz, simd_maxsz(desc)); \ -} - -DO_CLAMP(gvec_sclamp_b, int8_t) -DO_CLAMP(gvec_sclamp_h, int16_t) -DO_CLAMP(gvec_sclamp_s, int32_t) -DO_CLAMP(gvec_sclamp_d, int64_t) - -DO_CLAMP(gvec_uclamp_b, uint8_t) -DO_CLAMP(gvec_uclamp_h, uint16_t) -DO_CLAMP(gvec_uclamp_s, uint32_t) -DO_CLAMP(gvec_uclamp_d, uint64_t) diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h deleted file mode 100644 index 1f4ed80ff7..0000000000 --- a/target/arm/vec_internal.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * ARM AdvSIMD / SVE Vector Helpers - * - * Copyright (c) 2020 Linaro - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - */ - -#ifndef TARGET_ARM_VEC_INTERNAL_H -#define TARGET_ARM_VEC_INTERNAL_H - -/* - * Note that vector data is stored in host-endian 64-bit chunks, - * so addressing units smaller than that needs a host-endian fixup. - * - * The H macros are used when indexing an array of elements of size N. - * - * The H1_ macros are used when performing byte arithmetic and then - * casting the final pointer to a type of size N. - */ -#if HOST_BIG_ENDIAN -#define H1(x) ((x) ^ 7) -#define H1_2(x) ((x) ^ 6) -#define H1_4(x) ((x) ^ 4) -#define H2(x) ((x) ^ 3) -#define H4(x) ((x) ^ 1) -#else -#define H1(x) (x) -#define H1_2(x) (x) -#define H1_4(x) (x) -#define H2(x) (x) -#define H4(x) (x) -#endif -/* - * Access to 64-bit elements isn't host-endian dependent; we provide H8 - * and H1_8 so that when a function is being generated from a macro we - * can pass these rather than an empty macro argument, for clarity. - */ -#define H8(x) (x) -#define H1_8(x) (x) - -/* - * Expand active predicate bits to bytes, for byte elements. - */ -extern const uint64_t expand_pred_b_data[256]; -static inline uint64_t expand_pred_b(uint8_t byte) -{ - return expand_pred_b_data[byte]; -} - -/* Similarly for half-word elements. */ -extern const uint64_t expand_pred_h_data[0x55 + 1]; -static inline uint64_t expand_pred_h(uint8_t byte) -{ - return expand_pred_h_data[byte & 0x55]; -} - -static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) -{ - uint64_t *d = vd + opr_sz; - uintptr_t i; - - for (i = opr_sz; i < max_sz; i += 8) { - *d++ = 0; - } -} - -static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits, - bool round, uint32_t *sat) -{ - if (shift <= -bits) { - /* Rounding the sign bit always produces 0. */ - if (round) { - return 0; - } - return src >> 31; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < bits) { - int32_t val = src << shift; - if (bits == 32) { - if (!sat || val >> shift == src) { - return val; - } - } else { - int32_t extval = sextract32(val, 0, bits); - if (!sat || val == extval) { - return extval; - } - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return (1u << (bits - 1)) - (src >= 0); -} - -static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits, - bool round, uint32_t *sat) -{ - if (shift <= -(bits + round)) { - return 0; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < bits) { - uint32_t val = src << shift; - if (bits == 32) { - if (!sat || val >> shift == src) { - return val; - } - } else { - uint32_t extval = extract32(val, 0, bits); - if (!sat || val == extval) { - return extval; - } - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return MAKE_64BIT_MASK(0, bits); -} - -static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits, - bool round, uint32_t *sat) -{ - if (sat && src < 0) { - *sat = 1; - return 0; - } - return do_uqrshl_bhs(src, shift, bits, round, sat); -} - -static inline int64_t do_sqrshl_d(int64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - if (shift <= -64) { - /* Rounding the sign bit always produces 0. 
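 * (shifting right by 64 or more leaves only the sign, 0 or -1, and the
 * rounding step (-1 >> 1) + (-1 & 1) evaluates to 0)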
*/ - if (round) { - return 0; - } - return src >> 63; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < 64) { - int64_t val = src << shift; - if (!sat || val >> shift == src) { - return val; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return src < 0 ? INT64_MIN : INT64_MAX; -} - -static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - if (shift <= -(64 + round)) { - return 0; - } else if (shift < 0) { - if (round) { - src >>= -shift - 1; - return (src >> 1) + (src & 1); - } - return src >> -shift; - } else if (shift < 64) { - uint64_t val = src << shift; - if (!sat || val >> shift == src) { - return val; - } - } else if (!sat || src == 0) { - return 0; - } - - *sat = 1; - return UINT64_MAX; -} - -static inline int64_t do_suqrshl_d(int64_t src, int64_t shift, - bool round, uint32_t *sat) -{ - if (sat && src < 0) { - *sat = 1; - return 0; - } - return do_uqrshl_d(src, shift, round, sat); -} - -int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool); -int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); -int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); -int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); - -/* - * 8 x 8 -> 16 vector polynomial multiply where the inputs are - * in the low 8 bits of each 16-bit element -*/ -uint64_t pmull_h(uint64_t op1, uint64_t op2); -/* - * 16 x 16 -> 32 vector polynomial multiply where the inputs are - * in the low 16 bits of each 32-bit element - */ -uint64_t pmull_w(uint64_t op1, uint64_t op2); - -/** - * bfdotadd: - * @sum: addend - * @e1, @e2: multiplicand vectors - * - * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. - * The @e1 and @e2 operands correspond to the 32-bit source vector - * slots and contain two Bfloat16 values each. - * - * Corresponds to the ARM pseudocode function BFDotAdd. - */ -float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2); - -#endif /* TARGET_ARM_VEC_INTERNAL_H */
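The host-endian H macros and clear_tail() declared above are the two conventions every gvec helper in this patch follows: sub-64-bit lanes are indexed through H* so they land in the right place inside the host-endian 64-bit chunks, and the register tail beyond the operation size is zeroed. A rough standalone sketch of the pattern (toy_gvec_add32 and clear_tail_local are made-up names; the H4 definition mirrors the header above but is keyed off the compiler's __BYTE_ORDER__ rather than QEMU's HOST_BIG_ENDIAN):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 32-bit lane fixup inside host-endian 64-bit chunks, as in the header. */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define H4(x) ((x) ^ 1)
#else
#define H4(x) (x)
#endif

/* Zero the register tail from opr_sz up to max_sz bytes,
 * mirroring clear_tail() above. */
static void clear_tail_local(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = (uint64_t *)((char *)vd + opr_sz);

    for (uintptr_t i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Toy gvec-style op: lane-wise 32-bit add over opr_sz bytes of a
 * max_sz-byte register, then clear the tail. */
static void toy_gvec_add32(void *vd, void *vn, void *vm,
                           uintptr_t opr_sz, uintptr_t max_sz)
{
    uint32_t *d = vd, *n = vn, *m = vm;

    for (uintptr_t i = 0; i < opr_sz / 4; i++) {
        d[H4(i)] = n[H4(i)] + m[H4(i)];
    }
    clear_tail_local(vd, opr_sz, max_sz);
}

int main(void)
{
    uint32_t n[8], m[8], d[8];

    memset(d, 0xff, sizeof(d));          /* stale data in the whole register */
    memset(n, 0, sizeof(n));
    memset(m, 0, sizeof(m));
    for (uintptr_t i = 0; i < 4; i++) {
        n[H4(i)] = i + 1;                /* logical lanes 1 2 3 4     */
        m[H4(i)] = 10 * (i + 1);         /* logical lanes 10 20 30 40 */
    }

    toy_gvec_add32(d, n, m, 16, 32);     /* 16-byte op in a 32-byte register */

    for (uintptr_t i = 0; i < 8; i++) {
        printf("%u ", (unsigned)d[H4(i)]);  /* 11 22 33 44 0 0 0 0 */
    }
    printf("\n");
    return 0;
}

Because both the stores and the loads go through H4, the same logical lane values come out on little- and big-endian hosts, which is exactly the property the real helpers rely on.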