* See the COPYING file in the top-level directory.
*/
-#include "host/load-extract-al16-al8.h"
-#include "host/store-insert-al16.h"
+#include "host/load-extract-al16-al8.h.inc"
+#include "host/store-insert-al16.h.inc"
#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic extract 64 from 128-bit, AArch64 version.
- *
- * Copyright (C) 2023 Linaro, Ltd.
- */
-
-#ifndef AARCH64_LOAD_EXTRACT_AL16_AL8_H
-#define AARCH64_LOAD_EXTRACT_AL16_AL8_H
-
-#include "host/cpuinfo.h"
-#include "tcg/debug-assert.h"
-
-/**
- * load_atom_extract_al16_or_al8:
- * @pv: host address
- * @s: object size in bytes, @s <= 8.
- *
- * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
- * cross an 16-byte boundary then the access must be 16-byte atomic,
- * otherwise the access must be 8-byte atomic.
- */
-static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
-{
- uintptr_t pi = (uintptr_t)pv;
- __int128_t *ptr_align = (__int128_t *)(pi & ~7);
- int shr = (pi & 7) * 8;
- uint64_t l, h;
-
- /*
- * With FEAT_LSE2, LDP is single-copy atomic if 16-byte aligned
- * and single-copy atomic on the parts if 8-byte aligned.
- * All we need do is align the pointer mod 8.
- */
- tcg_debug_assert(HAVE_ATOMIC128_RO);
- asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*ptr_align));
- return (l >> shr) | (h << (-shr & 63));
-}
-
-#endif /* AARCH64_LOAD_EXTRACT_AL16_AL8_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, AArch64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef AARCH64_LOAD_EXTRACT_AL16_AL8_H
+#define AARCH64_LOAD_EXTRACT_AL16_AL8_H
+
+#include "host/cpuinfo.h"
+#include "tcg/debug-assert.h"
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ __int128_t *ptr_align = (__int128_t *)(pi & ~7);
+ int shr = (pi & 7) * 8;
+ uint64_t l, h;
+
+ /*
+ * With FEAT_LSE2, LDP is single-copy atomic if 16-byte aligned
+ * and single-copy atomic on the parts if 8-byte aligned.
+ * All we need do is align the pointer mod 8.
+ */
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
+ asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*ptr_align));
+ return (l >> shr) | (h << (-shr & 63));
+}
+
+#endif /* AARCH64_LOAD_EXTRACT_AL16_AL8_H */
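Not part of the patch, but as a reading aid for the hunk above: the return
expression recombines the two 64-bit halves loaded by LDP.  A minimal,
non-atomic C sketch of the same arithmetic, assuming a little-endian host and
the documented precondition pv % s != 0 (which keeps shr in 8..56, so the
h << (-shr & 63) term never degenerates into h << 0):

#include <stdint.h>

static inline uint64_t extract_sketch(const void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    const uint64_t *p = (const uint64_t *)(pi & ~(uintptr_t)7);
    int shr = (pi & 7) * 8;     /* bit offset of pv within the aligned pair */
    uint64_t l = p[0];          /* plain loads: illustration only */
    uint64_t h = p[1];

    /* Low bits of the result come from l; the remaining high bits from h. */
    return (l >> shr) | (h << (-shr & 63));
}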
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic store insert into 128-bit, AArch64 version.
- *
- * Copyright (C) 2023 Linaro, Ltd.
- */
-
-#ifndef AARCH64_STORE_INSERT_AL16_H
-#define AARCH64_STORE_INSERT_AL16_H
-
-/**
- * store_atom_insert_al16:
- * @p: host address
- * @val: shifted value to store
- * @msk: mask for value to store
- *
- * Atomically store @val to @p masked by @msk.
- */
-static inline void ATTRIBUTE_ATOMIC128_OPT
-store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
-{
- /*
- * GCC only implements __sync* primitives for int128 on aarch64.
- * We can do better without the barriers, and integrating the
- * arithmetic into the load-exclusive/store-conditional pair.
- */
- uint64_t tl, th, vl, vh, ml, mh;
- uint32_t fail;
-
- qemu_build_assert(!HOST_BIG_ENDIAN);
- vl = int128_getlo(val);
- vh = int128_gethi(val);
- ml = int128_getlo(msk);
- mh = int128_gethi(msk);
-
- asm("0: ldxp %[l], %[h], %[mem]\n\t"
- "bic %[l], %[l], %[ml]\n\t"
- "bic %[h], %[h], %[mh]\n\t"
- "orr %[l], %[l], %[vl]\n\t"
- "orr %[h], %[h], %[vh]\n\t"
- "stxp %w[f], %[l], %[h], %[mem]\n\t"
- "cbnz %w[f], 0b\n"
- : [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
- : [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
-}
-
-#endif /* AARCH64_STORE_INSERT_AL16_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic store insert into 128-bit, AArch64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef AARCH64_STORE_INSERT_AL16_H
+#define AARCH64_STORE_INSERT_AL16_H
+
+/**
+ * store_atom_insert_al16:
+ * @p: host address
+ * @val: shifted value to store
+ * @msk: mask for value to store
+ *
+ * Atomically store @val to @p masked by @msk.
+ */
+static inline void ATTRIBUTE_ATOMIC128_OPT
+store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
+{
+ /*
+ * GCC only implements __sync* primitives for int128 on aarch64.
+ * We can do better without the barriers, and integrating the
+ * arithmetic into the load-exclusive/store-conditional pair.
+ */
+ uint64_t tl, th, vl, vh, ml, mh;
+ uint32_t fail;
+
+ qemu_build_assert(!HOST_BIG_ENDIAN);
+ vl = int128_getlo(val);
+ vh = int128_gethi(val);
+ ml = int128_getlo(msk);
+ mh = int128_gethi(msk);
+
+ asm("0: ldxp %[l], %[h], %[mem]\n\t"
+ "bic %[l], %[l], %[ml]\n\t"
+ "bic %[h], %[h], %[mh]\n\t"
+ "orr %[l], %[l], %[vl]\n\t"
+ "orr %[h], %[h], %[vh]\n\t"
+ "stxp %w[f], %[l], %[h], %[mem]\n\t"
+ "cbnz %w[f], 0b\n"
+ : [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
+ : [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
+}
+
+#endif /* AARCH64_STORE_INSERT_AL16_H */
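A hypothetical usage sketch (not from the patch): val must already be shifted
into position and msk must cover exactly the bytes being replaced.  Assuming
QEMU's Int128 helpers int128_make64() and int128_lshift(), and the
little-endian layout asserted above, patching a 16-bit value into byte
offset 5 of a 16-byte-aligned block could look like:

static void store_u16_at_offset5(Int128 *aligned16, uint16_t x)
{
    /* Position the value and a matching byte mask at bits [40, 56). */
    Int128 val = int128_lshift(int128_make64(x), 5 * 8);
    Int128 msk = int128_lshift(int128_make64(0xffff), 5 * 8);

    store_atom_insert_al16(aligned16, val, msk);
}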
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic extract 64 from 128-bit, generic version.
- *
- * Copyright (C) 2023 Linaro, Ltd.
- */
-
-#ifndef HOST_LOAD_EXTRACT_AL16_AL8_H
-#define HOST_LOAD_EXTRACT_AL16_AL8_H
-
-/**
- * load_atom_extract_al16_or_al8:
- * @pv: host address
- * @s: object size in bytes, @s <= 8.
- *
- * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
- * cross an 16-byte boundary then the access must be 16-byte atomic,
- * otherwise the access must be 8-byte atomic.
- */
-static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
-load_atom_extract_al16_or_al8(void *pv, int s)
-{
- uintptr_t pi = (uintptr_t)pv;
- int o = pi & 7;
- int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
- Int128 r;
-
- pv = (void *)(pi & ~7);
- if (pi & 8) {
- uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
- uint64_t a = qatomic_read__nocheck(p8);
- uint64_t b = qatomic_read__nocheck(p8 + 1);
-
- if (HOST_BIG_ENDIAN) {
- r = int128_make128(b, a);
- } else {
- r = int128_make128(a, b);
- }
- } else {
- r = atomic16_read_ro(pv);
- }
- return int128_getlo(int128_urshift(r, shr));
-}
-
-#endif /* HOST_LOAD_EXTRACT_AL16_AL8_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, generic version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef HOST_LOAD_EXTRACT_AL16_AL8_H
+#define HOST_LOAD_EXTRACT_AL16_AL8_H
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
+load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ int o = pi & 7;
+ int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
+ Int128 r;
+
+ pv = (void *)(pi & ~7);
+ if (pi & 8) {
+ uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
+ uint64_t a = qatomic_read__nocheck(p8);
+ uint64_t b = qatomic_read__nocheck(p8 + 1);
+
+ if (HOST_BIG_ENDIAN) {
+ r = int128_make128(b, a);
+ } else {
+ r = int128_make128(a, b);
+ }
+ } else {
+ r = atomic16_read_ro(pv);
+ }
+ return int128_getlo(int128_urshift(r, shr));
+}
+
+#endif /* HOST_LOAD_EXTRACT_AL16_AL8_H */
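A worked example of the shift computation above (not part of the patch): for a
4-byte access at offset o = 3 within the aligned 16-byte block, the wanted
value occupies bytes 3..6.  A little-endian host shifts right by o * 8 = 24
bits; a big-endian host shifts right by (16 - s - o) * 8 = 72 bits, because the
least-significant byte of the value sits 16 - s - o = 9 bytes above the
least-significant end of the Int128.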
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic store insert into 128-bit, generic version.
- *
- * Copyright (C) 2023 Linaro, Ltd.
- */
-
-#ifndef HOST_STORE_INSERT_AL16_H
-#define HOST_STORE_INSERT_AL16_H
-
-/**
- * store_atom_insert_al16:
- * @p: host address
- * @val: shifted value to store
- * @msk: mask for value to store
- *
- * Atomically store @val to @p masked by @msk.
- */
-static inline void ATTRIBUTE_ATOMIC128_OPT
-store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
-{
-#if defined(CONFIG_ATOMIC128)
- __uint128_t *pu;
- Int128Alias old, new;
-
- /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
- pu = __builtin_assume_aligned(ps, 16);
- old.u = *pu;
- msk = int128_not(msk);
- do {
- new.s = int128_and(old.s, msk);
- new.s = int128_or(new.s, val);
- } while (!__atomic_compare_exchange_n(pu, &old.u, new.u, true,
- __ATOMIC_RELAXED, __ATOMIC_RELAXED));
-#else
- Int128 old, new, cmp;
-
- ps = __builtin_assume_aligned(ps, 16);
- old = *ps;
- msk = int128_not(msk);
- do {
- cmp = old;
- new = int128_and(old, msk);
- new = int128_or(new, val);
- old = atomic16_cmpxchg(ps, cmp, new);
- } while (int128_ne(cmp, old));
-#endif
-}
-
-#endif /* HOST_STORE_INSERT_AL16_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic store insert into 128-bit, generic version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef HOST_STORE_INSERT_AL16_H
+#define HOST_STORE_INSERT_AL16_H
+
+/**
+ * store_atom_insert_al16:
+ * @p: host address
+ * @val: shifted value to store
+ * @msk: mask for value to store
+ *
+ * Atomically store @val to @p masked by @msk.
+ */
+static inline void ATTRIBUTE_ATOMIC128_OPT
+store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
+{
+#if defined(CONFIG_ATOMIC128)
+ __uint128_t *pu;
+ Int128Alias old, new;
+
+ /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
+ pu = __builtin_assume_aligned(ps, 16);
+ old.u = *pu;
+ msk = int128_not(msk);
+ do {
+ new.s = int128_and(old.s, msk);
+ new.s = int128_or(new.s, val);
+ } while (!__atomic_compare_exchange_n(pu, &old.u, new.u, true,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+#else
+ Int128 old, new, cmp;
+
+ ps = __builtin_assume_aligned(ps, 16);
+ old = *ps;
+ msk = int128_not(msk);
+ do {
+ cmp = old;
+ new = int128_and(old, msk);
+ new = int128_or(new, val);
+ old = atomic16_cmpxchg(ps, cmp, new);
+ } while (int128_ne(cmp, old));
+#endif
+}
+
+#endif /* HOST_STORE_INSERT_AL16_H */
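The same read/modify/compare-and-swap pattern, shown as a minimal sketch on a
plain 64-bit word with C11 atomics (illustration only; the hunk above is the
real 128-bit version):

#include <stdatomic.h>
#include <stdint.h>

static void store_insert_u64(_Atomic uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old = atomic_load_explicit(p, memory_order_relaxed);
    uint64_t new;

    do {
        /* Clear the masked bytes, then insert the pre-shifted value. */
        new = (old & ~msk) | val;
    } while (!atomic_compare_exchange_weak_explicit(p, &old, new,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
}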
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic extract 64 from 128-bit, LoongArch version.
- *
- * Copyright (C) 2023 Linaro, Ltd.
- */
-
-#ifndef LOONGARCH_LOAD_EXTRACT_AL16_AL8_H
-#define LOONGARCH_LOAD_EXTRACT_AL16_AL8_H
-
-#include "host/cpuinfo.h"
-#include "tcg/debug-assert.h"
-
-/**
- * load_atom_extract_al16_or_al8:
- * @pv: host address
- * @s: object size in bytes, @s <= 8.
- *
- * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
- * cross an 16-byte boundary then the access must be 16-byte atomic,
- * otherwise the access must be 8-byte atomic.
- */
-static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
-{
- uintptr_t pi = (uintptr_t)pv;
- Int128 *ptr_align = (Int128 *)(pi & ~7);
- int shr = (pi & 7) * 8;
- uint64_t l, h;
-
- tcg_debug_assert(HAVE_ATOMIC128_RO);
- asm("vld $vr0, %2, 0\n\t"
- "vpickve2gr.d %0, $vr0, 0\n\t"
- "vpickve2gr.d %1, $vr0, 1"
- : "=r"(l), "=r"(h) : "r"(ptr_align), "m"(*ptr_align) : "f0");
-
- return (l >> shr) | (h << (-shr & 63));
-}
-
-#endif /* LOONGARCH_LOAD_EXTRACT_AL16_AL8_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, LoongArch version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef LOONGARCH_LOAD_EXTRACT_AL16_AL8_H
+#define LOONGARCH_LOAD_EXTRACT_AL16_AL8_H
+
+#include "host/cpuinfo.h"
+#include "tcg/debug-assert.h"
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ Int128 *ptr_align = (Int128 *)(pi & ~7);
+ int shr = (pi & 7) * 8;
+ uint64_t l, h;
+
+ tcg_debug_assert(HAVE_ATOMIC128_RO);
+ asm("vld $vr0, %2, 0\n\t"
+ "vpickve2gr.d %0, $vr0, 0\n\t"
+ "vpickve2gr.d %1, $vr0, 1"
+ : "=r"(l), "=r"(h) : "r"(ptr_align), "m"(*ptr_align) : "f0");
+
+ return (l >> shr) | (h << (-shr & 63));
+}
+
+#endif /* LOONGARCH_LOAD_EXTRACT_AL16_AL8_H */
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic store insert into 128-bit, LoongArch version.
- */
-
-#ifndef LOONGARCH_STORE_INSERT_AL16_H
-#define LOONGARCH_STORE_INSERT_AL16_H
-
-void store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
- QEMU_ERROR("unsupported atomic");
-
-#endif /* LOONGARCH_STORE_INSERT_AL16_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic store insert into 128-bit, LoongArch version.
+ */
+
+#ifndef LOONGARCH_STORE_INSERT_AL16_H
+#define LOONGARCH_STORE_INSERT_AL16_H
+
+void store_atom_insert_al16(Int128 *ps, Int128 val, Int128 msk)
+ QEMU_ERROR("unsupported atomic");
+
+#endif /* LOONGARCH_STORE_INSERT_AL16_H */
+++ /dev/null
-/*
- * SPDX-License-Identifier: GPL-2.0-or-later
- * Atomic extract 64 from 128-bit, x86_64 version.
- *
- * Copyright (C) 2023 Linaro, Ltd.
- */
-
-#ifndef X86_64_LOAD_EXTRACT_AL16_AL8_H
-#define X86_64_LOAD_EXTRACT_AL16_AL8_H
-
-#ifdef CONFIG_INT128_TYPE
-#include "host/atomic128-ldst.h"
-
-/**
- * load_atom_extract_al16_or_al8:
- * @pv: host address
- * @s: object size in bytes, @s <= 8.
- *
- * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
- * cross an 16-byte boundary then the access must be 16-byte atomic,
- * otherwise the access must be 8-byte atomic.
- */
-static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
-load_atom_extract_al16_or_al8(void *pv, int s)
-{
- uintptr_t pi = (uintptr_t)pv;
- __int128_t *ptr_align = (__int128_t *)(pi & ~7);
- int shr = (pi & 7) * 8;
- X86Int128Union r;
-
- /*
- * ptr_align % 16 is now only 0 or 8.
- * If the host supports atomic loads with VMOVDQU, then always use that,
- * making the branch highly predictable. Otherwise we must use VMOVDQA
- * when ptr_align % 16 == 0 for 16-byte atomicity.
- */
- if ((cpuinfo & CPUINFO_ATOMIC_VMOVDQU) || (pi & 8)) {
- asm("vmovdqu %1, %0" : "=x" (r.v) : "m" (*ptr_align));
- } else {
- asm("vmovdqa %1, %0" : "=x" (r.v) : "m" (*ptr_align));
- }
- return int128_getlo(int128_urshift(r.s, shr));
-}
-#else
-/* Fallback definition that must be optimized away, or error. */
-uint64_t QEMU_ERROR("unsupported atomic")
- load_atom_extract_al16_or_al8(void *pv, int s);
-#endif
-
-#endif /* X86_64_LOAD_EXTRACT_AL16_AL8_H */
--- /dev/null
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * Atomic extract 64 from 128-bit, x86_64 version.
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef X86_64_LOAD_EXTRACT_AL16_AL8_H
+#define X86_64_LOAD_EXTRACT_AL16_AL8_H
+
+#ifdef CONFIG_INT128_TYPE
+#include "host/atomic128-ldst.h"
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @pv, when pv % s != 0. If [p, p+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
+load_atom_extract_al16_or_al8(void *pv, int s)
+{
+ uintptr_t pi = (uintptr_t)pv;
+ __int128_t *ptr_align = (__int128_t *)(pi & ~7);
+ int shr = (pi & 7) * 8;
+ X86Int128Union r;
+
+ /*
+ * ptr_align % 16 is now only 0 or 8.
+ * If the host supports atomic loads with VMOVDQU, then always use that,
+ * making the branch highly predictable. Otherwise we must use VMOVDQA
+ * when ptr_align % 16 == 0 for 16-byte atomicity.
+ */
+ if ((cpuinfo & CPUINFO_ATOMIC_VMOVDQU) || (pi & 8)) {
+ asm("vmovdqu %1, %0" : "=x" (r.v) : "m" (*ptr_align));
+ } else {
+ asm("vmovdqa %1, %0" : "=x" (r.v) : "m" (*ptr_align));
+ }
+ return int128_getlo(int128_urshift(r.s, shr));
+}
+#else
+/* Fallback definition that must be optimized away, or error. */
+uint64_t QEMU_ERROR("unsupported atomic")
+ load_atom_extract_al16_or_al8(void *pv, int s);
+#endif
+
+#endif /* X86_64_LOAD_EXTRACT_AL16_AL8_H */
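Finally, a hypothetical caller sketch (the wrapper name is illustrative): per
the contract documented above, an 8-byte read through this helper is 16-byte
atomic when [pv, pv+7] stays within one 16-byte line, and only 8-byte atomic
when it crosses one.

static uint64_t load_u64_misaligned(void *pv)
{
    /* Requires pv % 8 != 0; see the function's doc comment. */
    return load_atom_extract_al16_or_al8(pv, 8);
}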