+++ /dev/null
-/*
- * This model describes the interaction between ctx->notify_me
- * and aio_notify().
- *
- * Author: Paolo Bonzini <pbonzini@redhat.com>
- *
- * This file is in the public domain. If you really want a license,
- * the WTFPL will do.
- *
- * To simulate it:
- * spin -p docs/aio_notify.promela
- *
- * To verify it:
- * spin -a docs/aio_notify.promela
- * gcc -O2 pan.c
- * ./a.out -a
- *
- * To verify it (with a bug planted in the model):
- * spin -a -DBUG docs/aio_notify.promela
- * gcc -O2 pan.c
- * ./a.out -a
- */
-
-/* MAX is the number of distinct request bits generated by notifier(). */
-#define MAX 4
-/* LAST is the highest request bit; FINAL is the mask with all MAX bits set. */
-#define LAST (1 << (MAX - 1))
-#define FINAL ((LAST << 1) - 1)
-
-/* Raised by waiter() around its wait; tested by notifier(). */
-bool notify_me;
-/* Set by notifier() to wake the waiter; cleared by waiter(). */
-bool event;
-
-/* Bitmask of pending requests: ORed into by notifier(), fetched and
- * cleared by waiter().
- */
-int req;
-/* Bitmask of all requests that waiter() has collected so far. */
-int done;
-
-/*
- * Waiter side of the model.  Each iteration announces itself through
- * notify_me, waits for an event unless a request is already pending,
- * withdraws the announcement, and then collects the pending requests
- * into done.
- */
-active proctype waiter()
-{
- int fetch;
-
- do
- :: true -> {
- /* Announce that a wakeup is wanted before checking req. */
- notify_me++;
-
- if
-#ifndef BUG
- /* A request is already pending: do not wait.  With -DBUG this
-  * option is compiled out, so the waiter always waits for event.
-  */
- :: (req > 0) -> skip;
-#endif
- :: else ->
- // Wait for a nudge from the other side
- do
- :: event == 1 -> { event = 0; break; }
- od;
- fi;
-
- notify_me--;
-
- /* Fetch and clear req in one step, then record the fetched bits. */
- atomic { fetch = req; req = 0; }
- done = done | fetch;
- }
- od
-}
-
-/*
- * Notifier side of the model.  Generates one request per bit, from 1
- * up to LAST, and after each one performs the modelled aio_notify():
- * event is set only when notify_me is 1.  Once next exceeds LAST the
- * loop has no executable option left, so the process blocks there.
- */
-active proctype notifier()
-{
- int next = 1;
-
- do
- :: next <= LAST -> {
- // generate a request
- req = req | next;
- next = next << 1;
-
- // aio_notify
- if
- :: notify_me == 1 -> event = 1;
- :: else -> printf("Skipped event_notifier_set\n"); skip;
- fi;
-
- // Test both synchronous and asynchronous delivery
- /* Nondeterministic choice: either wait until req has been cleared
-  * (req == 0) or carry on immediately with the next request.
-  */
- if
- :: 1 -> do
- :: req == 0 -> break;
- od;
- :: 1 -> skip;
- fi;
- }
- od;
-}
-
-/*
- * Never claim: matches runs on which done < FINAL holds in every state,
- * i.e. runs where at least one generated request bit is never collected
- * into done.
- */
-never { /* [] done < FINAL */
-accept_init:
- do
- :: done < FINAL -> skip;
- od;
-}
+++ /dev/null
-/*
- * This model describes the interaction between ctx->notified
- * and ctx->notifier.
- *
- * Author: Paolo Bonzini <pbonzini@redhat.com>
- *
- * This file is in the public domain. If you really want a license,
- * the WTFPL will do.
- *
- * To verify the buggy version:
- * spin -a -DBUG1 docs/aio_notify_bug.promela
- * gcc -O2 pan.c
- * ./a.out -a -f
- * (or -DBUG2)
- *
- * To verify the fixed version:
- * spin -a docs/aio_notify_bug.promela
- * gcc -O2 pan.c
- * ./a.out -a -f
- *
- * Add -DCHECK_REQ to test an alternative invariant and the
- * "notify_me" optimization.
- */
-
-/* Incremented and decremented by AIO_POLL around its wait. */
-int notify_me;
-/* Set by notifier(); swapped out and cleared atomically by AIO_POLL. */
-bool notified;
-/* Set by notifier(); cleared by AIO_POLL when it saw notified set. */
-bool event;
-/* Set by notifier(), cleared at the end of AIO_POLL. */
-bool req;
-/* Set once notifier() has left its loop. */
-bool notifier_done;
-
-/* The notify_me check in notifier() is only enabled with -DCHECK_REQ. */
-#ifdef CHECK_REQ
-#define USE_NOTIFY_ME 1
-#else
-#define USE_NOTIFY_ME 0
-#endif
-
-/* This model has two distinct planted bugs; a plain -DBUG is rejected. */
-#ifdef BUG
-#error Please define BUG1 or BUG2 instead.
-#endif
-
-/*
- * Notifier: repeatedly raises req and, when the notify_me check is
- * disabled or notify_me is nonzero, sets event and notified.  The three
- * preprocessor variants differ only in how those two stores are ordered
- * or guarded.  The loop may be left nondeterministically, after which
- * notifier_done is set.
- */
-active proctype notifier()
-{
- do
- :: true -> {
- req = 1;
- if
- :: !USE_NOTIFY_ME || notify_me ->
-#if defined BUG1
- /* CHECK_REQ does not detect this bug! */
- /* BUG1: notified is stored before event. */
- notified = 1;
- event = 1;
-#elif defined BUG2
- /* BUG2: event is only set when notified was not already set. */
- if
- :: !notified -> event = 1;
- :: else -> skip;
- fi;
- notified = 1;
-#else
- /* Fixed version: event is stored first, then notified. */
- event = 1;
- notified = 1;
-#endif
- :: else -> skip;
- fi
- }
- :: true -> break;
- od;
- notifier_done = 1;
-}
-
-/*
- * One polling iteration, shared by waiter() and temporary_waiter().
- * The expanding proctype must declare a local "old".
- *
- * Steps: increment notify_me; if req is not set, block until event is
- * set; decrement notify_me; atomically copy notified into old and clear
- * it; clear event only if old was set; finally clear req.
- */
-#define AIO_POLL \
- notify_me++; \
- if \
- :: !req -> { \
- if \
- :: event -> skip; \
- fi; \
- } \
- :: else -> skip; \
- fi; \
- notify_me--; \
- \
- atomic { old = notified; notified = 0; } \
- if \
- :: old -> event = 0; \
- :: else -> skip; \
- fi; \
- \
- req = 0;
-
-/* Permanent waiter: runs AIO_POLL in an endless loop. */
-active proctype waiter()
-{
- /* Scratch variable required by AIO_POLL. */
- bool old;
-
- do
- :: true -> AIO_POLL;
- od;
-}
-
-/* Same as waiter(), but disappears after a while: on each iteration it
- * nondeterministically either runs AIO_POLL once more or leaves the loop
- * and terminates.
- */
-active proctype temporary_waiter()
-{
- /* Scratch variable required by AIO_POLL. */
- bool old;
-
- do
- :: true -> AIO_POLL;
- :: true -> break;
- od;
-}
-
-#ifdef CHECK_REQ
-/* Alternative invariant: the claim nondeterministically picks a state in
- * which req is set and then stays in the accept-labelled state for as
- * long as req remains set, so it matches runs where req is never cleared
- * again.
- */
-never {
- do
- :: req -> goto accept_if_req_not_eventually_false;
- :: true -> skip;
- od;
-
-accept_if_req_not_eventually_false:
- if
- :: req -> goto accept_if_req_not_eventually_false;
- fi;
- assert(0);
-}
-
-#else
-/* There must be infinitely many transitions of event as long
- * as the notifier does not exit.
- *
- * If event stayed always true, the waiters would be busy looping.
- * If event stayed always false, the waiters would be sleeping
- * forever.
- */
-never {
- /* Nondeterministically pick a state from which event is claimed to
-  * keep its current value.
-  */
- do
- :: !event -> goto accept_if_event_not_eventually_true;
- :: event -> goto accept_if_event_not_eventually_false;
- :: true -> skip;
- od;
-
-accept_if_event_not_eventually_true:
- /* While event stays false and the notifier is still running, remain
-  * in this accept-labelled state.  Once notifier_done is set, move to
-  * an unlabelled skip loop instead.
-  */
- if
- :: !event && notifier_done -> do :: true -> skip; od;
- :: !event && !notifier_done -> goto accept_if_event_not_eventually_true;
- fi;
- assert(0);
-
-accept_if_event_not_eventually_false:
- /* Remain in this accept-labelled state for as long as event stays set. */
- if
- :: event -> goto accept_if_event_not_eventually_false;
- fi;
- assert(0);
-}
-#endif
+++ /dev/null
-/*
- * This model describes a bug in aio_notify. If ctx->notifier is
- * cleared too late, a wakeup could be lost.
- *
- * Author: Paolo Bonzini <pbonzini@redhat.com>
- *
- * This file is in the public domain. If you really want a license,
- * the WTFPL will do.
- *
- * To verify the buggy version:
- * spin -a -DBUG docs/aio_notify_bug.promela
- * gcc -O2 pan.c
- * ./a.out -a -f
- *
- * To verify the fixed version:
- * spin -a docs/aio_notify_bug.promela
- * gcc -O2 pan.c
- * ./a.out -a -f
- *
- * Add -DCHECK_REQ to test an alternative invariant and the
- * "notify_me" optimization.
- */
-
-/* Incremented and decremented by AIO_POLL around its wait. */
-int notify_me;
-/* Set by notifier(), cleared by AIO_POLL. */
-bool event;
-/* Set by notifier(), cleared by AIO_POLL. */
-bool req;
-/* Set once notifier() has left its loop. */
-bool notifier_done;
-
-/* The notify_me check in notifier() is only enabled with -DCHECK_REQ. */
-#ifdef CHECK_REQ
-#define USE_NOTIFY_ME 1
-#else
-#define USE_NOTIFY_ME 0
-#endif
-
-/*
- * Notifier: repeatedly raises req and, when the notify_me check is
- * disabled or notify_me is nonzero, sets event.  The loop may be left
- * nondeterministically, after which notifier_done is set.
- */
-active proctype notifier()
-{
- do
- :: true -> {
- req = 1;
- if
- :: !USE_NOTIFY_ME || notify_me -> event = 1;
- :: else -> skip;
- fi
- }
- :: true -> break;
- od;
- notifier_done = 1;
-}
-
-/*
- * One polling iteration, shared by waiter() and temporary_waiter():
- * increment notify_me; if req is not set, block until event is set;
- * decrement notify_me; then clear both req and event.
- *
- * The two variants differ only in the order of the final two stores.
- * With -DBUG, req is cleared first and event last; the fixed version
- * clears event first and req last.
- */
-#ifdef BUG
-#define AIO_POLL \
- notify_me++; \
- if \
- :: !req -> { \
- if \
- :: event -> skip; \
- fi; \
- } \
- :: else -> skip; \
- fi; \
- notify_me--; \
- \
- req = 0; \
- event = 0;
-#else
-#define AIO_POLL \
- notify_me++; \
- if \
- :: !req -> { \
- if \
- :: event -> skip; \
- fi; \
- } \
- :: else -> skip; \
- fi; \
- notify_me--; \
- \
- event = 0; \
- req = 0;
-#endif
-
-/* Permanent waiter: runs AIO_POLL in an endless loop. */
-active proctype waiter()
-{
- do
- :: true -> AIO_POLL;
- od;
-}
-
-/* Same as waiter(), but disappears after a while: on each iteration it
- * nondeterministically either runs AIO_POLL once more or leaves the loop
- * and terminates.
- */
-active proctype temporary_waiter()
-{
- do
- :: true -> AIO_POLL;
- :: true -> break;
- od;
-}
-
-#ifdef CHECK_REQ
-/* Alternative invariant: the claim nondeterministically picks a state in
- * which req is set and then stays in the accept-labelled state for as
- * long as req remains set, so it matches runs where req is never cleared
- * again.
- */
-never {
- do
- :: req -> goto accept_if_req_not_eventually_false;
- :: true -> skip;
- od;
-
-accept_if_req_not_eventually_false:
- if
- :: req -> goto accept_if_req_not_eventually_false;
- fi;
- assert(0);
-}
-
-#else
-/* There must be infinitely many transitions of event as long
- * as the notifier does not exit.
- *
- * If event stayed always true, the waiters would be busy looping.
- * If event stayed always false, the waiters would be sleeping
- * forever.
- */
-never {
- /* Nondeterministically pick a state from which event is claimed to
-  * keep its current value.
-  */
- do
- :: !event -> goto accept_if_event_not_eventually_true;
- :: event -> goto accept_if_event_not_eventually_false;
- :: true -> skip;
- od;
-
-accept_if_event_not_eventually_true:
- /* While event stays false and the notifier is still running, remain
-  * in this accept-labelled state.  Once notifier_done is set, move to
-  * an unlabelled skip loop instead.
-  */
- if
- :: !event && notifier_done -> do :: true -> skip; od;
- :: !event && !notifier_done -> goto accept_if_event_not_eventually_true;
- fi;
- assert(0);
-
-accept_if_event_not_eventually_false:
- /* Remain in this accept-labelled state for as long as event stays set. */
- if
- :: event -> goto accept_if_event_not_eventually_false;
- fi;
- assert(0);
-}
-#endif
+++ /dev/null
-CPUs perform independent memory operations effectively in random order,
-but this can be a problem for CPU-CPU interaction (including interactions
-between QEMU and the guest). Multi-threaded programs use various tools
-to instruct the compiler and the CPU to restrict the order to something
-that is consistent with the expectations of the programmer.
-
-The most basic tool is locking. Mutexes, condition variables and
-semaphores are used in QEMU, and should be the default approach to
-synchronization. Anything else is considerably harder, but it's
-also justified more often than one would like. The two tools that
-are provided by qemu/atomic.h are memory barriers and atomic operations.
-
-Macros defined by qemu/atomic.h fall in three camps:
-
-- compiler barriers: barrier();
-
-- weak atomic access and manual memory barriers: atomic_read(),
- atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(),
- smp_mb_release(), smp_read_barrier_depends();
-
-- sequentially consistent atomic access: everything else.
-
-
-COMPILER MEMORY BARRIER
-=======================
-
-barrier() prevents the compiler from moving the memory accesses either
-side of it to the other side. The compiler barrier has no direct effect
-on the CPU, which may then reorder things however it wishes.
-
-barrier() is mostly used within qemu/atomic.h itself. On some
-architectures, CPU guarantees are strong enough that blocking compiler
-optimizations already ensures the correct order of execution. In this
-case, qemu/atomic.h will reduce stronger memory barriers to simple
-compiler barriers.
-
-Still, barrier() can be useful when writing code that can be interrupted
-by signal handlers.
-
-
-SEQUENTIALLY CONSISTENT ATOMIC ACCESS
-=====================================
-
-Most of the operations in the qemu/atomic.h header ensure *sequential
-consistency*, where "the result of any execution is the same as if the
-operations of all the processors were executed in some sequential order,
-and the operations of each individual processor appear in this sequence
-in the order specified by its program".
-
-qemu/atomic.h provides the following set of atomic read-modify-write
-operations:
-
- void atomic_inc(ptr)
- void atomic_dec(ptr)
- void atomic_add(ptr, val)
- void atomic_sub(ptr, val)
- void atomic_and(ptr, val)
- void atomic_or(ptr, val)
-
- typeof(*ptr) atomic_fetch_inc(ptr)
- typeof(*ptr) atomic_fetch_dec(ptr)
- typeof(*ptr) atomic_fetch_add(ptr, val)
- typeof(*ptr) atomic_fetch_sub(ptr, val)
- typeof(*ptr) atomic_fetch_and(ptr, val)
- typeof(*ptr) atomic_fetch_or(ptr, val)
- typeof(*ptr) atomic_xchg(ptr, val)
- typeof(*ptr) atomic_cmpxchg(ptr, old, new)
-
-The operations in the second group all return the old value of *ptr;
-those in the first group return nothing. These operations are
-polymorphic; they operate on any type that is as wide as an int.
-
-Sequentially consistent loads and stores can be done using:
-
- atomic_fetch_add(ptr, 0) for loads
- atomic_xchg(ptr, val) for stores
-
-However, they are quite expensive on some platforms, notably POWER and
-ARM. Therefore, qemu/atomic.h provides two primitives with slightly
-weaker constraints:
-
- typeof(*ptr) atomic_mb_read(ptr)
- void atomic_mb_set(ptr, val)
-
-The semantics of these primitives map to Java volatile variables,
-and are strongly related to memory barriers as used in the Linux
-kernel (see below).
-
-As long as you use atomic_mb_read and atomic_mb_set, accesses cannot
-be reordered with each other, and it is also not possible to reorder
-"normal" accesses around them.
-
-However, and this is the important difference between
-atomic_mb_read/atomic_mb_set and sequential consistency, it is important
-for both threads to access the same volatile variable. It is not the
-case that everything visible to thread A when it writes volatile field f
-becomes visible to thread B after it reads volatile field g. The store
-and load have to "match" (i.e., be performed on the same volatile
-field) to achieve the right semantics.
-
-
-These operations operate on any type that is as wide as an int or smaller.
-
-
-WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS
-=============================================
-
-Compared to sequentially consistent atomic access, programming with
-weaker consistency models can be considerably more complicated.
-In general, if the algorithm you are writing includes both writes
-and reads on the same side, it is generally simpler to use sequentially
-consistent primitives.
-
-When using this model, variables are accessed with atomic_read() and
-atomic_set(), and restrictions on the ordering of accesses are enforced
-using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(),
-smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends().
-
-atomic_read() and atomic_set() prevent the compiler from using
-optimizations that might otherwise optimize accesses out of existence
-on the one hand, or that might create unsolicited accesses on the other.
-In general this should not have any effect, because the same compiler
-barriers are already implied by memory barriers. However, it is useful
-to do so, because it tells readers which variables are shared with
-other threads, and which are local to the current thread or protected
-by other, more mundane means.
-
-Memory barriers control the order of references to shared memory.
-They come in six kinds:
-
-- smp_rmb() guarantees that all the LOAD operations specified before
- the barrier will appear to happen before all the LOAD operations
- specified after the barrier with respect to the other components of
- the system.
-
- In other words, smp_rmb() puts a partial ordering on loads, but is not
- required to have any effect on stores.
-
-- smp_wmb() guarantees that all the STORE operations specified before
- the barrier will appear to happen before all the STORE operations
- specified after the barrier with respect to the other components of
- the system.
-
- In other words, smp_wmb() puts a partial ordering on stores, but is not
- required to have any effect on loads.
-
-- smp_mb_acquire() guarantees that all the LOAD operations specified before
- the barrier will appear to happen before all the LOAD or STORE operations
- specified after the barrier with respect to the other components of
- the system.
-
-- smp_mb_release() guarantees that all the STORE operations specified *after*
- the barrier will appear to happen after all the LOAD or STORE operations
- specified *before* the barrier with respect to the other components of
- the system.
-
-- smp_mb() guarantees that all the LOAD and STORE operations specified
- before the barrier will appear to happen before all the LOAD and
- STORE operations specified after the barrier with respect to the other
- components of the system.
-
- smp_mb() puts a partial ordering on both loads and stores. It is
- stronger than both a read and a write memory barrier; it implies both
- smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs
- coming before the barrier from overtaking LOADs coming after the
- barrier and vice versa.
-
-- smp_read_barrier_depends() is a weaker kind of read barrier. On
- most processors, whenever two loads are performed such that the
- second depends on the result of the first (e.g., the first load
- retrieves the address to which the second load will be directed),
- the processor will guarantee that the first LOAD will appear to happen
- before the second with respect to the other components of the system.
- However, this is not always true---for example, it was not true on
- Alpha processors. Whenever this kind of access happens to shared
- memory (that is not protected by a lock), a read barrier is needed,
- and smp_read_barrier_depends() can be used instead of smp_rmb().
-
- Note that the first load really has to have a _data_ dependency and not
- a control dependency. If the address for the second load is dependent
- on the first load, but the dependency is through a conditional rather
- than actually loading the address itself, then it's a _control_
- dependency and a full read barrier or better is required.
-
-
-This is the set of barriers that is required *between* two atomic_read()
-and atomic_set() operations to achieve sequential consistency:
-
- | 2nd operation |
- |-----------------------------------------------|
- 1st operation | (after last) | atomic_read | atomic_set |
- ---------------+----------------+-------------+----------------|
- (before first) | | none | smp_mb_release |
- ---------------+----------------+-------------+----------------|
- atomic_read | smp_mb_acquire | smp_rmb* | ** |
- ---------------+----------------+-------------+----------------|
- atomic_set | none | smp_mb()*** | smp_wmb() |
- ---------------+----------------+-------------+----------------|
-
- * Or smp_read_barrier_depends().
-
- ** This requires a load-store barrier. This is achieved by
- either smp_mb_acquire() or smp_mb_release().
-
- *** This requires a store-load barrier. On most machines, the only
- way to achieve this is a full barrier.
-
-
-You can see that the two possible definitions of atomic_mb_read()
-and atomic_mb_set() are the following:
-
- 1) atomic_mb_read(p) = atomic_read(p); smp_mb_acquire()
- atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb()
-
- 2) atomic_mb_read(p) = smp_mb(); atomic_read(p); smp_mb_acquire()
- atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v);
-
-Usually the former is used, because smp_mb() is expensive and a program
-normally has more reads than writes. Therefore it makes more sense to
-make atomic_mb_set() the more expensive operation.
-
-There are two common cases in which atomic_mb_read and atomic_mb_set
-generate too many memory barriers, and thus it can be useful to manually
-place barriers instead:
-
-- when a data structure has one thread that is always a writer
- and one thread that is always a reader, manual placement of
- memory barriers makes the write side faster. Furthermore,
- correctness is easy to check for in this case using the "pairing"
- trick that is explained below:
-
- thread 1 thread 1
- ------------------------- ------------------------
- (other writes)
- smp_mb_release()
- atomic_mb_set(&a, x) atomic_set(&a, x)
- smp_wmb()
- atomic_mb_set(&b, y) atomic_set(&b, y)
-
- =>
- thread 2 thread 2
- ------------------------- ------------------------
- y = atomic_mb_read(&b) y = atomic_read(&b)
- smp_rmb()
- x = atomic_mb_read(&a) x = atomic_read(&a)
- smp_mb_acquire()
-
- Note that the barrier between the stores in thread 1, and between
- the loads in thread 2, has been optimized here to a write or a
- read memory barrier respectively. On some architectures, notably
- ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as
- smp_mb, but smp_rmb and/or smp_wmb are more efficient.
-
-- sometimes, a thread is accessing many variables that are otherwise
- unrelated to each other (for example because, apart from the current
- thread, exactly one other thread will read or write each of these
- variables). In this case, it is possible to "hoist" the implicit
- barriers provided by atomic_mb_read() and atomic_mb_set() outside
- a loop. For example, the above definition of atomic_mb_read() gives
- the following transformation:
-
- n = 0; n = 0;
- for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++)
- n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]);
- smp_mb_acquire();
-
- Similarly, atomic_mb_set() can be transformed as follows, hoisting
- smp_mb_release() and smp_mb() out of the loop:
-
- smp_mb_release();
- for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++)
- atomic_mb_set(&a[i], false); atomic_set(&a[i], false);
- smp_mb();
-
-
-The two tricks can be combined. In this case, splitting a loop in
-two lets you hoist the barriers out of the loops _and_ eliminate the
-expensive smp_mb():
-
- smp_mb_release();
- for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++)
- atomic_mb_set(&a[i], false); atomic_set(&a[i], false);
- atomic_mb_set(&b[i], false); smp_wmb();
- } for (i = 0; i < 10; i++)
- atomic_set(&b[i], false);
- smp_mb();
-
- The other thread can still use atomic_mb_read()/atomic_mb_set().
-
-
-Memory barrier pairing
-----------------------
-
-A useful rule of thumb is that memory barriers should always, or almost
-always, be paired with another barrier. In the case of QEMU, however,
-note that the other barrier may actually be in a driver that runs in
-the guest!
-
-For the purposes of pairing, smp_read_barrier_depends() and smp_rmb()
-both count as read barriers. A read barrier shall pair with a write
-barrier or a full barrier; a write barrier shall pair with a read
-barrier or a full barrier. A full barrier can pair with anything.
-For example:
-
- thread 1 thread 2
- =============== ===============
- a = 1;
- smp_wmb();
- b = 2; x = b;
- smp_rmb();
- y = a;
-
-Note that the "writing" thread is accessing the variables in the
-opposite order as the "reading" thread. This is expected: stores
-before the write barrier will normally match the loads after the
-read barrier, and vice versa. The same is true for more than 2
-accesses and for data dependency barriers:
-
- thread 1 thread 2
- =============== ===============
- b[2] = 1;
- smp_wmb();
- x->i = 2;
- smp_wmb();
- a = x; x = a;
- smp_read_barrier_depends();
- y = x->i;
- smp_read_barrier_depends();
- z = b[y];
-
-smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(),
-and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release().
-
-
-COMPARISON WITH LINUX KERNEL MEMORY BARRIERS
-============================================
-
-Here is a list of differences between Linux kernel atomic operations
-and memory barriers, and the equivalents in QEMU:
-
-- atomic operations in Linux are always on a 32-bit int type and
- use a boxed atomic_t type; atomic operations in QEMU are polymorphic
- and use normal C types.
-
-- Originally, atomic_read and atomic_set in Linux gave no guarantee
- at all. Linux 4.1 updated them to implement volatile
- semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).
-
- QEMU's atomic_read/set implement, if the compiler supports it, C11
- atomic relaxed semantics, and volatile semantics otherwise.
- Both semantics prevent the compiler from doing certain transformations;
- the difference is that atomic accesses are guaranteed to be atomic,
- while volatile accesses aren't. Thus, in the volatile case we just cross
- our fingers hoping that the compiler will generate atomic accesses,
- since we assume the variables passed are machine-word sized and
- properly aligned.
- No barriers are implied by atomic_read/set in either Linux or QEMU.
-
-- atomic read-modify-write operations in Linux are of three kinds:
-
- atomic_OP returns void
- atomic_OP_return returns new value of the variable
- atomic_fetch_OP returns the old value of the variable
- atomic_cmpxchg returns the old value of the variable
-
- In QEMU, the second kind does not exist. Currently Linux has
- atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub.
-
-- different atomic read-modify-write operations in Linux imply
- a different set of memory barriers; in QEMU, all of them enforce
- sequential consistency, which means they imply full memory barriers
- before and after the operation.
-
-- Linux does not have an equivalent of atomic_mb_set(). In particular,
- note that smp_store_mb() is a little weaker than atomic_mb_set().
- atomic_mb_read() compiles to the same instructions as Linux's
- smp_load_acquire(), but this should be treated as an implementation
- detail. QEMU does have atomic_load_acquire() and atomic_store_release()
- macros, but for now they are only used within atomic.h. This may
- change in the future.
-
-
-SOURCES
-=======
-
-* Documentation/memory-barriers.txt from the Linux kernel
-
-* "The JSR-133 Cookbook for Compiler Writers", available at
- http://g.oswego.edu/dl/jmm/cookbook.html
+++ /dev/null
-<!--
-Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
-All rights reserved.
-
-This file is licensed via The FreeBSD Documentation License, the full text of
-which is included at the end of this document.
--->
-
-# Dirty Bitmaps and Incremental Backup
-
-* Dirty Bitmaps are objects that track which data needs to be backed up for the
- next incremental backup.
-
-* Dirty bitmaps can be created at any time and attached to any node
- (not just complete drives.)
-
-## Dirty Bitmap Names
-
-* A dirty bitmap's name is unique to the node, but bitmaps attached to different
- nodes can share the same name.
-
-* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
- name, but any user-created bitmaps may not be. There can be any number of
- anonymous bitmaps per node.
-
-* The name of a user-created bitmap must not be empty ("").
-
-## Bitmap Modes
-
-* A Bitmap can be "frozen," which means that it is currently in-use by a backup
- operation and cannot be deleted, renamed, written to, reset,
- etc.
-
-* The normal operating mode for a bitmap is "active."
-
-## Basic QMP Usage
-
-### Supported Commands ###
-
-* block-dirty-bitmap-add
-* block-dirty-bitmap-remove
-* block-dirty-bitmap-clear
-
-### Creation
-
-* To create a new bitmap, enabled, on the drive with id=drive0:
-
-```json
-{ "execute": "block-dirty-bitmap-add",
- "arguments": {
- "node": "drive0",
- "name": "bitmap0"
- }
-}
-```
-
-* This bitmap will have a default granularity that matches the cluster size of
- its associated drive, if available, clamped to between [4KiB, 64KiB].
- The current default for qcow2 is 64KiB.
-
-* To create a new bitmap that tracks changes in 32KiB segments:
-
-```json
-{ "execute": "block-dirty-bitmap-add",
- "arguments": {
- "node": "drive0",
- "name": "bitmap0",
- "granularity": 32768
- }
-}
-```
-
-### Deletion
-
-* Bitmaps that are frozen cannot be deleted.
-
-* Deleting the bitmap does not impact any other bitmaps attached to the same
- node, nor does it affect any backups already created from this node.
-
-* Because bitmaps are only unique to the node to which they are attached,
- you must specify the node/drive name here, too.
-
-```json
-{ "execute": "block-dirty-bitmap-remove",
- "arguments": {
- "node": "drive0",
- "name": "bitmap0"
- }
-}
-```
-
-### Resetting
-
-* Resetting a bitmap will clear all information it holds.
-
-* An incremental backup created from an empty bitmap will copy no data,
- as if nothing has changed.
-
-```json
-{ "execute": "block-dirty-bitmap-clear",
- "arguments": {
- "node": "drive0",
- "name": "bitmap0"
- }
-}
-```
-
-## Transactions
-
-### Justification
-
-Bitmaps can be safely modified when the VM is paused or halted by using
-the basic QMP commands. For instance, you might perform the following actions:
-
-1. Boot the VM in a paused state.
-2. Create a full drive backup of drive0.
-3. Create a new bitmap attached to drive0.
-4. Resume execution of the VM.
-5. Incremental backups are ready to be created.
-
-At this point, the bitmap and drive backup would be correctly in sync,
-and incremental backups made from this point forward would be correctly aligned
-to the full drive backup.
-
-This is not particularly useful if we decide we want to start incremental
-backups after the VM has been running for a while, for which we will need to
-perform actions such as the following:
-
-1. Boot the VM and begin execution.
-2. Using a single transaction, perform the following operations:
- * Create bitmap0.
- * Create a full drive backup of drive0.
-3. Incremental backups are now ready to be created.
-
-### Supported Bitmap Transactions
-
-* block-dirty-bitmap-add
-* block-dirty-bitmap-clear
-
-The usages are identical to their respective QMP commands, but see below
-for examples.
-
-### Example: New Incremental Backup
-
-As outlined in the justification, perhaps we want to create a new incremental
-backup chain attached to a drive.
-
-```json
-{ "execute": "transaction",
- "arguments": {
- "actions": [
- {"type": "block-dirty-bitmap-add",
- "data": {"node": "drive0", "name": "bitmap0"} },
- {"type": "drive-backup",
- "data": {"device": "drive0", "target": "/path/to/full_backup.img",
- "sync": "full", "format": "qcow2"} }
- ]
- }
-}
-```
-
-### Example: New Incremental Backup Anchor Point
-
-Maybe we just want to create a new full backup with an existing bitmap and
-want to reset the bitmap to track the new chain.
-
-```json
-{ "execute": "transaction",
- "arguments": {
- "actions": [
- {"type": "block-dirty-bitmap-clear",
- "data": {"node": "drive0", "name": "bitmap0"} },
- {"type": "drive-backup",
- "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
- "sync": "full", "format": "qcow2"} }
- ]
- }
-}
-```
-
-## Incremental Backups
-
-The star of the show.
-
-**Nota Bene!** Only incremental backups of entire drives are supported for now.
-So despite the fact that you can attach a bitmap to any arbitrary node, they are
-only currently useful when attached to the root node. This is because
-drive-backup only supports drives/devices instead of arbitrary nodes.
-
-### Example: First Incremental Backup
-
-1. Create a full backup and sync it to the dirty bitmap, as in the transactional
-examples above; or with the VM offline, manually create a full copy and then
-create a new bitmap before the VM begins execution.
-
- * Let's assume the full backup is named 'full_backup.img'.
- * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
-
-2. Create a destination image for the incremental backup that utilizes the
-full backup as a backing image.
-
- * Let's assume it is named 'incremental.0.img'.
-
- ```sh
- # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
- ```
-
-3. Issue the incremental backup command:
-
- ```json
- { "execute": "drive-backup",
- "arguments": {
- "device": "drive0",
- "bitmap": "bitmap0",
- "target": "incremental.0.img",
- "format": "qcow2",
- "sync": "incremental",
- "mode": "existing"
- }
- }
- ```
-
-### Example: Second Incremental Backup
-
-1. Create a new destination image for the incremental backup that points to the
- previous one, e.g.: 'incremental.1.img'
-
- ```sh
- # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
- ```
-
-2. Issue a new incremental backup command. The only difference here is that we
- have changed the target image below.
-
- ```json
- { "execute": "drive-backup",
- "arguments": {
- "device": "drive0",
- "bitmap": "bitmap0",
- "target": "incremental.1.img",
- "format": "qcow2",
- "sync": "incremental",
- "mode": "existing"
- }
- }
- ```
-
-## Errors
-
-* In the event of an error that occurs after a backup job is successfully
- launched, either by a direct QMP command or a QMP transaction, the user
- will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied
- by a BLOCK_JOB_ERROR event.
-
-* In the case of a job being cancelled, the user will receive a
- BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events.
-
-* In either case, the incremental backup data contained within the bitmap is
- safely rolled back, and the data within the bitmap is not lost. The image
- file created for the failed attempt can be safely deleted.
-
-* Once the underlying problem is fixed (e.g. more storage space is freed up),
- you can simply retry the incremental backup command with the same bitmap.
-
-### Example
-
-1. Create a target image:
-
- ```sh
- # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
- ```
-
-2. Attempt to create an incremental backup via QMP:
-
- ```json
- { "execute": "drive-backup",
- "arguments": {
- "device": "drive0",
- "bitmap": "bitmap0",
- "target": "incremental.0.img",
- "format": "qcow2",
- "sync": "incremental",
- "mode": "existing"
- }
- }
- ```
-
-3. Receive an event notifying us of failure:
-
- ```json
- { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
- "data": { "speed": 0, "offset": 0, "len": 67108864,
- "error": "No space left on device",
- "device": "drive0", "type": "backup" },
- "event": "BLOCK_JOB_COMPLETED" }
- ```
-
-4. Delete the failed incremental, and re-create the image.
-
- ```sh
- # rm incremental.0.img
- # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
- ```
-
-5. Retry the command after fixing the underlying problem,
- such as freeing up space on the backup volume:
-
- ```json
- { "execute": "drive-backup",
- "arguments": {
- "device": "drive0",
- "bitmap": "bitmap0",
- "target": "incremental.0.img",
- "format": "qcow2",
- "sync": "incremental",
- "mode": "existing"
- }
- }
- ```
-
-6. Receive confirmation that the job completed successfully:
-
- ```json
- { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
- "data": { "device": "drive0", "type": "backup",
- "speed": 0, "len": 67108864, "offset": 67108864},
- "event": "BLOCK_JOB_COMPLETED" }
- ```
-
-### Partial Transactional Failures
-
-* Sometimes, a transaction will succeed in launching and return success,
- but then later the backup jobs themselves may fail. It is possible that
- a management application may have to deal with a partial backup failure
- after a successful transaction.
-
-* If multiple backup jobs are specified in a single transaction, when one of
- them fails, it will not interact with the other backup jobs in any way.
-
-* The job(s) that succeeded will clear the dirty bitmap associated with the
- operation, but the job(s) that failed will not. It is not "safe" to delete
- any incremental backups that were created successfully in this scenario,
- even though others failed.
-
-#### Example
-
-* QMP example highlighting two backup jobs:
-
- ```json
- { "execute": "transaction",
- "arguments": {
- "actions": [
- { "type": "drive-backup",
- "data": { "device": "drive0", "bitmap": "bitmap0",
- "format": "qcow2", "mode": "existing",
- "sync": "incremental", "target": "d0-incr-1.qcow2" } },
- { "type": "drive-backup",
- "data": { "device": "drive1", "bitmap": "bitmap1",
- "format": "qcow2", "mode": "existing",
- "sync": "incremental", "target": "d1-incr-1.qcow2" } }
- ]
- }
- }
- ```
-
-* QMP example response, highlighting one success and one failure:
- * Acknowledgement that the Transaction was accepted and jobs were launched:
- ```json
- { "return": {} }
- ```
-
- * Later, QEMU sends notice that the first job was completed:
- ```json
- { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
- "data": { "device": "drive0", "type": "backup",
- "speed": 0, "len": 67108864, "offset": 67108864 },
- "event": "BLOCK_JOB_COMPLETED"
- }
- ```
-
- * Later yet, QEMU sends notice that the second job has failed:
- ```json
- { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
- "data": { "device": "drive1", "action": "report",
- "operation": "read" },
- "event": "BLOCK_JOB_ERROR" }
- ```
-
- ```json
- { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
- "data": { "speed": 0, "offset": 0, "len": 67108864,
- "error": "Input/output error",
- "device": "drive1", "type": "backup" },
- "event": "BLOCK_JOB_COMPLETED" }
- ```
-
-* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
- but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
- incremental backup of all drives at a point-in-time is to be made,
- new backups for both drives will need to be made, taking into account
- that a new incremental backup for drive0 needs to be based on top of
- "d0-incr-1.qcow2."
-
-### Grouped Completion Mode
-
-* While jobs launched by transactions normally complete or fail on their own,
- it is possible to instruct them to complete or fail together as a group.
-
-* QMP transactions take an optional properties structure that can affect
- the semantics of the transaction.
-
-* The "completion-mode" transaction property can be either "individual"
- which is the default, legacy behavior described above, or "grouped,"
- a new behavior detailed below.
-
-* Delayed Completion: In grouped completion mode, no jobs will report
- success until all jobs are ready to report success.
-
-* Grouped failure: If any job fails in grouped completion mode, all remaining
- jobs will be cancelled. Any incremental backups will restore their dirty
- bitmap objects as if no backup command was ever issued.
-
- * Regardless of whether QEMU reports a particular incremental backup job as
- CANCELLED or as an ERROR, the in-memory bitmap will be restored.
-
-#### Example
-
-* Here's the same example scenario from above with the new property:
-
- ```json
- { "execute": "transaction",
- "arguments": {
- "actions": [
- { "type": "drive-backup",
- "data": { "device": "drive0", "bitmap": "bitmap0",
- "format": "qcow2", "mode": "existing",
- "sync": "incremental", "target": "d0-incr-1.qcow2" } },
- { "type": "drive-backup",
- "data": { "device": "drive1", "bitmap": "bitmap1",
- "format": "qcow2", "mode": "existing",
- "sync": "incremental", "target": "d1-incr-1.qcow2" } }
- ],
- "properties": {
- "completion-mode": "grouped"
- }
- }
- }
- ```
-
-* QMP example response, highlighting a failure for drive1:
- * Acknowledgement that the Transaction was accepted and jobs were launched:
- ```json
- { "return": {} }
- ```
-
- * Later, QEMU sends notice that the second job has errored out,
- but that the first job was also cancelled:
- ```json
- { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
- "data": { "device": "drive1", "action": "report",
- "operation": "read" },
- "event": "BLOCK_JOB_ERROR" }
- ```
-
- ```json
- { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
- "data": { "speed": 0, "offset": 0, "len": 67108864,
- "error": "Input/output error",
- "device": "drive1", "type": "backup" },
- "event": "BLOCK_JOB_COMPLETED" }
- ```
-
- ```json
- { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
- "data": { "device": "drive0", "type": "backup", "speed": 0,
- "len": 67108864, "offset": 16777216 },
- "event": "BLOCK_JOB_CANCELLED" }
- ```
-
-<!--
-The FreeBSD Documentation License
-
-Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
-PDF, PostScript, RTF and so forth) with or without modification, are permitted
-provided that the following conditions are met:
-
-Redistributions of source code (Markdown) must retain the above copyright
-notice, this list of conditions and the following disclaimer of this file
-unmodified.
-
-Redistributions in compiled form (transformed to other DTDs, converted to PDF,
-PostScript, RTF and other formats) must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation and/or
-other materials provided with the distribution.
-
-THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
-THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--->
+++ /dev/null
-Block I/O error injection using blkdebug
-----------------------------------------
-Copyright (C) 2014-2015 Red Hat Inc
-
-This work is licensed under the terms of the GNU GPL, version 2 or later. See
-the COPYING file in the top-level directory.
-
-The blkdebug block driver is a rule-based error injection engine. It can be
-used to exercise error code paths in block drivers including ENOSPC (out of
-space) and EIO.
-
-This document gives an overview of the features available in blkdebug.
-
-Background
-----------
-Block drivers have many error code paths that handle I/O errors. Image formats
-are especially complex since metadata I/O errors during cluster allocation or
-while updating tables happen halfway through request processing and require
-discipline to keep image files consistent.
-
-Error injection allows test cases to trigger I/O errors at specific points.
-This way, all error paths can be tested to make sure they are correct.
-
-Rules
------
-The blkdebug block driver takes a list of "rules" that tell the error injection
-engine when to fail an I/O request.
-
-Each I/O request is evaluated against the rules. If a rule matches the request
-then its "action" is executed.
-
-Rules can be placed in a configuration file; the configuration file
-follows the same .ini-like format used by QEMU's -readconfig option, and
-each section of the file represents a rule.
-
-The following configuration file defines a single rule:
-
- $ cat blkdebug.conf
- [inject-error]
- event = "read_aio"
- errno = "28"
-
-This rule fails all aio read requests with ENOSPC (28). Note that the errno
-value depends on the host. On Linux, see
-/usr/include/asm-generic/errno-base.h for errno values.
-
-Invoke QEMU as follows:
-
- $ qemu-system-x86_64 \
- -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \
- -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0
-
-Rules support the following attributes:
-
- event - which type of operation to match (e.g. read_aio, write_aio,
- flush_to_os, flush_to_disk). See the "Events" section for
- information on events.
-
- state - (optional) the engine must be in this state number in order for this
- rule to match. See the "State transitions" section for information
- on states.
-
- errno - the numeric errno value to return when a request matches this rule.
- The errno values depend on the host since the numeric values are not
- standardized in the POSIX specification.
-
- sector - (optional) a sector number that the request must overlap in order to
- match this rule
-
- once - (optional, default "off") only execute this action on the first
- matching request
-
- immediately - (optional, default "off") return a NULL BlockAIOCB
- pointer and fail without an errno instead. This
- exercises the code path where BlockAIOCB fails and the
- caller's BlockCompletionFunc is not invoked.
-
-Events
-------
-Block drivers provide information about the type of I/O request they are about
-to make so rules can match specific types of requests. For example, the qcow2
-block driver tells blkdebug when it accesses the L1 table so rules can match
-only L1 table accesses and not other metadata or guest data requests.
-
-The core events are:
-
- read_aio - guest data read
-
- write_aio - guest data write
-
- flush_to_os - write out unwritten block driver state (e.g. cached metadata)
-
- flush_to_disk - flush the host block device's disk cache
-
-See qapi/block-core.json:BlkdebugEvent for the full list of events.
-You may need to grep block driver source code to understand the
-meaning of specific events.
-
-State transitions
------------------
-There are cases where more power is needed to match a particular I/O request in
-a longer sequence of requests. For example:
-
- write_aio
- flush_to_disk
- write_aio
-
-How do we match the 2nd write_aio but not the first? This is where state
-transitions come in.
-
-The error injection engine has an integer called the "state" that always starts
-initialized to 1. The state integer is internal to blkdebug and cannot be
-observed from outside but rules can interact with it for powerful matching
-behavior.
-
-Rules can be conditional on the current state and they can transition to a new
-state.
-
-When a rule's "state" attribute is non-zero then the current state must equal
-the attribute in order for the rule to match.
-
-For example, to match the 2nd write_aio:
-
- [set-state]
- event = "write_aio"
- state = "1"
- new_state = "2"
-
- [inject-error]
- event = "write_aio"
- state = "2"
- errno = "5"
-
-The first write_aio request matches the set-state rule and transitions from
-state 1 to state 2. Once state 2 has been entered, the set-state rule no
-longer matches since it requires state 1. But the inject-error rule now
-matches the next write_aio request and injects EIO (5).
-
-State transition rules support the following attributes:
-
- event - which type of operation to match (e.g. read_aio, write_aio,
- flush_to_os, flush_to_disk). See the "Events" section for
- information on events.
-
- state - (optional) the engine must be in this state number in order for this
- rule to match
-
- new_state - transition to this state number
-
-Suspend and resume
-------------------
-Exercising code paths in block drivers may require specific ordering amongst
-concurrent requests. The "breakpoint" feature allows requests to be halted on
-a blkdebug event and resumed later. This makes it possible to achieve
-deterministic ordering when multiple requests are in flight.
-
-Breakpoints on blkdebug events are associated with a user-defined "tag" string.
-This tag serves as an identifier by which the request can be resumed at a later
-point.
-
-See the qemu-io(1) break, resume, remove_break, and wait_break commands for
-details.
+++ /dev/null
-= Block driver correctness testing with blkverify =
-
-== Introduction ==
-
-This document describes how to use the blkverify protocol to test that a block
-driver is operating correctly.
-
-It is difficult to test and debug block drivers against real guests. Often
-processes inside the guest will crash because corrupt sectors were read as part
-of the executable. Other times obscure errors are raised by a program inside
-the guest. These issues are extremely hard to trace back to bugs in the block
-driver.
-
-Blkverify solves this problem by catching data corruption inside QEMU the first
-time bad data is read and reporting the disk sector that is corrupted.
-
-== How it works ==
-
-The blkverify protocol has two child block devices, the "test" device and the
-"raw" device. Read/write operations are mirrored to both devices so their
-state should always be in sync.
-
-The "raw" device is a raw image, a flat file, that has identical starting
-contents to the "test" image. The idea is that the "raw" device will handle
-read/write operations correctly and not corrupt data. It can be used as a
-reference for comparison against the "test" device.
-
-After a mirrored read operation completes, blkverify will compare the data and
-raise an error if it is not identical. This makes it possible to catch the
-first instance where corrupt data is read.
-
-== Example ==
-
-Imagine raw.img has 0xcd repeated throughout its first sector:
-
- $ ./qemu-io -c 'read -v 0 512' raw.img
- 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
- 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
- [...]
- 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
- 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
- read 512/512 bytes at offset 0
- 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec)
-
-And test.img is corrupt, its first sector is zeroed when it shouldn't be:
-
- $ ./qemu-io -c 'read -v 0 512' test.img
- 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
- 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
- [...]
- 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
- 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
- read 512/512 bytes at offset 0
- 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec)
-
-This error is caught by blkverify:
-
- $ ./qemu-io -c 'read 0 512' blkverify:raw.img:test.img
- blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
-
-A more realistic scenario is verifying the installation of a guest OS:
-
- $ ./qemu-img create raw.img 16G
- $ ./qemu-img create -f qcow2 test.qcow2 16G
- $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \
- -drive file=blkverify:raw.img:test.qcow2
-
-If the installation is aborted when blkverify detects corruption, use qemu-io
-to explore the contents of the disk image at the sector in question.
+++ /dev/null
- The QEMU build system architecture
- ==================================
-
-This document aims to help developers understand the architecture of the
-QEMU build system. As with projects using GNU autotools, the QEMU build
-system has two stages, first the developer runs the "configure" script
-to determine the local build environment characteristics, then they run
-"make" to build the project. That is about where the similarities with
-GNU autotools end, so try to forget what you know about them.
-
-
-Stage 1: configure
-==================
-
-The QEMU configure script is written directly in shell, and should be
-compatible with any POSIX shell, hence it uses #!/bin/sh. An important
-implication of this is that it is important to avoid using bash-isms on
-development platforms where bash is the default shell.
-
-In contrast to autoconf scripts, QEMU's configure is expected to be
-silent while it is checking for features. It will only display output
-when an error occurs, or to show the final feature enablement summary
-on completion.
-
-Adding new checks to the configure script usually comprises the
-following tasks:
-
- - Initialize one or more variables with the default feature state.
-
- Ideally features should auto-detect whether they are present,
- so try to avoid hardcoding the initial state to either enabled
- or disabled, as that forces the user to pass a --enable-XXX
- / --disable-XXX flag on every invocation of configure.
-
- - Add support to the command line arg parser to handle any new
- --enable-XXX / --disable-XXX flags required by the feature XXX.
-
- - Add information to the help output message to report on the new
- feature flag.
-
- - Add code to perform the actual feature check. As noted above, try to
- be fully dynamic in checking enablement/disablement.
-
- - Add code to print out the feature status in the configure summary
- upon completion.
-
- - Add any new makefile variables to $config_host_mak on completion.
-
-
-Taking (a simplified version of) the probe for gnutls from configure,
-we have the following pieces:
-
- # Initial variable state
- gnutls=""
-
- ..snip..
-
- # Configure flag processing
- --disable-gnutls) gnutls="no"
- ;;
- --enable-gnutls) gnutls="yes"
- ;;
-
- ..snip..
-
- # Help output feature message
- gnutls GNUTLS cryptography support
-
- ..snip..
-
- # Test for gnutls
- if test "$gnutls" != "no"; then
- if $pkg_config --exists "gnutls"; then
- gnutls_cflags=`$pkg_config --cflags gnutls`
- gnutls_libs=`$pkg_config --libs gnutls`
- libs_softmmu="$gnutls_libs $libs_softmmu"
- libs_tools="$gnutls_libs $libs_tools"
- QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags"
- gnutls="yes"
- elif test "$gnutls" = "yes"; then
- feature_not_found "gnutls" "Install gnutls devel"
- else
- gnutls="no"
- fi
- fi
-
- ..snip..
-
- # Completion feature summary
- echo "GNUTLS support $gnutls"
-
- ..snip..
-
- # Define make variables
- if test "$gnutls" = "yes" ; then
- echo "CONFIG_GNUTLS=y" >> $config_host_mak
- fi
-
-
-Helper functions
-----------------
-
-The configure script provides a variety of helper functions to assist
-developers in checking for system features:
-
- - do_cc $ARGS...
-
- Attempt to run the system C compiler passing it $ARGS...
-
- - do_cxx $ARGS...
-
- Attempt to run the system C++ compiler passing it $ARGS...
-
- - compile_object $CFLAGS
-
- Attempt to compile a test program with the system C compiler using
- $CFLAGS. The test program must have been previously written to a file
- called $TMPC.
-
- - compile_prog $CFLAGS $LDFLAGS
-
- Attempt to compile a test program with the system C compiler using
- $CFLAGS and link it with the system linker using $LDFLAGS. The test
- program must have been previously written to a file called $TMPC.
-
- - has $COMMAND
-
- Determine if $COMMAND exists in the current environment, either as a
- shell builtin, or executable binary, returning 0 on success.
-
- - path_of $COMMAND
-
- Return the fully qualified path of $COMMAND, printing it to stdout,
- and returning 0 on success.
-
- - check_define $NAME
-
- Determine if the macro $NAME is defined by the system C compiler
-
- - check_include $NAME
-
- Determine if the include $NAME file is available to the system C
- compiler
-
- - write_c_skeleton
-
- Write a minimal C program main() function to the temporary file
- indicated by $TMPC
-
- - feature_not_found $NAME $REMEDY
-
- Print a message to stderr that the feature $NAME was not available
- on the system, suggesting the user try $REMEDY to address the
- problem.
-
- - error_exit $MESSAGE $MORE...
-
- Print $MESSAGE to stderr, followed by $MORE... and then exit from the
- configure script with non-zero status
-
- - query_pkg_config $ARGS...
-
- Run pkg-config passing it $ARGS. If QEMU is doing a static build,
- then --static will be automatically added to $ARGS
-
-
-Stage 2: makefiles
-==================
-
-The use of GNU make is required with the QEMU build system.
-
-Although the source code is spread across multiple subdirectories, the
-build system should be considered largely non-recursive in nature, in
-contrast to common practices seen with automake. There is some recursive
-invocation of make, but this is related to the things being built,
-rather than the source directory structure.
-
-QEMU currently supports both VPATH and non-VPATH builds, so there are
-three general ways to invoke configure & perform a build.
-
- - VPATH, build artifacts outside of QEMU source tree entirely
-
- cd ../
- mkdir build
- cd build
- ../qemu/configure
- make
-
- - VPATH, build artifacts in a subdir of QEMU source tree
-
- mkdir build
- cd build
- ../configure
- make
-
- - non-VPATH, build artifacts everywhere
-
- ./configure
- make
-
-The QEMU maintainers generally recommend that a VPATH build is used by
-developers. Patches to QEMU are expected to ensure VPATH build still
-works.
-
-
-Module structure
-----------------
-
-There are a number of key outputs of the QEMU build system:
-
- - Tools - qemu-img, qemu-nbd, qga (guest agent), etc
- - System emulators - qemu-system-$ARCH
- - Userspace emulators - qemu-$ARCH
- - Unit tests
-
-The source code is highly modularized, split across many files to
-facilitate building of all of these components with as little duplicated
-compilation as possible. There can be considered to be two distinct
-groups of files, those which are independent of the QEMU emulation
-target and those which are dependent on the QEMU emulation target.
-
-In the target-independent set lives various general purpose helper code,
-such as error handling infrastructure, standard data structures,
-platform portability wrapper functions, etc. This code can be compiled
-once only and the .o files linked into all output binaries.
-
-In the target-dependent set lives CPU emulation, device emulation and
-much glue code. This sometimes also has to be compiled multiple times,
-once for each target being built.
-
-The utility code that is used by all binaries is built into a
-static archive called libqemuutil.a, which is then linked to all the
-binaries. In order to provide hooks that are only needed by some of the
-binaries, code in libqemuutil.a may depend on other functions that are
-not fully implemented by all QEMU binaries. To deal with this there is a
-second library called libqemustub.a which provides dummy stubs for all
-these functions. These will get lazy linked into the binary if the real
-implementation is not present. In this way, the libqemustub.a static
-library can be thought of as a portable implementation of the weak
-symbols concept. All binaries should link to both libqemuutil.a and
-libqemustub.a. e.g.
-
- qemu-img$(EXESUF): qemu-img.o ..snip.. libqemuutil.a libqemustub.a
-
-
-Windows platform portability
-----------------------------
-
-On Windows, all binaries have the suffix '.exe', so all Makefile rules
-which create binaries must include the $(EXESUF) variable on the binary
-name. e.g.
-
- qemu-img$(EXESUF): qemu-img.o ..snip..
-
-This expands to '.exe' on Windows, or '' on other platforms.
-
-A further complication for the system emulator binaries is that
-two separate binaries need to be generated.
-
-The main binary (e.g. qemu-system-x86_64.exe) is linked against the
-Windows console runtime subsystem. These are expected to be run from a
-command prompt window, and so will print stderr to the console that
-launched them.
-
-The second binary generated has a 'w' on the end of its name (e.g.
-qemu-system-x86_64w.exe) and is linked against the Windows graphical
-runtime subsystem. These are expected to be run directly from the
-desktop and will open up a dedicated console window for stderr output.
-
-The Makefile.target will generate the binary for the graphical subsystem
-first, and then use objcopy to relink it against the console subsystem
-to generate the second binary.
-
-
-Object variable naming
-----------------------
-
-The QEMU convention is to define variables to list different groups of
-object files. These are named with the convention $PREFIX-obj-y. For
-example the libqemuutil.a file will be linked with all objects listed
-in a variable 'util-obj-y'. So, for example, util/Makefile.objs will
-contain a set of definitions looking like
-
- util-obj-y += bitmap.o bitops.o hbitmap.o
- util-obj-y += fifo8.o
- util-obj-y += acl.o
- util-obj-y += error.o qemu-error.o
-
-When there is an object file which needs to be conditionally built based
-on some characteristic of the host system, the configure script will
-define a variable for the conditional. For example, on Windows it will
-define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a
-value of 'y'. It is now possible to use the config variables when
-listing object files. For example,
-
- util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o
- util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
-
-On Windows this expands to
-
- util-obj-y += oslib-win32.o qemu-thread-win32.o
- util-obj-n += oslib-posix.o qemu-thread-posix.o
-
-Since libqemuutil.a links in $(util-obj-y), the POSIX specific files
-listed against $(util-obj-n) are ignored on the Windows platform builds.
-
-
-CFLAGS / LDFLAGS / LIBS handling
---------------------------------
-
-There are many different binaries being built with differing purposes,
-and some of them might even be 3rd party libraries pulled in via git
-submodules. As such the use of the global CFLAGS variable is generally
-avoided in QEMU, since it would apply to too many build targets.
-
-Flags that are needed by any QEMU code (i.e. everything *except* GIT
-submodule projects) are put in $(QEMU_CFLAGS) variable. For linker
-flags the $(LIBS) variable is sometimes used, but a couple of more
-targeted variables are preferred. $(libs_softmmu) is used for
-libraries that must be linked to system emulator targets, $(LIBS_TOOLS)
-is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used
-for the QEMU guest agent. There is currently no specific variable for
-the userspace emulator targets as the global $(LIBS), or more targeted
-variables shown below, are sufficient.
-
-In addition to these variables, it is possible to provide cflags and
-libs against individual source code files, by defining variables of the
-form $FILENAME-cflags and $FILENAME-libs. For example, the curl block
-driver needs to link to the libcurl library, so block/Makefile.objs defines
-some variables:
-
- curl.o-cflags := $(CURL_CFLAGS)
- curl.o-libs := $(CURL_LIBS)
-
-The scope is a little different between the two variables. The libs get
-used when linking any target binary that includes the curl.o object
-file, while the cflags get used when compiling the curl.c file only.
-
-
-Statically defined files
-------------------------
-
-The following key files are statically defined in the source tree, with
-the rules needed to build QEMU. Their behaviour is influenced by a
-number of dynamically created files listed later.
-
-- Makefile
-
-The main entry point used when invoking make to build all the components
-of QEMU. The default 'all' target will naturally result in the build of
-every component. The various tools and helper binaries are built
-directly via a non-recursive set of rules.
-
-Each system/userspace emulation target needs to have a slightly
-different set of make rules / variables. Thus, make will be recursively
-invoked for each of the emulation targets.
-
-The recursive invocation will end up processing the toplevel
-Makefile.target file (more on that later).
-
-
-- */Makefile.objs
-
-Since the source code is spread across multiple directories, the rules
-for each file are similarly modularized. Thus each subdirectory
-containing .c files will usually also contain a Makefile.objs file.
-These files are not directly invoked by a recursive make, but instead
-they are imported by the top level Makefile and/or Makefile.target
-
-Each Makefile.objs usually just declares a set of variables listing the
-.o files that need building from the source files in the directory. They
-will also define any custom linker or compiler flags. For example in
-block/Makefile.objs
-
- block-obj-$(CONFIG_LIBISCSI) += iscsi.o
- block-obj-$(CONFIG_CURL) += curl.o
-
- ..snip...
-
- iscsi.o-cflags := $(LIBISCSI_CFLAGS)
- iscsi.o-libs := $(LIBISCSI_LIBS)
- curl.o-cflags := $(CURL_CFLAGS)
- curl.o-libs := $(CURL_LIBS)
-
-If there are any rules defined in the Makefile.objs file, they should
-all use $(obj) as a prefix to the target, e.g.
-
- $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp
-
-
-- Makefile.target
-
-This file provides the entry point used to build each individual system
-or userspace emulator target. Each enabled target has its own
-subdirectory. For example if configure is run with the argument
-'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmmu'
-will be created, containing a 'Makefile' which symlinks back to
-Makefile.target
-
-So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up
-using Makefile.target for the build rules.
-
-
-- rules.mak
-
-This file provides the generic helper rules for invoking build tools, in
-particular the compiler and linker. This also contains the magic (hairy)
-'unnest-vars' function which is used to merge the variable definitions
-from all Makefile.objs in the source tree down into the main Makefile
-context.
-
-
-- default-configs/*.mak
-
-The files under default-configs/ control what emulated hardware is built
-into each QEMU system and userspace emulator targets. They merely
-contain a long list of config variable definitions. For example,
-default-configs/x86_64-softmmu.mak has:
-
- include pci.mak
- include sound.mak
- include usb.mak
- CONFIG_QXL=$(CONFIG_SPICE)
- CONFIG_VGA_ISA=y
- CONFIG_VGA_CIRRUS=y
- CONFIG_VMWARE_VGA=y
- CONFIG_VIRTIO_VGA=y
- ...snip...
-
-These files rarely need changing unless new devices / hardware need to
-be enabled for a particular system/userspace emulation target
-
-
-- tests/Makefile
-
-Rules for building the unit tests. This file is included directly by the
-top level Makefile, so anything defined in this file will influence the
-entire build system. Care needs to be taken when writing rules for tests
-to ensure they only apply to the unit test execution / build.
-
-- tests/docker/Makefile.include
-
-Rules for Docker tests. Like tests/Makefile, this file is included
-directly by the top level Makefile, anything defined in this file will
-influence the entire build system.
-
-- po/Makefile
-
-Rules for building and installing the binary message catalogs from the
-text .po file sources. This almost never needs changing for any reason.
-
-
-Dynamically created files
--------------------------
-
-The following files are generated dynamically by configure in order to
-control the behaviour of the statically defined makefiles. This avoids
-the need for QEMU makefiles to go through any pre-processing as seen
-with autotools, where Makefile.am generates Makefile.in which generates
-Makefile.
-
-
-- config-host.mak
-
-When configure has determined the characteristics of the build host it
-will write a long list of variables to config-host.mak file. This
-provides the various install directories, compiler / linker flags and a
-variety of CONFIG_* variables related to optionally enabled features.
-This is imported by the top level Makefile in order to tailor the build
-output.
-
-The variables defined here are those which are applicable to all QEMU
-build outputs. Variables which are potentially different for each
-emulator target are defined by the next file...
-
-It is also used as a dependency checking mechanism. If make sees that
-the modification timestamp on configure is newer than that on
-config-host.mak, then configure will be re-run.
-
-
-- config-host.h
-
-The config-host.h file is used by source code to determine what features
-are enabled. It is generated from the contents of config-host.mak using
-the scripts/create_config program. This extracts all the CONFIG_* variables,
-most of the HOST_* variables and a few other misc variables from
-config-host.mak, formatting them as C preprocessor macros.
-
-
-- $TARGET-NAME/config-target.mak
-
-TARGET-NAME is the name of a system or userspace emulator, for example,
-x86_64-softmmu denotes the system emulator for the x86_64 architecture.
-This file contains the variables which need to vary on a per-target
-basis. For example, it will indicate whether KVM or Xen are enabled for
-the target and any other potential custom libraries needed for linking
-the target.
-
-
-- $TARGET-NAME/config-devices.mak
-
-TARGET-NAME is again the name of a system or userspace emulator. The
-config-devices.mak file is automatically generated by make using the
-scripts/make_device_config.sh program, feeding it the
-default-configs/$TARGET-NAME file as input.
-
-
-- $TARGET-NAME/Makefile
-
-This is the entrypoint used when make recurses to build a single system
-or userspace emulator target. It is merely a symlink back to the
-Makefile.target in the top level.
--- /dev/null
+###########################################################################
+#
+# You can pass this file directly to qemu using the -readconfig
+# command line switch.
+#
+# This config file creates an EHCI adapter with companion UHCI
+# controllers as multifunction device in PCI slot "1d".
+#
+# Specify "bus=ehci.0" when creating usb devices to hook them up
+# there.
+#
+
+[device "ehci"]
+ driver = "ich9-usb-ehci1"
+ addr = "1d.7"
+ multifunction = "on"
+
+[device "uhci-1"]
+ driver = "ich9-usb-uhci1"
+ addr = "1d.0"
+ multifunction = "on"
+ masterbus = "ehci.0"
+ firstport = "0"
+
+[device "uhci-2"]
+ driver = "ich9-usb-uhci2"
+ addr = "1d.1"
+ multifunction = "on"
+ masterbus = "ehci.0"
+ firstport = "2"
+
+[device "uhci-3"]
+ driver = "ich9-usb-uhci3"
+ addr = "1d.2"
+ multifunction = "on"
+ masterbus = "ehci.0"
+ firstport = "4"
--- /dev/null
+# mach-virt - VirtIO guest (graphical console)
+# =========================================================
+#
+# Usage:
+#
+# $ qemu-system-aarch64 \
+# -nodefaults \
+# -readconfig mach-virt-graphical.cfg \
+# -cpu host
+#
+# You will probably need to tweak the lines marked as
+# CHANGE ME before being able to use this configuration!
+#
+# The guest will have a selection of VirtIO devices
+# tailored towards optimal performance with modern guests,
+# and will be accessed through a graphical console.
+#
+# ---------------------------------------------------------
+#
+# Using -nodefaults is required to have full control over
+# the virtual hardware: when it's specified, QEMU will
+# populate the board with only the builtin peripherals,
+# such as the PL011 UART, plus a PCI Express Root Bus; the
+# user will then have to explicitly add further devices.
+#
+# The PCI Express Root Bus shows up in the guest as:
+#
+# 00:00.0 Host bridge
+#
+# This configuration file adds a number of other useful
+# devices, more specifically:
+#
+# 00:01.0 Display controller
+# 00:1c.* PCI bridge (PCI Express Root Ports)
+# 01:00.0 SCSI storage controller
+# 02:00.0 Ethernet controller
+# 03:00.0 USB controller
+#
+# More information about these devices is available below.
+
+
+# Machine options
+# =========================================================
+#
+# We use the virt machine type and enable KVM acceleration
+# for better performance.
+#
+# Using less than 1 GiB of memory is probably not going to
+# yield good performance in the guest, and might even lead
+# to obscure boot issues in some cases.
+#
+# Unfortunately, there is no way to configure the CPU model
+# in this file, so it will have to be provided on the
+# command line, but we can configure the guest to use the
+# same GIC version as the host.
+
+[machine]
+ type = "virt"
+ accel = "kvm"
+ gic-version = "host"
+
+[memory]
+ size = "1024"
+
+
+# Firmware configuration
+# =========================================================
+#
+# There are two parts to the firmware: a read-only image
+# containing the executable code, which is shared between
+# guests, and a read/write variable store that is owned
+# by one specific guest, exclusively, and is used to
+# record information such as the UEFI boot order.
+#
+# For any new guest, its permanent, private variable store
+# should initially be copied from the template file
+# provided along with the firmware binary.
+#
+# Depending on the OS distribution you're using on the
+# host, the name of the package containing the firmware
+# binary and variable store template, as well as the paths
+# to the files themselves, will be different. For example:
+#
+# Fedora
+# edk2-aarch64 (pkg)
+# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin)
+# /usr/share/edk2/aarch64/vars-template-pflash.raw (var)
+#
+# RHEL
+# AAVMF (pkg)
+# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
+# /usr/share/AAVMF/AAVMF_VARS.fd (var)
+#
+# Debian/Ubuntu
+# qemu-efi (pkg)
+# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
+# /usr/share/AAVMF/AAVMF_VARS.fd (var)
+
+[drive "uefi-binary"]
+ file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME
+ format = "raw"
+ if = "pflash"
+ unit = "0"
+ readonly = "on"
+
+[drive "uefi-varstore"]
+ file = "guest_VARS.fd" # CHANGE ME
+ format = "raw"
+ if = "pflash"
+ unit = "1"
+
+
+# PCI bridge (PCI Express Root Ports)
+# =========================================================
+#
+# We create eight PCI Express Root Ports, and we plug them
+# all into separate functions of the same slot. Some of
+# them will be used by devices, the rest will remain
+# available for hotplug.
+
+[device "pcie.1"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.0"
+ port = "1"
+ chassis = "1"
+ multifunction = "on"
+
+[device "pcie.2"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.1"
+ port = "2"
+ chassis = "2"
+
+[device "pcie.3"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.2"
+ port = "3"
+ chassis = "3"
+
+[device "pcie.4"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.3"
+ port = "4"
+ chassis = "4"
+
+[device "pcie.5"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.4"
+ port = "5"
+ chassis = "5"
+
+[device "pcie.6"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.5"
+ port = "6"
+ chassis = "6"
+
+[device "pcie.7"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.6"
+ port = "7"
+ chassis = "7"
+
+[device "pcie.8"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.7"
+ port = "8"
+ chassis = "8"
+
+
+# SCSI storage controller (and storage)
+# =========================================================
+#
+# We use virtio-scsi here so that we can (hot)plug a large
+# number of disks without running into issues; a SCSI disk,
+# backed by a qcow2 disk image on the host's filesystem, is
+# attached to it.
+#
+# We also create an optical disk, mostly for installation
+# purposes: once the guest OS has been successfully
+# installed, the guest will no longer boot from optical
+# media. If you don't want, or no longer want, to have an
+# optical disk in the guest you can safely comment out
+# all relevant sections below.
+
+[device "scsi"]
+ driver = "virtio-scsi-pci"
+ bus = "pcie.1"
+ addr = "00.0"
+
+[device "scsi-disk"]
+ driver = "scsi-hd"
+ bus = "scsi.0"
+ drive = "disk"
+ bootindex = "1"
+
+[drive "disk"]
+ file = "guest.qcow2" # CHANGE ME
+ format = "qcow2"
+ if = "none"
+
+[device "scsi-optical-disk"]
+ driver = "scsi-cd"
+ bus = "scsi.0"
+ drive = "optical-disk"
+ bootindex = "2"
+
+[drive "optical-disk"]
+ file = "install.iso" # CHANGE ME
+ format = "raw"
+ if = "none"
+
+
+# Ethernet controller
+# =========================================================
+#
+# We use virtio-net for improved performance over emulated
+# hardware; on the host side, we take advantage of user
+# networking so that the QEMU process doesn't require any
+# additional privileges.
+
+[netdev "hostnet"]
+ type = "user"
+
+[device "net"]
+ driver = "virtio-net-pci"
+ netdev = "hostnet"
+ bus = "pcie.2"
+ addr = "00.0"
+
+
+# USB controller (and input devices)
+# =========================================================
+#
+# We add a virtualization-friendly USB 3.0 controller and
+# a USB keyboard / USB tablet combo so that graphical
+# guests can be controlled appropriately.
+
+[device "usb"]
+ driver = "nec-usb-xhci"
+ bus = "pcie.3"
+ addr = "00.0"
+
+[device "keyboard"]
+ driver = "usb-kbd"
+ bus = "usb.0"
+
+[device "tablet"]
+ driver = "usb-tablet"
+ bus = "usb.0"
+
+
+# Display controller
+# =========================================================
+#
+# We use virtio-gpu because the legacy VGA framebuffer is
+# very troublesome on aarch64, and virtio-gpu is the only
+# video device that doesn't implement it.
+#
+# If you're running the guest on a remote, potentially
+# headless host, you will probably want to append something
+# like
+#
+# -display vnc=127.0.0.1:0
+#
+# to the command line in order to prevent QEMU from
+# creating a graphical display window on the host and
+# enable remote access instead.
+
+[device "video"]
+ driver = "virtio-gpu"
+ bus = "pcie.0"
+ addr = "01.0"
--- /dev/null
+# mach-virt - VirtIO guest (serial console)
+# =========================================================
+#
+# Usage:
+#
+# $ qemu-system-aarch64 \
+# -nodefaults \
+# -readconfig mach-virt-serial.cfg \
+# -display none -serial mon:stdio \
+# -cpu host
+#
+# You will probably need to tweak the lines marked as
+# CHANGE ME before being able to use this configuration!
+#
+# The guest will have a selection of VirtIO devices
+# tailored towards optimal performance with modern guests,
+# and will be accessed through the serial console.
+#
+# ---------------------------------------------------------
+#
+# Using -nodefaults is required to have full control over
+# the virtual hardware: when it's specified, QEMU will
+# populate the board with only the builtin peripherals,
+# such as the PL011 UART, plus a PCI Express Root Bus; the
+# user will then have to explicitly add further devices.
+#
+# The PCI Express Root Bus shows up in the guest as:
+#
+# 00:00.0 Host bridge
+#
+# This configuration file adds a number of other useful
+# devices, more specifically:
+#
+# 00:1c.* PCI bridge (PCI Express Root Ports)
+# 01:00.0 SCSI storage controller
+# 02:00.0 Ethernet controller
+#
+# More information about these devices is available below.
+#
+# We use '-display none' to prevent QEMU from creating a
+# graphical display window, which would serve no use in
+# this specific configuration, and '-serial mon:stdio' to
+# multiplex the guest's serial console and the QEMU monitor
+# to the host's stdio; use 'Ctrl+A h' to learn how to
+# switch between the two and more.
+
+
+# Machine options
+# =========================================================
+#
+# We use the virt machine type and enable KVM acceleration
+# for better performance.
+#
+# Using less than 1 GiB of memory is probably not going to
+# yield good performance in the guest, and might even lead
+# to obscure boot issues in some cases.
+#
+# Unfortunately, there is no way to configure the CPU model
+# in this file, so it will have to be provided on the
+# command line, but we can configure the guest to use the
+# same GIC version as the host.
+
+[machine]
+ type = "virt"
+ accel = "kvm"
+ gic-version = "host"
+
+[memory]
+ size = "1024"
+
+
+# Firmware configuration
+# =========================================================
+#
+# There are two parts to the firmware: a read-only image
+# containing the executable code, which is shared between
+# guests, and a read/write variable store that is owned
+# by one specific guest, exclusively, and is used to
+# record information such as the UEFI boot order.
+#
+# For any new guest, its permanent, private variable store
+# should initially be copied from the template file
+# provided along with the firmware binary.
+#
+# Depending on the OS distribution you're using on the
+# host, the name of the package containing the firmware
+# binary and variable store template, as well as the paths
+# to the files themselves, will be different. For example:
+#
+# Fedora
+# edk2-aarch64 (pkg)
+# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin)
+# /usr/share/edk2/aarch64/vars-template-pflash.raw (var)
+#
+# RHEL
+# AAVMF (pkg)
+# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
+# /usr/share/AAVMF/AAVMF_VARS.fd (var)
+#
+# Debian/Ubuntu
+# qemu-efi (pkg)
+# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
+# /usr/share/AAVMF/AAVMF_VARS.fd (var)
+
+[drive "uefi-binary"]
+ file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME
+ format = "raw"
+ if = "pflash"
+ unit = "0"
+ readonly = "on"
+
+[drive "uefi-varstore"]
+ file = "guest_VARS.fd" # CHANGE ME
+ format = "raw"
+ if = "pflash"
+ unit = "1"
+
+
+# PCI bridge (PCI Express Root Ports)
+# =========================================================
+#
+# We create eight PCI Express Root Ports, and we plug them
+# all into separate functions of the same slot. Some of
+# them will be used by devices, the rest will remain
+# available for hotplug.
+
+[device "pcie.1"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.0"
+ port = "1"
+ chassis = "1"
+ multifunction = "on"
+
+[device "pcie.2"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.1"
+ port = "2"
+ chassis = "2"
+
+[device "pcie.3"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.2"
+ port = "3"
+ chassis = "3"
+
+[device "pcie.4"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.3"
+ port = "4"
+ chassis = "4"
+
+[device "pcie.5"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.4"
+ port = "5"
+ chassis = "5"
+
+[device "pcie.6"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.5"
+ port = "6"
+ chassis = "6"
+
+[device "pcie.7"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.6"
+ port = "7"
+ chassis = "7"
+
+[device "pcie.8"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.7"
+ port = "8"
+ chassis = "8"
+
+
+# SCSI storage controller (and storage)
+# =========================================================
+#
+# We use virtio-scsi here so that we can (hot)plug a large
+# number of disks without running into issues; a SCSI disk,
+# backed by a qcow2 disk image on the host's filesystem, is
+# attached to it.
+#
+# We also create an optical disk, mostly for installation
+# purposes: once the guest OS has been successfully
+# installed, the guest will no longer boot from optical
+# media. If you don't want, or no longer want, to have an
+# optical disk in the guest you can safely comment out
+# all relevant sections below.
+
+[device "scsi"]
+ driver = "virtio-scsi-pci"
+ bus = "pcie.1"
+ addr = "00.0"
+
+[device "scsi-disk"]
+ driver = "scsi-hd"
+ bus = "scsi.0"
+ drive = "disk"
+ bootindex = "1"
+
+[drive "disk"]
+ file = "guest.qcow2" # CHANGE ME
+ format = "qcow2"
+ if = "none"
+
+[device "scsi-optical-disk"]
+ driver = "scsi-cd"
+ bus = "scsi.0"
+ drive = "optical-disk"
+ bootindex = "2"
+
+[drive "optical-disk"]
+ file = "install.iso" # CHANGE ME
+ format = "raw"
+ if = "none"
+
+
+# Ethernet controller
+# =========================================================
+#
+# We use virtio-net for improved performance over emulated
+# hardware; on the host side, we take advantage of user
+# networking so that the QEMU process doesn't require any
+# additional privileges.
+
+[netdev "hostnet"]
+ type = "user"
+
+[device "net"]
+ driver = "virtio-net-pci"
+ netdev = "hostnet"
+ bus = "pcie.2"
+ addr = "00.0"
--- /dev/null
+# q35 - Emulated guest (graphical console)
+# =========================================================
+#
+# Usage:
+#
+# $ qemu-system-x86_64 \
+# -nodefaults \
+# -readconfig q35-emulated.cfg
+#
+# You will probably need to tweak the lines marked as
+# CHANGE ME before being able to use this configuration!
+#
+# The guest will have a selection of emulated devices that
+# closely resembles that of a physical machine, and will be
+# accessed through a graphical console.
+#
+# ---------------------------------------------------------
+#
+# Using -nodefaults is required to have full control over
+# the virtual hardware: when it's specified, QEMU will
+# populate the board with only the builtin peripherals
+# plus a small selection of core PCI devices and
+# controllers; the user will then have to explicitly add
+# further devices.
+#
+# The core PCI devices show up in the guest as:
+#
+# 00:00.0 Host bridge
+# 00:1f.0 ISA bridge / LPC
+# 00:1f.2 SATA (AHCI) controller
+# 00:1f.3 SMBus controller
+#
+# This configuration file adds a number of devices that
+# are pretty much guaranteed to be present in every single
+# physical machine based on q35, more specifically:
+#
+# 00:01.0 VGA compatible controller
+# 00:19.0 Ethernet controller
+# 00:1a.* USB controller (#2)
+# 00:1b.0 Audio device
+# 00:1c.* PCI bridge (PCI Express Root Ports)
+# 00:1d.* USB controller (#1)
+# 00:1e.0 PCI bridge (legacy PCI bridge)
+#
+# More information about these devices is available below.
+
+
+# Machine options
+# =========================================================
+#
+# We use the q35 machine type and enable KVM acceleration
+# for better performance.
+#
+# Using less than 1 GiB of memory is probably not going to
+# yield good performance in the guest, and might even lead
+# to obscure boot issues in some cases.
+#
+# Unfortunately, there is no way to configure the CPU model
+# in this file, so it will have to be provided on the
+# command line.
+
+[machine]
+ type = "q35"
+ accel = "kvm"
+
+[memory]
+ size = "1024"
+
+
+# PCI bridge (PCI Express Root Ports)
+# =========================================================
+#
+# We add four PCI Express Root Ports, all sharing the same
+# slot on the PCI Express Root Bus. These ports support
+# hotplug.
+
+[device "ich9-pcie-port-1"]
+ driver = "ioh3420"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1c.0"
+ port = "1"
+ chassis = "1"
+
+[device "ich9-pcie-port-2"]
+ driver = "ioh3420"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1c.1"
+ port = "2"
+ chassis = "2"
+
+[device "ich9-pcie-port-3"]
+ driver = "ioh3420"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1c.2"
+ port = "3"
+ chassis = "3"
+
+[device "ich9-pcie-port-4"]
+ driver = "ioh3420"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1c.3"
+ port = "4"
+ chassis = "4"
+
+
+# PCI bridge (legacy PCI bridge)
+# =========================================================
+#
+# This bridge can be used to build an independent topology
+# for legacy PCI devices. PCI Express devices should be
+# plugged into PCI Express slots instead, so ideally there
+# will be no devices connected to this bridge.
+
+[device "ich9-pci-bridge"]
+ driver = "i82801b11-bridge"
+ bus = "pcie.0"
+ addr = "1e.0"
+
+
+# SATA storage
+# =========================================================
+#
+# An implicit SATA controller is created automatically for
+# every single q35 guest; here we create a disk, backed by
+# a qcow2 disk image on the host's filesystem, and attach
+# it to that controller so that the guest can use it.
+#
+# We also create an optical disk, mostly for installation
+# purposes: once the guest OS has been successfully
+# installed, the guest will no longer boot from optical
+# media. If you don't want, or no longer want, to have an
+# optical disk in the guest you can safely comment out
+# all relevant sections below.
+
+[device "sata-disk"]
+ driver = "ide-hd"
+ bus = "ide.0"
+ drive = "disk"
+ bootindex = "1"
+
+[drive "disk"]
+ file = "guest.qcow2" # CHANGE ME
+ format = "qcow2"
+ if = "none"
+
+[device "sata-optical-disk"]
+ driver = "ide-cd"
+ bus = "ide.1"
+ drive = "optical-disk"
+ bootindex = "2"
+
+[drive "optical-disk"]
+ file = "install.iso" # CHANGE ME
+ format = "raw"
+ if = "none"
+
+
+# USB controller (#1)
+# =========================================================
+#
+# EHCI controller + UHCI companion controllers.
+
+[device "ich9-ehci-1"]
+ driver = "ich9-usb-ehci1"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1d.7"
+
+[device "ich9-uhci-1"]
+ driver = "ich9-usb-uhci1"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1d.0"
+ masterbus = "ich9-ehci-1.0"
+ firstport = "0"
+
+[device "ich9-uhci-2"]
+ driver = "ich9-usb-uhci2"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1d.1"
+ masterbus = "ich9-ehci-1.0"
+ firstport = "2"
+
+[device "ich9-uhci-3"]
+ driver = "ich9-usb-uhci3"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1d.2"
+ masterbus = "ich9-ehci-1.0"
+ firstport = "4"
+
+
+# USB controller (#2)
+# =========================================================
+#
+# EHCI controller + UHCI companion controllers.
+
+[device "ich9-ehci-2"]
+ driver = "ich9-usb-ehci2"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1a.7"
+
+[device "ich9-uhci-4"]
+ driver = "ich9-usb-uhci4"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1a.0"
+ masterbus = "ich9-ehci-2.0"
+ firstport = "0"
+
+[device "ich9-uhci-5"]
+ driver = "ich9-usb-uhci5"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1a.1"
+ masterbus = "ich9-ehci-2.0"
+ firstport = "2"
+
+[device "ich9-uhci-6"]
+ driver = "ich9-usb-uhci6"
+ multifunction = "on"
+ bus = "pcie.0"
+ addr = "1a.2"
+ masterbus = "ich9-ehci-2.0"
+ firstport = "4"
+
+
+# Ethernet controller
+# =========================================================
+#
+# We add a Gigabit Ethernet interface to the guest; on the
+# host side, we take advantage of user networking so that
+# the QEMU process doesn't require any additional
+# privileges.
+
+[netdev "hostnet"]
+ type = "user"
+
+[device "net"]
+ driver = "e1000"
+ netdev = "hostnet"
+ bus = "pcie.0"
+ addr = "19.0"
+
+
+# VGA compatible controller
+# =========================================================
+#
+# We use stdvga instead of Cirrus as it supports more video
+# modes and is closer to what actual hardware looks like.
+#
+# If you're running the guest on a remote, potentially
+# headless host, you will probably want to append something
+# like
+#
+# -display vnc=127.0.0.1:0
+#
+# to the command line in order to prevent QEMU from
+# creating a graphical display window on the host and
+# enable remote access instead.
+
+[device "video"]
+ driver = "VGA"
+ bus = "pcie.0"
+ addr = "01.0"
+
+
+# Audio device
+# =========================================================
+#
+# The sound card is a legacy PCI device that is plugged
+# directly into the PCI Express Root Bus.
+
+[device "ich9-hda-audio"]
+ driver = "ich9-intel-hda"
+ bus = "pcie.0"
+ addr = "1b.0"
+
+[device "ich9-hda-duplex"]
+ driver = "hda-duplex"
+ bus = "ich9-hda-audio.0"
+ cad = "0"
--- /dev/null
+# q35 - VirtIO guest (graphical console)
+# =========================================================
+#
+# Usage:
+#
+# $ qemu-system-x86_64 \
+# -nodefaults \
+# -readconfig q35-virtio-graphical.cfg
+#
+# You will probably need to tweak the lines marked as
+# CHANGE ME before being able to use this configuration!
+#
+# The guest will have a selection of VirtIO devices
+# tailored towards optimal performance with modern guests,
+# and will be accessed through a graphical console.
+#
+# ---------------------------------------------------------
+#
+# Using -nodefaults is required to have full control over
+# the virtual hardware: when it's specified, QEMU will
+# populate the board with only the builtin peripherals
+# plus a small selection of core PCI devices and
+# controllers; the user will then have to explicitly add
+# further devices.
+#
+# The core PCI devices show up in the guest as:
+#
+# 00:00.0 Host bridge
+# 00:1f.0 ISA bridge / LPC
+# 00:1f.2 SATA (AHCI) controller
+# 00:1f.3 SMBus controller
+#
+# This configuration file adds a number of other useful
+# devices, more specifically:
+#
+# 00:01.0 VGA compatible controller
+# 00:1b.0 Audio device
+# 00:1c.* PCI bridge (PCI Express Root Ports)
+# 01:00.0 SCSI storage controller
+# 02:00.0 Ethernet controller
+# 03:00.0 USB controller
+#
+# More information about these devices is available below.
+
+
+# Machine options
+# =========================================================
+#
+# We use the q35 machine type and enable KVM acceleration
+# for better performance.
+#
+# Using less than 1 GiB of memory is probably not going to
+# yield good performance in the guest, and might even lead
+# to obscure boot issues in some cases.
+
+[machine]
+ type = "q35"
+ accel = "kvm"
+
+[memory]
+ size = "1024"
+
+
+# PCI bridge (PCI Express Root Ports)
+# =========================================================
+#
+# We create eight PCI Express Root Ports, and we plug them
+# all into separate functions of the same slot. Some of
+# them will be used by devices, the rest will remain
+# available for hotplug.
+
+[device "pcie.1"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.0"
+ port = "1"
+ chassis = "1"
+ multifunction = "on"
+
+[device "pcie.2"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.1"
+ port = "2"
+ chassis = "2"
+
+[device "pcie.3"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.2"
+ port = "3"
+ chassis = "3"
+
+[device "pcie.4"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.3"
+ port = "4"
+ chassis = "4"
+
+[device "pcie.5"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.4"
+ port = "5"
+ chassis = "5"
+
+[device "pcie.6"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.5"
+ port = "6"
+ chassis = "6"
+
+[device "pcie.7"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.6"
+ port = "7"
+ chassis = "7"
+
+[device "pcie.8"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.7"
+ port = "8"
+ chassis = "8"
+
+
+# SCSI storage controller (and storage)
+# =========================================================
+#
+# We use virtio-scsi here so that we can (hot)plug a large
+# number of disks without running into issues; a SCSI disk,
+# backed by a qcow2 disk image on the host's filesystem, is
+# attached to it.
+#
+# We also create an optical disk, mostly for installation
+# purposes: once the guest OS has been successfully
+# installed, the guest will no longer boot from optical
+# media. If you don't want, or no longer want, to have an
+# optical disk in the guest you can safely comment out
+# all relevant sections below.
+
+[device "scsi"]
+ driver = "virtio-scsi-pci"
+ bus = "pcie.1"
+ addr = "00.0"
+
+[device "scsi-disk"]
+ driver = "scsi-hd"
+ bus = "scsi.0"
+ drive = "disk"
+ bootindex = "1"
+
+[drive "disk"]
+ file = "guest.qcow2" # CHANGE ME
+ format = "qcow2"
+ if = "none"
+
+[device "scsi-optical-disk"]
+ driver = "scsi-cd"
+ bus = "scsi.0"
+ drive = "optical-disk"
+ bootindex = "2"
+
+[drive "optical-disk"]
+ file = "install.iso" # CHANGE ME
+ format = "raw"
+ if = "none"
+
+
+# Ethernet controller
+# =========================================================
+#
+# We use virtio-net for improved performance over emulated
+# hardware; on the host side, we take advantage of user
+# networking so that the QEMU process doesn't require any
+# additional privileges.
+
+[netdev "hostnet"]
+ type = "user"
+
+[device "net"]
+ driver = "virtio-net-pci"
+ netdev = "hostnet"
+ bus = "pcie.2"
+ addr = "00.0"
+
+
+# USB controller (and input devices)
+# =========================================================
+#
+# We add a virtualization-friendly USB 3.0 controller and
+# a USB tablet so that graphical guests can be controlled
+# appropriately. A USB keyboard is not needed, as q35
+# guests get a PS/2 one added automatically.
+
+[device "usb"]
+ driver = "nec-usb-xhci"
+ bus = "pcie.3"
+ addr = "00.0"
+
+[device "tablet"]
+ driver = "usb-tablet"
+ bus = "usb.0"
+
+
+# VGA compatible controller
+# =========================================================
+#
+# We plug the QXL video card directly into the PCI Express
+# Root Bus as it is a legacy PCI device; this way, we can
+# reduce the number of PCI Express controllers in the
+# guest.
+#
+# If you're running the guest on a remote, potentially
+# headless host, you will probably want to append something
+# like
+#
+# -display vnc=127.0.0.1:0
+#
+# to the command line in order to prevent QEMU from
+# creating a graphical display window on the host and
+# enable remote access instead.
+
+[device "video"]
+ driver = "qxl-vga"
+ bus = "pcie.0"
+ addr = "01.0"
+
+
+# Audio device
+# =========================================================
+#
+# Like the video card, the sound card is a legacy PCI
+# device and as such can be plugged directly into the PCI
+# Express Root Bus.
+
+[device "sound"]
+ driver = "ich9-intel-hda"
+ bus = "pcie.0"
+ addr = "1b.0"
+
+[device "duplex"]
+ driver = "hda-duplex"
+ bus = "sound.0"
+ cad = "0"
--- /dev/null
+# q35 - VirtIO guest (serial console)
+# =========================================================
+#
+# Usage:
+#
+# $ qemu-system-x86_64 \
+# -nodefaults \
+# -readconfig q35-virtio-serial.cfg \
+# -display none -serial mon:stdio
+#
+# You will probably need to tweak the lines marked as
+# CHANGE ME before being able to use this configuration!
+#
+# The guest will have a selection of VirtIO devices
+# tailored towards optimal performance with modern guests,
+# and will be accessed through the serial console.
+#
+# ---------------------------------------------------------
+#
+# Using -nodefaults is required to have full control over
+# the virtual hardware: when it's specified, QEMU will
+# populate the board with only the builtin peripherals
+# plus a small selection of core PCI devices and
+# controllers; the user will then have to explicitly add
+# further devices.
+#
+# The core PCI devices show up in the guest as:
+#
+# 00:00.0 Host bridge
+# 00:1f.0 ISA bridge / LPC
+# 00:1f.2 SATA (AHCI) controller
+# 00:1f.3 SMBus controller
+#
+# This configuration file adds a number of other useful
+# devices, more specifically:
+#
+# 00:1c.* PCI bridge (PCI Express Root Ports)
+# 01:00.0 SCSI storage controller
+# 02:00.0 Ethernet controller
+#
+# More information about these devices is available below.
+#
+# We use '-display none' to prevent QEMU from creating a
+# graphical display window, which would serve no use in
+# this specific configuration, and '-serial mon:stdio' to
+# multiplex the guest's serial console and the QEMU monitor
+# to the host's stdio; use 'Ctrl+A h' to learn how to
+# switch between the two and more.
+
+
+# Machine options
+# =========================================================
+#
+# We use the q35 machine type and enable KVM acceleration
+# for better performance.
+#
+# Using less than 1 GiB of memory is probably not going to
+# yield good performance in the guest, and might even lead
+# to obscure boot issues in some cases.
+
+[machine]
+ type = "q35"
+ accel = "kvm"
+
+[memory]
+ size = "1024"
+
+
+# PCI bridge (PCI Express Root Ports)
+# =========================================================
+#
+# We create eight PCI Express Root Ports, and we plug them
+# all into separate functions of the same slot. Some of
+# them will be used by devices, the rest will remain
+# available for hotplug.
+
+[device "pcie.1"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.0"
+ port = "1"
+ chassis = "1"
+ multifunction = "on"
+
+[device "pcie.2"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.1"
+ port = "2"
+ chassis = "2"
+
+[device "pcie.3"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.2"
+ port = "3"
+ chassis = "3"
+
+[device "pcie.4"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.3"
+ port = "4"
+ chassis = "4"
+
+[device "pcie.5"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.4"
+ port = "5"
+ chassis = "5"
+
+[device "pcie.6"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.5"
+ port = "6"
+ chassis = "6"
+
+[device "pcie.7"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.6"
+ port = "7"
+ chassis = "7"
+
+[device "pcie.8"]
+ driver = "pcie-root-port"
+ bus = "pcie.0"
+ addr = "1c.7"
+ port = "8"
+ chassis = "8"
+
+
+# SCSI storage controller (and storage)
+# =========================================================
+#
+# We use virtio-scsi here so that we can (hot)plug a large
+# number of disks without running into issues; a SCSI disk,
+# backed by a qcow2 disk image on the host's filesystem, is
+# attached to it.
+#
+# We also create an optical disk, mostly for installation
+# purposes: once the guest OS has been successfully
+# installed, the guest will no longer boot from optical
+# media. If you don't want, or no longer want, to have an
+# optical disk in the guest you can safely comment out
+# all relevant sections below.
+
+[device "scsi"]
+ driver = "virtio-scsi-pci"
+ bus = "pcie.1"
+ addr = "00.0"
+
+[device "scsi-disk"]
+ driver = "scsi-hd"
+ bus = "scsi.0"
+ drive = "disk"
+ bootindex = "1"
+
+[drive "disk"]
+ file = "guest.qcow2" # CHANGE ME
+ format = "qcow2"
+ if = "none"
+
+[device "scsi-optical-disk"]
+ driver = "scsi-cd"
+ bus = "scsi.0"
+ drive = "optical-disk"
+ bootindex = "2"
+
+[drive "optical-disk"]
+ file = "install.iso" # CHANGE ME
+ format = "raw"
+ if = "none"
+
+
+# Ethernet controller
+# =========================================================
+#
+# We use virtio-net for improved performance over emulated
+# hardware; on the host side, we take advantage of user
+# networking so that the QEMU process doesn't require any
+# additional privileges.
+
+[netdev "hostnet"]
+ type = "user"
+
+[device "net"]
+ driver = "virtio-net-pci"
+ netdev = "hostnet"
+ bus = "pcie.2"
+ addr = "00.0"
--- /dev/null
+CPUs perform independent memory operations effectively in random order,
+but this can be a problem for CPU-CPU interaction (including interactions
+between QEMU and the guest). Multi-threaded programs use various tools
+to instruct the compiler and the CPU to restrict the order to something
+that is consistent with the expectations of the programmer.
+
+The most basic tool is locking. Mutexes, condition variables and
+semaphores are used in QEMU, and should be the default approach to
+synchronization. Anything else is considerably harder, but it's
+also justified more often than one would like. The two tools that
+are provided by qemu/atomic.h are memory barriers and atomic operations.
+
+Macros defined by qemu/atomic.h fall in three camps:
+
+- compiler barriers: barrier();
+
+- weak atomic access and manual memory barriers: atomic_read(),
+ atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(),
+ smp_mb_release(), smp_read_barrier_depends();
+
+- sequentially consistent atomic access: everything else.
+
+
+COMPILER MEMORY BARRIER
+=======================
+
+barrier() prevents the compiler from moving the memory accesses either
+side of it to the other side. The compiler barrier has no direct effect
+on the CPU, which may then reorder things however it wishes.
+
+barrier() is mostly used within qemu/atomic.h itself. On some
+architectures, CPU guarantees are strong enough that blocking compiler
+optimizations already ensures the correct order of execution. In this
+case, qemu/atomic.h will reduce stronger memory barriers to simple
+compiler barriers.
+
+Still, barrier() can be useful when writing code that can be interrupted
+by signal handlers.
+
+
+SEQUENTIALLY CONSISTENT ATOMIC ACCESS
+=====================================
+
+Most of the operations in the qemu/atomic.h header ensure *sequential
+consistency*, where "the result of any execution is the same as if the
+operations of all the processors were executed in some sequential order,
+and the operations of each individual processor appear in this sequence
+in the order specified by its program".
+
+qemu/atomic.h provides the following set of atomic read-modify-write
+operations:
+
+ void atomic_inc(ptr)
+ void atomic_dec(ptr)
+ void atomic_add(ptr, val)
+ void atomic_sub(ptr, val)
+ void atomic_and(ptr, val)
+ void atomic_or(ptr, val)
+
+ typeof(*ptr) atomic_fetch_inc(ptr)
+ typeof(*ptr) atomic_fetch_dec(ptr)
+ typeof(*ptr) atomic_fetch_add(ptr, val)
+ typeof(*ptr) atomic_fetch_sub(ptr, val)
+ typeof(*ptr) atomic_fetch_and(ptr, val)
+ typeof(*ptr) atomic_fetch_or(ptr, val)
+ typeof(*ptr) atomic_xchg(ptr, val)
+ typeof(*ptr) atomic_cmpxchg(ptr, old, new)
+
+all of which return the old value of *ptr. These operations are
+polymorphic; they operate on any type that is as wide as an int.
+
+Sequentially consistent loads and stores can be done using:
+
+ atomic_fetch_add(ptr, 0) for loads
+ atomic_xchg(ptr, val) for stores
+
+However, they are quite expensive on some platforms, notably POWER and
+ARM. Therefore, qemu/atomic.h provides two primitives with slightly
+weaker constraints:
+
+ typeof(*ptr) atomic_mb_read(ptr)
+ void atomic_mb_set(ptr, val)
+
+The semantics of these primitives map to Java volatile variables,
+and are strongly related to memory barriers as used in the Linux
+kernel (see below).
+
+As long as you use atomic_mb_read and atomic_mb_set, accesses cannot
+be reordered with each other, and it is also not possible to reorder
+"normal" accesses around them.
+
+However, and this is the important difference between
+atomic_mb_read/atomic_mb_set and sequential consistency, it is important
+for both threads to access the same volatile variable. It is not the
+case that everything visible to thread A when it writes volatile field f
+becomes visible to thread B after it reads volatile field g. The store
+and load have to "match" (i.e., be performed on the same volatile
+field) to achieve the right semantics.
+
+
+These operations operate on any type that is as wide as an int or smaller.
+
+
+WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS
+=============================================
+
+Compared to sequentially consistent atomic access, programming with
+weaker consistency models can be considerably more complicated.
+In general, if the algorithm you are writing includes both writes
+and reads on the same side, it is generally simpler to use sequentially
+consistent primitives.
+
+When using this model, variables are accessed with atomic_read() and
+atomic_set(), and restrictions on the ordering of accesses are enforced
+using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(),
+smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends().
+
+atomic_read() and atomic_set() prevent the compiler from using
+optimizations that might otherwise optimize accesses out of existence
+on the one hand, or that might create unsolicited accesses on the other.
+In general this should not have any effect, because the same compiler
+barriers are already implied by memory barriers. However, it is useful
+to do so, because it tells readers which variables are shared with
+other threads, and which are local to the current thread or protected
+by other, more mundane means.
+
+Memory barriers control the order of references to shared memory.
+They come in six kinds:
+
+- smp_rmb() guarantees that all the LOAD operations specified before
+ the barrier will appear to happen before all the LOAD operations
+ specified after the barrier with respect to the other components of
+ the system.
+
+ In other words, smp_rmb() puts a partial ordering on loads, but is not
+ required to have any effect on stores.
+
+- smp_wmb() guarantees that all the STORE operations specified before
+ the barrier will appear to happen before all the STORE operations
+ specified after the barrier with respect to the other components of
+ the system.
+
+ In other words, smp_wmb() puts a partial ordering on stores, but is not
+ required to have any effect on loads.
+
+- smp_mb_acquire() guarantees that all the LOAD operations specified before
+ the barrier will appear to happen before all the LOAD or STORE operations
+ specified after the barrier with respect to the other components of
+ the system.
+
+- smp_mb_release() guarantees that all the STORE operations specified *after*
+ the barrier will appear to happen after all the LOAD or STORE operations
+ specified *before* the barrier with respect to the other components of
+ the system.
+
+- smp_mb() guarantees that all the LOAD and STORE operations specified
+ before the barrier will appear to happen before all the LOAD and
+ STORE operations specified after the barrier with respect to the other
+ components of the system.
+
+ smp_mb() puts a partial ordering on both loads and stores. It is
+ stronger than both a read and a write memory barrier; it implies both
+ smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs
+ coming before the barrier from overtaking LOADs coming after the
+ barrier and vice versa.
+
+- smp_read_barrier_depends() is a weaker kind of read barrier. On
+ most processors, whenever two loads are performed such that the
+ second depends on the result of the first (e.g., the first load
+ retrieves the address to which the second load will be directed),
+ the processor will guarantee that the first LOAD will appear to happen
+ before the second with respect to the other components of the system.
+ However, this is not always true---for example, it was not true on
+ Alpha processors. Whenever this kind of access happens to shared
+ memory (that is not protected by a lock), a read barrier is needed,
+ and smp_read_barrier_depends() can be used instead of smp_rmb().
+
+ Note that the first load really has to have a _data_ dependency and not
+ a control dependency. If the address for the second load is dependent
+ on the first load, but the dependency is through a conditional rather
+ than actually loading the address itself, then it's a _control_
+ dependency and a full read barrier or better is required.
+
+
+This is the set of barriers that is required *between* two atomic_read()
+and atomic_set() operations to achieve sequential consistency:
+
+ | 2nd operation |
+ |-----------------------------------------------|
+ 1st operation | (after last) | atomic_read | atomic_set |
+ ---------------+----------------+-------------+----------------|
+ (before first) | | none | smp_mb_release |
+ ---------------+----------------+-------------+----------------|
+ atomic_read | smp_mb_acquire | smp_rmb* | ** |
+ ---------------+----------------+-------------+----------------|
+ atomic_set | none | smp_mb()*** | smp_wmb() |
+ ---------------+----------------+-------------+----------------|
+
+ * Or smp_read_barrier_depends().
+
+ ** This requires a load-store barrier. This is achieved by
+ either smp_mb_acquire() or smp_mb_release().
+
+ *** This requires a store-load barrier. On most machines, the only
+ way to achieve this is a full barrier.
+
+
+You can see that the two possible definitions of atomic_mb_read()
+and atomic_mb_set() are the following:
+
+ 1) atomic_mb_read(p) = atomic_read(p); smp_mb_acquire()
+ atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb()
+
+ 2) atomic_mb_read(p) = smp_mb(); atomic_read(p); smp_mb_acquire()
+ atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v);
+
+Usually the former is used, because smp_mb() is expensive and a program
+normally has more reads than writes. Therefore it makes more sense to
+make atomic_mb_set() the more expensive operation.
+
+There are two common cases in which atomic_mb_read and atomic_mb_set
+generate too many memory barriers, and thus it can be useful to manually
+place barriers instead:
+
+- when a data structure has one thread that is always a writer
+ and one thread that is always a reader, manual placement of
+ memory barriers makes the write side faster. Furthermore,
+ correctness is easy to check for in this case using the "pairing"
+ trick that is explained below:
+
+ thread 1 thread 1
+ ------------------------- ------------------------
+ (other writes)
+ smp_mb_release()
+ atomic_mb_set(&a, x) atomic_set(&a, x)
+ smp_wmb()
+ atomic_mb_set(&b, y) atomic_set(&b, y)
+
+ =>
+ thread 2 thread 2
+ ------------------------- ------------------------
+ y = atomic_mb_read(&b) y = atomic_read(&b)
+ smp_rmb()
+ x = atomic_mb_read(&a) x = atomic_read(&a)
+ smp_mb_acquire()
+
+ Note that the barrier between the stores in thread 1, and between
+ the loads in thread 2, has been optimized here to a write or a
+ read memory barrier respectively. On some architectures, notably
+ ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as
+ smp_mb, but smp_rmb and/or smp_wmb are more efficient.
+
+- sometimes, a thread is accessing many variables that are otherwise
+ unrelated to each other (for example because, apart from the current
+ thread, exactly one other thread will read or write each of these
+ variables). In this case, it is possible to "hoist" the implicit
+ barriers provided by atomic_mb_read() and atomic_mb_set() outside
+ a loop. For example, the above definition of atomic_mb_read() gives
+ the following transformation:
+
+ n = 0; n = 0;
+ for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++)
+ n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]);
+ smp_mb_acquire();
+
+ Similarly, atomic_mb_set() can be transformed as follows:
+
+ smp_mb_release();
+ for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++)
+ atomic_mb_set(&a[i], false); atomic_set(&a[i], false);
+ smp_mb();
+
+
+The two tricks can be combined. In this case, splitting a loop in
+two lets you hoist the barriers out of the loops _and_ eliminate the
+expensive smp_mb():
+
+ smp_mb_release();
+ for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++)
+ atomic_mb_set(&a[i], false); atomic_set(&a[i], false);
+ atomic_mb_set(&b[i], false); smp_wmb();
+ } for (i = 0; i < 10; i++)
+ atomic_set(&b[i], false);
+ smp_mb();
+
+ The other thread can still use atomic_mb_read()/atomic_mb_set().
+
+
+Memory barrier pairing
+----------------------
+
+A useful rule of thumb is that memory barriers should always, or almost
+always, be paired with another barrier. In the case of QEMU, however,
+note that the other barrier may actually be in a driver that runs in
+the guest!
+
+For the purposes of pairing, smp_read_barrier_depends() and smp_rmb()
+both count as read barriers. A read barrier shall pair with a write
+barrier or a full barrier; a write barrier shall pair with a read
+barrier or a full barrier. A full barrier can pair with anything.
+For example:
+
+ thread 1 thread 2
+ =============== ===============
+ a = 1;
+ smp_wmb();
+ b = 2; x = b;
+ smp_rmb();
+ y = a;
+
+Note that the "writing" thread is accessing the variables in the
+opposite order as the "reading" thread. This is expected: stores
+before the write barrier will normally match the loads after the
+read barrier, and vice versa. The same is true for more than 2
+accesses and for data dependency barriers:
+
+ thread 1 thread 2
+ =============== ===============
+ b[2] = 1;
+ smp_wmb();
+ x->i = 2;
+ smp_wmb();
+ a = x; x = a;
+ smp_read_barrier_depends();
+ y = x->i;
+ smp_read_barrier_depends();
+ z = b[y];
+
+smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(),
+and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release().
+
+
+COMPARISON WITH LINUX KERNEL MEMORY BARRIERS
+============================================
+
+Here is a list of differences between Linux kernel atomic operations
+and memory barriers, and the equivalents in QEMU:
+
+- atomic operations in Linux are always on a 32-bit int type and
+ use a boxed atomic_t type; atomic operations in QEMU are polymorphic
+ and use normal C types.
+
+- Originally, atomic_read and atomic_set in Linux gave no guarantee
+ at all. Linux 4.1 updated them to implement volatile
+ semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).
+
+ QEMU's atomic_read/set implement, if the compiler supports it, C11
+ atomic relaxed semantics, and volatile semantics otherwise.
+ Both semantics prevent the compiler from doing certain transformations;
+ the difference is that atomic accesses are guaranteed to be atomic,
+ while volatile accesses aren't. Thus, in the volatile case we just cross
+ our fingers hoping that the compiler will generate atomic accesses,
+ since we assume the variables passed are machine-word sized and
+ properly aligned.
+ No barriers are implied by atomic_read/set in either Linux or QEMU.
+
+- atomic read-modify-write operations in Linux are of three kinds:
+
+ atomic_OP returns void
+ atomic_OP_return returns new value of the variable
+ atomic_fetch_OP returns the old value of the variable
+ atomic_cmpxchg returns the old value of the variable
+
+ In QEMU, the second kind does not exist. Currently Linux has
+ atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub.
+
+- different atomic read-modify-write operations in Linux imply
+ a different set of memory barriers; in QEMU, all of them enforce
+ sequential consistency, which means they imply full memory barriers
+ before and after the operation.
+
+- Linux does not have an equivalent of atomic_mb_set(). In particular,
+ note that smp_store_mb() is a little weaker than atomic_mb_set().
+ atomic_mb_read() compiles to the same instructions as Linux's
+ smp_load_acquire(), but this should be treated as an implementation
+ detail. QEMU does have atomic_load_acquire() and atomic_store_release()
+ macros, but for now they are only used within atomic.h. This may
+ change in the future.
+
+
+SOURCES
+=======
+
+* Documentation/memory-barriers.txt from the Linux kernel
+
+* "The JSR-133 Cookbook for Compiler Writers", available at
+ http://g.oswego.edu/dl/jmm/cookbook.html
--- /dev/null
+<!--
+Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
+All rights reserved.
+
+This file is licensed via The FreeBSD Documentation License, the full text of
+which is included at the end of this document.
+-->
+
+# Dirty Bitmaps and Incremental Backup
+
+* Dirty Bitmaps are objects that track which data needs to be backed up for the
+ next incremental backup.
+
+* Dirty bitmaps can be created at any time and attached to any node
+ (not just complete drives.)
+
+## Dirty Bitmap Names
+
+* A dirty bitmap's name is unique to the node, but bitmaps attached to different
+ nodes can share the same name.
+
+* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
+ name, but any user-created bitmaps may not be. There can be any number of
+ anonymous bitmaps per node.
+
+* The name of a user-created bitmap must not be empty ("").
+
+## Bitmap Modes
+
+* A Bitmap can be "frozen," which means that it is currently in-use by a backup
+ operation and cannot be deleted, renamed, written to, reset,
+ etc.
+
+* The normal operating mode for a bitmap is "active."
+
+## Basic QMP Usage
+
+### Supported Commands ###
+
+* block-dirty-bitmap-add
+* block-dirty-bitmap-remove
+* block-dirty-bitmap-clear
+
+### Creation
+
+* To create a new bitmap, enabled, on the drive with id=drive0:
+
+```json
+{ "execute": "block-dirty-bitmap-add",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0"
+ }
+}
+```
+
+* This bitmap will have a default granularity that matches the cluster size of
+ its associated drive, if available, clamped to between [4KiB, 64KiB].
+ The current default for qcow2 is 64KiB.
+
+* To create a new bitmap that tracks changes in 32KiB segments:
+
+```json
+{ "execute": "block-dirty-bitmap-add",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0",
+ "granularity": 32768
+ }
+}
+```
+
+### Deletion
+
+* Bitmaps that are frozen cannot be deleted.
+
+* Deleting the bitmap does not impact any other bitmaps attached to the same
+ node, nor does it affect any backups already created from this node.
+
+* Because bitmaps are only unique to the node to which they are attached,
+ you must specify the node/drive name here, too.
+
+```json
+{ "execute": "block-dirty-bitmap-remove",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0"
+ }
+}
+```
+
+### Resetting
+
+* Resetting a bitmap will clear all information it holds.
+
+* An incremental backup created from an empty bitmap will copy no data,
+ as if nothing has changed.
+
+```json
+{ "execute": "block-dirty-bitmap-clear",
+ "arguments": {
+ "node": "drive0",
+ "name": "bitmap0"
+ }
+}
+```
+
+## Transactions
+
+### Justification
+
+Bitmaps can be safely modified when the VM is paused or halted by using
+the basic QMP commands. For instance, you might perform the following actions:
+
+1. Boot the VM in a paused state.
+2. Create a full drive backup of drive0.
+3. Create a new bitmap attached to drive0.
+4. Resume execution of the VM.
+5. Incremental backups are ready to be created.
+
+At this point, the bitmap and drive backup would be correctly in sync,
+and incremental backups made from this point forward would be correctly aligned
+to the full drive backup.
+
+This is not particularly useful if we decide we want to start incremental
+backups after the VM has been running for a while, for which we will need to
+perform actions such as the following:
+
+1. Boot the VM and begin execution.
+2. Using a single transaction, perform the following operations:
+ * Create bitmap0.
+ * Create a full drive backup of drive0.
+3. Incremental backups are now ready to be created.
+
+### Supported Bitmap Transactions
+
+* block-dirty-bitmap-add
+* block-dirty-bitmap-clear
+
+The usages are identical to their respective QMP commands, but see below
+for examples.
+
+### Example: New Incremental Backup
+
+As outlined in the justification, perhaps we want to create a new incremental
+backup chain attached to a drive.
+
+```json
+{ "execute": "transaction",
+ "arguments": {
+ "actions": [
+ {"type": "block-dirty-bitmap-add",
+ "data": {"node": "drive0", "name": "bitmap0"} },
+ {"type": "drive-backup",
+ "data": {"device": "drive0", "target": "/path/to/full_backup.img",
+ "sync": "full", "format": "qcow2"} }
+ ]
+ }
+}
+```
+
+### Example: New Incremental Backup Anchor Point
+
+Maybe we just want to create a new full backup with an existing bitmap and
+want to reset the bitmap to track the new chain.
+
+```json
+{ "execute": "transaction",
+ "arguments": {
+ "actions": [
+ {"type": "block-dirty-bitmap-clear",
+ "data": {"node": "drive0", "name": "bitmap0"} },
+ {"type": "drive-backup",
+ "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
+ "sync": "full", "format": "qcow2"} }
+ ]
+ }
+}
+```
+
+## Incremental Backups
+
+The star of the show.
+
+**Nota Bene!** Only incremental backups of entire drives are supported for now.
+So despite the fact that you can attach a bitmap to any arbitrary node, they are
+only currently useful when attached to the root node. This is because
+drive-backup only supports drives/devices instead of arbitrary nodes.
+
+### Example: First Incremental Backup
+
+1. Create a full backup and sync it to the dirty bitmap, as in the transactional
+examples above; or with the VM offline, manually create a full copy and then
+create a new bitmap before the VM begins execution.
+
+ * Let's assume the full backup is named 'full_backup.img'.
+ * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
+
+2. Create a destination image for the incremental backup that utilizes the
+full backup as a backing image.
+
+ * Let's assume it is named 'incremental.0.img'.
+
+ ```sh
+ # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+ ```
+
+3. Issue the incremental backup command:
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.0.img",
+ "format": "qcow2",
+ "sync": "incremental",
+ "mode": "existing"
+ }
+ }
+ ```
+
+### Example: Second Incremental Backup
+
+1. Create a new destination image for the incremental backup that points to the
+ previous one, e.g.: 'incremental.1.img'
+
+ ```sh
+ # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
+ ```
+
+2. Issue a new incremental backup command. The only difference here is that we
+ have changed the target image below.
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.1.img",
+ "format": "qcow2",
+ "sync": "incremental",
+ "mode": "existing"
+ }
+ }
+ ```
+
+## Errors
+
+* In the event of an error that occurs after a backup job is successfully
+ launched, either by a direct QMP command or a QMP transaction, the user
+ will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied
+ by a BLOCK_JOB_ERROR event.
+
+* In the case of a job being cancelled, the user will receive a
+ BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events.
+
+* In either case, the incremental backup data contained within the bitmap is
+ safely rolled back, and the data within the bitmap is not lost. The image
+ file created for the failed attempt can be safely deleted.
+
+* Once the underlying problem is fixed (e.g. more storage space is freed up),
+ you can simply retry the incremental backup command with the same bitmap.
+
+### Example
+
+1. Create a target image:
+
+ ```sh
+ # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+ ```
+
+2. Attempt to create an incremental backup via QMP:
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.0.img",
+ "format": "qcow2",
+ "sync": "incremental",
+ "mode": "existing"
+ }
+ }
+ ```
+
+3. Receive an event notifying us of failure:
+
+ ```json
+ { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
+ "data": { "speed": 0, "offset": 0, "len": 67108864,
+ "error": "No space left on device",
+ "device": "drive0", "type": "backup" },
+ "event": "BLOCK_JOB_COMPLETED" }
+ ```
+
+4. Delete the failed incremental, and re-create the image.
+
+ ```sh
+ # rm incremental.0.img
+ # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+ ```
+
+5. Retry the command after fixing the underlying problem,
+ such as freeing up space on the backup volume:
+
+ ```json
+ { "execute": "drive-backup",
+ "arguments": {
+ "device": "drive0",
+ "bitmap": "bitmap0",
+ "target": "incremental.0.img",
+ "format": "qcow2",
+ "sync": "incremental",
+ "mode": "existing"
+ }
+ }
+ ```
+
+6. Receive confirmation that the job completed successfully:
+
+ ```json
+ { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
+ "data": { "device": "drive0", "type": "backup",
+ "speed": 0, "len": 67108864, "offset": 67108864},
+ "event": "BLOCK_JOB_COMPLETED" }
+ ```
+
+### Partial Transactional Failures
+
+* Sometimes, a transaction will succeed in launching and return success,
+ but then later the backup jobs themselves may fail. It is possible that
+ a management application may have to deal with a partial backup failure
+ after a successful transaction.
+
+* If multiple backup jobs are specified in a single transaction, when one of
+ them fails, it will not interact with the other backup jobs in any way.
+
+* The job(s) that succeeded will clear the dirty bitmap associated with the
+ operation, but the job(s) that failed will not. It is not "safe" to delete
+ any incremental backups that were created successfully in this scenario,
+ even though others failed.
+
+#### Example
+
+* QMP example highlighting two backup jobs:
+
+ ```json
+ { "execute": "transaction",
+ "arguments": {
+ "actions": [
+ { "type": "drive-backup",
+ "data": { "device": "drive0", "bitmap": "bitmap0",
+ "format": "qcow2", "mode": "existing",
+ "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+ { "type": "drive-backup",
+ "data": { "device": "drive1", "bitmap": "bitmap1",
+ "format": "qcow2", "mode": "existing",
+ "sync": "incremental", "target": "d1-incr-1.qcow2" } }
+ ]
+ }
+ }
+ ```
+
+* QMP example response, highlighting one success and one failure:
+ * Acknowledgement that the Transaction was accepted and jobs were launched:
+ ```json
+ { "return": {} }
+ ```
+
+ * Later, QEMU sends notice that the first job was completed:
+ ```json
+ { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
+ "data": { "device": "drive0", "type": "backup",
+ "speed": 0, "len": 67108864, "offset": 67108864 },
+ "event": "BLOCK_JOB_COMPLETED"
+ }
+ ```
+
+ * Later yet, QEMU sends notice that the second job has failed:
+ ```json
+ { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
+ "data": { "device": "drive1", "action": "report",
+ "operation": "read" },
+ "event": "BLOCK_JOB_ERROR" }
+ ```
+
+ ```json
+ { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
+ "data": { "speed": 0, "offset": 0, "len": 67108864,
+ "error": "Input/output error",
+ "device": "drive1", "type": "backup" },
+ "event": "BLOCK_JOB_COMPLETED" }
+ ```
+
+* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
+ but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
+ incremental backup of all drives at a point-in-time is to be made,
+ new backups for both drives will need to be made, taking into account
+ that a new incremental backup for drive0 needs to be based on top of
+ "d0-incr-1.qcow2".
+
+### Grouped Completion Mode
+
+* While jobs launched by transactions normally complete or fail on their own,
+ it is possible to instruct them to complete or fail together as a group.
+
+* QMP transactions take an optional properties structure that can affect
+ the semantics of the transaction.
+
+* The "completion-mode" transaction property can be either "individual"
+ which is the default, legacy behavior described above, or "grouped,"
+ a new behavior detailed below.
+
+* Delayed Completion: In grouped completion mode, no jobs will report
+ success until all jobs are ready to report success.
+
+* Grouped failure: If any job fails in grouped completion mode, all remaining
+ jobs will be cancelled. Any incremental backups will restore their dirty
+ bitmap objects as if no backup command was ever issued.
+
+ * Regardless of whether QEMU reports a particular incremental backup job as
+ CANCELLED or as an ERROR, the in-memory bitmap will be restored.
+
+#### Example
+
+* Here's the same example scenario from above with the new property:
+
+ ```json
+ { "execute": "transaction",
+ "arguments": {
+ "actions": [
+ { "type": "drive-backup",
+ "data": { "device": "drive0", "bitmap": "bitmap0",
+ "format": "qcow2", "mode": "existing",
+ "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+ { "type": "drive-backup",
+ "data": { "device": "drive1", "bitmap": "bitmap1",
+ "format": "qcow2", "mode": "existing",
+ "sync": "incremental", "target": "d1-incr-1.qcow2" } }
+ ],
+ "properties": {
+ "completion-mode": "grouped"
+ }
+ }
+ }
+ ```
+
+* QMP example response, highlighting a failure for drive1:
+ * Acknowledgement that the Transaction was accepted and jobs were launched:
+ ```json
+ { "return": {} }
+ ```
+
+ * Later, QEMU sends notice that the second job has errored out,
+ but that the first job was also cancelled:
+ ```json
+ { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
+ "data": { "device": "drive1", "action": "report",
+ "operation": "read" },
+ "event": "BLOCK_JOB_ERROR" }
+ ```
+
+ ```json
+ { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
+ "data": { "speed": 0, "offset": 0, "len": 67108864,
+ "error": "Input/output error",
+ "device": "drive1", "type": "backup" },
+ "event": "BLOCK_JOB_COMPLETED" }
+ ```
+
+ ```json
+ { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
+ "data": { "device": "drive0", "type": "backup", "speed": 0,
+ "len": 67108864, "offset": 16777216 },
+ "event": "BLOCK_JOB_CANCELLED" }
+ ```
+
+<!--
+The FreeBSD Documentation License
+
+Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
+PDF, PostScript, RTF and so forth) with or without modification, are permitted
+provided that the following conditions are met:
+
+Redistributions of source code (Markdown) must retain the above copyright
+notice, this list of conditions and the following disclaimer of this file
+unmodified.
+
+Redistributions in compiled form (transformed to other DTDs, converted to PDF,
+PostScript, RTF and other formats) must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
--- /dev/null
+Block I/O error injection using blkdebug
+----------------------------------------
+Copyright (C) 2014-2015 Red Hat Inc
+
+This work is licensed under the terms of the GNU GPL, version 2 or later. See
+the COPYING file in the top-level directory.
+
+The blkdebug block driver is a rule-based error injection engine. It can be
+used to exercise error code paths in block drivers including ENOSPC (out of
+space) and EIO.
+
+This document gives an overview of the features available in blkdebug.
+
+Background
+----------
+Block drivers have many error code paths that handle I/O errors. Image formats
+are especially complex since metadata I/O errors during cluster allocation or
+while updating tables happen halfway through request processing and require
+discipline to keep image files consistent.
+
+Error injection allows test cases to trigger I/O errors at specific points.
+This way, all error paths can be tested to make sure they are correct.
+
+Rules
+-----
+The blkdebug block driver takes a list of "rules" that tell the error injection
+engine when to fail an I/O request.
+
+Each I/O request is evaluated against the rules. If a rule matches the request
+then its "action" is executed.
+
+Rules can be placed in a configuration file; the configuration file
+follows the same .ini-like format used by QEMU's -readconfig option, and
+each section of the file represents a rule.
+
+The following configuration file defines a single rule:
+
+ $ cat blkdebug.conf
+ [inject-error]
+ event = "read_aio"
+ errno = "28"
+
+This rule fails all aio read requests with ENOSPC (28). Note that the errno
+value depends on the host. On Linux, see
+/usr/include/asm-generic/errno-base.h for errno values.
+
+Invoke QEMU as follows:
+
+ $ qemu-system-x86_64 \
+ -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \
+ -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0
+
+Rules support the following attributes:
+
+ event - which type of operation to match (e.g. read_aio, write_aio,
+ flush_to_os, flush_to_disk). See the "Events" section for
+ information on events.
+
+ state - (optional) the engine must be in this state number in order for this
+ rule to match. See the "State transitions" section for information
+ on states.
+
+ errno - the numeric errno value to return when a request matches this rule.
+ The errno values depend on the host since the numeric values are not
+ standardized in the POSIX specification.
+
+ sector - (optional) a sector number that the request must overlap in order to
+ match this rule
+
+ once - (optional, default "off") only execute this action on the first
+ matching request
+
+ immediately - (optional, default "off") return a NULL BlockAIOCB
+ pointer and fail without an errno instead. This
+ exercises the code path where BlockAIOCB fails and the
+ caller's BlockCompletionFunc is not invoked.
+
+Events
+------
+Block drivers provide information about the type of I/O request they are about
+to make so rules can match specific types of requests. For example, the qcow2
+block driver tells blkdebug when it accesses the L1 table so rules can match
+only L1 table accesses and not other metadata or guest data requests.
+
+The core events are:
+
+ read_aio - guest data read
+
+ write_aio - guest data write
+
+ flush_to_os - write out unwritten block driver state (e.g. cached metadata)
+
+ flush_to_disk - flush the host block device's disk cache
+
+See qapi/block-core.json:BlkdebugEvent for the full list of events.
+You may need to grep block driver source code to understand the
+meaning of specific events.
+
+State transitions
+-----------------
+There are cases where more power is needed to match a particular I/O request in
+a longer sequence of requests. For example:
+
+ write_aio
+ flush_to_disk
+ write_aio
+
+How do we match the 2nd write_aio but not the first? This is where state
+transitions come in.
+
+The error injection engine has an integer called the "state" that always starts
+initialized to 1. The state integer is internal to blkdebug and cannot be
+observed from outside but rules can interact with it for powerful matching
+behavior.
+
+Rules can be conditional on the current state and they can transition to a new
+state.
+
+When a rule's "state" attribute is non-zero then the current state must equal
+the attribute in order for the rule to match.
+
+For example, to match the 2nd write_aio:
+
+ [set-state]
+ event = "write_aio"
+ state = "1"
+ new_state = "2"
+
+ [inject-error]
+ event = "write_aio"
+ state = "2"
+ errno = "5"
+
+The first write_aio request matches the set-state rule and transitions from
+state 1 to state 2. Once state 2 has been entered, the set-state rule no
+longer matches since it requires state 1. But the inject-error rule now
+matches the next write_aio request and injects EIO (5).
+
+State transition rules support the following attributes:
+
+ event - which type of operation to match (e.g. read_aio, write_aio,
+ flush_to_os, flush_to_disk). See the "Events" section for
+ information on events.
+
+ state - (optional) the engine must be in this state number in order for this
+ rule to match
+
+ new_state - transition to this state number
+
+Suspend and resume
+------------------
+Exercising code paths in block drivers may require specific ordering amongst
+concurrent requests. The "breakpoint" feature allows requests to be halted on
+a blkdebug event and resumed later. This makes it possible to achieve
+deterministic ordering when multiple requests are in flight.
+
+Breakpoints on blkdebug events are associated with a user-defined "tag" string.
+This tag serves as an identifier by which the request can be resumed at a later
+point.
+
+See the qemu-io(1) break, resume, remove_break, and wait_break commands for
+details.
--- /dev/null
+= Block driver correctness testing with blkverify =
+
+== Introduction ==
+
+This document describes how to use the blkverify protocol to test that a block
+driver is operating correctly.
+
+It is difficult to test and debug block drivers against real guests. Often
+processes inside the guest will crash because corrupt sectors were read as part
+of the executable. Other times obscure errors are raised by a program inside
+the guest. These issues are extremely hard to trace back to bugs in the block
+driver.
+
+Blkverify solves this problem by catching data corruption inside QEMU the first
+time bad data is read and reporting the disk sector that is corrupted.
+
+== How it works ==
+
+The blkverify protocol has two child block devices, the "test" device and the
+"raw" device. Read/write operations are mirrored to both devices so their
+state should always be in sync.
+
+The "raw" device is a raw image, a flat file, that has identical starting
+contents to the "test" image. The idea is that the "raw" device will handle
+read/write operations correctly and not corrupt data. It can be used as a
+reference for comparison against the "test" device.
+
+After a mirrored read operation completes, blkverify will compare the data and
+raise an error if it is not identical. This makes it possible to catch the
+first instance where corrupt data is read.
+
+== Example ==
+
+Imagine raw.img has 0xcd repeated throughout its first sector:
+
+ $ ./qemu-io -c 'read -v 0 512' raw.img
+ 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
+ 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
+ [...]
+ 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
+ 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................
+ read 512/512 bytes at offset 0
+ 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec)
+
+And test.img is corrupt, its first sector is zeroed when it shouldn't be:
+
+ $ ./qemu-io -c 'read -v 0 512' test.img
+ 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+ 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+ [...]
+ 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+ 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+ read 512/512 bytes at offset 0
+ 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec)
+
+This error is caught by blkverify:
+
+ $ ./qemu-io -c 'read 0 512' blkverify:raw.img:test.img
+ blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
+
+A more realistic scenario is verifying the installation of a guest OS:
+
+ $ ./qemu-img create raw.img 16G
+ $ ./qemu-img create -f qcow2 test.qcow2 16G
+ $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \
+ -drive file=blkverify:raw.img:test.qcow2
+
+If the installation is aborted when blkverify detects corruption, use qemu-io
+to explore the contents of the disk image at the sector in question.
--- /dev/null
+ The QEMU build system architecture
+ ==================================
+
+This document aims to help developers understand the architecture of the
+QEMU build system. As with projects using GNU autotools, the QEMU build
+system has two stages, first the developer runs the "configure" script
+to determine the local build environment characteristics, then they run
+"make" to build the project. That is about where the similarities with
+GNU autotools end, so try to forget what you know about them.
+
+
+Stage 1: configure
+==================
+
+The QEMU configure script is written directly in shell, and should be
+compatible with any POSIX shell, hence it uses #!/bin/sh. An important
+implication of this is that it is important to avoid using bash-isms on
+development platforms where bash is the primary host.
+
+In contrast to autoconf scripts, QEMU's configure is expected to be
+silent while it is checking for features. It will only display output
+when an error occurs, or to show the final feature enablement summary
+on completion.
+
+Adding new checks to the configure script usually comprises the
+following tasks:
+
+ - Initialize one or more variables with the default feature state.
+
+ Ideally features should auto-detect whether they are present,
+ so try to avoid hardcoding the initial state to either enabled
+ or disabled, as that forces the user to pass a --enable-XXX
+ / --disable-XXX flag on every invocation of configure.
+
+ - Add support to the command line arg parser to handle any new
+ --enable-XXX / --disable-XXX flags required by the feature XXX.
+
+ - Add information to the help output message to report on the new
+ feature flag.
+
+ - Add code to perform the actual feature check. As noted above, try to
+ be fully dynamic in checking enablement/disablement.
+
+ - Add code to print out the feature status in the configure summary
+ upon completion.
+
+ - Add any new makefile variables to $config_host_mak on completion.
+
+
+Taking (a simplified version of) the probe for gnutls from configure,
+we have the following pieces:
+
+ # Initial variable state
+ gnutls=""
+
+ ..snip..
+
+ # Configure flag processing
+ --disable-gnutls) gnutls="no"
+ ;;
+ --enable-gnutls) gnutls="yes"
+ ;;
+
+ ..snip..
+
+ # Help output feature message
+ gnutls GNUTLS cryptography support
+
+ ..snip..
+
+ # Test for gnutls
+ if test "$gnutls" != "no"; then
+ if $pkg_config --exists "gnutls"; then
+ gnutls_cflags=`$pkg_config --cflags gnutls`
+ gnutls_libs=`$pkg_config --libs gnutls`
+ libs_softmmu="$gnutls_libs $libs_softmmu"
+ libs_tools="$gnutls_libs $libs_tools"
+ QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags"
+ gnutls="yes"
+ elif test "$gnutls" = "yes"; then
+ feature_not_found "gnutls" "Install gnutls devel"
+ else
+ gnutls="no"
+ fi
+ fi
+
+ ..snip..
+
+ # Completion feature summary
+ echo "GNUTLS support $gnutls"
+
+ ..snip..
+
+ # Define make variables
+ if test "$gnutls" = "yes" ; then
+ echo "CONFIG_GNUTLS=y" >> $config_host_mak
+ fi
+
+
+Helper functions
+----------------
+
+The configure script provides a variety of helper functions to assist
+developers in checking for system features:
+
+ - do_cc $ARGS...
+
+ Attempt to run the system C compiler passing it $ARGS...
+
+ - do_cxx $ARGS...
+
+ Attempt to run the system C++ compiler passing it $ARGS...
+
+ - compile_object $CFLAGS
+
+ Attempt to compile a test program with the system C compiler using
+ $CFLAGS. The test program must have been previously written to a file
+ called $TMPC.
+
+ - compile_prog $CFLAGS $LDFLAGS
+
+ Attempt to compile a test program with the system C compiler using
+ $CFLAGS and link it with the system linker using $LDFLAGS. The test
+ program must have been previously written to a file called $TMPC.
+
+ - has $COMMAND
+
+ Determine if $COMMAND exists in the current environment, either as a
+ shell builtin, or executable binary, returning 0 on success.
+
+ - path_of $COMMAND
+
+ Return the fully qualified path of $COMMAND, printing it to stdout,
+ and returning 0 on success.
+
+ - check_define $NAME
+
+ Determine if the macro $NAME is defined by the system C compiler
+
+ - check_include $NAME
+
+ Determine if the include $NAME file is available to the system C
+ compiler
+
+ - write_c_skeleton
+
+ Write a minimal C program main() function to the temporary file
+ indicated by $TMPC
+
+ - feature_not_found $NAME $REMEDY
+
+ Print a message to stderr that the feature $NAME was not available
+ on the system, suggesting the user try $REMEDY to address the
+ problem.
+
+ - error_exit $MESSAGE $MORE...
+
+ Print $MESSAGE to stderr, followed by $MORE... and then exit from the
+ configure script with non-zero status
+
+ - query_pkg_config $ARGS...
+
+ Run pkg-config passing it $ARGS. If QEMU is doing a static build,
+ then --static will be automatically added to $ARGS
+
+
+Stage 2: makefiles
+==================
+
+The use of GNU make is required with the QEMU build system.
+
+Although the source code is spread across multiple subdirectories, the
+build system should be considered largely non-recursive in nature, in
+contrast to common practices seen with automake. There is some recursive
+invocation of make, but this is related to the things being built,
+rather than the source directory structure.
+
+QEMU currently supports both VPATH and non-VPATH builds, so there are
+three general ways to invoke configure & perform a build.
+
+ - VPATH, build artifacts outside of QEMU source tree entirely
+
+ cd ../
+ mkdir build
+ cd build
+ ../qemu/configure
+ make
+
+ - VPATH, build artifacts in a subdir of QEMU source tree
+
+ mkdir build
+ cd build
+ ../configure
+ make
+
+ - non-VPATH, build artifacts everywhere
+
+ ./configure
+ make
+
+The QEMU maintainers generally recommend that a VPATH build is used by
+developers. Patches to QEMU are expected to ensure VPATH build still
+works.
+
+
+Module structure
+----------------
+
+There are a number of key outputs of the QEMU build system:
+
+ - Tools - qemu-img, qemu-nbd, qga (guest agent), etc
+ - System emulators - qemu-system-$ARCH
+ - Userspace emulators - qemu-$ARCH
+ - Unit tests
+
+The source code is highly modularized, split across many files to
+facilitate building of all of these components with as little duplicated
+compilation as possible. There can be considered to be two distinct
+groups of files, those which are independent of the QEMU emulation
+target and those which are dependent on the QEMU emulation target.
+
+In the target-independent set lives various general purpose helper code,
+such as error handling infrastructure, standard data structures,
+platform portability wrapper functions, etc. This code can be compiled
+once only and the .o files linked into all output binaries.
+
+In the target-dependent set lives CPU emulation, device emulation and
+much glue code. This sometimes also has to be compiled multiple times,
+once for each target being built.
+
+The utility code that is used by all binaries is built into a
+static archive called libqemuutil.a, which is then linked to all the
+binaries. In order to provide hooks that are only needed by some of the
+binaries, code in libqemuutil.a may depend on other functions that are
+not fully implemented by all QEMU binaries. To deal with this there is a
+second library called libqemustub.a which provides dummy stubs for all
+these functions. These will get lazy linked into the binary if the real
+implementation is not present. In this way, the libqemustub.a static
+library can be thought of as a portable implementation of the weak
+symbols concept. All binaries should link to both libqemuutil.a and
+libqemustub.a. e.g.
+
+ qemu-img$(EXESUF): qemu-img.o ..snip.. libqemuutil.a libqemustub.a
+
+
+Windows platform portability
+----------------------------
+
+On Windows, all binaries have the suffix '.exe', so all Makefile rules
+which create binaries must include the $(EXESUF) variable on the binary
+name. e.g.
+
+ qemu-img$(EXESUF): qemu-img.o ..snip..
+
+This expands to '.exe' on Windows, or '' on other platforms.
+
+A further complication for the system emulator binaries is that
+two separate binaries need to be generated.
+
+The main binary (e.g. qemu-system-x86_64.exe) is linked against the
+Windows console runtime subsystem. These are expected to be run from a
+command prompt window, and so will print stderr to the console that
+launched them.
+
+The second binary generated has a 'w' on the end of its name (e.g.
+qemu-system-x86_64w.exe) and is linked against the Windows graphical
+runtime subsystem. These are expected to be run directly from the
+desktop and will open up a dedicated console window for stderr output.
+
+The Makefile.target will generate the binary for the graphical subsystem
+first, and then use objcopy to relink it against the console subsystem
+to generate the second binary.
+
+
+Object variable naming
+----------------------
+
+The QEMU convention is to define variables to list different groups of
+object files. These are named with the convention $PREFIX-obj-y. For
+example the libqemuutil.a file will be linked with all objects listed
+in a variable 'util-obj-y'. So, for example, util/Makefile.objs will
+contain a set of definitions looking like
+
+ util-obj-y += bitmap.o bitops.o hbitmap.o
+ util-obj-y += fifo8.o
+ util-obj-y += acl.o
+ util-obj-y += error.o qemu-error.o
+
+When there is an object file which needs to be conditionally built based
+on some characteristic of the host system, the configure script will
+define a variable for the conditional. For example, on Windows it will
+define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a
+value of 'y'. It is now possible to use the config variables when
+listing object files. For example,
+
+ util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o
+ util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
+
+On Windows this expands to
+
+ util-obj-y += oslib-win32.o qemu-thread-win32.o
+ util-obj-n += oslib-posix.o qemu-thread-posix.o
+
+Since libqemuutil.a links in $(util-obj-y), the POSIX specific files
+listed against $(util-obj-n) are ignored on the Windows platform builds.
+
+
+CFLAGS / LDFLAGS / LIBS handling
+--------------------------------
+
+There are many different binaries being built with differing purposes,
+and some of them might even be 3rd party libraries pulled in via git
+submodules. As such the use of the global CFLAGS variable is generally
+avoided in QEMU, since it would apply to too many build targets.
+
+Flags that are needed by any QEMU code (i.e. everything *except* GIT
+submodule projects) are put in $(QEMU_CFLAGS) variable. For linker
+flags the $(LIBS) variable is sometimes used, but a couple of more
+targeted variables are preferred. $(libs_softmmu) is used for
+libraries that must be linked to system emulator targets, $(LIBS_TOOLS)
+is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used
+for the QEMU guest agent. There is currently no specific variable for
+the userspace emulator targets as the global $(LIBS), or more targeted
+variables shown below, are sufficient.
+
+In addition to these variables, it is possible to provide cflags and
+libs against individual source code files, by defining variables of the
+form $FILENAME-cflags and $FILENAME-libs. For example, the curl block
+driver needs to link to the libcurl library, so block/Makefile.objs defines
+some variables:
+
+ curl.o-cflags := $(CURL_CFLAGS)
+ curl.o-libs := $(CURL_LIBS)
+
+The scope is a little different between the two variables. The libs get
+used when linking any target binary that includes the curl.o object
+file, while the cflags get used when compiling the curl.c file only.
+
+
+Statically defined files
+------------------------
+
+The following key files are statically defined in the source tree, with
+the rules needed to build QEMU. Their behaviour is influenced by a
+number of dynamically created files listed later.
+
+- Makefile
+
+The main entry point used when invoking make to build all the components
+of QEMU. The default 'all' target will naturally result in the build of
+every component. The various tools and helper binaries are built
+directly via a non-recursive set of rules.
+
+Each system/userspace emulation target needs to have a slightly
+different set of make rules / variables. Thus, make will be recursively
+invoked for each of the emulation targets.
+
+The recursive invocation will end up processing the toplevel
+Makefile.target file (more on that later).
+
+
+- */Makefile.objs
+
+Since the source code is spread across multiple directories, the rules
+for each file are similarly modularized. Thus each subdirectory
+containing .c files will usually also contain a Makefile.objs file.
+These files are not directly invoked by a recursive make, but instead
+they are imported by the top level Makefile and/or Makefile.target
+
+Each Makefile.objs usually just declares a set of variables listing the
+.o files that need building from the source files in the directory. They
+will also define any custom linker or compiler flags. For example in
+block/Makefile.objs
+
+ block-obj-$(CONFIG_LIBISCSI) += iscsi.o
+ block-obj-$(CONFIG_CURL) += curl.o
+
+ ..snip...
+
+ iscsi.o-cflags := $(LIBISCSI_CFLAGS)
+ iscsi.o-libs := $(LIBISCSI_LIBS)
+ curl.o-cflags := $(CURL_CFLAGS)
+ curl.o-libs := $(CURL_LIBS)
+
+If there are any rules defined in the Makefile.objs file, they should
+all use $(obj) as a prefix to the target, e.g.
+
+ $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp
+
+
+- Makefile.target
+
+This file provides the entry point used to build each individual system
+or userspace emulator target. Each enabled target has its own
+subdirectory. For example if configure is run with the argument
+'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmmu'
+will be created, containing a 'Makefile' which symlinks back to
+Makefile.target
+
+So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up
+using Makefile.target for the build rules.
+
+
+- rules.mak
+
+This file provides the generic helper rules for invoking build tools, in
+particular the compiler and linker. This also contains the magic (hairy)
+'unnest-vars' function which is used to merge the variable definitions
+from all Makefile.objs in the source tree down into the main Makefile
+context.
+
+
+- default-configs/*.mak
+
+The files under default-configs/ control what emulated hardware is built
+into each QEMU system and userspace emulator targets. They merely
+contain a long list of config variable definitions. For example,
+default-configs/x86_64-softmmu.mak has:
+
+ include pci.mak
+ include sound.mak
+ include usb.mak
+ CONFIG_QXL=$(CONFIG_SPICE)
+ CONFIG_VGA_ISA=y
+ CONFIG_VGA_CIRRUS=y
+ CONFIG_VMWARE_VGA=y
+ CONFIG_VIRTIO_VGA=y
+ ...snip...
+
+These files rarely need changing unless new devices / hardware need to
+be enabled for a particular system/userspace emulation target
+
+
+- tests/Makefile
+
+Rules for building the unit tests. This file is included directly by the
+top level Makefile, so anything defined in this file will influence the
+entire build system. Care needs to be taken when writing rules for tests
+to ensure they only apply to the unit test execution / build.
+
+- tests/docker/Makefile.include
+
+Rules for Docker tests. Like tests/Makefile, this file is included
+directly by the top level Makefile, anything defined in this file will
+influence the entire build system.
+
+- po/Makefile
+
+Rules for building and installing the binary message catalogs from the
+text .po file sources. This almost never needs changing for any reason.
+
+
+Dynamically created files
+-------------------------
+
+The following files are generated dynamically by configure in order to
+control the behaviour of the statically defined makefiles. This avoids
+the need for QEMU makefiles to go through any pre-processing as seen
+with autotools, where Makefile.am generates Makefile.in which generates
+Makefile.
+
+
+- config-host.mak
+
+When configure has determined the characteristics of the build host it
+will write a long list of variables to config-host.mak file. This
+provides the various install directories, compiler / linker flags and a
+variety of CONFIG_* variables related to optionally enabled features.
+This is imported by the top level Makefile in order to tailor the build
+output.
+
+The variables defined here are those which are applicable to all QEMU
+build outputs. Variables which are potentially different for each
+emulator target are defined by the next file...
+
+It is also used as a dependency checking mechanism. If make sees that
+the modification timestamp on configure is newer than that on
+config-host.mak, then configure will be re-run.
+
+
+- config-host.h
+
+The config-host.h file is used by source code to determine what features
+are enabled. It is generated from the contents of config-host.mak using
+the scripts/create_config program. This extracts all the CONFIG_* variables,
+most of the HOST_* variables and a few other misc variables from
+config-host.mak, formatting them as C preprocessor macros.
+
+
+- $TARGET-NAME/config-target.mak
+
+TARGET-NAME is the name of a system or userspace emulator, for example,
+x86_64-softmmu denotes the system emulator for the x86_64 architecture.
+This file contains the variables which need to vary on a per-target
+basis. For example, it will indicate whether KVM or Xen are enabled for
+the target and any other potential custom libraries needed for linking
+the target.
+
+
+- $TARGET-NAME/config-devices.mak
+
+TARGET-NAME is again the name of a system or userspace emulator. The
+config-devices.mak file is automatically generated by make using the
+scripts/make_device_config.sh program, feeding it the
+default-configs/$TARGET-NAME file as input.
+
+
+- $TARGET-NAME/Makefile
+
+This is the entrypoint used when make recurses to build a single system
+or userspace emulator target. It is merely a symlink back to the
+Makefile.target in the top level.
--- /dev/null
+DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt)
+===================================================
+
+QEMU often uses reference counts to track data structures that are being
+accessed and should not be freed. For example, a loop that invokes
+callbacks like this is not safe:
+
+ QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
+ if (ioh->revents & G_IO_OUT) {
+ ioh->fd_write(ioh->opaque);
+ }
+ }
+
+QLIST_FOREACH_SAFE protects against deletion of the current node (ioh)
+by stashing away its "next" pointer. However, ioh->fd_write could
+actually delete the next node from the list. The simplest way to
+avoid this is to mark the node as deleted, and remove it from the
+list in the above loop:
+
+ QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
+ if (ioh->deleted) {
+ QLIST_REMOVE(ioh, next);
+ g_free(ioh);
+ } else {
+ if (ioh->revents & G_IO_OUT) {
+ ioh->fd_write(ioh->opaque);
+ }
+ }
+ }
+
+If however this loop must also be reentrant, i.e. it is possible that
+ioh->fd_write invokes the loop again, some kind of counting is needed:
+
+ walking_handlers++;
+ QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
+ if (ioh->deleted) {
+ if (walking_handlers == 1) {
+ QLIST_REMOVE(ioh, next);
+ g_free(ioh);
+ }
+ } else {
+ if (ioh->revents & G_IO_OUT) {
+ ioh->fd_write(ioh->opaque);
+ }
+ }
+ }
+ walking_handlers--;
+
+One may think of using the RCU primitives, rcu_read_lock() and
+rcu_read_unlock(); effectively, the RCU nesting count would take
+the place of the walking_handlers global variable. Indeed,
+reference counting and RCU have similar purposes, but their usage in
+general is complementary:
+
+- reference counting is fine-grained and limited to a single data
+ structure; RCU delays reclamation of *all* RCU-protected data
+ structures;
+
+- reference counting works even in the presence of code that keeps
+ a reference for a long time; RCU critical sections in principle
+ should be kept short;
+
+- reference counting is often applied to code that is not thread-safe
+ but is reentrant; in fact, usage of reference counting in QEMU predates
+ the introduction of threads by many years. RCU is generally used to
+ protect readers from other threads freeing memory after concurrent
+ modifications to a data structure.
+
+- reclaiming data can be done by a separate thread in the case of RCU;
+ this can improve performance, but also delay reclamation undesirably.
+ With reference counting, reclamation is deterministic.
+
+This file documents QemuLockCnt, an abstraction for using reference
+counting in code that has to be both thread-safe and reentrant.
+
+
+QemuLockCnt concepts
+--------------------
+
+A QemuLockCnt comprises both a counter and a mutex; it has primitives
+to increment and decrement the counter, and to take and release the
+mutex. The counter notes how many visits to the data structures are
+taking place (the visits could be from different threads, or there could
+be multiple reentrant visits from the same thread). The basic rules
+governing the counter/mutex pair then are the following:
+
+- Data protected by the QemuLockCnt must not be freed unless the
+ counter is zero and the mutex is taken.
+
+- A new visit cannot be started while the counter is zero and the
+ mutex is taken.
+
+Most of the time, the mutex protects all writes to the data structure,
+not just frees, though there could be cases where this is not necessary.
+
+Reads, instead, can be done without taking the mutex, as long as the
+readers and writers use the same macros that are used for RCU, for
+example atomic_rcu_read, atomic_rcu_set, QLIST_FOREACH_RCU, etc. This is
+because the reads are done outside a lock and a set or QLIST_INSERT_HEAD
+can happen concurrently with the read. The RCU API ensures that the
+processor and the compiler see all required memory barriers.
+
+This could be implemented simply by protecting the counter with the
+mutex, for example:
+
+ // (1)
+ qemu_mutex_lock(&walking_handlers_mutex);
+ walking_handlers++;
+ qemu_mutex_unlock(&walking_handlers_mutex);
+
+ ...
+
+ // (2)
+ qemu_mutex_lock(&walking_handlers_mutex);
+ if (--walking_handlers == 0) {
+ QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
+ if (ioh->deleted) {
+ QLIST_REMOVE(ioh, next);
+ g_free(ioh);
+ }
+ }
+ }
+ qemu_mutex_unlock(&walking_handlers_mutex);
+
+Here, no frees can happen in the code represented by the ellipsis.
+If another thread is executing critical section (2), that part of
+the code cannot be entered, because the thread will not be able
+to increment the walking_handlers variable. And of course
+during the visit any other thread will see a nonzero value for
+walking_handlers, as in the single-threaded code.
+
+Note that it is possible for multiple concurrent accesses to delay
+the cleanup arbitrarily; in other words, for the walking_handlers
+counter to never become zero. For this reason, this technique is
+more easily applicable if concurrent access to the structure is rare.
+
+However, critical sections are easy to forget since you have to do
+them for each modification of the counter. QemuLockCnt ensures that
+all modifications of the counter take the lock appropriately, and it
+can also be more efficient in two ways:
+
+- it avoids taking the lock for many operations (for example
+ incrementing the counter while it is non-zero);
+
+- on some platforms, one can implement QemuLockCnt to hold the counter
+ and the mutex in a single word, making the fast path no more expensive
+ than simply managing a counter using atomic operations (see
+ docs/atomics.txt). This can be very helpful if concurrent access to
+ the data structure is expected to be rare.
+
+
+Using the same mutex for frees and writes can still incur some small
+inefficiencies; for example, a visit can never start if the counter is
+zero and the mutex is taken---even if the mutex is taken by a write,
+which in principle need not block a visit of the data structure.
+However, these are usually not a problem if any of the following
+assumptions are valid:
+
+- concurrent access is possible but rare
+
+- writes are rare
+
+- writes are frequent, but this kind of write (e.g. appending to a
+ list) has a very small critical section.
+
+For example, QEMU uses QemuLockCnt to manage an AioContext's list of
+bottom halves and file descriptor handlers. Modifications to the list
+of file descriptor handlers are rare. Creation of a new bottom half is
+frequent and can happen on a fast path; however: 1) it is almost never
+concurrent with a visit to the list of bottom halves; 2) it only has
+three instructions in the critical path, two assignments and a smp_wmb().
+
+
+QemuLockCnt API
+---------------
+
+The QemuLockCnt API is described in include/qemu/thread.h.
+
+
+QemuLockCnt usage
+-----------------
+
+This section explains the typical usage patterns for QemuLockCnt functions.
+
+Setting a variable to a non-NULL value can be done between
+qemu_lockcnt_lock and qemu_lockcnt_unlock:
+
+ qemu_lockcnt_lock(&xyz_lockcnt);
+ if (!xyz) {
+ new_xyz = g_new(XYZ, 1);
+ ...
+ atomic_rcu_set(&xyz, new_xyz);
+ }
+ qemu_lockcnt_unlock(&xyz_lockcnt);
+
+Accessing the value can be done between qemu_lockcnt_inc and
+qemu_lockcnt_dec:
+
+ qemu_lockcnt_inc(&xyz_lockcnt);
+ if (xyz) {
+ XYZ *p = atomic_rcu_read(&xyz);
+ ...
+ /* Accesses can now be done through "p". */
+ }
+ qemu_lockcnt_dec(&xyz_lockcnt);
+
+Freeing the object can similarly use qemu_lockcnt_lock and
+qemu_lockcnt_unlock, but you also need to ensure that the count
+is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc
+takes the QemuLockCnt's lock, the count cannot become non-zero while
+the object is being freed. Freeing an object looks like this:
+
+ qemu_lockcnt_lock(&xyz_lockcnt);
+ if (!qemu_lockcnt_count(&xyz_lockcnt)) {
+ g_free(xyz);
+ xyz = NULL;
+ }
+ qemu_lockcnt_unlock(&xyz_lockcnt);
+
+If an object has to be freed right after a visit, you can combine
+the decrement, the locking and the check on count as follows:
+
+ qemu_lockcnt_inc(&xyz_lockcnt);
+ if (xyz) {
+ XYZ *p = atomic_rcu_read(&xyz);
+ ...
+ /* Accesses can now be done through "p". */
+ }
+ if (qemu_lockcnt_dec_and_lock(&xyz_lockcnt)) {
+ g_free(xyz);
+ xyz = NULL;
+ qemu_lockcnt_unlock(&xyz_lockcnt);
+ }
+
+QemuLockCnt can also be used to access a list as follows:
+
+ qemu_lockcnt_inc(&io_handlers_lockcnt);
+ QLIST_FOREACH_RCU(ioh, &io_handlers, next) {
+ if (ioh->revents & G_IO_OUT) {
+ ioh->fd_write(ioh->opaque);
+ }
+ }
+
+ if (qemu_lockcnt_dec_and_lock(&io_handlers_lockcnt)) {
+ QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
+ if (ioh->deleted) {
+ QLIST_REMOVE(ioh, next);
+ g_free(ioh);
+ }
+ }
+ qemu_lockcnt_unlock(&io_handlers_lockcnt);
+ }
+
+Again, the RCU primitives are used because new items can be added to the
+list during the walk. QLIST_FOREACH_RCU ensures that the processor and
+the compiler see the appropriate memory barriers.
+
+An alternative pattern uses qemu_lockcnt_dec_if_lock:
+
+ qemu_lockcnt_inc(&io_handlers_lockcnt);
+ QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) {
+ if (ioh->deleted) {
+ if (qemu_lockcnt_dec_if_lock(&io_handlers_lockcnt)) {
+ QLIST_REMOVE(ioh, next);
+ g_free(ioh);
+ qemu_lockcnt_inc_and_unlock(&io_handlers_lockcnt);
+ }
+ } else {
+ if (ioh->revents & G_IO_OUT) {
+ ioh->fd_write(ioh->opaque);
+ }
+ }
+ }
+ qemu_lockcnt_dec(&io_handlers_lockcnt);
+
+Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock,
+because there is no special task to do if the count goes from 1 to 0.
--- /dev/null
+The memory API
+==============
+
+The memory API models the memory and I/O buses and controllers of a QEMU
+machine. It attempts to allow modelling of:
+
+ - ordinary RAM
+ - memory-mapped I/O (MMIO)
+ - memory controllers that can dynamically reroute physical memory regions
+ to different destinations
+
+The memory model provides support for
+
+ - tracking RAM changes by the guest
+ - setting up coalesced memory for kvm
+ - setting up ioeventfd regions for kvm
+
+Memory is modelled as an acyclic graph of MemoryRegion objects. Sinks
+(leaves) are RAM and MMIO regions, while other nodes represent
+buses, memory controllers, and memory regions that have been rerouted.
+
+In addition to MemoryRegion objects, the memory API provides AddressSpace
+objects for every root and possibly for intermediate MemoryRegions too.
+These represent memory as seen from the CPU or a device's viewpoint.
+
+Types of regions
+----------------
+
+There are multiple types of memory regions (all represented by a single C type
+MemoryRegion):
+
+- RAM: a RAM region is simply a range of host memory that can be made available
+ to the guest.
+ You typically initialize these with memory_region_init_ram(). Some special
+ purposes require the variants memory_region_init_resizeable_ram(),
+ memory_region_init_ram_from_file(), or memory_region_init_ram_ptr().
+
+- MMIO: a range of guest memory that is implemented by host callbacks;
+ each read or write causes a callback to be called on the host.
+ You initialize these with memory_region_init_io(), passing it a
+ MemoryRegionOps structure describing the callbacks.
+
+- ROM: a ROM memory region works like RAM for reads (directly accessing
+ a region of host memory), and forbids writes. You initialize these with
+ memory_region_init_rom().
+
+- ROM device: a ROM device memory region works like RAM for reads
+ (directly accessing a region of host memory), but like MMIO for
+ writes (invoking a callback). You initialize these with
+ memory_region_init_rom_device().
+
+- IOMMU region: an IOMMU region translates addresses of accesses made to it
+ and forwards them to some other target memory region. As the name suggests,
+ these are only needed for modelling an IOMMU, not for simple devices.
+ You initialize these with memory_region_init_iommu().
+
+- container: a container simply includes other memory regions, each at
+ a different offset. Containers are useful for grouping several regions
+ into one unit. For example, a PCI BAR may be composed of a RAM region
+ and an MMIO region.
+
+ A container's subregions are usually non-overlapping. In some cases it is
+ useful to have overlapping regions; for example a memory controller that
+ can overlay a subregion of RAM with MMIO or ROM, or a PCI controller
+ that does not prevent cards from claiming overlapping BARs.
+
+ You initialize a pure container with memory_region_init().
+
+- alias: a subsection of another region. Aliases allow a region to be
+ split apart into discontiguous regions. Examples of uses are memory banks
+ used when the guest address space is smaller than the amount of RAM
+ addressed, or a memory controller that splits main memory to expose a "PCI
+ hole". Aliases may point to any type of region, including other aliases,
+ but an alias may not point back to itself, directly or indirectly.
+ You initialize these with memory_region_init_alias().
+
+- reservation region: a reservation region is primarily for debugging.
+ It claims I/O space that is not supposed to be handled by QEMU itself.
+ The typical use is to track parts of the address space which will be
+ handled by the host kernel when KVM is enabled.
+ You initialize these with memory_region_init_reservation(), or by
+ passing a NULL callback parameter to memory_region_init_io().
+
+It is valid to add subregions to a region which is not a pure container
+(that is, to an MMIO, RAM or ROM region). This means that the region
+will act like a container, except that any addresses within the container's
+region which are not claimed by any subregion are handled by the
+container itself (ie by its MMIO callbacks or RAM backing). However
+it is generally possible to achieve the same effect with a pure container
+one of whose subregions is a low priority "background" region covering
+the whole address range; this is often clearer and is preferred.
+Subregions cannot be added to an alias region.
+
+Region names
+------------
+
+Regions are assigned names by the constructor. For most regions these are
+only used for debugging purposes, but RAM regions also use the name to identify
+live migration sections. This means that RAM region names need to have ABI
+stability.
+
+Region lifecycle
+----------------
+
+A region is created by one of the memory_region_init*() functions and
+attached to an object, which acts as its owner or parent. QEMU ensures
+that the owner object remains alive as long as the region is visible to
+the guest, or as long as the region is in use by a virtual CPU or another
+device. For example, the owner object will not die between an
+address_space_map operation and the corresponding address_space_unmap.
+
+After creation, a region can be added to an address space or a
+container with memory_region_add_subregion(), and removed using
+memory_region_del_subregion().
+
+Various region attributes (read-only, dirty logging, coalesced mmio,
+ioeventfd) can be changed during the region lifecycle. They take effect
+as soon as the region is made visible. This can be immediately, later,
+or never.
+
+Destruction of a memory region happens automatically when the owner
+object dies.
+
+If however the memory region is part of a dynamically allocated data
+structure, you should call object_unparent() to destroy the memory region
+before the data structure is freed. For an example see VFIOMSIXInfo
+and VFIOQuirk in hw/vfio/pci.c.
+
+You must not destroy a memory region as long as it may be in use by a
+device or CPU. In order to do this, as a general rule do not create or
+destroy memory regions dynamically during a device's lifetime, and only
+call object_unparent() in the memory region owner's instance_finalize
+callback. The dynamically allocated data structure that contains the
+memory region then should obviously be freed in the instance_finalize
+callback as well.
+
+If you break this rule, the following situation can happen:
+
+- the memory region's owner had a reference taken via memory_region_ref
+ (for example by address_space_map)
+
+- the region is unparented, and has no owner anymore
+
+- when address_space_unmap is called, the reference to the memory region's
+ owner is leaked.
+
+
+There is an exception to the above rule: it is okay to call
+object_unparent at any time for an alias or a container region. It is
+therefore also okay to create or destroy alias and container regions
+dynamically during a device's lifetime.
+
+This exceptional usage is valid because aliases and containers only help
+QEMU build the guest's memory map; they are never accessed directly.
+memory_region_ref and memory_region_unref are never called on aliases
+or containers, and the above situation then cannot happen. Exploiting
+this exception is rarely necessary, and therefore it is discouraged,
+but nevertheless it is used in a few places.
+
+For regions that "have no owner" (NULL is passed at creation time), the
+machine object is actually used as the owner. Since instance_finalize is
+never called for the machine object, you must never call object_unparent
+on regions that have no owner, unless they are aliases or containers.
+
+
+Overlapping regions and priority
+--------------------------------
+Usually, regions may not overlap each other; a memory address decodes into
+exactly one target. In some cases it is useful to allow regions to overlap,
+and sometimes to control which of the overlapping regions is visible to the
+guest. This is done with memory_region_add_subregion_overlap(), which
+allows the region to overlap any other region in the same container, and
+specifies a priority that allows the core to decide which of two regions at
+the same address is visible (highest wins).
+Priority values are signed, and the default value is zero. This means that
+you can use memory_region_add_subregion_overlap() both to specify a region
+that must sit 'above' any others (with a positive priority) and also a
+background region that sits 'below' others (with a negative priority).
+
+If the higher priority region in an overlap is a container or alias, then
+the lower priority region will appear in any "holes" that the higher priority
+region has left by not mapping subregions to that area of its address range.
+(This applies recursively -- if the subregions are themselves containers or
+aliases that leave holes then the lower priority region will appear in these
+holes too.)
+
+For example, suppose we have a container A of size 0x8000 with two subregions
+B and C. B is a container mapped at 0x2000, size 0x4000, priority 2; C is
+an MMIO region mapped at 0x0, size 0x6000, priority 1. B currently has two
+of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at
+offset 0x2000. As a diagram:
+
+ 0 1000 2000 3000 4000 5000 6000 7000 8000
+ |------|------|------|------|------|------|------|------|
+ A: [ ]
+ C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC]
+ B: [ ]
+ D: [DDDDD]
+ E: [EEEEE]
+
+The regions that will be seen within this address range then are:
+ [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC]
+
+Since B has higher priority than C, its subregions appear in the flat map
+even where they overlap with C. In ranges where B has not mapped anything
+C's region appears.
+
+If B had provided its own MMIO operations (ie it was not a pure container)
+then these would be used for any addresses in its range not handled by
+D or E, and the result would be:
+ [CCCCCCCCCCCC][DDDDD][BBBBB][EEEEE][BBBBB]
+
+Priority values are local to a container, because the priorities of two
+regions are only compared when they are both children of the same container.
+This means that the device in charge of the container (typically modelling
+a bus or a memory controller) can use them to manage the interaction of
+its child regions without any side effects on other parts of the system.
+In the example above, the priorities of D and E are unimportant because
+they do not overlap each other. It is the relative priority of B and C
+that causes D and E to appear on top of C: D and E's priorities are never
+compared against the priority of C.
+
+Visibility
+----------
+The memory core uses the following rules to select a memory region when the
+guest accesses an address:
+
+- all direct subregions of the root region are matched against the address, in
+ descending priority order
+ - if the address lies outside the region offset/size, the subregion is
+ discarded
+ - if the subregion is a leaf (RAM or MMIO), the search terminates, returning
+ this leaf region
+ - if the subregion is a container, the same algorithm is used within the
+ subregion (after the address is adjusted by the subregion offset)
+ - if the subregion is an alias, the search is continued at the alias target
+ (after the address is adjusted by the subregion offset and alias offset)
+ - if a recursive search within a container or alias subregion does not
+ find a match (because of a "hole" in the container's coverage of its
+ address range), then if this is a container with its own MMIO or RAM
+ backing the search terminates, returning the container itself. Otherwise
+ we continue with the next subregion in priority order
+- if none of the subregions match the address then the search terminates
+ with no match found
+
+Example memory map
+------------------
+
+system_memory: container@0-2^48-1
+ |
+ +---- lomem: alias@0-0xdfffffff ---> #ram (0-0xdfffffff)
+ |
+ +---- himem: alias@0x100000000-0x11fffffff ---> #ram (0xe0000000-0xffffffff)
+ |
+ +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff)
+ | (prio 1)
+ |
+ +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff)
+
+pci (0-2^32-1)
+ |
+ +--- vga-area: container@0xa0000-0xbffff
+ | |
+ | +--- alias@0x00000-0x7fff ---> #vram (0x010000-0x017fff)
+ | |
+ | +--- alias@0x08000-0xffff ---> #vram (0x020000-0x027fff)
+ |
+ +---- vram: ram@0xe1000000-0xe1ffffff
+ |
+ +---- vga-mmio: mmio@0xe2000000-0xe200ffff
+
+ram: ram@0x00000000-0xffffffff
+
+This is a (simplified) PC memory map. The 4GB RAM block is mapped into the
+system address space via two aliases: "lomem" is a 1:1 mapping of the first
+3.5GB; "himem" maps the last 0.5GB at address 4GB. This leaves 0.5GB for the
+so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with
+4GB of memory.
+
+The memory controller diverts addresses in the range 640K-768K to the PCI
+address space. This is modelled using the "vga-window" alias, mapped at a
+higher priority so it obscures the RAM at the same addresses. The vga window
+can be removed by programming the memory controller; this is modelled by
+removing the alias and exposing the RAM underneath.
+
+The pci address space is not a direct child of the system address space, since
+we only want parts of it to be visible (we accomplish this using aliases).
+It has two subregions: vga-area models the legacy vga window and is occupied
+by two 32K memory banks pointing at two sections of the framebuffer.
+In addition the vram is mapped as a BAR at address e1000000, and an additional
+BAR containing MMIO registers is mapped after it.
+
+Note that if the guest maps a BAR outside the PCI hole, it would not be
+visible as the pci-hole alias clips it to a 0.5GB range.
+
+MMIO Operations
+---------------
+
+MMIO regions are provided with ->read() and ->write() callbacks; in addition
+various constraints can be supplied to control how these callbacks are called:
+
+ - .valid.min_access_size, .valid.max_access_size define the access sizes
+ (in bytes) which the device accepts; accesses outside this range will
+ have device and bus specific behaviour (ignored, or machine check)
+ - .valid.unaligned specifies that the *device being modelled* supports
+ unaligned accesses; if false, unaligned accesses will invoke the
+ appropriate bus or CPU specific behaviour.
+ - .impl.min_access_size, .impl.max_access_size define the access sizes
+ (in bytes) supported by the *implementation*; other access sizes will be
+ emulated using the ones available. For example a 4-byte write will be
+ emulated using four 1-byte writes, if .impl.max_access_size = 1.
+ - .impl.unaligned specifies that the *implementation* supports unaligned
+ accesses; if false, unaligned accesses will be emulated by two aligned
+ accesses.
+ - .old_mmio eases the porting of code that was formerly using
+ cpu_register_io_memory(). It should not be used in new code.
--- /dev/null
+= Migration =
+
+QEMU has code to load/save the state of the guest that it is running.
+These are two complementary operations. Saving the state just does
+that, saves the state for each device that the guest is running.
+Restoring a guest is just the opposite operation: we need to load the
+state of each device.
+
+For this to work, QEMU has to be launched with the same arguments
+both times. I.e. it can only restore the state into a guest that has
+the same devices as the one from which it was saved (this last
+requirement can be relaxed a bit, but for now we can consider that the
+configuration has to be exactly the same).
+
+Once we are able to save/restore a guest, a new functionality is
+requested: migration. This means that QEMU is able to start on one
+machine and be "migrated" to another machine, i.e. moved to
+another machine.
+
+Next was the "live migration" functionality. This is important
+because some guests run with a lot of state (especially RAM), and it
+can take a while to move all state from one machine to another. Live
+migration allows the guest to continue running while the state is
+transferred. The guest has to be stopped only while the last part of
+the state is transferred. Typically the time that the guest is
+unresponsive during live migration is in the low hundreds of
+milliseconds (notice that this depends on a lot of things).
+
+=== Types of migration ===
+
+Now that we have talked about live migration, there are several ways
+to do migration:
+
+- tcp migration: do the migration using tcp sockets
+- unix migration: do the migration using unix sockets
+- exec migration: do the migration using the stdin/stdout through a process.
+- fd migration: do the migration using a file descriptor that is
+ passed to QEMU. QEMU doesn't care how this file descriptor is opened.
+
+All these four migration protocols use the same infrastructure to
+save/restore device state. This infrastructure is shared with the
+savevm/loadvm functionality.
+
+=== State Live Migration ===
+
+This is used for RAM and block devices. It is not yet ported to vmstate.
+<Fill more information here>
+
+=== What is the common infrastructure ===
+
+QEMU uses a QEMUFile abstraction to be able to do migration. Any type
+of migration that wants to use QEMU infrastructure has to create a
+QEMUFile with:
+
+QEMUFile *qemu_fopen_ops(void *opaque,
+ QEMUFilePutBufferFunc *put_buffer,
+ QEMUFileGetBufferFunc *get_buffer,
+ QEMUFileCloseFunc *close);
+
+The functions have the following functionality:
+
+This function writes a chunk of data to a file at the given position.
+The pos argument can be ignored if the file is only used for
+streaming. The handler should try to write all of the data it can.
+
+typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
+ int64_t pos, int size);
+
+Read a chunk of data from a file at the given position. The pos argument
+can be ignored if the file is only used for streaming. The number of
+bytes actually read should be returned.
+
+typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
+ int64_t pos, int size);
+
+Close a file and return an error code.
+
+typedef int (QEMUFileCloseFunc)(void *opaque);
+
+You can use any internal state that you need using the opaque void *
+pointer that is passed to all functions.
+
+The important functions for us are put_buffer()/get_buffer(), which
+allow us to write/read a buffer into/from the QEMUFile.
+
+=== How to save the state of one device ===
+
+The state of a device is saved using intermediate buffers. There are
+some helper functions to assist this saving.
+
+There is a new concept that we have to explain here: device state
+version. When we migrate a device, we save/load the state as a series
+of fields. Sometimes, due to bugs or new functionality, we need to
+change the state to store more/different information. We use the
+version to identify each time that we do a change. Each version is
+associated with a series of fields saved. The save_state always saves
+the state as the newer version. But load_state sometimes is able to
+load state from an older version.
+
+=== Legacy way ===
+
+This way is going to disappear as soon as all current users are ported to VMSTATE.
+
+Each device has to register two functions, one to save the state and
+another to load the state back.
+
+int register_savevm(DeviceState *dev,
+ const char *idstr,
+ int instance_id,
+ int version_id,
+ SaveStateHandler *save_state,
+ LoadStateHandler *load_state,
+ void *opaque);
+
+typedef void SaveStateHandler(QEMUFile *f, void *opaque);
+typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
+
+The important functions for the device state format are the save_state
+and load_state. Notice that load_state receives a version_id
+parameter to know what state format it is receiving. save_state doesn't
+have a version_id parameter because it always uses the latest version.
+
+=== VMState ===
+
+The legacy way of saving/loading state of the device had the problem
+that we have to maintain two functions in sync. If we did one change
+in one of them and not in the other, we would get a failed migration.
+
+VMState changed the way that state is saved/loaded. Instead of using
+a function to save the state and another to load it, it was changed to
+a declarative way of what the state consisted of. Now VMState is able
+to interpret that definition to be able to load/save the state. As
+the state is declared only once, it can't go out of sync in the
+save/load functions.
+
+An example (from hw/input/pckbd.c)
+
+static const VMStateDescription vmstate_kbd = {
+ .name = "pckbd",
+ .version_id = 3,
+ .minimum_version_id = 3,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8(write_cmd, KBDState),
+ VMSTATE_UINT8(status, KBDState),
+ VMSTATE_UINT8(mode, KBDState),
+ VMSTATE_UINT8(pending, KBDState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+We are declaring the state with name "pckbd".
+The version_id is 3, and the fields are 4 uint8_t in a KBDState structure.
+We registered this with:
+
+ vmstate_register(NULL, 0, &vmstate_kbd, s);
+
+Note: talk about how vmstate <-> qdev interact, and what the instance ids mean.
+
+You can search for VMSTATE_* macros for lots of types used in QEMU in
+include/hw/hw.h.
+
+=== More about versions ===
+
+Version numbers are intended for major incompatible changes to the
+migration of a device, and using them breaks backwards-migration
+compatibility; in general most changes can be made by adding Subsections
+(see below) or _TEST macros (see below) which won't break compatibility.
+
+You can see that there are several version fields:
+
+- version_id: the maximum version_id supported by VMState for that device.
+- minimum_version_id: the minimum version_id that VMState is able to understand
+ for that device.
+- minimum_version_id_old: For devices that were not able to port to vmstate, we can
+ assign a function that knows how to read this old state. This field is
+ ignored if there is no load_state_old handler.
+
+So, VMState is able to read versions from minimum_version_id to
+version_id. And the function load_state_old() (if present) is able to
+load state from minimum_version_id_old to minimum_version_id. This
+function is deprecated and will be removed when no more users are left.
+
+Saving state will always create a section with the 'version_id' value
+and thus can't be loaded by any older QEMU.
+
+=== Massaging functions ===
+
+Sometimes, it is not enough to be able to save the state directly
+from one structure, we need to fill the correct values there. One
+example is when we are using kvm. Before saving the cpu state, we
+need to ask kvm to copy to QEMU the state that it is using. And the
+opposite when we are loading the state, we need a way to tell kvm to
+load the state for the cpu that we have just loaded from the QEMUFile.
+
+The functions to do that are inside a vmstate definition, and are called:
+
+- int (*pre_load)(void *opaque);
+
+ This function is called before we load the state of one device.
+
+- int (*post_load)(void *opaque, int version_id);
+
+ This function is called after we load the state of one device.
+
+- void (*pre_save)(void *opaque);
+
+ This function is called before we save the state of one device.
+
+Example: You can look at hpet.c, which uses the three functions to
+ massage the state that is transferred.
+
+If you use memory API functions that update memory layout outside
+initialization (i.e., in response to a guest action), this is a strong
+indication that you need to call these functions in a post_load callback.
+Examples of such memory API functions are:
+
+ - memory_region_add_subregion()
+ - memory_region_del_subregion()
+ - memory_region_set_readonly()
+ - memory_region_set_enabled()
+ - memory_region_set_address()
+ - memory_region_set_alias_offset()
+
+=== Subsections ===
+
+The use of version_id allows us to migrate from older versions
+to newer versions of a device, but not the other way around. This
+makes it very complicated to fix bugs in stable branches. If we need to
+add anything to the state to fix a bug, we have to disable migration
+to older versions that don't have that bug-fix (i.e. a new field).
+
+But sometimes, that bug-fix is only needed sometimes, not always. For
+instance, if the device is in the middle of a DMA operation, it is
+using a specific functionality, ....
+
+It is impossible to create a way to make migration from any version to
+any other version work. But we can do better than only allowing
+migration from older versions to newer ones. For those fields that are
+only needed sometimes, we add the idea of subsections. A subsection
+is "like" a device vmstate, but with one particularity: it has a Boolean
+function that tells whether its values need to be sent or not. If
+this function returns false, the subsection is not sent.
+
+On the receiving side, if we find a subsection for a device that we
+don't understand, we just fail the migration. If we understand all
+the subsections, then we load the state with success.
+
+One important note is that the post_load() function is called "after"
+loading all subsections, because a newer subsection could change the
+same value that it uses.
+
+Example:
+
+static bool ide_drive_pio_state_needed(void *opaque)
+{
+ IDEState *s = opaque;
+
+ return ((s->status & DRQ_STAT) != 0)
+ || (s->bus->error_status & BM_STATUS_PIO_RETRY);
+}
+
+const VMStateDescription vmstate_ide_drive_pio_state = {
+ .name = "ide_drive/pio_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .pre_save = ide_drive_pio_pre_save,
+ .post_load = ide_drive_pio_post_load,
+ .needed = ide_drive_pio_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_INT32(req_nb_sectors, IDEState),
+ VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1,
+ vmstate_info_uint8, uint8_t),
+ VMSTATE_INT32(cur_io_buffer_offset, IDEState),
+ VMSTATE_INT32(cur_io_buffer_len, IDEState),
+ VMSTATE_UINT8(end_transfer_fn_idx, IDEState),
+ VMSTATE_INT32(elementary_transfer_size, IDEState),
+ VMSTATE_INT32(packet_transfer_size, IDEState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+const VMStateDescription vmstate_ide_drive = {
+ .name = "ide_drive",
+ .version_id = 3,
+ .minimum_version_id = 0,
+ .post_load = ide_drive_post_load,
+ .fields = (VMStateField[]) {
+ .... several fields ....
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription*[]) {
+ &vmstate_ide_drive_pio_state,
+ NULL
+ }
+};
+
+Here we have a subsection for the pio state. We only need to
+save/send this state when we are in the middle of a pio operation
+(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is
+not enabled, the values in those fields are garbage and don't need to
+be sent.
+
+Using a condition function that checks a 'property' to determine whether
+to send a subsection allows backwards migration compatibility when
+new subsections are added.
+
+For example:
+ a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and
+ default it to true.
+ b) Add an entry to the HW_COMPAT_ for the previous version
+ that sets the property to false.
+ c) Add a static bool support_foo function that tests the property.
+ d) Add a subsection with a .needed set to the support_foo function
+ e) (potentially) Add a pre_load that sets up a default value for 'foo'
+ to be used if the subsection isn't loaded.
+
+Now that subsection will not be generated when using an older
+machine type and the migration stream will be accepted by older
+QEMU versions. pre-load functions can be used to initialise state
+on the newer version so that they default to suitable values
+when loading streams created by older QEMU versions that do not
+generate the subsection.
+
+In some cases subsections are added for data that had been accidentally
+omitted by earlier versions; if the missing data causes the migration
+process to succeed but the guest to behave badly then it may be better
+to send the subsection and cause the migration to explicitly fail
+with the unknown subsection error. If the bad behaviour only happens
+with certain data values, making the subsection conditional on
+the data value (rather than the machine type) allows migrations to succeed
+in most cases. In general the preference is to tie the subsection to
+the machine type, and allow reliable migrations, unless the behaviour
+from omission of the subsection is really bad.
+
+= Not sending existing elements =
+
+Sometimes members of the VMState are no longer needed:
+ - removing them will break migration compatibility;
+ - making them version dependent and bumping the version will break backwards
+   migration compatibility.
+
+The best way is to:
+ a) Add a new property/compatibility/function in the same way as for
+ subsections above.
+ b) replace the VMSTATE macro with the _TEST version of the macro, e.g.:
+ VMSTATE_UINT32(foo, barstruct)
+ becomes
+ VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz)
+
+ Sometime in the future when we no longer care about the ancient
+versions these can be killed off.
+
+= Return path =
+
+In most migration scenarios there is only a single data path that runs
+from the source VM to the destination, typically along a single fd (although
+possibly with another fd or similar for some fast way of throwing pages across).
+
+However, some uses need two way communication; in particular the Postcopy
+destination needs to be able to request pages on demand from the source.
+
+For these scenarios there is a 'return path' from the destination to the source;
+qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
+path.
+
+ Source side
+ Forward path - written by migration thread
+ Return path - opened by main thread, read by return-path thread
+
+ Destination side
+ Forward path - read by main thread
+ Return path - opened by main thread, written by main thread AND postcopy
+ thread (protected by rp_mutex)
+
+= Postcopy =
+'Postcopy' migration is a way to deal with migrations that refuse to converge
+(or take too long to converge). Its plus side is that there is an upper bound on
+the amount of migration traffic and time it takes; the down side is that during
+the postcopy phase, a failure of *either* side or the network connection causes
+the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has been
+transferred, and accesses to pages that are yet to be transferred cause
+a fault that's translated by QEMU into a request to the source QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
+doesn't finish in a given time the switch is made to postcopy.
+
+=== Enabling postcopy ===
+
+To enable postcopy, issue this command on the monitor prior to the
+start of migration:
+
+migrate_set_capability postcopy-ram on
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode. Issuing:
+
+migrate_start_postcopy
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on. Issuing it after the end of a migration is harmless.
+
+Note: During the postcopy phase, the bandwidth limit set using
+migrate_set_speed is ignored (to avoid delaying requested pages that
+the destination is waiting for).
+
+=== Postcopy device transfer ===
+
+Loading of device data may cause the device emulation to access guest RAM
+that may trigger faults that have to be resolved by the source, as such
+the migration stream has to be able to respond with page data *during* the
+device load, and hence the device data has to be read from the stream completely
+before the device load begins to free the stream up. This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
+When postcopy starts the source sends the page discard data and then
+forms the 'package' containing:
+
+ Command: 'postcopy listen'
+ The device state
+ A series of sections, identical to the precopy stream's device state stream
+ containing everything except postcopiable devices (i.e. RAM)
+ Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy),
+however when a page request is received from the destination, the dirty page
+scanning restarts from the requested location. This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+------------------------------------------------------------------------------
+                        1      2     3     4 5                     6  7
+main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+thread                             |       |
+                                   |     (page request)
+                                   |        \___
+                                   v            \
+listen thread:                     --- page -- page -- page -- page -- page --
+
+                                   a   b        c
+------------------------------------------------------------------------------
+
+On receipt of CMD_PACKAGED (1)
+ All the data associated with the package - the ( ... ) section in the
+diagram - is read into memory, and the main thread recurses into
+qemu_loadvm_state_main to process the contents of the package (2)
+which contains commands (3,6) and devices (4...)
+
+On receipt of 'postcopy listen' - 3 - (i.e. the 1st command in the package)
+a new thread (a) is started that takes over servicing the migration stream,
+while the main thread carries on loading the package. It loads normal
+background page data (b) but if during a device load a fault happens (5) the
+returned page (c) is loaded by the listen thread allowing the main thread's
+device load to carry on.
+
+The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
+CPUs start running.
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
+and is no longer used by migration, while the listen thread carries
+on servicing page data until the end of migration.
+
+=== Postcopy states ===
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+ Advise: Set at the start of migration if postcopy is enabled, even
+ if it hasn't had the start command; here the destination
+ checks that its OS has the support needed for postcopy, and performs
+ setup to ensure the RAM mappings are suitable for later postcopy.
+ The destination will fail early in migration at this point if the
+ required OS support is not present.
+ (Triggered by reception of POSTCOPY_ADVISE command)
+
+ Discard: Entered on receipt of the first 'discard' command; prior to
+ the first Discard being performed, hugepages are switched off
+ (using madvise) to ensure that no new huge pages are created
+ during the postcopy phase, and to cause any huge pages that
+ have discards on them to be broken.
+
+ Listen: The first command in the package, POSTCOPY_LISTEN, switches
+ the destination state to Listen, and starts a new thread
+ (the 'listen thread') which takes over the job of receiving
+ pages off the migration stream, while the main thread carries
+ on processing the blob. With this thread able to process page
+ reception, the destination now 'sensitises' the RAM to detect
+ any access to missing pages (on Linux using the 'userfault'
+ system).
+
+ Running: POSTCOPY_RUN causes the destination to synchronise all
+ state and start the CPUs and IO devices running. The main
+ thread now finishes processing the migration package and
+ now carries on as it would for normal precopy migration
+ (although it can't do the cleanup it would do as it
+ finishes a normal migration).
+
+ End: The listen thread can now quit, and perform the cleanup of migration
+ state; the migration is now complete.
+
+=== Source side page maps ===
+
+The source side keeps two bitmaps during postcopy; 'the migration bitmap'
+and 'unsent map'. The 'migration bitmap' is basically the same as in
+the precopy case, and holds a bit to indicate that a page is 'dirty' -
+i.e. needs sending. During the precopy phase this is updated as the CPU
+dirties pages, however during postcopy the CPUs are stopped and nothing
+should dirty anything any more.
+
+The 'unsent map' is used for the transition to postcopy. It is a bitmap that
+has a bit cleared whenever a page is sent to the destination, however during
+the transition to postcopy mode it is combined with the migration bitmap
+to form a set of pages that:
+ a) Have been sent but then redirtied (which must be discarded)
+ b) Have not yet been sent - which also must be discarded to cause any
+ transparent huge pages built during precopy to be broken.
+
+Note that the contents of the unsentmap are sacrificed during the calculation
+of the discard set and thus aren't valid once in postcopy. The dirtymap
+is still valid and is used to ensure that no page is sent more than once. Any
+request for a page that has already been sent is ignored. Duplicate requests
+such as this can happen as a page is sent at about the same time the
+destination accesses it.
+
+=== Postcopy with hugepages ===
+
+Postcopy now works with hugetlbfs backed memory:
+ a) The linux kernel on the destination must support userfault on hugepages.
+ b) The huge-page configuration on the source and destination VMs must be
+ identical; i.e. RAMBlocks on both sides must use the same page size.
+ c) Note that -mem-path /dev/hugepages will fall back to allocating normal
+ RAM if it doesn't have enough hugepages, triggering (b) to fail.
+ Using -mem-prealloc enforces the allocation using hugepages.
+ d) Care should be taken with the size of hugepage used; postcopy with 2MB
+ hugepages works well, however 1GB hugepages are likely to be problematic
+ since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
+ and until the full page is transferred the destination thread is blocked.
--- /dev/null
+Copyright (c) 2015-2016 Linaro Ltd.
+
+This work is licensed under the terms of the GNU GPL, version 2 or
+later. See the COPYING file in the top-level directory.
+
+Introduction
+============
+
+This document outlines the design for multi-threaded TCG system-mode
+emulation. The current user-mode emulation mirrors the thread
+structure of the translated executable. Some of the work will be
+applicable to both system and linux-user emulation.
+
+The original system-mode TCG implementation was single threaded and
+dealt with multiple CPUs with simple round-robin scheduling. This
+simplified a lot of things but became increasingly limited as systems
+being emulated gained additional cores and per-core performance gains
+for host systems started to level off.
+
+vCPU Scheduling
+===============
+
+We introduce a new running mode where each vCPU will run on its own
+user-space thread. This will be enabled by default for all FE/BE
+combinations that have had the required work done to support this
+safely.
+
+In the general case of running translated code there should be no
+inter-vCPU dependencies and all vCPUs should be able to run at full
+speed. Synchronisation will only be required while accessing internal
+shared data structures or when the emulated architecture requires a
+coherent representation of the emulated machine state.
+
+Shared Data Structures
+======================
+
+Main Run Loop
+-------------
+
+Even when there is no code being generated there are a number of
+structures associated with the hot-path through the main run-loop.
+These are associated with looking up the next translation block to
+execute. These include:
+
+ tb_jmp_cache (per-vCPU, cache of recent jumps)
+ tb_ctx.htable (global hash table, phys address->tb lookup)
+
+As TB linking only occurs when blocks are in the same page this code
+is critical to performance as looking up the next TB to execute is the
+most common reason to exit the generated code.
+
+DESIGN REQUIREMENT: Make access to lookup structures safe with
+multiple reader/writer threads. Minimise any lock contention to do it.
+
+The hot-path avoids using locks where possible. The tb_jmp_cache is
+updated with atomic accesses to ensure consistent results. The fall
+back QHT based hash table is also designed for lockless lookups. Locks
+are only taken when code generation is required or TranslationBlocks
+have their block-to-block jumps patched.
+
+Global TCG State
+----------------
+
+We need to protect the entire code generation cycle including any post
+generation patching of the translated code. This also implies a shared
+translation buffer which contains code running on all cores. Any
+execution path that comes to the main run loop will need to hold a
+mutex for code generation. This also includes times when we need to flush
+code or entries from any shared lookups/caches. Structures held on a
+per-vCPU basis won't need locking unless other vCPUs will need to
+modify them.
+
+DESIGN REQUIREMENT: Add locking around all code generation and TB
+patching.
+
+(Current solution)
+
+Mainly as part of the linux-user work all code generation is
+serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the
+place of mmap_lock() in linux-user.
+
+Translation Blocks
+------------------
+
+Currently the whole system shares a single code generation buffer
+which when full will force a flush of all translations and start from
+scratch again. Some operations also force a full flush of translations
+including:
+
+ - debugging operations (breakpoint insertion/removal)
+ - some CPU helper functions
+
+This is done with the async_safe_run_on_cpu() mechanism to ensure all
+vCPUs are quiescent when changes are being made to shared global
+structures.
+
+More granular translation invalidation events are typically due
+to a change of the state of a physical page:
+
+ - code modification (self-modifying code, patching code)
+ - page changes (new page mapping in linux-user mode)
+
+While setting the invalid flag in a TranslationBlock will stop it
+being used when looked up in the hot-path there are a number of other
+book-keeping structures that need to be safely cleared.
+
+Any TranslationBlocks which have been patched to jump directly to the
+now invalid blocks need the jump patches reversing so they will return
+to the C code.
+
+There are a number of look-up caches that need to be properly updated
+including the:
+
+ - jump lookup cache
+ - the physical-to-tb lookup hash table
+ - the global page table
+
+The global page table (l1_map) provides a multi-level look-up for
+PageDesc structures, which contain pointers to the start of a
+linked list of all Translation Blocks in that page (see page_next).
+
+Both the jump patching and the page cache involve linked lists that
+the invalidated TranslationBlock needs to be removed from.
+
+DESIGN REQUIREMENT: Safely handle invalidation of TBs
+ - safely patch/revert direct jumps
+ - remove central PageDesc lookup entries
+ - ensure lookup caches/hashes are safely updated
+
+(Current solution)
+
+The direct jumps themselves are updated atomically by the TCG
+tb_set_jmp_target() code. Modifications to the linked lists that allow
+searching for linked pages are done under the protection of the
+tb_lock().
+
+The global page table is protected by the tb_lock() in system-mode and
+mmap_lock() in linux-user mode.
+
+The lookup caches are updated atomically and the lookup hash uses QHT
+which is designed for concurrent safe lookup.
+
+
+Memory maps and TLBs
+--------------------
+
+The memory handling code is fairly critical to the speed of memory
+access in the emulated system. The SoftMMU code is designed so the
+hot-path can be handled entirely within translated code. This is
+handled with a per-vCPU TLB structure which once populated will allow
+a series of accesses to the page to occur without exiting the
+translated code. It is possible to set flags in the TLB address which
+will ensure the slow-path is taken for each access. This can be done
+to support:
+
+ - Memory regions (dividing up access to PIO, MMIO and RAM)
+ - Dirty page tracking (for code gen, SMC detection, migration and display)
+ - Virtual TLB (for translating guest address->real address)
+
+When the TLB tables are updated by a vCPU thread other than their own
+we need to ensure it is done in a safe way so no inconsistent state is
+seen by the vCPU thread.
+
+Some operations require updating a number of vCPUs' TLBs at the same
+time in a synchronised manner.
+
+DESIGN REQUIREMENTS:
+
+ - TLB Flush All/Page
+ - can be across-vCPUs
+ - cross vCPU TLB flush may need other vCPU brought to halt
+ - change may need to be visible to the calling vCPU immediately
+ - TLB Flag Update
+ - usually cross-vCPU
+ - want change to be visible as soon as possible
+ - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs)
+ - This is a per-vCPU table - by definition can't race
+ - updated by its own thread when the slow-path is forced
+
+(Current solution)
+
+We have updated cputlb.c to defer cross-vCPU operations with
+async_run_on_cpu(), which ensures each vCPU sees a coherent state
+when it next runs its work (in a few instructions' time).
+
+A new set of operations (tlb_flush_*_all_cpus) take an additional flag
+which when set will force synchronisation by setting the source vCPU's
+work as "safe work" and exiting the cpu run loop. This ensures that by
+the time execution restarts all flush operations have completed.
+
+TLB flag updates are all done atomically and are also protected by the
+tb_lock() which is used by the functions that update the TLB in bulk.
+
+(Known limitation)
+
+Not really a limitation but the wait mechanism is overly strict for
+some architectures which only need flushes completed by a barrier
+instruction. This could be a future optimisation.
+
+Emulated hardware state
+-----------------------
+
+Currently thanks to KVM work any access to IO memory is automatically
+protected by the global iothread mutex, also known as the BQL (Big
+Qemu Lock). Any IO region that doesn't use the global mutex is expected to
+do its own locking.
+
+However IO memory isn't the only way emulated hardware state can be
+modified. Some architectures have model specific registers that
+trigger hardware emulation features. Generally any translation helper
+that needs to update the state of more than a single vCPU should take the
+BQL.
+
+As the BQL, or global iothread mutex is shared across the system we
+push the use of the lock as far down into the TCG code as possible to
+minimise contention.
+
+(Current solution)
+
+MMIO access automatically serialises hardware emulation by way of the
+BQL. Currently ARM targets serialise all ARM_CP_IO register accesses
+and also defer the reset/startup of vCPUs to the vCPU context by way
+of async_run_on_cpu().
+
+Updates to interrupt state are also protected by the BQL as they can
+often be cross vCPU.
+
+Memory Consistency
+==================
+
+Between emulated guests and host systems there are a range of memory
+consistency models. Even emulating weakly ordered systems on strongly
+ordered hosts needs to ensure things like store-after-load re-ordering
+can be prevented when the guest wants to.
+
+Memory Barriers
+---------------
+
+Barriers (sometimes known as fences) provide a mechanism for software
+to enforce a particular ordering of memory operations from the point
+of view of external observers (e.g. another processor core). They can
+apply to any memory operations as well as just loads or stores.
+
+The Linux kernel has an excellent write-up on the various forms of
+memory barrier and the guarantees they can provide [1].
+
+Barriers are often wrapped around synchronisation primitives to
+provide explicit memory ordering semantics. However they can be used
+by themselves to provide safe lockless access by ensuring for example
+a change to a signal flag will only be visible once the changes to
+payload are.
+
+DESIGN REQUIREMENT: Add a new tcg_memory_barrier op
+
+This would enforce a strong load/store ordering so all loads/stores
+complete at the memory barrier. On single-core non-SMP strongly
+ordered backends this could become a NOP.
+
+Aside from explicit standalone memory barrier instructions there are
+also implicit memory ordering semantics which come with each guest
+memory access instruction. For example all x86 load/stores come with
+fairly strong guarantees of sequential consistency whereas ARM has
+special variants of load/store instructions that imply acquire/release
+semantics.
+
+In the case of a strongly ordered guest architecture being emulated on
+a weakly ordered host the scope for a heavy performance impact is
+quite high.
+
+DESIGN REQUIREMENTS: Be efficient with use of memory barriers
+ - host systems with stronger implied guarantees can skip some barriers
+ - merge consecutive barriers to the strongest one
+
+(Current solution)
+
+The system currently has a tcg_gen_mb() which will add memory barrier
+operations if code generation is being done in a parallel context. The
+tcg_optimize() function attempts to merge barriers up to their
+strongest form before any load/store operations. The solution was
+originally developed and tested for linux-user based systems. All
+backends have been converted to emit fences when required. So far the
+following front-ends have been updated to emit fences when required:
+
+ - target-i386
+ - target-arm
+ - target-aarch64
+ - target-alpha
+ - target-mips
+
+Memory Control and Maintenance
+------------------------------
+
+This includes a class of instructions for controlling system cache
+behaviour. While QEMU doesn't model cache behaviour these instructions
+are often seen when code modification has taken place to ensure the
+changes take effect.
+
+Synchronisation Primitives
+--------------------------
+
+There are two broad types of synchronisation primitives found in
+modern ISAs: atomic instructions and exclusive regions.
+
+The first type offers a simple atomic instruction which will guarantee
+some sort of test and conditional store will be truly atomic w.r.t.
+other cores sharing access to the memory. The classic example is the
+x86 cmpxchg instruction.
+
+The second type offers a pair of load/store instructions which offer a
+guarantee that a region of memory has not been touched between the
+load and store instructions. An example of this is ARM's ldrex/strex
+pair where the strex instruction will return a flag indicating a
+successful store only if no other CPU has accessed the memory region
+since the ldrex.
+
+Traditionally TCG has generated a series of operations that work
+because they are within the context of a single translation block so
+will have completed before another CPU is scheduled. However with
+the ability to have multiple threads running to emulate multiple CPUs
+we will need to explicitly expose these semantics.
+
+DESIGN REQUIREMENTS:
+ - Support classic atomic instructions
+ - Support load/store exclusive (or load link/store conditional) pairs
+ - Generic enough infrastructure to support all guest architectures
+CURRENT OPEN QUESTIONS:
+ - How problematic is the ABA problem in general?
+
+(Current solution)
+
+The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which
+can be used directly or combined to emulate other instructions like
+ARM's ldrex/strex instructions. While they are susceptible to the ABA
+problem, so far common guests have not implemented patterns where
+this may be a problem - typically presenting a locking ABI which
+assumes cmpxchg like semantics.
+
+The code also includes a fall-back for cases where multi-threaded TCG
+ops can't work (e.g. guest atomic width > host atomic width). In this
+case an EXCP_ATOMIC exit occurs and the instruction is emulated with
+an exclusive lock which ensures all emulation is serialised.
+
+While the atomic helpers look good enough for now there may be a need
+to look at solutions that can more closely model the guest
+architecture's semantics.
+
+==========
+
+[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt
--- /dev/null
+Copyright (c) 2014 Red Hat Inc.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later. See
+the COPYING file in the top-level directory.
+
+
+This document explains the IOThread feature and how to write code that runs
+outside the QEMU global mutex.
+
+The main loop and IOThreads
+---------------------------
+QEMU is an event-driven program that can do several things at once using an
+event loop. The VNC server and the QMP monitor are both processed from the
+same event loop, which monitors their file descriptors until they become
+readable and then invokes a callback.
+
+The default event loop is called the main loop (see main-loop.c). It is
+possible to create additional event loop threads using -object
+iothread,id=my-iothread.
+
+Side note: The main loop and IOThread are both event loops but their code is
+not shared completely. Sometimes it is useful to remember that although they
+are conceptually similar they are currently not interchangeable.
+
+Why IOThreads are useful
+------------------------
+IOThreads allow the user to control the placement of work. The main loop is a
+scalability bottleneck on hosts with many CPUs. Work can be spread across
+several IOThreads instead of just one main loop. When set up correctly this
+can improve I/O latency and reduce jitter seen by the guest.
+
+The main loop is also deeply associated with the QEMU global mutex, which is a
+scalability bottleneck in itself. vCPU threads and the main loop use the QEMU
+global mutex to serialize execution of QEMU code. This mutex is necessary
+because a lot of QEMU's code historically was not thread-safe.
+
+The fact that all I/O processing is done in a single main loop and that the
+QEMU global mutex is contended by all vCPU threads and the main loop explain
+why it is desirable to place work into IOThreads.
+
+The experimental virtio-blk data-plane implementation has been benchmarked and
+shows these effects:
+ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf
+
+How to program for IOThreads
+----------------------------
+The main difference between legacy code and new code that can run in an
+IOThread is dealing explicitly with the event loop object, AioContext
+(see include/block/aio.h). Code that only works in the main loop
+implicitly uses the main loop's AioContext. Code that supports running
+in IOThreads must be aware of its AioContext.
+
+AioContext supports the following services:
+ * File descriptor monitoring (read/write/error on POSIX hosts)
+ * Event notifiers (inter-thread signalling)
+ * Timers
+ * Bottom Halves (BH) deferred callbacks
+
+There are several old APIs that use the main loop AioContext:
+ * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor
+ * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier
+ * LEGACY timer_new_ms() - create a timer
+ * LEGACY qemu_bh_new() - create a BH
+ * LEGACY qemu_aio_wait() - run an event loop iteration
+
+Since they implicitly work on the main loop they cannot be used in code that
+runs in an IOThread. They might cause a crash or deadlock if called from an
+IOThread since the QEMU global mutex is not held.
+
+Instead, use the AioContext functions directly (see include/block/aio.h):
+ * aio_set_fd_handler() - monitor a file descriptor
+ * aio_set_event_notifier() - monitor an event notifier
+ * aio_timer_new() - create a timer
+ * aio_bh_new() - create a BH
+ * aio_poll() - run an event loop iteration
+
+The AioContext can be obtained from the IOThread using
+iothread_get_aio_context() or for the main loop using qemu_get_aio_context().
+Code that takes an AioContext argument works both in IOThreads and the main
+loop, depending on which AioContext instance the caller passes in.
+
+How to synchronize with an IOThread
+-----------------------------------
+AioContext is not thread-safe so some rules must be followed when using file
+descriptors, event notifiers, timers, or BHs across threads:
+
+1. AioContext functions can always be called safely. They handle their
+own locking internally.
+
+2. Other threads wishing to access the AioContext must use
+aio_context_acquire()/aio_context_release() for mutual exclusion. Once the
+context is acquired no other thread can access it or run event loop iterations
+in this AioContext.
+
+aio_context_acquire()/aio_context_release() calls may be nested. This
+means you can call them if you're not sure whether #2 applies.
+
+There is currently no lock ordering rule if a thread needs to acquire multiple
+AioContexts simultaneously. Therefore, it is only safe for code holding the
+QEMU global mutex to acquire other AioContexts.
+
+Side note: the best way to schedule a function call across threads is to call
+aio_bh_schedule_oneshot(). No acquire/release or locking is needed.
+
+AioContext and the block layer
+------------------------------
+The AioContext originates from the QEMU block layer, even though nowadays
+AioContext is a generic event loop that can be used by any QEMU subsystem.
+
+The block layer has support for AioContext integrated. Each BlockDriverState
+is associated with an AioContext using bdrv_set_aio_context() and
+bdrv_get_aio_context(). This allows block layer code to process I/O inside the
+right AioContext. Other subsystems may wish to follow a similar approach.
+
+Block layer code must therefore expect to run in an IOThread and avoid using
+old APIs that implicitly use the main loop. See the "How to program for
+IOThreads" section above for information on how to do that.
+
+If main loop code such as a QMP function wishes to access a BlockDriverState
+it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
+that callbacks in the IOThread do not run in parallel.
+
+Code running in the monitor typically needs to ensure that past
+requests from the guest are completed. When a block device is running
+in an IOThread, the IOThread can also process requests from the guest
+(via ioeventfd). To achieve both objectives, wrap the code between
+bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
+section". The functions must be called between aio_context_acquire()
+and aio_context_release(). You can freely release and re-acquire the
+AioContext within a drained section.
+
+Long-running jobs (usually in the form of coroutines) are best scheduled in
+the BlockDriverState's AioContext to avoid the need to acquire/release around
+each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier,
+or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
+can be used to get a notification whenever bdrv_set_aio_context() moves a
+BlockDriverState to a different AioContext.
--- /dev/null
+= How to use the QAPI code generator =
+
+Copyright IBM Corp. 2011
+Copyright (C) 2012-2016 Red Hat, Inc.
+
+This work is licensed under the terms of the GNU GPL, version 2 or
+later. See the COPYING file in the top-level directory.
+
+== Introduction ==
+
+QAPI is a native C API within QEMU which provides management-level
+functionality to internal and external users. For external
+users/processes, this interface is made available by a JSON-based wire
+format for the QEMU Monitor Protocol (QMP) for controlling qemu, as
+well as the QEMU Guest Agent (QGA) for communicating with the guest.
+The remainder of this document uses "Client JSON Protocol" when
+referring to the wire contents of a QMP or QGA connection.
+
+To map Client JSON Protocol interfaces to the native C QAPI
+implementations, a JSON-based schema is used to define types and
+function signatures, and a set of scripts is used to generate types,
+signatures, and marshaling/dispatch code. This document will describe
+how the schemas, scripts, and resulting code are used.
+
+
+== QMP/Guest agent schema ==
+
+A QAPI schema file is designed to be loosely based on JSON
+(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style
+and the use of comments; a QAPI schema file is then parsed by a python
+code generation program. A valid QAPI schema consists of a series of
+top-level expressions, with no commas between them. Where
+dictionaries (JSON objects) are used, they are parsed as python
+OrderedDicts so that ordering is preserved (for predictable layout of
+generated C structs and parameter lists). Ordering doesn't matter
+between top-level expressions or the keys within an expression, but
+does matter within dictionary values for 'data' and 'returns' members
+of a single expression. QAPI schema input is written using 'single
+quotes' instead of JSON's "double quotes" (in contrast, Client JSON
+Protocol uses no comments, and while input accepts 'single quotes' as
+an extension, output is strict JSON using only "double quotes"). As
+in JSON, trailing commas are not permitted in arrays or dictionaries.
+Input must be ASCII (although QMP supports full Unicode strings, the
+QAPI parser does not). At present, there is no place where a QAPI
+schema requires the use of JSON numbers or null.
+
+
+=== Comments ===
+
+Comments are allowed; anything between an unquoted # and the following
+newline is ignored.
+
+A multi-line comment that starts and ends with a '##' line is a
+documentation comment. These are parsed by the documentation
+generator, which recognizes certain markup detailed below.
+
+
+==== Documentation markup ====
+
+Comment text starting with '=' is a section title:
+
+ # = Section title
+
+Double the '=' for a subsection title:
+
+ # == Subsection title
+
+'|' denotes examples:
+
+ # | Text of the example, may span
+ # | multiple lines
+
+'*' starts an itemized list:
+
+ # * First item, may span
+ # multiple lines
+ # * Second item
+
+You can also use '-' instead of '*'.
+
+A decimal number followed by '.' starts a numbered list:
+
+ # 1. First item, may span
+ # multiple lines
+ # 2. Second item
+
+The actual number doesn't matter. You could even use '*' instead of
+'2.' for the second item.
+
+Lists can't be nested. Blank lines are currently not supported within
+lists.
+
+Additional whitespace between the initial '#' and the comment text is
+permitted.
+
+*foo* and _foo_ are for strong and emphasis styles respectively (they
+do not work over multiple lines). @foo is used to reference a name in
+the schema.
+
+Example:
+
+##
+# = Section
+# == Subsection
+#
+# Some text foo with *strong* and _emphasis_
+# 1. with a list
+# 2. like that
+#
+# And some code:
+# | $ echo foo
+# | -> do this
+# | <- get that
+#
+##
+
+
+==== Expression documentation ====
+
+Each expression that isn't an include directive may be preceded by a
+documentation block. Such blocks are called expression documentation
+blocks.
+
+When documentation is required (see pragma 'doc-required'), expression
+documentation blocks are mandatory.
+
+The documentation block consists of a first line naming the
+expression, an optional overview, a description of each argument (for
+commands and events) or member (for structs, unions and alternates),
+and optional tagged sections.
+
+FIXME: the parser accepts these things in almost any order.
+
+Extensions added after the expression was first released carry a
+'(since x.y.z)' comment.
+
+A tagged section starts with one of the following words:
+"Note:"/"Notes:", "Since:", "Example:"/"Examples:", "Returns:", "TODO:".
+The section ends with the start of a new section.
+
+A 'Since: x.y.z' tagged section lists the release that introduced the
+expression.
+
+For example:
+
+##
+# @BlockStats:
+#
+# Statistics of a virtual block device or a block backing device.
+#
+# @device: If the stats are for a virtual block device, the name
+# corresponding to the virtual block device.
+#
+# @node-name: The node name of the device. (since 2.3)
+#
+# ... more members ...
+#
+# Since: 0.14.0
+##
+{ 'struct': 'BlockStats',
+ 'data': {'*device': 'str', '*node-name': 'str',
+ ... more members ... } }
+
+##
+# @query-blockstats:
+#
+# Query the @BlockStats for all virtual block devices.
+#
+# @query-nodes: If true, the command will query all the
+# block nodes ... explain, explain ... (since 2.3)
+#
+# Returns: A list of @BlockStats for each virtual block device.
+#
+# Since: 0.14.0
+#
+# Example:
+#
+# -> { "execute": "query-blockstats" }
+# <- {
+# ... lots of output ...
+# }
+#
+##
+{ 'command': 'query-blockstats',
+ 'data': { '*query-nodes': 'bool' },
+ 'returns': ['BlockStats'] }
+
+==== Free-form documentation ====
+
+A documentation block that isn't an expression documentation block is
+a free-form documentation block. These may be used to provide
+additional text and structuring content.
+
+
+=== Schema overview ===
+
+The schema sets up a series of types, as well as commands and events
+that will use those types. Forward references are allowed: the parser
+scans in two passes, where the first pass learns all type names, and
+the second validates the schema and generates the code. This allows
+the definition of complex structs that can have mutually recursive
+types, and allows for indefinite nesting of Client JSON Protocol that
+satisfies the schema. A type name should not be defined more than
+once. It is permissible for the schema to contain additional types
+not used by any commands or events in the Client JSON Protocol, for
+the side effect of generated C code used internally.
+
+There are eight top-level expressions recognized by the parser:
+'include', 'pragma', 'command', 'struct', 'enum', 'union',
+'alternate', and 'event'. There are several groups of types: simple
+types (a number of built-in types, such as 'int' and 'str'; as well as
+enumerations), complex types (structs and two flavors of unions), and
+alternate types (a choice between other types). The 'command' and
+'event' expressions can refer to existing types by name, or list an
+anonymous type as a dictionary. Listing a type name inside an array
+refers to a single-dimension array of that type; multi-dimension
+arrays are not directly supported (although an array of a complex
+struct that contains an array member is possible).
+
+All names must begin with a letter, and contain only ASCII letters,
+digits, hyphen, and underscore. There are two exceptions: enum values
+may start with a digit, and names that are downstream extensions (see
+section Downstream extensions) start with underscore.
+
+Names beginning with 'q_' are reserved for the generator, which uses
+them for munging QMP names that resemble C keywords or other
+problematic strings. For example, a member named "default" in qapi
+becomes "q_default" in the generated C code.
+
+Types, commands, and events share a common namespace. Therefore,
+generally speaking, type definitions should always use CamelCase for
+user-defined type names, while built-in types are lowercase.
+
+Type names ending with 'Kind' or 'List' are reserved for the
+generator, which uses them for implicit union enums and array types,
+respectively.
+
+Command names, and member names within a type, should be all lower
+case with words separated by a hyphen. However, some existing older
+commands and complex types use underscore; when extending such
+expressions, consistency is preferred over blindly avoiding
+underscore.
+
+Event names should be ALL_CAPS with words separated by underscore.
+
+Member names starting with 'has-' or 'has_' are reserved for the
+generator, which uses them for tracking optional members.
+
+Any name (command, event, type, member, or enum value) beginning with
+"x-" is marked experimental, and may be withdrawn or changed
+incompatibly in a future release.
+
+Pragma 'name-case-whitelist' lets you violate the rules on use of
+upper and lower case. Use for new code is strongly discouraged.
+
+In the rest of this document, usage lines are given for each
+expression type, with literal strings written in lower case and
+placeholders written in capitals. If a literal string includes a
+prefix of '*', that key/value pair can be omitted from the expression.
+For example, a usage statement that includes '*base':STRUCT-NAME
+means that an expression has an optional key 'base', which if present
+must have a value that forms a struct name.
+
+
+=== Built-in Types ===
+
+The following types are predefined, and map to C as follows:
+
+ Schema C JSON
+ str char * any JSON string, UTF-8
+ number double any JSON number
+ int int64_t a JSON number without fractional part
+ that fits into the C integer type
+ int8 int8_t likewise
+ int16 int16_t likewise
+ int32 int32_t likewise
+ int64 int64_t likewise
+ uint8 uint8_t likewise
+ uint16 uint16_t likewise
+ uint32 uint32_t likewise
+ uint64 uint64_t likewise
+ size uint64_t like uint64_t, except StringInputVisitor
+ accepts size suffixes
+ bool bool JSON true or false
+ any QObject * any JSON value
+ QType QType JSON string matching enum QType values
+
+
+=== Include directives ===
+
+Usage: { 'include': STRING }
+
+The QAPI schema definitions can be modularized using the 'include' directive:
+
+ { 'include': 'path/to/file.json' }
+
+The directive is evaluated recursively, and include paths are relative to the
+file using the directive. Multiple includes of the same file are
+idempotent. No other keys should appear in the expression, and the include
+value should be a string.
+
+As a matter of style, it is a good idea to have all files be
+self-contained, but at the moment, nothing prevents an included file
+from making a forward reference to a type that is only introduced by
+an outer file. The parser may be made stricter in the future to
+prevent incomplete include files.
+
+
+=== Pragma directives ===
+
+Usage: { 'pragma': DICT }
+
+The pragma directive lets you control optional generator behavior.
+The dictionary's entries are pragma names and values.
+
+Pragma's scope is currently the complete schema. Setting the same
+pragma to different values in parts of the schema doesn't work.
+
+Pragma 'doc-required' takes a boolean value. If true, documentation
+is required. Default is false.
+
+Pragma 'returns-whitelist' takes a list of command names that may
+violate the rules on permitted return types. Default is none.
+
+Pragma 'name-case-whitelist' takes a list of names that may violate
+rules on use of upper- vs. lower-case letters. Default is none.
+
+
+=== Struct types ===
+
+Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME }
+
+A struct is a dictionary containing a single 'data' key whose value is
+a dictionary; the dictionary may be empty. This corresponds to a
+struct in C or an Object in JSON. Each value of the 'data' dictionary
+must be the name of a type, or a one-element array containing a type
+name. An example of a struct is:
+
+ { 'struct': 'MyType',
+ 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } }
+
+The use of '*' as a prefix to the name means the member is optional in
+the corresponding JSON protocol usage.
+
+The default initialization value of an optional argument should not be changed
+between versions of QEMU unless the new default maintains backward
+compatibility to the user-visible behavior of the old default.
+
+With proper documentation, this policy still allows some flexibility; for
+example, documenting that a default of 0 picks an optimal buffer size allows
+one release to declare the optimal size at 512 while another release declares
+the optimal size at 4096 - the user-visible behavior is not the bytes used by
+the buffer, but the fact that the buffer was optimal size.
+
+On input structures (only mentioned in the 'data' side of a command), changing
+from mandatory to optional is safe (older clients will supply the option, and
+newer clients can benefit from the default); changing from optional to
+mandatory is backwards incompatible (older clients may be omitting the option,
+and must continue to work).
+
+On output structures (only mentioned in the 'returns' side of a command),
+changing from mandatory to optional is in general unsafe (older clients may be
+expecting the member, and could crash if it is missing), although it
+can be done if the only way that the optional argument will be omitted
+is when it is triggered by the presence of a new input flag to the
+command that older clients don't know to send. Changing from optional
+to mandatory is safe.
+
+A structure that is used in both input and output of various commands
+must consider the backwards compatibility constraints of both directions
+of use.
+
+A struct definition can specify another struct as its base.
+In this case, the members of the base type are included as top-level members
+of the new struct's dictionary in the Client JSON Protocol wire
+format. An example definition is:
+
+ { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } }
+ { 'struct': 'BlockdevOptionsGenericCOWFormat',
+ 'base': 'BlockdevOptionsGenericFormat',
+ 'data': { '*backing': 'str' } }
+
+An example BlockdevOptionsGenericCOWFormat object on the wire could use
+both members like this:
+
+ { "file": "/some/place/my-image",
+ "backing": "/some/place/my-backing-file" }
+
+
+=== Enumeration types ===
+
+Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING }
+ { 'enum': STRING, '*prefix': STRING, 'data': ARRAY-OF-STRING }
+
+An enumeration type is a dictionary containing a single 'data' key
+whose value is a list of strings. An example enumeration is:
+
+ { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] }
+
+Nothing prevents an empty enumeration, although it is probably not
+useful. The list of strings should be lower case; if an enum name
+represents multiple words, use '-' between words. The string 'max' is
+not allowed as an enum value, and values should not be repeated.
+
+The enum constants will be named by using a heuristic to turn the
+type name into a set of underscore separated words. For the example
+above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name
+of 'MY_ENUM_VALUE1' for the first value. If the default heuristic
+does not result in a desirable name, the optional 'prefix' member
+can be used when defining the enum.
+
+The enumeration values are passed as strings over the Client JSON
+Protocol, but are encoded as C enum integral values in generated code.
+While the C code starts numbering at 0, it is better to use explicit
+comparisons to enum values than implicit comparisons to 0; the C code
+will also include a generated enum member ending in _MAX for tracking
+the size of the enum, useful when using common functions for
+converting between strings and enum values. Since the wire format
+always passes by name, it is acceptable to reorder or add new
+enumeration members in any location without breaking clients of Client
+JSON Protocol; however, removing enum values would break
+compatibility. For any struct that has a member that will only contain
+a finite set of string values, using an enum type for that member is
+better than open-coding the member to be type 'str'.
+
+
+=== Union types ===
+
+Usage: { 'union': STRING, 'data': DICT }
+or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME-OR-DICT,
+ 'discriminator': ENUM-MEMBER-OF-BASE }
+
+Union types are used to let the user choose between several different
+variants for an object. There are two flavors: simple (no
+discriminator or base), and flat (both discriminator and base). A union
+type is defined using a data dictionary as explained in the following
+paragraphs. The data dictionary for either type of union must not
+be empty.
+
+A simple union type defines a mapping from automatic discriminator
+values to data types like in this example:
+
+ { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str' } }
+ { 'struct': 'BlockdevOptionsQcow2',
+ 'data': { 'backing': 'str', '*lazy-refcounts': 'bool' } }
+
+ { 'union': 'BlockdevOptionsSimple',
+ 'data': { 'file': 'BlockdevOptionsFile',
+ 'qcow2': 'BlockdevOptionsQcow2' } }
+
+In the Client JSON Protocol, a simple union is represented by a
+dictionary that contains the 'type' member as a discriminator, and a
+'data' member that is of the specified data type corresponding to the
+discriminator value, as in these examples:
+
+ { "type": "file", "data": { "filename": "/some/place/my-image" } }
+ { "type": "qcow2", "data": { "backing": "/some/place/my-image",
+ "lazy-refcounts": true } }
+
+The generated C code uses a struct containing a union. Additionally,
+an implicit C enum 'NameKind' is created, corresponding to the union
+'Name', for accessing the various branches of the union. No branch of
+the union can be named 'max', as this would collide with the implicit
+enum. The value for each branch can be of any type.
+
+A flat union definition avoids nesting on the wire, and specifies a
+set of common members that occur in all variants of the union. The
+'base' key must specify either a type name (the type must be a
+struct, not a union), or a dictionary representing an anonymous type.
+All branches of the union must be complex types, and the top-level
+members of the union dictionary on the wire will be a combination of
+members from both the base type and the appropriate branch type (when
+merging two dictionaries, there must be no keys in common). The
+'discriminator' member must be the name of a non-optional enum-typed
+member of the base struct.
+
+The following example enhances the above simple union example by
+adding an optional common member 'read-only', renaming the
+discriminator to something more applicable than the simple union's
+default of 'type', and reducing the number of {} required on the wire:
+
+ { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] }
+ { 'union': 'BlockdevOptions',
+ 'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' },
+ 'discriminator': 'driver',
+ 'data': { 'file': 'BlockdevOptionsFile',
+ 'qcow2': 'BlockdevOptionsQcow2' } }
+
+Resulting in these JSON objects:
+
+ { "driver": "file", "read-only": true,
+ "filename": "/some/place/my-image" }
+ { "driver": "qcow2", "read-only": false,
+ "backing": "/some/place/my-image", "lazy-refcounts": true }
+
+Notice that in a flat union, the discriminator name is controlled by
+the user, but because it must map to a base member with enum type, the
+code generator can ensure that branches exist for all values of the
+enum (although the order of the keys need not match the declaration of
+the enum). In the resulting generated C data types, a flat union is
+represented as a struct with the base members included directly, and
+then a union of structures for each branch of the union.
+
+A simple union can always be re-written as a flat union where the base
+class has a single member named 'type', and where each branch of the
+union has a struct with a single member named 'data'. That is,
+
+ { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } }
+
+is identical on the wire to:
+
+ { 'enum': 'Enum', 'data': ['one', 'two'] }
+ { 'struct': 'Branch1', 'data': { 'data': 'str' } }
+ { 'struct': 'Branch2', 'data': { 'data': 'int' } }
+ { 'union': 'Flat', 'base': { 'type': 'Enum' }, 'discriminator': 'type',
+ 'data': { 'one': 'Branch1', 'two': 'Branch2' } }
+
+
+=== Alternate types ===
+
+Usage: { 'alternate': STRING, 'data': DICT }
+
+An alternate type is one that allows a choice between two or more JSON
+data types (string, integer, number, or object, but currently not
+array) on the wire. The definition is similar to a simple union type,
+where each branch of the union names a QAPI type. For example:
+
+ { 'alternate': 'BlockdevRef',
+ 'data': { 'definition': 'BlockdevOptions',
+ 'reference': 'str' } }
+
+Unlike a union, the discriminator string is never passed on the wire
+for the Client JSON Protocol. Instead, the value's JSON type serves
+as an implicit discriminator, which in turn means that an alternate
+can only express a choice between types represented differently in
+JSON. If a branch is typed as the 'bool' built-in, the alternate
+accepts true and false; if it is typed as any of the various numeric
+built-ins, it accepts a JSON number; if it is typed as a 'str'
+built-in or named enum type, it accepts a JSON string; and if it is
+typed as a complex type (struct or union), it accepts a JSON object.
+Two different complex types, for instance, aren't permitted, because
+both are represented as a JSON object.
+
+The example alternate declaration above allows using both of the
+following example objects:
+
+ { "file": "my_existing_block_device_id" }
+ { "file": { "driver": "file",
+ "read-only": false,
+ "filename": "/tmp/mydisk.qcow2" } }
+
+
+=== Commands ===
+
+Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT,
+ '*returns': TYPE-NAME, '*boxed': true,
+ '*gen': false, '*success-response': false }
+
+Commands are defined by using a dictionary containing several members,
+where three members are most common. The 'command' member is a
+mandatory string, and determines the "execute" value passed in a
+Client JSON Protocol command exchange.
+
+The 'data' argument maps to the "arguments" dictionary passed in as
+part of a Client JSON Protocol command. The 'data' member is optional
+and defaults to {} (an empty dictionary). If present, it must be the
+string name of a complex type, or a dictionary that declares an
+anonymous type with the same semantics as a 'struct' expression.
+
+The 'returns' member describes what will appear in the "return" member
+of a Client JSON Protocol reply on successful completion of a command.
+The member is optional from the command declaration; if absent, the
+"return" member will be an empty dictionary. If 'returns' is present,
+it must be the string name of a complex or built-in type, or a
+one-element array containing the name of a complex or built-in type.
+To return anything else, you have to list the command in pragma
+'returns-whitelist'. If you do this, the command cannot be extended
+to return additional information in the future. Use of
+'returns-whitelist' for new commands is strongly discouraged.
+
+All commands in Client JSON Protocol use a dictionary to report
+failure, with no way to specify that in QAPI. Where the error return
+is different than the usual GenericError class in order to help the
+client react differently to certain error conditions, it is worth
+documenting this in the comments before the command declaration.
+
+Some example commands:
+
+ { 'command': 'my-first-command',
+ 'data': { 'arg1': 'str', '*arg2': 'str' } }
+ { 'struct': 'MyType', 'data': { '*value': 'str' } }
+ { 'command': 'my-second-command',
+ 'returns': [ 'MyType' ] }
+
+which would validate this Client JSON Protocol transaction:
+
+ => { "execute": "my-first-command",
+ "arguments": { "arg1": "hello" } }
+ <= { "return": { } }
+ => { "execute": "my-second-command" }
+ <= { "return": [ { "value": "one" }, { } ] }
+
+The generator emits a prototype for the user's function implementing
+the command. Normally, 'data' is a dictionary for an anonymous type,
+or names a struct type (possibly empty, but not a union), and its
+members are passed as separate arguments to this function. If the
+command definition includes a key 'boxed' with the boolean value true,
+then 'data' is instead the name of any non-empty complex type
+(struct, union, or alternate), and a pointer to that QAPI type is
+passed as a single argument.
+
+The generator also emits a marshalling function that extracts
+arguments for the user's function out of an input QDict, calls the
+user's function, and if it succeeded, builds an output QObject from
+its return value.
+
+In rare cases, QAPI cannot express a type-safe representation of a
+corresponding Client JSON Protocol command. You then have to suppress
+generation of a marshalling function by including a key 'gen' with
+boolean value false, and instead write your own function. Please try
+to avoid adding new commands that rely on this, and instead use
+type-safe unions. For an example of this usage:
+
+ { 'command': 'netdev_add',
+ 'data': {'type': 'str', 'id': 'str'},
+ 'gen': false }
+
+Normally, the QAPI schema is used to describe synchronous exchanges,
+where a response is expected. But in some cases, the action of a
+command is expected to change state in a way that a successful
+response is not possible (although the command will still return a
+normal dictionary error on failure). When a successful reply is not
+possible, the command expression should include the optional key
+'success-response' with boolean value false. So far, only QGA makes
+use of this member.
+
+
+=== Events ===
+
+Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT,
+ '*boxed': true }
+
+Events are defined with the keyword 'event'. It is not allowed to
+name an event 'MAX', since the generator also produces a C enumeration
+of all event names with a generated _MAX value at the end. When
+'data' is also specified, additional info will be included in the
+event, with similar semantics to a 'struct' expression. Finally there
+will be C API generated in qapi-event.h; when called by QEMU code, a
+message with timestamp will be emitted on the wire.
+
+An example event is:
+
+{ 'event': 'EVENT_C',
+ 'data': { '*a': 'int', 'b': 'str' } }
+
+Resulting in this JSON object:
+
+{ "event": "EVENT_C",
+ "data": { "b": "test string" },
+ "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }
+
+The generator emits a function to send the event. Normally, 'data' is
+a dictionary for an anonymous type, or names a struct type (possibly
+empty, but not a union), and its members are passed as separate
+arguments to this function. If the event definition includes a key
+'boxed' with the boolean value true, then 'data' is instead the name of
+any non-empty complex type (struct, union, or alternate), and a
+pointer to that QAPI type is passed as a single argument.
+
+
+=== Downstream extensions ===
+
+QAPI schema names that are externally visible, say in the Client JSON
+Protocol, need to be managed with care. Names starting with a
+downstream prefix of the form __RFQDN_ are reserved for the downstream
+who controls the valid, reverse fully qualified domain name RFQDN.
+RFQDN may only contain ASCII letters, digits, hyphen and period.
+
+Example: Red Hat, Inc. controls redhat.com, and may therefore add a
+downstream command __com.redhat_drive-mirror.
+
+
+== Client JSON Protocol introspection ==
+
+Clients of a Client JSON Protocol commonly need to figure out what
+exactly the server (QEMU) supports.
+
+For this purpose, QMP provides introspection via command
+query-qmp-schema. QGA currently doesn't support introspection.
+
+While Client JSON Protocol wire compatibility should be maintained
+between qemu versions, we cannot make the same guarantees for
+introspection stability. For example, one version of qemu may provide
+a non-variant optional member of a struct, and a later version rework
+the member to instead be non-optional and associated with a variant.
+Likewise, one version of qemu may list a member with open-ended type
+'str', and a later version could convert it to a finite set of strings
+via an enum type; or a member may be converted from a specific type to
+an alternate that represents a choice between the original type and
+something else.
+
+query-qmp-schema returns a JSON array of SchemaInfo objects. These
+objects together describe the wire ABI, as defined in the QAPI schema.
+There is no specified order to the SchemaInfo objects returned; a
+client must search for a particular name throughout the entire array
+to learn more about that name, but is at least guaranteed that there
+will be no collisions between type, command, and event names.
+
+However, the SchemaInfo can't reflect all the rules and restrictions
+that apply to QMP. It's interface introspection (figuring out what's
+there), not interface specification. The specification is in the QAPI
+schema. To understand how QMP is to be used, you need to study the
+QAPI schema.
+
+Like any other command, query-qmp-schema is itself defined in the QAPI
+schema, along with the SchemaInfo type. This text attempts to give an
+overview of how things work. For details you need to consult the QAPI
+schema.
+
+SchemaInfo objects have common members "name" and "meta-type", and
+additional variant members depending on the value of meta-type.
+
+Each SchemaInfo object describes a wire ABI entity of a certain
+meta-type: a command, event or one of several kinds of type.
+
+SchemaInfo for commands and events have the same name as in the QAPI
+schema.
+
+Command and event names are part of the wire ABI, but type names are
+not. Therefore, the SchemaInfo for types have auto-generated
+meaningless names. For readability, the examples in this section use
+meaningful type names instead.
+
+To examine a type, start with a command or event using it, then follow
+references by name.
+
+QAPI schema definitions not reachable that way are omitted.
+
+The SchemaInfo for a command has meta-type "command", and variant
+members "arg-type" and "ret-type". On the wire, the "arguments"
+member of a client's "execute" command must conform to the object type
+named by "arg-type". The "return" member that the server passes in a
+success response conforms to the type named by "ret-type".
+
+If the command takes no arguments, "arg-type" names an object type
+without members. Likewise, if the command returns nothing, "ret-type"
+names an object type without members.
+
+Example: the SchemaInfo for command query-qmp-schema
+
+ { "name": "query-qmp-schema", "meta-type": "command",
+ "arg-type": "q_empty", "ret-type": "SchemaInfoList" }
+
+ Type "q_empty" is an automatic object type without members, and type
+ "SchemaInfoList" is the array of SchemaInfo type.
+
+The SchemaInfo for an event has meta-type "event", and variant member
+"arg-type". On the wire, a "data" member that the server passes in an
+event conforms to the object type named by "arg-type".
+
+If the event carries no additional information, "arg-type" names an
+object type without members. The event may not have a data member on
+the wire then.
+
+Each command or event defined with dictionary-valued 'data' in the
+QAPI schema implicitly defines an object type.
+
+Example: the SchemaInfo for EVENT_C from section Events
+
+ { "name": "EVENT_C", "meta-type": "event",
+ "arg-type": "q_obj-EVENT_C-arg" }
+
+ Type "q_obj-EVENT_C-arg" is an implicitly defined object type with
+ the two members from the event's definition.
+
+The SchemaInfo for struct and union types has meta-type "object".
+
+The SchemaInfo for a struct type has variant member "members".
+
+The SchemaInfo for a union type additionally has variant members "tag"
+and "variants".
+
+"members" is a JSON array describing the object's common members, if
+any. Each element is a JSON object with members "name" (the member's
+name), "type" (the name of its type), and optionally "default". The
+member is optional if "default" is present. Currently, "default" can
+only have value null. Other values are reserved for future
+extensions. The "members" array is in no particular order; clients
+must search the entire object when learning whether a particular
+member is supported.
+
+Example: the SchemaInfo for MyType from section Struct types
+
+ { "name": "MyType", "meta-type": "object",
+ "members": [
+ { "name": "member1", "type": "str" },
+ { "name": "member2", "type": "int" },
+ { "name": "member3", "type": "str", "default": null } ] }
+
+"tag" is the name of the common member serving as type tag.
+"variants" is a JSON array describing the object's variant members.
+Each element is a JSON object with members "case" (the value of type
+tag this element applies to) and "type" (the name of an object type
+that provides the variant members for this type tag value). The
+"variants" array is in no particular order, and is not guaranteed to
+list cases in the same order as the corresponding "tag" enum type.
+
+Example: the SchemaInfo for flat union BlockdevOptions from section
+Union types
+
+ { "name": "BlockdevOptions", "meta-type": "object",
+ "members": [
+ { "name": "driver", "type": "BlockdevDriver" },
+ { "name": "read-only", "type": "bool", "default": null } ],
+ "tag": "driver",
+ "variants": [
+ { "case": "file", "type": "BlockdevOptionsFile" },
+ { "case": "qcow2", "type": "BlockdevOptionsQcow2" } ] }
+
+Note that base types are "flattened": their members are included in the
+"members" array.
+
+A simple union implicitly defines an enumeration type for its implicit
+discriminator (called "type" on the wire, see section Union types).
+
+A simple union implicitly defines an object type for each of its
+variants.
+
+Example: the SchemaInfo for simple union BlockdevOptionsSimple from section
+Union types
+
+ { "name": "BlockdevOptionsSimple", "meta-type": "object",
+ "members": [
+ { "name": "type", "type": "BlockdevOptionsSimpleKind" } ],
+ "tag": "type",
+ "variants": [
+ { "case": "file", "type": "q_obj-BlockdevOptionsFile-wrapper" },
+ { "case": "qcow2", "type": "q_obj-BlockdevOptionsQcow2-wrapper" } ] }
+
+ Enumeration type "BlockdevOptionsSimpleKind" and the object types
+ "q_obj-BlockdevOptionsFile-wrapper", "q_obj-BlockdevOptionsQcow2-wrapper"
+ are implicitly defined.
+
+The SchemaInfo for an alternate type has meta-type "alternate", and
+variant member "members". "members" is a JSON array. Each element is
+a JSON object with member "type", which names a type. Values of the
+alternate type conform to exactly one of its member types. There is
+no guarantee on the order in which "members" will be listed.
+
+Example: the SchemaInfo for BlockdevRef from section Alternate types
+
+ { "name": "BlockdevRef", "meta-type": "alternate",
+ "members": [
+ { "type": "BlockdevOptions" },
+ { "type": "str" } ] }
+
+The SchemaInfo for an array type has meta-type "array", and variant
+member "element-type", which names the array's element type. Array
+types are implicitly defined. For convenience, the array's name may
+resemble the element type; however, clients should examine member
+"element-type" instead of making assumptions based on parsing member
+"name".
+
+Example: the SchemaInfo for ['str']
+
+ { "name": "[str]", "meta-type": "array",
+ "element-type": "str" }
+
+The SchemaInfo for an enumeration type has meta-type "enum" and
+variant member "values". The values are listed in no particular
+order; clients must search the entire enum when learning whether a
+particular value is supported.
+
+Example: the SchemaInfo for MyEnum from section Enumeration types
+
+ { "name": "MyEnum", "meta-type": "enum",
+ "values": [ "value1", "value2", "value3" ] }
+
+The SchemaInfo for a built-in type has the same name as the type in
+the QAPI schema (see section Built-in Types), with one exception
+detailed below. It has variant member "json-type" that shows how
+values of this type are encoded on the wire.
+
+Example: the SchemaInfo for str
+
+ { "name": "str", "meta-type": "builtin", "json-type": "string" }
+
+The QAPI schema supports a number of integer types that only differ in
+how they map to C. They are identical as far as SchemaInfo is
+concerned. Therefore, they all get mapped to a single type "int" in
+SchemaInfo.
+
+As explained above, type names are not part of the wire ABI. Not even
+the names of built-in types. Clients should examine member
+"json-type" instead of hard-coding names of built-in types.
+
+
+== Code generation ==
+
+Schemas are fed into five scripts to generate all the code/files that,
+paired with the core QAPI libraries, comprise everything required to
+take JSON commands read in by a Client JSON Protocol server, unmarshal
+the arguments into the underlying C types, call into the corresponding
+C function, map the response back to a Client JSON Protocol response
+to be returned to the user, and introspect the commands.
+
+As an example, we'll use the following schema, which describes a
+single complex user-defined type, along with a command which takes a
+list of that type as a parameter, and returns a single element of that
+type. The user is responsible for writing the implementation of
+qmp_my_command(); everything else is produced by the generator.
+
+ $ cat example-schema.json
+ { 'struct': 'UserDefOne',
+ 'data': { 'integer': 'int', '*string': 'str' } }
+
+ { 'command': 'my-command',
+ 'data': { 'arg1': ['UserDefOne'] },
+ 'returns': 'UserDefOne' }
+
+ { 'event': 'MY_EVENT' }
+
+For a more thorough look at generated code, the testsuite includes
+tests/qapi-schema/qapi-schema-tests.json that covers more examples of
+what the generator will accept, and compiles the resulting C code as
+part of 'make check-unit'.
+
+=== scripts/qapi-types.py ===
+
+Used to generate the C types defined by a schema, along with
+supporting code. The following files are created:
+
+$(prefix)qapi-types.h - C types corresponding to types defined in
+ the schema you pass in
+$(prefix)qapi-types.c - Cleanup functions for the above C types
+
+The $(prefix) is an optional parameter used as a namespace to keep the
+generated code from one schema/code-generation separated from others so code
+can be generated/used from multiple schemas without clobbering previously
+created code.
+
+Example:
+
+ $ python scripts/qapi-types.py --output-dir="qapi-generated" \
+ --prefix="example-" example-schema.json
+ $ cat qapi-generated/example-qapi-types.h
+[Uninteresting stuff omitted...]
+
+ #ifndef EXAMPLE_QAPI_TYPES_H
+ #define EXAMPLE_QAPI_TYPES_H
+
+[Built-in types omitted...]
+
+ typedef struct UserDefOne UserDefOne;
+
+ typedef struct UserDefOneList UserDefOneList;
+
+ struct UserDefOne {
+ int64_t integer;
+ bool has_string;
+ char *string;
+ };
+
+ void qapi_free_UserDefOne(UserDefOne *obj);
+
+ struct UserDefOneList {
+ UserDefOneList *next;
+ UserDefOne *value;
+ };
+
+ void qapi_free_UserDefOneList(UserDefOneList *obj);
+
+ #endif
+ $ cat qapi-generated/example-qapi-types.c
+[Uninteresting stuff omitted...]
+
+ void qapi_free_UserDefOne(UserDefOne *obj)
+ {
+ Visitor *v;
+
+ if (!obj) {
+ return;
+ }
+
+ v = qapi_dealloc_visitor_new();
+ visit_type_UserDefOne(v, NULL, &obj, NULL);
+ visit_free(v);
+ }
+
+ void qapi_free_UserDefOneList(UserDefOneList *obj)
+ {
+ Visitor *v;
+
+ if (!obj) {
+ return;
+ }
+
+ v = qapi_dealloc_visitor_new();
+ visit_type_UserDefOneList(v, NULL, &obj, NULL);
+ visit_free(v);
+ }
+
+=== scripts/qapi-visit.py ===
+
+Used to generate the visitor functions used to walk through and
+convert between a native QAPI C data structure and some other format
+(such as QObject); the generated functions are named visit_type_FOO()
+and visit_type_FOO_members().
+
+The following files are generated:
+
+$(prefix)qapi-visit.c: visitor function for a particular C type, used
+ to automagically convert QObjects into the
+ corresponding C type and vice-versa, as well
+ as for deallocating memory for an existing C
+ type
+
+$(prefix)qapi-visit.h: declarations for previously mentioned visitor
+ functions
+
+Example:
+
+ $ python scripts/qapi-visit.py --output-dir="qapi-generated" \
+ --prefix="example-" example-schema.json
+ $ cat qapi-generated/example-qapi-visit.h
+[Uninteresting stuff omitted...]
+
+ #ifndef EXAMPLE_QAPI_VISIT_H
+ #define EXAMPLE_QAPI_VISIT_H
+
+[Visitors for built-in types omitted...]
+
+ void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp);
+ void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp);
+ void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp);
+
+ #endif
+ $ cat qapi-generated/example-qapi-visit.c
+[Uninteresting stuff omitted...]
+
+ void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp)
+ {
+ Error *err = NULL;
+
+ visit_type_int(v, "integer", &obj->integer, &err);
+ if (err) {
+ goto out;
+ }
+ if (visit_optional(v, "string", &obj->has_string)) {
+ visit_type_str(v, "string", &obj->string, &err);
+ if (err) {
+ goto out;
+ }
+ }
+
+ out:
+ error_propagate(errp, err);
+ }
+
+ void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp)
+ {
+ Error *err = NULL;
+
+ visit_start_struct(v, name, (void **)obj, sizeof(UserDefOne), &err);
+ if (err) {
+ goto out;
+ }
+ if (!*obj) {
+ goto out_obj;
+ }
+ visit_type_UserDefOne_members(v, *obj, &err);
+ if (err) {
+ goto out_obj;
+ }
+ visit_check_struct(v, &err);
+ out_obj:
+ visit_end_struct(v, (void **)obj);
+ if (err && visit_is_input(v)) {
+ qapi_free_UserDefOne(*obj);
+ *obj = NULL;
+ }
+ out:
+ error_propagate(errp, err);
+ }
+
+ void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp)
+ {
+ Error *err = NULL;
+ UserDefOneList *tail;
+ size_t size = sizeof(**obj);
+
+ visit_start_list(v, name, (GenericList **)obj, size, &err);
+ if (err) {
+ goto out;
+ }
+
+ for (tail = *obj; tail;
+ tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) {
+ visit_type_UserDefOne(v, NULL, &tail->value, &err);
+ if (err) {
+ break;
+ }
+ }
+
+ visit_end_list(v, (void **)obj);
+ if (err && visit_is_input(v)) {
+ qapi_free_UserDefOneList(*obj);
+ *obj = NULL;
+ }
+ out:
+ error_propagate(errp, err);
+ }
+
+=== scripts/qapi-commands.py ===
+
+Used to generate the marshaling/dispatch functions for the commands
+defined in the schema. The generated code implements
+qmp_marshal_COMMAND() (registered automatically), and declares
+qmp_COMMAND() that the user must implement. The following files are
+generated:
+
+$(prefix)qmp-marshal.c: command marshal/dispatch functions for each
+ QMP command defined in the schema. Functions
+ generated by qapi-visit.py are used to
+ convert QObjects received from the wire into
+ function parameters, and uses the same
+ visitor functions to convert native C return
+ values to QObjects for transmission back
+ over the wire.
+
+$(prefix)qmp-commands.h: Function prototypes for the QMP commands
+ specified in the schema.
+
+Example:
+
+ $ python scripts/qapi-commands.py --output-dir="qapi-generated" \
+ --prefix="example-" example-schema.json
+ $ cat qapi-generated/example-qmp-commands.h
+[Uninteresting stuff omitted...]
+
+ #ifndef EXAMPLE_QMP_COMMANDS_H
+ #define EXAMPLE_QMP_COMMANDS_H
+
+ #include "example-qapi-types.h"
+ #include "qapi/qmp/qdict.h"
+ #include "qapi/error.h"
+
+ UserDefOne *qmp_my_command(UserDefOneList *arg1, Error **errp);
+
+ #endif
+ $ cat qapi-generated/example-qmp-marshal.c
+[Uninteresting stuff omitted...]
+
+ static void qmp_marshal_output_UserDefOne(UserDefOne *ret_in, QObject **ret_out, Error **errp)
+ {
+ Error *err = NULL;
+ Visitor *v;
+
+ v = qobject_output_visitor_new(ret_out);
+ visit_type_UserDefOne(v, "unused", &ret_in, &err);
+ if (!err) {
+ visit_complete(v, ret_out);
+ }
+ error_propagate(errp, err);
+ visit_free(v);
+ v = qapi_dealloc_visitor_new();
+ visit_type_UserDefOne(v, "unused", &ret_in, NULL);
+ visit_free(v);
+ }
+
+ static void qmp_marshal_my_command(QDict *args, QObject **ret, Error **errp)
+ {
+ Error *err = NULL;
+ UserDefOne *retval;
+ Visitor *v;
+ UserDefOneList *arg1 = NULL;
+
+ v = qobject_input_visitor_new(QOBJECT(args));
+ visit_start_struct(v, NULL, NULL, 0, &err);
+ if (err) {
+ goto out;
+ }
+ visit_type_UserDefOneList(v, "arg1", &arg1, &err);
+ if (!err) {
+ visit_check_struct(v, &err);
+ }
+ visit_end_struct(v, NULL);
+ if (err) {
+ goto out;
+ }
+
+ retval = qmp_my_command(arg1, &err);
+ if (err) {
+ goto out;
+ }
+
+ qmp_marshal_output_UserDefOne(retval, ret, &err);
+
+ out:
+ error_propagate(errp, err);
+ visit_free(v);
+ v = qapi_dealloc_visitor_new();
+ visit_start_struct(v, NULL, NULL, 0, NULL);
+ visit_type_UserDefOneList(v, "arg1", &arg1, NULL);
+ visit_end_struct(v, NULL);
+ visit_free(v);
+ }
+
+ static void qmp_init_marshal(void)
+ {
+ qmp_register_command("my-command", qmp_marshal_my_command, QCO_NO_OPTIONS);
+ }
+
+ qapi_init(qmp_init_marshal);
+
+=== scripts/qapi-event.py ===
+
+Used to generate the event-related C code defined by a schema, with
+implementations for qapi_event_send_FOO(). The following files are
+created:
+
+$(prefix)qapi-event.h - Function prototypes for each event type, plus an
+ enumeration of all event names
+$(prefix)qapi-event.c - Implementation of functions to send an event
+
+Example:
+
+ $ python scripts/qapi-event.py --output-dir="qapi-generated" \
+ --prefix="example-" example-schema.json
+ $ cat qapi-generated/example-qapi-event.h
+[Uninteresting stuff omitted...]
+
+ #ifndef EXAMPLE_QAPI_EVENT_H
+ #define EXAMPLE_QAPI_EVENT_H
+
+ #include "qapi/error.h"
+ #include "qapi/qmp/qdict.h"
+ #include "example-qapi-types.h"
+
+
+ void qapi_event_send_my_event(Error **errp);
+
+ typedef enum example_QAPIEvent {
+ EXAMPLE_QAPI_EVENT_MY_EVENT = 0,
+ EXAMPLE_QAPI_EVENT__MAX = 1,
+ } example_QAPIEvent;
+
+ extern const char *const example_QAPIEvent_lookup[];
+
+ #endif
+ $ cat qapi-generated/example-qapi-event.c
+[Uninteresting stuff omitted...]
+
+ void qapi_event_send_my_event(Error **errp)
+ {
+ QDict *qmp;
+ Error *err = NULL;
+ QMPEventFuncEmit emit;
+ emit = qmp_event_get_func_emit();
+ if (!emit) {
+ return;
+ }
+
+ qmp = qmp_event_build_dict("MY_EVENT");
+
+ emit(EXAMPLE_QAPI_EVENT_MY_EVENT, qmp, &err);
+
+ error_propagate(errp, err);
+ QDECREF(qmp);
+ }
+
+ const char *const example_QAPIEvent_lookup[] = {
+ [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT",
+ [EXAMPLE_QAPI_EVENT__MAX] = NULL,
+ };
+
+=== scripts/qapi-introspect.py ===
+
+Used to generate the introspection C code for a schema. The following
+files are created:
+
+$(prefix)qmp-introspect.c - Defines a string holding a JSON
+ description of the schema.
+$(prefix)qmp-introspect.h - Declares the above string.
+
+Example:
+
+ $ python scripts/qapi-introspect.py --output-dir="qapi-generated" \
+ --prefix="example-" example-schema.json
+ $ cat qapi-generated/example-qmp-introspect.h
+[Uninteresting stuff omitted...]
+
+ #ifndef EXAMPLE_QMP_INTROSPECT_H
+ #define EXAMPLE_QMP_INTROSPECT_H
+
+ extern const char example_qmp_schema_json[];
+
+ #endif
+ $ cat qapi-generated/example-qmp-introspect.c
+[Uninteresting stuff omitted...]
+
+ const char example_qmp_schema_json[] = "["
+ "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, "
+ "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, "
+ "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, "
+ "{\"members\": [{\"name\": \"arg1\", \"type\": \"[2]\"}], \"meta-type\": \"object\", \"name\": \"1\"}, "
+ "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"default\": null, \"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, "
+ "{\"element-type\": \"2\", \"meta-type\": \"array\", \"name\": \"[2]\"}, "
+ "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, "
+ "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]";
--- /dev/null
+Using RCU (Read-Copy-Update) for synchronization
+================================================
+
+Read-copy update (RCU) is a synchronization mechanism that is used to
+protect read-mostly data structures. RCU is very efficient and scalable
+on the read side (it is wait-free), and thus can make the read paths
+extremely fast.
+
+RCU supports concurrency between a single writer and multiple readers,
+thus it is not used alone. Typically, the write-side will use a lock to
+serialize multiple updates, but other approaches are possible (e.g.,
+restricting updates to a single task). In QEMU, when a lock is used,
+this will often be the "iothread mutex", also known as the "big QEMU
+lock" (BQL). Also, restricting updates to a single task is done in
+QEMU using the "bottom half" API.
+
+RCU is fundamentally a "wait-to-finish" mechanism. The read side marks
+sections of code with "critical sections", and the update side will wait
+for the execution of all *currently running* critical sections before
+proceeding, or before asynchronously executing a callback.
+
+The key point here is that only the currently running critical sections
+are waited for; critical sections that are started _after_ the beginning
+of the wait do not extend the wait, despite running concurrently with
+the updater. This is the reason why RCU is more scalable than,
+for example, reader-writer locks. It is so much more scalable that
+the system will have a single instance of the RCU mechanism; a single
+mechanism can be used for an arbitrary number of "things", without
+having to worry about things such as contention or deadlocks.
+
+How is this possible? The basic idea is to split updates in two phases,
+"removal" and "reclamation". During removal, we ensure that subsequent
+readers will not be able to get a reference to the old data. After
+removal has completed, a critical section will not be able to access
+the old data. Therefore, critical sections that begin after removal
+do not matter; as soon as all previous critical sections have finished,
+there cannot be any readers who hold references to the data structure,
+and these can now be safely reclaimed (e.g., freed or unref'ed).
+
+Here is a picture:
+
+ thread 1 thread 2 thread 3
+ ------------------- ------------------------ -------------------
+ enter RCU crit.sec.
+ | finish removal phase
+ | begin wait
+ | | enter RCU crit.sec.
+ exit RCU crit.sec | |
+ complete wait |
+ begin reclamation phase |
+ exit RCU crit.sec.
+
+
+Note how thread 3 is still executing its critical section when thread 2
+starts reclaiming data. This is possible, because the old version of the
+data structure was not accessible at the time thread 3 began executing
+that critical section.
+
+
+RCU API
+=======
+
+The core RCU API is small:
+
+ void rcu_read_lock(void);
+
+ Used by a reader to inform the reclaimer that the reader is
+ entering an RCU read-side critical section.
+
+ void rcu_read_unlock(void);
+
+ Used by a reader to inform the reclaimer that the reader is
+ exiting an RCU read-side critical section. Note that RCU
+ read-side critical sections may be nested and/or overlapping.
+
+ void synchronize_rcu(void);
+
+ Blocks until all pre-existing RCU read-side critical sections
+ on all threads have completed. This marks the end of the removal
+ phase and the beginning of the reclamation phase.
+
+ Note that it would be valid for another update to come while
+ synchronize_rcu is running. Because of this, it is better that
+ the updater releases any locks it may hold before calling
+ synchronize_rcu. If this is not possible (for example, because
+ the updater is protected by the BQL), you can use call_rcu.
+
+ void call_rcu1(struct rcu_head * head,
+ void (*func)(struct rcu_head *head));
+
+ This function invokes func(head) after all pre-existing RCU
+ read-side critical sections on all threads have completed. This
+ marks the end of the removal phase, with func taking care
+ asynchronously of the reclamation phase.
+
+ The foo struct needs to have an rcu_head structure added,
+ perhaps as follows:
+
+ struct foo {
+ struct rcu_head rcu;
+ int a;
+ char b;
+ long c;
+ };
+
+ so that the reclaimer function can fetch the struct foo address
+ and free it:
+
+ call_rcu1(&foo.rcu, foo_reclaim);
+
+ void foo_reclaim(struct rcu_head *rp)
+ {
+ struct foo *fp = container_of(rp, struct foo, rcu);
+ g_free(fp);
+ }
+
+ For the common case where the rcu_head member is the first of the
+ struct, you can use the following macros.
+
+ void call_rcu(T *p,
+ void (*func)(T *p),
+ field-name);
+ void g_free_rcu(T *p,
+ field-name);
+
+ call_rcu1 is typically used through these macros, in the common case
+ where the "struct rcu_head" is the first field in the struct. If
+ the callback function is g_free, in particular, g_free_rcu can be
+ used. In the above case, one could have written simply:
+
+ g_free_rcu(&foo, rcu);
+
+ typeof(*p) atomic_rcu_read(p);
+
+ atomic_rcu_read() is similar to atomic_mb_read(), but it makes
+ some assumptions on the code that calls it. This allows a more
+ optimized implementation.
+
+ atomic_rcu_read assumes that whenever a single RCU critical
+ section reads multiple shared data, these reads are either
+ data-dependent or need no ordering. This is almost always the
+ case when using RCU, because read-side critical sections typically
+ navigate one or more pointers (the pointers that are changed on
+ every update) until reaching a data structure of interest,
+ and then read from there.
+
+ RCU read-side critical sections must use atomic_rcu_read() to
+ read data, unless concurrent writes are prevented by another
+ synchronization mechanism.
+
+ Furthermore, RCU read-side critical sections should traverse the
+ data structure in a single direction, opposite to the direction
+ in which the updater initializes it.
+
+ void atomic_rcu_set(p, typeof(*p) v);
+
+ atomic_rcu_set() is also similar to atomic_mb_set(), and it also
+ makes assumptions on the code that calls it in order to allow a more
+ optimized implementation.
+
+ In particular, atomic_rcu_set() suffices for synchronization
+ with readers, if the updater never mutates a field within a
+ data item that is already accessible to readers. This is the
+ case when initializing a new copy of the RCU-protected data
+ structure; just ensure that initialization of *p is carried out
+ before atomic_rcu_set() makes the data item visible to readers.
+ If this rule is observed, writes will happen in the opposite
+ order as reads in the RCU read-side critical sections (or if
+ there is just one update), and there will be no need for other
+ synchronization mechanism to coordinate the accesses.
+
+The following APIs must be used before RCU is used in a thread:
+
+ void rcu_register_thread(void);
+
+ Mark a thread as taking part in the RCU mechanism. Such a thread
+ will have to report quiescent points regularly, either manually
+ or through the QemuCond/QemuSemaphore/QemuEvent APIs.
+
+ void rcu_unregister_thread(void);
+
+ Mark a thread as not taking part anymore in the RCU mechanism.
+ It is not a problem if such a thread reports quiescent points,
+ either manually or by using the QemuCond/QemuSemaphore/QemuEvent
+ APIs.
+
+Note that these APIs are relatively heavyweight, and should _not_ be
+nested.
+
+
+DIFFERENCES WITH LINUX
+======================
+
+- Waiting on a mutex is possible, though discouraged, within an RCU critical
+ section. This is because spinlocks are rarely (if ever) used in userspace
+ programming; not allowing this would prevent upgrading an RCU read-side
+ critical section to become an updater.
+
+- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and
+ rcu_assign_pointer. They take a _pointer_ to the variable being accessed.
+
+- call_rcu is a macro that has an extra argument (the name of the first
+ field in the struct, which must be a struct rcu_head), and expects the
+ type of the callback's argument to be the type of the first argument.
+ call_rcu1 is the same as Linux's call_rcu.
+
+
+RCU PATTERNS
+============
+
+Many patterns using reader-writer locks translate directly to RCU, with
+the advantages of higher scalability and deadlock immunity.
+
+In general, RCU can be used whenever it is possible to create a new
+"version" of a data structure every time the updater runs. This may
+sound like a very strict restriction, however:
+
+- the updater does not mean "everything that writes to a data structure",
+ but rather "everything that involves a reclamation step". See the
+ array example below
+
+- in some cases, creating a new version of a data structure may actually
+ be very cheap. For example, modifying the "next" pointer of a singly
+ linked list is effectively creating a new version of the list.
+
+Here are some frequently-used RCU idioms that are worth noting.
+
+
+RCU list processing
+-------------------
+
+TBD (not yet used in QEMU)
+
+
+RCU reference counting
+----------------------
+
+Because grace periods are not allowed to complete while there is an RCU
+read-side critical section in progress, the RCU read-side primitives
+may be used as a restricted reference-counting mechanism. For example,
+consider the following code fragment:
+
+ rcu_read_lock();
+ p = atomic_rcu_read(&foo);
+ /* do something with p. */
+ rcu_read_unlock();
+
+The RCU read-side critical section ensures that the value of "p" remains
+valid until after the rcu_read_unlock(). In some sense, it is acquiring
+a reference to p that is later released when the critical section ends.
+The write side looks simply like this (with appropriate locking):
+
+ qemu_mutex_lock(&foo_mutex);
+ old = foo;
+ atomic_rcu_set(&foo, new);
+ qemu_mutex_unlock(&foo_mutex);
+ synchronize_rcu();
+ free(old);
+
+If the processing cannot be done purely within the critical section, it
+is possible to combine this idiom with a "real" reference count:
+
+ rcu_read_lock();
+ p = atomic_rcu_read(&foo);
+ foo_ref(p);
+ rcu_read_unlock();
+ /* do something with p. */
+ foo_unref(p);
+
+The write side can be like this:
+
+ qemu_mutex_lock(&foo_mutex);
+ old = foo;
+ atomic_rcu_set(&foo, new);
+ qemu_mutex_unlock(&foo_mutex);
+ synchronize_rcu();
+ foo_unref(old);
+
+or with call_rcu:
+
+ qemu_mutex_lock(&foo_mutex);
+ old = foo;
+ atomic_rcu_set(&foo, new);
+ qemu_mutex_unlock(&foo_mutex);
+ call_rcu(old, foo_unref, rcu);
+
+In both cases, the write side only performs removal. Reclamation
+happens when the last reference to a "foo" object is dropped.
+Using synchronize_rcu() is undesirably expensive, because the
+last reference may be dropped on the read side. Hence you can
+use call_rcu() instead:
+
+ foo_unref(struct foo *p) {
+ if (atomic_fetch_dec(&p->refcount) == 1) {
+ call_rcu(p, foo_destroy, rcu);
+ }
+ }
+
+
+Note that the same idioms would be possible with reader/writer
+locks:
+
+ read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock);
+ p = foo; p = foo;
+ /* do something with p. */ foo = new;
+ read_unlock(&foo_rwlock); free(p);
+ write_mutex_unlock(&foo_rwlock);
+ free(p);
+
+ ------------------------------------------------------------------
+
+ read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock);
+ p = foo; old = foo;
+ foo_ref(p); foo = new;
+ read_unlock(&foo_rwlock); foo_unref(old);
+ /* do something with p. */ write_mutex_unlock(&foo_rwlock);
+ read_lock(&foo_rwlock);
+ foo_unref(p);
+ read_unlock(&foo_rwlock);
+
+foo_unref could use a mechanism such as bottom halves to move deallocation
+out of the write-side critical section.
+
+
+RCU resizable arrays
+--------------------
+
+Resizable arrays can be used with RCU. The expensive RCU synchronization
+(or call_rcu) only needs to take place when the array is resized.
+The two items to take care of are:
+
+- ensuring that the old version of the array is available between removal
+ and reclamation;
+
+- avoiding mismatches in the read side between the array data and the
+ array size.
+
+The first problem is avoided simply by not using realloc. Instead,
+each resize will allocate a new array and copy the old data into it.
+The second problem would arise if the size and the data pointers were
+two members of a larger struct:
+
+ struct mystuff {
+ ...
+ int data_size;
+ int data_alloc;
+ T *data;
+ ...
+ };
+
+Instead, we store the size of the array with the array itself:
+
+ struct arr {
+ int size;
+ int alloc;
+ T data[];
+ };
+ struct arr *global_array;
+
+ read side:
+ rcu_read_lock();
+ struct arr *array = atomic_rcu_read(&global_array);
+ x = i < array->size ? array->data[i] : -1;
+ rcu_read_unlock();
+ return x;
+
+ write side (running under a lock):
+ if (global_array->size == global_array->alloc) {
+ /* Creating a new version. */
+ new_array = g_malloc(sizeof(struct arr) +
+ global_array->alloc * 2 * sizeof(T));
+ new_array->size = global_array->size;
+ new_array->alloc = global_array->alloc * 2;
+ memcpy(new_array->data, global_array->data,
+ global_array->alloc * sizeof(T));
+
+ /* Removal phase. */
+ old_array = global_array;
+ atomic_rcu_set(&global_array, new_array);
+ synchronize_rcu();
+
+ /* Reclamation phase. */
+ free(old_array);
+ }
+
+
+SOURCES
+=======
+
+* Documentation/RCU/ from the Linux kernel
--- /dev/null
+= Tracing =
+
+== Introduction ==
+
+This document describes the tracing infrastructure in QEMU and how to use it
+for debugging, profiling, and observing execution.
+
+== Quickstart ==
+
+1. Build with the 'simple' trace backend:
+
+ ./configure --enable-trace-backends=simple
+ make
+
+2. Create a file with the events you want to trace:
+
+ echo bdrv_aio_readv > /tmp/events
+ echo bdrv_aio_writev >> /tmp/events
+
+3. Run the virtual machine to produce a trace file:
+
+ qemu -trace events=/tmp/events ... # your normal QEMU invocation
+
+4. Pretty-print the binary trace file:
+
+ ./scripts/simpletrace.py trace-events-all trace-* # Override * with QEMU <pid>
+
+== Trace events ==
+
+=== Sub-directory setup ===
+
+Each directory in the source tree can declare a set of static trace events
+in a local "trace-events" file. All directories which contain "trace-events"
+files must be listed in the "trace-events-subdirs" make variable in the top
+level Makefile.objs. During build, the "trace-events" file in each listed
+subdirectory will be processed by the "tracetool" script to generate code for
+the trace events.
+
+The individual "trace-events" files are merged into a "trace-events-all" file,
+which is also installed into "/usr/share/qemu" with the name "trace-events".
+This merged file is to be used by the "simpletrace.py" script to later analyse
+traces in the simpletrace data format.
+
+In the sub-directory the following files will be automatically generated
+
+ - trace.c - the trace event state declarations
+ - trace.h - the trace event enums and probe functions
+ - trace-dtrace.h - DTrace event probe specification
+ - trace-dtrace.dtrace - DTrace event probe helper declaration
+ - trace-dtrace.o - binary DTrace provider (generated by dtrace)
+ - trace-ust.h - UST event probe helper declarations
+
+Source files in the sub-directory should #include the local 'trace.h' file,
+without any sub-directory path prefix. E.g., io/channel-buffer.c would do
+
+ #include "trace.h"
+
+to access the 'io/trace.h' file. While it is possible to include a trace.h
+file from outside a source file's own sub-directory, this is discouraged in
+general. It is strongly preferred that all events be declared directly in
+the sub-directory that uses them. The only exception is where there are some
+shared trace events defined in the top level directory trace-events file.
+The top level directory generates trace files with a filename prefix of
+"trace-root" instead of just "trace". This is to avoid ambiguity between
+a trace.h in the current directory, vs the top level directory.
+
+=== Using trace events ===
+
+Trace events are invoked directly from source code like this:
+
+ #include "trace.h" /* needed for trace event prototype */
+
+ void *qemu_vmalloc(size_t size)
+ {
+ void *ptr;
+ size_t align = QEMU_VMALLOC_ALIGN;
+
+ if (size < align) {
+ align = getpagesize();
+ }
+ ptr = qemu_memalign(align, size);
+ trace_qemu_vmalloc(size, ptr);
+ return ptr;
+ }
+
+=== Declaring trace events ===
+
+The "tracetool" script produces the trace.h header file which is included by
+every source file that uses trace events. Since many source files include
+trace.h, it uses a minimum of types and includes a minimum of other header
+files, to keep the namespace clean and compile times and dependencies down.
+
+Trace events should use types as follows:
+
+ * Use stdint.h types for fixed-size types. Most offsets and guest memory
+ addresses are best represented with uint32_t or uint64_t. Use fixed-size
+ types over primitive types whose size may change depending on the host
+ (32-bit versus 64-bit) so trace events don't truncate values or break
+ the build.
+
+ * Use void * for pointers to structs or for arrays. The trace.h header
+ cannot include all user-defined struct declarations and it is therefore
+ necessary to use void * for pointers to structs.
+
+ * For everything else, use primitive scalar types (char, int, long) with the
+ appropriate signedness.
+
+Format strings should reflect the types defined in the trace event. Take
+special care to use PRId64 and PRIu64 for int64_t and uint64_t types,
+respectively. This ensures portability between 32- and 64-bit platforms.
+
+Each event declaration will start with the event name, then its arguments,
+finally a format string for pretty-printing. For example:
+
+ qemu_vmalloc(size_t size, void *ptr) "size %zu ptr %p"
+ qemu_vfree(void *ptr) "ptr %p"
+
+
+=== Hints for adding new trace events ===
+
+1. Trace state changes in the code. Interesting points in the code usually
+ involve a state change like starting, stopping, allocating, freeing. State
+ changes are good trace events because they can be used to understand the
+ execution of the system.
+
+2. Trace guest operations. Guest I/O accesses like reading device registers
+ are good trace events because they can be used to understand guest
+ interactions.
+
+3. Use correlator fields so the context of an individual line of trace output
+ can be understood. For example, trace the pointer returned by malloc and
+ used as an argument to free. This way mallocs and frees can be matched up.
+ Trace events with no context are not very useful.
+
+4. Name trace events after their function. If there are multiple trace events
+ in one function, append a unique distinguisher at the end of the name.
+
+== Generic interface and monitor commands ==
+
+You can programmatically query and control the state of trace events through a
+backend-agnostic interface provided by the header "trace/control.h".
+
+Note that some of the backends do not provide an implementation for some parts
+of this interface, in which case QEMU will just print a warning (please refer to
+header "trace/control.h" to see which routines are backend-dependent).
+
+The state of events can also be queried and modified through monitor commands:
+
+* info trace-events
+ View available trace events and their state. State 1 means enabled, state 0
+ means disabled.
+
+* trace-event NAME on|off
+ Enable/disable a given trace event or a group of events (using wildcards).
+
+The "-trace events=<file>" command line argument can be used to enable the
+events listed in <file> from the very beginning of the program. This file must
+contain one event name per line.
+
+If a line in the "-trace events=<file>" file begins with a '-', the trace event
+will be disabled instead of enabled. This is useful when a wildcard was used
+to enable an entire family of events but one noisy event needs to be disabled.
+
+Wildcard matching is supported in both the monitor command "trace-event" and the
+events list file. That means you can enable/disable the events having a common
+prefix in a batch. For example, virtio-blk trace events could be enabled using
+the following monitor command:
+
+ trace-event virtio_blk_* on
+
+== Trace backends ==
+
+The "tracetool" script automates tedious trace event code generation and also
+keeps the trace event declarations independent of the trace backend. The trace
+events are not tightly coupled to a specific trace backend, such as LTTng or
+SystemTap. Support for trace backends can be added by extending the "tracetool"
+script.
+
+The trace backends are chosen at configure time:
+
+ ./configure --enable-trace-backends=simple
+
+For a list of supported trace backends, try ./configure --help or see below.
+If multiple backends are enabled, the trace is sent to them all.
+
+If no backends are explicitly selected, configure will default to the
+"log" backend.
+
+The following subsections describe the supported trace backends.
+
+=== Nop ===
+
+The "nop" backend generates empty trace event functions so that the compiler
+can optimize out trace events completely. This imposes no performance
+penalty.
+
+Note that regardless of the selected trace backend, events with the "disable"
+property will be generated with the "nop" backend.
+
+=== Log ===
+
+The "log" backend sends trace events directly to standard error. This
+effectively turns trace events into debug printfs.
+
+This is the simplest backend and can be used together with existing code that
+uses DPRINTF().
+
+=== Simpletrace ===
+
+The "simple" backend supports common use cases and comes as part of the QEMU
+source tree. It may not be as powerful as platform-specific or third-party
+trace backends but it is portable. This is the recommended trace backend
+unless you have specific needs for more advanced backends.
+
+=== Ftrace ===
+
+The "ftrace" backend writes trace data to the ftrace marker. This effectively
+sends trace events to the ftrace ring buffer, and you can compare qemu trace
+data and kernel (especially kvm.ko when using KVM) trace data.
+
+If you use KVM, enable kvm events in ftrace:
+
+ # echo 1 > /sys/kernel/debug/tracing/events/kvm/enable
+
+After running qemu as the root user, you can get the trace:
+
+ # cat /sys/kernel/debug/tracing/trace
+
+Restriction: "ftrace" backend is restricted to Linux only.
+
+=== Syslog ===
+
+The "syslog" backend sends trace events using the POSIX syslog API. The log
+is opened specifying the LOG_DAEMON facility and LOG_PID option (so events
+are tagged with the pid of the particular QEMU process that generated
+them). All events are logged at LOG_INFO level.
+
+NOTE: syslog may squash duplicate consecutive trace events and apply rate
+ limiting.
+
+Restriction: "syslog" backend is restricted to POSIX compliant OS.
+
+==== Monitor commands ====
+
+* trace-file on|off|flush|set <path>
+ Enable/disable/flush the trace file or set the trace file name.
+
+==== Analyzing trace files ====
+
+The "simple" backend produces binary trace files that can be formatted with the
+simpletrace.py script. The script takes the "trace-events-all" file and the
+binary trace:
+
+ ./scripts/simpletrace.py trace-events-all trace-12345
+
+You must ensure that the same "trace-events-all" file was used to build QEMU,
+otherwise trace event declarations may have changed and output will not be
+consistent.
+
+=== LTTng Userspace Tracer ===
+
+The "ust" backend uses the LTTng Userspace Tracer library. There are no
+monitor commands built into QEMU, instead UST utilities should be used to list,
+enable/disable, and dump traces.
+
+Package lttng-tools is required for userspace tracing. You must ensure that the
+current user belongs to the "tracing" group, or manually launch the
+lttng-sessiond daemon for the current user prior to running any instance of
+QEMU.
+
+While running an instrumented QEMU, LTTng should be able to list all available
+events:
+
+ lttng list -u
+
+Create tracing session:
+
+ lttng create mysession
+
+Enable events:
+
+ lttng enable-event qemu:g_malloc -u
+
+Where the events can either be a comma-separated list of events, or "-a" to
+enable all tracepoint events. Start and stop tracing as needed:
+
+ lttng start
+ lttng stop
+
+View the trace:
+
+ lttng view
+
+Destroy tracing session:
+
+ lttng destroy
+
+Babeltrace can be used at any later time to view the trace:
+
+ babeltrace $HOME/lttng-traces/mysession-<date>-<time>
+
+=== SystemTap ===
+
+The "dtrace" backend uses DTrace sdt probes but has only been tested with
+SystemTap. When SystemTap support is detected a .stp file with wrapper probes
+is generated to make use in scripts more convenient. This step can also be
+performed manually after a build in order to change the binary name in the .stp
+probes:
+
+ scripts/tracetool.py --backends=dtrace --format=stap \
+ --binary path/to/qemu-binary \
+ --target-type system \
+ --target-name x86_64 \
+ <trace-events-all >qemu.stp
+
+== Trace event properties ==
+
+Each event in the "trace-events-all" file can be prefixed with a space-separated
+list of zero or more of the following event properties.
+
+=== "disable" ===
+
+If a specific trace event is going to be invoked a huge number of times, this
+might have a noticeable performance impact even when the event is
+programmatically disabled.
+
+In this case you should declare such event with the "disable" property. This
+will effectively disable the event at compile time (by using the "nop" backend),
+thus having no performance impact at all on regular builds (i.e., unless you
+edit the "trace-events-all" file).
+
+In addition, there might be cases where relatively complex computations must be
+performed to generate values that are only used as arguments for a trace
+function. In these cases you can use the macro 'TRACE_${EVENT_NAME}_ENABLED' to
+guard such computations and avoid its compilation when the event is disabled:
+
+ #include "trace.h" /* needed for trace event prototype */
+
+ void *qemu_vmalloc(size_t size)
+ {
+ void *ptr;
+ size_t align = QEMU_VMALLOC_ALIGN;
+
+ if (size < align) {
+ align = getpagesize();
+ }
+ ptr = qemu_memalign(align, size);
+ if (TRACE_QEMU_VMALLOC_ENABLED) { /* preprocessor macro */
+ void *complex;
+ /* some complex computations to produce the 'complex' value */
+ trace_qemu_vmalloc(size, ptr, complex);
+ }
+ return ptr;
+ }
+
+You can check at the same time both whether the event has been disabled at
+compile time and whether it is dynamically enabled, using the
+'trace_event_get_state' routine (see header "trace/control.h" for more
+information).
+
+=== "tcg" ===
+
+Guest code generated by TCG can be traced by defining an event with the "tcg"
+event property. Internally, this property generates two events:
+"<eventname>_trans" to trace the event at translation time, and
+"<eventname>_exec" to trace the event at execution time.
+
+Instead of using these two events, you should use the function
+"trace_<eventname>_tcg" during translation (TCG code generation). This function
+will automatically call "trace_<eventname>_trans", and will generate the
+necessary TCG code to call "trace_<eventname>_exec" during guest code execution.
+
+Events with the "tcg" property can be declared in the "trace-events" file with a
+mix of native and TCG types, and "trace_<eventname>_tcg" will gracefully forward
+them to the "<eventname>_trans" and "<eventname>_exec" events. Since TCG values
+are not known at translation time, these are ignored by the "<eventname>_trans"
+event. Because of this, the entry in the "trace-events" file needs two printing
+formats (separated by a comma):
+
+ tcg foo(uint8_t a1, TCGv_i32 a2) "a1=%d", "a1=%d a2=%d"
+
+For example:
+
+ #include "trace-tcg.h"
+
+ void some_disassembly_func (...)
+ {
+ uint8_t a1 = ...;
+ TCGv_i32 a2 = ...;
+ trace_foo_tcg(a1, a2);
+ }
+
+This will immediately call:
+
+ void trace_foo_trans(uint8_t a1);
+
+and will generate the TCG code to call:
+
+ void trace_foo_exec(uint8_t a1, uint32_t a2);
+
+=== "vcpu" ===
+
+Identifies events that trace vCPU-specific information. It implicitly adds a
+"CPUState*" argument, and extends the tracing print format to show the vCPU
+information. If used together with the "tcg" property, it adds a second
+"TCGv_env" argument that must point to the per-target global TCG register that
+points to the vCPU when guest code is executed (usually the "cpu_env" variable).
+
+The "tcg" and "vcpu" properties are currently only honored in the root
+./trace-events file.
+
+The following example events:
+
+ foo(uint32_t a) "a=%x"
+ vcpu bar(uint32_t a) "a=%x"
+ tcg vcpu baz(uint32_t a) "a=%x", "a=%x"
+
+Can be used as:
+
+ #include "trace-tcg.h"
+
+ CPUArchState *env;
+ TCGv_ptr cpu_env;
+
+ void some_disassembly_func(...)
+ {
+ /* trace emitted at this point */
+ trace_foo(0xd1);
+ /* trace emitted at this point */
+ trace_bar(ENV_GET_CPU(env), 0xd2);
+ /* trace emitted at this point (env) and when guest code is executed (cpu_env) */
+ trace_baz_tcg(ENV_GET_CPU(env), cpu_env, 0xd3);
+ }
+
+If the translating vCPU has address 0xc1 and code is later executed by vCPU
+0xc2, this would be an example output:
+
+ // at guest code translation
+ foo a=0xd1
+ bar cpu=0xc1 a=0xd2
+ baz_trans cpu=0xc1 a=0xd3
+ // at guest code execution
+ baz_exec cpu=0xc2 a=0xd3
--- /dev/null
+Virtio devices and migration
+============================
+
+Copyright 2015 IBM Corp.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later. See
+the COPYING file in the top-level directory.
+
+Saving and restoring the state of virtio devices is a bit of a twisty maze,
+for several reasons:
+- state is distributed between several parts:
+ - virtio core, for common fields like features, number of queues, ...
+ - virtio transport (pci, ccw, ...), for the different proxy devices and
+ transport specific state (msix vectors, indicators, ...)
+ - virtio device (net, blk, ...), for the different device types and their
+ state (mac address, request queue, ...)
+- most fields are saved via the stream interface; subsequently, subsections
+ have been added to make cross-version migration possible
+
+This file attempts to document the current procedure and point out some
+caveats.
+
+
+Save state procedure
+====================
+
+virtio core virtio transport virtio device
+----------- ---------------- -------------
+
+ save() function registered
+ via VMState wrapper on
+ device class
+virtio_save() <----------
+ ------> save_config()
+ - save proxy device
+ - save transport-specific
+ device fields
+- save common device
+ fields
+- save common virtqueue
+ fields
+ ------> save_queue()
+ - save transport-specific
+ virtqueue fields
+ ------> save_device()
+ - save device-specific
+ fields
+- save subsections
+ - device endianness,
+ if changed from
+ default endianness
+ - 64 bit features, if
+ any high feature bit
+ is set
+ - virtio-1 virtqueue
+ fields, if VERSION_1
+ is set
+
+
+Load state procedure
+====================
+
+virtio core virtio transport virtio device
+----------- ---------------- -------------
+
+ load() function registered
+ via VMState wrapper on
+ device class
+virtio_load() <----------
+ ------> load_config()
+ - load proxy device
+ - load transport-specific
+ device fields
+- load common device
+ fields
+- load common virtqueue
+ fields
+ ------> load_queue()
+ - load transport-specific
+ virtqueue fields
+- notify guest
+ ------> load_device()
+ - load device-specific
+ fields
+- load subsections
+ - device endianness
+ - 64 bit features
+ - virtio-1 virtqueue
+ fields
+- sanitize endianness
+- sanitize features
+- virtqueue index sanity
+ check
+ - feature-dependent setup
+
+
+Implications of this setup
+==========================
+
+Devices need to be careful in their state processing during load: The
+load_device() procedure is invoked by the core before subsections have
+been loaded. Any code that depends on information transmitted in subsections
+therefore has to be invoked in the device's load() function _after_
+virtio_load() has returned (e.g. code depending on features).
+
+Any extension of the state being migrated should be done in subsections
+added to the core for compatibility reasons. If transport or device specific
+state is added, core needs to invoke a callback from the new subsection.
--- /dev/null
+= How to write QMP commands using the QAPI framework =
+
+This document is a step-by-step guide on how to write new QMP commands using
+the QAPI framework. It also shows how to implement new style HMP commands.
+
+This document doesn't discuss QMP protocol level details, nor does it dive
+into the QAPI framework implementation.
+
+For an in-depth introduction to the QAPI framework, please refer to
+docs/qapi-code-gen.txt. For documentation about the QMP protocol,
+start with docs/qmp-intro.txt.
+
+== Overview ==
+
+Generally speaking, the following steps should be taken in order to write a
+new QMP command.
+
+1. Write the command's and type(s) specification in the QAPI schema file
+ (qapi-schema.json in the root source directory)
+
+2. Write the QMP command itself, which is a regular C function. Preferably,
+ the command should be exported by some QEMU subsystem. But it can also be
+ added to the qmp.c file
+
+3. At this point the command can be tested under the QMP protocol
+
+4. Write the HMP command equivalent. This is not required and should only be
+ done if it does make sense to have the functionality in HMP. The HMP command
+ is implemented in terms of the QMP command
+
+The following sections will demonstrate each of the steps above. We will start
+very simple and get more complex as we progress.
+
+=== Testing ===
+
+For all the examples in the next sections, the test setup is the same and is
+shown here.
+
+First, QEMU should be started as:
+
+# /path/to/your/source/qemu [...] \
+ -chardev socket,id=qmp,port=4444,host=localhost,server \
+ -mon chardev=qmp,mode=control,pretty=on
+
+Then, in a different terminal:
+
+$ telnet localhost 4444
+Trying 127.0.0.1...
+Connected to localhost.
+Escape character is '^]'.
+{
+ "QMP": {
+ "version": {
+ "qemu": {
+ "micro": 50,
+ "minor": 15,
+ "major": 0
+ },
+ "package": ""
+ },
+ "capabilities": [
+ ]
+ }
+}
+
+The above output is the QMP server saying you're connected. The server is
+actually in capabilities negotiation mode. To enter command mode, type:
+
+{ "execute": "qmp_capabilities" }
+
+Then the server should respond:
+
+{
+ "return": {
+ }
+}
+
+Which is QMP's way of saying "the latest command executed OK and didn't return
+any data". Now you're ready to enter the QMP example commands as explained in
+the following sections.
+
+== Writing a command that doesn't return data ==
+
+This is the simplest QMP command that can be written. Usually, this kind of
+command carries some meaningful action in QEMU but here it will just print
+"Hello, world" to the standard output.
+
+Our command will be called "hello-world". It takes no arguments, nor does it
+return any data.
+
+The first step is to add the following line to the bottom of the
+qapi-schema.json file:
+
+{ 'command': 'hello-world' }
+
+The "command" keyword defines a new QMP command. It's a JSON object. All
+schema entries are JSON objects. The line above will instruct the QAPI to
+generate any prototypes and the necessary code to marshal and unmarshal
+protocol data.
+
+The next step is to write the "hello-world" implementation. As explained
+earlier, it's preferable for commands to live in QEMU subsystems. But
+"hello-world" doesn't pertain to any, so we put its implementation in qmp.c:
+
+void qmp_hello_world(Error **errp)
+{
+ printf("Hello, world!\n");
+}
+
+There are a few things to be noticed:
+
+1. QMP command implementation functions must be prefixed with "qmp_"
+2. qmp_hello_world() returns void, this is in accordance with the fact that the
+ command doesn't return any data
+3. It takes an "Error **" argument. This is required. Later we will see how to
+ return errors and take additional arguments. The Error argument should not
+ be touched if the command doesn't return errors
+4. We won't add the function's prototype. That's automatically done by the QAPI
+5. Printing to the terminal is discouraged for QMP commands, we do it here
+ because it's the easiest way to demonstrate a QMP command
+
+You're done. Now build qemu, run it as suggested in the "Testing" section,
+and then type the following QMP command:
+
+{ "execute": "hello-world" }
+
+Then check the terminal running qemu and look for the "Hello, world" string. If
+you don't see it then something went wrong.
+
+=== Arguments ===
+
+Let's add an argument called "message" to our "hello-world" command. The new
+argument will contain the string to be printed to stdout. It's an optional
+argument; if it's not present we print our default "Hello, world" string.
+
+The first change we have to do is to modify the command specification in the
+schema file to the following:
+
+{ 'command': 'hello-world', 'data': { '*message': 'str' } }
+
+Notice the new 'data' member in the schema. It's a JSON object, each of whose
+elements is an argument to the command in question. Also notice the asterisk,
+it's used to mark the argument optional (that means that you shouldn't use it
+for mandatory arguments). Finally, 'str' is the argument's type, which
+stands for "string". The QAPI also supports integers, booleans, enumerations
+and user defined types.
+
+Now, let's update our C implementation in qmp.c:
+
+void qmp_hello_world(bool has_message, const char *message, Error **errp)
+{
+ if (has_message) {
+ printf("%s\n", message);
+ } else {
+ printf("Hello, world\n");
+ }
+}
+
+There are two important details to be noticed:
+
+1. All optional arguments are accompanied by a 'has_' boolean, which is true
+ if the optional argument is present and false otherwise
+2. The C implementation signature must follow the schema's argument ordering,
+ which is defined by the "data" member
+
+Time to test our new version of the "hello-world" command. Build qemu, run it as
+described in the "Testing" section and then send two commands:
+
+{ "execute": "hello-world" }
+{
+ "return": {
+ }
+}
+
+{ "execute": "hello-world", "arguments": { "message": "We love qemu" } }
+{
+ "return": {
+ }
+}
+
+You should see "Hello, world" and "We love qemu" in the terminal running qemu.
+If you don't see these strings, then something went wrong.
+
+=== Errors ===
+
+QMP commands should use the error interface exported by the error.h header
+file. Basically, most errors are set by calling the error_setg() function.
+
+Let's say we don't accept the string "message" to contain the word "love". If
+it does contain it, we want the "hello-world" command to return an error:
+
+void qmp_hello_world(bool has_message, const char *message, Error **errp)
+{
+ if (has_message) {
+ if (strstr(message, "love")) {
+ error_setg(errp, "the word 'love' is not allowed");
+ return;
+ }
+ printf("%s\n", message);
+ } else {
+ printf("Hello, world\n");
+ }
+}
+
+The first argument to the error_setg() function is the Error pointer
+to pointer, which is passed to all QMP functions. The next argument is a human
+description of the error, this is a free-form printf-like string.
+
+Let's test the example above. Build qemu, run it as defined in the "Testing"
+section, and then issue the following command:
+
+{ "execute": "hello-world", "arguments": { "message": "all you need is love" } }
+
+The QMP server's response should be:
+
+{
+ "error": {
+ "class": "GenericError",
+ "desc": "the word 'love' is not allowed"
+ }
+}
+
+As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR
+(done by default when using error_setg()). There are two exceptions to
+this rule:
+
+ 1. A non-generic ErrorClass value exists* for the failure you want to report
+ (eg. DeviceNotFound)
+
+ 2. Management applications have to take special action on the failure you
+ want to report, hence you have to add a new ErrorClass value so that they
+ can check for it
+
+If the failure you want to report falls into one of the two cases above,
+use error_set() with a second argument of an ErrorClass value.
+
+ * All existing ErrorClass values are defined in the qapi-schema.json file
+
+=== Command Documentation ===
+
+There's only one step missing to make "hello-world"'s implementation complete,
+and that's its documentation in the schema file.
+
+This is very important. No QMP command will be accepted in QEMU without proper
+documentation.
+
+There are many examples of such documentation in the schema file already, but
+here goes "hello-world"'s new entry for the qapi-schema.json file:
+
+##
+# @hello-world
+#
+# Print a client provided string to the standard output stream.
+#
+# @message: string to be printed
+#
+# Returns: Nothing on success.
+#
+# Notes: if @message is not provided, the "Hello, world" string will
+# be printed instead
+#
+# Since: <next qemu stable release, eg. 1.0>
+##
+{ 'command': 'hello-world', 'data': { '*message': 'str' } }
+
+Please, note that the "Returns" clause is optional if a command doesn't return
+any data nor any errors.
+
+=== Implementing the HMP command ===
+
+Now that the QMP command is in place, we can also make it available in the human
+monitor (HMP).
+
+With the introduction of the QAPI, HMP commands make QMP calls. Most of the
+time HMP commands are simple wrappers. All HMP command implementations exist in
+the hmp.c file.
+
+Here's the implementation of the "hello-world" HMP command:
+
+void hmp_hello_world(Monitor *mon, const QDict *qdict)
+{
+ const char *message = qdict_get_try_str(qdict, "message");
+ Error *err = NULL;
+
+ qmp_hello_world(!!message, message, &err);
+ if (err) {
+ monitor_printf(mon, "%s\n", error_get_pretty(err));
+ error_free(err);
+ return;
+ }
+}
+
+Also, you have to add the function's prototype to the hmp.h file.
+
+There are three important points to be noticed:
+
+1. The "mon" and "qdict" arguments are mandatory for all HMP functions. The
+ former is the monitor object. The latter is how the monitor passes
+ arguments entered by the user to the command implementation
+2. hmp_hello_world() performs error checking. In this example we just print
+ the error description to the user, but we could do more, like taking
+ different actions depending on the error qmp_hello_world() returns
+3. The "err" variable must be initialized to NULL before performing the
+ QMP call
+
+There's one last step to actually make the command available to monitor users,
+we should add it to the hmp-commands.hx file:
+
+ {
+ .name = "hello-world",
+ .args_type = "message:s?",
+ .params = "hello-world [message]",
+ .help = "Print message to the standard output",
+ .cmd = hmp_hello_world,
+ },
+
+STEXI
+@item hello-world @var{message}
+@findex hello-world
+Print message to the standard output
+ETEXI
+
+To test this you have to open a user monitor and issue the "hello-world"
+command. It might be instructive to check the command's documentation with
+HMP's "help" command.
+
+Please, check the "-monitor" command-line option to know how to open a user
+monitor.
+
+== Writing a command that returns data ==
+
+A QMP command is capable of returning any data the QAPI supports like integers,
+strings, booleans, enumerations and user defined types.
+
+In this section we will focus on user defined types. Please, check the QAPI
+documentation for information about the other types.
+
+=== User Defined Types ===
+
+FIXME This example needs to be redone after commit 6d32717
+
+For this example we will write the query-alarm-clock command, which returns
+information about QEMU's timer alarm. For more information about it, please
+check the "-clock" command-line option.
+
+We want to return two pieces of information. The first one is the alarm clock's
+name. The second one is when the next alarm will fire. The former information is
+returned as a string, the latter is an integer in nanoseconds (which is not
+very useful in practice, as the timer has probably already fired when the
+information reaches the client).
+
+The best way to return that data is to create a new QAPI type, as shown below:
+
+##
+# @QemuAlarmClock
+#
+# QEMU alarm clock information.
+#
+# @clock-name: The alarm clock method's name.
+#
+# @next-deadline: The time (in nanoseconds) the next alarm will fire.
+#
+# Since: 1.0
+##
+{ 'type': 'QemuAlarmClock',
+ 'data': { 'clock-name': 'str', '*next-deadline': 'int' } }
+
+The "type" keyword defines a new QAPI type. Its "data" member contains the
+type's members. In this example our members are the "clock-name" and the
+"next-deadline" one, which is optional.
+
+Now let's define the query-alarm-clock command:
+
+##
+# @query-alarm-clock
+#
+# Return information about QEMU's alarm clock.
+#
+# Returns a @QemuAlarmClock instance describing the alarm clock method
+# being currently used by QEMU (this is usually set by the '-clock'
+# command-line option).
+#
+# Since: 1.0
+##
+{ 'command': 'query-alarm-clock', 'returns': 'QemuAlarmClock' }
+
+Notice the "returns" keyword. As its name suggests, it's used to define the
+data returned by a command.
+
+It's time to implement the qmp_query_alarm_clock() function, you can put it
+in the qemu-timer.c file:
+
+QemuAlarmClock *qmp_query_alarm_clock(Error **errp)
+{
+ QemuAlarmClock *clock;
+ int64_t deadline;
+
+ clock = g_malloc0(sizeof(*clock));
+
+ deadline = qemu_next_alarm_deadline();
+ if (deadline > 0) {
+ clock->has_next_deadline = true;
+ clock->next_deadline = deadline;
+ }
+ clock->clock_name = g_strdup(alarm_timer->name);
+
+ return clock;
+}
+
+There are a number of things to be noticed:
+
+1. The QemuAlarmClock type is automatically generated by the QAPI framework,
+ its members correspond to the type's specification in the schema file
+2. As specified in the schema file, the function returns a QemuAlarmClock
+ instance and takes no arguments (besides the "errp" one, which is mandatory
+ for all QMP functions)
+3. The "clock" variable (which will point to our QAPI type instance) is
+ allocated by the regular g_malloc0() function. Note that we chose to
+ initialize the memory to zero. This is recommended for all QAPI types, as
+ it helps avoid bad surprises (especially with booleans)
+4. Remember that "next_deadline" is optional? All optional members have a
+ 'has_TYPE_NAME' member that should be properly set by the implementation,
+ as shown above
+5. Even static strings, such as "alarm_timer->name", should be dynamically
+ allocated by the implementation. This is so because the QAPI also generates
+ a function to free its types and it cannot distinguish between dynamically
+ and statically allocated strings
+6. You have to include the "qmp-commands.h" header file in qemu-timer.c,
+ otherwise qemu won't build
+
+Time to test the new command. Build qemu, run it as described in the "Testing"
+section and try this:
+
+{ "execute": "query-alarm-clock" }
+{
+ "return": {
+ "next-deadline": 2368219,
+ "clock-name": "dynticks"
+ }
+}
+
+==== The HMP command ====
+
+Here's the HMP counterpart of the query-alarm-clock command:
+
+void hmp_info_alarm_clock(Monitor *mon)
+{
+ QemuAlarmClock *clock;
+ Error *err = NULL;
+
+ clock = qmp_query_alarm_clock(&err);
+ if (err) {
+ monitor_printf(mon, "Could not query alarm clock information\n");
+ error_free(err);
+ return;
+ }
+
+ monitor_printf(mon, "Alarm clock method in use: '%s'\n", clock->clock_name);
+ if (clock->has_next_deadline) {
+ monitor_printf(mon, "Next alarm will fire in %" PRId64 " nanoseconds\n",
+ clock->next_deadline);
+ }
+
+ qapi_free_QemuAlarmClock(clock);
+}
+
+It's important to notice that hmp_info_alarm_clock() calls
+qapi_free_QemuAlarmClock() to free the data returned by qmp_query_alarm_clock().
+For user defined types, the QAPI will generate a qapi_free_QAPI_TYPE_NAME()
+function and that's what you have to use to free the types you define and
+qapi_free_QAPI_TYPE_NAMEList() for list types (explained in the next section).
+If the QMP call returns a string, then you should use g_free() to free it.
+
+Also note that hmp_info_alarm_clock() performs error handling. That's not
+strictly required if you're sure the QMP function doesn't return errors, but
+it's good practice to always check for errors.
+
+Another important detail is that HMP's "info" commands don't go into the
+hmp-commands.hx. Instead, they go into the info_cmds[] table, which is defined
+in the monitor.c file. The entry for "info alarmclock" follows:
+
+ {
+ .name = "alarmclock",
+ .args_type = "",
+ .params = "",
+ .help = "show information about the alarm clock",
+ .cmd = hmp_info_alarm_clock,
+ },
+
+To test this, run qemu and type "info alarmclock" in the user monitor.
+
+=== Returning Lists ===
+
+For this example, we're going to return all available methods for the timer
+alarm, which is pretty much what the command-line option "-clock ?" does,
+except that we're also going to indicate which method is in use.
+
+The first step is to define a new type:
+
+##
+# @TimerAlarmMethod
+#
+# Timer alarm method information.
+#
+# @method-name: The method's name.
+#
+# @current: true if this alarm method is currently in use, false otherwise
+#
+# Since: 1.0
+##
+{ 'type': 'TimerAlarmMethod',
+ 'data': { 'method-name': 'str', 'current': 'bool' } }
+
+The command will be called "query-alarm-methods", here is its schema
+specification:
+
+##
+# @query-alarm-methods
+#
+# Returns information about available alarm methods.
+#
+# Returns: a list of @TimerAlarmMethod for each method
+#
+# Since: 1.0
+##
+{ 'command': 'query-alarm-methods', 'returns': ['TimerAlarmMethod'] }
+
+Notice the syntax for returning lists: "'returns': ['TimerAlarmMethod']". This
+should be read as "returns a list of TimerAlarmMethod instances".
+
+The C implementation follows:
+
+TimerAlarmMethodList *qmp_query_alarm_methods(Error **errp)
+{
+ TimerAlarmMethodList *method_list = NULL;
+ const struct qemu_alarm_timer *p;
+ bool current = true;
+
+ for (p = alarm_timers; p->name; p++) {
+ TimerAlarmMethodList *info = g_malloc0(sizeof(*info));
+ info->value = g_malloc0(sizeof(*info->value));
+ info->value->method_name = g_strdup(p->name);
+ info->value->current = current;
+
+ current = false;
+
+ info->next = method_list;
+ method_list = info;
+ }
+
+ return method_list;
+}
+
+The most important difference from the previous examples is the
+TimerAlarmMethodList type, which is automatically generated by the QAPI from
+the TimerAlarmMethod type.
+
+Each list node is represented by a TimerAlarmMethodList instance. We have to
+allocate it, and that's done inside the for loop: the "info" pointer points to
+an allocated node. We also have to allocate the node's contents, which is
+stored in its "value" member. In our example, the "value" member is a pointer
+to a TimerAlarmMethod instance.
+
+Notice that the "current" variable is used as "true" only in the first
+iteration of the loop. That's because the alarm timer method in use is the
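+first element of the alarm_timers array. Also notice that QAPI lists are handled
+by hand and we return the head of the list. Because each new node is inserted
+at the head, the returned list is in the reverse order of the alarm_timers
+array, which is why the method in use shows up last in the output below.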
+
+Now build qemu, run it as explained in the "Testing" section and try our new
+command:
+
+{ "execute": "query-alarm-methods" }
+{
+ "return": [
+ {
+ "current": false,
+ "method-name": "unix"
+ },
+ {
+ "current": true,
+ "method-name": "dynticks"
+ }
+ ]
+}
+
+The HMP counterpart is a bit more complex than previous examples because it
+has to traverse the list; it's shown below for reference:
+
+void hmp_info_alarm_methods(Monitor *mon)
+{
+ TimerAlarmMethodList *method_list, *method;
+ Error *err = NULL;
+
+ method_list = qmp_query_alarm_methods(&err);
+ if (err) {
+ monitor_printf(mon, "Could not query alarm methods\n");
+ error_free(err);
+ return;
+ }
+
+ for (method = method_list; method; method = method->next) {
+ monitor_printf(mon, "%c %s\n", method->value->current ? '*' : ' ',
+ method->value->method_name);
+ }
+
+ qapi_free_TimerAlarmMethodList(method_list);
+}
+++ /dev/null
-###########################################################################
-#
-# You can pass this file directly to qemu using the -readconfig
-# command line switch.
-#
-# This config file creates a EHCI adapter with companion UHCI
-# controllers as multifunction device in PCI slot "1d".
-#
-# Specify "bus=ehci.0" when creating usb devices to hook them up
-# there.
-#
-
-[device "ehci"]
- driver = "ich9-usb-ehci1"
- addr = "1d.7"
- multifunction = "on"
-
-[device "uhci-1"]
- driver = "ich9-usb-uhci1"
- addr = "1d.0"
- multifunction = "on"
- masterbus = "ehci.0"
- firstport = "0"
-
-[device "uhci-2"]
- driver = "ich9-usb-uhci2"
- addr = "1d.1"
- multifunction = "on"
- masterbus = "ehci.0"
- firstport = "2"
-
-[device "uhci-3"]
- driver = "ich9-usb-uhci3"
- addr = "1d.2"
- multifunction = "on"
- masterbus = "ehci.0"
- firstport = "4"
+++ /dev/null
-DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt)
-===================================================
-
-QEMU often uses reference counts to track data structures that are being
-accessed and should not be freed. For example, a loop that invoke
-callbacks like this is not safe:
-
- QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
- if (ioh->revents & G_IO_OUT) {
- ioh->fd_write(ioh->opaque);
- }
- }
-
-QLIST_FOREACH_SAFE protects against deletion of the current node (ioh)
-by stashing away its "next" pointer. However, ioh->fd_write could
-actually delete the next node from the list. The simplest way to
-avoid this is to mark the node as deleted, and remove it from the
-list in the above loop:
-
- QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
- if (ioh->deleted) {
- QLIST_REMOVE(ioh, next);
- g_free(ioh);
- } else {
- if (ioh->revents & G_IO_OUT) {
- ioh->fd_write(ioh->opaque);
- }
- }
- }
-
-If however this loop must also be reentrant, i.e. it is possible that
-ioh->fd_write invokes the loop again, some kind of counting is needed:
-
- walking_handlers++;
- QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
- if (ioh->deleted) {
- if (walking_handlers == 1) {
- QLIST_REMOVE(ioh, next);
- g_free(ioh);
- }
- } else {
- if (ioh->revents & G_IO_OUT) {
- ioh->fd_write(ioh->opaque);
- }
- }
- }
- walking_handlers--;
-
-One may think of using the RCU primitives, rcu_read_lock() and
-rcu_read_unlock(); effectively, the RCU nesting count would take
-the place of the walking_handlers global variable. Indeed,
-reference counting and RCU have similar purposes, but their usage in
-general is complementary:
-
-- reference counting is fine-grained and limited to a single data
- structure; RCU delays reclamation of *all* RCU-protected data
- structures;
-
-- reference counting works even in the presence of code that keeps
- a reference for a long time; RCU critical sections in principle
- should be kept short;
-
-- reference counting is often applied to code that is not thread-safe
- but is reentrant; in fact, usage of reference counting in QEMU predates
- the introduction of threads by many years. RCU is generally used to
- protect readers from other threads freeing memory after concurrent
- modifications to a data structure.
-
-- reclaiming data can be done by a separate thread in the case of RCU;
- this can improve performance, but also delay reclamation undesirably.
- With reference counting, reclamation is deterministic.
-
-This file documents QemuLockCnt, an abstraction for using reference
-counting in code that has to be both thread-safe and reentrant.
-
-
-QemuLockCnt concepts
---------------------
-
-A QemuLockCnt comprises both a counter and a mutex; it has primitives
-to increment and decrement the counter, and to take and release the
-mutex. The counter notes how many visits to the data structures are
-taking place (the visits could be from different threads, or there could
-be multiple reentrant visits from the same thread). The basic rules
-governing the counter/mutex pair then are the following:
-
-- Data protected by the QemuLockCnt must not be freed unless the
- counter is zero and the mutex is taken.
-
-- A new visit cannot be started while the counter is zero and the
- mutex is taken.
-
-Most of the time, the mutex protects all writes to the data structure,
-not just frees, though there could be cases where this is not necessary.
-
-Reads, instead, can be done without taking the mutex, as long as the
-readers and writers use the same macros that are used for RCU, for
-example atomic_rcu_read, atomic_rcu_set, QLIST_FOREACH_RCU, etc. This is
-because the reads are done outside a lock and a set or QLIST_INSERT_HEAD
-can happen concurrently with the read. The RCU API ensures that the
-processor and the compiler see all required memory barriers.
-
-This could be implemented simply by protecting the counter with the
-mutex, for example:
-
- // (1)
- qemu_mutex_lock(&walking_handlers_mutex);
- walking_handlers++;
- qemu_mutex_unlock(&walking_handlers_mutex);
-
- ...
-
- // (2)
- qemu_mutex_lock(&walking_handlers_mutex);
- if (--walking_handlers == 0) {
- QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
- if (ioh->deleted) {
- QLIST_REMOVE(ioh, next);
- g_free(ioh);
- }
- }
- }
- qemu_mutex_unlock(&walking_handlers_mutex);
-
-Here, no frees can happen in the code represented by the ellipsis.
-If another thread is executing critical section (2), that part of
-the code cannot be entered, because the thread will not be able
-to increment the walking_handlers variable. And of course
-during the visit any other thread will see a nonzero value for
-walking_handlers, as in the single-threaded code.
-
-Note that it is possible for multiple concurrent accesses to delay
-the cleanup arbitrarily; in other words, for the walking_handlers
-counter to never become zero. For this reason, this technique is
-more easily applicable if concurrent access to the structure is rare.
-
-However, critical sections are easy to forget since you have to do
-them for each modification of the counter. QemuLockCnt ensures that
-all modifications of the counter take the lock appropriately, and it
-can also be more efficient in two ways:
-
-- it avoids taking the lock for many operations (for example
- incrementing the counter while it is non-zero);
-
-- on some platforms, one can implement QemuLockCnt to hold the lock
- and the mutex in a single word, making the fast path no more expensive
- than simply managing a counter using atomic operations (see
- docs/atomics.txt). This can be very helpful if concurrent access to
- the data structure is expected to be rare.
-
-
-Using the same mutex for frees and writes can still incur some small
-inefficiencies; for example, a visit can never start if the counter is
-zero and the mutex is taken---even if the mutex is taken by a write,
-which in principle need not block a visit of the data structure.
-However, these are usually not a problem if any of the following
-assumptions are valid:
-
-- concurrent access is possible but rare
-
-- writes are rare
-
-- writes are frequent, but this kind of write (e.g. appending to a
- list) has a very small critical section.
-
-For example, QEMU uses QemuLockCnt to manage an AioContext's list of
-bottom halves and file descriptor handlers. Modifications to the list
-of file descriptor handlers are rare. Creation of a new bottom half is
-frequent and can happen on a fast path; however: 1) it is almost never
-concurrent with a visit to the list of bottom halves; 2) it only has
-three instructions in the critical path, two assignments and a smp_wmb().
-
-
-QemuLockCnt API
----------------
-
-The QemuLockCnt API is described in include/qemu/thread.h.
-
-
-QemuLockCnt usage
------------------
-
-This section explains the typical usage patterns for QemuLockCnt functions.
-
-Setting a variable to a non-NULL value can be done between
-qemu_lockcnt_lock and qemu_lockcnt_unlock:
-
- qemu_lockcnt_lock(&xyz_lockcnt);
- if (!xyz) {
- new_xyz = g_new(XYZ, 1);
- ...
- atomic_rcu_set(&xyz, new_xyz);
- }
- qemu_lockcnt_unlock(&xyz_lockcnt);
-
-Accessing the value can be done between qemu_lockcnt_inc and
-qemu_lockcnt_dec:
-
- qemu_lockcnt_inc(&xyz_lockcnt);
- if (xyz) {
- XYZ *p = atomic_rcu_read(&xyz);
- ...
- /* Accesses can now be done through "p". */
- }
- qemu_lockcnt_dec(&xyz_lockcnt);
-
-Freeing the object can similarly use qemu_lockcnt_lock and
-qemu_lockcnt_unlock, but you also need to ensure that the count
-is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc
-takes the QemuLockCnt's lock, the count cannot become non-zero while
-the object is being freed. Freeing an object looks like this:
-
- qemu_lockcnt_lock(&xyz_lockcnt);
- if (!qemu_lockcnt_count(&xyz_lockcnt)) {
- g_free(xyz);
- xyz = NULL;
- }
- qemu_lockcnt_unlock(&xyz_lockcnt);
-
-If an object has to be freed right after a visit, you can combine
-the decrement, the locking and the check on count as follows:
-
- qemu_lockcnt_inc(&xyz_lockcnt);
- if (xyz) {
- XYZ *p = atomic_rcu_read(&xyz);
- ...
- /* Accesses can now be done through "p". */
- }
- if (qemu_lockcnt_dec_and_lock(&xyz_lockcnt)) {
- g_free(xyz);
- xyz = NULL;
- qemu_lockcnt_unlock(&xyz_lockcnt);
- }
-
-QemuLockCnt can also be used to access a list as follows:
-
- qemu_lockcnt_inc(&io_handlers_lockcnt);
- QLIST_FOREACH_RCU(ioh, &io_handlers, pioh) {
- if (ioh->revents & G_IO_OUT) {
- ioh->fd_write(ioh->opaque);
- }
- }
-
- if (qemu_lockcnt_dec_and_lock(&io_handlers_lockcnt)) {
- QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) {
- if (ioh->deleted) {
- QLIST_REMOVE(ioh, next);
- g_free(ioh);
- }
- }
- qemu_lockcnt_unlock(&io_handlers_lockcnt);
- }
-
-Again, the RCU primitives are used because new items can be added to the
-list during the walk. QLIST_FOREACH_RCU ensures that the processor and
-the compiler see the appropriate memory barriers.
-
-An alternative pattern uses qemu_lockcnt_dec_if_lock:
-
- qemu_lockcnt_inc(&io_handlers_lockcnt);
- QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) {
- if (ioh->deleted) {
- if (qemu_lockcnt_dec_if_lock(&io_handlers_lockcnt)) {
- QLIST_REMOVE(ioh, next);
- g_free(ioh);
- qemu_lockcnt_inc_and_unlock(&io_handlers_lockcnt);
- }
- } else {
- if (ioh->revents & G_IO_OUT) {
- ioh->fd_write(ioh->opaque);
- }
- }
- }
- qemu_lockcnt_dec(&io_handlers_lockcnt);
-
-Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock,
-because there is no special task to do if the count goes from 1 to 0.
+++ /dev/null
-# mach-virt - VirtIO guest (graphical console)
-# =========================================================
-#
-# Usage:
-#
-# $ qemu-system-aarch64 \
-# -nodefaults \
-# -readconfig mach-virt-graphical.cfg \
-# -cpu host
-#
-# You will probably need to tweak the lines marked as
-# CHANGE ME before being able to use this configuration!
-#
-# The guest will have a selection of VirtIO devices
-# tailored towards optimal performance with modern guests,
-# and will be accessed through a graphical console.
-#
-# ---------------------------------------------------------
-#
-# Using -nodefaults is required to have full control over
-# the virtual hardware: when it's specified, QEMU will
-# populate the board with only the builtin peripherals,
-# such as the PL011 UART, plus a PCI Express Root Bus; the
-# user will then have to explicitly add further devices.
-#
-# The PCI Express Root Bus shows up in the guest as:
-#
-# 00:00.0 Host bridge
-#
-# This configuration file adds a number of other useful
-# devices, more specifically:
-#
-# 00:01.0 Display controller
-# 00.1c.* PCI bridge (PCI Express Root Ports)
-# 01:00.0 SCSI storage controller
-# 02:00.0 Ethernet controller
-# 03:00.0 USB controller
-#
-# More information about these devices is available below.
-
-
-# Machine options
-# =========================================================
-#
-# We use the virt machine type and enable KVM acceleration
-# for better performance.
-#
-# Using less than 1 GiB of memory is probably not going to
-# yield good performance in the guest, and might even lead
-# to obscure boot issues in some cases.
-#
-# Unfortunately, there is no way to configure the CPU model
-# in this file, so it will have to be provided on the
-# command line, but we can configure the guest to use the
-# same GIC version as the host.
-
-[machine]
- type = "virt"
- accel = "kvm"
- gic-version = "host"
-
-[memory]
- size = "1024"
-
-
-# Firmware configuration
-# =========================================================
-#
-# There are two parts to the firmware: a read-only image
-# containing the executable code, which is shared between
-# guests, and a read/write variable store that is owned
-# by one specific guest, exclusively, and is used to
-# record information such as the UEFI boot order.
-#
-# For any new guest, its permanent, private variable store
-# should initially be copied from the template file
-# provided along with the firmware binary.
-#
-# Depending on the OS distribution you're using on the
-# host, the name of the package containing the firmware
-# binary and variable store template, as well as the paths
-# to the files themselves, will be different. For example:
-#
-# Fedora
-# edk2-aarch64 (pkg)
-# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin)
-# /usr/share/edk2/aarch64/vars-template-pflash.raw (var)
-#
-# RHEL
-# AAVMF (pkg)
-# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
-# /usr/share/AAVMF/AAVMF_VARS.fd (var)
-#
-# Debian/Ubuntu
-# qemu-efi (pkg)
-# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
-# /usr/share/AAVMF/AAVMF_VARS.fd (var)
-
-[drive "uefi-binary"]
- file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME
- format = "raw"
- if = "pflash"
- unit = "0"
- readonly = "on"
-
-[drive "uefi-varstore"]
- file = "guest_VARS.fd" # CHANGE ME
- format = "raw"
- if = "pflash"
- unit = "1"
-
-
-# PCI bridge (PCI Express Root Ports)
-# =========================================================
-#
-# We create eight PCI Express Root Ports, and we plug them
-# all into separate functions of the same slot. Some of
-# them will be used by devices, the rest will remain
-# available for hotplug.
-
-[device "pcie.1"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.0"
- port = "1"
- chassis = "1"
- multifunction = "on"
-
-[device "pcie.2"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.1"
- port = "2"
- chassis = "2"
-
-[device "pcie.3"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.2"
- port = "3"
- chassis = "3"
-
-[device "pcie.4"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.3"
- port = "4"
- chassis = "4"
-
-[device "pcie.5"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.4"
- port = "5"
- chassis = "5"
-
-[device "pcie.6"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.5"
- port = "6"
- chassis = "6"
-
-[device "pcie.7"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.6"
- port = "7"
- chassis = "7"
-
-[device "pcie.8"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.7"
- port = "8"
- chassis = "8"
-
-
-# SCSI storage controller (and storage)
-# =========================================================
-#
-# We use virtio-scsi here so that we can (hot)plug a large
-# number of disks without running into issues; a SCSI disk,
-# backed by a qcow2 disk image on the host's filesystem, is
-# attached to it.
-#
-# We also create an optical disk, mostly for installation
-# purposes: once the guest OS has been succesfully
-# installed, the guest will no longer boot from optical
-# media. If you don't want, or no longer want, to have an
-# optical disk in the guest you can safely comment out
-# all relevant sections below.
-
-[device "scsi"]
- driver = "virtio-scsi-pci"
- bus = "pcie.1"
- addr = "00.0"
-
-[device "scsi-disk"]
- driver = "scsi-hd"
- bus = "scsi.0"
- drive = "disk"
- bootindex = "1"
-
-[drive "disk"]
- file = "guest.qcow2" # CHANGE ME
- format = "qcow2"
- if = "none"
-
-[device "scsi-optical-disk"]
- driver = "scsi-cd"
- bus = "scsi.0"
- drive = "optical-disk"
- bootindex = "2"
-
-[drive "optical-disk"]
- file = "install.iso" # CHANGE ME
- format = "raw"
- if = "none"
-
-
-# Ethernet controller
-# =========================================================
-#
-# We use virtio-net for improved performance over emulated
-# hardware; on the host side, we take advantage of user
-# networking so that the QEMU process doesn't require any
-# additional privileges.
-
-[netdev "hostnet"]
- type = "user"
-
-[device "net"]
- driver = "virtio-net-pci"
- netdev = "hostnet"
- bus = "pcie.2"
- addr = "00.0"
-
-
-# USB controller (and input devices)
-# =========================================================
-#
-# We add a virtualization-friendly USB 3.0 controller and
-# a USB keyboard / USB tablet combo so that graphical
-# guests can be controlled appropriately.
-
-[device "usb"]
- driver = "nec-usb-xhci"
- bus = "pcie.3"
- addr = "00.0"
-
-[device "keyboard"]
- driver = "usb-kbd"
- bus = "usb.0"
-
-[device "tablet"]
- driver = "usb-tablet"
- bus = "usb.0"
-
-
-# Display controller
-# =========================================================
-#
-# We use virtio-gpu because the legacy VGA framebuffer is
-# very troublesome on aarch64, and virtio-gpu is the only
-# video device that doesn't implement it.
-#
-# If you're running the guest on a remote, potentially
-# headless host, you will probably want to append something
-# like
-#
-# -display vnc=127.0.0.1:0
-#
-# to the command line in order to prevent QEMU from
-# creating a graphical display window on the host and
-# enable remote access instead.
-
-[device "video"]
- driver = "virtio-gpu"
- bus = "pcie.0"
- addr = "01.0"
+++ /dev/null
-# mach-virt - VirtIO guest (serial console)
-# =========================================================
-#
-# Usage:
-#
-# $ qemu-system-aarch64 \
-# -nodefaults \
-# -readconfig mach-virt-serial.cfg \
-# -display none -serial mon:stdio \
-# -cpu host
-#
-# You will probably need to tweak the lines marked as
-# CHANGE ME before being able to use this configuration!
-#
-# The guest will have a selection of VirtIO devices
-# tailored towards optimal performance with modern guests,
-# and will be accessed through the serial console.
-#
-# ---------------------------------------------------------
-#
-# Using -nodefaults is required to have full control over
-# the virtual hardware: when it's specified, QEMU will
-# populate the board with only the builtin peripherals,
-# such as the PL011 UART, plus a PCI Express Root Bus; the
-# user will then have to explicitly add further devices.
-#
-# The PCI Express Root Bus shows up in the guest as:
-#
-# 00:00.0 Host bridge
-#
-# This configuration file adds a number of other useful
-# devices, more specifically:
-#
-# 00.1c.* PCI bridge (PCI Express Root Ports)
-# 01:00.0 SCSI storage controller
-# 02:00.0 Ethernet controller
-#
-# More information about these devices is available below.
-#
-# We use '-display none' to prevent QEMU from creating a
-# graphical display window, which would serve no use in
-# this specific configuration, and '-serial mon:stdio' to
-# multiplex the guest's serial console and the QEMU monitor
-# to the host's stdio; use 'Ctrl+A h' to learn how to
-# switch between the two and more.
-
-
-# Machine options
-# =========================================================
-#
-# We use the virt machine type and enable KVM acceleration
-# for better performance.
-#
-# Using less than 1 GiB of memory is probably not going to
-# yield good performance in the guest, and might even lead
-# to obscure boot issues in some cases.
-#
-# Unfortunately, there is no way to configure the CPU model
-# in this file, so it will have to be provided on the
-# command line, but we can configure the guest to use the
-# same GIC version as the host.
-
-[machine]
- type = "virt"
- accel = "kvm"
- gic-version = "host"
-
-[memory]
- size = "1024"
-
-
-# Firmware configuration
-# =========================================================
-#
-# There are two parts to the firmware: a read-only image
-# containing the executable code, which is shared between
-# guests, and a read/write variable store that is owned
-# by one specific guest, exclusively, and is used to
-# record information such as the UEFI boot order.
-#
-# For any new guest, its permanent, private variable store
-# should initially be copied from the template file
-# provided along with the firmware binary.
-#
-# Depending on the OS distribution you're using on the
-# host, the name of the package containing the firmware
-# binary and variable store template, as well as the paths
-# to the files themselves, will be different. For example:
-#
-# Fedora
-# edk2-aarch64 (pkg)
-# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin)
-# /usr/share/edk2/aarch64/vars-template-pflash.raw (var)
-#
-# RHEL
-# AAVMF (pkg)
-# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
-# /usr/share/AAVMF/AAVMF_VARS.fd (var)
-#
-# Debian/Ubuntu
-# qemu-efi (pkg)
-# /usr/share/AAVMF/AAVMF_CODE.fd (bin)
-# /usr/share/AAVMF/AAVMF_VARS.fd (var)
-
-[drive "uefi-binary"]
- file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME
- format = "raw"
- if = "pflash"
- unit = "0"
- readonly = "on"
-
-[drive "uefi-varstore"]
- file = "guest_VARS.fd" # CHANGE ME
- format = "raw"
- if = "pflash"
- unit = "1"
-
-
-# PCI bridge (PCI Express Root Ports)
-# =========================================================
-#
-# We create eight PCI Express Root Ports, and we plug them
-# all into separate functions of the same slot. Some of
-# them will be used by devices, the rest will remain
-# available for hotplug.
-
-[device "pcie.1"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.0"
- port = "1"
- chassis = "1"
- multifunction = "on"
-
-[device "pcie.2"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.1"
- port = "2"
- chassis = "2"
-
-[device "pcie.3"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.2"
- port = "3"
- chassis = "3"
-
-[device "pcie.4"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.3"
- port = "4"
- chassis = "4"
-
-[device "pcie.5"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.4"
- port = "5"
- chassis = "5"
-
-[device "pcie.6"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.5"
- port = "6"
- chassis = "6"
-
-[device "pcie.7"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.6"
- port = "7"
- chassis = "7"
-
-[device "pcie.8"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.7"
- port = "8"
- chassis = "8"
-
-
-# SCSI storage controller (and storage)
-# =========================================================
-#
-# We use virtio-scsi here so that we can (hot)plug a large
-# number of disks without running into issues; a SCSI disk,
-# backed by a qcow2 disk image on the host's filesystem, is
-# attached to it.
-#
-# We also create an optical disk, mostly for installation
-# purposes: once the guest OS has been succesfully
-# installed, the guest will no longer boot from optical
-# media. If you don't want, or no longer want, to have an
-# optical disk in the guest you can safely comment out
-# all relevant sections below.
-
-[device "scsi"]
- driver = "virtio-scsi-pci"
- bus = "pcie.1"
- addr = "00.0"
-
-[device "scsi-disk"]
- driver = "scsi-hd"
- bus = "scsi.0"
- drive = "disk"
- bootindex = "1"
-
-[drive "disk"]
- file = "guest.qcow2" # CHANGE ME
- format = "qcow2"
- if = "none"
-
-[device "scsi-optical-disk"]
- driver = "scsi-cd"
- bus = "scsi.0"
- drive = "optical-disk"
- bootindex = "2"
-
-[drive "optical-disk"]
- file = "install.iso" # CHANGE ME
- format = "raw"
- if = "none"
-
-
-# Ethernet controller
-# =========================================================
-#
-# We use virtio-net for improved performance over emulated
-# hardware; on the host side, we take advantage of user
-# networking so that the QEMU process doesn't require any
-# additional privileges.
-
-[netdev "hostnet"]
- type = "user"
-
-[device "net"]
- driver = "virtio-net-pci"
- netdev = "hostnet"
- bus = "pcie.2"
- addr = "00.0"
+++ /dev/null
-The memory API
-==============
-
-The memory API models the memory and I/O buses and controllers of a QEMU
-machine. It attempts to allow modelling of:
-
- - ordinary RAM
- - memory-mapped I/O (MMIO)
- - memory controllers that can dynamically reroute physical memory regions
- to different destinations
-
-The memory model provides support for
-
- - tracking RAM changes by the guest
- - setting up coalesced memory for kvm
- - setting up ioeventfd regions for kvm
-
-Memory is modelled as an acyclic graph of MemoryRegion objects. Sinks
-(leaves) are RAM and MMIO regions, while other nodes represent
-buses, memory controllers, and memory regions that have been rerouted.
-
-In addition to MemoryRegion objects, the memory API provides AddressSpace
-objects for every root and possibly for intermediate MemoryRegions too.
-These represent memory as seen from the CPU or a device's viewpoint.
-
-Types of regions
-----------------
-
-There are multiple types of memory regions (all represented by a single C type
-MemoryRegion):
-
-- RAM: a RAM region is simply a range of host memory that can be made available
- to the guest.
- You typically initialize these with memory_region_init_ram(). Some special
- purposes require the variants memory_region_init_resizeable_ram(),
- memory_region_init_ram_from_file(), or memory_region_init_ram_ptr().
-
-- MMIO: a range of guest memory that is implemented by host callbacks;
- each read or write causes a callback to be called on the host.
- You initialize these with memory_region_init_io(), passing it a
- MemoryRegionOps structure describing the callbacks.
-
-- ROM: a ROM memory region works like RAM for reads (directly accessing
- a region of host memory), and forbids writes. You initialize these with
- memory_region_init_rom().
-
-- ROM device: a ROM device memory region works like RAM for reads
- (directly accessing a region of host memory), but like MMIO for
- writes (invoking a callback). You initialize these with
- memory_region_init_rom_device().
-
-- IOMMU region: an IOMMU region translates addresses of accesses made to it
- and forwards them to some other target memory region. As the name suggests,
- these are only needed for modelling an IOMMU, not for simple devices.
- You initialize these with memory_region_init_iommu().
-
-- container: a container simply includes other memory regions, each at
- a different offset. Containers are useful for grouping several regions
- into one unit. For example, a PCI BAR may be composed of a RAM region
- and an MMIO region.
-
- A container's subregions are usually non-overlapping. In some cases it is
- useful to have overlapping regions; for example a memory controller that
- can overlay a subregion of RAM with MMIO or ROM, or a PCI controller
- that does not prevent card from claiming overlapping BARs.
-
- You initialize a pure container with memory_region_init().
-
-- alias: a subsection of another region. Aliases allow a region to be
- split apart into discontiguous regions. Examples of uses are memory banks
- used when the guest address space is smaller than the amount of RAM
- addressed, or a memory controller that splits main memory to expose a "PCI
- hole". Aliases may point to any type of region, including other aliases,
- but an alias may not point back to itself, directly or indirectly.
- You initialize these with memory_region_init_alias().
-
-- reservation region: a reservation region is primarily for debugging.
- It claims I/O space that is not supposed to be handled by QEMU itself.
- The typical use is to track parts of the address space which will be
- handled by the host kernel when KVM is enabled.
- You initialize these with memory_region_init_reservation(), or by
- passing a NULL callback parameter to memory_region_init_io().
-
-It is valid to add subregions to a region which is not a pure container
-(that is, to an MMIO, RAM or ROM region). This means that the region
-will act like a container, except that any addresses within the container's
-region which are not claimed by any subregion are handled by the
-container itself (ie by its MMIO callbacks or RAM backing). However
-it is generally possible to achieve the same effect with a pure container
-one of whose subregions is a low priority "background" region covering
-the whole address range; this is often clearer and is preferred.
-Subregions cannot be added to an alias region.
-
-Region names
-------------
-
-Regions are assigned names by the constructor. For most regions these are
-only used for debugging purposes, but RAM regions also use the name to identify
-live migration sections. This means that RAM region names need to have ABI
-stability.
-
-Region lifecycle
-----------------
-
-A region is created by one of the memory_region_init*() functions and
-attached to an object, which acts as its owner or parent. QEMU ensures
-that the owner object remains alive as long as the region is visible to
-the guest, or as long as the region is in use by a virtual CPU or another
-device. For example, the owner object will not die between an
-address_space_map operation and the corresponding address_space_unmap.
-
-After creation, a region can be added to an address space or a
-container with memory_region_add_subregion(), and removed using
-memory_region_del_subregion().
-
-Various region attributes (read-only, dirty logging, coalesced mmio,
-ioeventfd) can be changed during the region lifecycle. They take effect
-as soon as the region is made visible. This can be immediately, later,
-or never.
-
-Destruction of a memory region happens automatically when the owner
-object dies.
-
-If however the memory region is part of a dynamically allocated data
-structure, you should call object_unparent() to destroy the memory region
-before the data structure is freed. For an example see VFIOMSIXInfo
-and VFIOQuirk in hw/vfio/pci.c.
-
-You must not destroy a memory region as long as it may be in use by a
-device or CPU. In order to do this, as a general rule do not create or
-destroy memory regions dynamically during a device's lifetime, and only
-call object_unparent() in the memory region owner's instance_finalize
-callback. The dynamically allocated data structure that contains the
-memory region then should obviously be freed in the instance_finalize
-callback as well.
-
-If you break this rule, the following situation can happen:
-
-- the memory region's owner had a reference taken via memory_region_ref
- (for example by address_space_map)
-
-- the region is unparented, and has no owner anymore
-
-- when address_space_unmap is called, the reference to the memory region's
- owner is leaked.
-
-
-There is an exception to the above rule: it is okay to call
-object_unparent at any time for an alias or a container region. It is
-therefore also okay to create or destroy alias and container regions
-dynamically during a device's lifetime.
-
-This exceptional usage is valid because aliases and containers only help
-QEMU build the guest's memory map; they are never accessed directly.
-memory_region_ref and memory_region_unref are never called on aliases
-or containers, and the above situation then cannot happen. Exploiting
-this exception is rarely necessary, and therefore it is discouraged,
-but nevertheless it is used in a few places.
-
-For regions that "have no owner" (NULL is passed at creation time), the
-machine object is actually used as the owner. Since instance_finalize is
-never called for the machine object, you must never call object_unparent
-on regions that have no owner, unless they are aliases or containers.
-
-
-Overlapping regions and priority
---------------------------------
-Usually, regions may not overlap each other; a memory address decodes into
-exactly one target. In some cases it is useful to allow regions to overlap,
-and sometimes to control which of the overlapping regions is visible to the
-guest. This is done with memory_region_add_subregion_overlap(), which
-allows the region to overlap any other region in the same container, and
-specifies a priority that allows the core to decide which of two regions at
-the same address is visible (highest wins).
-Priority values are signed, and the default value is zero. This means that
-you can use memory_region_add_subregion_overlap() both to specify a region
-that must sit 'above' any others (with a positive priority) and also a
-background region that sits 'below' others (with a negative priority).
-
-If the higher priority region in an overlap is a container or alias, then
-the lower priority region will appear in any "holes" that the higher priority
-region has left by not mapping subregions to that area of its address range.
-(This applies recursively -- if the subregions are themselves containers or
-aliases that leave holes then the lower priority region will appear in these
-holes too.)
-
-For example, suppose we have a container A of size 0x8000 with two subregions
-B and C. B is a container mapped at 0x2000, size 0x4000, priority 2; C is
-an MMIO region mapped at 0x0, size 0x6000, priority 1. B currently has two
-of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at
-offset 0x2000. As a diagram:
-
- 0 1000 2000 3000 4000 5000 6000 7000 8000
- |------|------|------|------|------|------|------|------|
- A: [ ]
- C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC]
- B: [ ]
- D: [DDDDD]
- E: [EEEEE]
-
-The regions that will be seen within this address range then are:
- [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC]
-
-Since B has higher priority than C, its subregions appear in the flat map
-even where they overlap with C. In ranges where B has not mapped anything
-C's region appears.
-
-If B had provided its own MMIO operations (ie it was not a pure container)
-then these would be used for any addresses in its range not handled by
-D or E, and the result would be:
- [CCCCCCCCCCCC][DDDDD][BBBBB][EEEEE][BBBBB]
-
-Priority values are local to a container, because the priorities of two
-regions are only compared when they are both children of the same container.
-This means that the device in charge of the container (typically modelling
-a bus or a memory controller) can use them to manage the interaction of
-its child regions without any side effects on other parts of the system.
-In the example above, the priorities of D and E are unimportant because
-they do not overlap each other. It is the relative priority of B and C
-that causes D and E to appear on top of C: D and E's priorities are never
-compared against the priority of C.
-
-Visibility
-----------
-The memory core uses the following rules to select a memory region when the
-guest accesses an address:
-
-- all direct subregions of the root region are matched against the address, in
- descending priority order
- - if the address lies outside the region offset/size, the subregion is
- discarded
- - if the subregion is a leaf (RAM or MMIO), the search terminates, returning
- this leaf region
- - if the subregion is a container, the same algorithm is used within the
- subregion (after the address is adjusted by the subregion offset)
- - if the subregion is an alias, the search is continued at the alias target
- (after the address is adjusted by the subregion offset and alias offset)
- - if a recursive search within a container or alias subregion does not
- find a match (because of a "hole" in the container's coverage of its
- address range), then if this is a container with its own MMIO or RAM
- backing the search terminates, returning the container itself. Otherwise
- we continue with the next subregion in priority order
-- if none of the subregions match the address then the search terminates
- with no match found
-
-Example memory map
-------------------
-
-system_memory: container@0-2^48-1
- |
- +---- lomem: alias@0-0xdfffffff ---> #ram (0-0xdfffffff)
- |
- +---- himem: alias@0x100000000-0x11fffffff ---> #ram (0xe0000000-0xffffffff)
- |
- +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff)
- | (prio 1)
- |
- +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff)
-
-pci (0-2^32-1)
- |
- +--- vga-area: container@0xa0000-0xbffff
- | |
- | +--- alias@0x00000-0x7fff ---> #vram (0x010000-0x017fff)
- | |
- | +--- alias@0x08000-0xffff ---> #vram (0x020000-0x027fff)
- |
- +---- vram: ram@0xe1000000-0xe1ffffff
- |
- +---- vga-mmio: mmio@0xe2000000-0xe200ffff
-
-ram: ram@0x00000000-0xffffffff
-
-This is a (simplified) PC memory map. The 4GB RAM block is mapped into the
-system address space via two aliases: "lomem" is a 1:1 mapping of the first
-3.5GB; "himem" maps the last 0.5GB at address 4GB. This leaves 0.5GB for the
-so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with
-4GB of memory.
-
-The memory controller diverts addresses in the range 640K-768K to the PCI
-address space. This is modelled using the "vga-window" alias, mapped at a
-higher priority so it obscures the RAM at the same addresses. The vga window
-can be removed by programming the memory controller; this is modelled by
-removing the alias and exposing the RAM underneath.
-
-The pci address space is not a direct child of the system address space, since
-we only want parts of it to be visible (we accomplish this using aliases).
-It has two subregions: vga-area models the legacy vga window and is occupied
-by two 32K memory banks pointing at two sections of the framebuffer.
-In addition the vram is mapped as a BAR at address e1000000, and an additional
-BAR containing MMIO registers is mapped after it.
-
-Note that if the guest maps a BAR outside the PCI hole, it would not be
-visible as the pci-hole alias clips it to a 0.5GB range.
-
-MMIO Operations
----------------
-
-MMIO regions are provided with ->read() and ->write() callbacks; in addition
-various constraints can be supplied to control how these callbacks are called:
-
- - .valid.min_access_size, .valid.max_access_size define the access sizes
- (in bytes) which the device accepts; accesses outside this range will
- have device and bus specific behaviour (ignored, or machine check)
- - .valid.unaligned specifies that the *device being modelled* supports
- unaligned accesses; if false, unaligned accesses will invoke the
- appropriate bus or CPU specific behaviour.
- - .impl.min_access_size, .impl.max_access_size define the access sizes
- (in bytes) supported by the *implementation*; other access sizes will be
- emulated using the ones available. For example a 4-byte write will be
- emulated using four 1-byte writes, if .impl.max_access_size = 1.
- - .impl.unaligned specifies that the *implementation* supports unaligned
- accesses; if false, unaligned accesses will be emulated by two aligned
- accesses.
- - .old_mmio eases the porting of code that was formerly using
- cpu_register_io_memory(). It should not be used in new code.
+++ /dev/null
-= Migration =
-
-QEMU has code to load/save the state of the guest that it is running.
-These are two complementary operations. Saving the state just does
-that, saves the state for each device that the guest is running.
-Restoring a guest is just the opposite operation: we need to load the
-state of each device.
-
-For this to work, QEMU has to be launched with the same arguments both
-times. I.e. it can only restore the state into a guest that has the
-same devices as the one from which it was saved (this last requirement
-can be relaxed a bit, but for now we can consider that the configuration
-has to be exactly the same).
-
-Once we are able to save/restore a guest, a new piece of functionality
-is requested: migration. This means that QEMU is able to start on one
-machine and then be "migrated", i.e. moved, to another machine.
-
-Next was the "live migration" functionality. This is important
-because some guests run with a lot of state (especially RAM), and it
-can take a while to move all state from one machine to another. Live
-migration allows the guest to continue running while the state is
-transferred. The guest only has to be stopped while the last part of
-the state is transferred. Typically the time that the guest is
-unresponsive during live migration is in the low hundreds of
-milliseconds (notice that this depends on a lot of things).
-
-=== Types of migration ===
-
-Now that we have talked about live migration, there are several ways
-to do migration:
-
-- tcp migration: do the migration using tcp sockets
-- unix migration: do the migration using unix sockets
-- exec migration: do the migration using the stdin/stdout through a process.
-- fd migration: do the migration using a file descriptor that is
- passed to QEMU. QEMU doesn't care how this file descriptor is opened.
-
-All four of these migration protocols use the same infrastructure to
-save/restore device state. This infrastructure is shared with the
-savevm/loadvm functionality.
-
-=== State Live Migration ===
-
-This is used for RAM and block devices. It is not yet ported to vmstate.
-<Fill more information here>
-
-=== What is the common infrastructure ===
-
-QEMU uses a QEMUFile abstraction to be able to do migration. Any type
-of migration that wants to use QEMU infrastructure has to create a
-QEMUFile with:
-
-QEMUFile *qemu_fopen_ops(void *opaque,
- QEMUFilePutBufferFunc *put_buffer,
- QEMUFileGetBufferFunc *get_buffer,
- QEMUFileCloseFunc *close);
-
-The functions have the following functionality:
-
-This function writes a chunk of data to a file at the given position.
-The pos argument can be ignored if the file is only used for
-streaming. The handler should try to write all of the data it can.
-
-typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
- int64_t pos, int size);
-
-Read a chunk of data from a file at the given position. The pos argument
-can be ignored if the file is only used for streaming. The number of
-bytes actually read should be returned.
-
-typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
- int64_t pos, int size);
-
-Close a file and return an error code.
-
-typedef int (QEMUFileCloseFunc)(void *opaque);
-
-You can use any internal state that you need using the opaque void *
-pointer that is passed to all functions.
-
-The important functions for us are put_buffer()/get_buffer(), which
-allow us to write a buffer to, or read a buffer from, the QEMUFile.
-
-=== How to save the state of one device ===
-
-The state of a device is saved using intermediate buffers. There are
-some helper functions to assist this saving.
-
-There is a new concept that we have to explain here: device state
-version. When we migrate a device, we save/load the state as a series
-of fields. Sometimes, due to bugs or new functionality, we need to
-change the state to store more/different information. We use the
-version to identify each time that we do a change. Each version is
-associated with a series of fields saved. The save_state always saves
-the state as the newer version. But load_state sometimes is able to
-load state from an older version.
-
-=== Legacy way ===
-
-This way is going to disappear as soon as all current users are ported to VMSTATE.
-
-Each device has to register two functions, one to save the state and
-another to load the state back.
-
-int register_savevm(DeviceState *dev,
- const char *idstr,
- int instance_id,
- int version_id,
- SaveStateHandler *save_state,
- LoadStateHandler *load_state,
- void *opaque);
-
-typedef void SaveStateHandler(QEMUFile *f, void *opaque);
-typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
-
-The important functions for the device state format are the save_state
-and load_state. Notice that load_state receives a version_id
-parameter to know which state format it is receiving. save_state doesn't
-have a version_id parameter because it always uses the latest version.
-
-=== VMState ===
-
-The legacy way of saving/loading state of the device had the problem
-that we have to maintain two functions in sync. If we did one change
-in one of them and not in the other, we would get a failed migration.
-
-VMState changed the way that state is saved/loaded. Instead of using
-a function to save the state and another to load it, it was changed to
-a declarative way of what the state consisted of. Now VMState is able
-to interpret that definition to be able to load/save the state. As
-the state is declared only once, it can't go out of sync in the
-save/load functions.
-
-An example (from hw/input/pckbd.c)
-
-static const VMStateDescription vmstate_kbd = {
- .name = "pckbd",
- .version_id = 3,
- .minimum_version_id = 3,
- .fields = (VMStateField[]) {
- VMSTATE_UINT8(write_cmd, KBDState),
- VMSTATE_UINT8(status, KBDState),
- VMSTATE_UINT8(mode, KBDState),
- VMSTATE_UINT8(pending, KBDState),
- VMSTATE_END_OF_LIST()
- }
-};
-
-We are declaring the state with name "pckbd".
-The version_id is 3, and the fields are 4 uint8_t in a KBDState structure.
-We registered this with:
-
- vmstate_register(NULL, 0, &vmstate_kbd, s);
-
-Note: talk about how vmstate <-> qdev interact, and what the instance ids mean.
-
-You can search for VMSTATE_* macros for lots of types used in QEMU in
-include/hw/hw.h.
-
-=== More about versions ===
-
-Version numbers are intended for major incompatible changes to the
-migration of a device, and using them breaks backwards-migration
-compatibility; in general most changes can be made by adding Subsections
-(see below) or _TEST macros (see below) which won't break compatibility.
-
-You can see that there are several version fields:
-
-- version_id: the maximum version_id supported by VMState for that device.
-- minimum_version_id: the minimum version_id that VMState is able to understand
- for that device.
-- minimum_version_id_old: For devices that were not able to port to vmstate, we can
- assign a function that knows how to read this old state. This field is
- ignored if there is no load_state_old handler.
-
-So, VMState is able to read versions from minimum_version_id to
-version_id. And the function load_state_old() (if present) is able to
-load state from minimum_version_id_old to minimum_version_id. This
-function is deprecated and will be removed when no more users are left.
-
-Saving state will always create a section with the 'version_id' value
-and thus can't be loaded by any older QEMU.
-
-=== Massaging functions ===
-
-Sometimes, it is not enough to be able to save the state directly
-from one structure, we need to fill the correct values there. One
-example is when we are using kvm. Before saving the cpu state, we
-need to ask kvm to copy to QEMU the state that it is using. And the
-opposite when we are loading the state, we need a way to tell kvm to
-load the state for the cpu that we have just loaded from the QEMUFile.
-
-The functions to do that are inside a vmstate definition, and are called:
-
-- int (*pre_load)(void *opaque);
-
- This function is called before we load the state of one device.
-
-- int (*post_load)(void *opaque, int version_id);
-
- This function is called after we load the state of one device.
-
-- void (*pre_save)(void *opaque);
-
- This function is called before we save the state of one device.
-
-Example: You can look at hpet.c, which uses all three functions to
- massage the state that is transferred.
-
-If you use memory API functions that update memory layout outside
-initialization (i.e., in response to a guest action), this is a strong
-indication that you need to call these functions in a post_load callback.
-Examples of such memory API functions are:
-
- - memory_region_add_subregion()
- - memory_region_del_subregion()
- - memory_region_set_readonly()
- - memory_region_set_enabled()
- - memory_region_set_address()
- - memory_region_set_alias_offset()
-
-=== Subsections ===
-
-The use of version_id allows migration from older versions
-to newer versions of a device. But not the other way around. This
-makes it very complicated to fix bugs in stable branches. If we need to
-add anything to the state to fix a bug, we have to disable migration
-to older versions that don't have that bug-fix (i.e. a new field).
-
-But often that bug-fix is only needed in certain situations, not always.
-For instance, only if the device is in the middle of a DMA operation, or
-it is using a specific piece of functionality, etc.
-
-It is impossible to create a way to make migration from any version to
-any other version work. But we can do better than only allowing
-migration from older versions to newer ones. For those fields that are
-only needed sometimes, we add the idea of subsections. A subsection
-is "like" a device vmstate, but with a particularity: it has a Boolean
-function that tells whether its values need to be sent or not. If
-this function returns false, the subsection is not sent.
-
-On the receiving side, if we find a subsection for a device that we
-don't understand, we just fail the migration. If we understand all
-the subsections, then we load the state with success.
-
-One important note is that the post_load() function is called "after"
-loading all subsections, because a newer subsection could change the same
-value that it uses.
-
-Example:
-
-static bool ide_drive_pio_state_needed(void *opaque)
-{
- IDEState *s = opaque;
-
- return ((s->status & DRQ_STAT) != 0)
- || (s->bus->error_status & BM_STATUS_PIO_RETRY);
-}
-
-const VMStateDescription vmstate_ide_drive_pio_state = {
- .name = "ide_drive/pio_state",
- .version_id = 1,
- .minimum_version_id = 1,
- .pre_save = ide_drive_pio_pre_save,
- .post_load = ide_drive_pio_post_load,
- .needed = ide_drive_pio_state_needed,
- .fields = (VMStateField[]) {
- VMSTATE_INT32(req_nb_sectors, IDEState),
- VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1,
- vmstate_info_uint8, uint8_t),
- VMSTATE_INT32(cur_io_buffer_offset, IDEState),
- VMSTATE_INT32(cur_io_buffer_len, IDEState),
- VMSTATE_UINT8(end_transfer_fn_idx, IDEState),
- VMSTATE_INT32(elementary_transfer_size, IDEState),
- VMSTATE_INT32(packet_transfer_size, IDEState),
- VMSTATE_END_OF_LIST()
- }
-};
-
-const VMStateDescription vmstate_ide_drive = {
- .name = "ide_drive",
- .version_id = 3,
- .minimum_version_id = 0,
- .post_load = ide_drive_post_load,
- .fields = (VMStateField[]) {
- .... several fields ....
- VMSTATE_END_OF_LIST()
- },
- .subsections = (const VMStateDescription*[]) {
- &vmstate_ide_drive_pio_state,
- NULL
- }
-};
-
-Here we have a subsection for the pio state. We only need to
-save/send this state when we are in the middle of a pio operation
-(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is
-not enabled, the values in those fields are garbage and don't need to
-be sent.
-
-Using a condition function that checks a 'property' to determine whether
-to send a subsection allows backwards migration compatibility when
-new subsections are added.
-
-For example:
- a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and
- default it to true.
- b) Add an entry to the HW_COMPAT_ for the previous version
- that sets the property to false.
- c) Add a static bool support_foo function that tests the property.
- d) Add a subsection with a .needed set to the support_foo function
- e) (potentially) Add a pre_load that sets up a default value for 'foo'
- to be used if the subsection isn't loaded.
-
-Now that subsection will not be generated when using an older
-machine type and the migration stream will be accepted by older
-QEMU versions. pre-load functions can be used to initialise state
-on the newer version so that they default to suitable values
-when loading streams created by older QEMU versions that do not
-generate the subsection.
-
-In some cases subsections are added for data that had been accidentally
-omitted by earlier versions; if the missing data causes the migration
-process to succeed but the guest to behave badly then it may be better
-to send the subsection and cause the migration to explicitly fail
-with the unknown subsection error. If the bad behaviour only happens
-with certain data values, making the subsection conditional on
-the data value (rather than the machine type) allows migrations to succeed
-in most cases. In general the preference is to tie the subsection to
-the machine type, and allow reliable migrations, unless the behaviour
-from omission of the subsection is really bad.
-
-= Not sending existing elements =
-
-Sometimes members of the VMState are no longer needed. However:
- - removing them will break migration compatibility;
- - making them version dependent and bumping the version will break backwards
-   migration compatibility.
-
-The best way is to:
- a) Add a new property/compatibility/function in the same way as for
- subsections above.
- b) replace the VMSTATE macro with the _TEST version of the macro, e.g.:
- VMSTATE_UINT32(foo, barstruct)
- becomes
- VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz)
-
- Sometime in the future when we no longer care about the ancient
-versions these can be killed off.
-
-= Return path =
-
-In most migration scenarios there is only a single data path that runs
-from the source VM to the destination, typically along a single fd (although
-possibly with another fd or similar for some fast way of throwing pages across).
-
-However, some uses need two way communication; in particular the Postcopy
-destination needs to be able to request pages on demand from the source.
-
-For these scenarios there is a 'return path' from the destination to the source;
-qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
-path.
-
- Source side
- Forward path - written by migration thread
- Return path - opened by main thread, read by return-path thread
-
- Destination side
- Forward path - read by main thread
- Return path - opened by main thread, written by main thread AND postcopy
- thread (protected by rp_mutex)
-
-= Postcopy =
-'Postcopy' migration is a way to deal with migrations that refuse to converge
-(or take too long to converge). Its plus side is that there is an upper bound
-on the amount of migration traffic and the time it takes; the down side is
-that during the postcopy phase, a failure of *either* side or the network
-connection causes the guest to be lost.
-
-In postcopy the destination CPUs are started before all the memory has been
-transferred, and accesses to pages that are yet to be transferred cause
-a fault that's translated by QEMU into a request to the source QEMU.
-
-Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
-doesn't finish in a given time the switch is made to postcopy.
-
-=== Enabling postcopy ===
-
-To enable postcopy, issue this command on the monitor prior to the
-start of migration:
-
-migrate_set_capability postcopy-ram on
-
-The normal commands are then used to start a migration, which is still
-started in precopy mode. Issuing:
-
-migrate_start_postcopy
-
-will now cause the transition from precopy to postcopy.
-It can be issued immediately after migration is started or any
-time later on. Issuing it after the end of a migration is harmless.
-
-Note: During the postcopy phase, the bandwidth limits set using
-migrate_set_speed are ignored (to avoid delaying requested pages that
-the destination is waiting for).
-
-=== Postcopy device transfer ===
-
-Loading of device data may cause the device emulation to access guest RAM,
-which may trigger faults that have to be resolved by the source. As such,
-the migration stream has to be able to respond with page data *during* the
-device load, and hence the device data has to be read from the stream
-completely before the device load begins, to free the stream up. This is
-achieved by 'packaging' the device data into a blob that's read in one go.
-
-Source behaviour
-
-Until postcopy is entered the migration stream is identical to normal
-precopy, except for the addition of a 'postcopy advise' command at
-the beginning, to tell the destination that postcopy might happen.
-When postcopy starts the source sends the page discard data and then
-forms the 'package' containing:
-
- Command: 'postcopy listen'
- The device state
- A series of sections, identical to the precopy stream's device state stream
- containing everything except postcopiable devices (i.e. RAM)
- Command: 'postcopy run'
-
-The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
-contents are formatted in the same way as the main migration stream.
-
-During postcopy the source scans the list of dirty pages and sends them
-to the destination without being requested (in much the same way as precopy),
-however when a page request is received from the destination, the dirty page
-scanning restarts from the requested location. This causes requested pages
-to be sent quickly, and also causes pages directly after the requested page
-to be sent quickly in the hope that those pages are likely to be used
-by the destination soon.
-
-Destination behaviour
-
-Initially the destination looks the same as precopy, with a single thread
-reading the migration stream; the 'postcopy advise' and 'discard' commands
-are processed to change the way RAM is managed, but don't affect the stream
-processing.
-
-------------------------------------------------------------------------------
- 1 2 3 4 5 6 7
-main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN )
-thread | |
- | (page request)
- | \___
- v \
-listen thread: --- page -- page -- page -- page -- page --
-
- a b c
-------------------------------------------------------------------------------
-
-On receipt of CMD_PACKAGED (1)
- All the data associated with the package - the ( ... ) section in the
-diagram - is read into memory, and the main thread recurses into
-qemu_loadvm_state_main to process the contents of the package (2)
-which contains commands (3,6) and devices (4...)
-
-On receipt of 'postcopy listen' (3) (i.e. the 1st command in the package)
-a new thread (a) is started that takes over servicing the migration stream,
-while the main thread carries on loading the package. It loads normal
-background page data (b) but if during a device load a fault happens (5) the
-returned page (c) is loaded by the listen thread, allowing the main thread's
-device load to carry on.
-
-The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
-CPUs start running.
-At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
-and is no longer used by migration, while the listen thread carries
-on servicing page data until the end of migration.
-
-=== Postcopy states ===
-
-Postcopy moves through a series of states (see postcopy_state) from
-ADVISE->DISCARD->LISTEN->RUNNING->END
-
- Advise: Set at the start of migration if postcopy is enabled, even
- if it hasn't had the start command; here the destination
- checks that its OS has the support needed for postcopy, and performs
- setup to ensure the RAM mappings are suitable for later postcopy.
- The destination will fail early in migration at this point if the
- required OS support is not present.
- (Triggered by reception of POSTCOPY_ADVISE command)
-
- Discard: Entered on receipt of the first 'discard' command; prior to
- the first Discard being performed, hugepages are switched off
- (using madvise) to ensure that no new huge pages are created
- during the postcopy phase, and to cause any huge pages that
- have discards on them to be broken.
-
- Listen: The first command in the package, POSTCOPY_LISTEN, switches
- the destination state to Listen, and starts a new thread
- (the 'listen thread') which takes over the job of receiving
- pages off the migration stream, while the main thread carries
- on processing the blob. With this thread able to process page
- reception, the destination now 'sensitises' the RAM to detect
- any access to missing pages (on Linux using the 'userfault'
- system).
-
- Running: POSTCOPY_RUN causes the destination to synchronise all
- state and start the CPUs and IO devices running. The main
- thread now finishes processing the migration package and
- now carries on as it would for normal precopy migration
- (although it can't do the cleanup it would do as it
- finishes a normal migration).
-
- End: The listen thread can now quit, and perform the cleanup of migration
- state; the migration is now complete.
-
-=== Source side page maps ===
-
-The source side keeps two bitmaps during postcopy; 'the migration bitmap'
-and 'unsent map'. The 'migration bitmap' is basically the same as in
-the precopy case, and holds a bit to indicate that page is 'dirty' -
-i.e. needs sending. During the precopy phase this is updated as the CPU
-dirties pages, however during postcopy the CPUs are stopped and nothing
-should dirty anything any more.
-
-The 'unsent map' is used for the transition to postcopy. It is a bitmap that
-has a bit cleared whenever a page is sent to the destination, however during
-the transition to postcopy mode it is combined with the migration bitmap
-to form a set of pages that:
- a) Have been sent but then redirtied (which must be discarded)
- b) Have not yet been sent - which also must be discarded to cause any
- transparent huge pages built during precopy to be broken.
-
-Note that the contents of the unsentmap are sacrificed during the calculation
-of the discard set and thus aren't valid once in postcopy. The dirtymap
-is still valid and is used to ensure that no page is sent more than once. Any
-request for a page that has already been sent is ignored. Duplicate requests
-such as this can happen as a page is sent at about the same time the
-destination accesses it.
-
-=== Postcopy with hugepages ===
-
-Postcopy now works with hugetlbfs backed memory:
- a) The linux kernel on the destination must support userfault on hugepages.
- b) The huge-page configuration on the source and destination VMs must be
- identical; i.e. RAMBlocks on both sides must use the same page size.
- c) Note that -mem-path /dev/hugepages will fall back to allocating normal
- RAM if it doesn't have enough hugepages, triggering (b) to fail.
- Using -mem-prealloc enforces the allocation using hugepages.
- d) Care should be taken with the size of hugepage used; postcopy with 2MB
- hugepages works well, however 1GB hugepages are likely to be problematic
- since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
- and until the full page is transferred the destination thread is blocked.
+++ /dev/null
-Copyright (c) 2015-2016 Linaro Ltd.
-
-This work is licensed under the terms of the GNU GPL, version 2 or
-later. See the COPYING file in the top-level directory.
-
-Introduction
-============
-
-This document outlines the design for multi-threaded TCG system-mode
-emulation. The current user-mode emulation mirrors the thread
-structure of the translated executable. Some of the work will be
-applicable to both system and linux-user emulation.
-
-The original system-mode TCG implementation was single threaded and
-dealt with multiple CPUs with simple round-robin scheduling. This
-simplified a lot of things but became increasingly limited as systems
-being emulated gained additional cores and per-core performance gains
-for host systems started to level off.
-
-vCPU Scheduling
-===============
-
-We introduce a new running mode where each vCPU will run on its own
-user-space thread. This will be enabled by default for all FE/BE
-combinations that have had the required work done to support this
-safely.
-
-In the general case of running translated code there should be no
-inter-vCPU dependencies and all vCPUs should be able to run at full
-speed. Synchronisation will only be required while accessing internal
-shared data structures or when the emulated architecture requires a
-coherent representation of the emulated machine state.
-
-Shared Data Structures
-======================
-
-Main Run Loop
--------------
-
-Even when there is no code being generated there are a number of
-structures associated with the hot-path through the main run-loop.
-These are associated with looking up the next translation block to
-execute. These include:
-
- tb_jmp_cache (per-vCPU, cache of recent jumps)
- tb_ctx.htable (global hash table, phys address->tb lookup)
-
-As TB linking only occurs when blocks are in the same page this code
-is critical to performance as looking up the next TB to execute is the
-most common reason to exit the generated code.
-
-DESIGN REQUIREMENT: Make access to lookup structures safe with
-multiple reader/writer threads. Minimise any lock contention to do it.
-
-The hot-path avoids using locks where possible. The tb_jmp_cache is
-updated with atomic accesses to ensure consistent results. The fall
-back QHT based hash table is also designed for lockless lookups. Locks
-are only taken when code generation is required or TranslationBlocks
-have their block-to-block jumps patched.
-
-Global TCG State
-----------------
-
-We need to protect the entire code generation cycle including any post
-generation patching of the translated code. This also implies a shared
-translation buffer which contains code running on all cores. Any
-execution path that comes to the main run loop will need to hold a
-mutex for code generation. This also includes times when we need to flush
-code or entries from any shared lookups/caches. Structures held on a
-per-vCPU basis won't need locking unless other vCPUs will need to
-modify them.
-
-DESIGN REQUIREMENT: Add locking around all code generation and TB
-patching.
-
-(Current solution)
-
-Mainly as part of the linux-user work all code generation is
-serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the
-place of mmap_lock() in linux-user.
-
-Translation Blocks
-------------------
-
-Currently the whole system shares a single code generation buffer
-which when full will force a flush of all translations and start from
-scratch again. Some operations also force a full flush of translations
-including:
-
- - debugging operations (breakpoint insertion/removal)
- - some CPU helper functions
-
-This is done with the async_safe_run_on_cpu() mechanism to ensure all
-vCPUs are quiescent when changes are being made to shared global
-structures.
-
-More granular translation invalidation events are typically due
-to a change of the state of a physical page:
-
- - code modification (self modify code, patching code)
- - page changes (new page mapping in linux-user mode)
-
-While setting the invalid flag in a TranslationBlock will stop it
-being used when looked up in the hot-path there are a number of other
-book-keeping structures that need to be safely cleared.
-
-Any TranslationBlocks which have been patched to jump directly to the
-now invalid blocks need the jump patches reversing so they will return
-to the C code.
-
-There are a number of look-up caches that need to be properly updated
-including the:
-
- - jump lookup cache
- - the physical-to-tb lookup hash table
- - the global page table
-
-The global page table (l1_map) provides a multi-level look-up for
-PageDesc structures, which contain pointers to the start of a linked
-list of all Translation Blocks in that page (see page_next).
-
-Both the jump patching and the page cache involve linked lists that
-the invalidated TranslationBlock needs to be removed from.
-
-DESIGN REQUIREMENT: Safely handle invalidation of TBs
- - safely patch/revert direct jumps
- - remove central PageDesc lookup entries
- - ensure lookup caches/hashes are safely updated
-
-(Current solution)
-
-The direct jumps themselves are updated atomically by the TCG
-tb_set_jmp_target() code. Modifications to the linked lists that allow
-searching for linked pages are done under the protection of the
-tb_lock().
-
-The global page table is protected by the tb_lock() in system-mode and
-mmap_lock() in linux-user mode.
-
-The lookup caches are updated atomically and the lookup hash uses QHT
-which is designed for concurrent safe lookup.
-
-
-Memory maps and TLBs
---------------------
-
-The memory handling code is fairly critical to the speed of memory
-access in the emulated system. The SoftMMU code is designed so the
-hot-path can be handled entirely within translated code. This is
-handled with a per-vCPU TLB structure which once populated will allow
-a series of accesses to the page to occur without exiting the
-translated code. It is possible to set flags in the TLB address which
-will ensure the slow-path is taken for each access. This can be done
-to support:
-
- - Memory regions (dividing up access to PIO, MMIO and RAM)
- - Dirty page tracking (for code gen, SMC detection, migration and display)
- - Virtual TLB (for translating guest address->real address)
-
-When the TLB tables are updated by a vCPU thread other than their own
-we need to ensure it is done in a safe way so no inconsistent state is
-seen by the vCPU thread.
-
-Some operations require updating a number of vCPUs TLBs at the same
-time in a synchronised manner.
-
-DESIGN REQUIREMENTS:
-
- - TLB Flush All/Page
- - can be across-vCPUs
- - cross vCPU TLB flush may need other vCPU brought to halt
- - change may need to be visible to the calling vCPU immediately
- - TLB Flag Update
- - usually cross-vCPU
- - want change to be visible as soon as possible
- - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs)
- - This is a per-vCPU table - by definition can't race
- - updated by its own thread when the slow-path is forced
-
-(Current solution)
-
-We have updated cputlb.c to defer cross-vCPU operations with
-async_run_on_cpu(), which ensures each vCPU sees a coherent state
-when it next runs its work (in a few instructions' time).
-
-A new set of operations (tlb_flush_*_all_cpus) take an additional flag
-which when set will force synchronisation by setting the source vCPU's
-work as "safe work" and exiting the cpu run loop. This ensures that by
-the time execution restarts all flush operations have completed.
-
-TLB flag updates are all done atomically and are also protected by the
-tb_lock() which is used by the functions that update the TLB in bulk.
-
-(Known limitation)
-
-Not really a limitation but the wait mechanism is overly strict for
-some architectures which only need flushes completed by a barrier
-instruction. This could be a future optimisation.
-
-Emulated hardware state
------------------------
-
-Currently thanks to KVM work any access to IO memory is automatically
-protected by the global iothread mutex, also known as the BQL (Big
-Qemu Lock). Any IO region that doesn't use global mutex is expected to
-do its own locking.
-
-However IO memory isn't the only way emulated hardware state can be
-modified. Some architectures have model specific registers that
-trigger hardware emulation features. Generally any translation helper
-that needs to update more than a single vCPU's state should take the
-BQL.
-
-As the BQL, or global iothread mutex is shared across the system we
-push the use of the lock as far down into the TCG code as possible to
-minimise contention.
-
-(Current solution)
-
-MMIO access automatically serialises hardware emulation by way of the
-BQL. Currently ARM targets serialise all ARM_CP_IO register accesses
-and also defer the reset/startup of vCPUs to the vCPU context by way
-of async_run_on_cpu().
-
-Updates to interrupt state are also protected by the BQL as they can
-often be cross vCPU.
-
-Memory Consistency
-==================
-
-Between emulated guests and host systems there are a range of memory
-consistency models. Even emulating weakly ordered systems on strongly
-ordered hosts needs to ensure things like store-after-load re-ordering
-can be prevented when the guest wants to.
-
-Memory Barriers
----------------
-
-Barriers (sometimes known as fences) provide a mechanism for software
-to enforce a particular ordering of memory operations from the point
-of view of external observers (e.g. another processor core). They can
-apply to any memory operations as well as just loads or stores.
-
-The Linux kernel has an excellent write-up on the various forms of
-memory barrier and the guarantees they can provide [1].
-
-Barriers are often wrapped around synchronisation primitives to
-provide explicit memory ordering semantics. However they can be used
-by themselves to provide safe lockless access by ensuring for example
-a change to a signal flag will only be visible once the changes to
-payload are.
-
-DESIGN REQUIREMENT: Add a new tcg_memory_barrier op
-
-This would enforce a strong load/store ordering so all loads/stores
-complete at the memory barrier. On single-core non-SMP strongly
-ordered backends this could become a NOP.
-
-Aside from explicit standalone memory barrier instructions there are
-also implicit memory ordering semantics which come with each guest
-memory access instruction. For example all x86 load/stores come with
-fairly strong guarantees of sequential consistency whereas ARM has
-special variants of load/store instructions that imply acquire/release
-semantics.
-
-In the case of a strongly ordered guest architecture being emulated on
-a weakly ordered host the scope for a heavy performance impact is
-quite high.
-
-DESIGN REQUIREMENTS: Be efficient with use of memory barriers
- - host systems with stronger implied guarantees can skip some barriers
- - merge consecutive barriers to the strongest one
-
-(Current solution)
-
-The system currently has a tcg_gen_mb() which will add memory barrier
-operations if code generation is being done in a parallel context. The
-tcg_optimize() function attempts to merge barriers up to their
-strongest form before any load/store operations. The solution was
-originally developed and tested for linux-user based systems. All
-backends have been converted to emit fences when required. So far the
-following front-ends have been updated to emit fences when required:
-
- - target-i386
- - target-arm
- - target-aarch64
- - target-alpha
- - target-mips
-
-Memory Control and Maintenance
-------------------------------
-
-This includes a class of instructions for controlling system cache
-behaviour. While QEMU doesn't model cache behaviour these instructions
-are often seen when code modification has taken place to ensure the
-changes take effect.
-
-Synchronisation Primitives
---------------------------
-
-There are two broad types of synchronisation primitives found in
-modern ISAs: atomic instructions and exclusive regions.
-
-The first type offers a simple atomic instruction which will guarantee
-some sort of test and conditional store will be truly atomic w.r.t.
-other cores sharing access to the memory. The classic example is the
-x86 cmpxchg instruction.
-
-The second type offers a pair of load/store instructions which offer a
-guarantee that a region of memory has not been touched between the
-load and store instructions. An example of this is ARM's ldrex/strex
-pair where the strex instruction will return a flag indicating a
-successful store only if no other CPU has accessed the memory region
-since the ldrex.
-
-Traditionally TCG has generated a series of operations that work
-because they are within the context of a single translation block so
-will have completed before another CPU is scheduled. However with
-the ability to have multiple threads running to emulate multiple CPUs
-we will need to explicitly expose these semantics.
-
-DESIGN REQUIREMENTS:
- - Support classic atomic instructions
- - Support load/store exclusive (or load link/store conditional) pairs
- - Generic enough infrastructure to support all guest architectures
-CURRENT OPEN QUESTIONS:
- - How problematic is the ABA problem in general?
-
-(Current solution)
-
-The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which
-can be used directly or combined to emulate other instructions like
-ARM's ldrex/strex instructions. While they are susceptible to the ABA
-problem so far common guests have not implemented patterns where
-this may be a problem - typically presenting a locking ABI which
-assumes cmpxchg like semantics.
-
-The code also includes a fall-back for cases where multi-threaded TCG
-ops can't work (e.g. guest atomic width > host atomic width). In this
-case an EXCP_ATOMIC exit occurs and the instruction is emulated with
-an exclusive lock which ensures all emulation is serialised.
-
-While the atomic helpers look good enough for now there may be a need
-to look at solutions that can more closely model the guest
-architectures' semantics.
-
-==========
-
-[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt
+++ /dev/null
-Copyright (c) 2014 Red Hat Inc.
-
-This work is licensed under the terms of the GNU GPL, version 2 or later. See
-the COPYING file in the top-level directory.
-
-
-This document explains the IOThread feature and how to write code that runs
-outside the QEMU global mutex.
-
-The main loop and IOThreads
----------------------------
-QEMU is an event-driven program that can do several things at once using an
-event loop. The VNC server and the QMP monitor are both processed from the
-same event loop, which monitors their file descriptors until they become
-readable and then invokes a callback.
-
-The default event loop is called the main loop (see main-loop.c). It is
-possible to create additional event loop threads using -object
-iothread,id=my-iothread.
-
-Side note: The main loop and IOThread are both event loops but their code is
-not shared completely. Sometimes it is useful to remember that although they
-are conceptually similar they are currently not interchangeable.
-
-Why IOThreads are useful
-------------------------
-IOThreads allow the user to control the placement of work. The main loop is a
-scalability bottleneck on hosts with many CPUs. Work can be spread across
-several IOThreads instead of just one main loop. When set up correctly this
-can improve I/O latency and reduce jitter seen by the guest.
-
-The main loop is also deeply associated with the QEMU global mutex, which is a
-scalability bottleneck in itself. vCPU threads and the main loop use the QEMU
-global mutex to serialize execution of QEMU code. This mutex is necessary
-because a lot of QEMU's code historically was not thread-safe.
-
-The fact that all I/O processing is done in a single main loop and that the
-QEMU global mutex is contended by all vCPU threads and the main loop explains
-why it is desirable to place work into IOThreads.
-
-The experimental virtio-blk data-plane implementation has been benchmarked and
-shows these effects:
-ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf
-
-How to program for IOThreads
-----------------------------
-The main difference between legacy code and new code that can run in an
-IOThread is dealing explicitly with the event loop object, AioContext
-(see include/block/aio.h). Code that only works in the main loop
-implicitly uses the main loop's AioContext. Code that supports running
-in IOThreads must be aware of its AioContext.
-
-AioContext supports the following services:
- * File descriptor monitoring (read/write/error on POSIX hosts)
- * Event notifiers (inter-thread signalling)
- * Timers
- * Bottom Halves (BH) deferred callbacks
-
-There are several old APIs that use the main loop AioContext:
- * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor
- * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier
- * LEGACY timer_new_ms() - create a timer
- * LEGACY qemu_bh_new() - create a BH
- * LEGACY qemu_aio_wait() - run an event loop iteration
-
-Since they implicitly work on the main loop they cannot be used in code that
-runs in an IOThread. They might cause a crash or deadlock if called from an
-IOThread since the QEMU global mutex is not held.
-
-Instead, use the AioContext functions directly (see include/block/aio.h):
- * aio_set_fd_handler() - monitor a file descriptor
- * aio_set_event_notifier() - monitor an event notifier
- * aio_timer_new() - create a timer
- * aio_bh_new() - create a BH
- * aio_poll() - run an event loop iteration
-
-The AioContext can be obtained from the IOThread using
-iothread_get_aio_context() or for the main loop using qemu_get_aio_context().
-Code that takes an AioContext argument works both in IOThreads or the main
-loop, depending on which AioContext instance the caller passes in.
-
-How to synchronize with an IOThread
------------------------------------
-AioContext is not thread-safe so some rules must be followed when using file
-descriptors, event notifiers, timers, or BHs across threads:
-
-1. AioContext functions can always be called safely. They handle their
-own locking internally.
-
-2. Other threads wishing to access the AioContext must use
-aio_context_acquire()/aio_context_release() for mutual exclusion. Once the
-context is acquired no other thread can access it or run event loop iterations
-in this AioContext.
-
-aio_context_acquire()/aio_context_release() calls may be nested. This
-means you can call them if you're not sure whether #2 applies.
-
-There is currently no lock ordering rule if a thread needs to acquire multiple
-AioContexts simultaneously. Therefore, it is only safe for code holding the
-QEMU global mutex to acquire other AioContexts.
-
-Side note: the best way to schedule a function call across threads is to call
-aio_bh_schedule_oneshot(). No acquire/release or locking is needed.
-
-AioContext and the block layer
-------------------------------
-The AioContext originates from the QEMU block layer, even though nowadays
-AioContext is a generic event loop that can be used by any QEMU subsystem.
-
-The block layer has support for AioContext integrated. Each BlockDriverState
-is associated with an AioContext using bdrv_set_aio_context() and
-bdrv_get_aio_context(). This allows block layer code to process I/O inside the
-right AioContext. Other subsystems may wish to follow a similar approach.
-
-Block layer code must therefore expect to run in an IOThread and avoid using
-old APIs that implicitly use the main loop. See the "How to program for
-IOThreads" section above for information on how to do that.
-IOThreads" above for information on how to do that.
-
-If main loop code such as a QMP function wishes to access a BlockDriverState
-it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
-that callbacks in the IOThread do not run in parallel.
-
-Code running in the monitor typically needs to ensure that past
-requests from the guest are completed. When a block device is running
-in an IOThread, the IOThread can also process requests from the guest
-(via ioeventfd). To achieve both objectives, wrap the code between
-bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained
-section". The functions must be called between aio_context_acquire()
-and aio_context_release(). You can freely release and re-acquire the
-AioContext within a drained section.
-
-Long-running jobs (usually in the form of coroutines) are best scheduled in
-the BlockDriverState's AioContext to avoid the need to acquire/release around
-each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier,
-or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends,
-can be used to get a notification whenever bdrv_set_aio_context() moves a
-BlockDriverState to a different AioContext.
+++ /dev/null
-# q35 - Emulated guest (graphical console)
-# =========================================================
-#
-# Usage:
-#
-# $ qemu-system-x86_64 \
-# -nodefaults \
-# -readconfig q35-emulated.cfg
-#
-# You will probably need to tweak the lines marked as
-# CHANGE ME before being able to use this configuration!
-#
-# The guest will have a selection of emulated devices that
-# closely resembles that of a physical machine, and will be
-# accessed through a graphical console.
-#
-# ---------------------------------------------------------
-#
-# Using -nodefaults is required to have full control over
-# the virtual hardware: when it's specified, QEMU will
-# populate the board with only the builtin peripherals
-# plus a small selection of core PCI devices and
-# controllers; the user will then have to explicitly add
-# further devices.
-#
-# The core PCI devices show up in the guest as:
-#
-# 00:00.0 Host bridge
-# 00:1f.0 ISA bridge / LPC
-# 00:1f.2 SATA (AHCI) controller
-# 00:1f.3 SMBus controller
-#
-# This configuration file adds a number of devices that
-# are pretty much guaranteed to be present in every single
-# physical machine based on q35, more specifically:
-#
-# 00:01.0 VGA compatible controller
-# 00:19.0 Ethernet controller
-# 00:1a.* USB controller (#2)
-# 00:1b.0 Audio device
-# 00:1c.* PCI bridge (PCI Express Root Ports)
-# 00:1d.* USB controller (#1)
-# 00:1e.0 PCI bridge (legacy PCI bridge)
-#
-# More information about these devices is available below.
-
-
-# Machine options
-# =========================================================
-#
-# We use the q35 machine type and enable KVM acceleration
-# for better performance.
-#
-# Using less than 1 GiB of memory is probably not going to
-# yield good performance in the guest, and might even lead
-# to obscure boot issues in some cases.
-#
-# Unfortunately, there is no way to configure the CPU model
-# in this file, so it will have to be provided on the
-# command line.
-
-[machine]
- type = "q35"
- accel = "kvm"
-
-[memory]
- size = "1024"
-
-
-# PCI bridge (PCI Express Root Ports)
-# =========================================================
-#
-# We add four PCI Express Root Ports, all sharing the same
-# slot on the PCI Express Root Bus. These ports support
-# hotplug.
-
-[device "ich9-pcie-port-1"]
- driver = "ioh3420"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1c.0"
- port = "1"
- chassis = "1"
-
-[device "ich9-pcie-port-2"]
- driver = "ioh3420"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1c.1"
- port = "2"
- chassis = "2"
-
-[device "ich9-pcie-port-3"]
- driver = "ioh3420"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1c.2"
- port = "3"
- chassis = "3"
-
-[device "ich9-pcie-port-4"]
- driver = "ioh3420"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1c.3"
- port = "4"
- chassis = "4"
-
-
-# PCI bridge (legacy PCI bridge)
-# =========================================================
-#
-# This bridge can be used to build an independent topology
-# for legacy PCI devices. PCI Express devices should be
-# plugged into PCI Express slots instead, so ideally there
-# will be no devices connected to this bridge.
-
-[device "ich9-pci-bridge"]
- driver = "i82801b11-bridge"
- bus = "pcie.0"
- addr = "1e.0"
-
-
-# SATA storage
-# =========================================================
-#
-# An implicit SATA controller is created automatically for
-# every single q35 guest; here we create a disk, backed by
-# a qcow2 disk image on the host's filesystem, and attach
-# it to that controller so that the guest can use it.
-#
-# We also create an optical disk, mostly for installation
-# purposes: once the guest OS has been successfully
-# installed, the guest will no longer boot from optical
-# media. If you don't want, or no longer want, to have an
-# optical disk in the guest you can safely comment out
-# all relevant sections below.
-
-[device "sata-disk"]
- driver = "ide-hd"
- bus = "ide.0"
- drive = "disk"
- bootindex = "1"
-
-[drive "disk"]
- file = "guest.qcow2" # CHANGE ME
- format = "qcow2"
- if = "none"
-
-[device "sata-optical-disk"]
- driver = "ide-cd"
- bus = "ide.1"
- drive = "optical-disk"
- bootindex = "2"
-
-[drive "optical-disk"]
- file = "install.iso" # CHANGE ME
- format = "raw"
- if = "none"
-
-
-# USB controller (#1)
-# =========================================================
-#
-# EHCI controller + UHCI companion controllers.
-
-[device "ich9-ehci-1"]
- driver = "ich9-usb-ehci1"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1d.7"
-
-[device "ich9-uhci-1"]
- driver = "ich9-usb-uhci1"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1d.0"
- masterbus = "ich9-ehci-1.0"
- firstport = "0"
-
-[device "ich9-uhci-2"]
- driver = "ich9-usb-uhci2"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1d.1"
- masterbus = "ich9-ehci-1.0"
- firstport = "2"
-
-[device "ich9-uhci-3"]
- driver = "ich9-usb-uhci3"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1d.2"
- masterbus = "ich9-ehci-1.0"
- firstport = "4"
-
-
-# USB controller (#2)
-# =========================================================
-#
-# EHCI controller + UHCI companion controllers.
-
-[device "ich9-ehci-2"]
- driver = "ich9-usb-ehci2"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1a.7"
-
-[device "ich9-uhci-4"]
- driver = "ich9-usb-uhci4"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1a.0"
- masterbus = "ich9-ehci-2.0"
- firstport = "0"
-
-[device "ich9-uhci-5"]
- driver = "ich9-usb-uhci5"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1a.1"
- masterbus = "ich9-ehci-2.0"
- firstport = "2"
-
-[device "ich9-uhci-6"]
- driver = "ich9-usb-uhci6"
- multifunction = "on"
- bus = "pcie.0"
- addr = "1a.2"
- masterbus = "ich9-ehci-2.0"
- firstport = "4"
-
-
-# Ethernet controller
-# =========================================================
-#
-# We add a Gigabit Ethernet interface to the guest; on the
-# host side, we take advantage of user networking so that
-# the QEMU process doesn't require any additional
-# privileges.
-
-[netdev "hostnet"]
- type = "user"
-
-[device "net"]
- driver = "e1000"
- netdev = "hostnet"
- bus = "pcie.0"
- addr = "19.0"
-
-
-# VGA compatible controller
-# =========================================================
-#
-# We use stdvga instead of Cirrus as it supports more video
-# modes and is closer to what actual hardware looks like.
-#
-# If you're running the guest on a remote, potentially
-# headless host, you will probably want to append something
-# like
-#
-# -display vnc=127.0.0.1:0
-#
-# to the command line in order to prevent QEMU from
-# creating a graphical display window on the host and
-# enable remote access instead.
-
-[device "video"]
- driver = "VGA"
- bus = "pcie.0"
- addr = "01.0"
-
-
-# Audio device
-# =========================================================
-#
-# The sound card is a legacy PCI device that is plugged
-# directly into the PCI Express Root Bus.
-
-[device "ich9-hda-audio"]
- driver = "ich9-intel-hda"
- bus = "pcie.0"
- addr = "1b.0"
-
-[device "ich9-hda-duplex"]
- driver = "hda-duplex"
- bus = "ich9-hda-audio.0"
- cad = "0"
+++ /dev/null
-# q35 - VirtIO guest (graphical console)
-# =========================================================
-#
-# Usage:
-#
-# $ qemu-system-x86_64 \
-# -nodefaults \
-# -readconfig q35-virtio-graphical.cfg
-#
-# You will probably need to tweak the lines marked as
-# CHANGE ME before being able to use this configuration!
-#
-# The guest will have a selection of VirtIO devices
-# tailored towards optimal performance with modern guests,
-# and will be accessed through a graphical console.
-#
-# ---------------------------------------------------------
-#
-# Using -nodefaults is required to have full control over
-# the virtual hardware: when it's specified, QEMU will
-# populate the board with only the builtin peripherals
-# plus a small selection of core PCI devices and
-# controllers; the user will then have to explicitly add
-# further devices.
-#
-# The core PCI devices show up in the guest as:
-#
-# 00:00.0 Host bridge
-# 00:1f.0 ISA bridge / LPC
-# 00:1f.2 SATA (AHCI) controller
-# 00:1f.3 SMBus controller
-#
-# This configuration file adds a number of other useful
-# devices, more specifically:
-#
-# 00:01.0 VGA compatible controller
-# 00:1b.0 Audio device
-# 00:1c.* PCI bridge (PCI Express Root Ports)
-# 01:00.0 SCSI storage controller
-# 02:00.0 Ethernet controller
-# 03:00.0 USB controller
-#
-# More information about these devices is available below.
-
-
-# Machine options
-# =========================================================
-#
-# We use the q35 machine type and enable KVM acceleration
-# for better performance.
-#
-# Using less than 1 GiB of memory is probably not going to
-# yield good performance in the guest, and might even lead
-# to obscure boot issues in some cases.
-
-[machine]
- type = "q35"
- accel = "kvm"
-
-[memory]
- size = "1024"
-
-
-# PCI bridge (PCI Express Root Ports)
-# =========================================================
-#
-# We create eight PCI Express Root Ports, and we plug them
-# all into separate functions of the same slot. Some of
-# them will be used by devices, the rest will remain
-# available for hotplug.
-
-[device "pcie.1"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.0"
- port = "1"
- chassis = "1"
- multifunction = "on"
-
-[device "pcie.2"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.1"
- port = "2"
- chassis = "2"
-
-[device "pcie.3"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.2"
- port = "3"
- chassis = "3"
-
-[device "pcie.4"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.3"
- port = "4"
- chassis = "4"
-
-[device "pcie.5"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.4"
- port = "5"
- chassis = "5"
-
-[device "pcie.6"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.5"
- port = "6"
- chassis = "6"
-
-[device "pcie.7"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.6"
- port = "7"
- chassis = "7"
-
-[device "pcie.8"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.7"
- port = "8"
- chassis = "8"
-
-
-# SCSI storage controller (and storage)
-# =========================================================
-#
-# We use virtio-scsi here so that we can (hot)plug a large
-# number of disks without running into issues; a SCSI disk,
-# backed by a qcow2 disk image on the host's filesystem, is
-# attached to it.
-#
-# We also create an optical disk, mostly for installation
-# purposes: once the guest OS has been successfully
-# installed, the guest will no longer boot from optical
-# media. If you don't want, or no longer want, to have an
-# optical disk in the guest you can safely comment out
-# all relevant sections below.
-
-[device "scsi"]
- driver = "virtio-scsi-pci"
- bus = "pcie.1"
- addr = "00.0"
-
-[device "scsi-disk"]
- driver = "scsi-hd"
- bus = "scsi.0"
- drive = "disk"
- bootindex = "1"
-
-[drive "disk"]
- file = "guest.qcow2" # CHANGE ME
- format = "qcow2"
- if = "none"
-
-[device "scsi-optical-disk"]
- driver = "scsi-cd"
- bus = "scsi.0"
- drive = "optical-disk"
- bootindex = "2"
-
-[drive "optical-disk"]
- file = "install.iso" # CHANGE ME
- format = "raw"
- if = "none"
-
-
-# Ethernet controller
-# =========================================================
-#
-# We use virtio-net for improved performance over emulated
-# hardware; on the host side, we take advantage of user
-# networking so that the QEMU process doesn't require any
-# additional privileges.
-
-[netdev "hostnet"]
- type = "user"
-
-[device "net"]
- driver = "virtio-net-pci"
- netdev = "hostnet"
- bus = "pcie.2"
- addr = "00.0"
-
-
-# USB controller (and input devices)
-# =========================================================
-#
-# We add a virtualization-friendly USB 3.0 controller and
-# a USB tablet so that graphical guests can be controlled
-# appropriately. A USB keyboard is not needed, as q35
-# guests get a PS/2 one added automatically.
-
-[device "usb"]
- driver = "nec-usb-xhci"
- bus = "pcie.3"
- addr = "00.0"
-
-[device "tablet"]
- driver = "usb-tablet"
- bus = "usb.0"
-
-
-# VGA compatible controller
-# =========================================================
-#
-# We plug the QXL video card directly into the PCI Express
-# Root Bus as it is a legacy PCI device; this way, we can
-# reduce the number of PCI Express controllers in the
-# guest.
-#
-# If you're running the guest on a remote, potentially
-# headless host, you will probably want to append something
-# like
-#
-# -display vnc=127.0.0.1:0
-#
-# to the command line in order to prevent QEMU from
-# creating a graphical display window on the host and
-# enable remote access instead.
-
-[device "video"]
- driver = "qxl-vga"
- bus = "pcie.0"
- addr = "01.0"
-
-
-# Audio device
-# =========================================================
-#
-# Like the video card, the sound card is a legacy PCI
-# device and as such can be plugged directly into the PCI
-# Express Root Bus.
-
-[device "sound"]
- driver = "ich9-intel-hda"
- bus = "pcie.0"
- addr = "1b.0"
-
-[device "duplex"]
- driver = "hda-duplex"
- bus = "sound.0"
- cad = "0"
+++ /dev/null
-# q35 - VirtIO guest (serial console)
-# =========================================================
-#
-# Usage:
-#
-# $ qemu-system-x86_64 \
-# -nodefaults \
-# -readconfig q35-virtio-serial.cfg \
-# -display none -serial mon:stdio
-#
-# You will probably need to tweak the lines marked as
-# CHANGE ME before being able to use this configuration!
-#
-# The guest will have a selection of VirtIO devices
-# tailored towards optimal performance with modern guests,
-# and will be accessed through the serial console.
-#
-# ---------------------------------------------------------
-#
-# Using -nodefaults is required to have full control over
-# the virtual hardware: when it's specified, QEMU will
-# populate the board with only the builtin peripherals
-# plus a small selection of core PCI devices and
-# controllers; the user will then have to explicitly add
-# further devices.
-#
-# The core PCI devices show up in the guest as:
-#
-# 00:00.0 Host bridge
-# 00:1f.0 ISA bridge / LPC
-# 00:1f.2 SATA (AHCI) controller
-# 00:1f.3 SMBus controller
-#
-# This configuration file adds a number of other useful
-# devices, more specifically:
-#
-# 00:1c.* PCI bridge (PCI Express Root Ports)
-# 01:00.0 SCSI storage controller
-# 02:00.0 Ethernet controller
-#
-# More information about these devices is available below.
-#
-# We use '-display none' to prevent QEMU from creating a
-# graphical display window, which would serve no use in
-# this specific configuration, and '-serial mon:stdio' to
-# multiplex the guest's serial console and the QEMU monitor
-# to the host's stdio; use 'Ctrl+A h' to learn how to
-# switch between the two and more.
-
-
-# Machine options
-# =========================================================
-#
-# We use the q35 machine type and enable KVM acceleration
-# for better performance.
-#
-# Using less than 1 GiB of memory is probably not going to
-# yield good performance in the guest, and might even lead
-# to obscure boot issues in some cases.
-
-[machine]
- type = "q35"
- accel = "kvm"
-
-[memory]
- size = "1024"
-
-
-# PCI bridge (PCI Express Root Ports)
-# =========================================================
-#
-# We create eight PCI Express Root Ports, and we plug them
-# all into separate functions of the same slot. Some of
-# them will be used by devices, the rest will remain
-# available for hotplug.
-
-[device "pcie.1"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.0"
- port = "1"
- chassis = "1"
- multifunction = "on"
-
-[device "pcie.2"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.1"
- port = "2"
- chassis = "2"
-
-[device "pcie.3"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.2"
- port = "3"
- chassis = "3"
-
-[device "pcie.4"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.3"
- port = "4"
- chassis = "4"
-
-[device "pcie.5"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.4"
- port = "5"
- chassis = "5"
-
-[device "pcie.6"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.5"
- port = "6"
- chassis = "6"
-
-[device "pcie.7"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.6"
- port = "7"
- chassis = "7"
-
-[device "pcie.8"]
- driver = "pcie-root-port"
- bus = "pcie.0"
- addr = "1c.7"
- port = "8"
- chassis = "8"
-
-
-# SCSI storage controller (and storage)
-# =========================================================
-#
-# We use virtio-scsi here so that we can (hot)plug a large
-# number of disks without running into issues; a SCSI disk,
-# backed by a qcow2 disk image on the host's filesystem, is
-# attached to it.
-#
-# We also create an optical disk, mostly for installation
-# purposes: once the guest OS has been successfully
-# installed, the guest will no longer boot from optical
-# media. If you don't want, or no longer want, to have an
-# optical disk in the guest you can safely comment out
-# all relevant sections below.
-
-[device "scsi"]
- driver = "virtio-scsi-pci"
- bus = "pcie.1"
- addr = "00.0"
-
-[device "scsi-disk"]
- driver = "scsi-hd"
- bus = "scsi.0"
- drive = "disk"
- bootindex = "1"
-
-[drive "disk"]
- file = "guest.qcow2" # CHANGE ME
- format = "qcow2"
- if = "none"
-
-[device "scsi-optical-disk"]
- driver = "scsi-cd"
- bus = "scsi.0"
- drive = "optical-disk"
- bootindex = "2"
-
-[drive "optical-disk"]
- file = "install.iso" # CHANGE ME
- format = "raw"
- if = "none"
-
-
-# Ethernet controller
-# =========================================================
-#
-# We use virtio-net for improved performance over emulated
-# hardware; on the host side, we take advantage of user
-# networking so that the QEMU process doesn't require any
-# additional privileges.
-
-[netdev "hostnet"]
- type = "user"
-
-[device "net"]
- driver = "virtio-net-pci"
- netdev = "hostnet"
- bus = "pcie.2"
- addr = "00.0"
+++ /dev/null
-= How to use the QAPI code generator =
-
-Copyright IBM Corp. 2011
-Copyright (C) 2012-2016 Red Hat, Inc.
-
-This work is licensed under the terms of the GNU GPL, version 2 or
-later. See the COPYING file in the top-level directory.
-
-== Introduction ==
-
-QAPI is a native C API within QEMU which provides management-level
-functionality to internal and external users. For external
-users/processes, this interface is made available by a JSON-based wire
-format for the QEMU Monitor Protocol (QMP) for controlling qemu, as
-well as the QEMU Guest Agent (QGA) for communicating with the guest.
-The remainder of this document uses "Client JSON Protocol" when
-referring to the wire contents of a QMP or QGA connection.
-
-To map Client JSON Protocol interfaces to the native C QAPI
-implementations, a JSON-based schema is used to define types and
-function signatures, and a set of scripts is used to generate types,
-signatures, and marshaling/dispatch code. This document will describe
-how the schemas, scripts, and resulting code are used.
-
-
-== QMP/Guest agent schema ==
-
-A QAPI schema file is designed to be loosely based on JSON
-(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style
-and the use of comments; a QAPI schema file is then parsed by a python
-code generation program. A valid QAPI schema consists of a series of
-top-level expressions, with no commas between them. Where
-dictionaries (JSON objects) are used, they are parsed as python
-OrderedDicts so that ordering is preserved (for predictable layout of
-generated C structs and parameter lists). Ordering doesn't matter
-between top-level expressions or the keys within an expression, but
-does matter within dictionary values for 'data' and 'returns' members
-of a single expression. QAPI schema input is written using 'single
-quotes' instead of JSON's "double quotes" (in contrast, Client JSON
-Protocol uses no comments, and while input accepts 'single quotes' as
-an extension, output is strict JSON using only "double quotes"). As
-in JSON, trailing commas are not permitted in arrays or dictionaries.
-Input must be ASCII (although QMP supports full Unicode strings, the
-QAPI parser does not). At present, there is no place where a QAPI
-schema requires the use of JSON numbers or null.
-
-
-=== Comments ===
-
-Comments are allowed; anything between an unquoted # and the following
-newline is ignored.
-
-A multi-line comment that starts and ends with a '##' line is a
-documentation comment. These are parsed by the documentation
-generator, which recognizes certain markup detailed below.
-
-
-==== Documentation markup ====
-
-Comment text starting with '=' is a section title:
-
- # = Section title
-
-Double the '=' for a subsection title:
-
- # == Subsection title
-
-'|' denotes examples:
-
- # | Text of the example, may span
- # | multiple lines
-
-'*' starts an itemized list:
-
- # * First item, may span
- # multiple lines
- # * Second item
-
-You can also use '-' instead of '*'.
-
-A decimal number followed by '.' starts a numbered list:
-
- # 1. First item, may span
- # multiple lines
- # 2. Second item
-
-The actual number doesn't matter. You could even use '*' instead of
-'2.' for the second item.
-
-Lists can't be nested. Blank lines are currently not supported within
-lists.
-
-Additional whitespace between the initial '#' and the comment text is
-permitted.
-
-*foo* and _foo_ are for strong and emphasis styles respectively (they
-do not work over multiple lines). @foo is used to reference a name in
-the schema.
-
-Example:
-
-##
-# = Section
-# == Subsection
-#
-# Some text foo with *strong* and _emphasis_
-# 1. with a list
-# 2. like that
-#
-# And some code:
-# | $ echo foo
-# | -> do this
-# | <- get that
-#
-##
-
-
-==== Expression documentation ====
-
-Each expression that isn't an include directive may be preceded by a
-documentation block. Such blocks are called expression documentation
-blocks.
-
-When documentation is required (see pragma 'doc-required'), expression
-documentation blocks are mandatory.
-
-The documentation block consists of a first line naming the
-expression, an optional overview, a description of each argument (for
-commands and events) or member (for structs, unions and alternates),
-and optional tagged sections.
-
-FIXME: the parser accepts these things in almost any order.
-
-Extensions added after the expression was first released carry a
-'(since x.y.z)' comment.
-
-A tagged section starts with one of the following words:
-"Note:"/"Notes:", "Since:", "Example:"/"Examples:", "Returns:", "TODO:".
-The section ends with the start of a new section.
-
-A 'Since: x.y.z' tagged section lists the release that introduced the
-expression.
-
-For example:
-
-##
-# @BlockStats:
-#
-# Statistics of a virtual block device or a block backing device.
-#
-# @device: If the stats are for a virtual block device, the name
-# corresponding to the virtual block device.
-#
-# @node-name: The node name of the device. (since 2.3)
-#
-# ... more members ...
-#
-# Since: 0.14.0
-##
-{ 'struct': 'BlockStats',
- 'data': {'*device': 'str', '*node-name': 'str',
- ... more members ... } }
-
-##
-# @query-blockstats:
-#
-# Query the @BlockStats for all virtual block devices.
-#
-# @query-nodes: If true, the command will query all the
-# block nodes ... explain, explain ... (since 2.3)
-#
-# Returns: A list of @BlockStats for each virtual block device.
-#
-# Since: 0.14.0
-#
-# Example:
-#
-# -> { "execute": "query-blockstats" }
-# <- {
-# ... lots of output ...
-# }
-#
-##
-{ 'command': 'query-blockstats',
- 'data': { '*query-nodes': 'bool' },
- 'returns': ['BlockStats'] }
-
-==== Free-form documentation ====
-
-A documentation block that isn't an expression documentation block is
-a free-form documentation block. These may be used to provide
-additional text and structuring content.
-
-
-=== Schema overview ===
-
-The schema sets up a series of types, as well as commands and events
-that will use those types. Forward references are allowed: the parser
-scans in two passes, where the first pass learns all type names, and
-the second validates the schema and generates the code. This allows
-the definition of complex structs that can have mutually recursive
-types, and allows for indefinite nesting of Client JSON Protocol that
-satisfies the schema. A type name should not be defined more than
-once. It is permissible for the schema to contain additional types
-not used by any commands or events in the Client JSON Protocol, for
-the side effect of generated C code used internally.
-
-There are eight top-level expressions recognized by the parser:
-'include', 'pragma', 'command', 'struct', 'enum', 'union',
-'alternate', and 'event'. There are several groups of types: simple
-types (a number of built-in types, such as 'int' and 'str'; as well as
-enumerations), complex types (structs and two flavors of unions), and
-alternate types (a choice between other types). The 'command' and
-'event' expressions can refer to existing types by name, or list an
-anonymous type as a dictionary. Listing a type name inside an array
-refers to a single-dimension array of that type; multi-dimension
-arrays are not directly supported (although an array of a complex
-struct that contains an array member is possible).
-
-All names must begin with a letter, and contain only ASCII letters,
-digits, hyphen, and underscore. There are two exceptions: enum values
-may start with a digit, and names that are downstream extensions (see
-section Downstream extensions) start with underscore.
-
-Names beginning with 'q_' are reserved for the generator, which uses
-them for munging QMP names that resemble C keywords or other
-problematic strings. For example, a member named "default" in qapi
-becomes "q_default" in the generated C code.
-
-Types, commands, and events share a common namespace. Therefore,
-generally speaking, type definitions should always use CamelCase for
-user-defined type names, while built-in types are lowercase.
-
-Type names ending with 'Kind' or 'List' are reserved for the
-generator, which uses them for implicit union enums and array types,
-respectively.
-
-Command names, and member names within a type, should be all lower
-case with words separated by a hyphen. However, some existing older
-commands and complex types use underscore; when extending such
-expressions, consistency is preferred over blindly avoiding
-underscore.
-
-Event names should be ALL_CAPS with words separated by underscore.
-
-Member names starting with 'has-' or 'has_' are reserved for the
-generator, which uses them for tracking optional members.
-
-Any name (command, event, type, member, or enum value) beginning with
-"x-" is marked experimental, and may be withdrawn or changed
-incompatibly in a future release.
-
-Pragma 'name-case-whitelist' lets you violate the rules on use of
-upper and lower case. Use for new code is strongly discouraged.
-
-In the rest of this document, usage lines are given for each
-expression type, with literal strings written in lower case and
-placeholders written in capitals. If a literal string includes a
-prefix of '*', that key/value pair can be omitted from the expression.
-For example, a usage statement that includes '*base':STRUCT-NAME
-means that an expression has an optional key 'base', which if present
-must have a value that forms a struct name.
-
-
-=== Built-in Types ===
-
-The following types are predefined, and map to C as follows:
-
- Schema C JSON
- str char * any JSON string, UTF-8
- number double any JSON number
- int int64_t a JSON number without fractional part
- that fits into the C integer type
- int8 int8_t likewise
- int16 int16_t likewise
- int32 int32_t likewise
- int64 int64_t likewise
- uint8 uint8_t likewise
- uint16 uint16_t likewise
- uint32 uint32_t likewise
- uint64 uint64_t likewise
- size uint64_t like uint64_t, except StringInputVisitor
- accepts size suffixes
- bool bool JSON true or false
- any QObject * any JSON value
- QType QType JSON string matching enum QType values
-
-
-=== Include directives ===
-
-Usage: { 'include': STRING }
-
-The QAPI schema definitions can be modularized using the 'include' directive:
-
- { 'include': 'path/to/file.json' }
-
-The directive is evaluated recursively, and include paths are relative to the
-file using the directive. Multiple includes of the same file are
-idempotent. No other keys should appear in the expression, and the include
-value should be a string.
-
-As a matter of style, it is a good idea to have all files be
-self-contained, but at the moment, nothing prevents an included file
-from making a forward reference to a type that is only introduced by
-an outer file. The parser may be made stricter in the future to
-prevent incomplete include files.
-
-
-=== Pragma directives ===
-
-Usage: { 'pragma': DICT }
-
-The pragma directive lets you control optional generator behavior.
-The dictionary's entries are pragma names and values.
-
-Pragma's scope is currently the complete schema. Setting the same
-pragma to different values in parts of the schema doesn't work.
-
-Pragma 'doc-required' takes a boolean value. If true, documentation
-is required. Default is false.
-
-Pragma 'returns-whitelist' takes a list of command names that may
-violate the rules on permitted return types. Default is none.
-
-Pragma 'name-case-whitelist' takes a list of names that may violate
-rules on use of upper- vs. lower-case letters. Default is none.
-
-
-=== Struct types ===
-
-Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME }
-
-A struct is a dictionary containing a single 'data' key whose value is
-a dictionary; the dictionary may be empty. This corresponds to a
-struct in C or an Object in JSON. Each value of the 'data' dictionary
-must be the name of a type, or a one-element array containing a type
-name. An example of a struct is:
-
- { 'struct': 'MyType',
- 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } }
-
-The use of '*' as a prefix to the name means the member is optional in
-the corresponding JSON protocol usage.
-
-The default initialization value of an optional argument should not be changed
-between versions of QEMU unless the new default maintains backward
-compatibility to the user-visible behavior of the old default.
-
-With proper documentation, this policy still allows some flexibility; for
-example, documenting that a default of 0 picks an optimal buffer size allows
-one release to declare the optimal size at 512 while another release declares
-the optimal size at 4096 - the user-visible behavior is not the bytes used by
-the buffer, but the fact that the buffer was optimal size.
-
-On input structures (only mentioned in the 'data' side of a command), changing
-from mandatory to optional is safe (older clients will supply the option, and
-newer clients can benefit from the default); changing from optional to
-mandatory is backwards incompatible (older clients may be omitting the option,
-and must continue to work).
-
-On output structures (only mentioned in the 'returns' side of a command),
-changing from mandatory to optional is in general unsafe (older clients may be
-expecting the member, and could crash if it is missing), although it
-can be done if the only way that the optional argument will be omitted
-is when it is triggered by the presence of a new input flag to the
-command that older clients don't know to send. Changing from optional
-to mandatory is safe.
-
-A structure that is used in both input and output of various commands
-must consider the backwards compatibility constraints of both directions
-of use.
-
-A struct definition can specify another struct as its base.
-In this case, the members of the base type are included as top-level members
-of the new struct's dictionary in the Client JSON Protocol wire
-format. An example definition is:
-
- { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } }
- { 'struct': 'BlockdevOptionsGenericCOWFormat',
- 'base': 'BlockdevOptionsGenericFormat',
- 'data': { '*backing': 'str' } }
-
-An example BlockdevOptionsGenericCOWFormat object on the wire could use
-both members like this:
-
- { "file": "/some/place/my-image",
- "backing": "/some/place/my-backing-file" }
-
-
-=== Enumeration types ===
-
-Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING }
- { 'enum': STRING, '*prefix': STRING, 'data': ARRAY-OF-STRING }
-
-An enumeration type is a dictionary containing a single 'data' key
-whose value is a list of strings. An example enumeration is:
-
- { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] }
-
-Nothing prevents an empty enumeration, although it is probably not
-useful. The list of strings should be lower case; if an enum name
-represents multiple words, use '-' between words. The string 'max' is
-not allowed as an enum value, and values should not be repeated.
-
-The enum constants will be named by using a heuristic to turn the
-type name into a set of underscore separated words. For the example
-above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name
-of 'MY_ENUM_VALUE1' for the first value. If the default heuristic
-does not result in a desirable name, the optional 'prefix' member
-can be used when defining the enum.
-
-The enumeration values are passed as strings over the Client JSON
-Protocol, but are encoded as C enum integral values in generated code.
-While the C code starts numbering at 0, it is better to use explicit
-comparisons to enum values than implicit comparisons to 0; the C code
-will also include a generated enum member ending in _MAX for tracking
-the size of the enum, useful when using common functions for
-converting between strings and enum values. Since the wire format
-always passes by name, it is acceptable to reorder or add new
-enumeration members in any location without breaking clients of Client
-JSON Protocol; however, removing enum values would break
-compatibility. For any struct that has a member that will only contain
-a finite set of string values, using an enum type for that member is
-better than open-coding the member to be type 'str'.
-
-
-=== Union types ===
-
-Usage: { 'union': STRING, 'data': DICT }
-or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME-OR-DICT,
- 'discriminator': ENUM-MEMBER-OF-BASE }
-
-Union types are used to let the user choose between several different
-variants for an object. There are two flavors: simple (no
-discriminator or base), and flat (both discriminator and base). A union
-type is defined using a data dictionary as explained in the following
-paragraphs. The data dictionary for either type of union must not
-be empty.
-
-A simple union type defines a mapping from automatic discriminator
-values to data types like in this example:
-
- { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str' } }
- { 'struct': 'BlockdevOptionsQcow2',
- 'data': { 'backing': 'str', '*lazy-refcounts': 'bool' } }
-
- { 'union': 'BlockdevOptionsSimple',
- 'data': { 'file': 'BlockdevOptionsFile',
- 'qcow2': 'BlockdevOptionsQcow2' } }
-
-In the Client JSON Protocol, a simple union is represented by a
-dictionary that contains the 'type' member as a discriminator, and a
-'data' member that is of the specified data type corresponding to the
-discriminator value, as in these examples:
-
- { "type": "file", "data": { "filename": "/some/place/my-image" } }
- { "type": "qcow2", "data": { "backing": "/some/place/my-image",
- "lazy-refcounts": true } }
-
-The generated C code uses a struct containing a union. Additionally,
-an implicit C enum 'NameKind' is created, corresponding to the union
-'Name', for accessing the various branches of the union. No branch of
-the union can be named 'max', as this would collide with the implicit
-enum. The value for each branch can be of any type.
-
-A flat union definition avoids nesting on the wire, and specifies a
-set of common members that occur in all variants of the union. The
-'base' key must specify either a type name (the type must be a
-struct, not a union), or a dictionary representing an anonymous type.
-All branches of the union must be complex types, and the top-level
-members of the union dictionary on the wire will be a combination of
-members from both the base type and the appropriate branch type (when
-merging two dictionaries, there must be no keys in common). The
-'discriminator' member must be the name of a non-optional enum-typed
-member of the base struct.
-
-The following example enhances the above simple union example by
-adding an optional common member 'read-only', renaming the
-discriminator to something more applicable than the simple union's
-default of 'type', and reducing the number of {} required on the wire:
-
- { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] }
- { 'union': 'BlockdevOptions',
- 'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' },
- 'discriminator': 'driver',
- 'data': { 'file': 'BlockdevOptionsFile',
- 'qcow2': 'BlockdevOptionsQcow2' } }
-
-Resulting in these JSON objects:
-
- { "driver": "file", "read-only": true,
- "filename": "/some/place/my-image" }
- { "driver": "qcow2", "read-only": false,
- "backing": "/some/place/my-image", "lazy-refcounts": true }
-
-Notice that in a flat union, the discriminator name is controlled by
-the user, but because it must map to a base member with enum type, the
-code generator can ensure that branches exist for all values of the
-enum (although the order of the keys need not match the declaration of
-the enum). In the resulting generated C data types, a flat union is
-represented as a struct with the base members included directly, and
-then a union of structures for each branch of the union.
-
-A simple union can always be re-written as a flat union where the base
-class has a single member named 'type', and where each branch of the
-union has a struct with a single member named 'data'. That is,
-
- { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } }
-
-is identical on the wire to:
-
- { 'enum': 'Enum', 'data': ['one', 'two'] }
- { 'struct': 'Branch1', 'data': { 'data': 'str' } }
- { 'struct': 'Branch2', 'data': { 'data': 'int' } }
- { 'union': 'Flat', 'base': { 'type': 'Enum' }, 'discriminator': 'type',
- 'data': { 'one': 'Branch1', 'two': 'Branch2' } }
-
-
-=== Alternate types ===
-
-Usage: { 'alternate': STRING, 'data': DICT }
-
-An alternate type is one that allows a choice between two or more JSON
-data types (string, integer, number, or object, but currently not
-array) on the wire. The definition is similar to a simple union type,
-where each branch of the union names a QAPI type. For example:
-
- { 'alternate': 'BlockdevRef',
- 'data': { 'definition': 'BlockdevOptions',
- 'reference': 'str' } }
-
-Unlike a union, the discriminator string is never passed on the wire
-for the Client JSON Protocol. Instead, the value's JSON type serves
-as an implicit discriminator, which in turn means that an alternate
-can only express a choice between types represented differently in
-JSON. If a branch is typed as the 'bool' built-in, the alternate
-accepts true and false; if it is typed as any of the various numeric
-built-ins, it accepts a JSON number; if it is typed as a 'str'
-built-in or named enum type, it accepts a JSON string; and if it is
-typed as a complex type (struct or union), it accepts a JSON object.
-Two different complex types, for instance, aren't permitted, because
-both are represented as a JSON object.
-
-The example alternate declaration above allows using both of the
-following example objects:
-
- { "file": "my_existing_block_device_id" }
- { "file": { "driver": "file",
- "read-only": false,
- "filename": "/tmp/mydisk.qcow2" } }
-
-
-=== Commands ===
-
-Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT,
- '*returns': TYPE-NAME, '*boxed': true,
- '*gen': false, '*success-response': false }
-
-Commands are defined by using a dictionary containing several members,
-where three members are most common. The 'command' member is a
-mandatory string, and determines the "execute" value passed in a
-Client JSON Protocol command exchange.
-
-The 'data' argument maps to the "arguments" dictionary passed in as
-part of a Client JSON Protocol command. The 'data' member is optional
-and defaults to {} (an empty dictionary). If present, it must be the
-string name of a complex type, or a dictionary that declares an
-anonymous type with the same semantics as a 'struct' expression.
-
-The 'returns' member describes what will appear in the "return" member
-of a Client JSON Protocol reply on successful completion of a command.
-The member is optional from the command declaration; if absent, the
-"return" member will be an empty dictionary. If 'returns' is present,
-it must be the string name of a complex or built-in type, or a
-one-element array containing the name of a complex or built-in type.
-To return anything else, you have to list the command in pragma
-'returns-whitelist'. If you do this, the command cannot be extended
-to return additional information in the future. Use of
-'returns-whitelist' for new commands is strongly discouraged.
-
-All commands in Client JSON Protocol use a dictionary to report
-failure, with no way to specify that in QAPI. Where the error return
-is different than the usual GenericError class in order to help the
-client react differently to certain error conditions, it is worth
-documenting this in the comments before the command declaration.
-
-Some example commands:
-
- { 'command': 'my-first-command',
- 'data': { 'arg1': 'str', '*arg2': 'str' } }
- { 'struct': 'MyType', 'data': { '*value': 'str' } }
- { 'command': 'my-second-command',
- 'returns': [ 'MyType' ] }
-
-which would validate this Client JSON Protocol transaction:
-
- => { "execute": "my-first-command",
- "arguments": { "arg1": "hello" } }
- <= { "return": { } }
- => { "execute": "my-second-command" }
- <= { "return": [ { "value": "one" }, { } ] }
-
-The generator emits a prototype for the user's function implementing
-the command. Normally, 'data' is a dictionary for an anonymous type,
-or names a struct type (possibly empty, but not a union), and its
-members are passed as separate arguments to this function. If the
-command definition includes a key 'boxed' with the boolean value true,
-then 'data' is instead the name of any non-empty complex type
-(struct, union, or alternate), and a pointer to that QAPI type is
-passed as a single argument.
-
-The generator also emits a marshalling function that extracts
-arguments for the user's function out of an input QDict, calls the
-user's function, and if it succeeded, builds an output QObject from
-its return value.
-
-In rare cases, QAPI cannot express a type-safe representation of a
-corresponding Client JSON Protocol command. You then have to suppress
-generation of a marshalling function by including a key 'gen' with
-boolean value false, and instead write your own function. Please try
-to avoid adding new commands that rely on this, and instead use
-type-safe unions. For an example of this usage:
-
- { 'command': 'netdev_add',
- 'data': {'type': 'str', 'id': 'str'},
- 'gen': false }
-
-Normally, the QAPI schema is used to describe synchronous exchanges,
-where a response is expected. But in some cases, the action of a
-command is expected to change state in a way that a successful
-response is not possible (although the command will still return a
-normal dictionary error on failure). When a successful reply is not
-possible, the command expression should include the optional key
-'success-response' with boolean value false. So far, only QGA makes
-use of this member.
-
-
-=== Events ===
-
-Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT,
- '*boxed': true }
-
-Events are defined with the keyword 'event'. It is not allowed to
-name an event 'MAX', since the generator also produces a C enumeration
-of all event names with a generated _MAX value at the end. When
-'data' is also specified, additional info will be included in the
-event, with similar semantics to a 'struct' expression. Finally, a
-C API will be generated in qapi-event.h; when called by QEMU code, a
-message with a timestamp will be emitted on the wire.
-
-An example event is:
-
-{ 'event': 'EVENT_C',
- 'data': { '*a': 'int', 'b': 'str' } }
-
-Resulting in this JSON object:
-
-{ "event": "EVENT_C",
- "data": { "b": "test string" },
- "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }
-
-The generator emits a function to send the event. Normally, 'data' is
-a dictionary for an anonymous type, or names a struct type (possibly
-empty, but not a union), and its members are passed as separate
-arguments to this function. If the event definition includes a key
-'boxed' with the boolean value true, then 'data' is instead the name of
-any non-empty complex type (struct, union, or alternate), and a
-pointer to that QAPI type is passed as a single argument.
-
-
-=== Downstream extensions ===
-
-QAPI schema names that are externally visible, say in the Client JSON
-Protocol, need to be managed with care. Names starting with a
-downstream prefix of the form __RFQDN_ are reserved for the downstream
-who controls the valid, reverse fully qualified domain name RFQDN.
-RFQDN may only contain ASCII letters, digits, hyphen and period.
-
-Example: Red Hat, Inc. controls redhat.com, and may therefore add a
-downstream command __com.redhat_drive-mirror.
-
-
-== Client JSON Protocol introspection ==
-
-Clients of a Client JSON Protocol commonly need to figure out what
-exactly the server (QEMU) supports.
-
-For this purpose, QMP provides introspection via command
-query-qmp-schema. QGA currently doesn't support introspection.
-
-While Client JSON Protocol wire compatibility should be maintained
-between qemu versions, we cannot make the same guarantees for
-introspection stability. For example, one version of qemu may provide
-a non-variant optional member of a struct, and a later version rework
-the member to instead be non-optional and associated with a variant.
-Likewise, one version of qemu may list a member with open-ended type
-'str', and a later version could convert it to a finite set of strings
-via an enum type; or a member may be converted from a specific type to
-an alternate that represents a choice between the original type and
-something else.
-
-query-qmp-schema returns a JSON array of SchemaInfo objects. These
-objects together describe the wire ABI, as defined in the QAPI schema.
-There is no specified order to the SchemaInfo objects returned; a
-client must search for a particular name throughout the entire array
-to learn more about that name, but is at least guaranteed that there
-will be no collisions between type, command, and event names.
-
-However, the SchemaInfo can't reflect all the rules and restrictions
-that apply to QMP. It's interface introspection (figuring out what's
-there), not interface specification. The specification is in the QAPI
-schema. To understand how QMP is to be used, you need to study the
-QAPI schema.
-
-Like any other command, query-qmp-schema is itself defined in the QAPI
-schema, along with the SchemaInfo type. This text attempts to give an
-overview of how things work. For details you need to consult the QAPI
-schema.
-
-SchemaInfo objects have common members "name" and "meta-type", and
-additional variant members depending on the value of meta-type.
-
-Each SchemaInfo object describes a wire ABI entity of a certain
-meta-type: a command, event or one of several kinds of type.
-
-SchemaInfo for commands and events have the same name as in the QAPI
-schema.
-
-Command and event names are part of the wire ABI, but type names are
-not. Therefore, the SchemaInfo for types have auto-generated
-meaningless names. For readability, the examples in this section use
-meaningful type names instead.
-
-To examine a type, start with a command or event using it, then follow
-references by name.
-
-QAPI schema definitions not reachable that way are omitted.
-
-The SchemaInfo for a command has meta-type "command", and variant
-members "arg-type" and "ret-type". On the wire, the "arguments"
-member of a client's "execute" command must conform to the object type
-named by "arg-type". The "return" member that the server passes in a
-success response conforms to the type named by "ret-type".
-
-If the command takes no arguments, "arg-type" names an object type
-without members. Likewise, if the command returns nothing, "ret-type"
-names an object type without members.
-
-Example: the SchemaInfo for command query-qmp-schema
-
- { "name": "query-qmp-schema", "meta-type": "command",
- "arg-type": "q_empty", "ret-type": "SchemaInfoList" }
-
- Type "q_empty" is an automatic object type without members, and type
- "SchemaInfoList" is the array of SchemaInfo type.
-
-The SchemaInfo for an event has meta-type "event", and variant member
-"arg-type". On the wire, a "data" member that the server passes in an
-event conforms to the object type named by "arg-type".
-
-If the event carries no additional information, "arg-type" names an
-object type without members. The event may not have a data member on
-the wire then.
-
-Each command or event defined with dictionary-valued 'data' in the
-QAPI schema implicitly defines an object type.
-
-Example: the SchemaInfo for EVENT_C from section Events
-
- { "name": "EVENT_C", "meta-type": "event",
- "arg-type": "q_obj-EVENT_C-arg" }
-
- Type "q_obj-EVENT_C-arg" is an implicitly defined object type with
- the two members from the event's definition.
-
-The SchemaInfo for struct and union types has meta-type "object".
-
-The SchemaInfo for a struct type has variant member "members".
-
-The SchemaInfo for a union type additionally has variant members "tag"
-and "variants".
-
-"members" is a JSON array describing the object's common members, if
-any. Each element is a JSON object with members "name" (the member's
-name), "type" (the name of its type), and optionally "default". The
-member is optional if "default" is present. Currently, "default" can
-only have value null. Other values are reserved for future
-extensions. The "members" array is in no particular order; clients
-must search the entire object when learning whether a particular
-member is supported.
-
-Example: the SchemaInfo for MyType from section Struct types
-
- { "name": "MyType", "meta-type": "object",
- "members": [
- { "name": "member1", "type": "str" },
- { "name": "member2", "type": "int" },
- { "name": "member3", "type": "str", "default": null } ] }
-
-"tag" is the name of the common member serving as type tag.
-"variants" is a JSON array describing the object's variant members.
-Each element is a JSON object with members "case" (the value of type
-tag this element applies to) and "type" (the name of an object type
-that provides the variant members for this type tag value). The
-"variants" array is in no particular order, and is not guaranteed to
-list cases in the same order as the corresponding "tag" enum type.
-
-Example: the SchemaInfo for flat union BlockdevOptions from section
-Union types
-
- { "name": "BlockdevOptions", "meta-type": "object",
- "members": [
- { "name": "driver", "type": "BlockdevDriver" },
- { "name": "read-only", "type": "bool", "default": null } ],
- "tag": "driver",
- "variants": [
- { "case": "file", "type": "BlockdevOptionsFile" },
- { "case": "qcow2", "type": "BlockdevOptionsQcow2" } ] }
-
-Note that base types are "flattened": their members are included in the
-"members" array.
-
-A simple union implicitly defines an enumeration type for its implicit
-discriminator (called "type" on the wire, see section Union types).
-
-A simple union implicitly defines an object type for each of its
-variants.
-
-Example: the SchemaInfo for simple union BlockdevOptionsSimple from section
-Union types
-
- { "name": "BlockdevOptionsSimple", "meta-type": "object",
- "members": [
- { "name": "type", "type": "BlockdevOptionsSimpleKind" } ],
- "tag": "type",
- "variants": [
- { "case": "file", "type": "q_obj-BlockdevOptionsFile-wrapper" },
- { "case": "qcow2", "type": "q_obj-BlockdevOptionsQcow2-wrapper" } ] }
-
- Enumeration type "BlockdevOptionsSimpleKind" and the object types
- "q_obj-BlockdevOptionsFile-wrapper", "q_obj-BlockdevOptionsQcow2-wrapper"
- are implicitly defined.
-
-The SchemaInfo for an alternate type has meta-type "alternate", and
-variant member "members". "members" is a JSON array. Each element is
-a JSON object with member "type", which names a type. Values of the
-alternate type conform to exactly one of its member types. There is
-no guarantee on the order in which "members" will be listed.
-
-Example: the SchemaInfo for BlockdevRef from section Alternate types
-
- { "name": "BlockdevRef", "meta-type": "alternate",
- "members": [
- { "type": "BlockdevOptions" },
- { "type": "str" } ] }
-
-The SchemaInfo for an array type has meta-type "array", and variant
-member "element-type", which names the array's element type. Array
-types are implicitly defined. For convenience, the array's name may
-resemble the element type; however, clients should examine member
-"element-type" instead of making assumptions based on parsing member
-"name".
-
-Example: the SchemaInfo for ['str']
-
- { "name": "[str]", "meta-type": "array",
- "element-type": "str" }
-
-The SchemaInfo for an enumeration type has meta-type "enum" and
-variant member "values". The values are listed in no particular
-order; clients must search the entire enum when learning whether a
-particular value is supported.
-
-Example: the SchemaInfo for MyEnum from section Enumeration types
-
- { "name": "MyEnum", "meta-type": "enum",
- "values": [ "value1", "value2", "value3" ] }
-
-The SchemaInfo for a built-in type has the same name as the type in
-the QAPI schema (see section Built-in Types), with one exception
-detailed below. It has variant member "json-type" that shows how
-values of this type are encoded on the wire.
-
-Example: the SchemaInfo for str
-
- { "name": "str", "meta-type": "builtin", "json-type": "string" }
-
-The QAPI schema supports a number of integer types that only differ in
-how they map to C. They are identical as far as SchemaInfo is
-concerned. Therefore, they get all mapped to a single type "int" in
-SchemaInfo.
-
-As explained above, type names are not part of the wire ABI. Not even
-the names of built-in types. Clients should examine member
-"json-type" instead of hard-coding names of built-in types.
-
-
-== Code generation ==
-
-Schemas are fed into five scripts to generate all the code/files that,
-paired with the core QAPI libraries, comprise everything required to
-take JSON commands read in by a Client JSON Protocol server, unmarshal
-the arguments into the underlying C types, call into the corresponding
-C function, map the response back to a Client JSON Protocol response
-to be returned to the user, and introspect the commands.
-
-As an example, we'll use the following schema, which describes a
-single complex user-defined type, along with a command which takes a
-list of that type as a parameter, and returns a single element of that
-type. The user is responsible for writing the implementation of
-qmp_my_command(); everything else is produced by the generator.
-
- $ cat example-schema.json
- { 'struct': 'UserDefOne',
- 'data': { 'integer': 'int', '*string': 'str' } }
-
- { 'command': 'my-command',
- 'data': { 'arg1': ['UserDefOne'] },
- 'returns': 'UserDefOne' }
-
- { 'event': 'MY_EVENT' }
-
-For a more thorough look at generated code, the testsuite includes
-tests/qapi-schema/qapi-schema-tests.json that covers more examples of
-what the generator will accept, and compiles the resulting C code as
-part of 'make check-unit'.
-
-=== scripts/qapi-types.py ===
-
-Used to generate the C types defined by a schema, along with
-supporting code. The following files are created:
-
-$(prefix)qapi-types.h - C types corresponding to types defined in
- the schema you pass in
-$(prefix)qapi-types.c - Cleanup functions for the above C types
-
-The $(prefix) is an optional parameter used as a namespace to keep the
-generated code from one schema/code-generation separated from others so code
-can be generated/used from multiple schemas without clobbering previously
-created code.
-
-Example:
-
- $ python scripts/qapi-types.py --output-dir="qapi-generated" \
- --prefix="example-" example-schema.json
- $ cat qapi-generated/example-qapi-types.h
-[Uninteresting stuff omitted...]
-
- #ifndef EXAMPLE_QAPI_TYPES_H
- #define EXAMPLE_QAPI_TYPES_H
-
-[Built-in types omitted...]
-
- typedef struct UserDefOne UserDefOne;
-
- typedef struct UserDefOneList UserDefOneList;
-
- struct UserDefOne {
- int64_t integer;
- bool has_string;
- char *string;
- };
-
- void qapi_free_UserDefOne(UserDefOne *obj);
-
- struct UserDefOneList {
- UserDefOneList *next;
- UserDefOne *value;
- };
-
- void qapi_free_UserDefOneList(UserDefOneList *obj);
-
- #endif
- $ cat qapi-generated/example-qapi-types.c
-[Uninteresting stuff omitted...]
-
- void qapi_free_UserDefOne(UserDefOne *obj)
- {
- Visitor *v;
-
- if (!obj) {
- return;
- }
-
- v = qapi_dealloc_visitor_new();
- visit_type_UserDefOne(v, NULL, &obj, NULL);
- visit_free(v);
- }
-
- void qapi_free_UserDefOneList(UserDefOneList *obj)
- {
- Visitor *v;
-
- if (!obj) {
- return;
- }
-
- v = qapi_dealloc_visitor_new();
- visit_type_UserDefOneList(v, NULL, &obj, NULL);
- visit_free(v);
- }
-
-=== scripts/qapi-visit.py ===
-
-Used to generate the visitor functions used to walk through and
-convert between a native QAPI C data structure and some other format
-(such as QObject); the generated functions are named visit_type_FOO()
-and visit_type_FOO_members().
-
-The following files are generated:
-
-$(prefix)qapi-visit.c: visitor function for a particular C type, used
- to automagically convert QObjects into the
- corresponding C type and vice-versa, as well
- as for deallocating memory for an existing C
- type
-
-$(prefix)qapi-visit.h: declarations for previously mentioned visitor
- functions
-
-Example:
-
- $ python scripts/qapi-visit.py --output-dir="qapi-generated" \
- --prefix="example-" example-schema.json
- $ cat qapi-generated/example-qapi-visit.h
-[Uninteresting stuff omitted...]
-
- #ifndef EXAMPLE_QAPI_VISIT_H
- #define EXAMPLE_QAPI_VISIT_H
-
-[Visitors for built-in types omitted...]
-
- void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp);
- void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp);
- void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp);
-
- #endif
- $ cat qapi-generated/example-qapi-visit.c
-[Uninteresting stuff omitted...]
-
- void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp)
- {
- Error *err = NULL;
-
- visit_type_int(v, "integer", &obj->integer, &err);
- if (err) {
- goto out;
- }
- if (visit_optional(v, "string", &obj->has_string)) {
- visit_type_str(v, "string", &obj->string, &err);
- if (err) {
- goto out;
- }
- }
-
- out:
- error_propagate(errp, err);
- }
-
- void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp)
- {
- Error *err = NULL;
-
- visit_start_struct(v, name, (void **)obj, sizeof(UserDefOne), &err);
- if (err) {
- goto out;
- }
- if (!*obj) {
- goto out_obj;
- }
- visit_type_UserDefOne_members(v, *obj, &err);
- if (err) {
- goto out_obj;
- }
- visit_check_struct(v, &err);
- out_obj:
- visit_end_struct(v, (void **)obj);
- if (err && visit_is_input(v)) {
- qapi_free_UserDefOne(*obj);
- *obj = NULL;
- }
- out:
- error_propagate(errp, err);
- }
-
- void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp)
- {
- Error *err = NULL;
- UserDefOneList *tail;
- size_t size = sizeof(**obj);
-
- visit_start_list(v, name, (GenericList **)obj, size, &err);
- if (err) {
- goto out;
- }
-
- for (tail = *obj; tail;
- tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) {
- visit_type_UserDefOne(v, NULL, &tail->value, &err);
- if (err) {
- break;
- }
- }
-
- visit_end_list(v, (void **)obj);
- if (err && visit_is_input(v)) {
- qapi_free_UserDefOneList(*obj);
- *obj = NULL;
- }
- out:
- error_propagate(errp, err);
- }
-
-=== scripts/qapi-commands.py ===
-
-Used to generate the marshaling/dispatch functions for the commands
-defined in the schema. The generated code implements
-qmp_marshal_COMMAND() (registered automatically), and declares
-qmp_COMMAND() that the user must implement. The following files are
-generated:
-
-$(prefix)qmp-marshal.c: command marshal/dispatch functions for each
- QMP command defined in the schema. Functions
- generated by qapi-visit.py are used to
- convert QObjects received from the wire into
- function parameters, and uses the same
- visitor functions to convert native C return
- values to QObjects for transmission back
- over the wire.
-
-$(prefix)qmp-commands.h: Function prototypes for the QMP commands
- specified in the schema.
-
-Example:
-
- $ python scripts/qapi-commands.py --output-dir="qapi-generated" \
- --prefix="example-" example-schema.json
- $ cat qapi-generated/example-qmp-commands.h
-[Uninteresting stuff omitted...]
-
- #ifndef EXAMPLE_QMP_COMMANDS_H
- #define EXAMPLE_QMP_COMMANDS_H
-
- #include "example-qapi-types.h"
- #include "qapi/qmp/qdict.h"
- #include "qapi/error.h"
-
- UserDefOne *qmp_my_command(UserDefOneList *arg1, Error **errp);
-
- #endif
- $ cat qapi-generated/example-qmp-marshal.c
-[Uninteresting stuff omitted...]
-
- static void qmp_marshal_output_UserDefOne(UserDefOne *ret_in, QObject **ret_out, Error **errp)
- {
- Error *err = NULL;
- Visitor *v;
-
- v = qobject_output_visitor_new(ret_out);
- visit_type_UserDefOne(v, "unused", &ret_in, &err);
- if (!err) {
- visit_complete(v, ret_out);
- }
- error_propagate(errp, err);
- visit_free(v);
- v = qapi_dealloc_visitor_new();
- visit_type_UserDefOne(v, "unused", &ret_in, NULL);
- visit_free(v);
- }
-
- static void qmp_marshal_my_command(QDict *args, QObject **ret, Error **errp)
- {
- Error *err = NULL;
- UserDefOne *retval;
- Visitor *v;
- UserDefOneList *arg1 = NULL;
-
- v = qobject_input_visitor_new(QOBJECT(args));
- visit_start_struct(v, NULL, NULL, 0, &err);
- if (err) {
- goto out;
- }
- visit_type_UserDefOneList(v, "arg1", &arg1, &err);
- if (!err) {
- visit_check_struct(v, &err);
- }
- visit_end_struct(v, NULL);
- if (err) {
- goto out;
- }
-
- retval = qmp_my_command(arg1, &err);
- if (err) {
- goto out;
- }
-
- qmp_marshal_output_UserDefOne(retval, ret, &err);
-
- out:
- error_propagate(errp, err);
- visit_free(v);
- v = qapi_dealloc_visitor_new();
- visit_start_struct(v, NULL, NULL, 0, NULL);
- visit_type_UserDefOneList(v, "arg1", &arg1, NULL);
- visit_end_struct(v, NULL);
- visit_free(v);
- }
-
- static void qmp_init_marshal(void)
- {
- qmp_register_command("my-command", qmp_marshal_my_command, QCO_NO_OPTIONS);
- }
-
- qapi_init(qmp_init_marshal);
-
-=== scripts/qapi-event.py ===
-
-Used to generate the event-related C code defined by a schema, with
-implementations for qapi_event_send_FOO(). The following files are
-created:
-
-$(prefix)qapi-event.h - Function prototypes for each event type, plus an
- enumeration of all event names
-$(prefix)qapi-event.c - Implementation of functions to send an event
-
-Example:
-
- $ python scripts/qapi-event.py --output-dir="qapi-generated" \
- --prefix="example-" example-schema.json
- $ cat qapi-generated/example-qapi-event.h
-[Uninteresting stuff omitted...]
-
- #ifndef EXAMPLE_QAPI_EVENT_H
- #define EXAMPLE_QAPI_EVENT_H
-
- #include "qapi/error.h"
- #include "qapi/qmp/qdict.h"
- #include "example-qapi-types.h"
-
-
- void qapi_event_send_my_event(Error **errp);
-
- typedef enum example_QAPIEvent {
- EXAMPLE_QAPI_EVENT_MY_EVENT = 0,
- EXAMPLE_QAPI_EVENT__MAX = 1,
- } example_QAPIEvent;
-
- extern const char *const example_QAPIEvent_lookup[];
-
- #endif
- $ cat qapi-generated/example-qapi-event.c
-[Uninteresting stuff omitted...]
-
- void qapi_event_send_my_event(Error **errp)
- {
- QDict *qmp;
- Error *err = NULL;
- QMPEventFuncEmit emit;
- emit = qmp_event_get_func_emit();
- if (!emit) {
- return;
- }
-
- qmp = qmp_event_build_dict("MY_EVENT");
-
- emit(EXAMPLE_QAPI_EVENT_MY_EVENT, qmp, &err);
-
- error_propagate(errp, err);
- QDECREF(qmp);
- }
-
- const char *const example_QAPIEvent_lookup[] = {
- [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT",
- [EXAMPLE_QAPI_EVENT__MAX] = NULL,
- };
-
-=== scripts/qapi-introspect.py ===
-
-Used to generate the introspection C code for a schema. The following
-files are created:
-
-$(prefix)qmp-introspect.c - Defines a string holding a JSON
- description of the schema.
-$(prefix)qmp-introspect.h - Declares the above string.
-
-Example:
-
- $ python scripts/qapi-introspect.py --output-dir="qapi-generated" \
- --prefix="example-" example-schema.json
- $ cat qapi-generated/example-qmp-introspect.h
-[Uninteresting stuff omitted...]
-
- #ifndef EXAMPLE_QMP_INTROSPECT_H
- #define EXAMPLE_QMP_INTROSPECT_H
-
- extern const char example_qmp_schema_json[];
-
- #endif
- $ cat qapi-generated/example-qmp-introspect.c
-[Uninteresting stuff omitted...]
-
- const char example_qmp_schema_json[] = "["
- "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, "
- "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, "
- "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, "
- "{\"members\": [{\"name\": \"arg1\", \"type\": \"[2]\"}], \"meta-type\": \"object\", \"name\": \"1\"}, "
- "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"default\": null, \"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, "
- "{\"element-type\": \"2\", \"meta-type\": \"array\", \"name\": \"[2]\"}, "
- "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, "
- "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]";
+++ /dev/null
-Using RCU (Read-Copy-Update) for synchronization
-================================================
-
-Read-copy update (RCU) is a synchronization mechanism that is used to
-protect read-mostly data structures. RCU is very efficient and scalable
-on the read side (it is wait-free), and thus can make the read paths
-extremely fast.
-
-RCU supports concurrency between a single writer and multiple readers,
-thus it is not used alone. Typically, the write-side will use a lock to
-serialize multiple updates, but other approaches are possible (e.g.,
-restricting updates to a single task). In QEMU, when a lock is used,
-this will often be the "iothread mutex", also known as the "big QEMU
-lock" (BQL). Also, restricting updates to a single task is done in
-QEMU using the "bottom half" API.
-
-RCU is fundamentally a "wait-to-finish" mechanism. The read side marks
-sections of code with "critical sections", and the update side will wait
-for the execution of all *currently running* critical sections before
-proceeding, or before asynchronously executing a callback.
-
-The key point here is that only the currently running critical sections
-are waited for; critical sections that are started _after_ the beginning
-of the wait do not extend the wait, despite running concurrently with
-the updater. This is the reason why RCU is more scalable than,
-for example, reader-writer locks. It is so much more scalable that
-the system will have a single instance of the RCU mechanism; a single
-mechanism can be used for an arbitrary number of "things", without
-having to worry about things such as contention or deadlocks.
-
-How is this possible? The basic idea is to split updates in two phases,
-"removal" and "reclamation". During removal, we ensure that subsequent
-readers will not be able to get a reference to the old data. After
-removal has completed, a critical section will not be able to access
-the old data. Therefore, critical sections that begin after removal
-do not matter; as soon as all previous critical sections have finished,
-there cannot be any readers who hold references to the data structure,
-and these can now be safely reclaimed (e.g., freed or unref'ed).
-
-Here is a picture:
-
- thread 1 thread 2 thread 3
- ------------------- ------------------------ -------------------
- enter RCU crit.sec.
- | finish removal phase
- | begin wait
- | | enter RCU crit.sec.
- exit RCU crit.sec | |
- complete wait |
- begin reclamation phase |
- exit RCU crit.sec.
-
-
-Note how thread 3 is still executing its critical section when thread 2
-starts reclaiming data. This is possible, because the old version of the
-data structure was not accessible at the time thread 3 began executing
-that critical section.
-
-
-RCU API
-=======
-
-The core RCU API is small:
-
- void rcu_read_lock(void);
-
- Used by a reader to inform the reclaimer that the reader is
- entering an RCU read-side critical section.
-
- void rcu_read_unlock(void);
-
- Used by a reader to inform the reclaimer that the reader is
- exiting an RCU read-side critical section. Note that RCU
- read-side critical sections may be nested and/or overlapping.
-
- void synchronize_rcu(void);
-
- Blocks until all pre-existing RCU read-side critical sections
- on all threads have completed. This marks the end of the removal
- phase and the beginning of reclamation phase.
-
- Note that it would be valid for another update to come while
- synchronize_rcu is running. Because of this, it is better that
- the updater releases any locks it may hold before calling
- synchronize_rcu. If this is not possible (for example, because
- the updater is protected by the BQL), you can use call_rcu.
-
- void call_rcu1(struct rcu_head * head,
- void (*func)(struct rcu_head *head));
-
- This function invokes func(head) after all pre-existing RCU
- read-side critical sections on all threads have completed. This
- marks the end of the removal phase, with func taking care
- asynchronously of the reclamation phase.
-
- The foo struct needs to have an rcu_head structure added,
- perhaps as follows:
-
- struct foo {
- struct rcu_head rcu;
- int a;
- char b;
- long c;
- };
-
- so that the reclaimer function can fetch the struct foo address
- and free it:
-
- call_rcu1(&foo.rcu, foo_reclaim);
-
- void foo_reclaim(struct rcu_head *rp)
- {
- struct foo *fp = container_of(rp, struct foo, rcu);
- g_free(fp);
- }
-
- For the common case where the rcu_head member is the first of the
- struct, you can use the following macros.
-
- void call_rcu(T *p,
- void (*func)(T *p),
- field-name);
- void g_free_rcu(T *p,
- field-name);
-
- call_rcu1 is typically used through these macros, in the common case
- where the "struct rcu_head" is the first field in the struct. If
- the callback function is g_free, in particular, g_free_rcu can be
- used. In the above case, one could have written simply:
-
- g_free_rcu(&foo, rcu);
-
- typeof(*p) atomic_rcu_read(p);
-
- atomic_rcu_read() is similar to atomic_mb_read(), but it makes
- some assumptions on the code that calls it. This allows a more
- optimized implementation.
-
- atomic_rcu_read assumes that whenever a single RCU critical
- section reads multiple shared data, these reads are either
- data-dependent or need no ordering. This is almost always the
- case when using RCU, because read-side critical sections typically
- navigate one or more pointers (the pointers that are changed on
- every update) until reaching a data structure of interest,
- and then read from there.
-
- RCU read-side critical sections must use atomic_rcu_read() to
- read data, unless concurrent writes are prevented by another
- synchronization mechanism.
-
- Furthermore, RCU read-side critical sections should traverse the
- data structure in a single direction, opposite to the direction
- in which the updater initializes it.
-
- void atomic_rcu_set(p, typeof(*p) v);
-
- atomic_rcu_set() is also similar to atomic_mb_set(), and it also
- makes assumptions on the code that calls it in order to allow a more
- optimized implementation.
-
- In particular, atomic_rcu_set() suffices for synchronization
- with readers, if the updater never mutates a field within a
- data item that is already accessible to readers. This is the
- case when initializing a new copy of the RCU-protected data
- structure; just ensure that initialization of *p is carried out
- before atomic_rcu_set() makes the data item visible to readers.
- If this rule is observed, writes will happen in the opposite
- order as reads in the RCU read-side critical sections (or if
- there is just one update), and there will be no need for other
- synchronization mechanism to coordinate the accesses.
-
-The following APIs must be used before RCU is used in a thread:
-
- void rcu_register_thread(void);
-
- Mark a thread as taking part in the RCU mechanism. Such a thread
- will have to report quiescent points regularly, either manually
- or through the QemuCond/QemuSemaphore/QemuEvent APIs.
-
- void rcu_unregister_thread(void);
-
- Mark a thread as not taking part anymore in the RCU mechanism.
- It is not a problem if such a thread reports quiescent points,
- either manually or by using the QemuCond/QemuSemaphore/QemuEvent
- APIs.
-
-Note that these APIs are relatively heavyweight, and should _not_ be
-nested.
-
-
-DIFFERENCES WITH LINUX
-======================
-
-- Waiting on a mutex is possible, though discouraged, within an RCU critical
- section. This is because spinlocks are rarely (if ever) used in userspace
- programming; not allowing this would prevent upgrading an RCU read-side
- critical section to become an updater.
-
-- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and
- rcu_assign_pointer. They take a _pointer_ to the variable being accessed.
-
-- call_rcu is a macro that has an extra argument (the name of the first
- field in the struct, which must be a struct rcu_head), and expects the
- type of the callback's argument to be the type of the first argument.
- call_rcu1 is the same as Linux's call_rcu.
-
-
-RCU PATTERNS
-============
-
-Many patterns using read-writer locks translate directly to RCU, with
-the advantages of higher scalability and deadlock immunity.
-
-In general, RCU can be used whenever it is possible to create a new
-"version" of a data structure every time the updater runs. This may
-sound like a very strict restriction, however:
-
-- the updater does not mean "everything that writes to a data structure",
- but rather "everything that involves a reclamation step". See the
- array example below
-
-- in some cases, creating a new version of a data structure may actually
- be very cheap. For example, modifying the "next" pointer of a singly
- linked list is effectively creating a new version of the list.
-
-Here are some frequently-used RCU idioms that are worth noting.
-
-
-RCU list processing
--------------------
-
-TBD (not yet used in QEMU)
-
-
-RCU reference counting
-----------------------
-
-Because grace periods are not allowed to complete while there is an RCU
-read-side critical section in progress, the RCU read-side primitives
-may be used as a restricted reference-counting mechanism. For example,
-consider the following code fragment:
-
- rcu_read_lock();
- p = atomic_rcu_read(&foo);
- /* do something with p. */
- rcu_read_unlock();
-
-The RCU read-side critical section ensures that the value of "p" remains
-valid until after the rcu_read_unlock(). In some sense, it is acquiring
-a reference to p that is later released when the critical section ends.
-The write side looks simply like this (with appropriate locking):
-
- qemu_mutex_lock(&foo_mutex);
- old = foo;
- atomic_rcu_set(&foo, new);
- qemu_mutex_unlock(&foo_mutex);
- synchronize_rcu();
- free(old);
-
-If the processing cannot be done purely within the critical section, it
-is possible to combine this idiom with a "real" reference count:
-
- rcu_read_lock();
- p = atomic_rcu_read(&foo);
- foo_ref(p);
- rcu_read_unlock();
- /* do something with p. */
- foo_unref(p);
-
-The write side can be like this:
-
- qemu_mutex_lock(&foo_mutex);
- old = foo;
- atomic_rcu_set(&foo, new);
- qemu_mutex_unlock(&foo_mutex);
- synchronize_rcu();
- foo_unref(old);
-
-or with call_rcu:
-
- qemu_mutex_lock(&foo_mutex);
- old = foo;
- atomic_rcu_set(&foo, new);
- qemu_mutex_unlock(&foo_mutex);
- call_rcu(foo_unref, old, rcu);
-
-In both cases, the write side only performs removal. Reclamation
-happens when the last reference to a "foo" object is dropped.
-Using synchronize_rcu() is undesirably expensive, because the
-last reference may be dropped on the read side. Hence you can
-use call_rcu() instead:
-
- foo_unref(struct foo *p) {
- if (atomic_fetch_dec(&p->refcount) == 1) {
- call_rcu(foo_destroy, p, rcu);
- }
- }
-
-
-Note that the same idioms would be possible with reader/writer
-locks:
-
- read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock);
- p = foo; p = foo;
- /* do something with p. */ foo = new;
- read_unlock(&foo_rwlock); free(p);
- write_mutex_unlock(&foo_rwlock);
- free(p);
-
- ------------------------------------------------------------------
-
- read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock);
- p = foo; old = foo;
- foo_ref(p); foo = new;
- read_unlock(&foo_rwlock); foo_unref(old);
- /* do something with p. */ write_mutex_unlock(&foo_rwlock);
- read_lock(&foo_rwlock);
- foo_unref(p);
- read_unlock(&foo_rwlock);
-
-foo_unref could use a mechanism such as bottom halves to move deallocation
-out of the write-side critical section.
-
-
-RCU resizable arrays
---------------------
-
-Resizable arrays can be used with RCU. The expensive RCU synchronization
-(or call_rcu) only needs to take place when the array is resized.
-The two items to take care of are:
-
-- ensuring that the old version of the array is available between removal
- and reclamation;
-
-- avoiding mismatches in the read side between the array data and the
- array size.
-
-The first problem is avoided simply by not using realloc. Instead,
-each resize will allocate a new array and copy the old data into it.
-The second problem would arise if the size and the data pointers were
-two members of a larger struct:
-
- struct mystuff {
- ...
- int data_size;
- int data_alloc;
- T *data;
- ...
- };
-
-Instead, we store the size of the array with the array itself:
-
- struct arr {
- int size;
- int alloc;
- T data[];
- };
- struct arr *global_array;
-
- read side:
- rcu_read_lock();
- struct arr *array = atomic_rcu_read(&global_array);
- x = i < array->size ? array->data[i] : -1;
- rcu_read_unlock();
- return x;
-
- write side (running under a lock):
- if (global_array->size == global_array->alloc) {
- /* Creating a new version. */
- new_array = g_malloc(sizeof(struct arr) +
- global_array->alloc * 2 * sizeof(T));
- new_array->size = global_array->size;
- new_array->alloc = global_array->alloc * 2;
- memcpy(new_array->data, global_array->data,
- global_array->alloc * sizeof(T));
-
- /* Removal phase. */
- old_array = global_array;
- atomic_rcu_set(&new_array->data, new_array);
- synchronize_rcu();
-
- /* Reclamation phase. */
- free(old_array);
- }
-
-
-SOURCES
-=======
-
-* Documentation/RCU/ from the Linux kernel
--- /dev/null
+/*
+ * This model describes the interaction between ctx->notify_me
+ * and aio_notify().
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain. If you really want a license,
+ * the WTFPL will do.
+ *
+ * To simulate it:
+ * spin -p docs/aio_notify.promela
+ *
+ * To verify it:
+ * spin -a docs/aio_notify.promela
+ * gcc -O2 pan.c
+ * ./a.out -a
+ *
+ * To verify it (with a bug planted in the model):
+ * spin -a -DBUG docs/aio_notify.promela
+ * gcc -O2 pan.c
+ * ./a.out -a
+ */
+
+// Number of requests the notifier generates; each request is one bit.
+#define MAX 4
+// Bit of the last request: the notifier only iterates while next <= LAST.
+#define LAST (1 << (MAX - 1))
+// Value of "done" once every request bit (1 .. LAST) has been collected.
+#define FINAL ((LAST << 1) - 1)
+
+// Incremented/decremented by the waiter around its wait; tested by the
+// notifier to decide whether to set "event".
+bool notify_me;
+// Set by the notifier, cleared by the waiter when it wakes up.
+bool event;
+
+// Bitmask of requests posted by the notifier and not yet fetched.
+int req;
+// Bitmask of all requests the waiter has fetched so far.
+int done;
+
+// Consumer side.  Each iteration raises notify_me, blocks on "event"
+// unless a request is already pending, lowers notify_me again, and then
+// moves the pending request bits from "req" into "done".
+active proctype waiter()
+{
+ int fetch;
+
+ do
+ :: true -> {
+ notify_me++;
+
+ if
+ // With -DBUG the re-check of req below is compiled out, leaving only
+ // the else branch: the waiter then always blocks until event is set.
+#ifndef BUG
+ :: (req > 0) -> skip;
+#endif
+ :: else ->
+ // Wait for a nudge from the other side
+ do
+ :: event == 1 -> { event = 0; break; }
+ od;
+ fi;
+
+ notify_me--;
+
+ // Fetch and clear all pending requests in one indivisible step.
+ atomic { fetch = req; req = 0; }
+ done = done | fetch;
+ }
+ od
+}
+
+// Producer side.  Posts one request bit per iteration (1, 2, ... LAST)
+// and sets "event" only if the waiter has advertised itself through
+// notify_me.  The loop has no else branch, so once next exceeds LAST
+// this process simply blocks.
+active proctype notifier()
+{
+ int next = 1;
+
+ do
+ :: next <= LAST -> {
+ // generate a request
+ req = req | next;
+ next = next << 1;
+
+ // aio_notify
+ if
+ :: notify_me == 1 -> event = 1;
+ :: else -> printf("Skipped event_notifier_set\n"); skip;
+ fi;
+
+ // Test both synchronous and asynchronous delivery
+ // (nondeterministic choice: either block until the waiter has
+ // cleared req, or continue immediately with the next request)
+ if
+ :: 1 -> do
+ :: req == 0 -> break;
+ od;
+ :: 1 -> skip;
+ fi;
+ }
+ od;
+}
+
+// The claim can only keep cycling through its accept state while
+// done < FINAL holds, so it matches executions in which at least one
+// request bit is never collected by the waiter.
+never { /* [] done < FINAL */
+accept_init:
+ do
+ :: done < FINAL -> skip;
+ od;
+}
--- /dev/null
+/*
+ * This model describes the interaction between ctx->notified
+ * and ctx->notifier.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain. If you really want a license,
+ * the WTFPL will do.
+ *
+ * To verify the buggy version:
+ * spin -a -DBUG1 docs/aio_notify_bug.promela
+ * gcc -O2 pan.c
+ * ./a.out -a -f
+ * (or -DBUG2)
+ *
+ * To verify the fixed version:
+ * spin -a docs/aio_notify_bug.promela
+ * gcc -O2 pan.c
+ * ./a.out -a -f
+ *
+ * Add -DCHECK_REQ to test an alternative invariant and the
+ * "notify_me" optimization.
+ */
+
+/* Incremented/decremented by AIO_POLL around its wait; only consulted
+ * by the notifier when USE_NOTIFY_ME is 1.
+ */
+int notify_me;
+/* Set by the notifier; fetched and cleared atomically by AIO_POLL. */
+bool notified;
+/* Set by the notifier; cleared by AIO_POLL when "notified" was set. */
+bool event;
+/* Set by the notifier at each iteration; cleared at the end of AIO_POLL. */
+bool req;
+/* Set once the notifier has left its loop. */
+bool notifier_done;
+
+/* -DCHECK_REQ also turns on the notify_me test in the notifier. */
+#ifdef CHECK_REQ
+#define USE_NOTIFY_ME 1
+#else
+#define USE_NOTIFY_ME 0
+#endif
+
+/* This model has two distinct planted bugs, so a plain -DBUG is rejected. */
+#ifdef BUG
+#error Please define BUG1 or BUG2 instead.
+#endif
+
+/* Producer: an arbitrary number of times, posts a request (req = 1) and,
+ * unless the notify_me test is enabled and notify_me is zero, signals the
+ * waiters through "event" and "notified".  The three variants differ only
+ * in how those two flags are written.  Sets notifier_done on exit.
+ */
+active proctype notifier()
+{
+ do
+ :: true -> {
+ req = 1;
+ if
+ :: !USE_NOTIFY_ME || notify_me ->
+#if defined BUG1
+ /* BUG1: "notified" is written before "event". */
+ /* CHECK_REQ does not detect this bug! */
+ notified = 1;
+ event = 1;
+#elif defined BUG2
+ /* BUG2: "event" is only set if "notified" was still clear. */
+ if
+ :: !notified -> event = 1;
+ :: else -> skip;
+ fi;
+ notified = 1;
+#else
+ /* Fixed version: "event" first, then "notified". */
+ event = 1;
+ notified = 1;
+#endif
+ :: else -> skip;
+ fi
+ }
+ :: true -> break;
+ od;
+ notifier_done = 1;
+}
+
+/* One poll iteration, shared by both waiter processes.  The expanding
+ * process must declare a local "bool old".
+ *
+ * With notify_me raised: if no request is pending, block until "event"
+ * is set.  Afterwards atomically fetch-and-clear "notified", clear
+ * "event" only if "notified" had been set, and finally clear "req".
+ */
+#define AIO_POLL \
+ notify_me++; \
+ if \
+ :: !req -> { \
+ if \
+ :: event -> skip; \
+ fi; \
+ } \
+ :: else -> skip; \
+ fi; \
+ notify_me--; \
+ \
+ atomic { old = notified; notified = 0; } \
+ if \
+ :: old -> event = 0; \
+ :: else -> skip; \
+ fi; \
+ \
+ req = 0;
+
+/* Permanent waiter: runs AIO_POLL in an endless loop.  "old" is the
+ * scratch variable that AIO_POLL expects in the enclosing process.
+ */
+active proctype waiter()
+{
+ bool old;
+
+ do
+ :: true -> AIO_POLL;
+ od;
+}
+
+/* Same as waiter(), but disappears after a while: before each iteration
+ * it may nondeterministically leave the loop and terminate.
+ */
+active proctype temporary_waiter()
+{
+ bool old;
+
+ do
+ :: true -> AIO_POLL;
+ :: true -> break;
+ od;
+}
+
+#ifdef CHECK_REQ
+/* Alternative invariant: a request must eventually be cleared.
+ *
+ * The claim nondeterministically picks a point where req is true and
+ * can then only remain in the accept state while req stays true, so it
+ * matches runs in which req is never reset.  The assert(0) after the
+ * "if" is not reached through the goto.
+ */
+never {
+ do
+ :: req -> goto accept_if_req_not_eventually_false;
+ :: true -> skip;
+ od;
+
+accept_if_req_not_eventually_false:
+ if
+ :: req -> goto accept_if_req_not_eventually_false;
+ fi;
+ assert(0);
+}
+
+#else
+/* There must be infinitely many transitions of event as long
+ * as the notifier does not exit.
+ *
+ * If event stayed always true, the waiters would be busy looping.
+ * If event stayed always false, the waiters would be sleeping
+ * forever.
+ *
+ * Each accept state below can only be re-entered while event keeps
+ * the same value.  For the "stuck at false" case, the claim leaves
+ * the accept state for a non-accepting loop once notifier_done is
+ * set, so a quiet event after the notifier exits is not reported.
+ */
+never {
+ do
+ :: !event -> goto accept_if_event_not_eventually_true;
+ :: event -> goto accept_if_event_not_eventually_false;
+ :: true -> skip;
+ od;
+
+accept_if_event_not_eventually_true:
+ if
+ :: !event && notifier_done -> do :: true -> skip; od;
+ :: !event && !notifier_done -> goto accept_if_event_not_eventually_true;
+ fi;
+ assert(0);
+
+accept_if_event_not_eventually_false:
+ if
+ :: event -> goto accept_if_event_not_eventually_false;
+ fi;
+ assert(0);
+}
+#endif
--- /dev/null
+/*
+ * This model describes a bug in aio_notify. If ctx->notifier is
+ * cleared too late, a wakeup could be lost.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain. If you really want a license,
+ * the WTFPL will do.
+ *
+ * To verify the buggy version:
+ * spin -a -DBUG docs/aio_notify_bug.promela
+ * gcc -O2 pan.c
+ * ./a.out -a -f
+ *
+ * To verify the fixed version:
+ * spin -a docs/aio_notify_bug.promela
+ * gcc -O2 pan.c
+ * ./a.out -a -f
+ *
+ * Add -DCHECK_REQ to test an alternative invariant and the
+ * "notify_me" optimization.
+ */
+
+/* Incremented/decremented by AIO_POLL around its wait; only consulted
+ * by the notifier when USE_NOTIFY_ME is 1.
+ */
+int notify_me;
+/* Set by the notifier; cleared by AIO_POLL. */
+bool event;
+/* Set by the notifier at each iteration; cleared by AIO_POLL. */
+bool req;
+/* Set once the notifier has left its loop. */
+bool notifier_done;
+
+/* -DCHECK_REQ also turns on the notify_me test in the notifier. */
+#ifdef CHECK_REQ
+#define USE_NOTIFY_ME 1
+#else
+#define USE_NOTIFY_ME 0
+#endif
+
+/* Producer: an arbitrary number of times, posts a request (req = 1) and
+ * sets "event" unless the notify_me test is enabled and notify_me is
+ * zero.  Sets notifier_done when it leaves the loop.
+ */
+active proctype notifier()
+{
+ do
+ :: true -> {
+ req = 1;
+ if
+ :: !USE_NOTIFY_ME || notify_me -> event = 1;
+ :: else -> skip;
+ fi
+ }
+ :: true -> break;
+ od;
+ notifier_done = 1;
+}
+
+/* One poll iteration, shared by both waiter processes.  With notify_me
+ * raised: if no request is pending, block until "event" is set; then
+ * clear both "req" and "event".
+ *
+ * The two variants differ only in the order of the final two stores:
+ * the BUG variant clears "req" before "event", the fixed variant clears
+ * "event" before "req".
+ */
+#ifdef BUG
+#define AIO_POLL \
+ notify_me++; \
+ if \
+ :: !req -> { \
+ if \
+ :: event -> skip; \
+ fi; \
+ } \
+ :: else -> skip; \
+ fi; \
+ notify_me--; \
+ \
+ req = 0; \
+ event = 0;
+#else
+#define AIO_POLL \
+ notify_me++; \
+ if \
+ :: !req -> { \
+ if \
+ :: event -> skip; \
+ fi; \
+ } \
+ :: else -> skip; \
+ fi; \
+ notify_me--; \
+ \
+ event = 0; \
+ req = 0;
+#endif
+
+/* Permanent waiter: runs AIO_POLL in an endless loop. */
+active proctype waiter()
+{
+ do
+ :: true -> AIO_POLL;
+ od;
+}
+
+/* Same as waiter(), but disappears after a while: before each iteration
+ * it may nondeterministically leave the loop and terminate.
+ */
+active proctype temporary_waiter()
+{
+ do
+ :: true -> AIO_POLL;
+ :: true -> break;
+ od;
+}
+
+#ifdef CHECK_REQ
+/* Alternative invariant: a request must eventually be cleared.
+ *
+ * The claim nondeterministically picks a point where req is true and
+ * can then only remain in the accept state while req stays true, so it
+ * matches runs in which req is never reset.  The assert(0) after the
+ * "if" is not reached through the goto.
+ */
+never {
+ do
+ :: req -> goto accept_if_req_not_eventually_false;
+ :: true -> skip;
+ od;
+
+accept_if_req_not_eventually_false:
+ if
+ :: req -> goto accept_if_req_not_eventually_false;
+ fi;
+ assert(0);
+}
+
+#else
+/* There must be infinitely many transitions of event as long
+ * as the notifier does not exit.
+ *
+ * If event stayed always true, the waiters would be busy looping.
+ * If event stayed always false, the waiters would be sleeping
+ * forever.
+ *
+ * Each accept state below can only be re-entered while event keeps
+ * the same value.  For the "stuck at false" case, the claim leaves
+ * the accept state for a non-accepting loop once notifier_done is
+ * set, so a quiet event after the notifier exits is not reported.
+ */
+never {
+ do
+ :: !event -> goto accept_if_event_not_eventually_true;
+ :: event -> goto accept_if_event_not_eventually_false;
+ :: true -> skip;
+ od;
+
+accept_if_event_not_eventually_true:
+ if
+ :: !event && notifier_done -> do :: true -> skip; od;
+ :: !event && !notifier_done -> goto accept_if_event_not_eventually_true;
+ fi;
+ assert(0);
+
+accept_if_event_not_eventually_false:
+ if
+ :: event -> goto accept_if_event_not_eventually_false;
+ fi;
+ assert(0);
+}
+#endif
--- /dev/null
+/*
+ * This model describes the implementation of exclusive sections in
+ * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start,
+ * cpu_exec_end).
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain. If you really want a license,
+ * the WTFPL will do.
+ *
+ * To verify it:
+ * spin -a docs/tcg-exclusive.promela
+ * gcc pan.c -O2
+ * ./a.out -a
+ *
+ * Tunable preprocessor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX,
+ * TEST_EXPENSIVE.
+ */
+
+// Define the missing parameters for the model
+// N_CPUS: number of cpu() processes (default 2).
+#ifndef N_CPUS
+#define N_CPUS 2
+#warning defaulting to 2 CPU processes
+#endif
+
+// the expensive test is not so expensive for <= 2 CPUs
+// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs
+// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM
+#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX)
+#define TEST_EXPENSIVE
+#endif
+
+// N_EXCLUSIVE: number of exclusive() processes.  Defaults to 2 when
+// N_CYCLES is unset or <= 1, or when TEST_EXPENSIVE is defined; else 1.
+#ifndef N_EXCLUSIVE
+# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE
+# define N_EXCLUSIVE 2
+# warning defaulting to 2 concurrent exclusive sections
+# else
+# define N_EXCLUSIVE 1
+# warning defaulting to 1 concurrent exclusive sections
+# endif
+#endif
+// N_CYCLES: iterations of each cpu() process.  Defaults to 2 when
+// N_EXCLUSIVE <= 1 or TEST_EXPENSIVE is defined; else 1.
+#ifndef N_CYCLES
+# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE
+# define N_CYCLES 2
+# warning defaulting to 2 CPU cycles
+# else
+# define N_CYCLES 1
+# warning defaulting to 1 CPU cycles
+# endif
+#endif
+
+
+// synchronization primitives. condition variables require a
+// process-local "cond_t saved;" variable.
+
+// A mutex is a byte: 0 means free, 1 means held.  MUTEX_LOCK blocks
+// until the byte is 0 and sets it to 1 in the same atomic step.
+#define mutex_t byte
+#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 }
+#define MUTEX_UNLOCK(m) m = 0
+
+// A condition variable is a counter bumped by every broadcast.
+// COND_WAIT snapshots the counter into "saved", releases the mutex,
+// blocks until the counter differs from the snapshot, then re-takes
+// the mutex.
+#define cond_t int
+#define COND_WAIT(c, m) { \
+ saved = c; \
+ MUTEX_UNLOCK(m); \
+ c != saved -> MUTEX_LOCK(m); \
+ }
+#define COND_BROADCAST(c) c++
+
+// this is the logic from cpus-common.c
+
+mutex_t mutex;
+// Broadcast by cpu_exec_end when pending_cpus drops to 1.
+cond_t exclusive_cond;
+// Broadcast by end_exclusive.
+cond_t exclusive_resume;
+// 0 when no exclusive section is active; start_exclusive sets it to 1
+// and adds one for every CPU it found running.
+byte pending_cpus;
+
+// running[i]: CPU i is between cpu_exec_start and cpu_exec_end.
+byte running[N_CPUS];
+// has_waiter[i]: start_exclusive counted CPU i in pending_cpus.
+byte has_waiter[N_CPUS];
+
+// Called with the mutex held: wait on exclusive_resume for as long as
+// pending_cpus is nonzero.
+#define exclusive_idle() \
+ do \
+ :: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \
+ :: else -> break; \
+ od
+
+// Enter an exclusive section.  Uses the process-local variables "i"
+// and "saved".  Under the mutex: wait until pending_cpus is zero,
+// set pending_cpus to 1, mark every currently running CPU with
+// has_waiter[] and count it in pending_cpus, then wait on
+// exclusive_cond until pending_cpus is back down to 1.
+#define start_exclusive() \
+ MUTEX_LOCK(mutex); \
+ exclusive_idle(); \
+ pending_cpus = 1; \
+ \
+ i = 0; \
+ do \
+ :: i < N_CPUS -> { \
+ if \
+ :: running[i] -> has_waiter[i] = 1; pending_cpus++; \
+ :: else -> skip; \
+ fi; \
+ i++; \
+ } \
+ :: else -> break; \
+ od; \
+ \
+ do \
+ :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \
+ :: else -> break; \
+ od; \
+ MUTEX_UNLOCK(mutex);
+
+// Leave an exclusive section: under the mutex, reset pending_cpus to 0
+// and wake up everything blocked on exclusive_resume.
+#define end_exclusive() \
+ MUTEX_LOCK(mutex); \
+ pending_cpus = 0; \
+ COND_BROADCAST(exclusive_resume); \
+ MUTEX_UNLOCK(mutex);
+
+#ifdef USE_MUTEX
+// Simple version using mutexes
+// cpu_exec_start: under the mutex, wait until pending_cpus is zero and
+// then set running[id].
+#define cpu_exec_start(id) \
+ MUTEX_LOCK(mutex); \
+ exclusive_idle(); \
+ running[id] = 1; \
+ MUTEX_UNLOCK(mutex);
+
+// cpu_exec_end: under the mutex, clear running[id]; if pending_cpus is
+// nonzero, decrement it and broadcast exclusive_cond when it reaches 1.
+#define cpu_exec_end(id) \
+ MUTEX_LOCK(mutex); \
+ running[id] = 0; \
+ if \
+ :: pending_cpus -> { \
+ pending_cpus--; \
+ if \
+ :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \
+ :: else -> skip; \
+ fi; \
+ } \
+ :: else -> skip; \
+ fi; \
+ MUTEX_UNLOCK(mutex);
+#else
+// Wait-free fast path, only needs mutex when concurrent with
+// an exclusive section
+// cpu_exec_start: set running[id] first, then look at pending_cpus.
+// If it is nonzero, take the mutex; when has_waiter[id] is clear
+// (start_exclusive did not count this CPU), drop running[id], wait in
+// exclusive_idle() and set running[id] again.  When has_waiter[id] is
+// set, just continue.
+#define cpu_exec_start(id) \
+ running[id] = 1; \
+ if \
+ :: pending_cpus -> { \
+ MUTEX_LOCK(mutex); \
+ if \
+ :: !has_waiter[id] -> { \
+ running[id] = 0; \
+ exclusive_idle(); \
+ running[id] = 1; \
+ } \
+ :: else -> skip; \
+ fi; \
+ MUTEX_UNLOCK(mutex); \
+ } \
+ :: else -> skip; \
+ fi;
+
+// cpu_exec_end: clear running[id]; if pending_cpus is nonzero, take the
+// mutex and, when has_waiter[id] is set, clear it, decrement
+// pending_cpus and broadcast exclusive_cond when it reaches 1.
+#define cpu_exec_end(id) \
+ running[id] = 0; \
+ if \
+ :: pending_cpus -> { \
+ MUTEX_LOCK(mutex); \
+ if \
+ :: has_waiter[id] -> { \
+ has_waiter[id] = 0; \
+ pending_cpus--; \
+ if \
+ :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \
+ :: else -> skip; \
+ fi; \
+ } \
+ :: else -> skip; \
+ fi; \
+ MUTEX_UNLOCK(mutex); \
+ } \
+ :: else -> skip; \
+ fi
+#endif
+
+// Promela processes
+
+// done_cpu counts completed CPU cycles over all cpu() processes;
+// in_cpu is nonzero while some CPU is inside its start/end bracket.
+byte done_cpu;
+byte in_cpu;
+// One process per modelled CPU.  Runs N_CYCLES iterations, each of
+// which brackets a small body with cpu_exec_start/cpu_exec_end.
+// "saved" is the scratch variable required by COND_WAIT.
+active[N_CPUS] proctype cpu()
+{
+ byte id = _pid % N_CPUS;
+ byte cycles = 0;
+ cond_t saved;
+
+ do
+ :: cycles == N_CYCLES -> break;
+ :: else -> {
+ cycles++;
+ cpu_exec_start(id)
+ in_cpu++;
+ done_cpu++;
+ in_cpu--;
+ cpu_exec_end(id)
+ }
+ od;
+}
+
+// done_exclusive counts completed exclusive sections; in_exclusive is
+// 1 while a process is between start_exclusive and end_exclusive.
+byte done_exclusive;
+byte in_exclusive;
+// One process per exclusive section: runs a single
+// start_exclusive/end_exclusive pair.  "saved" and "i" are the scratch
+// variables required by COND_WAIT and start_exclusive.
+active[N_EXCLUSIVE] proctype exclusive()
+{
+ cond_t saved;
+ byte i;
+
+ start_exclusive();
+ in_exclusive = 1;
+ done_exclusive++;
+ in_exclusive = 0;
+ end_exclusive();
+}
+
+// LIVENESS: every CPU cycle and every exclusive section has completed.
+#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE)
+// SAFETY: no CPU is executing while an exclusive section is active.
+#define SAFETY !(in_exclusive && in_cpu)
+
+// The claim checks SAFETY with an assertion on its second branch, and
+// treats a run on which LIVENESS never becomes true as an accepting
+// cycle through accept_liveness.
+never { /* ! ([] SAFETY && <> [] LIVENESS) */
+ do
+ // once the liveness property is satisfied, this is not executable
+ // and the never clause is not accepted
+ :: ! LIVENESS -> accept_liveness: skip
+ :: 1 -> assert(SAFETY)
+ od;
+}
--- /dev/null
+/*
+ * This model describes the implementation of QemuEvent in
+ * util/qemu-thread-win32.c.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain. If you really want a license,
+ * the WTFPL will do.
+ *
+ * To verify it:
+ * spin -a docs/event.promela
+ * gcc -O2 pan.c -DSAFETY
+ * ./a.out
+ */
+
+/* State of the underlying Win32 event object (true = signalled). */
+bool event;
+/* The condition the waiter waits for; set by the notifier. */
+int value;
+
+/* Primitives for a Win32 event */
+/* RAW_WAIT blocks until "event" is true and does not clear it. */
+#define RAW_RESET event = false
+#define RAW_SET event = true
+#define RAW_WAIT do :: event -> break; od
+
+#if 0
+/* Basic sanity checking: test the Win32 event primitives */
+#define RESET RAW_RESET
+#define SET RAW_SET
+#define WAIT RAW_WAIT
+#else
+/* Full model: layer a userspace-only fast path on top of the RAW_*
+ * primitives. SET/RESET/WAIT have exactly the same semantics as
+ * RAW_SET/RAW_RESET/RAW_WAIT, but try to avoid invoking them.
+ */
+/* Values of "state".  EV_SET is 0 so that RESET's "state | EV_FREE"
+ * turns EV_SET into EV_FREE.
+ */
+#define EV_SET 0
+#define EV_FREE 1
+#define EV_BUSY -1
+
+int state = EV_FREE;
+
+/* SET: nothing to do if state is already EV_SET.  Otherwise exchange
+ * state with EV_SET and call RAW_SET only if the previous value was
+ * EV_BUSY.
+ */
+int xchg_result;
+#define SET if :: state != EV_SET -> \
+ atomic { /* xchg_result=xchg(state, EV_SET) */ \
+ xchg_result = state; \
+ state = EV_SET; \
+ } \
+ if :: xchg_result == EV_BUSY -> RAW_SET; \
+ :: else -> skip; \
+ fi; \
+ :: else -> skip; \
+ fi
+
+/* RESET: if state is EV_SET, atomically OR in EV_FREE (EV_SET becomes
+ * EV_FREE); otherwise leave state alone.
+ */
+#define RESET if :: state == EV_SET -> atomic { state = state | EV_FREE; } \
+ :: else -> skip; \
+ fi
+
+/* WAIT: snapshot state into tmp1.  If it is EV_SET, return at once.
+ * If it is EV_FREE, do RAW_RESET and try to compare-and-swap state from
+ * EV_FREE to EV_BUSY; tmp1 becomes EV_SET if the swap observed EV_SET
+ * and EV_BUSY otherwise.  Finally call RAW_WAIT if tmp1 is EV_BUSY.
+ */
+int tmp1, tmp2;
+#define WAIT tmp1 = state; \
+ if :: tmp1 != EV_SET -> \
+ if :: tmp1 == EV_FREE -> \
+ RAW_RESET; \
+ atomic { /* tmp2=cas(state, EV_FREE, EV_BUSY) */ \
+ tmp2 = state; \
+ if :: tmp2 == EV_FREE -> state = EV_BUSY; \
+ :: else -> skip; \
+ fi; \
+ } \
+ if :: tmp2 == EV_SET -> tmp1 = EV_SET; \
+ :: else -> tmp1 = EV_BUSY; \
+ fi; \
+ :: else -> skip; \
+ fi; \
+ assert(tmp1 != EV_FREE); \
+ if :: tmp1 == EV_BUSY -> RAW_WAIT; \
+ :: else -> skip; \
+ fi; \
+ :: else -> skip; \
+ fi
+#endif
+
+/* Waiter: if "value" is not yet set, RESET the event, check "value"
+ * once more and WAIT only if it is still unset.  The final assertion
+ * states that "value" must be set by the time the waiter gets past
+ * this sequence.
+ */
+active proctype waiter()
+{
+ if
+ :: !value ->
+ RESET;
+ if
+ :: !value -> WAIT;
+ :: else -> skip;
+ fi;
+ :: else -> skip;
+ fi;
+ assert(value);
+}
+
+/* Notifier: publish the condition first, then SET the event. */
+active proctype notifier()
+{
+ value = true;
+ SET;
+}
+++ /dev/null
-/*
- * This model describes the implementation of exclusive sections in
- * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start,
- * cpu_exec_end).
- *
- * Author: Paolo Bonzini <pbonzini@redhat.com>
- *
- * This file is in the public domain. If you really want a license,
- * the WTFPL will do.
- *
- * To verify it:
- * spin -a docs/tcg-exclusive.promela
- * gcc pan.c -O2
- * ./a.out -a
- *
- * Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX,
- * TEST_EXPENSIVE.
- */
-
-// Define the missing parameters for the model
-#ifndef N_CPUS
-#define N_CPUS 2
-#warning defaulting to 2 CPU processes
-#endif
-
-// the expensive test is not so expensive for <= 2 CPUs
-// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs
-// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM
-#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX)
-#define TEST_EXPENSIVE
-#endif
-
-#ifndef N_EXCLUSIVE
-# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE
-# define N_EXCLUSIVE 2
-# warning defaulting to 2 concurrent exclusive sections
-# else
-# define N_EXCLUSIVE 1
-# warning defaulting to 1 concurrent exclusive sections
-# endif
-#endif
-#ifndef N_CYCLES
-# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE
-# define N_CYCLES 2
-# warning defaulting to 2 CPU cycles
-# else
-# define N_CYCLES 1
-# warning defaulting to 1 CPU cycles
-# endif
-#endif
-
-
-// synchronization primitives. condition variables require a
-// process-local "cond_t saved;" variable.
-
-#define mutex_t byte
-#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 }
-#define MUTEX_UNLOCK(m) m = 0
-
-#define cond_t int
-#define COND_WAIT(c, m) { \
- saved = c; \
- MUTEX_UNLOCK(m); \
- c != saved -> MUTEX_LOCK(m); \
- }
-#define COND_BROADCAST(c) c++
-
-// this is the logic from cpus-common.c
-
-mutex_t mutex;
-cond_t exclusive_cond;
-cond_t exclusive_resume;
-byte pending_cpus;
-
-byte running[N_CPUS];
-byte has_waiter[N_CPUS];
-
-#define exclusive_idle() \
- do \
- :: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \
- :: else -> break; \
- od
-
-#define start_exclusive() \
- MUTEX_LOCK(mutex); \
- exclusive_idle(); \
- pending_cpus = 1; \
- \
- i = 0; \
- do \
- :: i < N_CPUS -> { \
- if \
- :: running[i] -> has_waiter[i] = 1; pending_cpus++; \
- :: else -> skip; \
- fi; \
- i++; \
- } \
- :: else -> break; \
- od; \
- \
- do \
- :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \
- :: else -> break; \
- od; \
- MUTEX_UNLOCK(mutex);
-
-#define end_exclusive() \
- MUTEX_LOCK(mutex); \
- pending_cpus = 0; \
- COND_BROADCAST(exclusive_resume); \
- MUTEX_UNLOCK(mutex);
-
-#ifdef USE_MUTEX
-// Simple version using mutexes
-#define cpu_exec_start(id) \
- MUTEX_LOCK(mutex); \
- exclusive_idle(); \
- running[id] = 1; \
- MUTEX_UNLOCK(mutex);
-
-#define cpu_exec_end(id) \
- MUTEX_LOCK(mutex); \
- running[id] = 0; \
- if \
- :: pending_cpus -> { \
- pending_cpus--; \
- if \
- :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \
- :: else -> skip; \
- fi; \
- } \
- :: else -> skip; \
- fi; \
- MUTEX_UNLOCK(mutex);
-#else
-// Wait-free fast path, only needs mutex when concurrent with
-// an exclusive section
-#define cpu_exec_start(id) \
- running[id] = 1; \
- if \
- :: pending_cpus -> { \
- MUTEX_LOCK(mutex); \
- if \
- :: !has_waiter[id] -> { \
- running[id] = 0; \
- exclusive_idle(); \
- running[id] = 1; \
- } \
- :: else -> skip; \
- fi; \
- MUTEX_UNLOCK(mutex); \
- } \
- :: else -> skip; \
- fi;
-
-#define cpu_exec_end(id) \
- running[id] = 0; \
- if \
- :: pending_cpus -> { \
- MUTEX_LOCK(mutex); \
- if \
- :: has_waiter[id] -> { \
- has_waiter[id] = 0; \
- pending_cpus--; \
- if \
- :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \
- :: else -> skip; \
- fi; \
- } \
- :: else -> skip; \
- fi; \
- MUTEX_UNLOCK(mutex); \
- } \
- :: else -> skip; \
- fi
-#endif
-
-// Promela processes
-
-byte done_cpu;
-byte in_cpu;
-active[N_CPUS] proctype cpu()
-{
- byte id = _pid % N_CPUS;
- byte cycles = 0;
- cond_t saved;
-
- do
- :: cycles == N_CYCLES -> break;
- :: else -> {
- cycles++;
- cpu_exec_start(id)
- in_cpu++;
- done_cpu++;
- in_cpu--;
- cpu_exec_end(id)
- }
- od;
-}
-
-byte done_exclusive;
-byte in_exclusive;
-active[N_EXCLUSIVE] proctype exclusive()
-{
- cond_t saved;
- byte i;
-
- start_exclusive();
- in_exclusive = 1;
- done_exclusive++;
- in_exclusive = 0;
- end_exclusive();
-}
-
-#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE)
-#define SAFETY !(in_exclusive && in_cpu)
-
-never { /* ! ([] SAFETY && <> [] LIVENESS) */
- do
- // once the liveness property is satisfied, this is not executable
- // and the never clause is not accepted
- :: ! LIVENESS -> accept_liveness: skip
- :: 1 -> assert(SAFETY)
- od;
-}
+++ /dev/null
-= Tracing =
-
-== Introduction ==
-
-This document describes the tracing infrastructure in QEMU and how to use it
-for debugging, profiling, and observing execution.
-
-== Quickstart ==
-
-1. Build with the 'simple' trace backend:
-
- ./configure --enable-trace-backends=simple
- make
-
-2. Create a file with the events you want to trace:
-
- echo bdrv_aio_readv > /tmp/events
- echo bdrv_aio_writev >> /tmp/events
-
-3. Run the virtual machine to produce a trace file:
-
- qemu -trace events=/tmp/events ... # your normal QEMU invocation
-
-4. Pretty-print the binary trace file:
-
- ./scripts/simpletrace.py trace-events-all trace-* # Override * with QEMU <pid>
-
-== Trace events ==
-
-=== Sub-directory setup ===
-
-Each directory in the source tree can declare a set of static trace events
-in a local "trace-events" file. All directories which contain "trace-events"
-files must be listed in the "trace-events-subdirs" make variable in the top
-level Makefile.objs. During build, the "trace-events" file in each listed
-subdirectory will be processed by the "tracetool" script to generate code for
-the trace events.
-
-The individual "trace-events" files are merged into a "trace-events-all" file,
-which is also installed into "/usr/share/qemu" with the name "trace-events".
-This merged file is to be used by the "simpletrace.py" script to later analyse
-traces in the simpletrace data format.
-
-In the sub-directory the following files will be automatically generated
-
- - trace.c - the trace event state declarations
- - trace.h - the trace event enums and probe functions
- - trace-dtrace.h - DTrace event probe specification
- - trace-dtrace.dtrace - DTrace event probe helper declaration
- - trace-dtrace.o - binary DTrace provider (generated by dtrace)
- - trace-ust.h - UST event probe helper declarations
-
-Source files in the sub-directory should #include the local 'trace.h' file,
-without any sub-directory path prefix. eg io/channel-buffer.c would do
-
- #include "trace.h"
-
-To access the 'io/trace.h' file. While it is possible to include a trace.h
-file from outside a source files' own sub-directory, this is discouraged in
-general. It is strongly preferred that all events be declared directly in
-the sub-directory that uses them. The only exception is where there are some
-shared trace events defined in the top level directory trace-events file.
-The top level directory generates trace files with a filename prefix of
-"trace-root" instead of just "trace". This is to avoid ambiguity between
-a trace.h in the current directory, vs the top level directory.
-
-=== Using trace events ===
-
-Trace events are invoked directly from source code like this:
-
- #include "trace.h" /* needed for trace event prototype */
-
- void *qemu_vmalloc(size_t size)
- {
- void *ptr;
- size_t align = QEMU_VMALLOC_ALIGN;
-
- if (size < align) {
- align = getpagesize();
- }
- ptr = qemu_memalign(align, size);
- trace_qemu_vmalloc(size, ptr);
- return ptr;
- }
-
-=== Declaring trace events ===
-
-The "tracetool" script produces the trace.h header file which is included by
-every source file that uses trace events. Since many source files include
-trace.h, it uses a minimum of types and other header files included to keep the
-namespace clean and compile times and dependencies down.
-
-Trace events should use types as follows:
-
- * Use stdint.h types for fixed-size types. Most offsets and guest memory
- addresses are best represented with uint32_t or uint64_t. Use fixed-size
- types over primitive types whose size may change depending on the host
- (32-bit versus 64-bit) so trace events don't truncate values or break
- the build.
-
- * Use void * for pointers to structs or for arrays. The trace.h header
- cannot include all user-defined struct declarations and it is therefore
- necessary to use void * for pointers to structs.
-
- * For everything else, use primitive scalar types (char, int, long) with the
- appropriate signedness.
-
-Format strings should reflect the types defined in the trace event. Take
-special care to use PRId64 and PRIu64 for int64_t and uint64_t types,
-respectively. This ensures portability between 32- and 64-bit platforms.
-
-Each event declaration will start with the event name, then its arguments,
-finally a format string for pretty-printing. For example:
-
- qemu_vmalloc(size_t size, void *ptr) "size %zu ptr %p"
- qemu_vfree(void *ptr) "ptr %p"
-
-
-=== Hints for adding new trace events ===
-
-1. Trace state changes in the code. Interesting points in the code usually
- involve a state change like starting, stopping, allocating, freeing. State
- changes are good trace events because they can be used to understand the
- execution of the system.
-
-2. Trace guest operations. Guest I/O accesses like reading device registers
- are good trace events because they can be used to understand guest
- interactions.
-
-3. Use correlator fields so the context of an individual line of trace output
- can be understood. For example, trace the pointer returned by malloc and
- used as an argument to free. This way mallocs and frees can be matched up.
- Trace events with no context are not very useful.
-
-4. Name trace events after their function. If there are multiple trace events
- in one function, append a unique distinguisher at the end of the name.
-
-== Generic interface and monitor commands ==
-
-You can programmatically query and control the state of trace events through a
-backend-agnostic interface provided by the header "trace/control.h".
-
-Note that some of the backends do not provide an implementation for some parts
-of this interface, in which case QEMU will just print a warning (please refer to
-header "trace/control.h" to see which routines are backend-dependent).
-
-The state of events can also be queried and modified through monitor commands:
-
-* info trace-events
- View available trace events and their state. State 1 means enabled, state 0
- means disabled.
-
-* trace-event NAME on|off
- Enable/disable a given trace event or a group of events (using wildcards).
-
-The "-trace events=<file>" command line argument can be used to enable the
-events listed in <file> from the very beginning of the program. This file must
-contain one event name per line.
-
-If a line in the "-trace events=<file>" file begins with a '-', the trace event
-will be disabled instead of enabled. This is useful when a wildcard was used
-to enable an entire family of events but one noisy event needs to be disabled.
-
-Wildcard matching is supported in both the monitor command "trace-event" and the
-events list file. That means you can enable/disable the events having a common
-prefix in a batch. For example, virtio-blk trace events could be enabled using
-the following monitor command:
-
- trace-event virtio_blk_* on
-
-== Trace backends ==
-
-The "tracetool" script automates tedious trace event code generation and also
-keeps the trace event declarations independent of the trace backend. The trace
-events are not tightly coupled to a specific trace backend, such as LTTng or
-SystemTap. Support for trace backends can be added by extending the "tracetool"
-script.
-
-The trace backends are chosen at configure time:
-
- ./configure --enable-trace-backends=simple
-
-For a list of supported trace backends, try ./configure --help or see below.
-If multiple backends are enabled, the trace is sent to them all.
-
-If no backends are explicitly selected, configure will default to the
-"log" backend.
-
-The following subsections describe the supported trace backends.
-
-=== Nop ===
-
-The "nop" backend generates empty trace event functions so that the compiler
-can optimize out trace events completely. This imposes no performance
-penalty.
-
-Note that regardless of the selected trace backend, events with the "disable"
-property will be generated with the "nop" backend.
-
-=== Log ===
-
-The "log" backend sends trace events directly to standard error. This
-effectively turns trace events into debug printfs.
-
-This is the simplest backend and can be used together with existing code that
-uses DPRINTF().
-
-=== Simpletrace ===
-
-The "simple" backend supports common use cases and comes as part of the QEMU
-source tree. It may not be as powerful as platform-specific or third-party
-trace backends but it is portable. This is the recommended trace backend
-unless you have specific needs for more advanced backends.
-
-=== Ftrace ===
-
-The "ftrace" backend writes trace data to the ftrace marker. This effectively
-sends trace events to the ftrace ring buffer, so you can compare QEMU trace
-data with kernel (especially kvm.ko when using KVM) trace data.
-
-If you use KVM, enable kvm events in ftrace:
-
- # echo 1 > /sys/kernel/debug/tracing/events/kvm/enable
-
-After running QEMU as the root user, you can get the trace:
-
- # cat /sys/kernel/debug/tracing/trace
-
-Restriction: "ftrace" backend is restricted to Linux only.
-
-=== Syslog ===
-
-The "syslog" backend sends trace events using the POSIX syslog API. The log
-is opened specifying the LOG_DAEMON facility and LOG_PID option (so events
-are tagged with the pid of the particular QEMU process that generated
-them). All events are logged at LOG_INFO level.
-
-NOTE: syslog may squash duplicate consecutive trace events and apply rate
- limiting.
-
-Restriction: "syslog" backend is restricted to POSIX compliant OS.
-
-==== Monitor commands ====
-
-* trace-file on|off|flush|set <path>
- Enable/disable/flush the trace file or set the trace file name.
-
-==== Analyzing trace files ====
-
-The "simple" backend produces binary trace files that can be formatted with the
-simpletrace.py script. The script takes the "trace-events-all" file and the
-binary trace:
-
- ./scripts/simpletrace.py trace-events-all trace-12345
-
-You must ensure that the same "trace-events-all" file was used to build QEMU,
-otherwise trace event declarations may have changed and output will not be
-consistent.
-
-=== LTTng Userspace Tracer ===
-
-The "ust" backend uses the LTTng Userspace Tracer library. There are no
-monitor commands built into QEMU, instead UST utilities should be used to list,
-enable/disable, and dump traces.
-
-Package lttng-tools is required for userspace tracing. You must ensure that the
-current user belongs to the "tracing" group, or manually launch the
-lttng-sessiond daemon for the current user prior to running any instance of
-QEMU.
-
-While running an instrumented QEMU, LTTng should be able to list all available
-events:
-
- lttng list -u
-
-Create tracing session:
-
- lttng create mysession
-
-Enable events:
-
- lttng enable-event qemu:g_malloc -u
-
-Where the events can either be a comma-separated list of events, or "-a" to
-enable all tracepoint events. Start and stop tracing as needed:
-
- lttng start
- lttng stop
-
-View the trace:
-
- lttng view
-
-Destroy tracing session:
-
- lttng destroy
-
-Babeltrace can be used at any later time to view the trace:
-
- babeltrace $HOME/lttng-traces/mysession-<date>-<time>
-
-=== SystemTap ===
-
-The "dtrace" backend uses DTrace sdt probes but has only been tested with
-SystemTap. When SystemTap support is detected a .stp file with wrapper probes
-is generated to make use in scripts more convenient. This step can also be
-performed manually after a build in order to change the binary name in the .stp
-probes:
-
- scripts/tracetool.py --backends=dtrace --format=stap \
- --binary path/to/qemu-binary \
- --target-type system \
- --target-name x86_64 \
- <trace-events-all >qemu.stp
-
-== Trace event properties ==
-
-Each event in the "trace-events-all" file can be prefixed with a space-separated
-list of zero or more of the following event properties.
-
-=== "disable" ===
-
-If a specific trace event is going to be invoked a huge number of times, this
-might have a noticeable performance impact even when the event is
-programmatically disabled.
-
-In this case you should declare such event with the "disable" property. This
-will effectively disable the event at compile time (by using the "nop" backend),
-thus having no performance impact at all on regular builds (i.e., unless you
-edit the "trace-events-all" file).
-
-In addition, there might be cases where relatively complex computations must be
-performed to generate values that are only used as arguments for a trace
-function. In these cases you can use the macro 'TRACE_${EVENT_NAME}_ENABLED' to
-guard such computations and avoid their compilation when the event is disabled:
-
- #include "trace.h" /* needed for trace event prototype */
-
- void *qemu_vmalloc(size_t size)
- {
- void *ptr;
- size_t align = QEMU_VMALLOC_ALIGN;
-
- if (size < align) {
- align = getpagesize();
- }
- ptr = qemu_memalign(align, size);
- if (TRACE_QEMU_VMALLOC_ENABLED) { /* preprocessor macro */
- void *complex;
- /* some complex computations to produce the 'complex' value */
- trace_qemu_vmalloc(size, ptr, complex);
- }
- return ptr;
- }
-
-You can check both whether the event has been disabled at compile time and
-whether it is dynamically enabled at the same time using the
-'trace_event_get_state' routine (see header "trace/control.h" for more
-information).
-
-=== "tcg" ===
-
-Guest code generated by TCG can be traced by defining an event with the "tcg"
-event property. Internally, this property generates two events:
-"<eventname>_trans" to trace the event at translation time, and
-"<eventname>_exec" to trace the event at execution time.
-
-Instead of using these two events directly, you should use the function
-"trace_<eventname>_tcg" during translation (TCG code generation). This function
-will automatically call "trace_<eventname>_trans", and will generate the
-necessary TCG code to call "trace_<eventname>_exec" during guest code execution.
-
-Events with the "tcg" property can be declared in the "trace-events" file with a
-mix of native and TCG types, and "trace_<eventname>_tcg" will gracefully forward
-them to the "<eventname>_trans" and "<eventname>_exec" events. Since TCG values
-are not known at translation time, these are ignored by the "<eventname>_trans"
-event. Because of this, the entry in the "trace-events" file needs two printing
-formats (separated by a comma):
-
- tcg foo(uint8_t a1, TCGv_i32 a2) "a1=%d", "a1=%d a2=%d"
-
-For example:
-
- #include "trace-tcg.h"
-
- void some_disassembly_func (...)
- {
- uint8_t a1 = ...;
- TCGv_i32 a2 = ...;
- trace_foo_tcg(a1, a2);
- }
-
-This will immediately call:
-
- void trace_foo_trans(uint8_t a1);
-
-and will generate the TCG code to call:
-
- void trace_foo_exec(uint8_t a1, uint32_t a2);
-
-=== "vcpu" ===
-
-Identifies events that trace vCPU-specific information. It implicitly adds a
-"CPUState*" argument, and extends the tracing print format to show the vCPU
-information. If used together with the "tcg" property, it adds a second
-"TCGv_env" argument that must point to the per-target global TCG register that
-points to the vCPU when guest code is executed (usually the "cpu_env" variable).
-
-The "tcg" and "vcpu" properties are currently only honored in the root
-./trace-events file.
-
-The following example events:
-
- foo(uint32_t a) "a=%x"
- vcpu bar(uint32_t a) "a=%x"
- tcg vcpu baz(uint32_t a) "a=%x", "a=%x"
-
-Can be used as:
-
- #include "trace-tcg.h"
-
- CPUArchState *env;
- TCGv_ptr cpu_env;
-
- void some_disassembly_func(...)
- {
- /* trace emitted at this point */
- trace_foo(0xd1);
- /* trace emitted at this point */
- trace_bar(ENV_GET_CPU(env), 0xd2);
- /* trace emitted at this point (env) and when guest code is executed (cpu_env) */
- trace_baz_tcg(ENV_GET_CPU(env), cpu_env, 0xd3);
- }
-
-If the translating vCPU has address 0xc1 and code is later executed by vCPU
-0xc2, this would be an example output:
-
- // at guest code translation
- foo a=0xd1
- bar cpu=0xc1 a=0xd2
- baz_trans cpu=0xc1 a=0xd3
- // at guest code execution
- baz_exec cpu=0xc2 a=0xd3
+++ /dev/null
-Virtio devices and migration
-============================
-
-Copyright 2015 IBM Corp.
-
-This work is licensed under the terms of the GNU GPL, version 2 or later. See
-the COPYING file in the top-level directory.
-
-Saving and restoring the state of virtio devices is a bit of a twisty maze,
-for several reasons:
-- state is distributed between several parts:
- - virtio core, for common fields like features, number of queues, ...
- - virtio transport (pci, ccw, ...), for the different proxy devices and
- transport specific state (msix vectors, indicators, ...)
- - virtio device (net, blk, ...), for the different device types and their
- state (mac address, request queue, ...)
-- most fields are saved via the stream interface; subsequently, subsections
- have been added to make cross-version migration possible
-
-This file attempts to document the current procedure and point out some
-caveats.
-
-
-Save state procedure
-====================
-
-virtio core virtio transport virtio device
------------ ---------------- -------------
-
- save() function registered
- via VMState wrapper on
- device class
-virtio_save() <----------
- ------> save_config()
- - save proxy device
- - save transport-specific
- device fields
-- save common device
- fields
-- save common virtqueue
- fields
- ------> save_queue()
- - save transport-specific
- virtqueue fields
- ------> save_device()
- - save device-specific
- fields
-- save subsections
- - device endianness,
- if changed from
- default endianness
- - 64 bit features, if
- any high feature bit
- is set
- - virtio-1 virtqueue
- fields, if VERSION_1
- is set
-
-
-Load state procedure
-====================
-
-virtio core virtio transport virtio device
------------ ---------------- -------------
-
- load() function registered
- via VMState wrapper on
- device class
-virtio_load() <----------
- ------> load_config()
- - load proxy device
- - load transport-specific
- device fields
-- load common device
- fields
-- load common virtqueue
- fields
- ------> load_queue()
- - load transport-specific
- virtqueue fields
-- notify guest
- ------> load_device()
- - load device-specific
- fields
-- load subsections
- - device endianness
- - 64 bit features
- - virtio-1 virtqueue
- fields
-- sanitize endianness
-- sanitize features
-- virtqueue index sanity
- check
- - feature-dependent setup
-
-
-Implications of this setup
-==========================
-
-Devices need to be careful in their state processing during load: The
-load_device() procedure is invoked by the core before subsections have
-been loaded. Any code that depends on information transmitted in subsections
-therefore has to be invoked in the device's load() function _after_
-virtio_load() has returned (e.g. code depending on features).
-
-Any extension of the state being migrated should be done in subsections
-added to the core for compatibility reasons. If transport or device specific
-state is added, core needs to invoke a callback from the new subsection.
+++ /dev/null
-/*
- * This model describes the implementation of QemuEvent in
- * util/qemu-thread-win32.c.
- *
- * Author: Paolo Bonzini <pbonzini@redhat.com>
- *
- * This file is in the public domain. If you really want a license,
- * the WTFPL will do.
- *
- * To verify it:
- * spin -a docs/event.promela
- * gcc -O2 pan.c -DSAFETY
- * ./a.out
- */
-
-bool event; /* state of the modelled Win32 event; written only through RAW_SET/RAW_RESET */
-int value; /* condition the waiter asserts on; set to true by the notifier */
-
-/* Primitives for a Win32 event: RAW_SET/RAW_RESET write the flag, and
- * RAW_WAIT blocks (its do-loop has no other branch) until the flag is true. */
-#define RAW_RESET event = false
-#define RAW_SET event = true
-#define RAW_WAIT do :: event -> break; od
-
-#if 0
-/* Basic sanity checking: test the Win32 event primitives */
-#define RESET RAW_RESET
-#define SET RAW_SET
-#define WAIT RAW_WAIT
-#else
-/* Full model: layer a userspace-only fast path on top of the RAW_*
- * primitives. SET/RESET/WAIT have exactly the same semantics as
- * RAW_SET/RAW_RESET/RAW_WAIT, but try to avoid invoking them.
- */
-#define EV_SET 0
-#define EV_FREE 1
-#define EV_BUSY -1
-
-int state = EV_FREE; /* fast-path state; takes the values EV_SET, EV_FREE or EV_BUSY */
-
-int xchg_result; /* value of state before the atomic exchange performed in SET */
-#define SET if :: state != EV_SET -> \
- atomic { /* xchg_result=xchg(state, EV_SET) */ \
- xchg_result = state; \
- state = EV_SET; \
- } \
- if :: xchg_result == EV_BUSY -> RAW_SET; \
- :: else -> skip; \
- fi; \
- :: else -> skip; \
- fi
-
-#define RESET if :: state == EV_SET -> atomic { state = state | EV_FREE; } \
- :: else -> skip; \
- fi
-
-int tmp1, tmp2; /* scratch for WAIT: tmp1 is the state it acts on, tmp2 the value read by the cas */
-#define WAIT tmp1 = state; \
- if :: tmp1 != EV_SET -> \
- if :: tmp1 == EV_FREE -> \
- RAW_RESET; \
- atomic { /* tmp2=cas(state, EV_FREE, EV_BUSY) */ \
- tmp2 = state; \
- if :: tmp2 == EV_FREE -> state = EV_BUSY; \
- :: else -> skip; \
- fi; \
- } \
- if :: tmp2 == EV_SET -> tmp1 = EV_SET; \
- :: else -> tmp1 = EV_BUSY; \
- fi; \
- :: else -> skip; \
- fi; \
- assert(tmp1 != EV_FREE); \
- if :: tmp1 == EV_BUSY -> RAW_WAIT; \
- :: else -> skip; \
- fi; \
- :: else -> skip; \
- fi
-#endif
-
-active proctype waiter() /* Checks value, and if it is not yet true goes through RESET and WAIT before asserting it */
-{
- if
- :: !value ->
- RESET; /* reset the event before looking at value a second time */
- if
- :: !value -> WAIT; /* value still false after the reset: wait on the event */
- :: else -> skip;
- fi;
- :: else -> skip;
- fi;
- assert(value); /* the property being verified: value is true whenever this point is reached */
-}
-
-active proctype notifier() /* Makes value true and then signals the event */
-{
- value = true; /* publish the condition first */
- SET; /* then signal the event through the fast-path SET */
-}
+++ /dev/null
-= How to write QMP commands using the QAPI framework =
-
-This document is a step-by-step guide on how to write new QMP commands using
-the QAPI framework. It also shows how to implement new style HMP commands.
-
-This document doesn't discuss QMP protocol level details, nor does it dive
-into the QAPI framework implementation.
-
-For an in-depth introduction to the QAPI framework, please refer to
-docs/qapi-code-gen.txt. For documentation about the QMP protocol,
-start with docs/qmp-intro.txt.
-
-== Overview ==
-
-Generally speaking, the following steps should be taken in order to write a
-new QMP command.
-
-1. Write the command's and type(s) specification in the QAPI schema file
- (qapi-schema.json in the root source directory)
-
-2. Write the QMP command itself, which is a regular C function. Preferably,
- the command should be exported by some QEMU subsystem. But it can also be
- added to the qmp.c file
-
-3. At this point the command can be tested under the QMP protocol
-
-4. Write the HMP command equivalent. This is not required and should only be
- done if it does make sense to have the functionality in HMP. The HMP command
- is implemented in terms of the QMP command
-
-The following sections will demonstrate each of the steps above. We will start
-very simple and get more complex as we progress.
-
-=== Testing ===
-
-For all the examples in the next sections, the test setup is the same and is
-shown here.
-
-First, QEMU should be started as:
-
-# /path/to/your/source/qemu [...] \
- -chardev socket,id=qmp,port=4444,host=localhost,server \
- -mon chardev=qmp,mode=control,pretty=on
-
-Then, in a different terminal:
-
-$ telnet localhost 4444
-Trying 127.0.0.1...
-Connected to localhost.
-Escape character is '^]'.
-{
- "QMP": {
- "version": {
- "qemu": {
- "micro": 50,
- "minor": 15,
- "major": 0
- },
- "package": ""
- },
- "capabilities": [
- ]
- }
-}
-
-The above output is the QMP server saying you're connected. The server is
-actually in capabilities negotiation mode. To enter command mode, type:
-
-{ "execute": "qmp_capabilities" }
-
-Then the server should respond:
-
-{
- "return": {
- }
-}
-
-Which is QMP's way of saying "the latest command executed OK and didn't return
-any data". Now you're ready to enter the QMP example commands as explained in
-the following sections.
-
-== Writing a command that doesn't return data ==
-
-This is the simplest QMP command that can be written. Usually, this kind of
-command carries out some meaningful action in QEMU, but here it will just print
-"Hello, world" to the standard output.
-
-Our command will be called "hello-world". It takes no arguments, nor does it
-return any data.
-
-The first step is to add the following line to the bottom of the
-qapi-schema.json file:
-
-{ 'command': 'hello-world' }
-
-The "command" keyword defines a new QMP command. It's a JSON object. All
-schema entries are JSON objects. The line above will instruct the QAPI to
-generate any prototypes and the necessary code to marshal and unmarshal
-protocol data.
-
-The next step is to write the "hello-world" implementation. As explained
-earlier, it's preferable for commands to live in QEMU subsystems. But
-"hello-world" doesn't pertain to any, so we put its implementation in qmp.c:
-
-void qmp_hello_world(Error **errp)
-{
- printf("Hello, world!\n");
-}
-
-There are a few things to be noticed:
-
-1. QMP command implementation functions must be prefixed with "qmp_"
-2. qmp_hello_world() returns void, this is in accordance with the fact that the
- command doesn't return any data
-3. It takes an "Error **" argument. This is required. Later we will see how to
- return errors and take additional arguments. The Error argument should not
- be touched if the command doesn't return errors
-4. We won't add the function's prototype. That's automatically done by the QAPI
-5. Printing to the terminal is discouraged for QMP commands, we do it here
- because it's the easiest way to demonstrate a QMP command
-
-You're done. Now build qemu, run it as suggested in the "Testing" section,
-and then type the following QMP command:
-
-{ "execute": "hello-world" }
-
-Then check the terminal running qemu and look for the "Hello, world" string. If
-you don't see it then something went wrong.
-
-=== Arguments ===
-
-Let's add an argument called "message" to our "hello-world" command. The new
-argument will contain the string to be printed to stdout. It's an optional
-argument; if it's not present we print our default "Hello, world" string.
-
-The first change we have to do is to modify the command specification in the
-schema file to the following:
-
-{ 'command': 'hello-world', 'data': { '*message': 'str' } }
-
-Notice the new 'data' member in the schema. It's a JSON object, each member of
-which is an argument to the command in question. Also notice the asterisk:
-it's used to mark the argument as optional (that means that you shouldn't use
-it for mandatory arguments). Finally, 'str' is the argument's type, which
-stands for "string". The QAPI also supports integers, booleans, enumerations
-and user defined types.
-
-Now, let's update our C implementation in qmp.c:
-
-void qmp_hello_world(bool has_message, const char *message, Error **errp)
-{
- if (has_message) {
- printf("%s\n", message);
- } else {
- printf("Hello, world\n");
- }
-}
-
-There are two important details to be noticed:
-
-1. All optional arguments are accompanied by a 'has_' boolean, which is true
- if the optional argument is present and false otherwise
-2. The C implementation signature must follow the schema's argument ordering,
- which is defined by the "data" member
-
-Time to test our new version of the "hello-world" command. Build qemu, run it as
-described in the "Testing" section and then send two commands:
-
-{ "execute": "hello-world" }
-{
- "return": {
- }
-}
-
-{ "execute": "hello-world", "arguments": { "message": "We love qemu" } }
-{
- "return": {
- }
-}
-
-You should see "Hello, world" and "We love qemu" in the terminal running qemu;
-if you don't see these strings, then something went wrong.
-
-=== Errors ===
-
-QMP commands should use the error interface exported by the error.h header
-file. Basically, most errors are set by calling the error_setg() function.
-
-Let's say we don't accept the string "message" to contain the word "love". If
-it does contain it, we want the "hello-world" command to return an error:
-
-void qmp_hello_world(bool has_message, const char *message, Error **errp)
-{
- if (has_message) {
- if (strstr(message, "love")) {
- error_setg(errp, "the word 'love' is not allowed");
- return;
- }
- printf("%s\n", message);
- } else {
- printf("Hello, world\n");
- }
-}
-
-The first argument to the error_setg() function is the Error pointer
-to pointer, which is passed to all QMP functions. The next argument is a
-human-readable description of the error; this is a free-form printf-like string.
-
-Let's test the example above. Build qemu, run it as defined in the "Testing"
-section, and then issue the following command:
-
-{ "execute": "hello-world", "arguments": { "message": "all you need is love" } }
-
-The QMP server's response should be:
-
-{
- "error": {
- "class": "GenericError",
- "desc": "the word 'love' is not allowed"
- }
-}
-
-As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR
-(done by default when using error_setg()). There are two exceptions to
-this rule:
-
- 1. A non-generic ErrorClass value exists* for the failure you want to report
- (eg. DeviceNotFound)
-
- 2. Management applications have to take special action on the failure you
- want to report, hence you have to add a new ErrorClass value so that they
- can check for it
-
-If the failure you want to report falls into one of the two cases above,
-use error_set() with a second argument of an ErrorClass value.
-
- * All existing ErrorClass values are defined in the qapi-schema.json file
-
-=== Command Documentation ===
-
-There's only one step missing to make "hello-world"'s implementation complete,
-and that's its documentation in the schema file.
-
-This is very important. No QMP command will be accepted in QEMU without proper
-documentation.
-
-There are many examples of such documentation in the schema file already, but
-here goes "hello-world"'s new entry for the qapi-schema.json file:
-
-##
-# @hello-world
-#
-# Print a client provided string to the standard output stream.
-#
-# @message: string to be printed
-#
-# Returns: Nothing on success.
-#
-# Notes: if @message is not provided, the "Hello, world" string will
-# be printed instead
-#
-# Since: <next qemu stable release, eg. 1.0>
-##
-{ 'command': 'hello-world', 'data': { '*message': 'str' } }
-
-Please, note that the "Returns" clause is optional if a command doesn't return
-any data nor any errors.
-
-=== Implementing the HMP command ===
-
-Now that the QMP command is in place, we can also make it available in the human
-monitor (HMP).
-
-With the introduction of the QAPI, HMP commands make QMP calls. Most of the
-time HMP commands are simple wrappers. All HMP command implementations live in
-the hmp.c file.
-
-Here's the implementation of the "hello-world" HMP command:
-
-void hmp_hello_world(Monitor *mon, const QDict *qdict)
-{
- const char *message = qdict_get_try_str(qdict, "message");
- Error *err = NULL;
-
- qmp_hello_world(!!message, message, &err);
- if (err) {
- monitor_printf(mon, "%s\n", error_get_pretty(err));
- error_free(err);
- return;
- }
-}
-
-Also, you have to add the function's prototype to the hmp.h file.
-
-There are three important points to be noticed:
-
-1. The "mon" and "qdict" arguments are mandatory for all HMP functions. The
- former is the monitor object. The latter is how the monitor passes
- arguments entered by the user to the command implementation
-2. hmp_hello_world() performs error checking. In this example we just print
- the error description to the user, but we could do more, like taking
- different actions depending on the error qmp_hello_world() returns
-3. The "err" variable must be initialized to NULL before performing the
- QMP call
-
-There's one last step to actually make the command available to monitor users,
-we should add it to the hmp-commands.hx file:
-
- {
- .name = "hello-world",
- .args_type = "message:s?",
- .params = "hello-world [message]",
- .help = "Print message to the standard output",
- .cmd = hmp_hello_world,
- },
-
-STEXI
-@item hello-world @var{message}
-@findex hello-world
-Print message to the standard output
-ETEXI
-
-To test this you have to open a user monitor and issue the "hello-world"
-command. It might be instructive to check the command's documentation with
-HMP's "help" command.
-
-Please, check the "-monitor" command-line option to know how to open a user
-monitor.
-
-== Writing a command that returns data ==
-
-A QMP command is capable of returning any data the QAPI supports like integers,
-strings, booleans, enumerations and user defined types.
-
-In this section we will focus on user defined types. Please, check the QAPI
-documentation for information about the other types.
-
-=== User Defined Types ===
-
-FIXME This example needs to be redone after commit 6d32717
-
-For this example we will write the query-alarm-clock command, which returns
-information about QEMU's timer alarm. For more information about it, please
-check the "-clock" command-line option.
-
-We want to return two pieces of information. The first one is the alarm clock's
-name. The second one is when the next alarm will fire. The former information is
-returned as a string, the latter is an integer in nanoseconds (which is not
-very useful in practice, as the timer has probably already fired when the
-information reaches the client).
-
-The best way to return that data is to create a new QAPI type, as shown below:
-
-##
-# @QemuAlarmClock
-#
-# QEMU alarm clock information.
-#
-# @clock-name: The alarm clock method's name.
-#
-# @next-deadline: The time (in nanoseconds) the next alarm will fire.
-#
-# Since: 1.0
-##
-{ 'type': 'QemuAlarmClock',
- 'data': { 'clock-name': 'str', '*next-deadline': 'int' } }
-
-The "type" keyword defines a new QAPI type. Its "data" member contains the
-type's members. In this example our members are the "clock-name" and the
-"next-deadline" one, which is optional.
-
-Now let's define the query-alarm-clock command:
-
-##
-# @query-alarm-clock
-#
-# Return information about QEMU's alarm clock.
-#
-# Returns a @QemuAlarmClock instance describing the alarm clock method
-# being currently used by QEMU (this is usually set by the '-clock'
-# command-line option).
-#
-# Since: 1.0
-##
-{ 'command': 'query-alarm-clock', 'returns': 'QemuAlarmClock' }
-
-Notice the "returns" keyword. As its name suggests, it's used to define the
-data returned by a command.
-
-It's time to implement the qmp_query_alarm_clock() function, you can put it
-in the qemu-timer.c file:
-
-QemuAlarmClock *qmp_query_alarm_clock(Error **errp)
-{
- QemuAlarmClock *clock;
- int64_t deadline;
-
- clock = g_malloc0(sizeof(*clock));
-
- deadline = qemu_next_alarm_deadline();
- if (deadline > 0) {
- clock->has_next_deadline = true;
- clock->next_deadline = deadline;
- }
- clock->clock_name = g_strdup(alarm_timer->name);
-
- return clock;
-}
-
-There are a number of things to be noticed:
-
-1. The QemuAlarmClock type is automatically generated by the QAPI framework,
- its members correspond to the type's specification in the schema file
-2. As specified in the schema file, the function returns a QemuAlarmClock
- instance and takes no arguments (besides the "errp" one, which is mandatory
- for all QMP functions)
-3. The "clock" variable (which will point to our QAPI type instance) is
- allocated by the regular g_malloc0() function. Note that we chose to
- initialize the memory to zero. This is recommended for all QAPI types, as
- it helps avoid bad surprises (especially with booleans)
-4. Remember that "next_deadline" is optional? All optional members have a
- 'has_MEMBER_NAME' member that should be properly set by the implementation,
- as shown above
-5. Even static strings, such as "alarm_timer->name", should be dynamically
- allocated by the implementation. This is so because the QAPI also generates
- a function to free its types and it cannot distinguish between dynamically
- or statically allocated strings
-6. You have to include the "qmp-commands.h" header file in qemu-timer.c,
- otherwise qemu won't build
-
-Time to test the new command. Build qemu, run it as described in the "Testing"
-section and try this:
-
-{ "execute": "query-alarm-clock" }
-{
- "return": {
- "next-deadline": 2368219,
- "clock-name": "dynticks"
- }
-}
-
-==== The HMP command ====
-
-Here's the HMP counterpart of the query-alarm-clock command:
-
-void hmp_info_alarm_clock(Monitor *mon)
-{
- QemuAlarmClock *clock;
- Error *err = NULL;
-
- clock = qmp_query_alarm_clock(&err);
- if (err) {
- monitor_printf(mon, "Could not query alarm clock information\n");
- error_free(err);
- return;
- }
-
- monitor_printf(mon, "Alarm clock method in use: '%s'\n", clock->clock_name);
- if (clock->has_next_deadline) {
- monitor_printf(mon, "Next alarm will fire in %" PRId64 " nanoseconds\n",
- clock->next_deadline);
- }
-
- qapi_free_QemuAlarmClock(clock);
-}
-
-It's important to notice that hmp_info_alarm_clock() calls
-qapi_free_QemuAlarmClock() to free the data returned by qmp_query_alarm_clock().
-For user defined types, the QAPI will generate a qapi_free_QAPI_TYPE_NAME()
-function and that's what you have to use to free the types you define and
-qapi_free_QAPI_TYPE_NAMEList() for list types (explained in the next section).
-If the QMP call returns a string, then you should use g_free() to free it.
-
-Also note that hmp_info_alarm_clock() performs error handling. That's not
-strictly required if you're sure the QMP function doesn't return errors, but
-it's good practice to always check for errors.
-
-Another important detail is that HMP's "info" commands don't go into the
-hmp-commands.hx. Instead, they go into the info_cmds[] table, which is defined
-in the monitor.c file. The entry for the "info alarmclock" follows:
-
- {
- .name = "alarmclock",
- .args_type = "",
- .params = "",
- .help = "show information about the alarm clock",
- .cmd = hmp_info_alarm_clock,
- },
-
-To test this, run qemu and type "info alarmclock" in the user monitor.
-
-=== Returning Lists ===
-
-For this example, we're going to return all available methods for the timer
-alarm, which is pretty much what the command-line option "-clock ?" does,
-except that we're also going to inform which method is in use.
-
-The first step is to define a new type:
-
-##
-# @TimerAlarmMethod
-#
-# Timer alarm method information.
-#
-# @method-name: The method's name.
-#
-# @current: true if this alarm method is currently in use, false otherwise
-#
-# Since: 1.0
-##
-{ 'type': 'TimerAlarmMethod',
- 'data': { 'method-name': 'str', 'current': 'bool' } }
-
-The command will be called "query-alarm-methods", here is its schema
-specification:
-
-##
-# @query-alarm-methods
-#
-# Returns information about available alarm methods.
-#
-# Returns: a list of @TimerAlarmMethod for each method
-#
-# Since: 1.0
-##
-{ 'command': 'query-alarm-methods', 'returns': ['TimerAlarmMethod'] }
-
-Notice the syntax for returning lists "'returns': ['TimerAlarmMethod']", this
-should be read as "returns a list of TimerAlarmMethod instances".
-
-The C implementation follows:
-
-TimerAlarmMethodList *qmp_query_alarm_methods(Error **errp)
-{
- TimerAlarmMethodList *method_list = NULL;
- const struct qemu_alarm_timer *p;
- bool current = true;
-
- for (p = alarm_timers; p->name; p++) {
- TimerAlarmMethodList *info = g_malloc0(sizeof(*info));
- info->value = g_malloc0(sizeof(*info->value));
- info->value->method_name = g_strdup(p->name);
- info->value->current = current;
-
- current = false;
-
- info->next = method_list;
- method_list = info;
- }
-
- return method_list;
-}
-
-The most important difference from the previous examples is the
-TimerAlarmMethodList type, which is automatically generated by the QAPI from
-the TimerAlarmMethod type.
-
-Each list node is represented by a TimerAlarmMethodList instance. We have to
-allocate it, and that's done inside the for loop: the "info" pointer points to
-an allocated node. We also have to allocate the node's contents, which is
-stored in its "value" member. In our example, the "value" member is a pointer
-to a TimerAlarmMethod instance.
-
-Notice that the "current" variable is used as "true" only in the first
-iteration of the loop. That's because the alarm timer method in use is the
-first element of the alarm_timers array. Also notice that QAPI lists are handled
-by hand and we return the head of the list.
-
-Now build qemu, run it as explained in the "Testing" section and try our new
-command:
-
-{ "execute": "query-alarm-methods" }
-{
- "return": [
- {
- "current": false,
- "method-name": "unix"
- },
- {
- "current": true,
- "method-name": "dynticks"
- }
- ]
-}
-
-The HMP counterpart is a bit more complex than previous examples because it
-has to traverse the list, it's shown below for reference:
-
-void hmp_info_alarm_methods(Monitor *mon)
-{
- TimerAlarmMethodList *method_list, *method;
- Error *err = NULL;
-
- method_list = qmp_query_alarm_methods(&err);
- if (err) {
- monitor_printf(mon, "Could not query alarm methods\n");
- error_free(err);
- return;
- }
-
- for (method = method_list; method; method = method->next) {
- monitor_printf(mon, "%c %s\n", method->value->current ? '*' : ' ',
- method->value->method_name);
- }
-
- qapi_free_TimerAlarmMethodList(method_list);
-}