From: Paolo Bonzini <pbonzini@redhat.com> Date: Tue, 6 Jun 2017 14:46:26 +0000 (+0200) Subject: docs: create config/, devel/ and spin/ subdirectories X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=ac06724a71;p=qemu.git docs: create config/, devel/ and spin/ subdirectories Developer documentation should be its own manual. As a start, move all developer-oriented files to a separate directory. Also move non-text files to their own directories: docs/config/ for QEMU -readconfig input, and docs/spin/ for formal models to be used with the SPIN model checker. Reviewed-by: Daniel P. Berrange <berrange@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> --- diff --git a/docs/aio_notify.promela b/docs/aio_notify.promela deleted file mode 100644 index fccc7ee1c3..0000000000 --- a/docs/aio_notify.promela +++ /dev/null @@ -1,93 +0,0 @@ -/* - * This model describes the interaction between ctx->notify_me - * and aio_notify(). - * - * Author: Paolo Bonzini <pbonzini@redhat.com> - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. 
- * - * To simulate it: - * spin -p docs/aio_notify.promela - * - * To verify it: - * spin -a docs/aio_notify.promela - * gcc -O2 pan.c - * ./a.out -a - * - * To verify it (with a bug planted in the model): - * spin -a -DBUG docs/aio_notify.promela - * gcc -O2 pan.c - * ./a.out -a - */ - -#define MAX 4 -#define LAST (1 << (MAX - 1)) -#define FINAL ((LAST << 1) - 1) - -bool notify_me; -bool event; - -int req; -int done; - -active proctype waiter() -{ - int fetch; - - do - :: true -> { - notify_me++; - - if -#ifndef BUG - :: (req > 0) -> skip; -#endif - :: else -> - // Wait for a nudge from the other side - do - :: event == 1 -> { event = 0; break; } - od; - fi; - - notify_me--; - - atomic { fetch = req; req = 0; } - done = done | fetch; - } - od -} - -active proctype notifier() -{ - int next = 1; - - do - :: next <= LAST -> { - // generate a request - req = req | next; - next = next << 1; - - // aio_notify - if - :: notify_me == 1 -> event = 1; - :: else -> printf("Skipped event_notifier_set\n"); skip; - fi; - - // Test both synchronous and asynchronous delivery - if - :: 1 -> do - :: req == 0 -> break; - od; - :: 1 -> skip; - fi; - } - od; -} - -never { /* [] done < FINAL */ -accept_init: - do - :: done < FINAL -> skip; - od; -} diff --git a/docs/aio_notify_accept.promela b/docs/aio_notify_accept.promela deleted file mode 100644 index 9cef2c955d..0000000000 --- a/docs/aio_notify_accept.promela +++ /dev/null @@ -1,152 +0,0 @@ -/* - * This model describes the interaction between ctx->notified - * and ctx->notifier. - * - * Author: Paolo Bonzini <pbonzini@redhat.com> - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. 
- * - * To verify the buggy version: - * spin -a -DBUG1 docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * (or -DBUG2) - * - * To verify the fixed version: - * spin -a docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * - * Add -DCHECK_REQ to test an alternative invariant and the - * "notify_me" optimization. - */ - -int notify_me; -bool notified; -bool event; -bool req; -bool notifier_done; - -#ifdef CHECK_REQ -#define USE_NOTIFY_ME 1 -#else -#define USE_NOTIFY_ME 0 -#endif - -#ifdef BUG -#error Please define BUG1 or BUG2 instead. -#endif - -active proctype notifier() -{ - do - :: true -> { - req = 1; - if - :: !USE_NOTIFY_ME || notify_me -> -#if defined BUG1 - /* CHECK_REQ does not detect this bug! */ - notified = 1; - event = 1; -#elif defined BUG2 - if - :: !notified -> event = 1; - :: else -> skip; - fi; - notified = 1; -#else - event = 1; - notified = 1; -#endif - :: else -> skip; - fi - } - :: true -> break; - od; - notifier_done = 1; -} - -#define AIO_POLL \ - notify_me++; \ - if \ - :: !req -> { \ - if \ - :: event -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - notify_me--; \ - \ - atomic { old = notified; notified = 0; } \ - if \ - :: old -> event = 0; \ - :: else -> skip; \ - fi; \ - \ - req = 0; - -active proctype waiter() -{ - bool old; - - do - :: true -> AIO_POLL; - od; -} - -/* Same as waiter(), but disappears after a while. */ -active proctype temporary_waiter() -{ - bool old; - - do - :: true -> AIO_POLL; - :: true -> break; - od; -} - -#ifdef CHECK_REQ -never { - do - :: req -> goto accept_if_req_not_eventually_false; - :: true -> skip; - od; - -accept_if_req_not_eventually_false: - if - :: req -> goto accept_if_req_not_eventually_false; - fi; - assert(0); -} - -#else -/* There must be infinitely many transitions of event as long - * as the notifier does not exit. - * - * If event stayed always true, the waiters would be busy looping. 
- * If event stayed always false, the waiters would be sleeping - * forever. - */ -never { - do - :: !event -> goto accept_if_event_not_eventually_true; - :: event -> goto accept_if_event_not_eventually_false; - :: true -> skip; - od; - -accept_if_event_not_eventually_true: - if - :: !event && notifier_done -> do :: true -> skip; od; - :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; - fi; - assert(0); - -accept_if_event_not_eventually_false: - if - :: event -> goto accept_if_event_not_eventually_false; - fi; - assert(0); -} -#endif diff --git a/docs/aio_notify_bug.promela b/docs/aio_notify_bug.promela deleted file mode 100644 index b3bfca1ca4..0000000000 --- a/docs/aio_notify_bug.promela +++ /dev/null @@ -1,140 +0,0 @@ -/* - * This model describes a bug in aio_notify. If ctx->notifier is - * cleared too late, a wakeup could be lost. - * - * Author: Paolo Bonzini <pbonzini@redhat.com> - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. - * - * To verify the buggy version: - * spin -a -DBUG docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * - * To verify the fixed version: - * spin -a docs/aio_notify_bug.promela - * gcc -O2 pan.c - * ./a.out -a -f - * - * Add -DCHECK_REQ to test an alternative invariant and the - * "notify_me" optimization. 
- */ - -int notify_me; -bool event; -bool req; -bool notifier_done; - -#ifdef CHECK_REQ -#define USE_NOTIFY_ME 1 -#else -#define USE_NOTIFY_ME 0 -#endif - -active proctype notifier() -{ - do - :: true -> { - req = 1; - if - :: !USE_NOTIFY_ME || notify_me -> event = 1; - :: else -> skip; - fi - } - :: true -> break; - od; - notifier_done = 1; -} - -#ifdef BUG -#define AIO_POLL \ - notify_me++; \ - if \ - :: !req -> { \ - if \ - :: event -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - notify_me--; \ - \ - req = 0; \ - event = 0; -#else -#define AIO_POLL \ - notify_me++; \ - if \ - :: !req -> { \ - if \ - :: event -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - notify_me--; \ - \ - event = 0; \ - req = 0; -#endif - -active proctype waiter() -{ - do - :: true -> AIO_POLL; - od; -} - -/* Same as waiter(), but disappears after a while. */ -active proctype temporary_waiter() -{ - do - :: true -> AIO_POLL; - :: true -> break; - od; -} - -#ifdef CHECK_REQ -never { - do - :: req -> goto accept_if_req_not_eventually_false; - :: true -> skip; - od; - -accept_if_req_not_eventually_false: - if - :: req -> goto accept_if_req_not_eventually_false; - fi; - assert(0); -} - -#else -/* There must be infinitely many transitions of event as long - * as the notifier does not exit. - * - * If event stayed always true, the waiters would be busy looping. - * If event stayed always false, the waiters would be sleeping - * forever. 
- */ -never { - do - :: !event -> goto accept_if_event_not_eventually_true; - :: event -> goto accept_if_event_not_eventually_false; - :: true -> skip; - od; - -accept_if_event_not_eventually_true: - if - :: !event && notifier_done -> do :: true -> skip; od; - :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; - fi; - assert(0); - -accept_if_event_not_eventually_false: - if - :: event -> goto accept_if_event_not_eventually_false; - fi; - assert(0); -} -#endif diff --git a/docs/atomics.txt b/docs/atomics.txt deleted file mode 100644 index 3ef5d85b1b..0000000000 --- a/docs/atomics.txt +++ /dev/null @@ -1,388 +0,0 @@ -CPUs perform independent memory operations effectively in random order. -but this can be a problem for CPU-CPU interaction (including interactions -between QEMU and the guest). Multi-threaded programs use various tools -to instruct the compiler and the CPU to restrict the order to something -that is consistent with the expectations of the programmer. - -The most basic tool is locking. Mutexes, condition variables and -semaphores are used in QEMU, and should be the default approach to -synchronization. Anything else is considerably harder, but it's -also justified more often than one would like. The two tools that -are provided by qemu/atomic.h are memory barriers and atomic operations. - -Macros defined by qemu/atomic.h fall in three camps: - -- compiler barriers: barrier(); - -- weak atomic access and manual memory barriers: atomic_read(), - atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(), - smp_mb_release(), smp_read_barrier_depends(); - -- sequentially consistent atomic access: everything else. - - -COMPILER MEMORY BARRIER -======================= - -barrier() prevents the compiler from moving the memory accesses either -side of it to the other side. The compiler barrier has no direct effect -on the CPU, which may then reorder things however it wishes. - -barrier() is mostly used within qemu/atomic.h itself. 
On some -architectures, CPU guarantees are strong enough that blocking compiler -optimizations already ensures the correct order of execution. In this -case, qemu/atomic.h will reduce stronger memory barriers to simple -compiler barriers. - -Still, barrier() can be useful when writing code that can be interrupted -by signal handlers. - - -SEQUENTIALLY CONSISTENT ATOMIC ACCESS -===================================== - -Most of the operations in the qemu/atomic.h header ensure *sequential -consistency*, where "the result of any execution is the same as if the -operations of all the processors were executed in some sequential order, -and the operations of each individual processor appear in this sequence -in the order specified by its program". - -qemu/atomic.h provides the following set of atomic read-modify-write -operations: - - void atomic_inc(ptr) - void atomic_dec(ptr) - void atomic_add(ptr, val) - void atomic_sub(ptr, val) - void atomic_and(ptr, val) - void atomic_or(ptr, val) - - typeof(*ptr) atomic_fetch_inc(ptr) - typeof(*ptr) atomic_fetch_dec(ptr) - typeof(*ptr) atomic_fetch_add(ptr, val) - typeof(*ptr) atomic_fetch_sub(ptr, val) - typeof(*ptr) atomic_fetch_and(ptr, val) - typeof(*ptr) atomic_fetch_or(ptr, val) - typeof(*ptr) atomic_xchg(ptr, val) - typeof(*ptr) atomic_cmpxchg(ptr, old, new) - -all of which return the old value of *ptr. These operations are -polymorphic; they operate on any type that is as wide as an int. - -Sequentially consistent loads and stores can be done using: - - atomic_fetch_add(ptr, 0) for loads - atomic_xchg(ptr, val) for stores - -However, they are quite expensive on some platforms, notably POWER and -ARM. Therefore, qemu/atomic.h provides two primitives with slightly -weaker constraints: - - typeof(*ptr) atomic_mb_read(ptr) - void atomic_mb_set(ptr, val) - -The semantics of these primitives map to Java volatile variables, -and are strongly related to memory barriers as used in the Linux -kernel (see below). 
- -As long as you use atomic_mb_read and atomic_mb_set, accesses cannot -be reordered with each other, and it is also not possible to reorder -"normal" accesses around them. - -However, and this is the important difference between -atomic_mb_read/atomic_mb_set and sequential consistency, it is important -for both threads to access the same volatile variable. It is not the -case that everything visible to thread A when it writes volatile field f -becomes visible to thread B after it reads volatile field g. The store -and load have to "match" (i.e., be performed on the same volatile -field) to achieve the right semantics. - - -These operations operate on any type that is as wide as an int or smaller. - - -WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS -============================================= - -Compared to sequentially consistent atomic access, programming with -weaker consistency models can be considerably more complicated. -In general, if the algorithm you are writing includes both writes -and reads on the same side, it is generally simpler to use sequentially -consistent primitives. - -When using this model, variables are accessed with atomic_read() and -atomic_set(), and restrictions on the ordering of accesses are enforced -using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(), -smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends(). - -atomic_read() and atomic_set() prevent the compiler from using -optimizations that might otherwise optimize accesses out of existence -on the one hand, or that might create unsolicited accesses on the other. -In general this should not have any effect, because the same compiler -barriers are already implied by memory barriers. However, it is useful -to do so, because it tells readers which variables are shared with -other threads, and which are local to the current thread or protected -by other, more mundane means. - -Memory barriers control the order of references to shared memory. 
-They come in six kinds: - -- smp_rmb() guarantees that all the LOAD operations specified before - the barrier will appear to happen before all the LOAD operations - specified after the barrier with respect to the other components of - the system. - - In other words, smp_rmb() puts a partial ordering on loads, but is not - required to have any effect on stores. - -- smp_wmb() guarantees that all the STORE operations specified before - the barrier will appear to happen before all the STORE operations - specified after the barrier with respect to the other components of - the system. - - In other words, smp_wmb() puts a partial ordering on stores, but is not - required to have any effect on loads. - -- smp_mb_acquire() guarantees that all the LOAD operations specified before - the barrier will appear to happen before all the LOAD or STORE operations - specified after the barrier with respect to the other components of - the system. - -- smp_mb_release() guarantees that all the STORE operations specified *after* - the barrier will appear to happen after all the LOAD or STORE operations - specified *before* the barrier with respect to the other components of - the system. - -- smp_mb() guarantees that all the LOAD and STORE operations specified - before the barrier will appear to happen before all the LOAD and - STORE operations specified after the barrier with respect to the other - components of the system. - - smp_mb() puts a partial ordering on both loads and stores. It is - stronger than both a read and a write memory barrier; it implies both - smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs - coming before the barrier from overtaking LOADs coming after the - barrier and vice versa. - -- smp_read_barrier_depends() is a weaker kind of read barrier. 
On - most processors, whenever two loads are performed such that the - second depends on the result of the first (e.g., the first load - retrieves the address to which the second load will be directed), - the processor will guarantee that the first LOAD will appear to happen - before the second with respect to the other components of the system. - However, this is not always true---for example, it was not true on - Alpha processors. Whenever this kind of access happens to shared - memory (that is not protected by a lock), a read barrier is needed, - and smp_read_barrier_depends() can be used instead of smp_rmb(). - - Note that the first load really has to have a _data_ dependency and not - a control dependency. If the address for the second load is dependent - on the first load, but the dependency is through a conditional rather - than actually loading the address itself, then it's a _control_ - dependency and a full read barrier or better is required. - - -This is the set of barriers that is required *between* two atomic_read() -and atomic_set() operations to achieve sequential consistency: - - | 2nd operation | - |-----------------------------------------------| - 1st operation | (after last) | atomic_read | atomic_set | - ---------------+----------------+-------------+----------------| - (before first) | | none | smp_mb_release | - ---------------+----------------+-------------+----------------| - atomic_read | smp_mb_acquire | smp_rmb* | ** | - ---------------+----------------+-------------+----------------| - atomic_set | none | smp_mb()*** | smp_wmb() | - ---------------+----------------+-------------+----------------| - - * Or smp_read_barrier_depends(). - - ** This requires a load-store barrier. This is achieved by - either smp_mb_acquire() or smp_mb_release(). - - *** This requires a store-load barrier. On most machines, the only - way to achieve this is a full barrier. 
- - -You can see that the two possible definitions of atomic_mb_read() -and atomic_mb_set() are the following: - - 1) atomic_mb_read(p) = atomic_read(p); smp_mb_acquire() - atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb() - - 2) atomic_mb_read(p) = smp_mb(); atomic_read(p); smp_mb_acquire() - atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); - -Usually the former is used, because smp_mb() is expensive and a program -normally has more reads than writes. Therefore it makes more sense to -make atomic_mb_set() the more expensive operation. - -There are two common cases in which atomic_mb_read and atomic_mb_set -generate too many memory barriers, and thus it can be useful to manually -place barriers instead: - -- when a data structure has one thread that is always a writer - and one thread that is always a reader, manual placement of - memory barriers makes the write side faster. Furthermore, - correctness is easy to check for in this case using the "pairing" - trick that is explained below: - - thread 1 thread 1 - ------------------------- ------------------------ - (other writes) - smp_mb_release() - atomic_mb_set(&a, x) atomic_set(&a, x) - smp_wmb() - atomic_mb_set(&b, y) atomic_set(&b, y) - - => - thread 2 thread 2 - ------------------------- ------------------------ - y = atomic_mb_read(&b) y = atomic_read(&b) - smp_rmb() - x = atomic_mb_read(&a) x = atomic_read(&a) - smp_mb_acquire() - - Note that the barrier between the stores in thread 1, and between - the loads in thread 2, has been optimized here to a write or a - read memory barrier respectively. On some architectures, notably - ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as - smp_mb, but smp_rmb and/or smp_wmb are more efficient. - -- sometimes, a thread is accessing many variables that are otherwise - unrelated to each other (for example because, apart from the current - thread, exactly one other thread will read or write each of these - variables). 
In this case, it is possible to "hoist" the implicit - barriers provided by atomic_mb_read() and atomic_mb_set() outside - a loop. For example, the above definition of atomic_mb_read() gives - the following transformation: - - n = 0; n = 0; - for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) - n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]); - smp_mb_acquire(); - - Similarly, atomic_mb_set() can be transformed as follows: - - smp_mb_release(); - for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) - atomic_mb_set(&a[i], false); atomic_set(&a[i], false); - smp_mb(); - - -The two tricks can be combined. In this case, splitting a loop in -two lets you hoist the barriers out of the loops _and_ eliminate the -expensive smp_mb(): - - smp_mb_release(); - for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++) - atomic_mb_set(&a[i], false); atomic_set(&a[i], false); - atomic_mb_set(&b[i], false); smp_wmb(); - } for (i = 0; i < 10; i++) - atomic_set(&b[i], false); - smp_mb(); - - The other thread can still use atomic_mb_read()/atomic_mb_set(). - - -Memory barrier pairing ----------------------- - -A useful rule of thumb is that memory barriers should always, or almost -always, be paired with another barrier. In the case of QEMU, however, -note that the other barrier may actually be in a driver that runs in -the guest! - -For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() -both count as read barriers. A read barrier shall pair with a write -barrier or a full barrier; a write barrier shall pair with a read -barrier or a full barrier. A full barrier can pair with anything. -For example: - - thread 1 thread 2 - =============== =============== - a = 1; - smp_wmb(); - b = 2; x = b; - smp_rmb(); - y = a; - -Note that the "writing" thread is accessing the variables in the -opposite order as the "reading" thread. This is expected: stores -before the write barrier will normally match the loads after the -read barrier, and vice versa. 
The same is true for more than 2 -accesses and for data dependency barriers: - - thread 1 thread 2 - =============== =============== - b[2] = 1; - smp_wmb(); - x->i = 2; - smp_wmb(); - a = x; x = a; - smp_read_barrier_depends(); - y = x->i; - smp_read_barrier_depends(); - z = b[y]; - -smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(), -and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release(). - - -COMPARISON WITH LINUX KERNEL MEMORY BARRIERS -============================================ - -Here is a list of differences between Linux kernel atomic operations -and memory barriers, and the equivalents in QEMU: - -- atomic operations in Linux are always on a 32-bit int type and - use a boxed atomic_t type; atomic operations in QEMU are polymorphic - and use normal C types. - -- Originally, atomic_read and atomic_set in Linux gave no guarantee - at all. Linux 4.1 updated them to implement volatile - semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE). - - QEMU's atomic_read/set implement, if the compiler supports it, C11 - atomic relaxed semantics, and volatile semantics otherwise. - Both semantics prevent the compiler from doing certain transformations; - the difference is that atomic accesses are guaranteed to be atomic, - while volatile accesses aren't. Thus, in the volatile case we just cross - our fingers hoping that the compiler will generate atomic accesses, - since we assume the variables passed are machine-word sized and - properly aligned. - No barriers are implied by atomic_read/set in either Linux or QEMU. - -- atomic read-modify-write operations in Linux are of three kinds: - - atomic_OP returns void - atomic_OP_return returns new value of the variable - atomic_fetch_OP returns the old value of the variable - atomic_cmpxchg returns the old value of the variable - - In QEMU, the second kind does not exist. Currently Linux has - atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub. 
- -- different atomic read-modify-write operations in Linux imply - a different set of memory barriers; in QEMU, all of them enforce - sequential consistency, which means they imply full memory barriers - before and after the operation. - -- Linux does not have an equivalent of atomic_mb_set(). In particular, - note that smp_store_mb() is a little weaker than atomic_mb_set(). - atomic_mb_read() compiles to the same instructions as Linux's - smp_load_acquire(), but this should be treated as an implementation - detail. QEMU does have atomic_load_acquire() and atomic_store_release() - macros, but for now they are only used within atomic.h. This may - change in the future. - - -SOURCES -======= - -* Documentation/memory-barriers.txt from the Linux kernel - -* "The JSR-133 Cookbook for Compiler Writers", available at - http://g.oswego.edu/dl/jmm/cookbook.html diff --git a/docs/bitmaps.md b/docs/bitmaps.md deleted file mode 100644 index a2e8d51163..0000000000 --- a/docs/bitmaps.md +++ /dev/null @@ -1,505 +0,0 @@ -<!-- -Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc. -All rights reserved. - -This file is licensed via The FreeBSD Documentation License, the full text of -which is included at the end of this document. ---> - -# Dirty Bitmaps and Incremental Backup - -* Dirty Bitmaps are objects that track which data needs to be backed up for the - next incremental backup. - -* Dirty bitmaps can be created at any time and attached to any node - (not just complete drives.) - -## Dirty Bitmap Names - -* A dirty bitmap's name is unique to the node, but bitmaps attached to different - nodes can share the same name. - -* Dirty bitmaps created for internal use by QEMU may be anonymous and have no - name, but any user-created bitmaps may not be. There can be any number of - anonymous bitmaps per node. - -* The name of a user-created bitmap must not be empty (""). 
- -## Bitmap Modes - -* A Bitmap can be "frozen," which means that it is currently in-use by a backup - operation and cannot be deleted, renamed, written to, reset, - etc. - -* The normal operating mode for a bitmap is "active." - -## Basic QMP Usage - -### Supported Commands ### - -* block-dirty-bitmap-add -* block-dirty-bitmap-remove -* block-dirty-bitmap-clear - -### Creation - -* To create a new bitmap, enabled, on the drive with id=drive0: - -```json -{ "execute": "block-dirty-bitmap-add", - "arguments": { - "node": "drive0", - "name": "bitmap0" - } -} -``` - -* This bitmap will have a default granularity that matches the cluster size of - its associated drive, if available, clamped to between [4KiB, 64KiB]. - The current default for qcow2 is 64KiB. - -* To create a new bitmap that tracks changes in 32KiB segments: - -```json -{ "execute": "block-dirty-bitmap-add", - "arguments": { - "node": "drive0", - "name": "bitmap0", - "granularity": 32768 - } -} -``` - -### Deletion - -* Bitmaps that are frozen cannot be deleted. - -* Deleting the bitmap does not impact any other bitmaps attached to the same - node, nor does it affect any backups already created from this node. - -* Because bitmaps are only unique to the node to which they are attached, - you must specify the node/drive name here, too. - -```json -{ "execute": "block-dirty-bitmap-remove", - "arguments": { - "node": "drive0", - "name": "bitmap0" - } -} -``` - -### Resetting - -* Resetting a bitmap will clear all information it holds. - -* An incremental backup created from an empty bitmap will copy no data, - as if nothing has changed. - -```json -{ "execute": "block-dirty-bitmap-clear", - "arguments": { - "node": "drive0", - "name": "bitmap0" - } -} -``` - -## Transactions - -### Justification - -Bitmaps can be safely modified when the VM is paused or halted by using -the basic QMP commands. For instance, you might perform the following actions: - -1. Boot the VM in a paused state. -2. 
Create a full drive backup of drive0. -3. Create a new bitmap attached to drive0. -4. Resume execution of the VM. -5. Incremental backups are ready to be created. - -At this point, the bitmap and drive backup would be correctly in sync, -and incremental backups made from this point forward would be correctly aligned -to the full drive backup. - -This is not particularly useful if we decide we want to start incremental -backups after the VM has been running for a while, for which we will need to -perform actions such as the following: - -1. Boot the VM and begin execution. -2. Using a single transaction, perform the following operations: - * Create bitmap0. - * Create a full drive backup of drive0. -3. Incremental backups are now ready to be created. - -### Supported Bitmap Transactions - -* block-dirty-bitmap-add -* block-dirty-bitmap-clear - -The usages are identical to their respective QMP commands, but see below -for examples. - -### Example: New Incremental Backup - -As outlined in the justification, perhaps we want to create a new incremental -backup chain attached to a drive. - -```json -{ "execute": "transaction", - "arguments": { - "actions": [ - {"type": "block-dirty-bitmap-add", - "data": {"node": "drive0", "name": "bitmap0"} }, - {"type": "drive-backup", - "data": {"device": "drive0", "target": "/path/to/full_backup.img", - "sync": "full", "format": "qcow2"} } - ] - } -} -``` - -### Example: New Incremental Backup Anchor Point - -Maybe we just want to create a new full backup with an existing bitmap and -want to reset the bitmap to track the new chain. - -```json -{ "execute": "transaction", - "arguments": { - "actions": [ - {"type": "block-dirty-bitmap-clear", - "data": {"node": "drive0", "name": "bitmap0"} }, - {"type": "drive-backup", - "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", - "sync": "full", "format": "qcow2"} } - ] - } -} -``` - -## Incremental Backups - -The star of the show. 
- -**Nota Bene!** Only incremental backups of entire drives are supported for now. -So despite the fact that you can attach a bitmap to any arbitrary node, they are -only currently useful when attached to the root node. This is because -drive-backup only supports drives/devices instead of arbitrary nodes. - -### Example: First Incremental Backup - -1. Create a full backup and sync it to the dirty bitmap, as in the transactional -examples above; or with the VM offline, manually create a full copy and then -create a new bitmap before the VM begins execution. - - * Let's assume the full backup is named 'full_backup.img'. - * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. - -2. Create a destination image for the incremental backup that utilizes the -full backup as a backing image. - - * Let's assume it is named 'incremental.0.img'. - - ```sh - # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 - ``` - -3. Issue the incremental backup command: - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.0.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -### Example: Second Incremental Backup - -1. Create a new destination image for the incremental backup that points to the - previous one, e.g.: 'incremental.1.img' - - ```sh - # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 - ``` - -2. Issue a new incremental backup command. The only difference here is that we - have changed the target image below. 
- - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.1.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -## Errors - -* In the event of an error that occurs after a backup job is successfully - launched, either by a direct QMP command or a QMP transaction, the user - will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied - by a BLOCK_JOB_ERROR event. - -* In the case of a job being cancelled, the user will receive a - BLOCK_JOB_CANCELLED event instead of a pair of COMPLETED and ERROR events. - -* In either case, the incremental backup data contained within the bitmap is - safely rolled back, and the data within the bitmap is not lost. The image - file created for the failed attempt can be safely deleted. - -* Once the underlying problem is fixed (e.g. more storage space is freed up), - you can simply retry the incremental backup command with the same bitmap. - -### Example - -1. Create a target image: - - ```sh - # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 - ``` - -2. Attempt to create an incremental backup via QMP: - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.0.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -3. Receive an event notifying us of failure: - - ```json - { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, - "data": { "speed": 0, "offset": 0, "len": 67108864, - "error": "No space left on device", - "device": "drive1", "type": "backup" }, - "event": "BLOCK_JOB_COMPLETED" } - ``` - -4. Delete the failed incremental, and re-create the image. - - ```sh - # rm incremental.0.img - # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 - ``` - -5. 
Retry the command after fixing the underlying problem, - such as freeing up space on the backup volume: - - ```json - { "execute": "drive-backup", - "arguments": { - "device": "drive0", - "bitmap": "bitmap0", - "target": "incremental.0.img", - "format": "qcow2", - "sync": "incremental", - "mode": "existing" - } - } - ``` - -6. Receive confirmation that the job completed successfully: - - ```json - { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, - "data": { "device": "drive0", "type": "backup", - "speed": 0, "len": 67108864, "offset": 67108864}, - "event": "BLOCK_JOB_COMPLETED" } - ``` - -### Partial Transactional Failures - -* Sometimes, a transaction will succeed in launching and return success, - but then later the backup jobs themselves may fail. It is possible that - a management application may have to deal with a partial backup failure - after a successful transaction. - -* If multiple backup jobs are specified in a single transaction, when one of - them fails, it will not interact with the other backup jobs in any way. - -* The job(s) that succeeded will clear the dirty bitmap associated with the - operation, but the job(s) that failed will not. It is not "safe" to delete - any incremental backups that were created successfully in this scenario, - even though others failed.
- -#### Example - -* QMP example highlighting two backup jobs: - - ```json - { "execute": "transaction", - "arguments": { - "actions": [ - { "type": "drive-backup", - "data": { "device": "drive0", "bitmap": "bitmap0", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d0-incr-1.qcow2" } }, - { "type": "drive-backup", - "data": { "device": "drive1", "bitmap": "bitmap1", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d1-incr-1.qcow2" } } - ] - } - } - ``` - -* QMP example response, highlighting one success and one failure: - * Acknowledgement that the Transaction was accepted and jobs were launched: - ```json - { "return": {} } - ``` - - * Later, QEMU sends notice that the first job was completed: - ```json - { "timestamp": { "seconds": 1447192343, "microseconds": 615698 }, - "data": { "device": "drive0", "type": "backup", - "speed": 0, "len": 67108864, "offset": 67108864 }, - "event": "BLOCK_JOB_COMPLETED" - } - ``` - - * Later yet, QEMU sends notice that the second job has failed: - ```json - { "timestamp": { "seconds": 1447192399, "microseconds": 683015 }, - "data": { "device": "drive1", "action": "report", - "operation": "read" }, - "event": "BLOCK_JOB_ERROR" } - ``` - - ```json - { "timestamp": { "seconds": 1447192399, "microseconds": 685853 }, - "data": { "speed": 0, "offset": 0, "len": 67108864, - "error": "Input/output error", - "device": "drive1", "type": "backup" }, - "event": "BLOCK_JOB_COMPLETED" } - ``` - -* In the above example, "d0-incr-1.qcow2" is valid and must be kept, - but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide - incremental backup of all drives at a point-in-time is to be made, - new backups for both drives will need to be made, taking into account - that a new incremental backup for drive0 needs to be based on top of - "d0-incr-1.qcow2".
- -### Grouped Completion Mode - -* While jobs launched by transactions normally complete or fail on their own, - it is possible to instruct them to complete or fail together as a group. - -* QMP transactions take an optional properties structure that can affect - the semantics of the transaction. - -* The "completion-mode" transaction property can be either "individual" - which is the default, legacy behavior described above, or "grouped," - a new behavior detailed below. - -* Delayed Completion: In grouped completion mode, no jobs will report - success until all jobs are ready to report success. - -* Grouped failure: If any job fails in grouped completion mode, all remaining - jobs will be cancelled. Any incremental backups will restore their dirty - bitmap objects as if no backup command was ever issued. - - * Regardless of if QEMU reports a particular incremental backup job as - CANCELLED or as an ERROR, the in-memory bitmap will be restored. - -#### Example - -* Here's the same example scenario from above with the new property: - - ```json - { "execute": "transaction", - "arguments": { - "actions": [ - { "type": "drive-backup", - "data": { "device": "drive0", "bitmap": "bitmap0", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d0-incr-1.qcow2" } }, - { "type": "drive-backup", - "data": { "device": "drive1", "bitmap": "bitmap1", - "format": "qcow2", "mode": "existing", - "sync": "incremental", "target": "d1-incr-1.qcow2" } } - ], - "properties": { - "completion-mode": "grouped" - } - } - } - ``` - -* QMP example response, highlighting a failure for drive1: - * Acknowledgement that the Transaction was accepted and jobs were launched: - ```json - { "return": {} } - ``` - - * Later, QEMU sends notice that the second job has errored out, - but that the first job was also cancelled: - ```json - { "timestamp": { "seconds": 1447193702, "microseconds": 632377 }, - "data": { "device": "drive1", "action": "report", - "operation": "read" }, -
"event": "BLOCK_JOB_ERROR" } - ``` - - ```json - { "timestamp": { "seconds": 1447193702, "microseconds": 640074 }, - "data": { "speed": 0, "offset": 0, "len": 67108864, - "error": "Input/output error", - "device": "drive1", "type": "backup" }, - "event": "BLOCK_JOB_COMPLETED" } - ``` - - ```json - { "timestamp": { "seconds": 1447193702, "microseconds": 640163 }, - "data": { "device": "drive0", "type": "backup", "speed": 0, - "len": 67108864, "offset": 16777216 }, - "event": "BLOCK_JOB_CANCELLED" } - ``` - -<!-- -The FreeBSD Documentation License - -Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML, -PDF, PostScript, RTF and so forth) with or without modification, are permitted -provided that the following conditions are met: - -Redistributions of source code (Markdown) must retain the above copyright -notice, this list of conditions and the following disclaimer of this file -unmodified. - -Redistributions in compiled form (transformed to other DTDs, converted to PDF, -PostScript, RTF and other formats) must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. - -THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
---> diff --git a/docs/blkdebug.txt b/docs/blkdebug.txt deleted file mode 100644 index 43d8e8f9c6..0000000000 --- a/docs/blkdebug.txt +++ /dev/null @@ -1,162 +0,0 @@ -Block I/O error injection using blkdebug ----------------------------------------- -Copyright (C) 2014-2015 Red Hat Inc - -This work is licensed under the terms of the GNU GPL, version 2 or later. See -the COPYING file in the top-level directory. - -The blkdebug block driver is a rule-based error injection engine. It can be -used to exercise error code paths in block drivers including ENOSPC (out of -space) and EIO. - -This document gives an overview of the features available in blkdebug. - -Background ----------- -Block drivers have many error code paths that handle I/O errors. Image formats -are especially complex since metadata I/O errors during cluster allocation or -while updating tables happen halfway through request processing and require -discipline to keep image files consistent. - -Error injection allows test cases to trigger I/O errors at specific points. -This way, all error paths can be tested to make sure they are correct. - -Rules ------ -The blkdebug block driver takes a list of "rules" that tell the error injection -engine when to fail an I/O request. - -Each I/O request is evaluated against the rules. If a rule matches the request -then its "action" is executed. - -Rules can be placed in a configuration file; the configuration file -follows the same .ini-like format used by QEMU's -readconfig option, and -each section of the file represents a rule. - -The following configuration file defines a single rule: - - $ cat blkdebug.conf - [inject-error] - event = "read_aio" - errno = "28" - -This rule fails all aio read requests with ENOSPC (28). Note that the errno -value depends on the host. On Linux, see -/usr/include/asm-generic/errno-base.h for errno values. 
- -Invoke QEMU as follows: - - $ qemu-system-x86_64 \ - -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \ - -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0 - -Rules support the following attributes: - - event - which type of operation to match (e.g. read_aio, write_aio, - flush_to_os, flush_to_disk). See the "Events" section for - information on events. - - state - (optional) the engine must be in this state number in order for this - rule to match. See the "State transitions" section for information - on states. - - errno - the numeric errno value to return when a request matches this rule. - The errno values depend on the host since the numeric values are not - standardized in the POSIX specification. - - sector - (optional) a sector number that the request must overlap in order to - match this rule - - once - (optional, default "off") only execute this action on the first - matching request - - immediately - (optional, default "off") return a NULL BlockAIOCB - pointer and fail without an errno instead. This - exercises the code path where BlockAIOCB fails and the - caller's BlockCompletionFunc is not invoked. - -Events ------- -Block drivers provide information about the type of I/O request they are about -to make so rules can match specific types of requests. For example, the qcow2 -block driver tells blkdebug when it accesses the L1 table so rules can match -only L1 table accesses and not other metadata or guest data requests. - -The core events are: - - read_aio - guest data read - - write_aio - guest data write - - flush_to_os - write out unwritten block driver state (e.g. cached metadata) - - flush_to_disk - flush the host block device's disk cache - -See qapi/block-core.json:BlkdebugEvent for the full list of events. -You may need to grep block driver source code to understand the -meaning of specific events.
- -State transitions ------------------ -There are cases where more power is needed to match a particular I/O request in -a longer sequence of requests. For example: - - write_aio - flush_to_disk - write_aio - -How do we match the 2nd write_aio but not the first? This is where state -transitions come in. - -The error injection engine has an integer called the "state" that always starts -initialized to 1. The state integer is internal to blkdebug and cannot be -observed from outside but rules can interact with it for powerful matching -behavior. - -Rules can be conditional on the current state and they can transition to a new -state. - -When a rule's "state" attribute is non-zero then the current state must equal -the attribute in order for the rule to match. - -For example, to match the 2nd write_aio: - - [set-state] - event = "write_aio" - state = "1" - new_state = "2" - - [inject-error] - event = "write_aio" - state = "2" - errno = "5" - -The first write_aio request matches the set-state rule and transitions from -state 1 to state 2. Once state 2 has been entered, the set-state rule no -longer matches since it requires state 1. But the inject-error rule now -matches the next write_aio request and injects EIO (5). - -State transition rules support the following attributes: - - event - which type of operation to match (e.g. read_aio, write_aio, - flush_to_os, flush_to_disk). See the "Events" section for - information on events. - - state - (optional) the engine must be in this state number in order for this - rule to match - - new_state - transition to this state number - -Suspend and resume ------------------- -Exercising code paths in block drivers may require specific ordering amongst -concurrent requests. The "breakpoint" feature allows requests to be halted on -a blkdebug event and resumed later. This makes it possible to achieve -deterministic ordering when multiple requests are in flight. 
- -Breakpoints on blkdebug events are associated with a user-defined "tag" string. -This tag serves as an identifier by which the request can be resumed at a later -point. - -See the qemu-io(1) break, resume, remove_break, and wait_break commands for -details. diff --git a/docs/blkverify.txt b/docs/blkverify.txt deleted file mode 100644 index d556dc4e6d..0000000000 --- a/docs/blkverify.txt +++ /dev/null @@ -1,69 +0,0 @@ -= Block driver correctness testing with blkverify = - -== Introduction == - -This document describes how to use the blkverify protocol to test that a block -driver is operating correctly. - -It is difficult to test and debug block drivers against real guests. Often -processes inside the guest will crash because corrupt sectors were read as part -of the executable. Other times obscure errors are raised by a program inside -the guest. These issues are extremely hard to trace back to bugs in the block -driver. - -Blkverify solves this problem by catching data corruption inside QEMU the first -time bad data is read and reporting the disk sector that is corrupted. - -== How it works == - -The blkverify protocol has two child block devices, the "test" device and the -"raw" device. Read/write operations are mirrored to both devices so their -state should always be in sync. - -The "raw" device is a raw image, a flat file, that has identical starting -contents to the "test" image. The idea is that the "raw" device will handle -read/write operations correctly and not corrupt data. It can be used as a -reference for comparison against the "test" device. - -After a mirrored read operation completes, blkverify will compare the data and -raise an error if it is not identical. This makes it possible to catch the -first instance where corrupt data is read. - -== Example == - -Imagine raw.img has 0xcd repeated throughout its first sector: - - $ ./qemu-io -c 'read -v 0 512' raw.img - 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ 
- 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - [...] - 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ - read 512/512 bytes at offset 0 - 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec) - -And test.img is corrupt, its first sector is zeroed when it shouldn't be: - - $ ./qemu-io -c 'read -v 0 512' test.img - 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - [...] - 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ - read 512/512 bytes at offset 0 - 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec) - -This error is caught by blkverify: - - $ ./qemu-io -c 'read 0 512' blkverify:raw.img:test.img - blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0 - -A more realistic scenario is verifying the installation of a guest OS: - - $ ./qemu-img create raw.img 16G - $ ./qemu-img create -f qcow2 test.qcow2 16G - $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \ - -drive file=blkverify:raw.img:test.qcow2 - -If the installation is aborted when blkverify detects corruption, use qemu-io -to explore the contents of the disk image at the sector in question. diff --git a/docs/build-system.txt b/docs/build-system.txt deleted file mode 100644 index 2af1e668c5..0000000000 --- a/docs/build-system.txt +++ /dev/null @@ -1,512 +0,0 @@ - The QEMU build system architecture - ================================== - -This document aims to help developers understand the architecture of the -QEMU build system.
As with projects using GNU autotools, the QEMU build -system has two stages, first the developer runs the "configure" script -to determine the local build environment characteristics, then they run -"make" to build the project. That is about where the similarities with -GNU autotools end, so try to forget what you know about them. - - -Stage 1: configure -================== - -The QEMU configure script is written directly in shell, and should be -compatible with any POSIX shell, hence it uses #!/bin/sh. An important -implication of this is that it is important to avoid using bash-isms, -even on development platforms where bash is the default shell. - -In contrast to autoconf scripts, QEMU's configure is expected to be -silent while it is checking for features. It will only display output -when an error occurs, or to show the final feature enablement summary -on completion. - -Adding new checks to the configure script usually comprises the -following tasks: - - - Initialize one or more variables with the default feature state. - - Ideally features should auto-detect whether they are present, - so try to avoid hardcoding the initial state to either enabled - or disabled, as that forces the user to pass a --enable-XXX - / --disable-XXX flag on every invocation of configure. - - - Add support to the command line arg parser to handle any new - --enable-XXX / --disable-XXX flags required by the feature XXX. - - - Add information to the help output message to report on the new - feature flag. - - - Add code to perform the actual feature check. As noted above, try to - be fully dynamic in checking enablement/disablement. - - - Add code to print out the feature status in the configure summary - upon completion. - - - Add any new makefile variables to $config_host_mak on completion. - - -Taking (a simplified version of) the probe for gnutls from configure, -we have the following pieces: - - # Initial variable state - gnutls="" - - ..snip..
- - # Configure flag processing - --disable-gnutls) gnutls="no" - ;; - --enable-gnutls) gnutls="yes" - ;; - - ..snip.. - - # Help output feature message - gnutls GNUTLS cryptography support - - ..snip.. - - # Test for gnutls - if test "$gnutls" != "no"; then - if $pkg_config --exists "gnutls"; then - gnutls_cflags=`$pkg_config --cflags gnutls` - gnutls_libs=`$pkg_config --libs gnutls` - libs_softmmu="$gnutls_libs $libs_softmmu" - libs_tools="$gnutls_libs $libs_tools" - QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags" - gnutls="yes" - elif test "$gnutls" = "yes"; then - feature_not_found "gnutls" "Install gnutls devel" - else - gnutls="no" - fi - fi - - ..snip.. - - # Completion feature summary - echo "GNUTLS support $gnutls" - - ..snip.. - - # Define make variables - if test "$gnutls" = "yes" ; then - echo "CONFIG_GNUTLS=y" >> $config_host_mak - fi - - -Helper functions ----------------- - -The configure script provides a variety of helper functions to assist -developers in checking for system features: - - - do_cc $ARGS... - - Attempt to run the system C compiler passing it $ARGS... - - - do_cxx $ARGS... - - Attempt to run the system C++ compiler passing it $ARGS... - - - compile_object $CFLAGS - - Attempt to compile a test program with the system C compiler using - $CFLAGS. The test program must have been previously written to a file - called $TMPC. - - - compile_prog $CFLAGS $LDFLAGS - - Attempt to compile a test program with the system C compiler using - $CFLAGS and link it with the system linker using $LDFLAGS. The test - program must have been previously written to a file called $TMPC. - - - has $COMMAND - - Determine if $COMMAND exists in the current environment, either as a - shell builtin, or executable binary, returning 0 on success. - - - path_of $COMMAND - - Return the fully qualified path of $COMMAND, printing it to stdout, - and returning 0 on success.
- - - check_define $NAME - - Determine if the macro $NAME is defined by the system C compiler - - - check_include $NAME - - Determine if the include $NAME file is available to the system C - compiler - - - write_c_skeleton - - Write a minimal C program main() function to the temporary file - indicated by $TMPC - - - feature_not_found $NAME $REMEDY - - Print a message to stderr that the feature $NAME was not available - on the system, suggesting the user try $REMEDY to address the - problem. - - - error_exit $MESSAGE $MORE... - - Print $MESSAGE to stderr, followed by $MORE... and then exit from the - configure script with non-zero status - - - query_pkg_config $ARGS... - - Run pkg-config passing it $ARGS. If QEMU is doing a static build, - then --static will be automatically added to $ARGS - - -Stage 2: makefiles -================== - -The use of GNU make is required with the QEMU build system. - -Although the source code is spread across multiple subdirectories, the -build system should be considered largely non-recursive in nature, in -contrast to common practices seen with automake. There is some recursive -invocation of make, but this is related to the things being built, -rather than the source directory structure. - -QEMU currently supports both VPATH and non-VPATH builds, so there are -three general ways to invoke configure & perform a build. - - - VPATH, build artifacts outside of QEMU source tree entirely - - cd ../ - mkdir build - cd build - ../qemu/configure - make - - - VPATH, build artifacts in a subdir of QEMU source tree - - mkdir build - cd build - ../configure - make - - - non-VPATH, build artifacts everywhere - - ./configure - make - -The QEMU maintainers generally recommend that a VPATH build is used by -developers. Patches to QEMU are expected to ensure VPATH build still -works. 
- - -Module structure ----------------- - -There are a number of key outputs of the QEMU build system: - - - Tools - qemu-img, qemu-nbd, qga (guest agent), etc - - System emulators - qemu-system-$ARCH - - Userspace emulators - qemu-$ARCH - - Unit tests - -The source code is highly modularized, split across many files to -facilitate building of all of these components with as little duplicated -compilation as possible. There can be considered to be two distinct -groups of files, those which are independent of the QEMU emulation -target and those which are dependent on the QEMU emulation target. - -In the target-independent set lives various general purpose helper code, -such as error handling infrastructure, standard data structures, -platform portability wrapper functions, etc. This code can be compiled -once only and the .o files linked into all output binaries. - -In the target-dependent set lives CPU emulation, device emulation and -much glue code. This sometimes also has to be compiled multiple times, -once for each target being built. - -The utility code that is used by all binaries is built into a -static archive called libqemuutil.a, which is then linked to all the -binaries. In order to provide hooks that are only needed by some of the -binaries, code in libqemuutil.a may depend on other functions that are -not fully implemented by all QEMU binaries. To deal with this there is a -second library called libqemustub.a which provides dummy stubs for all -these functions. These will get lazy linked into the binary if the real -implementation is not present. In this way, the libqemustub.a static -library can be thought of as a portable implementation of the weak -symbols concept. All binaries should link to both libqemuutil.a and -libqemustub.a. e.g. - - qemu-img$(EXESUF): qemu-img.o ..snip.. 
libqemuutil.a libqemustub.a - - -Windows platform portability ----------------------------- - -On Windows, all binaries have the suffix '.exe', so all Makefile rules -which create binaries must include the $(EXESUF) variable on the binary -name. e.g. - - qemu-img$(EXESUF): qemu-img.o ..snip.. - -This expands to '.exe' on Windows, or '' on other platforms. - -A further complication for the system emulator binaries is that -two separate binaries need to be generated. - -The main binary (e.g. qemu-system-x86_64.exe) is linked against the -Windows console runtime subsystem. These are expected to be run from a -command prompt window, and so will print stderr to the console that -launched them. - -The second binary generated has a 'w' on the end of its name (e.g. -qemu-system-x86_64w.exe) and is linked against the Windows graphical -runtime subsystem. These are expected to be run directly from the -desktop and will open up a dedicated console window for stderr output. - -The Makefile.target will generate the binary for the graphical subsystem -first, and then use objcopy to relink it against the console subsystem -to generate the second binary. - - -Object variable naming ----------------------- - -The QEMU convention is to define variables to list different groups of -object files. These are named with the convention $PREFIX-obj-y. For -example the libqemuutil.a file will be linked with all objects listed -in a variable 'util-obj-y'. So, for example, util/Makefile.objs will -contain a set of definitions looking like - - util-obj-y += bitmap.o bitops.o hbitmap.o - util-obj-y += fifo8.o - util-obj-y += acl.o - util-obj-y += error.o qemu-error.o - -When there is an object file which needs to be conditionally built based -on some characteristic of the host system, the configure script will -define a variable for the conditional. For example, on Windows it will -define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a -value of 'y'.
It is now possible to use the config variables when -listing object files. For example, - - util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o - util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o - -On Windows this expands to - - util-obj-y += oslib-win32.o qemu-thread-win32.o - util-obj-n += oslib-posix.o qemu-thread-posix.o - -Since libqemuutil.a links in $(util-obj-y), the POSIX specific files -listed against $(util-obj-n) are ignored on the Windows platform builds. - - -CFLAGS / LDFLAGS / LIBS handling --------------------------------- - -There are many different binaries being built with differing purposes, -and some of them might even be 3rd party libraries pulled in via git -submodules. As such the use of the global CFLAGS variable is generally -avoided in QEMU, since it would apply to too many build targets. - -Flags that are needed by any QEMU code (i.e. everything *except* GIT -submodule projects) are put in $(QEMU_CFLAGS) variable. For linker -flags the $(LIBS) variable is sometimes used, but a couple of more -targeted variables are preferred. $(libs_softmmu) is used for -libraries that must be linked to system emulator targets, $(LIBS_TOOLS) -is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used -for the QEMU guest agent. There is currently no specific variable for -the userspace emulator targets as the global $(LIBS), or more targeted -variables shown below, are sufficient. - -In addition to these variables, it is possible to provide cflags and -libs against individual source code files, by defining variables of the -form $FILENAME-cflags and $FILENAME-libs. For example, the curl block -driver needs to link to the libcurl library, so block/Makefile.objs defines -some variables: - - curl.o-cflags := $(CURL_CFLAGS) - curl.o-libs := $(CURL_LIBS) - -The scope is a little different between the two variables.
The libs get -used when linking any target binary that includes the curl.o object -file, while the cflags get used when compiling the curl.c file only. - - -Statically defined files ------------------------- - -The following key files are statically defined in the source tree, with -the rules needed to build QEMU. Their behaviour is influenced by a -number of dynamically created files listed later. - -- Makefile - -The main entry point used when invoking make to build all the components -of QEMU. The default 'all' target will naturally result in the build of -every component. The various tools and helper binaries are built -directly via a non-recursive set of rules. - -Each system/userspace emulation target needs to have a slightly -different set of make rules / variables. Thus, make will be recursively -invoked for each of the emulation targets. - -The recursive invocation will end up processing the toplevel -Makefile.target file (more on that later). - - -- */Makefile.objs - -Since the source code is spread across multiple directories, the rules -for each file are similarly modularized. Thus each subdirectory -containing .c files will usually also contain a Makefile.objs file. -These files are not directly invoked by a recursive make, but instead -they are imported by the top level Makefile and/or Makefile.target - -Each Makefile.objs usually just declares a set of variables listing the -.o files that need building from the source files in the directory. They -will also define any custom linker or compiler flags. For example in -block/Makefile.objs - - block-obj-$(CONFIG_LIBISCSI) += iscsi.o - block-obj-$(CONFIG_CURL) += curl.o - - ..snip... - - iscsi.o-cflags := $(LIBISCSI_CFLAGS) - iscsi.o-libs := $(LIBISCSI_LIBS) - curl.o-cflags := $(CURL_CFLAGS) - curl.o-libs := $(CURL_LIBS) - -If there are any rules defined in the Makefile.objs file, they should -all use $(obj) as a prefix to the target, e.g. 
- - $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp - - -- Makefile.target - -This file provides the entry point used to build each individual system -or userspace emulator target. Each enabled target has its own -subdirectory. For example if configure is run with the argument -'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmmu' -will be created, containing a 'Makefile' which symlinks back to -Makefile.target - -So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up -using Makefile.target for the build rules. - - -- rules.mak - -This file provides the generic helper rules for invoking build tools, in -particular the compiler and linker. This also contains the magic (hairy) -'unnest-vars' function which is used to merge the variable definitions -from all Makefile.objs in the source tree down into the main Makefile -context. - - -- default-configs/*.mak - -The files under default-configs/ control what emulated hardware is built -into each QEMU system and userspace emulator targets. They merely -contain a long list of config variable definitions. For example, -default-configs/x86_64-softmmu.mak has: - - include pci.mak - include sound.mak - include usb.mak - CONFIG_QXL=$(CONFIG_SPICE) - CONFIG_VGA_ISA=y - CONFIG_VGA_CIRRUS=y - CONFIG_VMWARE_VGA=y - CONFIG_VIRTIO_VGA=y - ...snip... - -These files rarely need changing unless new devices / hardware need to -be enabled for a particular system/userspace emulation target - - -- tests/Makefile - -Rules for building the unit tests. This file is included directly by the -top level Makefile, so anything defined in this file will influence the -entire build system. Care needs to be taken when writing rules for tests -to ensure they only apply to the unit test execution / build. - -- tests/docker/Makefile.include - -Rules for Docker tests.
Like tests/Makefile, this file is included -directly by the top level Makefile, anything defined in this file will -influence the entire build system. - -- po/Makefile - -Rules for building and installing the binary message catalogs from the -text .po file sources. This almost never needs changing for any reason. - - -Dynamically created files -------------------------- - -The following files are generated dynamically by configure in order to -control the behaviour of the statically defined makefiles. This avoids -the need for QEMU makefiles to go through any pre-processing as seen -with autotools, where Makefile.am generates Makefile.in which generates -Makefile. - - -- config-host.mak - -When configure has determined the characteristics of the build host it -will write a long list of variables to config-host.mak file. This -provides the various install directories, compiler / linker flags and a -variety of CONFIG_* variables related to optionally enabled features. -This is imported by the top level Makefile in order to tailor the build -output. - -The variables defined here are those which are applicable to all QEMU -build outputs. Variables which are potentially different for each -emulator target are defined by the next file... - -It is also used as a dependency checking mechanism. If make sees that -the modification timestamp on configure is newer than that on -config-host.mak, then configure will be re-run. - - -- config-host.h - -The config-host.h file is used by source code to determine what features -are enabled. It is generated from the contents of config-host.mak using -the scripts/create_config program. This extracts all the CONFIG_* variables, -most of the HOST_* variables and a few other misc variables from -config-host.mak, formatting them as C preprocessor macros. - - -- $TARGET-NAME/config-target.mak - -TARGET-NAME is the name of a system or userspace emulator, for example, -x86_64-softmmu denotes the system emulator for the x86_64 architecture. 
-This file contains the variables which need to vary on a per-target -basis. For example, it will indicate whether KVM or Xen are enabled for -the target and any other potential custom libraries needed for linking -the target. - - -- $TARGET-NAME/config-devices.mak - -TARGET-NAME is again the name of a system or userspace emulator. The -config-devices.mak file is automatically generated by make using the -scripts/make_device_config.sh program, feeding it the -default-configs/$TARGET-NAME file as input. - - -- $TARGET-NAME/Makefile - -This is the entrypoint used when make recurses to build a single system -or userspace emulator target. It is merely a symlink back to the -Makefile.target in the top level. diff --git a/docs/config/ich9-ehci-uhci.cfg b/docs/config/ich9-ehci-uhci.cfg new file mode 100644 index 0000000000..a0e9b96f4d --- /dev/null +++ b/docs/config/ich9-ehci-uhci.cfg @@ -0,0 +1,37 @@ +########################################################################### +# +# You can pass this file directly to qemu using the -readconfig +# command line switch. +# +# This config file creates an EHCI adapter with companion UHCI +# controllers as a multifunction device in PCI slot "1d". +# +# Specify "bus=ehci.0" when creating usb devices to hook them up +# there.
+# + +[device "ehci"] + driver = "ich9-usb-ehci1" + addr = "1d.7" + multifunction = "on" + +[device "uhci-1"] + driver = "ich9-usb-uhci1" + addr = "1d.0" + multifunction = "on" + masterbus = "ehci.0" + firstport = "0" + +[device "uhci-2"] + driver = "ich9-usb-uhci2" + addr = "1d.1" + multifunction = "on" + masterbus = "ehci.0" + firstport = "2" + +[device "uhci-3"] + driver = "ich9-usb-uhci3" + addr = "1d.2" + multifunction = "on" + masterbus = "ehci.0" + firstport = "4" diff --git a/docs/config/mach-virt-graphical.cfg b/docs/config/mach-virt-graphical.cfg new file mode 100644 index 0000000000..0fdf6846dd --- /dev/null +++ b/docs/config/mach-virt-graphical.cfg @@ -0,0 +1,281 @@ +# mach-virt - VirtIO guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-aarch64 \ +# -nodefaults \ +# -readconfig mach-virt-graphical.cfg \ +# -cpu host +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals, +# such as the PL011 UART, plus a PCI Express Root Bus; the +# user will then have to explicitly add further devices. +# +# The PCI Express Root Bus shows up in the guest as: +# +# 00:00.0 Host bridge +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:01.0 Display controller +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# 03:00.0 USB controller +# +# More information about these devices is available below. 
+ + +# Machine options +# ========================================================= +# +# We use the virt machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line, but we can configure the guest to use the +# same GIC version as the host. + +[machine] + type = "virt" + accel = "kvm" + gic-version = "host" + +[memory] + size = "1024" + + +# Firmware configuration +# ========================================================= +# +# There are two parts to the firmware: a read-only image +# containing the executable code, which is shared between +# guests, and a read/write variable store that is owned +# by one specific guest, exclusively, and is used to +# record information such as the UEFI boot order. +# +# For any new guest, its permanent, private variable store +# should initially be copied from the template file +# provided along with the firmware binary. +# +# Depending on the OS distribution you're using on the +# host, the name of the package containing the firmware +# binary and variable store template, as well as the paths +# to the files themselves, will be different. 
For example: +# +# Fedora +# edk2-aarch64 (pkg) +# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) +# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) +# +# RHEL +# AAVMF (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) +# +# Debian/Ubuntu +# qemu-efi (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) + +[drive "uefi-binary"] + file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "0" + readonly = "on" + +[drive "uefi-varstore"] + file = "guest_VARS.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "1" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we 
can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been successfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" + + +# USB controller (and input devices) +# ========================================================= +# +# We add a virtualization-friendly USB 3.0 controller and +# a USB keyboard / USB tablet combo so that graphical +# guests can be controlled appropriately.
+ +[device "usb"] + driver = "nec-usb-xhci" + bus = "pcie.3" + addr = "00.0" + +[device "keyboard"] + driver = "usb-kbd" + bus = "usb.0" + +[device "tablet"] + driver = "usb-tablet" + bus = "usb.0" + + +# Display controller +# ========================================================= +# +# We use virtio-gpu because the legacy VGA framebuffer is +# very troublesome on aarch64, and virtio-gpu is the only +# video device that doesn't implement it. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "virtio-gpu" + bus = "pcie.0" + addr = "01.0" diff --git a/docs/config/mach-virt-serial.cfg b/docs/config/mach-virt-serial.cfg new file mode 100644 index 0000000000..aee9f1c5a1 --- /dev/null +++ b/docs/config/mach-virt-serial.cfg @@ -0,0 +1,243 @@ +# mach-virt - VirtIO guest (serial console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-aarch64 \ +# -nodefaults \ +# -readconfig mach-virt-serial.cfg \ +# -display none -serial mon:stdio \ +# -cpu host +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through the serial console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals, +# such as the PL011 UART, plus a PCI Express Root Bus; the +# user will then have to explicitly add further devices. 
+# +# The PCI Express Root Bus shows up in the guest as: +# +# 00:00.0 Host bridge +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# +# More information about these devices is available below. +# +# We use '-display none' to prevent QEMU from creating a +# graphical display window, which would serve no use in +# this specific configuration, and '-serial mon:stdio' to +# multiplex the guest's serial console and the QEMU monitor +# to the host's stdio; use 'Ctrl+A h' to learn how to +# switch between the two and more. + + +# Machine options +# ========================================================= +# +# We use the virt machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line, but we can configure the guest to use the +# same GIC version as the host. + +[machine] + type = "virt" + accel = "kvm" + gic-version = "host" + +[memory] + size = "1024" + + +# Firmware configuration +# ========================================================= +# +# There are two parts to the firmware: a read-only image +# containing the executable code, which is shared between +# guests, and a read/write variable store that is owned +# by one specific guest, exclusively, and is used to +# record information such as the UEFI boot order. +# +# For any new guest, its permanent, private variable store +# should initially be copied from the template file +# provided along with the firmware binary.
+# +# Depending on the OS distribution you're using on the +# host, the name of the package containing the firmware +# binary and variable store template, as well as the paths +# to the files themselves, will be different. For example: +# +# Fedora +# edk2-aarch64 (pkg) +# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) +# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) +# +# RHEL +# AAVMF (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) +# +# Debian/Ubuntu +# qemu-efi (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) + +[drive "uefi-binary"] + file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "0" + readonly = "on" + +[drive "uefi-varstore"] + file = "guest_VARS.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "1" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. 
+ +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. 
+ +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" diff --git a/docs/config/q35-emulated.cfg b/docs/config/q35-emulated.cfg new file mode 100644 index 0000000000..c6416d6545 --- /dev/null +++ b/docs/config/q35-emulated.cfg @@ -0,0 +1,288 @@ +# q35 - Emulated guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-emulated.cfg +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of emulated devices that +# closely resembles that of a physical machine, and will be +# accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. 
+# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of devices that +# are pretty much guaranteed to be present in every single +# physical machine based on q35, more specifically: +# +# 00:01.0 VGA compatible controller +# 00:19.0 Ethernet controller +# 00:1a.* USB controller (#2) +# 00:1b.0 Audio device +# 00:1c.* PCI bridge (PCI Express Root Ports) +# 00:1d.* USB Controller (#1) +# 00:1e.0 PCI bridge (legacy PCI bridge) +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We add four PCI Express Root Ports, all sharing the same +# slot on the PCI Express Root Bus. These ports support +# hotplug. 
+ +[device "ich9-pcie-port-1"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + +[device "ich9-pcie-port-2"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "ich9-pcie-port-3"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "ich9-pcie-port-4"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + + +# PCI bridge (legacy PCI bridge) +# ========================================================= +# +# This bridge can be used to build an independent topology +# for legacy PCI devices. PCI Express devices should be +# plugged into PCI Express slots instead, so ideally there +# will be no devices connected to this bridge. + +[device "ich9-pci-bridge"] + driver = "i82801b11-bridge" + bus = "pcie.0" + addr = "1e.0" + + +# SATA storage +# ========================================================= +# +# An implicit SATA controller is created automatically for +# every single q35 guest; here we create a disk, backed by +# a qcow2 disk image on the host's filesystem, and attach +# it to that controller so that the guest can use it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. 
+ +[device "sata-disk"] + driver = "ide-hd" + bus = "ide.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "sata-optical-disk"] + driver = "ide-cd" + bus = "ide.1" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# USB controller (#1) +# ========================================================= +# +# EHCI controller + UHCI companion controllers. + +[device "ich9-ehci-1"] + driver = "ich9-usb-ehci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.7" + +[device "ich9-uhci-1"] + driver = "ich9-usb-uhci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.0" + masterbus = "ich9-ehci-1.0" + firstport = "0" + +[device "ich9-uhci-2"] + driver = "ich9-usb-uhci2" + multifunction = "on" + bus = "pcie.0" + addr = "1d.1" + masterbus = "ich9-ehci-1.0" + firstport = "2" + +[device "ich9-uhci-3"] + driver = "ich9-usb-uhci3" + multifunction = "on" + bus = "pcie.0" + addr = "1d.2" + masterbus = "ich9-ehci-1.0" + firstport = "4" + + +# USB controller (#2) +# ========================================================= +# +# EHCI controller + UHCI companion controllers. 
+ +[device "ich9-ehci-2"] + driver = "ich9-usb-ehci2" + multifunction = "on" + bus = "pcie.0" + addr = "1a.7" + +[device "ich9-uhci-4"] + driver = "ich9-usb-uhci4" + multifunction = "on" + bus = "pcie.0" + addr = "1a.0" + masterbus = "ich9-ehci-2.0" + firstport = "0" + +[device "ich9-uhci-5"] + driver = "ich9-usb-uhci5" + multifunction = "on" + bus = "pcie.0" + addr = "1a.1" + masterbus = "ich9-ehci-2.0" + firstport = "2" + +[device "ich9-uhci-6"] + driver = "ich9-usb-uhci6" + multifunction = "on" + bus = "pcie.0" + addr = "1a.2" + masterbus = "ich9-ehci-2.0" + firstport = "4" + + +# Ethernet controller +# ========================================================= +# +# We add a Gigabit Ethernet interface to the guest; on the +# host side, we take advantage of user networking so that +# the QEMU process doesn't require any additional +# privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "e1000" + netdev = "hostnet" + bus = "pcie.0" + addr = "19.0" + + +# VGA compatible controller +# ========================================================= +# +# We use stdvga instead of Cirrus as it supports more video +# modes and is closer to what actual hardware looks like. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "VGA" + bus = "pcie.0" + addr = "01.0" + + +# Audio device +# ========================================================= +# +# The sound card is a legacy PCI device that is plugged +# directly into the PCI Express Root Bus. 
+ +[device "ich9-hda-audio"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + +[device "ich9-hda-duplex"] + driver = "hda-duplex" + bus = "ich9-hda-audio.0" + cad = "0" diff --git a/docs/config/q35-virtio-graphical.cfg b/docs/config/q35-virtio-graphical.cfg new file mode 100644 index 0000000000..28bde2fc57 --- /dev/null +++ b/docs/config/q35-virtio-graphical.cfg @@ -0,0 +1,248 @@ +# q35 - VirtIO guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-virtio-graphical.cfg +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:01.0 VGA compatible controller +# 00:1b.0 Audio device +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# 03:00.0 USB controller +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. 
+# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. 
If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" + + +# USB controller (and input devices) +# ========================================================= +# +# We add a virtualization-friendly USB 3.0 controller and +# a USB tablet so that graphical guests can be controlled +# appropriately. A USB keyboard is not needed, as q35 +# guests get a PS/2 one added automatically. + +[device "usb"] + driver = "nec-usb-xhci" + bus = "pcie.3" + addr = "00.0" + +[device "tablet"] + driver = "usb-tablet" + bus = "usb.0" + + +# VGA compatible controller +# ========================================================= +# +# We plug the QXL video card directly into the PCI Express +# Root Bus as it is a legacy PCI device; this way, we can +# reduce the number of PCI Express controllers in the +# guest. 
+# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "qxl-vga" + bus = "pcie.0" + addr = "01.0" + + +# Audio device +# ========================================================= +# +# Like the video card, the sound card is a legacy PCI +# device and as such can be plugged directly into the PCI +# Express Root Bus. + +[device "sound"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + +[device "duplex"] + driver = "hda-duplex" + bus = "sound.0" + cad = "0" diff --git a/docs/config/q35-virtio-serial.cfg b/docs/config/q35-virtio-serial.cfg new file mode 100644 index 0000000000..c33c9cc07a --- /dev/null +++ b/docs/config/q35-virtio-serial.cfg @@ -0,0 +1,193 @@ +# q35 - VirtIO guest (serial console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-virtio-serial.cfg \ +# -display none -serial mon:stdio +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through the serial console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. 
+# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# +# More information about these devices is available below. +# +# We use '-display none' to prevent QEMU from creating a +# graphical display window, which would serve no use in +# this specific configuration, and '-serial mon:stdio' to +# multiplex the guest's serial console and the QEMU monitor +# to the host's stdio; use 'Ctrl+A h' to learn how to +# switch between the two and more. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug.
+ +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been successfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below.
+ +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" diff --git a/docs/devel/atomics.txt b/docs/devel/atomics.txt new file mode 100644 index 0000000000..3ef5d85b1b --- /dev/null +++ b/docs/devel/atomics.txt @@ -0,0 +1,388 @@ +CPUs perform independent memory operations effectively in random order, +but this can be a problem for CPU-CPU interaction (including interactions +between QEMU and the guest). Multi-threaded programs use various tools +to instruct the compiler and the CPU to restrict the order to something +that is consistent with the expectations of the programmer. + +The most basic tool is locking. Mutexes, condition variables and +semaphores are used in QEMU, and should be the default approach to +synchronization. Anything else is considerably harder, but it's +also justified more often than one would like. The two tools that +are provided by qemu/atomic.h are memory barriers and atomic operations. 
+ +Macros defined by qemu/atomic.h fall in three camps: + +- compiler barriers: barrier(); + +- weak atomic access and manual memory barriers: atomic_read(), + atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(), + smp_mb_release(), smp_read_barrier_depends(); + +- sequentially consistent atomic access: everything else. + + +COMPILER MEMORY BARRIER +======================= + +barrier() prevents the compiler from moving the memory accesses either +side of it to the other side. The compiler barrier has no direct effect +on the CPU, which may then reorder things however it wishes. + +barrier() is mostly used within qemu/atomic.h itself. On some +architectures, CPU guarantees are strong enough that blocking compiler +optimizations already ensures the correct order of execution. In this +case, qemu/atomic.h will reduce stronger memory barriers to simple +compiler barriers. + +Still, barrier() can be useful when writing code that can be interrupted +by signal handlers. + + +SEQUENTIALLY CONSISTENT ATOMIC ACCESS +===================================== + +Most of the operations in the qemu/atomic.h header ensure *sequential +consistency*, where "the result of any execution is the same as if the +operations of all the processors were executed in some sequential order, +and the operations of each individual processor appear in this sequence +in the order specified by its program". 
+ +qemu/atomic.h provides the following set of atomic read-modify-write +operations: + + void atomic_inc(ptr) + void atomic_dec(ptr) + void atomic_add(ptr, val) + void atomic_sub(ptr, val) + void atomic_and(ptr, val) + void atomic_or(ptr, val) + + typeof(*ptr) atomic_fetch_inc(ptr) + typeof(*ptr) atomic_fetch_dec(ptr) + typeof(*ptr) atomic_fetch_add(ptr, val) + typeof(*ptr) atomic_fetch_sub(ptr, val) + typeof(*ptr) atomic_fetch_and(ptr, val) + typeof(*ptr) atomic_fetch_or(ptr, val) + typeof(*ptr) atomic_xchg(ptr, val) + typeof(*ptr) atomic_cmpxchg(ptr, old, new) + +all of which return the old value of *ptr. These operations are +polymorphic; they operate on any type that is as wide as an int. + +Sequentially consistent loads and stores can be done using: + + atomic_fetch_add(ptr, 0) for loads + atomic_xchg(ptr, val) for stores + +However, they are quite expensive on some platforms, notably POWER and +ARM. Therefore, qemu/atomic.h provides two primitives with slightly +weaker constraints: + + typeof(*ptr) atomic_mb_read(ptr) + void atomic_mb_set(ptr, val) + +The semantics of these primitives map to Java volatile variables, +and are strongly related to memory barriers as used in the Linux +kernel (see below). + +As long as you use atomic_mb_read and atomic_mb_set, accesses cannot +be reordered with each other, and it is also not possible to reorder +"normal" accesses around them. + +However, and this is the important difference between +atomic_mb_read/atomic_mb_set and sequential consistency, it is important +for both threads to access the same volatile variable. It is not the +case that everything visible to thread A when it writes volatile field f +becomes visible to thread B after it reads volatile field g. The store +and load have to "match" (i.e., be performed on the same volatile +field) to achieve the right semantics. + + +These operations operate on any type that is as wide as an int or smaller. 
+ + +WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS +============================================= + +Compared to sequentially consistent atomic access, programming with +weaker consistency models can be considerably more complicated. +In general, if the algorithm you are writing includes both writes +and reads on the same side, it is generally simpler to use sequentially +consistent primitives. + +When using this model, variables are accessed with atomic_read() and +atomic_set(), and restrictions on the ordering of accesses are enforced +using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(), +smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends(). + +atomic_read() and atomic_set() prevent the compiler from using +optimizations that might otherwise optimize accesses out of existence +on the one hand, or that might create unsolicited accesses on the other. +In general this should not have any effect, because the same compiler +barriers are already implied by memory barriers. However, it is useful +to do so, because it tells readers which variables are shared with +other threads, and which are local to the current thread or protected +by other, more mundane means. + +Memory barriers control the order of references to shared memory. +They come in six kinds: + +- smp_rmb() guarantees that all the LOAD operations specified before + the barrier will appear to happen before all the LOAD operations + specified after the barrier with respect to the other components of + the system. + + In other words, smp_rmb() puts a partial ordering on loads, but is not + required to have any effect on stores. + +- smp_wmb() guarantees that all the STORE operations specified before + the barrier will appear to happen before all the STORE operations + specified after the barrier with respect to the other components of + the system. + + In other words, smp_wmb() puts a partial ordering on stores, but is not + required to have any effect on loads. 
+ +- smp_mb_acquire() guarantees that all the LOAD operations specified before + the barrier will appear to happen before all the LOAD or STORE operations + specified after the barrier with respect to the other components of + the system. + +- smp_mb_release() guarantees that all the STORE operations specified *after* + the barrier will appear to happen after all the LOAD or STORE operations + specified *before* the barrier with respect to the other components of + the system. + +- smp_mb() guarantees that all the LOAD and STORE operations specified + before the barrier will appear to happen before all the LOAD and + STORE operations specified after the barrier with respect to the other + components of the system. + + smp_mb() puts a partial ordering on both loads and stores. It is + stronger than both a read and a write memory barrier; it implies both + smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs + coming before the barrier from overtaking LOADs coming after the + barrier and vice versa. + +- smp_read_barrier_depends() is a weaker kind of read barrier. On + most processors, whenever two loads are performed such that the + second depends on the result of the first (e.g., the first load + retrieves the address to which the second load will be directed), + the processor will guarantee that the first LOAD will appear to happen + before the second with respect to the other components of the system. + However, this is not always true---for example, it was not true on + Alpha processors. Whenever this kind of access happens to shared + memory (that is not protected by a lock), a read barrier is needed, + and smp_read_barrier_depends() can be used instead of smp_rmb(). + + Note that the first load really has to have a _data_ dependency and not + a control dependency. 
If the address for the second load is dependent + on the first load, but the dependency is through a conditional rather + than actually loading the address itself, then it's a _control_ + dependency and a full read barrier or better is required. + + +This is the set of barriers that is required *between* two atomic_read() +and atomic_set() operations to achieve sequential consistency: + + | 2nd operation | + |-----------------------------------------------| + 1st operation | (after last) | atomic_read | atomic_set | + ---------------+----------------+-------------+----------------| + (before first) | | none | smp_mb_release | + ---------------+----------------+-------------+----------------| + atomic_read | smp_mb_acquire | smp_rmb* | ** | + ---------------+----------------+-------------+----------------| + atomic_set | none | smp_mb()*** | smp_wmb() | + ---------------+----------------+-------------+----------------| + + * Or smp_read_barrier_depends(). + + ** This requires a load-store barrier. This is achieved by + either smp_mb_acquire() or smp_mb_release(). + + *** This requires a store-load barrier. On most machines, the only + way to achieve this is a full barrier. + + +You can see that the two possible definitions of atomic_mb_read() +and atomic_mb_set() are the following: + + 1) atomic_mb_read(p) = atomic_read(p); smp_mb_acquire() + atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb() + + 2) atomic_mb_read(p) = smp_mb(); atomic_read(p); smp_mb_acquire() + atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); + +Usually the former is used, because smp_mb() is expensive and a program +normally has more reads than writes. Therefore it makes more sense to +make atomic_mb_set() the more expensive operation. 
+ +There are two common cases in which atomic_mb_read and atomic_mb_set +generate too many memory barriers, and thus it can be useful to manually +place barriers instead: + +- when a data structure has one thread that is always a writer + and one thread that is always a reader, manual placement of + memory barriers makes the write side faster. Furthermore, + correctness is easy to check for in this case using the "pairing" + trick that is explained below: + + thread 1 thread 1 + ------------------------- ------------------------ + (other writes) + smp_mb_release() + atomic_mb_set(&a, x) atomic_set(&a, x) + smp_wmb() + atomic_mb_set(&b, y) atomic_set(&b, y) + + => + thread 2 thread 2 + ------------------------- ------------------------ + y = atomic_mb_read(&b) y = atomic_read(&b) + smp_rmb() + x = atomic_mb_read(&a) x = atomic_read(&a) + smp_mb_acquire() + + Note that the barrier between the stores in thread 1, and between + the loads in thread 2, has been optimized here to a write or a + read memory barrier respectively. On some architectures, notably + ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as + smp_mb, but smp_rmb and/or smp_wmb are more efficient. + +- sometimes, a thread is accessing many variables that are otherwise + unrelated to each other (for example because, apart from the current + thread, exactly one other thread will read or write each of these + variables). In this case, it is possible to "hoist" the implicit + barriers provided by atomic_mb_read() and atomic_mb_set() outside + a loop. 
For example, the above definition of atomic_mb_read() gives + the following transformation: + + n = 0; n = 0; + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) + n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]); + smp_mb_acquire(); + + Similarly, atomic_mb_set() can be transformed as follows: + smp_mb(): + + smp_mb_release(); + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); + smp_mb(); + + +The two tricks can be combined. In this case, splitting a loop in +two lets you hoist the barriers out of the loops _and_ eliminate the +expensive smp_mb(): + + smp_mb_release(); + for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++) + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); + atomic_mb_set(&b[i], false); smp_wmb(); + } for (i = 0; i < 10; i++) + atomic_set(&b[i], false); + smp_mb(); + + The other thread can still use atomic_mb_read()/atomic_mb_set(). + + +Memory barrier pairing +---------------------- + +A useful rule of thumb is that memory barriers should always, or almost +always, be paired with another barrier. In the case of QEMU, however, +note that the other barrier may actually be in a driver that runs in +the guest! + +For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() +both count as read barriers. A read barrier shall pair with a write +barrier or a full barrier; a write barrier shall pair with a read +barrier or a full barrier. A full barrier can pair with anything. +For example: + + thread 1 thread 2 + =============== =============== + a = 1; + smp_wmb(); + b = 2; x = b; + smp_rmb(); + y = a; + +Note that the "writing" thread is accessing the variables in the +opposite order as the "reading" thread. This is expected: stores +before the write barrier will normally match the loads after the +read barrier, and vice versa. 
The same is true for more than 2 +accesses and for data dependency barriers: + + thread 1 thread 2 + =============== =============== + b[2] = 1; + smp_wmb(); + x->i = 2; + smp_wmb(); + a = x; x = a; + smp_read_barrier_depends(); + y = x->i; + smp_read_barrier_depends(); + z = b[y]; + +smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(), +and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release(). + + +COMPARISON WITH LINUX KERNEL MEMORY BARRIERS +============================================ + +Here is a list of differences between Linux kernel atomic operations +and memory barriers, and the equivalents in QEMU: + +- atomic operations in Linux are always on a 32-bit int type and + use a boxed atomic_t type; atomic operations in QEMU are polymorphic + and use normal C types. + +- Originally, atomic_read and atomic_set in Linux gave no guarantee + at all. Linux 4.1 updated them to implement volatile + semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE). + + QEMU's atomic_read/set implement, if the compiler supports it, C11 + atomic relaxed semantics, and volatile semantics otherwise. + Both semantics prevent the compiler from doing certain transformations; + the difference is that atomic accesses are guaranteed to be atomic, + while volatile accesses aren't. Thus, in the volatile case we just cross + our fingers hoping that the compiler will generate atomic accesses, + since we assume the variables passed are machine-word sized and + properly aligned. + No barriers are implied by atomic_read/set in either Linux or QEMU. + +- atomic read-modify-write operations in Linux are of three kinds: + + atomic_OP returns void + atomic_OP_return returns new value of the variable + atomic_fetch_OP returns the old value of the variable + atomic_cmpxchg returns the old value of the variable + + In QEMU, the second kind does not exist. Currently Linux has + atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub. 
+ +- different atomic read-modify-write operations in Linux imply + a different set of memory barriers; in QEMU, all of them enforce + sequential consistency, which means they imply full memory barriers + before and after the operation. + +- Linux does not have an equivalent of atomic_mb_set(). In particular, + note that smp_store_mb() is a little weaker than atomic_mb_set(). + atomic_mb_read() compiles to the same instructions as Linux's + smp_load_acquire(), but this should be treated as an implementation + detail. QEMU does have atomic_load_acquire() and atomic_store_release() + macros, but for now they are only used within atomic.h. This may + change in the future. + + +SOURCES +======= + +* Documentation/memory-barriers.txt from the Linux kernel + +* "The JSR-133 Cookbook for Compiler Writers", available at + http://g.oswego.edu/dl/jmm/cookbook.html diff --git a/docs/devel/bitmaps.md b/docs/devel/bitmaps.md new file mode 100644 index 0000000000..a2e8d51163 --- /dev/null +++ b/docs/devel/bitmaps.md @@ -0,0 +1,505 @@ +<!-- +Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc. +All rights reserved. + +This file is licensed via The FreeBSD Documentation License, the full text of +which is included at the end of this document. +--> + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) + +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +* Dirty bitmaps created for internal use by QEMU may be anonymous and have no + name, but any user-created bitmaps may not be. There can be any number of + anonymous bitmaps per node. + +* The name of a user-created bitmap must not be empty (""). 
+ +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. + +* The normal operating mode for a bitmap is "active." + +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. + +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. + +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. 
Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. 
+ +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. 
+ + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of a job being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETED and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +3. Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive0", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental, and re-create the image. + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. 
Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive0", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +### Partial Transactional Failures + +* Sometimes, a transaction will succeed in launching and return success, + but then later the backup jobs themselves may fail. It is possible that + a management application may have to deal with a partial backup failure + after a successful transaction. + +* If multiple backup jobs are specified in a single transaction, when one of + them fails, it will not interact with the other backup jobs in any way. + +* The job(s) that succeeded will clear the dirty bitmap associated with the + operation, but the job(s) that failed will not. It is not "safe" to delete + any incremental backups that were created successfully in this scenario, + even though others failed. 
+ +#### Example + +* QMP example highlighting two backup jobs: + + ```json + { "execute": "transaction", + "arguments": { + "actions": [ + { "type": "drive-backup", + "data": { "device": "drive0", "bitmap": "bitmap0", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d0-incr-1.qcow2" } }, + { "type": "drive-backup", + "data": { "device": "drive1", "bitmap": "bitmap1", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d1-incr-1.qcow2" } } + ] + } + } + ``` + +* QMP example response, highlighting one success and one failure: + * Acknowledgement that the Transaction was accepted and jobs were launched: + ```json + { "return": {} } + ``` + + * Later, QEMU sends notice that the first job was completed: + ```json + { "timestamp": { "seconds": 1447192343, "microseconds": 615698 }, + "data": { "device": "drive0", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864 }, + "event": "BLOCK_JOB_COMPLETED" + } + ``` + + * Later yet, QEMU sends notice that the second job has failed: + ```json + { "timestamp": { "seconds": 1447192399, "microseconds": 683015 }, + "data": { "device": "drive1", "action": "report", + "operation": "read" }, + "event": "BLOCK_JOB_ERROR" } + ``` + + ```json + { "timestamp": { "seconds": 1447192399, "microseconds": 685853 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "Input/output error", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +* In the above example, "d0-incr-1.qcow2" is valid and must be kept, + but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide + incremental backup of all drives at a point-in-time is to be made, + new backups for both drives will need to be made, taking into account + that a new incremental backup for drive0 needs to be based on top of + "d0-incr-1.qcow2." 
+ +### Grouped Completion Mode + +* While jobs launched by transactions normally complete or fail on their own, + it is possible to instruct them to complete or fail together as a group. + +* QMP transactions take an optional properties structure that can affect + the semantics of the transaction. + +* The "completion-mode" transaction property can be either "individual" + which is the default, legacy behavior described above, or "grouped," + a new behavior detailed below. + +* Delayed Completion: In grouped completion mode, no jobs will report + success until all jobs are ready to report success. + +* Grouped failure: If any job fails in grouped completion mode, all remaining + jobs will be cancelled. Any incremental backups will restore their dirty + bitmap objects as if no backup command was ever issued. + + * Regardless of whether QEMU reports a particular incremental backup job as + CANCELLED or as an ERROR, the in-memory bitmap will be restored. + +#### Example + +* Here's the same example scenario from above with the new property: + + ```json + { "execute": "transaction", + "arguments": { + "actions": [ + { "type": "drive-backup", + "data": { "device": "drive0", "bitmap": "bitmap0", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d0-incr-1.qcow2" } }, + { "type": "drive-backup", + "data": { "device": "drive1", "bitmap": "bitmap1", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d1-incr-1.qcow2" } } + ], + "properties": { + "completion-mode": "grouped" + } + } + } + ``` + +* QMP example response, highlighting a failure for drive1: + * Acknowledgement that the Transaction was accepted and jobs were launched: + ```json + { "return": {} } + ``` + + * Later, QEMU sends notice that the second job has errored out, + but that the first job was also cancelled: + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 632377 }, + "data": { "device": "drive1", "action": "report", + "operation": "read" }, + 
"event": "BLOCK_JOB_ERROR" } + ``` + + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 640074 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "Input/output error", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 640163 }, + "data": { "device": "drive0", "type": "backup", "speed": 0, + "len": 67108864, "offset": 16777216 }, + "event": "BLOCK_JOB_CANCELLED" } + ``` + +<!-- +The FreeBSD Documentation License + +Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML, +PDF, PostScript, RTF and so forth) with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code (Markdown) must retain the above copyright +notice, this list of conditions and the following disclaimer of this file +unmodified. + +Redistributions in compiled form (transformed to other DTDs, converted to PDF, +PostScript, RTF and other formats) must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+--> diff --git a/docs/devel/blkdebug.txt b/docs/devel/blkdebug.txt new file mode 100644 index 0000000000..43d8e8f9c6 --- /dev/null +++ b/docs/devel/blkdebug.txt @@ -0,0 +1,162 @@ +Block I/O error injection using blkdebug +---------------------------------------- +Copyright (C) 2014-2015 Red Hat Inc + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + +The blkdebug block driver is a rule-based error injection engine. It can be +used to exercise error code paths in block drivers including ENOSPC (out of +space) and EIO. + +This document gives an overview of the features available in blkdebug. + +Background +---------- +Block drivers have many error code paths that handle I/O errors. Image formats +are especially complex since metadata I/O errors during cluster allocation or +while updating tables happen halfway through request processing and require +discipline to keep image files consistent. + +Error injection allows test cases to trigger I/O errors at specific points. +This way, all error paths can be tested to make sure they are correct. + +Rules +----- +The blkdebug block driver takes a list of "rules" that tell the error injection +engine when to fail an I/O request. + +Each I/O request is evaluated against the rules. If a rule matches the request +then its "action" is executed. + +Rules can be placed in a configuration file; the configuration file +follows the same .ini-like format used by QEMU's -readconfig option, and +each section of the file represents a rule. + +The following configuration file defines a single rule: + + $ cat blkdebug.conf + [inject-error] + event = "read_aio" + errno = "28" + +This rule fails all aio read requests with ENOSPC (28). Note that the errno +value depends on the host. On Linux, see +/usr/include/asm-generic/errno-base.h for errno values. 
+ +Invoke QEMU as follows: + + $ qemu-system-x86_64 + -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \ + -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0 + +Rules support the following attributes: + + event - which type of operation to match (e.g. read_aio, write_aio, + flush_to_os, flush_to_disk). See the "Events" section for + information on events. + + state - (optional) the engine must be in this state number in order for this + rule to match. See the "State transitions" section for information + on states. + + errno - the numeric errno value to return when a request matches this rule. + The errno values depend on the host since the numeric values are not + standarized in the POSIX specification. + + sector - (optional) a sector number that the request must overlap in order to + match this rule + + once - (optional, default "off") only execute this action on the first + matching request + + immediately - (optional, default "off") return a NULL BlockAIOCB + pointer and fail without an errno instead. This + exercises the code path where BlockAIOCB fails and the + caller's BlockCompletionFunc is not invoked. + +Events +------ +Block drivers provide information about the type of I/O request they are about +to make so rules can match specific types of requests. For example, the qcow2 +block driver tells blkdebug when it accesses the L1 table so rules can match +only L1 table accesses and not other metadata or guest data requests. + +The core events are: + + read_aio - guest data read + + write_aio - guest data write + + flush_to_os - write out unwritten block driver state (e.g. cached metadata) + + flush_to_disk - flush the host block device's disk cache + +See qapi/block-core.json:BlkdebugEvent for the full list of events. +You may need to grep block driver source code to understand the +meaning of specific events. 
+ +State transitions +----------------- +There are cases where more power is needed to match a particular I/O request in +a longer sequence of requests. For example: + + write_aio + flush_to_disk + write_aio + +How do we match the 2nd write_aio but not the first? This is where state +transitions come in. + +The error injection engine has an integer called the "state" that always starts +initialized to 1. The state integer is internal to blkdebug and cannot be +observed from outside but rules can interact with it for powerful matching +behavior. + +Rules can be conditional on the current state and they can transition to a new +state. + +When a rule's "state" attribute is non-zero then the current state must equal +the attribute in order for the rule to match. + +For example, to match the 2nd write_aio: + + [set-state] + event = "write_aio" + state = "1" + new_state = "2" + + [inject-error] + event = "write_aio" + state = "2" + errno = "5" + +The first write_aio request matches the set-state rule and transitions from +state 1 to state 2. Once state 2 has been entered, the set-state rule no +longer matches since it requires state 1. But the inject-error rule now +matches the next write_aio request and injects EIO (5). + +State transition rules support the following attributes: + + event - which type of operation to match (e.g. read_aio, write_aio, + flush_to_os, flush_to_disk). See the "Events" section for + information on events. + + state - (optional) the engine must be in this state number in order for this + rule to match + + new_state - transition to this state number + +Suspend and resume +------------------ +Exercising code paths in block drivers may require specific ordering amongst +concurrent requests. The "breakpoint" feature allows requests to be halted on +a blkdebug event and resumed later. This makes it possible to achieve +deterministic ordering when multiple requests are in flight. 
+ +Breakpoints on blkdebug events are associated with a user-defined "tag" string. +This tag serves as an identifier by which the request can be resumed at a later +point. + +See the qemu-io(1) break, resume, remove_break, and wait_break commands for +details. diff --git a/docs/devel/blkverify.txt b/docs/devel/blkverify.txt new file mode 100644 index 0000000000..d556dc4e6d --- /dev/null +++ b/docs/devel/blkverify.txt @@ -0,0 +1,69 @@ += Block driver correctness testing with blkverify = + +== Introduction == + +This document describes how to use the blkverify protocol to test that a block +driver is operating correctly. + +It is difficult to test and debug block drivers against real guests. Often +processes inside the guest will crash because corrupt sectors were read as part +of the executable. Other times obscure errors are raised by a program inside +the guest. These issues are extremely hard to trace back to bugs in the block +driver. + +Blkverify solves this problem by catching data corruption inside QEMU the first +time bad data is read and reporting the disk sector that is corrupted. + +== How it works == + +The blkverify protocol has two child block devices, the "test" device and the +"raw" device. Read/write operations are mirrored to both devices so their +state should always be in sync. + +The "raw" device is a raw image, a flat file, that has identical starting +contents to the "test" image. The idea is that the "raw" device will handle +read/write operations correctly and not corrupt data. It can be used as a +reference for comparison against the "test" device. + +After a mirrored read operation completes, blkverify will compare the data and +raise an error if it is not identical. This makes it possible to catch the +first instance where corrupt data is read. 
+ +== Example == + +Imagine raw.img has 0xcd repeated throughout its first sector: + + $ ./qemu-io -c 'read -v 0 512' raw.img + 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + [...] + 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec) + +And test.img is corrupt, its first sector is zeroed when it shouldn't be: + + $ ./qemu-io -c 'read -v 0 512' test.img + 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + [...] + 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec) + +This error is caught by blkverify: + + $ ./qemu-io -c 'read 0 512' blkverify:a.img:b.img + blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0 + +A more realistic scenario is verifying the installation of a guest OS: + + $ ./qemu-img create raw.img 16G + $ ./qemu-img create -f qcow2 test.qcow2 16G + $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \ + -drive file=blkverify:raw.img:test.qcow2 + +If the installation is aborted when blkverify detects corruption, use qemu-io +to explore the contents of the disk image at the sector in question. 
diff --git a/docs/devel/build-system.txt b/docs/devel/build-system.txt new file mode 100644 index 0000000000..2af1e668c5 --- /dev/null +++ b/docs/devel/build-system.txt @@ -0,0 +1,512 @@ + The QEMU build system architecture + ================================== + +This document aims to help developers understand the architecture of the +QEMU build system. As with projects using GNU autotools, the QEMU build +system has two stages, first the developer runs the "configure" script +to determine the local build environment characteristics, then they run +"make" to build the project. There is about where the similarities with +GNU autotools end, so try to forget what you know about them. + + +Stage 1: configure +================== + +The QEMU configure script is written directly in shell, and should be +compatible with any POSIX shell, hence it uses #!/bin/sh. An important +implication of this is that it is important to avoid using bash-isms on +development platforms where bash is the primary host. + +In contrast to autoconf scripts, QEMU's configure is expected to be +silent while it is checking for features. It will only display output +when an error occurs, or to show the final feature enablement summary +on completion. + +Adding new checks to the configure script usually comprises the +following tasks: + + - Initialize one or more variables with the default feature state. + + Ideally features should auto-detect whether they are present, + so try to avoid hardcoding the initial state to either enabled + or disabled, as that forces the user to pass a --enable-XXX + / --disable-XXX flag on every invocation of configure. + + - Add support to the command line arg parser to handle any new + --enable-XXX / --disable-XXX flags required by the feature XXX. + + - Add information to the help output message to report on the new + feature flag. + + - Add code to perform the actual feature check. As noted above, try to + be fully dynamic in checking enablement/disablement. 
+ + - Add code to print out the feature status in the configure summary + upon completion. + + - Add any new makefile variables to $config_host_mak on completion. + + +Taking (a simplified version of) the probe for gnutls from configure, +we have the following pieces: + + # Initial variable state + gnutls="" + + ..snip.. + + # Configure flag processing + --disable-gnutls) gnutls="no" + ;; + --enable-gnutls) gnutls="yes" + ;; + + ..snip.. + + # Help output feature message + gnutls GNUTLS cryptography support + + ..snip.. + + # Test for gnutls + if test "$gnutls" != "no"; then + if ! $pkg_config --exists "gnutls"; then + gnutls_cflags=`$pkg_config --cflags gnutls` + gnutls_libs=`$pkg_config --libs gnutls` + libs_softmmu="$gnutls_libs $libs_softmmu" + libs_tools="$gnutls_libs $libs_tools" + QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags" + gnutls="yes" + elif test "$gnutls" = "yes"; then + feature_not_found "gnutls" "Install gnutls devel" + else + gnutls="no" + fi + fi + + ..snip.. + + # Completion feature summary + echo "GNUTLS support $gnutls" + + ..snip.. + + # Define make variables + if test "$gnutls" = "yes" ; then + echo "CONFIG_GNUTLS=y" >> $config_host_mak + fi + + +Helper functions +---------------- + +The configure script provides a variety of helper functions to assist +developers in checking for system features: + + - do_cc $ARGS... + + Attempt to run the system C compiler passing it $ARGS... + + - do_cxx $ARGS... + + Attempt to run the system C++ compiler passing it $ARGS... + + - compile_object $CFLAGS + + Attempt to compile a test program with the system C compiler using + $CFLAGS. The test program must have been previously written to a file + called $TMPC. + + - compile_prog $CFLAGS $LDFLAGS + + Attempt to compile a test program with the system C compiler using + $CFLAGS and link it with the system linker using $LDFLAGS. The test + program must have been previously written to a file called $TMPC. 
+ + - has $COMMAND + + Determine if $COMMAND exists in the current environment, either as a + shell builtin, or executable binary, returning 0 on success. + + - path_of $COMMAND + + Return the fully qualified path of $COMMAND, printing it to stdout, + and returning 0 on success. + + - check_define $NAME + + Determine if the macro $NAME is defined by the system C compiler + + - check_include $NAME + + Determine if the include $NAME file is available to the system C + compiler + + - write_c_skeleton + + Write a minimal C program main() function to the temporary file + indicated by $TMPC + + - feature_not_found $NAME $REMEDY + + Print a message to stderr that the feature $NAME was not available + on the system, suggesting the user try $REMEDY to address the + problem. + + - error_exit $MESSAGE $MORE... + + Print $MESSAGE to stderr, followed by $MORE... and then exit from the + configure script with non-zero status + + - query_pkg_config $ARGS... + + Run pkg-config passing it $ARGS. If QEMU is doing a static build, + then --static will be automatically added to $ARGS + + +Stage 2: makefiles +================== + +The use of GNU make is required with the QEMU build system. + +Although the source code is spread across multiple subdirectories, the +build system should be considered largely non-recursive in nature, in +contrast to common practices seen with automake. There is some recursive +invocation of make, but this is related to the things being built, +rather than the source directory structure. + +QEMU currently supports both VPATH and non-VPATH builds, so there are +three general ways to invoke configure & perform a build. 
+ + - VPATH, build artifacts outside of QEMU source tree entirely + + cd ../ + mkdir build + cd build + ../qemu/configure + make + + - VPATH, build artifacts in a subdir of QEMU source tree + + mkdir build + cd build + ../configure + make + + - non-VPATH, build artifacts everywhere + + ./configure + make + +The QEMU maintainers generally recommend that a VPATH build is used by +developers. Patches to QEMU are expected to ensure VPATH build still +works. + + +Module structure +---------------- + +There are a number of key outputs of the QEMU build system: + + - Tools - qemu-img, qemu-nbd, qga (guest agent), etc + - System emulators - qemu-system-$ARCH + - Userspace emulators - qemu-$ARCH + - Unit tests + +The source code is highly modularized, split across many files to +facilitate building of all of these components with as little duplicated +compilation as possible. There can be considered to be two distinct +groups of files, those which are independent of the QEMU emulation +target and those which are dependent on the QEMU emulation target. + +In the target-independent set lives various general purpose helper code, +such as error handling infrastructure, standard data structures, +platform portability wrapper functions, etc. This code can be compiled +once only and the .o files linked into all output binaries. + +In the target-dependent set lives CPU emulation, device emulation and +much glue code. This sometimes also has to be compiled multiple times, +once for each target being built. + +The utility code that is used by all binaries is built into a +static archive called libqemuutil.a, which is then linked to all the +binaries. In order to provide hooks that are only needed by some of the +binaries, code in libqemuutil.a may depend on other functions that are +not fully implemented by all QEMU binaries. To deal with this there is a +second library called libqemustub.a which provides dummy stubs for all +these functions. 
These will get lazy linked into the binary if the real +implementation is not present. In this way, the libqemustub.a static +library can be thought of as a portable implementation of the weak +symbols concept. All binaries should link to both libqemuutil.a and +libqemustub.a. e.g. + + qemu-img$(EXESUF): qemu-img.o ..snip.. libqemuutil.a libqemustub.a + + +Windows platform portability +---------------------------- + +On Windows, all binaries have the suffix '.exe', so all Makefile rules +which create binaries must include the $(EXESUF) variable on the binary +name. e.g. + + qemu-img$(EXESUF): qemu-img.o ..snip.. + +This expands to '.exe' on Windows, or '' on other platforms. + +A further complication for the system emulator binaries is that +two separate binaries need to be generated. + +The main binary (e.g. qemu-system-x86_64.exe) is linked against the +Windows console runtime subsystem. These are expected to be run from a +command prompt window, and so will print stderr to the console that +launched them. + +The second binary generated has a 'w' on the end of its name (e.g. +qemu-system-x86_64w.exe) and is linked against the Windows graphical +runtime subsystem. These are expected to be run directly from the +desktop and will open up a dedicated console window for stderr output. + +The Makefile.target will generate the binary for the graphical subsystem +first, and then use objcopy to relink it against the console subsystem +to generate the second binary. + + +Object variable naming +---------------------- + +The QEMU convention is to define variables to list different groups of +object files. These are named with the convention $PREFIX-obj-y. For +example the libqemuutil.a file will be linked with all objects listed +in a variable 'util-obj-y'. 
So, for example, util/Makefile.obj will +contain a set of definitions looking like + + util-obj-y += bitmap.o bitops.o hbitmap.o + util-obj-y += fifo8.o + util-obj-y += acl.o + util-obj-y += error.o qemu-error.o + +When there is an object file which needs to be conditionally built based +on some characteristic of the host system, the configure script will +define a variable for the conditional. For example, on Windows it will +define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a +value of 'y'. It is now possible to use the config variables when +listing object files. For example, + + util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o + util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o + +On Windows this expands to + + util-obj-y += oslib-win32.o qemu-thread-win32.o + util-obj-n += oslib-posix.o qemu-thread-posix.o + +Since libqemutil.a links in $(util-obj-y), the POSIX specific files +listed against $(util-obj-n) are ignored on the Windows platform builds. + + +CFLAGS / LDFLAGS / LIBS handling +-------------------------------- + +There are many different binaries being built with differing purposes, +and some of them might even be 3rd party libraries pulled in via git +submodules. As such the use of the global CFLAGS variable is generally +avoided in QEMU, since it would apply to too many build targets. + +Flags that are needed by any QEMU code (i.e. everything *except* GIT +submodule projects) are put in $(QEMU_CFLAGS) variable. For linker +flags the $(LIBS) variable is sometimes used, but a couple of more +targeted variables are preferred. $(libs_softmmu) is used for +libraries that must be linked to system emulator targets, $(LIBS_TOOLS) +is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used +for the QEMU guest agent. There is currently no specific variable for +the userspace emulator targets as the global $(LIBS), or more targeted +variables shown below, are sufficient. 
+ +In addition to these variables, it is possible to provide cflags and +libs against individual source code files, by defining variables of the +form $FILENAME-cflags and $FILENAME-libs. For example, the curl block +driver needs to link to the libcurl library, so block/Makefile defines +some variables: + + curl.o-cflags := $(CURL_CFLAGS) + curl.o-libs := $(CURL_LIBS) + +The scope is a little different between the two variables. The libs get +used when linking any target binary that includes the curl.o object +file, while the cflags get used when compiling the curl.c file only. + + +Statically defined files +------------------------ + +The following key files are statically defined in the source tree, with +the rules needed to build QEMU. Their behaviour is influenced by a +number of dynamically created files listed later. + +- Makefile + +The main entry point used when invoking make to build all the components +of QEMU. The default 'all' target will naturally result in the build of +every component. The various tools and helper binaries are built +directly via a non-recursive set of rules. + +Each system/userspace emulation target needs to have a slightly +different set of make rules / variables. Thus, make will be recursively +invoked for each of the emulation targets. + +The recursive invocation will end up processing the toplevel +Makefile.target file (more on that later). + + +- */Makefile.objs + +Since the source code is spread across multiple directories, the rules +for each file are similarly modularized. Thus each subdirectory +containing .c files will usually also contain a Makefile.objs file. +These files are not directly invoked by a recursive make, but instead +they are imported by the top level Makefile and/or Makefile.target + +Each Makefile.objs usually just declares a set of variables listing the +.o files that need building from the source files in the directory. They +will also define any custom linker or compiler flags. 
For example in +block/Makefile.objs + + block-obj-$(CONFIG_LIBISCSI) += iscsi.o + block-obj-$(CONFIG_CURL) += curl.o + + ..snip... + + iscsi.o-cflags := $(LIBISCSI_CFLAGS) + iscsi.o-libs := $(LIBISCSI_LIBS) + curl.o-cflags := $(CURL_CFLAGS) + curl.o-libs := $(CURL_LIBS) + +If there are any rules defined in the Makefile.objs file, they should +all use $(obj) as a prefix to the target, e.g. + + $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp + + +- Makefile.target + +This file provides the entry point used to build each individual system +or userspace emulator target. Each enabled target has its own +subdirectory. For example if configure is run with the argument +'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmu' +will be created, containing a 'Makefile' which symlinks back to +Makefile.target + +So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up +using Makefile.target for the build rules. + + +- rules.mak + +This file provides the generic helper rules for invoking build tools, in +particular the compiler and linker. This also contains the magic (hairy) +'unnest-vars' function which is used to merge the variable definitions +from all Makefile.objs in the source tree down into the main Makefile +context. + + +- default-configs/*.mak + +The files under default-configs/ control what emulated hardware is built +into each QEMU system and userspace emulator targets. They merely +contain a long list of config variable definitions. For example, +default-configs/x86_64-softmmu.mak has: + + include pci.mak + include sound.mak + include usb.mak + CONFIG_QXL=$(CONFIG_SPICE) + CONFIG_VGA_ISA=y + CONFIG_VGA_CIRRUS=y + CONFIG_VMWARE_VGA=y + CONFIG_VIRTIO_VGA=y + ...snip... + +These files rarely need changing unless new devices / hardware need to +be enabled for a particular system/userspace emulation target + + +- tests/Makefile + +Rules for building the unit tests. 
This file is included directly by the +top level Makefile, so anything defined in this file will influence the +entire build system. Care needs to be taken when writing rules for tests +to ensure they only apply to the unit test execution / build. + +- tests/docker/Makefile.include + +Rules for Docker tests. Like tests/Makefile, this file is included +directly by the top level Makefile, anything defined in this file will +influence the entire build system. + +- po/Makefile + +Rules for building and installing the binary message catalogs from the +text .po file sources. This almost never needs changing for any reason. + + +Dynamically created files +------------------------- + +The following files are generated dynamically by configure in order to +control the behaviour of the statically defined makefiles. This avoids +the need for QEMU makefiles to go through any pre-processing as seen +with autotools, where Makefile.am generates Makefile.in which generates +Makefile. + + +- config-host.mak + +When configure has determined the characteristics of the build host it +will write a long list of variables to config-host.mak file. This +provides the various install directories, compiler / linker flags and a +variety of CONFIG_* variables related to optionally enabled features. +This is imported by the top level Makefile in order to tailor the build +output. + +The variables defined here are those which are applicable to all QEMU +build outputs. Variables which are potentially different for each +emulator target are defined by the next file... + +It is also used as a dependency checking mechanism. If make sees that +the modification timestamp on configure is newer than that on +config-host.mak, then configure will be re-run. + + +- config-host.h + +The config-host.h file is used by source code to determine what features +are enabled. It is generated from the contents of config-host.mak using +the scripts/create_config program. 
This extracts all the CONFIG_* variables, +most of the HOST_* variables and a few other misc variables from +config-host.mak, formatting them as C preprocessor macros. + + +- $TARGET-NAME/config-target.mak + +TARGET-NAME is the name of a system or userspace emulator, for example, +x86_64-softmmu denotes the system emulator for the x86_64 architecture. +This file contains the variables which need to vary on a per-target +basis. For example, it will indicate whether KVM or Xen are enabled for +the target and any other potential custom libraries needed for linking +the target. + + +- $TARGET-NAME/config-devices.mak + +TARGET-NAME is again the name of a system or userspace emulator. The +config-devices.mak file is automatically generated by make using the +scripts/make_device_config.sh program, feeding it the +default-configs/$TARGET-NAME file as input. + + +- $TARGET-NAME/Makefile + +This is the entrypoint used when make recurses to build a single system +or userspace emulator target. It is merely a symlink back to the +Makefile.target in the top level. diff --git a/docs/devel/lockcnt.txt b/docs/devel/lockcnt.txt new file mode 100644 index 0000000000..2a79b3205b --- /dev/null +++ b/docs/devel/lockcnt.txt @@ -0,0 +1,277 @@ +DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt) +=================================================== + +QEMU often uses reference counts to track data structures that are being +accessed and should not be freed. For example, a loop that invoke +callbacks like this is not safe: + + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + +QLIST_FOREACH_SAFE protects against deletion of the current node (ioh) +by stashing away its "next" pointer. However, ioh->fd_write could +actually delete the next node from the list. 
The simplest way to +avoid this is to mark the node as deleted, and remove it from the +list in the above loop: + + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } else { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + } + +If however this loop must also be reentrant, i.e. it is possible that +ioh->fd_write invokes the loop again, some kind of counting is needed: + + walking_handlers++; + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + if (walking_handlers == 1) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } + } else { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + } + walking_handlers--; + +One may think of using the RCU primitives, rcu_read_lock() and +rcu_read_unlock(); effectively, the RCU nesting count would take +the place of the walking_handlers global variable. Indeed, +reference counting and RCU have similar purposes, but their usage in +general is complementary: + +- reference counting is fine-grained and limited to a single data + structure; RCU delays reclamation of *all* RCU-protected data + structures; + +- reference counting works even in the presence of code that keeps + a reference for a long time; RCU critical sections in principle + should be kept short; + +- reference counting is often applied to code that is not thread-safe + but is reentrant; in fact, usage of reference counting in QEMU predates + the introduction of threads by many years. RCU is generally used to + protect readers from other threads freeing memory after concurrent + modifications to a data structure. + +- reclaiming data can be done by a separate thread in the case of RCU; + this can improve performance, but also delay reclamation undesirably. + With reference counting, reclamation is deterministic. 
+ +This file documents QemuLockCnt, an abstraction for using reference +counting in code that has to be both thread-safe and reentrant. + + +QemuLockCnt concepts +-------------------- + +A QemuLockCnt comprises both a counter and a mutex; it has primitives +to increment and decrement the counter, and to take and release the +mutex. The counter notes how many visits to the data structures are +taking place (the visits could be from different threads, or there could +be multiple reentrant visits from the same thread). The basic rules +governing the counter/mutex pair then are the following: + +- Data protected by the QemuLockCnt must not be freed unless the + counter is zero and the mutex is taken. + +- A new visit cannot be started while the counter is zero and the + mutex is taken. + +Most of the time, the mutex protects all writes to the data structure, +not just frees, though there could be cases where this is not necessary. + +Reads, instead, can be done without taking the mutex, as long as the +readers and writers use the same macros that are used for RCU, for +example atomic_rcu_read, atomic_rcu_set, QLIST_FOREACH_RCU, etc. This is +because the reads are done outside a lock and a set or QLIST_INSERT_HEAD +can happen concurrently with the read. The RCU API ensures that the +processor and the compiler see all required memory barriers. + +This could be implemented simply by protecting the counter with the +mutex, for example: + + // (1) + qemu_mutex_lock(&walking_handlers_mutex); + walking_handlers++; + qemu_mutex_unlock(&walking_handlers_mutex); + + ... + + // (2) + qemu_mutex_lock(&walking_handlers_mutex); + if (--walking_handlers == 0) { + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } + } + } + qemu_mutex_unlock(&walking_handlers_mutex); + +Here, no frees can happen in the code represented by the ellipsis. 
+If another thread is executing critical section (2), that part of +the code cannot be entered, because the thread will not be able +to increment the walking_handlers variable. And of course +during the visit any other thread will see a nonzero value for +walking_handlers, as in the single-threaded code. + +Note that it is possible for multiple concurrent accesses to delay +the cleanup arbitrarily; in other words, for the walking_handlers +counter to never become zero. For this reason, this technique is +more easily applicable if concurrent access to the structure is rare. + +However, critical sections are easy to forget since you have to do +them for each modification of the counter. QemuLockCnt ensures that +all modifications of the counter take the lock appropriately, and it +can also be more efficient in two ways: + +- it avoids taking the lock for many operations (for example + incrementing the counter while it is non-zero); + +- on some platforms, one can implement QemuLockCnt to hold the lock + and the mutex in a single word, making the fast path no more expensive + than simply managing a counter using atomic operations (see + docs/atomics.txt). This can be very helpful if concurrent access to + the data structure is expected to be rare. + + +Using the same mutex for frees and writes can still incur some small +inefficiencies; for example, a visit can never start if the counter is +zero and the mutex is taken---even if the mutex is taken by a write, +which in principle need not block a visit of the data structure. +However, these are usually not a problem if any of the following +assumptions are valid: + +- concurrent access is possible but rare + +- writes are rare + +- writes are frequent, but this kind of write (e.g. appending to a + list) has a very small critical section. + +For example, QEMU uses QemuLockCnt to manage an AioContext's list of +bottom halves and file descriptor handlers. Modifications to the list +of file descriptor handlers are rare. 
Creation of a new bottom half is +frequent and can happen on a fast path; however: 1) it is almost never +concurrent with a visit to the list of bottom halves; 2) it only has +three instructions in the critical path, two assignments and a smp_wmb(). + + +QemuLockCnt API +--------------- + +The QemuLockCnt API is described in include/qemu/thread.h. + + +QemuLockCnt usage +----------------- + +This section explains the typical usage patterns for QemuLockCnt functions. + +Setting a variable to a non-NULL value can be done between +qemu_lockcnt_lock and qemu_lockcnt_unlock: + + qemu_lockcnt_lock(&xyz_lockcnt); + if (!xyz) { + new_xyz = g_new(XYZ, 1); + ... + atomic_rcu_set(&xyz, new_xyz); + } + qemu_lockcnt_unlock(&xyz_lockcnt); + +Accessing the value can be done between qemu_lockcnt_inc and +qemu_lockcnt_dec: + + qemu_lockcnt_inc(&xyz_lockcnt); + if (xyz) { + XYZ *p = atomic_rcu_read(&xyz); + ... + /* Accesses can now be done through "p". */ + } + qemu_lockcnt_dec(&xyz_lockcnt); + +Freeing the object can similarly use qemu_lockcnt_lock and +qemu_lockcnt_unlock, but you also need to ensure that the count +is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc +takes the QemuLockCnt's lock, the count cannot become non-zero while +the object is being freed. Freeing an object looks like this: + + qemu_lockcnt_lock(&xyz_lockcnt); + if (!qemu_lockcnt_count(&xyz_lockcnt)) { + g_free(xyz); + xyz = NULL; + } + qemu_lockcnt_unlock(&xyz_lockcnt); + +If an object has to be freed right after a visit, you can combine +the decrement, the locking and the check on count as follows: + + qemu_lockcnt_inc(&xyz_lockcnt); + if (xyz) { + XYZ *p = atomic_rcu_read(&xyz); + ... + /* Accesses can now be done through "p". 
*/ + } + if (qemu_lockcnt_dec_and_lock(&xyz_lockcnt)) { + g_free(xyz); + xyz = NULL; + qemu_lockcnt_unlock(&xyz_lockcnt); + } + +QemuLockCnt can also be used to access a list as follows: + + qemu_lockcnt_inc(&io_handlers_lockcnt); + QLIST_FOREACH_RCU(ioh, &io_handlers, pioh) { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + + if (qemu_lockcnt_dec_and_lock(&io_handlers_lockcnt)) { + QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + } + } + qemu_lockcnt_unlock(&io_handlers_lockcnt); + } + +Again, the RCU primitives are used because new items can be added to the +list during the walk. QLIST_FOREACH_RCU ensures that the processor and +the compiler see the appropriate memory barriers. + +An alternative pattern uses qemu_lockcnt_dec_if_lock: + + qemu_lockcnt_inc(&io_handlers_lockcnt); + QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) { + if (ioh->deleted) { + if (qemu_lockcnt_dec_if_lock(&io_handlers_lockcnt)) { + QLIST_REMOVE(ioh, next); + g_free(ioh); + qemu_lockcnt_inc_and_unlock(&io_handlers_lockcnt); + } + } else { + if (ioh->revents & G_IO_OUT) { + ioh->fd_write(ioh->opaque); + } + } + } + qemu_lockcnt_dec(&io_handlers_lockcnt); + +Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock, +because there is no special task to do if the count goes from 1 to 0. diff --git a/docs/devel/memory.txt b/docs/devel/memory.txt new file mode 100644 index 0000000000..811b1bd3c5 --- /dev/null +++ b/docs/devel/memory.txt @@ -0,0 +1,316 @@ +The memory API +============== + +The memory API models the memory and I/O buses and controllers of a QEMU +machine. 
It attempts to allow modelling of: + + - ordinary RAM + - memory-mapped I/O (MMIO) + - memory controllers that can dynamically reroute physical memory regions + to different destinations + +The memory model provides support for + + - tracking RAM changes by the guest + - setting up coalesced memory for kvm + - setting up ioeventfd regions for kvm + +Memory is modelled as an acyclic graph of MemoryRegion objects. Sinks +(leaves) are RAM and MMIO regions, while other nodes represent +buses, memory controllers, and memory regions that have been rerouted. + +In addition to MemoryRegion objects, the memory API provides AddressSpace +objects for every root and possibly for intermediate MemoryRegions too. +These represent memory as seen from the CPU or a device's viewpoint. + +Types of regions +---------------- + +There are multiple types of memory regions (all represented by a single C type +MemoryRegion): + +- RAM: a RAM region is simply a range of host memory that can be made available + to the guest. + You typically initialize these with memory_region_init_ram(). Some special + purposes require the variants memory_region_init_resizeable_ram(), + memory_region_init_ram_from_file(), or memory_region_init_ram_ptr(). + +- MMIO: a range of guest memory that is implemented by host callbacks; + each read or write causes a callback to be called on the host. + You initialize these with memory_region_init_io(), passing it a + MemoryRegionOps structure describing the callbacks. + +- ROM: a ROM memory region works like RAM for reads (directly accessing + a region of host memory), and forbids writes. You initialize these with + memory_region_init_rom(). + +- ROM device: a ROM device memory region works like RAM for reads + (directly accessing a region of host memory), but like MMIO for + writes (invoking a callback). You initialize these with + memory_region_init_rom_device(). 
+ +- IOMMU region: an IOMMU region translates addresses of accesses made to it + and forwards them to some other target memory region. As the name suggests, + these are only needed for modelling an IOMMU, not for simple devices. + You initialize these with memory_region_init_iommu(). + +- container: a container simply includes other memory regions, each at + a different offset. Containers are useful for grouping several regions + into one unit. For example, a PCI BAR may be composed of a RAM region + and an MMIO region. + + A container's subregions are usually non-overlapping. In some cases it is + useful to have overlapping regions; for example a memory controller that + can overlay a subregion of RAM with MMIO or ROM, or a PCI controller + that does not prevent card from claiming overlapping BARs. + + You initialize a pure container with memory_region_init(). + +- alias: a subsection of another region. Aliases allow a region to be + split apart into discontiguous regions. Examples of uses are memory banks + used when the guest address space is smaller than the amount of RAM + addressed, or a memory controller that splits main memory to expose a "PCI + hole". Aliases may point to any type of region, including other aliases, + but an alias may not point back to itself, directly or indirectly. + You initialize these with memory_region_init_alias(). + +- reservation region: a reservation region is primarily for debugging. + It claims I/O space that is not supposed to be handled by QEMU itself. + The typical use is to track parts of the address space which will be + handled by the host kernel when KVM is enabled. + You initialize these with memory_region_init_reservation(), or by + passing a NULL callback parameter to memory_region_init_io(). + +It is valid to add subregions to a region which is not a pure container +(that is, to an MMIO, RAM or ROM region). 
This means that the region +will act like a container, except that any addresses within the container's +region which are not claimed by any subregion are handled by the +container itself (ie by its MMIO callbacks or RAM backing). However +it is generally possible to achieve the same effect with a pure container +one of whose subregions is a low priority "background" region covering +the whole address range; this is often clearer and is preferred. +Subregions cannot be added to an alias region. + +Region names +------------ + +Regions are assigned names by the constructor. For most regions these are +only used for debugging purposes, but RAM regions also use the name to identify +live migration sections. This means that RAM region names need to have ABI +stability. + +Region lifecycle +---------------- + +A region is created by one of the memory_region_init*() functions and +attached to an object, which acts as its owner or parent. QEMU ensures +that the owner object remains alive as long as the region is visible to +the guest, or as long as the region is in use by a virtual CPU or another +device. For example, the owner object will not die between an +address_space_map operation and the corresponding address_space_unmap. + +After creation, a region can be added to an address space or a +container with memory_region_add_subregion(), and removed using +memory_region_del_subregion(). + +Various region attributes (read-only, dirty logging, coalesced mmio, +ioeventfd) can be changed during the region lifecycle. They take effect +as soon as the region is made visible. This can be immediately, later, +or never. + +Destruction of a memory region happens automatically when the owner +object dies. + +If however the memory region is part of a dynamically allocated data +structure, you should call object_unparent() to destroy the memory region +before the data structure is freed. For an example see VFIOMSIXInfo +and VFIOQuirk in hw/vfio/pci.c. 
+ +You must not destroy a memory region as long as it may be in use by a +device or CPU. In order to do this, as a general rule do not create or +destroy memory regions dynamically during a device's lifetime, and only +call object_unparent() in the memory region owner's instance_finalize +callback. The dynamically allocated data structure that contains the +memory region then should obviously be freed in the instance_finalize +callback as well. + +If you break this rule, the following situation can happen: + +- the memory region's owner had a reference taken via memory_region_ref + (for example by address_space_map) + +- the region is unparented, and has no owner anymore + +- when address_space_unmap is called, the reference to the memory region's + owner is leaked. + + +There is an exception to the above rule: it is okay to call +object_unparent at any time for an alias or a container region. It is +therefore also okay to create or destroy alias and container regions +dynamically during a device's lifetime. + +This exceptional usage is valid because aliases and containers only help +QEMU building the guest's memory map; they are never accessed directly. +memory_region_ref and memory_region_unref are never called on aliases +or containers, and the above situation then cannot happen. Exploiting +this exception is rarely necessary, and therefore it is discouraged, +but nevertheless it is used in a few places. + +For regions that "have no owner" (NULL is passed at creation time), the +machine object is actually used as the owner. Since instance_finalize is +never called for the machine object, you must never call object_unparent +on regions that have no owner, unless they are aliases or containers. + + +Overlapping regions and priority +-------------------------------- +Usually, regions may not overlap each other; a memory address decodes into +exactly one target. 
In some cases it is useful to allow regions to overlap, +and sometimes to control which of an overlapping regions is visible to the +guest. This is done with memory_region_add_subregion_overlap(), which +allows the region to overlap any other region in the same container, and +specifies a priority that allows the core to decide which of two regions at +the same address are visible (highest wins). +Priority values are signed, and the default value is zero. This means that +you can use memory_region_add_subregion_overlap() both to specify a region +that must sit 'above' any others (with a positive priority) and also a +background region that sits 'below' others (with a negative priority). + +If the higher priority region in an overlap is a container or alias, then +the lower priority region will appear in any "holes" that the higher priority +region has left by not mapping subregions to that area of its address range. +(This applies recursively -- if the subregions are themselves containers or +aliases that leave holes then the lower priority region will appear in these +holes too.) + +For example, suppose we have a container A of size 0x8000 with two subregions +B and C. B is a container mapped at 0x2000, size 0x4000, priority 2; C is +an MMIO region mapped at 0x0, size 0x6000, priority 1. B currently has two +of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at +offset 0x2000. As a diagram: + + 0 1000 2000 3000 4000 5000 6000 7000 8000 + |------|------|------|------|------|------|------|------| + A: [ ] + C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + B: [ ] + D: [DDDDD] + E: [EEEEE] + +The regions that will be seen within this address range then are: + [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC] + +Since B has higher priority than C, its subregions appear in the flat map +even where they overlap with C. In ranges where B has not mapped anything +C's region appears. 
+ +If B had provided its own MMIO operations (ie it was not a pure container) +then these would be used for any addresses in its range not handled by +D or E, and the result would be: + [CCCCCCCCCCCC][DDDDD][BBBBB][EEEEE][BBBBB] + +Priority values are local to a container, because the priorities of two +regions are only compared when they are both children of the same container. +This means that the device in charge of the container (typically modelling +a bus or a memory controller) can use them to manage the interaction of +its child regions without any side effects on other parts of the system. +In the example above, the priorities of D and E are unimportant because +they do not overlap each other. It is the relative priority of B and C +that causes D and E to appear on top of C: D and E's priorities are never +compared against the priority of C. + +Visibility +---------- +The memory core uses the following rules to select a memory region when the +guest accesses an address: + +- all direct subregions of the root region are matched against the address, in + descending priority order + - if the address lies outside the region offset/size, the subregion is + discarded + - if the subregion is a leaf (RAM or MMIO), the search terminates, returning + this leaf region + - if the subregion is a container, the same algorithm is used within the + subregion (after the address is adjusted by the subregion offset) + - if the subregion is an alias, the search is continued at the alias target + (after the address is adjusted by the subregion offset and alias offset) + - if a recursive search within a container or alias subregion does not + find a match (because of a "hole" in the container's coverage of its + address range), then if this is a container with its own MMIO or RAM + backing the search terminates, returning the container itself. 
Otherwise + we continue with the next subregion in priority order +- if none of the subregions match the address then the search terminates + with no match found + +Example memory map +------------------ + +system_memory: container@0-2^48-1 + | + +---- lomem: alias@0-0xdfffffff ---> #ram (0-0xdfffffff) + | + +---- himem: alias@0x100000000-0x11fffffff ---> #ram (0xe0000000-0xffffffff) + | + +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff) + | (prio 1) + | + +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff) + +pci (0-2^32-1) + | + +--- vga-area: container@0xa0000-0xbffff + | | + | +--- alias@0x00000-0x7fff ---> #vram (0x010000-0x017fff) + | | + | +--- alias@0x08000-0xffff ---> #vram (0x020000-0x027fff) + | + +---- vram: ram@0xe1000000-0xe1ffffff + | + +---- vga-mmio: mmio@0xe2000000-0xe200ffff + +ram: ram@0x00000000-0xffffffff + +This is a (simplified) PC memory map. The 4GB RAM block is mapped into the +system address space via two aliases: "lomem" is a 1:1 mapping of the first +3.5GB; "himem" maps the last 0.5GB at address 4GB. This leaves 0.5GB for the +so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with +4GB of memory. + +The memory controller diverts addresses in the range 640K-768K to the PCI +address space. This is modelled using the "vga-window" alias, mapped at a +higher priority so it obscures the RAM at the same addresses. The vga window +can be removed by programming the memory controller; this is modelled by +removing the alias and exposing the RAM underneath. + +The pci address space is not a direct child of the system address space, since +we only want parts of it to be visible (we accomplish this using aliases). +It has two subregions: vga-area models the legacy vga window and is occupied +by two 32K memory banks pointing at two sections of the framebuffer. 
+In addition the vram is mapped as a BAR at address e1000000, and an additional +BAR containing MMIO registers is mapped after it. + +Note that if the guest maps a BAR outside the PCI hole, it would not be +visible as the pci-hole alias clips it to a 0.5GB range. + +MMIO Operations +--------------- + +MMIO regions are provided with ->read() and ->write() callbacks; in addition +various constraints can be supplied to control how these callbacks are called: + + - .valid.min_access_size, .valid.max_access_size define the access sizes + (in bytes) which the device accepts; accesses outside this range will + have device and bus specific behaviour (ignored, or machine check) + - .valid.unaligned specifies that the *device being modelled* supports + unaligned accesses; if false, unaligned accesses will invoke the + appropriate bus or CPU specific behaviour. + - .impl.min_access_size, .impl.max_access_size define the access sizes + (in bytes) supported by the *implementation*; other access sizes will be + emulated using the ones available. For example a 4-byte write will be + emulated using four 1-byte writes, if .impl.max_access_size = 1. + - .impl.unaligned specifies that the *implementation* supports unaligned + accesses; if false, unaligned accesses will be emulated by two aligned + accesses. + - .old_mmio eases the porting of code that was formerly using + cpu_register_io_memory(). It should not be used in new code. diff --git a/docs/devel/migration.txt b/docs/devel/migration.txt new file mode 100644 index 0000000000..1b940a829b --- /dev/null +++ b/docs/devel/migration.txt @@ -0,0 +1,555 @@ += Migration = + +QEMU has code to load/save the state of the guest that it is running. +These are two complementary operations. Saving the state just does +that, saves the state for each device that the guest is running. +Restoring a guest is just the opposite operation: we need to load the +state of each device. 
+ +For this to work, QEMU has to be launched with the same arguments +both times. I.e. it can only restore the state in a guest that has +the same devices as the one from which it was saved (this last requirement can +be relaxed a bit, but for now we can consider that configuration has +to be exactly the same). + +Once we are able to save/restore a guest, a new functionality is +requested: migration. This means that QEMU is able to start on one +machine and then be "migrated" to another machine, i.e. be moved to +another machine. + +Next was the "live migration" functionality. This is important +because some guests run with a lot of state (especially RAM), and it +can take a while to move all state from one machine to another. Live +migration allows the guest to continue running while the state is +transferred. The guest has to be stopped only while the last part of +the state is transferred. Typically the time that the guest is +unresponsive during live migration is in the low hundreds of milliseconds +(notice that this depends on a lot of things). + +=== Types of migration === + +Now that we have talked about live migration, there are several ways +to do migration: + +- tcp migration: do the migration using tcp sockets +- unix migration: do the migration using unix sockets +- exec migration: do the migration using the stdin/stdout through a process. +- fd migration: do the migration using a file descriptor that is + passed to QEMU. QEMU doesn't care how this file descriptor is opened. + +All these four migration protocols use the same infrastructure to +save/restore device state. This infrastructure is shared with the +savevm/loadvm functionality. + +=== State Live Migration === + +This is used for RAM and block devices. It is not yet ported to vmstate. +<Fill more information here> + +=== What is the common infrastructure === + +QEMU uses a QEMUFile abstraction to be able to do migration.
Any type +of migration that wants to use QEMU infrastructure has to create a +QEMUFile with: + +QEMUFile *qemu_fopen_ops(void *opaque, + QEMUFilePutBufferFunc *put_buffer, + QEMUFileGetBufferFunc *get_buffer, + QEMUFileCloseFunc *close); + +The functions have the following functionality: + +This function writes a chunk of data to a file at the given position. +The pos argument can be ignored if the file is only used for +streaming. The handler should try to write all of the data it can. + +typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, + int64_t pos, int size); + +Read a chunk of data from a file at the given position. The pos argument +can be ignored if the file is only used for streaming. The number of +bytes actually read should be returned. + +typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, + int64_t pos, int size); + +Close a file and return an error code. + +typedef int (QEMUFileCloseFunc)(void *opaque); + +You can use any internal state that you need using the opaque void * +pointer that is passed to all functions. + +The important functions for us are put_buffer()/get_buffer() that +allow us to write a buffer to, or read a buffer from, the QEMUFile. + +=== How to save the state of one device === + +The state of a device is saved using intermediate buffers. There are +some helper functions to assist this saving. + +There is a new concept that we have to explain here: device state +version. When we migrate a device, we save/load the state as a series +of fields. Sometimes, due to bugs or new functionality, we need to +change the state to store more/different information. We use the +version to identify each time that we make a change. Each version is +associated with a series of fields saved. The save_state always saves +the state as the newest version. But load_state sometimes is able to +load state from an older version. + +=== Legacy way === + +This way is going to disappear as soon as all current users are ported to VMSTATE.
+ +Each device has to register two functions, one to save the state and +another to load the state back. + +int register_savevm(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque); + +typedef void SaveStateHandler(QEMUFile *f, void *opaque); +typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); + +The important functions for the device state format are the save_state +and load_state. Notice that load_state receives a version_id +parameter to know what state format is receiving. save_state doesn't +have a version_id parameter because it always uses the latest version. + +=== VMState === + +The legacy way of saving/loading state of the device had the problem +that we have to maintain two functions in sync. If we did one change +in one of them and not in the other, we would get a failed migration. + +VMState changed the way that state is saved/loaded. Instead of using +a function to save the state and another to load it, it was changed to +a declarative way of what the state consisted of. Now VMState is able +to interpret that definition to be able to load/save the state. As +the state is declared only once, it can't go out of sync in the +save/load functions. + +An example (from hw/input/pckbd.c) + +static const VMStateDescription vmstate_kbd = { + .name = "pckbd", + .version_id = 3, + .minimum_version_id = 3, + .fields = (VMStateField[]) { + VMSTATE_UINT8(write_cmd, KBDState), + VMSTATE_UINT8(status, KBDState), + VMSTATE_UINT8(mode, KBDState), + VMSTATE_UINT8(pending, KBDState), + VMSTATE_END_OF_LIST() + } +}; + +We are declaring the state with name "pckbd". +The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. +We registered this with: + + vmstate_register(NULL, 0, &vmstate_kbd, s); + +Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. 
+ +You can search for VMSTATE_* macros for lots of types used in QEMU in +include/hw/hw.h. + +=== More about versions === + +Version numbers are intended for major incompatible changes to the +migration of a device, and using them breaks backwards-migration +compatibility; in general most changes can be made by adding Subsections +(see below) or _TEST macros (see below) which won't break compatibility. + +You can see that there are several version fields: + +- version_id: the maximum version_id supported by VMState for that device. +- minimum_version_id: the minimum version_id that VMState is able to understand + for that device. +- minimum_version_id_old: For devices that were not able to port to vmstate, we can + assign a function that knows how to read this old state. This field is + ignored if there is no load_state_old handler. + +So, VMState is able to read versions from minimum_version_id to +version_id. And the function load_state_old() (if present) is able to +load state from minimum_version_id_old to minimum_version_id. This +function is deprecated and will be removed when no more users are left. + +Saving state will always create a section with the 'version_id' value +and thus can't be loaded by any older QEMU. + +=== Massaging functions === + +Sometimes, it is not enough to be able to save the state directly +from one structure, we need to fill the correct values there. One +example is when we are using kvm. Before saving the cpu state, we +need to ask kvm to copy to QEMU the state that it is using. And the +opposite when we are loading the state, we need a way to tell kvm to +load the state for the cpu that we have just loaded from the QEMUFile. + +The functions to do that are inside a vmstate definition, and are called: + +- int (*pre_load)(void *opaque); + + This function is called before we load the state of one device. + +- int (*post_load)(void *opaque, int version_id); + + This function is called after we load the state of one device. 
+ +- void (*pre_save)(void *opaque); + + This function is called before we save the state of one device. + +Example: You can look at hpet.c, which uses all three functions to + massage the state that is transferred. + +If you use memory API functions that update memory layout outside +initialization (i.e., in response to a guest action), this is a strong +indication that you need to call these functions in a post_load callback. +Examples of such memory API functions are: + + - memory_region_add_subregion() + - memory_region_del_subregion() + - memory_region_set_readonly() + - memory_region_set_enabled() + - memory_region_set_address() + - memory_region_set_alias_offset() + +=== Subsections === + +The use of version_id allows us to migrate from older versions +to newer versions of a device, but not the other way around. This +makes it very complicated to fix bugs in stable branches. If we need to +add anything to the state to fix a bug, we have to disable migration +to older versions that don't have that bug-fix (i.e. a new field). + +But often that bug-fix is only needed in some situations, not always. For +instance, if the device is in the middle of a DMA operation, it is +using a specific functionality, .... + +It is impossible to create a way to make migration from any version to +any other version work. But we can do better than only allowing +migration from older versions to newer ones. For those fields that are +only needed sometimes, we add the idea of subsections. A subsection +is "like" a device vmstate, but with one particularity: it has a Boolean +function that tells whether those values need to be sent or not. If +this function returns false, the subsection is not sent. + +On the receiving side, if we find a subsection for a device that we +don't understand, we just fail the migration. If we understand all +the subsections, then we load the state with success.
+ +One important note is that the post_load() function is called "after" +loading all subsections, because a newer subsection could change same +value that it uses. + +Example: + +static bool ide_drive_pio_state_needed(void *opaque) +{ + IDEState *s = opaque; + + return ((s->status & DRQ_STAT) != 0) + || (s->bus->error_status & BM_STATUS_PIO_RETRY); +} + +const VMStateDescription vmstate_ide_drive_pio_state = { + .name = "ide_drive/pio_state", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = ide_drive_pio_pre_save, + .post_load = ide_drive_pio_post_load, + .needed = ide_drive_pio_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT32(req_nb_sectors, IDEState), + VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, + vmstate_info_uint8, uint8_t), + VMSTATE_INT32(cur_io_buffer_offset, IDEState), + VMSTATE_INT32(cur_io_buffer_len, IDEState), + VMSTATE_UINT8(end_transfer_fn_idx, IDEState), + VMSTATE_INT32(elementary_transfer_size, IDEState), + VMSTATE_INT32(packet_transfer_size, IDEState), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_ide_drive = { + .name = "ide_drive", + .version_id = 3, + .minimum_version_id = 0, + .post_load = ide_drive_post_load, + .fields = (VMStateField[]) { + .... several fields .... + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_ide_drive_pio_state, + NULL + } +}; + +Here we have a subsection for the pio state. We only need to +save/send this state when we are in the middle of a pio operation +(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is +not enabled, the values on that fields are garbage and don't need to +be sent. + +Using a condition function that checks a 'property' to determine whether +to send a subsection allows backwards migration compatibility when +new subsections are added. + +For example; + a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and + default it to true. 
+ b) Add an entry to the HW_COMPAT_ for the previous version + that sets the property to false. + c) Add a static bool support_foo function that tests the property. + d) Add a subsection with a .needed set to the support_foo function + e) (potentially) Add a pre_load that sets up a default value for 'foo' + to be used if the subsection isn't loaded. + +Now that subsection will not be generated when using an older +machine type and the migration stream will be accepted by older +QEMU versions. pre-load functions can be used to initialise state +on the newer version so that they default to suitable values +when loading streams created by older QEMU versions that do not +generate the subsection. + +In some cases subsections are added for data that had been accidentally +omitted by earlier versions; if the missing data causes the migration +process to succeed but the guest to behave badly then it may be better +to send the subsection and cause the migration to explicitly fail +with the unknown subsection error. If the bad behaviour only happens +with certain data values, making the subsection conditional on +the data value (rather than the machine type) allows migrations to succeed +in most cases. In general the preference is to tie the subsection to +the machine type, and allow reliable migrations, unless the behaviour +from omission of the subsection is really bad. + += Not sending existing elements = + +Sometimes members of the VMState are no longer needed; + removing them will break migration compatibility + making them version dependent and bumping the version will break backwards + migration compatibility. + +The best way is to: + a) Add a new property/compatibility/function in the same way for subsections + above. 
+ b) replace the VMSTATE macro with the _TEST version of the macro, e.g.: + VMSTATE_UINT32(foo, barstruct) + becomes + VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz) + + Sometime in the future when we no longer care about the ancient +versions these can be killed off. + += Return path = + +In most migration scenarios there is only a single data path that runs +from the source VM to the destination, typically along a single fd (although +possibly with another fd or similar for some fast way of throwing pages across). + +However, some uses need two way communication; in particular the Postcopy +destination needs to be able to request pages on demand from the source. + +For these scenarios there is a 'return path' from the destination to the source; +qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return +path. + + Source side + Forward path - written by migration thread + Return path - opened by main thread, read by return-path thread + + Destination side + Forward path - read by main thread + Return path - opened by main thread, written by main thread AND postcopy + thread (protected by rp_mutex) + += Postcopy = +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side or the network connection causes +the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. 
+ +=== Enabling postcopy === + +To enable postcopy, issue this command on the monitor prior to the +start of migration: + +migrate_set_capability postcopy-ram on + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +migrate_start_postcopy + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Note: During the postcopy phase, the bandwidth limits set using +migrate_set_speed are ignored (to avoid delaying requested pages that +the destination is waiting for). + +=== Postcopy device transfer === + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + Command: 'postcopy listen' + The device state + A series of sections, identical to the precopy stream's device state stream + containing everything except postcopiable devices (i.e. RAM) + Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the +contents are formatted in the same way as the main migration stream. 
+ +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. + +------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 +main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) +thread | | + | (page request) + | \___ + v \ +listen thread: --- page -- page -- page -- page -- page -- + + a b c +------------------------------------------------------------------------------ + +On receipt of CMD_PACKAGED (1) + All the data associated with the package - the ( ... ) section in the +diagram - is read into memory, and the main thread recurses into +qemu_loadvm_state_main to process the contents of the package (2) +which contains commands (3,6) and devices (4...) + +On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) +a new thread (a) is started that takes over servicing the migration stream, +while the main thread carries on loading the package. It loads normal +background page data (b) but if during a device load a fault happens (5) the +returned page (c) is loaded by the listen thread allowing the main threads +device load to carry on. + +The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination +CPUs start running. 
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour +and is no longer used by migration, while the listen thread carries +on servicing page data until the end of migration. + +=== Postcopy states === + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + Advise: Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + Discard: Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + Listen: The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + Running: POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + End: The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. 
+ +=== Source side page maps === + +The source side keeps two bitmaps during postcopy; 'the migration bitmap' +and 'unsent map'. The 'migration bitmap' is basically the same as in +the precopy case, and holds a bit to indicate that page is 'dirty' - +i.e. needs sending. During the precopy phase this is updated as the CPU +dirties pages, however during postcopy the CPUs are stopped and nothing +should dirty anything any more. + +The 'unsent map' is used for the transition to postcopy. It is a bitmap that +has a bit cleared whenever a page is sent to the destination, however during +the transition to postcopy mode it is combined with the migration bitmap +to form a set of pages that: + a) Have been sent but then redirtied (which must be discarded) + b) Have not yet been sent - which also must be discarded to cause any + transparent huge pages built during precopy to be broken. + +Note that the contents of the unsentmap are sacrificed during the calculation +of the discard set and thus aren't valid once in postcopy. The dirtymap +is still valid and is used to ensure that no page is sent more than once. Any +request for a page that has already been sent is ignored. Duplicate requests +such as this can happen as a page is sent at about the same time the +destination accesses it. + +=== Postcopy with hugepages === + +Postcopy now works with hugetlbfs backed memory: + a) The linux kernel on the destination must support userfault on hugepages. + b) The huge-page configuration on the source and destination VMs must be + identical; i.e. RAMBlocks on both sides must use the same page size. + c) Note that -mem-path /dev/hugepages will fall back to allocating normal + RAM if it doesn't have enough hugepages, triggering (b) to fail. + Using -mem-prealloc enforces the allocation using hugepages. 
+ d) Care should be taken with the size of hugepage used; postcopy with 2MB + hugepages works well, however 1GB hugepages are likely to be problematic + since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, + and until the full page is transferred the destination thread is blocked. diff --git a/docs/devel/multi-thread-tcg.txt b/docs/devel/multi-thread-tcg.txt new file mode 100644 index 0000000000..a99b4564c6 --- /dev/null +++ b/docs/devel/multi-thread-tcg.txt @@ -0,0 +1,350 @@ +Copyright (c) 2015-2016 Linaro Ltd. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +Introduction +============ + +This document outlines the design for multi-threaded TCG system-mode +emulation. The current user-mode emulation mirrors the thread +structure of the translated executable. Some of the work will be +applicable to both system and linux-user emulation. + +The original system-mode TCG implementation was single threaded and +dealt with multiple CPUs with simple round-robin scheduling. This +simplified a lot of things but became increasingly limited as systems +being emulated gained additional cores and per-core performance gains +for host systems started to level off. + +vCPU Scheduling +=============== + +We introduce a new running mode where each vCPU will run on its own +user-space thread. This will be enabled by default for all FE/BE +combinations that have had the required work done to support this +safely. + +In the general case of running translated code there should be no +inter-vCPU dependencies and all vCPUs should be able to run at full +speed. Synchronisation will only be required while accessing internal +shared data structures or when the emulated architecture requires a +coherent representation of the emulated machine state. 
+ +Shared Data Structures +====================== + +Main Run Loop +------------- + +Even when there is no code being generated there are a number of +structures associated with the hot-path through the main run-loop. +These are associated with looking up the next translation block to +execute. These include: + + tb_jmp_cache (per-vCPU, cache of recent jumps) + tb_ctx.htable (global hash table, phys address->tb lookup) + +As TB linking only occurs when blocks are in the same page this code +is critical to performance as looking up the next TB to execute is the +most common reason to exit the generated code. + +DESIGN REQUIREMENT: Make access to lookup structures safe with +multiple reader/writer threads. Minimise any lock contention to do it. + +The hot-path avoids using locks where possible. The tb_jmp_cache is +updated with atomic accesses to ensure consistent results. The fall +back QHT based hash table is also designed for lockless lookups. Locks +are only taken when code generation is required or TranslationBlocks +have their block-to-block jumps patched. + +Global TCG State +---------------- + +We need to protect the entire code generation cycle including any post +generation patching of the translated code. This also implies a shared +translation buffer which contains code running on all cores. Any +execution path that comes to the main run loop will need to hold a +mutex for code generation. This also includes times when we need flush +code or entries from any shared lookups/caches. Structures held on a +per-vCPU basis won't need locking unless other vCPUs will need to +modify them. + +DESIGN REQUIREMENT: Add locking around all code generation and TB +patching. + +(Current solution) + +Mainly as part of the linux-user work all code generation is +serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the +place of mmap_lock() in linux-user. 
+ +Translation Blocks +------------------ + +Currently the whole system shares a single code generation buffer +which when full will force a flush of all translations and start from +scratch again. Some operations also force a full flush of translations +including: + + - debugging operations (breakpoint insertion/removal) + - some CPU helper functions + +This is done with the async_safe_run_on_cpu() mechanism to ensure all +vCPUs are quiescent when changes are being made to shared global +structures. + +More granular translation invalidation events are typically due +to a change of the state of a physical page: + + - code modification (self modify code, patching code) + - page changes (new page mapping in linux-user mode) + +While setting the invalid flag in a TranslationBlock will stop it +being used when looked up in the hot-path there are a number of other +book-keeping structures that need to be safely cleared. + +Any TranslationBlocks which have been patched to jump directly to the +now invalid blocks need the jump patches reversing so they will return +to the C code. + +There are a number of look-up caches that need to be properly updated +including the: + + - jump lookup cache + - the physical-to-tb lookup hash table + - the global page table + +The global page table (l1_map) which provides a multi-level look-up +for PageDesc structures which contain pointers to the start of a +linked list of all Translation Blocks in that page (see page_next). + +Both the jump patching and the page cache involve linked lists that +the invalidated TranslationBlock needs to be removed from. + +DESIGN REQUIREMENT: Safely handle invalidation of TBs + - safely patch/revert direct jumps + - remove central PageDesc lookup entries + - ensure lookup caches/hashes are safely updated + +(Current solution) + +The direct jump themselves are updated atomically by the TCG +tb_set_jmp_target() code. 
Modifications to the linked lists that allow +searching for linked pages are done under the protection of the +tb_lock(). + +The global page table is protected by the tb_lock() in system-mode and +mmap_lock() in linux-user mode. + +The lookup caches are updated atomically and the lookup hash uses QHT +which is designed for concurrent safe lookup. + + +Memory maps and TLBs +-------------------- + +The memory handling code is fairly critical to the speed of memory +access in the emulated system. The SoftMMU code is designed so the +hot-path can be handled entirely within translated code. This is +handled with a per-vCPU TLB structure which once populated will allow +a series of accesses to the page to occur without exiting the +translated code. It is possible to set flags in the TLB address which +will ensure the slow-path is taken for each access. This can be done +to support: + + - Memory regions (dividing up access to PIO, MMIO and RAM) + - Dirty page tracking (for code gen, SMC detection, migration and display) + - Virtual TLB (for translating guest address->real address) + +When the TLB tables are updated by a vCPU thread other than their own +we need to ensure it is done in a safe way so no inconsistent state is +seen by the vCPU thread. + +Some operations require updating a number of vCPUs TLBs at the same +time in a synchronised manner. 
+ +DESIGN REQUIREMENTS: + + - TLB Flush All/Page + - can be across-vCPUs + - cross vCPU TLB flush may need other vCPU brought to halt + - change may need to be visible to the calling vCPU immediately + - TLB Flag Update + - usually cross-vCPU + - want change to be visible as soon as possible + - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs) + - This is a per-vCPU table - by definition can't race + - updated by its own thread when the slow-path is forced + +(Current solution) + +We have updated cputlb.c to defer operations when a cross-vCPU +operation with async_run_on_cpu() which ensures each vCPU sees a +coherent state when it next runs its work (in a few instructions +time). + +A new set of operations (tlb_flush_*_all_cpus) take an additional flag +which when set will force synchronisation by setting the source vCPUs +work as "safe work" and exiting the cpu run loop. This ensures that by the +time execution restarts all flush operations have completed. + +TLB flag updates are all done atomically and are also protected by the +tb_lock() which is used by the functions that update the TLB in bulk. + +(Known limitation) + +Not really a limitation but the wait mechanism is overly strict for +some architectures which only need flushes completed by a barrier +instruction. This could be a future optimisation. + +Emulated hardware state +----------------------- + +Currently thanks to KVM work any access to IO memory is automatically +protected by the global iothread mutex, also known as the BQL (Big +Qemu Lock). Any IO region that doesn't use global mutex is expected to +do its own locking. + +However IO memory isn't the only way emulated hardware state can be +modified. Some architectures have model specific registers that +trigger hardware emulation features. Generally any translation helper +that needs to update more than a single vCPU's state should take the +BQL. 
+ +As the BQL, or global iothread mutex is shared across the system we +push the use of the lock as far down into the TCG code as possible to +minimise contention. + +(Current solution) + +MMIO access automatically serialises hardware emulation by way of the +BQL. Currently ARM targets serialise all ARM_CP_IO register accesses +and also defer the reset/startup of vCPUs to the vCPU context by way +of async_run_on_cpu(). + +Updates to interrupt state are also protected by the BQL as they can +often be cross vCPU. + +Memory Consistency +================== + +Between emulated guests and host systems there are a range of memory +consistency models. Even emulating weakly ordered systems on strongly +ordered hosts needs to ensure things like store-after-load re-ordering +can be prevented when the guest wants to. + +Memory Barriers +--------------- + +Barriers (sometimes known as fences) provide a mechanism for software +to enforce a particular ordering of memory operations from the point +of view of external observers (e.g. another processor core). They can +apply to any memory operations as well as just loads or stores. + +The Linux kernel has an excellent write-up on the various forms of +memory barrier and the guarantees they can provide [1]. + +Barriers are often wrapped around synchronisation primitives to +provide explicit memory ordering semantics. However they can be used +by themselves to provide safe lockless access by ensuring for example +a change to a signal flag will only be visible once the changes to +payload are. + +DESIGN REQUIREMENT: Add a new tcg_memory_barrier op + +This would enforce a strong load/store ordering so all loads/stores +complete at the memory barrier. On single-core non-SMP strongly +ordered backends this could become a NOP. + +Aside from explicit standalone memory barrier instructions there are +also implicit memory ordering semantics which comes with each guest +memory access instruction. 
For example all x86 load/stores come with +fairly strong guarantees of sequential consistency where as ARM has +special variants of load/store instructions that imply acquire/release +semantics. + +In the case of a strongly ordered guest architecture being emulated on +a weakly ordered host the scope for a heavy performance impact is +quite high. + +DESIGN REQUIREMENTS: Be efficient with use of memory barriers + - host systems with stronger implied guarantees can skip some barriers + - merge consecutive barriers to the strongest one + +(Current solution) + +The system currently has a tcg_gen_mb() which will add memory barrier +operations if code generation is being done in a parallel context. The +tcg_optimize() function attempts to merge barriers up to their +strongest form before any load/store operations. The solution was +originally developed and tested for linux-user based systems. All +backends have been converted to emit fences when required. So far the +following front-ends have been updated to emit fences when required: + + - target-i386 + - target-arm + - target-aarch64 + - target-alpha + - target-mips + +Memory Control and Maintenance +------------------------------ + +This includes a class of instructions for controlling system cache +behaviour. While QEMU doesn't model cache behaviour these instructions +are often seen when code modification has taken place to ensure the +changes take effect. + +Synchronisation Primitives +-------------------------- + +There are two broad types of synchronisation primitives found in +modern ISAs: atomic instructions and exclusive regions. + +The first type offer a simple atomic instruction which will guarantee +some sort of test and conditional store will be truly atomic w.r.t. +other cores sharing access to the memory. The classic example is the +x86 cmpxchg instruction. 
+ +The second type offers a pair of load/store instructions which offer a +guarantee that a region of memory has not been touched between the +load and store instructions. An example of this is ARM's ldrex/strex +pair where the strex instruction will return a flag indicating a +successful store only if no other CPU has accessed the memory region +since the ldrex. + +Traditionally TCG has generated a series of operations that work +because they are within the context of a single translation block so +will have completed before another CPU is scheduled. However with +the ability to have multiple threads running to emulate multiple CPUs +we will need to explicitly expose these semantics. + +DESIGN REQUIREMENTS: + - Support classic atomic instructions + - Support load/store exclusive (or load link/store conditional) pairs + - Generic enough infrastructure to support all guest architectures +CURRENT OPEN QUESTIONS: + - How problematic is the ABA problem in general? + +(Current solution) + +The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which +can be used directly or combined to emulate other instructions like +ARM's ldrex/strex instructions. While they are susceptible to the ABA +problem so far common guests have not implemented patterns where +this may be a problem - typically presenting a locking ABI which +assumes cmpxchg like semantics. + +The code also includes a fall-back for cases where multi-threaded TCG +ops can't work (e.g. guest atomic width > host atomic width). In this +case an EXCP_ATOMIC exit occurs and the instruction is emulated with +an exclusive lock which ensures all emulation is serialised. + +While the atomic helpers look good enough for now there may be a need +to look at solutions that can more closely model the guest +architecture's semantics. 
+ +========== + +[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt diff --git a/docs/devel/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt new file mode 100644 index 0000000000..e4d340bbb7 --- /dev/null +++ b/docs/devel/multiple-iothreads.txt @@ -0,0 +1,137 @@ +Copyright (c) 2014 Red Hat Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + + +This document explains the IOThread feature and how to write code that runs +outside the QEMU global mutex. + +The main loop and IOThreads +--------------------------- +QEMU is an event-driven program that can do several things at once using an +event loop. The VNC server and the QMP monitor are both processed from the +same event loop, which monitors their file descriptors until they become +readable and then invokes a callback. + +The default event loop is called the main loop (see main-loop.c). It is +possible to create additional event loop threads using -object +iothread,id=my-iothread. + +Side note: The main loop and IOThread are both event loops but their code is +not shared completely. Sometimes it is useful to remember that although they +are conceptually similar they are currently not interchangeable. + +Why IOThreads are useful +------------------------ +IOThreads allow the user to control the placement of work. The main loop is a +scalability bottleneck on hosts with many CPUs. Work can be spread across +several IOThreads instead of just one main loop. When set up correctly this +can improve I/O latency and reduce jitter seen by the guest. + +The main loop is also deeply associated with the QEMU global mutex, which is a +scalability bottleneck in itself. vCPU threads and the main loop use the QEMU +global mutex to serialize execution of QEMU code. This mutex is necessary +because a lot of QEMU's code historically was not thread-safe. 
+ +The fact that all I/O processing is done in a single main loop and that the +QEMU global mutex is contended by all vCPU threads and the main loop explain +why it is desirable to place work into IOThreads. + +The experimental virtio-blk data-plane implementation has been benchmarked and +shows these effects: +ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf + +How to program for IOThreads +---------------------------- +The main difference between legacy code and new code that can run in an +IOThread is dealing explicitly with the event loop object, AioContext +(see include/block/aio.h). Code that only works in the main loop +implicitly uses the main loop's AioContext. Code that supports running +in IOThreads must be aware of its AioContext. + +AioContext supports the following services: + * File descriptor monitoring (read/write/error on POSIX hosts) + * Event notifiers (inter-thread signalling) + * Timers + * Bottom Halves (BH) deferred callbacks + +There are several old APIs that use the main loop AioContext: + * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor + * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier + * LEGACY timer_new_ms() - create a timer + * LEGACY qemu_bh_new() - create a BH + * LEGACY qemu_aio_wait() - run an event loop iteration + +Since they implicitly work on the main loop they cannot be used in code that +runs in an IOThread. They might cause a crash or deadlock if called from an +IOThread since the QEMU global mutex is not held. + +Instead, use the AioContext functions directly (see include/block/aio.h): + * aio_set_fd_handler() - monitor a file descriptor + * aio_set_event_notifier() - monitor an event notifier + * aio_timer_new() - create a timer + * aio_bh_new() - create a BH + * aio_poll() - run an event loop iteration + +The AioContext can be obtained from the IOThread using +iothread_get_aio_context() or for the main loop using qemu_get_aio_context(). 
+Code that takes an AioContext argument works both in IOThreads or the main +loop, depending on which AioContext instance the caller passes in. + +How to synchronize with an IOThread +----------------------------------- +AioContext is not thread-safe so some rules must be followed when using file +descriptors, event notifiers, timers, or BHs across threads: + +1. AioContext functions can always be called safely. They handle their +own locking internally. + +2. Other threads wishing to access the AioContext must use +aio_context_acquire()/aio_context_release() for mutual exclusion. Once the +context is acquired no other thread can access it or run event loop iterations +in this AioContext. + +aio_context_acquire()/aio_context_release() calls may be nested. This +means you can call them if you're not sure whether #2 applies. + +There is currently no lock ordering rule if a thread needs to acquire multiple +AioContexts simultaneously. Therefore, it is only safe for code holding the +QEMU global mutex to acquire other AioContexts. + +Side note: the best way to schedule a function call across threads is to call +aio_bh_schedule_oneshot(). No acquire/release or locking is needed. + +AioContext and the block layer +------------------------------ +The AioContext originates from the QEMU block layer, even though nowadays +AioContext is a generic event loop that can be used by any QEMU subsystem. + +The block layer has support for AioContext integrated. Each BlockDriverState +is associated with an AioContext using bdrv_set_aio_context() and +bdrv_get_aio_context(). This allows block layer code to process I/O inside the +right AioContext. Other subsystems may wish to follow a similar approach. + +Block layer code must therefore expect to run in an IOThread and avoid using +old APIs that implicitly use the main loop. See the "How to program for +IOThreads" above for information on how to do that. 
+ +If main loop code such as a QMP function wishes to access a BlockDriverState +it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure +that callbacks in the IOThread do not run in parallel. + +Code running in the monitor typically needs to ensure that past +requests from the guest are completed. When a block device is running +in an IOThread, the IOThread can also process requests from the guest +(via ioeventfd). To achieve both objects, wrap the code between +bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained +section". The functions must be called between aio_context_acquire() +and aio_context_release(). You can freely release and re-acquire the +AioContext within a drained section. + +Long-running jobs (usually in the form of coroutines) are best scheduled in +the BlockDriverState's AioContext to avoid the need to acquire/release around +each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier, +or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends, +can be used to get a notification whenever bdrv_set_aio_context() moves a +BlockDriverState to a different AioContext. diff --git a/docs/devel/qapi-code-gen.txt b/docs/devel/qapi-code-gen.txt new file mode 100644 index 0000000000..52e3874efe --- /dev/null +++ b/docs/devel/qapi-code-gen.txt @@ -0,0 +1,1310 @@ += How to use the QAPI code generator = + +Copyright IBM Corp. 2011 +Copyright (C) 2012-2016 Red Hat, Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +== Introduction == + +QAPI is a native C API within QEMU which provides management-level +functionality to internal and external users. For external +users/processes, this interface is made available by a JSON-based wire +format for the QEMU Monitor Protocol (QMP) for controlling qemu, as +well as the QEMU Guest Agent (QGA) for communicating with the guest. 
+The remainder of this document uses "Client JSON Protocol" when +referring to the wire contents of a QMP or QGA connection. + +To map Client JSON Protocol interfaces to the native C QAPI +implementations, a JSON-based schema is used to define types and +function signatures, and a set of scripts is used to generate types, +signatures, and marshaling/dispatch code. This document will describe +how the schemas, scripts, and resulting code are used. + + +== QMP/Guest agent schema == + +A QAPI schema file is designed to be loosely based on JSON +(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style +and the use of comments; a QAPI schema file is then parsed by a python +code generation program. A valid QAPI schema consists of a series of +top-level expressions, with no commas between them. Where +dictionaries (JSON objects) are used, they are parsed as python +OrderedDicts so that ordering is preserved (for predictable layout of +generated C structs and parameter lists). Ordering doesn't matter +between top-level expressions or the keys within an expression, but +does matter within dictionary values for 'data' and 'returns' members +of a single expression. QAPI schema input is written using 'single +quotes' instead of JSON's "double quotes" (in contrast, Client JSON +Protocol uses no comments, and while input accepts 'single quotes' as +an extension, output is strict JSON using only "double quotes"). As +in JSON, trailing commas are not permitted in arrays or dictionaries. +Input must be ASCII (although QMP supports full Unicode strings, the +QAPI parser does not). At present, there is no place where a QAPI +schema requires the use of JSON numbers or null. + + +=== Comments === + +Comments are allowed; anything between an unquoted # and the following +newline is ignored. + +A multi-line comment that starts and ends with a '##' line is a +documentation comment. 
These are parsed by the documentation +generator, which recognizes certain markup detailed below. + + +==== Documentation markup ==== + +Comment text starting with '=' is a section title: + + # = Section title + +Double the '=' for a subsection title: + + # == Subsection title + +'|' denotes examples: + + # | Text of the example, may span + # | multiple lines + +'*' starts an itemized list: + + # * First item, may span + # multiple lines + # * Second item + +You can also use '-' instead of '*'. + +A decimal number followed by '.' starts a numbered list: + + # 1. First item, may span + # multiple lines + # 2. Second item + +The actual number doesn't matter. You could even use '*' instead of +'2.' for the second item. + +Lists can't be nested. Blank lines are currently not supported within +lists. + +Additional whitespace between the initial '#' and the comment text is +permitted. + +*foo* and _foo_ are for strong and emphasis styles respectively (they +do not work over multiple lines). @foo is used to reference a name in +the schema. + +Example: + +## +# = Section +# == Subsection +# +# Some text foo with *strong* and _emphasis_ +# 1. with a list +# 2. like that +# +# And some code: +# | $ echo foo +# | -> do this +# | <- get that +# +## + + +==== Expression documentation ==== + +Each expression that isn't an include directive may be preceded by a +documentation block. Such blocks are called expression documentation +blocks. + +When documentation is required (see pragma 'doc-required'), expression +documentation blocks are mandatory. + +The documentation block consists of a first line naming the +expression, an optional overview, a description of each argument (for +commands and events) or member (for structs, unions and alternates), +and optional tagged sections. + +FIXME: the parser accepts these things in almost any order. + +Extensions added after the expression was first released carry a +'(since x.y.z)' comment.
+ +A tagged section starts with one of the following words: +"Note:"/"Notes:", "Since:", "Example:"/"Examples:", "Returns:", "TODO:". +The section ends with the start of a new section. + +A 'Since: x.y.z' tagged section lists the release that introduced the +expression. + +For example: + +## +# @BlockStats: +# +# Statistics of a virtual block device or a block backing device. +# +# @device: If the stats are for a virtual block device, the name +# corresponding to the virtual block device. +# +# @node-name: The node name of the device. (since 2.3) +# +# ... more members ... +# +# Since: 0.14.0 +## +{ 'struct': 'BlockStats', + 'data': {'*device': 'str', '*node-name': 'str', + ... more members ... } } + +## +# @query-blockstats: +# +# Query the @BlockStats for all virtual block devices. +# +# @query-nodes: If true, the command will query all the +# block nodes ... explain, explain ... (since 2.3) +# +# Returns: A list of @BlockStats for each virtual block device. +# +# Since: 0.14.0 +# +# Example: +# +# -> { "execute": "query-blockstats" } +# <- { +# ... lots of output ... +# } +# +## +{ 'command': 'query-blockstats', + 'data': { '*query-nodes': 'bool' }, + 'returns': ['BlockStats'] } + +==== Free-form documentation ==== + +A documentation block that isn't an expression documentation block is +a free-form documentation block. These may be used to provide +additional text and structuring content. + + +=== Schema overview === + +The schema sets up a series of types, as well as commands and events +that will use those types. Forward references are allowed: the parser +scans in two passes, where the first pass learns all type names, and +the second validates the schema and generates the code. This allows +the definition of complex structs that can have mutually recursive +types, and allows for indefinite nesting of Client JSON Protocol that +satisfies the schema. A type name should not be defined more than +once.
It is permissible for the schema to contain additional types +not used by any commands or events in the Client JSON Protocol, for +the side effect of generated C code used internally. + +There are eight top-level expressions recognized by the parser: +'include', 'pragma', 'command', 'struct', 'enum', 'union', +'alternate', and 'event'. There are several groups of types: simple +types (a number of built-in types, such as 'int' and 'str'; as well as +enumerations), complex types (structs and two flavors of unions), and +alternate types (a choice between other types). The 'command' and +'event' expressions can refer to existing types by name, or list an +anonymous type as a dictionary. Listing a type name inside an array +refers to a single-dimension array of that type; multi-dimension +arrays are not directly supported (although an array of a complex +struct that contains an array member is possible). + +All names must begin with a letter, and contain only ASCII letters, +digits, hyphen, and underscore. There are two exceptions: enum values +may start with a digit, and names that are downstream extensions (see +section Downstream extensions) start with underscore. + +Names beginning with 'q_' are reserved for the generator, which uses +them for munging QMP names that resemble C keywords or other +problematic strings. For example, a member named "default" in qapi +becomes "q_default" in the generated C code. + +Types, commands, and events share a common namespace. Therefore, +generally speaking, type definitions should always use CamelCase for +user-defined type names, while built-in types are lowercase. + +Type names ending with 'Kind' or 'List' are reserved for the +generator, which uses them for implicit union enums and array types, +respectively. + +Command names, and member names within a type, should be all lower +case with words separated by a hyphen. 
However, some existing older +commands and complex types use underscore; when extending such +expressions, consistency is preferred over blindly avoiding +underscore. + +Event names should be ALL_CAPS with words separated by underscore. + +Member names starting with 'has-' or 'has_' are reserved for the +generator, which uses them for tracking optional members. + +Any name (command, event, type, member, or enum value) beginning with +"x-" is marked experimental, and may be withdrawn or changed +incompatibly in a future release. + +Pragma 'name-case-whitelist' lets you violate the rules on use of +upper and lower case. Use for new code is strongly discouraged. + +In the rest of this document, usage lines are given for each +expression type, with literal strings written in lower case and +placeholders written in capitals. If a literal string includes a +prefix of '*', that key/value pair can be omitted from the expression. +For example, a usage statement that includes '*base':STRUCT-NAME +means that an expression has an optional key 'base', which if present +must have a value that forms a struct name. 
+ + +=== Built-in Types === + +The following types are predefined, and map to C as follows: + + Schema C JSON + str char * any JSON string, UTF-8 + number double any JSON number + int int64_t a JSON number without fractional part + that fits into the C integer type + int8 int8_t likewise + int16 int16_t likewise + int32 int32_t likewise + int64 int64_t likewise + uint8 uint8_t likewise + uint16 uint16_t likewise + uint32 uint32_t likewise + uint64 uint64_t likewise + size uint64_t like uint64_t, except StringInputVisitor + accepts size suffixes + bool bool JSON true or false + any QObject * any JSON value + QType QType JSON string matching enum QType values + + +=== Include directives === + +Usage: { 'include': STRING } + +The QAPI schema definitions can be modularized using the 'include' directive: + + { 'include': 'path/to/file.json' } + +The directive is evaluated recursively, and include paths are relative to the +file using the directive. Multiple includes of the same file are +idempotent. No other keys should appear in the expression, and the include +value should be a string. + +As a matter of style, it is a good idea to have all files be +self-contained, but at the moment, nothing prevents an included file +from making a forward reference to a type that is only introduced by +an outer file. The parser may be made stricter in the future to +prevent incomplete include files. + + +=== Pragma directives === + +Usage: { 'pragma': DICT } + +The pragma directive lets you control optional generator behavior. +The dictionary's entries are pragma names and values. + +Pragma's scope is currently the complete schema. Setting the same +pragma to different values in parts of the schema doesn't work. + +Pragma 'doc-required' takes a boolean value. If true, documentation +is required. Default is false. + +Pragma 'returns-whitelist' takes a list of command names that may +violate the rules on permitted return types. Default is none. 
+ +Pragma 'name-case-whitelist' takes a list of names that may violate +rules on use of upper- vs. lower-case letters. Default is none. + + +=== Struct types === + +Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME } + +A struct is a dictionary containing a single 'data' key whose value is +a dictionary; the dictionary may be empty. This corresponds to a +struct in C or an Object in JSON. Each value of the 'data' dictionary +must be the name of a type, or a one-element array containing a type +name. An example of a struct is: + + { 'struct': 'MyType', + 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } } + +The use of '*' as a prefix to the name means the member is optional in +the corresponding JSON protocol usage. + +The default initialization value of an optional argument should not be changed +between versions of QEMU unless the new default maintains backward +compatibility to the user-visible behavior of the old default. + +With proper documentation, this policy still allows some flexibility; for +example, documenting that a default of 0 picks an optimal buffer size allows +one release to declare the optimal size at 512 while another release declares +the optimal size at 4096 - the user-visible behavior is not the bytes used by +the buffer, but the fact that the buffer was optimal size. + +On input structures (only mentioned in the 'data' side of a command), changing +from mandatory to optional is safe (older clients will supply the option, and +newer clients can benefit from the default); changing from optional to +mandatory is backwards incompatible (older clients may be omitting the option, +and must continue to work). 
+ +On output structures (only mentioned in the 'returns' side of a command), +changing from mandatory to optional is in general unsafe (older clients may be +expecting the member, and could crash if it is missing), although it +can be done if the only way that the optional argument will be omitted +is when it is triggered by the presence of a new input flag to the +command that older clients don't know to send. Changing from optional +to mandatory is safe. + +A structure that is used in both input and output of various commands +must consider the backwards compatibility constraints of both directions +of use. + +A struct definition can specify another struct as its base. +In this case, the members of the base type are included as top-level members +of the new struct's dictionary in the Client JSON Protocol wire +format. An example definition is: + + { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } + { 'struct': 'BlockdevOptionsGenericCOWFormat', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { '*backing': 'str' } } + +An example BlockdevOptionsGenericCOWFormat object on the wire could use +both members like this: + + { "file": "/some/place/my-image", + "backing": "/some/place/my-backing-file" } + + +=== Enumeration types === + +Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING } + { 'enum': STRING, '*prefix': STRING, 'data': ARRAY-OF-STRING } + +An enumeration type is a dictionary containing a single 'data' key +whose value is a list of strings. An example enumeration is: + + { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] } + +Nothing prevents an empty enumeration, although it is probably not +useful. The list of strings should be lower case; if an enum name +represents multiple words, use '-' between words. The string 'max' is +not allowed as an enum value, and values should not be repeated. + +The enum constants will be named by using a heuristic to turn the +type name into a set of underscore separated words. 
For the example +above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name +of 'MY_ENUM_VALUE1' for the first value. If the default heuristic +does not result in a desirable name, the optional 'prefix' member +can be used when defining the enum. + +The enumeration values are passed as strings over the Client JSON +Protocol, but are encoded as C enum integral values in generated code. +While the C code starts numbering at 0, it is better to use explicit +comparisons to enum values than implicit comparisons to 0; the C code +will also include a generated enum member ending in _MAX for tracking +the size of the enum, useful when using common functions for +converting between strings and enum values. Since the wire format +always passes by name, it is acceptable to reorder or add new +enumeration members in any location without breaking clients of Client +JSON Protocol; however, removing enum values would break +compatibility. For any struct that has a member that will only contain +a finite set of string values, using an enum type for that member is +better than open-coding the member to be type 'str'. + + +=== Union types === + +Usage: { 'union': STRING, 'data': DICT } +or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME-OR-DICT, + 'discriminator': ENUM-MEMBER-OF-BASE } + +Union types are used to let the user choose between several different +variants for an object. There are two flavors: simple (no +discriminator or base), and flat (both discriminator and base). A union +type is defined using a data dictionary as explained in the following +paragraphs. The data dictionary for either type of union must not +be empty. 
+ +A simple union type defines a mapping from automatic discriminator +values to data types like in this example: + + { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str' } } + { 'struct': 'BlockdevOptionsQcow2', + 'data': { 'backing': 'str', '*lazy-refcounts': 'bool' } } + + { 'union': 'BlockdevOptionsSimple', + 'data': { 'file': 'BlockdevOptionsFile', + 'qcow2': 'BlockdevOptionsQcow2' } } + +In the Client JSON Protocol, a simple union is represented by a +dictionary that contains the 'type' member as a discriminator, and a +'data' member that is of the specified data type corresponding to the +discriminator value, as in these examples: + + { "type": "file", "data": { "filename": "/some/place/my-image" } } + { "type": "qcow2", "data": { "backing": "/some/place/my-image", + "lazy-refcounts": true } } + +The generated C code uses a struct containing a union. Additionally, +an implicit C enum 'NameKind' is created, corresponding to the union +'Name', for accessing the various branches of the union. No branch of +the union can be named 'max', as this would collide with the implicit +enum. The value for each branch can be of any type. + +A flat union definition avoids nesting on the wire, and specifies a +set of common members that occur in all variants of the union. The +'base' key must specify either a type name (the type must be a +struct, not a union), or a dictionary representing an anonymous type. +All branches of the union must be complex types, and the top-level +members of the union dictionary on the wire will be a combination of +members from both the base type and the appropriate branch type (when +merging two dictionaries, there must be no keys in common). The +'discriminator' member must be the name of a non-optional enum-typed +member of the base struct.
+ +The following example enhances the above simple union example by +adding an optional common member 'read-only', renaming the +discriminator to something more applicable than the simple union's +default of 'type', and reducing the number of {} required on the wire: + + { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] } + { 'union': 'BlockdevOptions', + 'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' }, + 'discriminator': 'driver', + 'data': { 'file': 'BlockdevOptionsFile', + 'qcow2': 'BlockdevOptionsQcow2' } } + +Resulting in these JSON objects: + + { "driver": "file", "read-only": true, + "filename": "/some/place/my-image" } + { "driver": "qcow2", "read-only": false, + "backing": "/some/place/my-image", "lazy-refcounts": true } + +Notice that in a flat union, the discriminator name is controlled by +the user, but because it must map to a base member with enum type, the +code generator can ensure that branches exist for all values of the +enum (although the order of the keys need not match the declaration of +the enum). In the resulting generated C data types, a flat union is +represented as a struct with the base members included directly, and +then a union of structures for each branch of the struct. + +A simple union can always be re-written as a flat union where the base +class has a single member named 'type', and where each branch of the +union has a struct with a single member named 'data'. 
That is, + + { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } } + +is identical on the wire to: + + { 'enum': 'Enum', 'data': ['one', 'two'] } + { 'struct': 'Branch1', 'data': { 'data': 'str' } } + { 'struct': 'Branch2', 'data': { 'data': 'int' } } + { 'union': 'Flat', 'base': { 'type': 'Enum' }, 'discriminator': 'type', + 'data': { 'one': 'Branch1', 'two': 'Branch2' } } + + +=== Alternate types === + +Usage: { 'alternate': STRING, 'data': DICT } + +An alternate type is one that allows a choice between two or more JSON +data types (string, integer, number, or object, but currently not +array) on the wire. The definition is similar to a simple union type, +where each branch of the union names a QAPI type. For example: + + { 'alternate': 'BlockdevRef', + 'data': { 'definition': 'BlockdevOptions', + 'reference': 'str' } } + +Unlike a union, the discriminator string is never passed on the wire +for the Client JSON Protocol. Instead, the value's JSON type serves +as an implicit discriminator, which in turn means that an alternate +can only express a choice between types represented differently in +JSON. If a branch is typed as the 'bool' built-in, the alternate +accepts true and false; if it is typed as any of the various numeric +built-ins, it accepts a JSON number; if it is typed as a 'str' +built-in or named enum type, it accepts a JSON string; and if it is +typed as a complex type (struct or union), it accepts a JSON object. +Two different complex types, for instance, aren't permitted, because +both are represented as a JSON object.
+ +The example alternate declaration above allows using both of the +following example objects: + + { "file": "my_existing_block_device_id" } + { "file": { "driver": "file", + "read-only": false, + "filename": "/tmp/mydisk.qcow2" } } + + +=== Commands === + +Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, + '*returns': TYPE-NAME, '*boxed': true, + '*gen': false, '*success-response': false } + +Commands are defined by using a dictionary containing several members, +where three members are most common. The 'command' member is a +mandatory string, and determines the "execute" value passed in a +Client JSON Protocol command exchange. + +The 'data' argument maps to the "arguments" dictionary passed in as +part of a Client JSON Protocol command. The 'data' member is optional +and defaults to {} (an empty dictionary). If present, it must be the +string name of a complex type, or a dictionary that declares an +anonymous type with the same semantics as a 'struct' expression. + +The 'returns' member describes what will appear in the "return" member +of a Client JSON Protocol reply on successful completion of a command. +The member is optional from the command declaration; if absent, the +"return" member will be an empty dictionary. If 'returns' is present, +it must be the string name of a complex or built-in type, or a +one-element array containing the name of a complex or built-in type. +To return anything else, you have to list the command in pragma +'returns-whitelist'. If you do this, the command cannot be extended +to return additional information in the future. Use of +'returns-whitelist' for new commands is strongly discouraged. + +All commands in Client JSON Protocol use a dictionary to report +failure, with no way to specify that in QAPI.
Where the error return +is different than the usual GenericError class in order to help the +client react differently to certain error conditions, it is worth +documenting this in the comments before the command declaration. + +Some example commands: + + { 'command': 'my-first-command', + 'data': { 'arg1': 'str', '*arg2': 'str' } } + { 'struct': 'MyType', 'data': { '*value': 'str' } } + { 'command': 'my-second-command', + 'returns': [ 'MyType' ] } + +which would validate this Client JSON Protocol transaction: + + => { "execute": "my-first-command", + "arguments": { "arg1": "hello" } } + <= { "return": { } } + => { "execute": "my-second-command" } + <= { "return": [ { "value": "one" }, { } ] } + +The generator emits a prototype for the user's function implementing +the command. Normally, 'data' is a dictionary for an anonymous type, +or names a struct type (possibly empty, but not a union), and its +members are passed as separate arguments to this function. If the +command definition includes a key 'boxed' with the boolean value true, +then 'data' is instead the name of any non-empty complex type +(struct, union, or alternate), and a pointer to that QAPI type is +passed as a single argument. + +The generator also emits a marshalling function that extracts +arguments for the user's function out of an input QDict, calls the +user's function, and if it succeeded, builds an output QObject from +its return value. + +In rare cases, QAPI cannot express a type-safe representation of a +corresponding Client JSON Protocol command. You then have to suppress +generation of a marshalling function by including a key 'gen' with +boolean value false, and instead write your own function. Please try +to avoid adding new commands that rely on this, and instead use +type-safe unions. 
For an example of this usage: + + { 'command': 'netdev_add', + 'data': {'type': 'str', 'id': 'str'}, + 'gen': false } + +Normally, the QAPI schema is used to describe synchronous exchanges, +where a response is expected. But in some cases, the action of a +command is expected to change state in a way that a successful +response is not possible (although the command will still return a +normal dictionary error on failure). When a successful reply is not +possible, the command expression should include the optional key +'success-response' with boolean value false. So far, only QGA makes +use of this member. + + +=== Events === + +Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, + '*boxed': true } + +Events are defined with the keyword 'event'. It is not allowed to +name an event 'MAX', since the generator also produces a C enumeration +of all event names with a generated _MAX value at the end. When +'data' is also specified, additional info will be included in the +event, with similar semantics to a 'struct' expression. Finally there +will be C API generated in qapi-event.h; when called by QEMU code, a +message with timestamp will be emitted on the wire. + +An example event is: + +{ 'event': 'EVENT_C', + 'data': { '*a': 'int', 'b': 'str' } } + +Resulting in this JSON object: + +{ "event": "EVENT_C", + "data": { "b": "test string" }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + +The generator emits a function to send the event. Normally, 'data' is +a dictionary for an anonymous type, or names a struct type (possibly +empty, but not a union), and its members are passed as separate +arguments to this function. If the event definition includes a key +'boxed' with the boolean value true, then 'data' is instead the name of +any non-empty complex type (struct, union, or alternate), and a +pointer to that QAPI type is passed as a single argument. 
+ + +=== Downstream extensions === + +QAPI schema names that are externally visible, say in the Client JSON +Protocol, need to be managed with care. Names starting with a +downstream prefix of the form __RFQDN_ are reserved for the downstream +who controls the valid, reverse fully qualified domain name RFQDN. +RFQDN may only contain ASCII letters, digits, hyphen and period. + +Example: Red Hat, Inc. controls redhat.com, and may therefore add a +downstream command __com.redhat_drive-mirror. + + +== Client JSON Protocol introspection == + +Clients of a Client JSON Protocol commonly need to figure out what +exactly the server (QEMU) supports. + +For this purpose, QMP provides introspection via command +query-qmp-schema. QGA currently doesn't support introspection. + +While Client JSON Protocol wire compatibility should be maintained +between qemu versions, we cannot make the same guarantees for +introspection stability. For example, one version of qemu may provide +a non-variant optional member of a struct, and a later version rework +the member to instead be non-optional and associated with a variant. +Likewise, one version of qemu may list a member with open-ended type +'str', and a later version could convert it to a finite set of strings +via an enum type; or a member may be converted from a specific type to +an alternate that represents a choice between the original type and +something else. + +query-qmp-schema returns a JSON array of SchemaInfo objects. These +objects together describe the wire ABI, as defined in the QAPI schema. +There is no specified order to the SchemaInfo objects returned; a +client must search for a particular name throughout the entire array +to learn more about that name, but is at least guaranteed that there +will be no collisions between type, command, and event names. + +However, the SchemaInfo can't reflect all the rules and restrictions +that apply to QMP. 
It's interface introspection (figuring out what's +there), not interface specification. The specification is in the QAPI +schema. To understand how QMP is to be used, you need to study the +QAPI schema. + +Like any other command, query-qmp-schema is itself defined in the QAPI +schema, along with the SchemaInfo type. This text attempts to give an +overview of how things work. For details you need to consult the QAPI +schema. + +SchemaInfo objects have common members "name" and "meta-type", and +additional variant members depending on the value of meta-type. + +Each SchemaInfo object describes a wire ABI entity of a certain +meta-type: a command, event or one of several kinds of type. + +SchemaInfo for commands and events have the same name as in the QAPI +schema. + +Command and event names are part of the wire ABI, but type names are +not. Therefore, the SchemaInfo for types have auto-generated +meaningless names. For readability, the examples in this section use +meaningful type names instead. + +To examine a type, start with a command or event using it, then follow +references by name. + +QAPI schema definitions not reachable that way are omitted. + +The SchemaInfo for a command has meta-type "command", and variant +members "arg-type" and "ret-type". On the wire, the "arguments" +member of a client's "execute" command must conform to the object type +named by "arg-type". The "return" member that the server passes in a +success response conforms to the type named by "ret-type". + +If the command takes no arguments, "arg-type" names an object type +without members. Likewise, if the command returns nothing, "ret-type" +names an object type without members. + +Example: the SchemaInfo for command query-qmp-schema + + { "name": "query-qmp-schema", "meta-type": "command", + "arg-type": "q_empty", "ret-type": "SchemaInfoList" } + + Type "q_empty" is an automatic object type without members, and type + "SchemaInfoList" is the array of SchemaInfo type.
+ +The SchemaInfo for an event has meta-type "event", and variant member +"arg-type". On the wire, a "data" member that the server passes in an +event conforms to the object type named by "arg-type". + +If the event carries no additional information, "arg-type" names an +object type without members. The event may not have a data member on +the wire then. + +Each command or event defined with dictionary-valued 'data' in the +QAPI schema implicitly defines an object type. + +Example: the SchemaInfo for EVENT_C from section Events + + { "name": "EVENT_C", "meta-type": "event", + "arg-type": "q_obj-EVENT_C-arg" } + + Type "q_obj-EVENT_C-arg" is an implicitly defined object type with + the two members from the event's definition. + +The SchemaInfo for struct and union types has meta-type "object". + +The SchemaInfo for a struct type has variant member "members". + +The SchemaInfo for a union type additionally has variant members "tag" +and "variants". + +"members" is a JSON array describing the object's common members, if +any. Each element is a JSON object with members "name" (the member's +name), "type" (the name of its type), and optionally "default". The +member is optional if "default" is present. Currently, "default" can +only have value null. Other values are reserved for future +extensions. The "members" array is in no particular order; clients +must search the entire object when learning whether a particular +member is supported. + +Example: the SchemaInfo for MyType from section Struct types + + { "name": "MyType", "meta-type": "object", + "members": [ + { "name": "member1", "type": "str" }, + { "name": "member2", "type": "int" }, + { "name": "member3", "type": "str", "default": null } ] } + +"tag" is the name of the common member serving as type tag. +"variants" is a JSON array describing the object's variant members. 
+Each element is a JSON object with members "case" (the value of type +tag this element applies to) and "type" (the name of an object type +that provides the variant members for this type tag value). The +"variants" array is in no particular order, and is not guaranteed to +list cases in the same order as the corresponding "tag" enum type. + +Example: the SchemaInfo for flat union BlockdevOptions from section +Union types + + { "name": "BlockdevOptions", "meta-type": "object", + "members": [ + { "name": "driver", "type": "BlockdevDriver" }, + { "name": "read-only", "type": "bool", "default": null } ], + "tag": "driver", + "variants": [ + { "case": "file", "type": "BlockdevOptionsFile" }, + { "case": "qcow2", "type": "BlockdevOptionsQcow2" } ] } + +Note that base types are "flattened": their members are included in the +"members" array. + +A simple union implicitly defines an enumeration type for its implicit +discriminator (called "type" on the wire, see section Union types). + +A simple union implicitly defines an object type for each of its +variants. + +Example: the SchemaInfo for simple union BlockdevOptionsSimple from section +Union types + + { "name": "BlockdevOptionsSimple", "meta-type": "object", + "members": [ + { "name": "type", "type": "BlockdevOptionsSimpleKind" } ], + "tag": "type", + "variants": [ + { "case": "file", "type": "q_obj-BlockdevOptionsFile-wrapper" }, + { "case": "qcow2", "type": "q_obj-BlockdevOptionsQcow2-wrapper" } ] } + + Enumeration type "BlockdevOptionsSimpleKind" and the object types + "q_obj-BlockdevOptionsFile-wrapper", "q_obj-BlockdevOptionsQcow2-wrapper" + are implicitly defined. + +The SchemaInfo for an alternate type has meta-type "alternate", and +variant member "members". "members" is a JSON array. Each element is +a JSON object with member "type", which names a type. Values of the +alternate type conform to exactly one of its member types. There is +no guarantee on the order in which "members" will be listed.
+ +Example: the SchemaInfo for BlockdevRef from section Alternate types + + { "name": "BlockdevRef", "meta-type": "alternate", + "members": [ + { "type": "BlockdevOptions" }, + { "type": "str" } ] } + +The SchemaInfo for an array type has meta-type "array", and variant +member "element-type", which names the array's element type. Array +types are implicitly defined. For convenience, the array's name may +resemble the element type; however, clients should examine member +"element-type" instead of making assumptions based on parsing member +"name". + +Example: the SchemaInfo for ['str'] + + { "name": "[str]", "meta-type": "array", + "element-type": "str" } + +The SchemaInfo for an enumeration type has meta-type "enum" and +variant member "values". The values are listed in no particular +order; clients must search the entire enum when learning whether a +particular value is supported. + +Example: the SchemaInfo for MyEnum from section Enumeration types + + { "name": "MyEnum", "meta-type": "enum", + "values": [ "value1", "value2", "value3" ] } + +The SchemaInfo for a built-in type has the same name as the type in +the QAPI schema (see section Built-in Types), with one exception +detailed below. It has variant member "json-type" that shows how +values of this type are encoded on the wire. + +Example: the SchemaInfo for str + + { "name": "str", "meta-type": "builtin", "json-type": "string" } + +The QAPI schema supports a number of integer types that only differ in +how they map to C. They are identical as far as SchemaInfo is +concerned. Therefore, they get all mapped to a single type "int" in +SchemaInfo. + +As explained above, type names are not part of the wire ABI. Not even +the names of built-in types. Clients should examine member +"json-type" instead of hard-coding names of built-in types. 
+ + +== Code generation == + +Schemas are fed into five scripts to generate all the code/files that, +paired with the core QAPI libraries, comprise everything required to +take JSON commands read in by a Client JSON Protocol server, unmarshal +the arguments into the underlying C types, call into the corresponding +C function, map the response back to a Client JSON Protocol response +to be returned to the user, and introspect the commands. + +As an example, we'll use the following schema, which describes a +single complex user-defined type, along with a command which takes a +list of that type as a parameter, and returns a single element of that +type. The user is responsible for writing the implementation of +qmp_my_command(); everything else is produced by the generator. + + $ cat example-schema.json + { 'struct': 'UserDefOne', + 'data': { 'integer': 'int', '*string': 'str' } } + + { 'command': 'my-command', + 'data': { 'arg1': ['UserDefOne'] }, + 'returns': 'UserDefOne' } + + { 'event': 'MY_EVENT' } + +For a more thorough look at generated code, the testsuite includes +tests/qapi-schema/qapi-schema-tests.json that covers more examples of +what the generator will accept, and compiles the resulting C code as +part of 'make check-unit'. + +=== scripts/qapi-types.py === + +Used to generate the C types defined by a schema, along with +supporting code. The following files are created: + +$(prefix)qapi-types.h - C types corresponding to types defined in + the schema you pass in +$(prefix)qapi-types.c - Cleanup functions for the above C types + +The $(prefix) is an optional parameter used as a namespace to keep the +generated code from one schema/code-generation separated from others so code +can be generated/used from multiple schemas without clobbering previously +created code.
+ +Example: + + $ python scripts/qapi-types.py --output-dir="qapi-generated" \ + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-types.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_TYPES_H + #define EXAMPLE_QAPI_TYPES_H + +[Built-in types omitted...] + + typedef struct UserDefOne UserDefOne; + + typedef struct UserDefOneList UserDefOneList; + + struct UserDefOne { + int64_t integer; + bool has_string; + char *string; + }; + + void qapi_free_UserDefOne(UserDefOne *obj); + + struct UserDefOneList { + UserDefOneList *next; + UserDefOne *value; + }; + + void qapi_free_UserDefOneList(UserDefOneList *obj); + + #endif + $ cat qapi-generated/example-qapi-types.c +[Uninteresting stuff omitted...] + + void qapi_free_UserDefOne(UserDefOne *obj) + { + Visitor *v; + + if (!obj) { + return; + } + + v = qapi_dealloc_visitor_new(); + visit_type_UserDefOne(v, NULL, &obj, NULL); + visit_free(v); + } + + void qapi_free_UserDefOneList(UserDefOneList *obj) + { + Visitor *v; + + if (!obj) { + return; + } + + v = qapi_dealloc_visitor_new(); + visit_type_UserDefOneList(v, NULL, &obj, NULL); + visit_free(v); + } + +=== scripts/qapi-visit.py === + +Used to generate the visitor functions used to walk through and +convert between a native QAPI C data structure and some other format +(such as QObject); the generated functions are named visit_type_FOO() +and visit_type_FOO_members(). + +The following files are generated: + +$(prefix)qapi-visit.c: visitor function for a particular C type, used + to automagically convert QObjects into the + corresponding C type and vice-versa, as well + as for deallocating memory for an existing C + type + +$(prefix)qapi-visit.h: declarations for previously mentioned visitor + functions + +Example: + + $ python scripts/qapi-visit.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-visit.h +[Uninteresting stuff omitted...] 
+ + #ifndef EXAMPLE_QAPI_VISIT_H + #define EXAMPLE_QAPI_VISIT_H + +[Visitors for built-in types omitted...] + + void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp); + void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp); + void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp); + + #endif + $ cat qapi-generated/example-qapi-visit.c +[Uninteresting stuff omitted...] + + void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp) + { + Error *err = NULL; + + visit_type_int(v, "integer", &obj->integer, &err); + if (err) { + goto out; + } + if (visit_optional(v, "string", &obj->has_string)) { + visit_type_str(v, "string", &obj->string, &err); + if (err) { + goto out; + } + } + + out: + error_propagate(errp, err); + } + + void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp) + { + Error *err = NULL; + + visit_start_struct(v, name, (void **)obj, sizeof(UserDefOne), &err); + if (err) { + goto out; + } + if (!*obj) { + goto out_obj; + } + visit_type_UserDefOne_members(v, *obj, &err); + if (err) { + goto out_obj; + } + visit_check_struct(v, &err); + out_obj: + visit_end_struct(v, (void **)obj); + if (err && visit_is_input(v)) { + qapi_free_UserDefOne(*obj); + *obj = NULL; + } + out: + error_propagate(errp, err); + } + + void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp) + { + Error *err = NULL; + UserDefOneList *tail; + size_t size = sizeof(**obj); + + visit_start_list(v, name, (GenericList **)obj, size, &err); + if (err) { + goto out; + } + + for (tail = *obj; tail; + tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) { + visit_type_UserDefOne(v, NULL, &tail->value, &err); + if (err) { + break; + } + } + + visit_end_list(v, (void **)obj); + if (err && visit_is_input(v)) { + qapi_free_UserDefOneList(*obj); + *obj = NULL; + } + out: + 
error_propagate(errp, err); + } + +=== scripts/qapi-commands.py === + +Used to generate the marshaling/dispatch functions for the commands +defined in the schema. The generated code implements +qmp_marshal_COMMAND() (registered automatically), and declares +qmp_COMMAND() that the user must implement. The following files are +generated: + +$(prefix)qmp-marshal.c: command marshal/dispatch functions for each + QMP command defined in the schema. Functions + generated by qapi-visit.py are used to + convert QObjects received from the wire into + function parameters, and the same + visitor functions convert native C return + values to QObjects for transmission back + over the wire. + +$(prefix)qmp-commands.h: Function prototypes for the QMP commands + specified in the schema. + +Example: + + $ python scripts/qapi-commands.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qmp-commands.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QMP_COMMANDS_H + #define EXAMPLE_QMP_COMMANDS_H + + #include "example-qapi-types.h" + #include "qapi/qmp/qdict.h" + #include "qapi/error.h" + + UserDefOne *qmp_my_command(UserDefOneList *arg1, Error **errp); + + #endif + $ cat qapi-generated/example-qmp-marshal.c +[Uninteresting stuff omitted...]
+ + static void qmp_marshal_output_UserDefOne(UserDefOne *ret_in, QObject **ret_out, Error **errp) + { + Error *err = NULL; + Visitor *v; + + v = qobject_output_visitor_new(ret_out); + visit_type_UserDefOne(v, "unused", &ret_in, &err); + if (!err) { + visit_complete(v, ret_out); + } + error_propagate(errp, err); + visit_free(v); + v = qapi_dealloc_visitor_new(); + visit_type_UserDefOne(v, "unused", &ret_in, NULL); + visit_free(v); + } + + static void qmp_marshal_my_command(QDict *args, QObject **ret, Error **errp) + { + Error *err = NULL; + UserDefOne *retval; + Visitor *v; + UserDefOneList *arg1 = NULL; + + v = qobject_input_visitor_new(QOBJECT(args)); + visit_start_struct(v, NULL, NULL, 0, &err); + if (err) { + goto out; + } + visit_type_UserDefOneList(v, "arg1", &arg1, &err); + if (!err) { + visit_check_struct(v, &err); + } + visit_end_struct(v, NULL); + if (err) { + goto out; + } + + retval = qmp_my_command(arg1, &err); + if (err) { + goto out; + } + + qmp_marshal_output_UserDefOne(retval, ret, &err); + + out: + error_propagate(errp, err); + visit_free(v); + v = qapi_dealloc_visitor_new(); + visit_start_struct(v, NULL, NULL, 0, NULL); + visit_type_UserDefOneList(v, "arg1", &arg1, NULL); + visit_end_struct(v, NULL); + visit_free(v); + } + + static void qmp_init_marshal(void) + { + qmp_register_command("my-command", qmp_marshal_my_command, QCO_NO_OPTIONS); + } + + qapi_init(qmp_init_marshal); + +=== scripts/qapi-event.py === + +Used to generate the event-related C code defined by a schema, with +implementations for qapi_event_send_FOO(). The following files are +created: + +$(prefix)qapi-event.h - Function prototypes for each event type, plus an + enumeration of all event names +$(prefix)qapi-event.c - Implementation of functions to send an event + +Example: + + $ python scripts/qapi-event.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-event.h +[Uninteresting stuff omitted...] 
+ + #ifndef EXAMPLE_QAPI_EVENT_H + #define EXAMPLE_QAPI_EVENT_H + + #include "qapi/error.h" + #include "qapi/qmp/qdict.h" + #include "example-qapi-types.h" + + + void qapi_event_send_my_event(Error **errp); + + typedef enum example_QAPIEvent { + EXAMPLE_QAPI_EVENT_MY_EVENT = 0, + EXAMPLE_QAPI_EVENT__MAX = 1, + } example_QAPIEvent; + + extern const char *const example_QAPIEvent_lookup[]; + + #endif + $ cat qapi-generated/example-qapi-event.c +[Uninteresting stuff omitted...] + + void qapi_event_send_my_event(Error **errp) + { + QDict *qmp; + Error *err = NULL; + QMPEventFuncEmit emit; + emit = qmp_event_get_func_emit(); + if (!emit) { + return; + } + + qmp = qmp_event_build_dict("MY_EVENT"); + + emit(EXAMPLE_QAPI_EVENT_MY_EVENT, qmp, &err); + + error_propagate(errp, err); + QDECREF(qmp); + } + + const char *const example_QAPIEvent_lookup[] = { + [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT", + [EXAMPLE_QAPI_EVENT__MAX] = NULL, + }; + +=== scripts/qapi-introspect.py === + +Used to generate the introspection C code for a schema. The following +files are created: + +$(prefix)qmp-introspect.c - Defines a string holding a JSON + description of the schema. +$(prefix)qmp-introspect.h - Declares the above string. + +Example: + + $ python scripts/qapi-introspect.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qmp-introspect.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QMP_INTROSPECT_H + #define EXAMPLE_QMP_INTROSPECT_H + + extern const char example_qmp_schema_json[]; + + #endif + $ cat qapi-generated/example-qmp-introspect.c +[Uninteresting stuff omitted...] 
+ + const char example_qmp_schema_json[] = "[" + "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, " + "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, " + "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, " + "{\"members\": [{\"name\": \"arg1\", \"type\": \"[2]\"}], \"meta-type\": \"object\", \"name\": \"1\"}, " + "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"default\": null, \"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, " + "{\"element-type\": \"2\", \"meta-type\": \"array\", \"name\": \"[2]\"}, " + "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, " + "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]"; diff --git a/docs/devel/rcu.txt b/docs/devel/rcu.txt new file mode 100644 index 0000000000..c84e7f42b2 --- /dev/null +++ b/docs/devel/rcu.txt @@ -0,0 +1,390 @@ +Using RCU (Read-Copy-Update) for synchronization +================================================ + +Read-copy update (RCU) is a synchronization mechanism that is used to +protect read-mostly data structures. RCU is very efficient and scalable +on the read side (it is wait-free), and thus can make the read paths +extremely fast. + +RCU supports concurrency between a single writer and multiple readers, +thus it is not used alone. Typically, the write-side will use a lock to +serialize multiple updates, but other approaches are possible (e.g., +restricting updates to a single task). In QEMU, when a lock is used, +this will often be the "iothread mutex", also known as the "big QEMU +lock" (BQL). Also, restricting updates to a single task is done in +QEMU using the "bottom half" API. + +RCU is fundamentally a "wait-to-finish" mechanism. 
The read side marks +sections of code with "critical sections", and the update side will wait +for the execution of all *currently running* critical sections before +proceeding, or before asynchronously executing a callback. + +The key point here is that only the currently running critical sections +are waited for; critical sections that are started _after_ the beginning +of the wait do not extend the wait, despite running concurrently with +the updater. This is the reason why RCU is more scalable than, +for example, reader-writer locks. It is so much more scalable that +the system will have a single instance of the RCU mechanism; a single +mechanism can be used for an arbitrary number of "things", without +having to worry about things such as contention or deadlocks. + +How is this possible? The basic idea is to split updates in two phases, +"removal" and "reclamation". During removal, we ensure that subsequent +readers will not be able to get a reference to the old data. After +removal has completed, a critical section will not be able to access +the old data. Therefore, critical sections that begin after removal +do not matter; as soon as all previous critical sections have finished, +there cannot be any readers who hold references to the data structure, +and these can now be safely reclaimed (e.g., freed or unref'ed). + +Here is a picture: + + thread 1 thread 2 thread 3 + ------------------- ------------------------ ------------------- + enter RCU crit.sec. + | finish removal phase + | begin wait + | | enter RCU crit.sec. + exit RCU crit.sec | | + complete wait | + begin reclamation phase | + exit RCU crit.sec. + + +Note how thread 3 is still executing its critical section when thread 2 +starts reclaiming data. This is possible, because the old version of the +data structure was not accessible at the time thread 3 began executing +that critical section. 
+ + +RCU API +======= + +The core RCU API is small: + + void rcu_read_lock(void); + + Used by a reader to inform the reclaimer that the reader is + entering an RCU read-side critical section. + + void rcu_read_unlock(void); + + Used by a reader to inform the reclaimer that the reader is + exiting an RCU read-side critical section. Note that RCU + read-side critical sections may be nested and/or overlapping. + + void synchronize_rcu(void); + + Blocks until all pre-existing RCU read-side critical sections + on all threads have completed. This marks the end of the removal + phase and the beginning of reclamation phase. + + Note that it would be valid for another update to come while + synchronize_rcu is running. Because of this, it is better that + the updater releases any locks it may hold before calling + synchronize_rcu. If this is not possible (for example, because + the updater is protected by the BQL), you can use call_rcu. + + void call_rcu1(struct rcu_head * head, + void (*func)(struct rcu_head *head)); + + This function invokes func(head) after all pre-existing RCU + read-side critical sections on all threads have completed. This + marks the end of the removal phase, with func taking care + asynchronously of the reclamation phase. + + The foo struct needs to have an rcu_head structure added, + perhaps as follows: + + struct foo { + struct rcu_head rcu; + int a; + char b; + long c; + }; + + so that the reclaimer function can fetch the struct foo address + and free it: + + call_rcu1(&foo.rcu, foo_reclaim); + + void foo_reclaim(struct rcu_head *rp) + { + struct foo *fp = container_of(rp, struct foo, rcu); + g_free(fp); + } + + For the common case where the rcu_head member is the first of the + struct, you can use the following macro. 
+ + void call_rcu(T *p, + void (*func)(T *p), + field-name); + void g_free_rcu(T *p, + field-name); + + call_rcu1 is typically used through these macros, in the common case + where the "struct rcu_head" is the first field in the struct. If + the callback function is g_free, in particular, g_free_rcu can be + used. In the above case, one could have written simply: + + g_free_rcu(&foo, rcu); + + typeof(*p) atomic_rcu_read(p); + + atomic_rcu_read() is similar to atomic_mb_read(), but it makes + some assumptions on the code that calls it. This allows a more + optimized implementation. + + atomic_rcu_read assumes that whenever a single RCU critical + section reads multiple shared data, these reads are either + data-dependent or need no ordering. This is almost always the + case when using RCU, because read-side critical sections typically + navigate one or more pointers (the pointers that are changed on + every update) until reaching a data structure of interest, + and then read from there. + + RCU read-side critical sections must use atomic_rcu_read() to + read data, unless concurrent writes are prevented by another + synchronization mechanism. + + Furthermore, RCU read-side critical sections should traverse the + data structure in a single direction, opposite to the direction + in which the updater initializes it. + + void atomic_rcu_set(p, typeof(*p) v); + + atomic_rcu_set() is also similar to atomic_mb_set(), and it also + makes assumptions on the code that calls it in order to allow a more + optimized implementation. + + In particular, atomic_rcu_set() suffices for synchronization + with readers, if the updater never mutates a field within a + data item that is already accessible to readers. This is the + case when initializing a new copy of the RCU-protected data + structure; just ensure that initialization of *p is carried out + before atomic_rcu_set() makes the data item visible to readers.
+ If this rule is observed, writes will happen in the opposite + order as reads in the RCU read-side critical sections (or if + there is just one update), and there will be no need for any other + synchronization mechanism to coordinate the accesses. + +The following APIs must be used before RCU is used in a thread: + + void rcu_register_thread(void); + + Mark a thread as taking part in the RCU mechanism. Such a thread + will have to report quiescent points regularly, either manually + or through the QemuCond/QemuSemaphore/QemuEvent APIs. + + void rcu_unregister_thread(void); + + Mark a thread as not taking part anymore in the RCU mechanism. + It is not a problem if such a thread reports quiescent points, + either manually or by using the QemuCond/QemuSemaphore/QemuEvent + APIs. + +Note that these APIs are relatively heavyweight, and should _not_ be +nested. + + +DIFFERENCES WITH LINUX +====================== + +- Waiting on a mutex is possible, though discouraged, within an RCU critical + section. This is because spinlocks are rarely (if ever) used in userspace + programming; not allowing this would prevent upgrading an RCU read-side + critical section to become an updater. + +- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and + rcu_assign_pointer. They take a _pointer_ to the variable being accessed. + +- call_rcu is a macro that has an extra argument (the name of the first + field in the struct, which must be a struct rcu_head), and expects the + type of the callback's argument to be the type of the first argument. + call_rcu1 is the same as Linux's call_rcu. + + +RCU PATTERNS +============ + +Many patterns using reader-writer locks translate directly to RCU, with +the advantages of higher scalability and deadlock immunity. + +In general, RCU can be used whenever it is possible to create a new +"version" of a data structure every time the updater runs.
This may +sound like a very strict restriction, however: + +- the updater does not mean "everything that writes to a data structure", + but rather "everything that involves a reclamation step". See the + array example below + +- in some cases, creating a new version of a data structure may actually + be very cheap. For example, modifying the "next" pointer of a singly + linked list is effectively creating a new version of the list. + +Here are some frequently-used RCU idioms that are worth noting. + + +RCU list processing +------------------- + +TBD (not yet used in QEMU) + + +RCU reference counting +---------------------- + +Because grace periods are not allowed to complete while there is an RCU +read-side critical section in progress, the RCU read-side primitives +may be used as a restricted reference-counting mechanism. For example, +consider the following code fragment: + + rcu_read_lock(); + p = atomic_rcu_read(&foo); + /* do something with p. */ + rcu_read_unlock(); + +The RCU read-side critical section ensures that the value of "p" remains +valid until after the rcu_read_unlock(). In some sense, it is acquiring +a reference to p that is later released when the critical section ends. +The write side looks simply like this (with appropriate locking): + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + synchronize_rcu(); + free(old); + +If the processing cannot be done purely within the critical section, it +is possible to combine this idiom with a "real" reference count: + + rcu_read_lock(); + p = atomic_rcu_read(&foo); + foo_ref(p); + rcu_read_unlock(); + /* do something with p. 
*/ + foo_unref(p); + +The write side can be like this: + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + synchronize_rcu(); + foo_unref(old); + +or with call_rcu: + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + call_rcu(old, foo_unref, rcu); + +In both cases, the write side only performs removal. Reclamation +happens when the last reference to a "foo" object is dropped. +Using synchronize_rcu() is undesirably expensive, because the +last reference may be dropped on the read side. Hence you can +use call_rcu() instead: + + foo_unref(struct foo *p) { + if (atomic_fetch_dec(&p->refcount) == 1) { + call_rcu(p, foo_destroy, rcu); + } + } + + +Note that the same idioms would be possible with reader/writer +locks: + + read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); + p = foo; p = foo; + /* do something with p. */ foo = new; + read_unlock(&foo_rwlock); free(p); + write_mutex_unlock(&foo_rwlock); + free(p); + + ------------------------------------------------------------------ + + read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); + p = foo; old = foo; + foo_ref(p); foo = new; + read_unlock(&foo_rwlock); foo_unref(old); + /* do something with p. */ write_mutex_unlock(&foo_rwlock); + read_lock(&foo_rwlock); + foo_unref(p); + read_unlock(&foo_rwlock); + +foo_unref could use a mechanism such as bottom halves to move deallocation +out of the write-side critical section. + + +RCU resizable arrays +-------------------- + +Resizable arrays can be used with RCU. The expensive RCU synchronization +(or call_rcu) only needs to take place when the array is resized. +The two items to take care of are: + +- ensuring that the old version of the array is available between removal + and reclamation; + +- avoiding mismatches in the read side between the array data and the + array size. + +The first problem is avoided simply by not using realloc.
Instead, +each resize will allocate a new array and copy the old data into it. +The second problem would arise if the size and the data pointers were +two members of a larger struct: + + struct mystuff { + ... + int data_size; + int data_alloc; + T *data; + ... + }; + +Instead, we store the size of the array with the array itself: + + struct arr { + int size; + int alloc; + T data[]; + }; + struct arr *global_array; + + read side: + rcu_read_lock(); + struct arr *array = atomic_rcu_read(&global_array); + x = i < array->size ? array->data[i] : -1; + rcu_read_unlock(); + return x; + + write side (running under a lock): + if (global_array->size == global_array->alloc) { + /* Creating a new version. */ + new_array = g_malloc(sizeof(struct arr) + + global_array->alloc * 2 * sizeof(T)); + new_array->size = global_array->size; + new_array->alloc = global_array->alloc * 2; + memcpy(new_array->data, global_array->data, + global_array->alloc * sizeof(T)); + + /* Removal phase. */ + old_array = global_array; + atomic_rcu_set(&global_array, new_array); + synchronize_rcu(); + + /* Reclamation phase. */ + free(old_array); + } + + +SOURCES +======= + +* Documentation/RCU/ from the Linux kernel diff --git a/docs/devel/tracing.txt b/docs/devel/tracing.txt new file mode 100644 index 0000000000..8c0029beca --- /dev/null +++ b/docs/devel/tracing.txt @@ -0,0 +1,442 @@ += Tracing = + +== Introduction == + +This document describes the tracing infrastructure in QEMU and how to use it +for debugging, profiling, and observing execution. + +== Quickstart == + +1. Build with the 'simple' trace backend: + + ./configure --enable-trace-backends=simple + make + +2. Create a file with the events you want to trace: + + echo bdrv_aio_readv > /tmp/events + echo bdrv_aio_writev >> /tmp/events + +3. Run the virtual machine to produce a trace file: + + qemu -trace events=/tmp/events ... # your normal QEMU invocation + +4.
Pretty-print the binary trace file: + + ./scripts/simpletrace.py trace-events-all trace-* # Override * with QEMU <pid> + +== Trace events == + +=== Sub-directory setup === + +Each directory in the source tree can declare a set of static trace events +in a local "trace-events" file. All directories which contain "trace-events" +files must be listed in the "trace-events-subdirs" make variable in the top +level Makefile.objs. During build, the "trace-events" file in each listed +subdirectory will be processed by the "tracetool" script to generate code for +the trace events. + +The individual "trace-events" files are merged into a "trace-events-all" file, +which is also installed into "/usr/share/qemu" with the name "trace-events". +This merged file is to be used by the "simpletrace.py" script to later analyse +traces in the simpletrace data format. + +In the sub-directory the following files will be automatically generated + + - trace.c - the trace event state declarations + - trace.h - the trace event enums and probe functions + - trace-dtrace.h - DTrace event probe specification + - trace-dtrace.dtrace - DTrace event probe helper declaration + - trace-dtrace.o - binary DTrace provider (generated by dtrace) + - trace-ust.h - UST event probe helper declarations + +Source files in the sub-directory should #include the local 'trace.h' file, +without any sub-directory path prefix. eg io/channel-buffer.c would do + + #include "trace.h" + +To access the 'io/trace.h' file. While it is possible to include a trace.h +file from outside a source files' own sub-directory, this is discouraged in +general. It is strongly preferred that all events be declared directly in +the sub-directory that uses them. The only exception is where there are some +shared trace events defined in the top level directory trace-events file. +The top level directory generates trace files with a filename prefix of +"trace-root" instead of just "trace". 
This is to avoid ambiguity between +a trace.h in the current directory, vs the top level directory. + +=== Using trace events === + +Trace events are invoked directly from source code like this: + + #include "trace.h" /* needed for trace event prototype */ + + void *qemu_vmalloc(size_t size) + { + void *ptr; + size_t align = QEMU_VMALLOC_ALIGN; + + if (size < align) { + align = getpagesize(); + } + ptr = qemu_memalign(align, size); + trace_qemu_vmalloc(size, ptr); + return ptr; + } + +=== Declaring trace events === + +The "tracetool" script produces the trace.h header file which is included by +every source file that uses trace events. Since many source files include +trace.h, it uses a minimum of types and other header files included to keep the +namespace clean and compile times and dependencies down. + +Trace events should use types as follows: + + * Use stdint.h types for fixed-size types. Most offsets and guest memory + addresses are best represented with uint32_t or uint64_t. Use fixed-size + types over primitive types whose size may change depending on the host + (32-bit versus 64-bit) so trace events don't truncate values or break + the build. + + * Use void * for pointers to structs or for arrays. The trace.h header + cannot include all user-defined struct declarations and it is therefore + necessary to use void * for pointers to structs. + + * For everything else, use primitive scalar types (char, int, long) with the + appropriate signedness. + +Format strings should reflect the types defined in the trace event. Take +special care to use PRId64 and PRIu64 for int64_t and uint64_t types, +respectively. This ensures portability between 32- and 64-bit platforms. + +Each event declaration will start with the event name, then its arguments, +finally a format string for pretty-printing. For example: + + qemu_vmalloc(size_t size, void *ptr) "size %zu ptr %p" + qemu_vfree(void *ptr) "ptr %p" + + +=== Hints for adding new trace events === + +1. 
Trace state changes in the code. Interesting points in the code usually + involve a state change like starting, stopping, allocating, freeing. State + changes are good trace events because they can be used to understand the + execution of the system. + +2. Trace guest operations. Guest I/O accesses like reading device registers + are good trace events because they can be used to understand guest + interactions. + +3. Use correlator fields so the context of an individual line of trace output + can be understood. For example, trace the pointer returned by malloc and + used as an argument to free. This way mallocs and frees can be matched up. + Trace events with no context are not very useful. + +4. Name trace events after their function. If there are multiple trace events + in one function, append a unique distinguisher at the end of the name. + +== Generic interface and monitor commands == + +You can programmatically query and control the state of trace events through a +backend-agnostic interface provided by the header "trace/control.h". + +Note that some of the backends do not provide an implementation for some parts +of this interface, in which case QEMU will just print a warning (please refer to +header "trace/control.h" to see which routines are backend-dependent). + +The state of events can also be queried and modified through monitor commands: + +* info trace-events + View available trace events and their state. State 1 means enabled, state 0 + means disabled. + +* trace-event NAME on|off + Enable/disable a given trace event or a group of events (using wildcards). + +The "-trace events=<file>" command line argument can be used to enable the +events listed in <file> from the very beginning of the program. This file must +contain one event name per line. + +If a line in the "-trace events=<file>" file begins with a '-', the trace event +will be disabled instead of enabled. 
This is useful when a wildcard was used +to enable an entire family of events but one noisy event needs to be disabled. + +Wildcard matching is supported in both the monitor command "trace-event" and the +events list file. That means you can enable/disable the events having a common +prefix in a batch. For example, virtio-blk trace events could be enabled using +the following monitor command: + + trace-event virtio_blk_* on + +== Trace backends == + +The "tracetool" script automates tedious trace event code generation and also +keeps the trace event declarations independent of the trace backend. The trace +events are not tightly coupled to a specific trace backend, such as LTTng or +SystemTap. Support for trace backends can be added by extending the "tracetool" +script. + +The trace backends are chosen at configure time: + + ./configure --enable-trace-backends=simple + +For a list of supported trace backends, try ./configure --help or see below. +If multiple backends are enabled, the trace is sent to them all. + +If no backends are explicitly selected, configure will default to the +"log" backend. + +The following subsections describe the supported trace backends. + +=== Nop === + +The "nop" backend generates empty trace event functions so that the compiler +can optimize out trace events completely. This imposes no performance +penalty. + +Note that regardless of the selected trace backend, events with the "disable" +property will be generated with the "nop" backend. + +=== Log === + +The "log" backend sends trace events directly to standard error. This +effectively turns trace events into debug printfs. + +This is the simplest backend and can be used together with existing code that +uses DPRINTF(). + +=== Simpletrace === + +The "simple" backend supports common use cases and comes as part of the QEMU +source tree. It may not be as powerful as platform-specific or third-party +trace backends but it is portable. 
This is the recommended trace backend +unless you have specific needs for more advanced backends. + +=== Ftrace === + +The "ftrace" backend writes trace data to the ftrace marker. This effectively +sends trace events to the ftrace ring buffer, and you can compare qemu trace +data and kernel (especially kvm.ko when using KVM) trace data. + +If you use KVM, enable kvm events in ftrace: + + # echo 1 > /sys/kernel/debug/tracing/events/kvm/enable + +After running qemu as the root user, you can get the trace: + + # cat /sys/kernel/debug/tracing/trace + +Restriction: "ftrace" backend is restricted to Linux only. + +=== Syslog === + +The "syslog" backend sends trace events using the POSIX syslog API. The log +is opened specifying the LOG_DAEMON facility and LOG_PID option (so events +are tagged with the pid of the particular QEMU process that generated +them). All events are logged at LOG_INFO level. + +NOTE: syslog may squash duplicate consecutive trace events and apply rate + limiting. + +Restriction: "syslog" backend is restricted to POSIX compliant OS. + +==== Monitor commands ==== + +* trace-file on|off|flush|set <path> + Enable/disable/flush the trace file or set the trace file name. + +==== Analyzing trace files ==== + +The "simple" backend produces binary trace files that can be formatted with the +simpletrace.py script. The script takes the "trace-events-all" file and the +binary trace: + + ./scripts/simpletrace.py trace-events-all trace-12345 + +You must ensure that the same "trace-events-all" file was used to build QEMU, +otherwise trace event declarations may have changed and output will not be +consistent. + +=== LTTng Userspace Tracer === + +The "ust" backend uses the LTTng Userspace Tracer library. There are no +monitor commands built into QEMU, instead UST utilities should be used to list, +enable/disable, and dump traces. + +Package lttng-tools is required for userspace tracing. 
You must ensure that the +current user belongs to the "tracing" group, or manually launch the +lttng-sessiond daemon for the current user prior to running any instance of +QEMU. + +While running an instrumented QEMU, LTTng should be able to list all available +events: + + lttng list -u + +Create tracing session: + + lttng create mysession + +Enable events: + + lttng enable-event qemu:g_malloc -u + +Where the events can either be a comma-separated list of events, or "-a" to +enable all tracepoint events. Start and stop tracing as needed: + + lttng start + lttng stop + +View the trace: + + lttng view + +Destroy tracing session: + + lttng destroy + +Babeltrace can be used at any later time to view the trace: + + babeltrace $HOME/lttng-traces/mysession-<date>-<time> + +=== SystemTap === + +The "dtrace" backend uses DTrace sdt probes but has only been tested with +SystemTap. When SystemTap support is detected a .stp file with wrapper probes +is generated to make use in scripts more convenient. This step can also be +performed manually after a build in order to change the binary name in the .stp +probes: + + scripts/tracetool.py --backends=dtrace --format=stap \ + --binary path/to/qemu-binary \ + --target-type system \ + --target-name x86_64 \ + <trace-events-all >qemu.stp + +== Trace event properties == + +Each event in the "trace-events-all" file can be prefixed with a space-separated +list of zero or more of the following event properties. + +=== "disable" === + +If a specific trace event is going to be invoked a huge number of times, this +might have a noticeable performance impact even when the event is +programmatically disabled. + +In this case you should declare such event with the "disable" property. This +will effectively disable the event at compile time (by using the "nop" backend), +thus having no performance impact at all on regular builds (i.e., unless you +edit the "trace-events-all" file). 
+ +In addition, there might be cases where relatively complex computations must be +performed to generate values that are only used as arguments for a trace +function. In these cases you can use the macro 'TRACE_${EVENT_NAME}_ENABLED' to +guard such computations and avoid their compilation when the event is disabled: + + #include "trace.h" /* needed for trace event prototype */ + + void *qemu_vmalloc(size_t size) + { + void *ptr; + size_t align = QEMU_VMALLOC_ALIGN; + + if (size < align) { + align = getpagesize(); + } + ptr = qemu_memalign(align, size); + if (TRACE_QEMU_VMALLOC_ENABLED) { /* preprocessor macro */ + void *complex; + /* some complex computations to produce the 'complex' value */ + trace_qemu_vmalloc(size, ptr, complex); + } + return ptr; + } + +You can check both if the event has been disabled and is dynamically enabled at +the same time using the 'trace_event_get_state' routine (see header +"trace/control.h" for more information). + +=== "tcg" === + +Guest code generated by TCG can be traced by defining an event with the "tcg" +event property. Internally, this property generates two events: +"<eventname>_trans" to trace the event at translation time, and +"<eventname>_exec" to trace the event at execution time. + +Instead of using these two events directly, you should use the function +"trace_<eventname>_tcg" during translation (TCG code generation). This function +will automatically call "trace_<eventname>_trans", and will generate the +necessary TCG code to call "trace_<eventname>_exec" during guest code execution. + +Events with the "tcg" property can be declared in the "trace-events" file with a +mix of native and TCG types, and "trace_<eventname>_tcg" will gracefully forward +them to the "<eventname>_trans" and "<eventname>_exec" events. Since TCG values +are not known at translation time, these are ignored by the "<eventname>_trans" +event. 
Because of this, the entry in the "trace-events" file needs two printing +formats (separated by a comma): + + tcg foo(uint8_t a1, TCGv_i32 a2) "a1=%d", "a1=%d a2=%d" + +For example: + + #include "trace-tcg.h" + + void some_disassembly_func (...) + { + uint8_t a1 = ...; + TCGv_i32 a2 = ...; + trace_foo_tcg(a1, a2); + } + +This will immediately call: + + void trace_foo_trans(uint8_t a1); + +and will generate the TCG code to call: + + void trace_foo_exec(uint8_t a1, uint32_t a2); + +=== "vcpu" === + +Identifies events that trace vCPU-specific information. It implicitly adds a +"CPUState*" argument, and extends the tracing print format to show the vCPU +information. If used together with the "tcg" property, it adds a second +"TCGv_env" argument that must point to the per-target global TCG register that +points to the vCPU when guest code is executed (usually the "cpu_env" variable). + +The "tcg" and "vcpu" properties are currently only honored in the root +./trace-events file. + +The following example events: + + foo(uint32_t a) "a=%x" + vcpu bar(uint32_t a) "a=%x" + tcg vcpu baz(uint32_t a) "a=%x", "a=%x" + +Can be used as: + + #include "trace-tcg.h" + + CPUArchState *env; + TCGv_ptr cpu_env; + + void some_disassembly_func(...) 
+ { + /* trace emitted at this point */ + trace_foo(0xd1); + /* trace emitted at this point */ + trace_bar(ENV_GET_CPU(env), 0xd2); + /* trace emitted at this point (env) and when guest code is executed (cpu_env) */ + trace_baz_tcg(ENV_GET_CPU(env), cpu_env, 0xd3); + } + +If the translating vCPU has address 0xc1 and code is later executed by vCPU +0xc2, this would be an example output: + + // at guest code translation + foo a=0xd1 + bar cpu=0xc1 a=0xd2 + baz_trans cpu=0xc1 a=0xd3 + // at guest code execution + baz_exec cpu=0xc2 a=0xd3 diff --git a/docs/devel/virtio-migration.txt b/docs/devel/virtio-migration.txt new file mode 100644 index 0000000000..98a6b0ffb5 --- /dev/null +++ b/docs/devel/virtio-migration.txt @@ -0,0 +1,108 @@ +Virtio devices and migration +============================ + +Copyright 2015 IBM Corp. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + +Saving and restoring the state of virtio devices is a bit of a twisty maze, +for several reasons: +- state is distributed between several parts: + - virtio core, for common fields like features, number of queues, ... + - virtio transport (pci, ccw, ...), for the different proxy devices and + transport specific state (msix vectors, indicators, ...) + - virtio device (net, blk, ...), for the different device types and their + state (mac address, request queue, ...) +- most fields are saved via the stream interface; subsequently, subsections + have been added to make cross-version migration possible + +This file attempts to document the current procedure and point out some +caveats. 
+ + +Save state procedure +==================== + +virtio core virtio transport virtio device +----------- ---------------- ------------- + + save() function registered + via VMState wrapper on + device class +virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields +- save common device + fields +- save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields +- save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set + + +Load state procedure +==================== + +virtio core virtio transport virtio device +----------- ---------------- ------------- + + load() function registered + via VMState wrapper on + device class +virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields +- load common device + fields +- load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields +- notify guest + ------> load_device() + - load device-specific + fields +- load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields +- sanitize endianness +- sanitize features +- virtqueue index sanity + check + - feature-dependent setup + + +Implications of this setup +========================== + +Devices need to be careful in their state processing during load: The +load_device() procedure is invoked by the core before subsections have +been loaded. Any code that depends on information transmitted in subsections +therefore has to be invoked in the device's load() function _after_ +virtio_load() returned (like e.g. code depending on features). + +Any extension of the state being migrated should be done in subsections +added to the core for compatibility reasons. 
If transport or device specific +state is added, core needs to invoke a callback from the new subsection. diff --git a/docs/devel/writing-qmp-commands.txt b/docs/devel/writing-qmp-commands.txt new file mode 100644 index 0000000000..1e6375495b --- /dev/null +++ b/docs/devel/writing-qmp-commands.txt @@ -0,0 +1,607 @@ += How to write QMP commands using the QAPI framework = + +This document is a step-by-step guide on how to write new QMP commands using +the QAPI framework. It also shows how to implement new style HMP commands. + +This document doesn't discuss QMP protocol level details, nor does it dive +into the QAPI framework implementation. + +For an in-depth introduction to the QAPI framework, please refer to +docs/qapi-code-gen.txt. For documentation about the QMP protocol, +start with docs/qmp-intro.txt. + +== Overview == + +Generally speaking, the following steps should be taken in order to write a +new QMP command. + +1. Write the command's and type(s) specification in the QAPI schema file + (qapi-schema.json in the root source directory) + +2. Write the QMP command itself, which is a regular C function. Preferably, + the command should be exported by some QEMU subsystem. But it can also be + added to the qmp.c file + +3. At this point the command can be tested under the QMP protocol + +4. Write the HMP command equivalent. This is not required and should only be + done if it does make sense to have the functionality in HMP. The HMP command + is implemented in terms of the QMP command + +The following sections will demonstrate each of the steps above. We will start +very simple and get more complex as we progress. + +=== Testing === + +For all the examples in the next sections, the test setup is the same and is +shown here. + +First, QEMU should be started as: + +# /path/to/your/source/qemu [...] 
\ + -chardev socket,id=qmp,port=4444,host=localhost,server \ + -mon chardev=qmp,mode=control,pretty=on + +Then, in a different terminal: + +$ telnet localhost 4444 +Trying 127.0.0.1... +Connected to localhost. +Escape character is '^]'. +{ + "QMP": { + "version": { + "qemu": { + "micro": 50, + "minor": 15, + "major": 0 + }, + "package": "" + }, + "capabilities": [ + ] + } +} + +The above output is the QMP server saying you're connected. The server is +actually in capabilities negotiation mode. To enter command mode, type: + +{ "execute": "qmp_capabilities" } + +Then the server should respond: + +{ + "return": { + } +} + +Which is QMP's way of saying "the latest command executed OK and didn't return +any data". Now you're ready to enter the QMP example commands as explained in +the following sections. + +== Writing a command that doesn't return data == + +That's the simplest QMP command that can be written. Usually, this kind of +command carries some meaningful action in QEMU but here it will just print +"Hello, world" to the standard output. + +Our command will be called "hello-world". It takes no arguments, nor does it +return any data. + +The first step is to add the following line to the bottom of the +qapi-schema.json file: + +{ 'command': 'hello-world' } + +The "command" keyword defines a new QMP command. It's a JSON object. All +schema entries are JSON objects. The line above will instruct the QAPI to +generate any prototypes and the necessary code to marshal and unmarshal +protocol data. + +The next step is to write the "hello-world" implementation. As explained +earlier, it's preferable for commands to live in QEMU subsystems. But +"hello-world" doesn't pertain to any, so we put its implementation in qmp.c: + +void qmp_hello_world(Error **errp) +{ + printf("Hello, world!\n"); +} + +There are a few things to be noticed: + +1. QMP command implementation functions must be prefixed with "qmp_" +2. 
qmp_hello_world() returns void, this is in accordance with the fact that the + command doesn't return any data +3. It takes an "Error **" argument. This is required. Later we will see how to + return errors and take additional arguments. The Error argument should not + be touched if the command doesn't return errors +4. We won't add the function's prototype. That's automatically done by the QAPI +5. Printing to the terminal is discouraged for QMP commands, we do it here + because it's the easiest way to demonstrate a QMP command + +You're done. Now build qemu, run it as suggested in the "Testing" section, +and then type the following QMP command: + +{ "execute": "hello-world" } + +Then check the terminal running qemu and look for the "Hello, world" string. If +you don't see it then something went wrong. + +=== Arguments === + +Let's add an argument called "message" to our "hello-world" command. The new +argument will contain the string to be printed to stdout. It's an optional +argument, if it's not present we print our default "Hello, world" string. + +The first change we have to make is to modify the command specification in the +schema file to the following: + +{ 'command': 'hello-world', 'data': { '*message': 'str' } } + +Notice the new 'data' member in the schema. It's a JSON object, each +element of which is an argument to the command in question. Also notice the asterisk, +it's used to mark the argument optional (that means that you shouldn't use it +for mandatory arguments). Finally, 'str' is the argument's type, which +stands for "string". The QAPI also supports integers, booleans, enumerations +and user defined types. + +Now, let's update our C implementation in qmp.c: + +void qmp_hello_world(bool has_message, const char *message, Error **errp) +{ + if (has_message) { + printf("%s\n", message); + } else { + printf("Hello, world\n"); + } +} + +There are two important details to be noticed: + +1. 
All optional arguments are accompanied by a 'has_' boolean, which is set to true + if the optional argument is present or false otherwise +2. The C implementation signature must follow the schema's argument ordering, + which is defined by the "data" member + +Time to test our new version of the "hello-world" command. Build qemu, run it as +described in the "Testing" section and then send two commands: + +{ "execute": "hello-world" } +{ + "return": { + } +} + +{ "execute": "hello-world", "arguments": { "message": "We love qemu" } } +{ + "return": { + } +} + +You should see "Hello, world" and "We love qemu" in the terminal running qemu, +if you don't see these strings, then something went wrong. + +=== Errors === + +QMP commands should use the error interface exported by the error.h header +file. Basically, most errors are set by calling the error_setg() function. + +Let's say we don't allow the "message" string to contain the word "love". If +it does contain it, we want the "hello-world" command to return an error: + +void qmp_hello_world(bool has_message, const char *message, Error **errp) +{ + if (has_message) { + if (strstr(message, "love")) { + error_setg(errp, "the word 'love' is not allowed"); + return; + } + printf("%s\n", message); + } else { + printf("Hello, world\n"); + } +} + +The first argument to the error_setg() function is the Error pointer +to pointer, which is passed to all QMP functions. The next argument is a human +description of the error, this is a free-form printf-like string. + +Let's test the example above. Build qemu, run it as defined in the "Testing" +section, and then issue the following command: + +{ "execute": "hello-world", "arguments": { "message": "all you need is love" } } + +The QMP server's response should be: + +{ + "error": { + "class": "GenericError", + "desc": "the word 'love' is not allowed" + } +} + +As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR +(done by default when using error_setg()). 
There are two exceptions to +this rule: + + 1. A non-generic ErrorClass value exists* for the failure you want to report + (eg. DeviceNotFound) + + 2. Management applications have to take special action on the failure you + want to report, hence you have to add a new ErrorClass value so that they + can check for it + +If the failure you want to report falls into one of the two cases above, +use error_set() with a second argument of an ErrorClass value. + + * All existing ErrorClass values are defined in the qapi-schema.json file + +=== Command Documentation === + +There's only one step missing to make "hello-world"'s implementation complete, +and that's its documentation in the schema file. + +This is very important. No QMP command will be accepted in QEMU without proper +documentation. + +There are many examples of such documentation in the schema file already, but +here goes "hello-world"'s new entry for the qapi-schema.json file: + +## +# @hello-world +# +# Print a client provided string to the standard output stream. +# +# @message: string to be printed +# +# Returns: Nothing on success. +# +# Notes: if @message is not provided, the "Hello, world" string will +# be printed instead +# +# Since: <next qemu stable release, eg. 1.0> +## +{ 'command': 'hello-world', 'data': { '*message': 'str' } } + +Please, note that the "Returns" clause is optional if a command doesn't return +any data nor any errors. + +=== Implementing the HMP command === + +Now that the QMP command is in place, we can also make it available in the human +monitor (HMP). + +With the introduction of the QAPI, HMP commands make QMP calls. Most of the +time HMP commands are simple wrappers. All HMP commands implementation exist in +the hmp.c file. 
+ +Here's the implementation of the "hello-world" HMP command: + +void hmp_hello_world(Monitor *mon, const QDict *qdict) +{ + const char *message = qdict_get_try_str(qdict, "message"); + Error *err = NULL; + + qmp_hello_world(!!message, message, &err); + if (err) { + monitor_printf(mon, "%s\n", error_get_pretty(err)); + error_free(err); + return; + } +} + +Also, you have to add the function's prototype to the hmp.h file. + +There are three important points to be noticed: + +1. The "mon" and "qdict" arguments are mandatory for all HMP functions. The + former is the monitor object. The latter is how the monitor passes + arguments entered by the user to the command implementation +2. hmp_hello_world() performs error checking. In this example we just print + the error description to the user, but we could do more, like taking + different actions depending on the error qmp_hello_world() returns +3. The "err" variable must be initialized to NULL before performing the + QMP call + +There's one last step to actually make the command available to monitor users, +we should add it to the hmp-commands.hx file: + + { + .name = "hello-world", + .args_type = "message:s?", + .params = "hello-world [message]", + .help = "Print message to the standard output", + .cmd = hmp_hello_world, + }, + +STEXI +@item hello_world @var{message} +@findex hello_world +Print message to the standard output +ETEXI + +To test this you have to open a user monitor and issue the "hello-world" +command. It might be instructive to check the command's documentation with +HMP's "help" command. + +Please, check the "-monitor" command-line option to know how to open a user +monitor. + +== Writing a command that returns data == + +A QMP command is capable of returning any data the QAPI supports like integers, +strings, booleans, enumerations and user defined types. + +In this section we will focus on user defined types. Please, check the QAPI +documentation for information about the other types. 
+ +=== User Defined Types === + +FIXME This example needs to be redone after commit 6d32717 + +For this example we will write the query-alarm-clock command, which returns +information about QEMU's timer alarm. For more information about it, please +check the "-clock" command-line option. + +We want to return two pieces of information. The first one is the alarm clock's +name. The second one is when the next alarm will fire. The former information is +returned as a string, the latter is an integer in nanoseconds (which is not +very useful in practice, as the timer has probably already fired when the +information reaches the client). + +The best way to return that data is to create a new QAPI type, as shown below: + +## +# @QemuAlarmClock +# +# QEMU alarm clock information. +# +# @clock-name: The alarm clock method's name. +# +# @next-deadline: The time (in nanoseconds) the next alarm will fire. +# +# Since: 1.0 +## +{ 'type': 'QemuAlarmClock', + 'data': { 'clock-name': 'str', '*next-deadline': 'int' } } + +The "type" keyword defines a new QAPI type. Its "data" member contains the +type's members. In this example our members are the "clock-name" and the +"next-deadline" one, which is optional. + +Now let's define the query-alarm-clock command: + +## +# @query-alarm-clock +# +# Return information about QEMU's alarm clock. +# +# Returns a @QemuAlarmClock instance describing the alarm clock method +# being currently used by QEMU (this is usually set by the '-clock' +# command-line option). +# +# Since: 1.0 +## +{ 'command': 'query-alarm-clock', 'returns': 'QemuAlarmClock' } + +Notice the "returns" keyword. As its name suggests, it's used to define the +data returned by a command. 
+ +It's time to implement the qmp_query_alarm_clock() function, you can put it +in the qemu-timer.c file: + +QemuAlarmClock *qmp_query_alarm_clock(Error **errp) +{ + QemuAlarmClock *clock; + int64_t deadline; + + clock = g_malloc0(sizeof(*clock)); + + deadline = qemu_next_alarm_deadline(); + if (deadline > 0) { + clock->has_next_deadline = true; + clock->next_deadline = deadline; + } + clock->clock_name = g_strdup(alarm_timer->name); + + return clock; +} + +There are a number of things to be noticed: + +1. The QemuAlarmClock type is automatically generated by the QAPI framework, + its members correspond to the type's specification in the schema file +2. As specified in the schema file, the function returns a QemuAlarmClock + instance and takes no arguments (besides the "errp" one, which is mandatory + for all QMP functions) +3. The "clock" variable (which will point to our QAPI type instance) is + allocated by the regular g_malloc0() function. Note that we chose to + initialize the memory to zero. This is recommended for all QAPI types, as + it helps avoid bad surprises (especially with booleans) +4. Remember that "next_deadline" is optional? All optional members have a + 'has_MEMBER_NAME' member that should be properly set by the implementation, + as shown above +5. Even static strings, such as "alarm_timer->name", should be dynamically + allocated by the implementation. This is so because the QAPI also generates + a function to free its types and it cannot distinguish between dynamically + and statically allocated strings +6. You have to include the "qmp-commands.h" header file in qemu-timer.c, + otherwise qemu won't build + +Time to test the new command. 
Build qemu, run it as described in the "Testing" +section and try this: + +{ "execute": "query-alarm-clock" } +{ + "return": { + "next-deadline": 2368219, + "clock-name": "dynticks" + } +} + +==== The HMP command ==== + +Here's the HMP counterpart of the query-alarm-clock command: + +void hmp_info_alarm_clock(Monitor *mon) +{ + QemuAlarmClock *clock; + Error *err = NULL; + + clock = qmp_query_alarm_clock(&err); + if (err) { + monitor_printf(mon, "Could not query alarm clock information\n"); + error_free(err); + return; + } + + monitor_printf(mon, "Alarm clock method in use: '%s'\n", clock->clock_name); + if (clock->has_next_deadline) { + monitor_printf(mon, "Next alarm will fire in %" PRId64 " nanoseconds\n", + clock->next_deadline); + } + + qapi_free_QemuAlarmClock(clock); +} + +It's important to notice that hmp_info_alarm_clock() calls +qapi_free_QemuAlarmClock() to free the data returned by qmp_query_alarm_clock(). +For user defined types, the QAPI will generate a qapi_free_QAPI_TYPE_NAME() +function and that's what you have to use to free the types you define and +qapi_free_QAPI_TYPE_NAMEList() for list types (explained in the next section). +If the QMP call returns a string, then you should g_free() to free it. + +Also note that hmp_info_alarm_clock() performs error handling. That's not +strictly required if you're sure the QMP function doesn't return errors, but +it's good practice to always check for errors. + +Another important detail is that HMP's "info" commands don't go into the +hmp-commands.hx. Instead, they go into the info_cmds[] table, which is defined +in the monitor.c file. The entry for the "info alarmclock" follows: + + { + .name = "alarmclock", + .args_type = "", + .params = "", + .help = "show information about the alarm clock", + .cmd = hmp_info_alarm_clock, + }, + +To test this, run qemu and type "info alarmclock" in the user monitor. 
+ +=== Returning Lists === + +For this example, we're going to return all available methods for the timer +alarm, which is pretty much what the command-line option "-clock ?" does, +except that we're also going to inform which method is in use. + +This first step is to define a new type: + +## +# @TimerAlarmMethod +# +# Timer alarm method information. +# +# @method-name: The method's name. +# +# @current: true if this alarm method is currently in use, false otherwise +# +# Since: 1.0 +## +{ 'type': 'TimerAlarmMethod', + 'data': { 'method-name': 'str', 'current': 'bool' } } + +The command will be called "query-alarm-methods", here is its schema +specification: + +## +# @query-alarm-methods +# +# Returns information about available alarm methods. +# +# Returns: a list of @TimerAlarmMethod for each method +# +# Since: 1.0 +## +{ 'command': 'query-alarm-methods', 'returns': ['TimerAlarmMethod'] } + +Notice the syntax for returning lists "'returns': ['TimerAlarmMethod']", this +should be read as "returns a list of TimerAlarmMethod instances". + +The C implementation follows: + +TimerAlarmMethodList *qmp_query_alarm_methods(Error **errp) +{ + TimerAlarmMethodList *method_list = NULL; + const struct qemu_alarm_timer *p; + bool current = true; + + for (p = alarm_timers; p->name; p++) { + TimerAlarmMethodList *info = g_malloc0(sizeof(*info)); + info->value = g_malloc0(sizeof(*info->value)); + info->value->method_name = g_strdup(p->name); + info->value->current = current; + + current = false; + + info->next = method_list; + method_list = info; + } + + return method_list; +} + +The most important difference from the previous examples is the +TimerAlarmMethodList type, which is automatically generated by the QAPI from +the TimerAlarmMethod type. + +Each list node is represented by a TimerAlarmMethodList instance. We have to +allocate it, and that's done inside the for loop: the "info" pointer points to +an allocated node. 
We also have to allocate the node's contents, which is +stored in its "value" member. In our example, the "value" member is a pointer +to an TimerAlarmMethod instance. + +Notice that the "current" variable is used as "true" only in the first +iteration of the loop. That's because the alarm timer method in use is the +first element of the alarm_timers array. Also notice that QAPI lists are handled +by hand and we return the head of the list. + +Now Build qemu, run it as explained in the "Testing" section and try our new +command: + +{ "execute": "query-alarm-methods" } +{ + "return": [ + { + "current": false, + "method-name": "unix" + }, + { + "current": true, + "method-name": "dynticks" + } + ] +} + +The HMP counterpart is a bit more complex than previous examples because it +has to traverse the list, it's shown below for reference: + +void hmp_info_alarm_methods(Monitor *mon) +{ + TimerAlarmMethodList *method_list, *method; + Error *err = NULL; + + method_list = qmp_query_alarm_methods(&err); + if (err) { + monitor_printf(mon, "Could not query alarm methods\n"); + error_free(err); + return; + } + + for (method = method_list; method; method = method->next) { + monitor_printf(mon, "%c %s\n", method->value->current ? '*' : ' ', + method->value->method_name); + } + + qapi_free_TimerAlarmMethodList(method_list); +} diff --git a/docs/ich9-ehci-uhci.cfg b/docs/ich9-ehci-uhci.cfg deleted file mode 100644 index a0e9b96f4d..0000000000 --- a/docs/ich9-ehci-uhci.cfg +++ /dev/null @@ -1,37 +0,0 @@ -########################################################################### -# -# You can pass this file directly to qemu using the -readconfig -# command line switch. -# -# This config file creates a EHCI adapter with companion UHCI -# controllers as multifunction device in PCI slot "1d". -# -# Specify "bus=ehci.0" when creating usb devices to hook them up -# there. 
-# - -[device "ehci"] - driver = "ich9-usb-ehci1" - addr = "1d.7" - multifunction = "on" - -[device "uhci-1"] - driver = "ich9-usb-uhci1" - addr = "1d.0" - multifunction = "on" - masterbus = "ehci.0" - firstport = "0" - -[device "uhci-2"] - driver = "ich9-usb-uhci2" - addr = "1d.1" - multifunction = "on" - masterbus = "ehci.0" - firstport = "2" - -[device "uhci-3"] - driver = "ich9-usb-uhci3" - addr = "1d.2" - multifunction = "on" - masterbus = "ehci.0" - firstport = "4" diff --git a/docs/lockcnt.txt b/docs/lockcnt.txt deleted file mode 100644 index 2a79b3205b..0000000000 --- a/docs/lockcnt.txt +++ /dev/null @@ -1,277 +0,0 @@ -DOCUMENTATION FOR LOCKED COUNTERS (aka QemuLockCnt) -=================================================== - -QEMU often uses reference counts to track data structures that are being -accessed and should not be freed. For example, a loop that invoke -callbacks like this is not safe: - - QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { - if (ioh->revents & G_IO_OUT) { - ioh->fd_write(ioh->opaque); - } - } - -QLIST_FOREACH_SAFE protects against deletion of the current node (ioh) -by stashing away its "next" pointer. However, ioh->fd_write could -actually delete the next node from the list. The simplest way to -avoid this is to mark the node as deleted, and remove it from the -list in the above loop: - - QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { - if (ioh->deleted) { - QLIST_REMOVE(ioh, next); - g_free(ioh); - } else { - if (ioh->revents & G_IO_OUT) { - ioh->fd_write(ioh->opaque); - } - } - } - -If however this loop must also be reentrant, i.e. 
it is possible that -ioh->fd_write invokes the loop again, some kind of counting is needed: - - walking_handlers++; - QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { - if (ioh->deleted) { - if (walking_handlers == 1) { - QLIST_REMOVE(ioh, next); - g_free(ioh); - } - } else { - if (ioh->revents & G_IO_OUT) { - ioh->fd_write(ioh->opaque); - } - } - } - walking_handlers--; - -One may think of using the RCU primitives, rcu_read_lock() and -rcu_read_unlock(); effectively, the RCU nesting count would take -the place of the walking_handlers global variable. Indeed, -reference counting and RCU have similar purposes, but their usage in -general is complementary: - -- reference counting is fine-grained and limited to a single data - structure; RCU delays reclamation of *all* RCU-protected data - structures; - -- reference counting works even in the presence of code that keeps - a reference for a long time; RCU critical sections in principle - should be kept short; - -- reference counting is often applied to code that is not thread-safe - but is reentrant; in fact, usage of reference counting in QEMU predates - the introduction of threads by many years. RCU is generally used to - protect readers from other threads freeing memory after concurrent - modifications to a data structure. - -- reclaiming data can be done by a separate thread in the case of RCU; - this can improve performance, but also delay reclamation undesirably. - With reference counting, reclamation is deterministic. - -This file documents QemuLockCnt, an abstraction for using reference -counting in code that has to be both thread-safe and reentrant. - - -QemuLockCnt concepts --------------------- - -A QemuLockCnt comprises both a counter and a mutex; it has primitives -to increment and decrement the counter, and to take and release the -mutex. 
The counter notes how many visits to the data structures are -taking place (the visits could be from different threads, or there could -be multiple reentrant visits from the same thread). The basic rules -governing the counter/mutex pair then are the following: - -- Data protected by the QemuLockCnt must not be freed unless the - counter is zero and the mutex is taken. - -- A new visit cannot be started while the counter is zero and the - mutex is taken. - -Most of the time, the mutex protects all writes to the data structure, -not just frees, though there could be cases where this is not necessary. - -Reads, instead, can be done without taking the mutex, as long as the -readers and writers use the same macros that are used for RCU, for -example atomic_rcu_read, atomic_rcu_set, QLIST_FOREACH_RCU, etc. This is -because the reads are done outside a lock and a set or QLIST_INSERT_HEAD -can happen concurrently with the read. The RCU API ensures that the -processor and the compiler see all required memory barriers. - -This could be implemented simply by protecting the counter with the -mutex, for example: - - // (1) - qemu_mutex_lock(&walking_handlers_mutex); - walking_handlers++; - qemu_mutex_unlock(&walking_handlers_mutex); - - ... - - // (2) - qemu_mutex_lock(&walking_handlers_mutex); - if (--walking_handlers == 0) { - QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { - if (ioh->deleted) { - QLIST_REMOVE(ioh, next); - g_free(ioh); - } - } - } - qemu_mutex_unlock(&walking_handlers_mutex); - -Here, no frees can happen in the code represented by the ellipsis. -If another thread is executing critical section (2), that part of -the code cannot be entered, because the thread will not be able -to increment the walking_handlers variable. And of course -during the visit any other thread will see a nonzero value for -walking_handlers, as in the single-threaded code. 
- -Note that it is possible for multiple concurrent accesses to delay -the cleanup arbitrarily; in other words, for the walking_handlers -counter to never become zero. For this reason, this technique is -more easily applicable if concurrent access to the structure is rare. - -However, critical sections are easy to forget since you have to do -them for each modification of the counter. QemuLockCnt ensures that -all modifications of the counter take the lock appropriately, and it -can also be more efficient in two ways: - -- it avoids taking the lock for many operations (for example - incrementing the counter while it is non-zero); - -- on some platforms, one can implement QemuLockCnt to hold the lock - and the mutex in a single word, making the fast path no more expensive - than simply managing a counter using atomic operations (see - docs/atomics.txt). This can be very helpful if concurrent access to - the data structure is expected to be rare. - - -Using the same mutex for frees and writes can still incur some small -inefficiencies; for example, a visit can never start if the counter is -zero and the mutex is taken---even if the mutex is taken by a write, -which in principle need not block a visit of the data structure. -However, these are usually not a problem if any of the following -assumptions are valid: - -- concurrent access is possible but rare - -- writes are rare - -- writes are frequent, but this kind of write (e.g. appending to a - list) has a very small critical section. - -For example, QEMU uses QemuLockCnt to manage an AioContext's list of -bottom halves and file descriptor handlers. Modifications to the list -of file descriptor handlers are rare. Creation of a new bottom half is -frequent and can happen on a fast path; however: 1) it is almost never -concurrent with a visit to the list of bottom halves; 2) it only has -three instructions in the critical path, two assignments and a smp_wmb(). 
- - -QemuLockCnt API ---------------- - -The QemuLockCnt API is described in include/qemu/thread.h. - - -QemuLockCnt usage ------------------ - -This section explains the typical usage patterns for QemuLockCnt functions. - -Setting a variable to a non-NULL value can be done between -qemu_lockcnt_lock and qemu_lockcnt_unlock: - - qemu_lockcnt_lock(&xyz_lockcnt); - if (!xyz) { - new_xyz = g_new(XYZ, 1); - ... - atomic_rcu_set(&xyz, new_xyz); - } - qemu_lockcnt_unlock(&xyz_lockcnt); - -Accessing the value can be done between qemu_lockcnt_inc and -qemu_lockcnt_dec: - - qemu_lockcnt_inc(&xyz_lockcnt); - if (xyz) { - XYZ *p = atomic_rcu_read(&xyz); - ... - /* Accesses can now be done through "p". */ - } - qemu_lockcnt_dec(&xyz_lockcnt); - -Freeing the object can similarly use qemu_lockcnt_lock and -qemu_lockcnt_unlock, but you also need to ensure that the count -is zero (i.e. there is no concurrent visit). Because qemu_lockcnt_inc -takes the QemuLockCnt's lock, the count cannot become non-zero while -the object is being freed. Freeing an object looks like this: - - qemu_lockcnt_lock(&xyz_lockcnt); - if (!qemu_lockcnt_count(&xyz_lockcnt)) { - g_free(xyz); - xyz = NULL; - } - qemu_lockcnt_unlock(&xyz_lockcnt); - -If an object has to be freed right after a visit, you can combine -the decrement, the locking and the check on count as follows: - - qemu_lockcnt_inc(&xyz_lockcnt); - if (xyz) { - XYZ *p = atomic_rcu_read(&xyz); - ... - /* Accesses can now be done through "p". 
*/ - } - if (qemu_lockcnt_dec_and_lock(&xyz_lockcnt)) { - g_free(xyz); - xyz = NULL; - qemu_lockcnt_unlock(&xyz_lockcnt); - } - -QemuLockCnt can also be used to access a list as follows: - - qemu_lockcnt_inc(&io_handlers_lockcnt); - QLIST_FOREACH_RCU(ioh, &io_handlers, pioh) { - if (ioh->revents & G_IO_OUT) { - ioh->fd_write(ioh->opaque); - } - } - - if (qemu_lockcnt_dec_and_lock(&io_handlers_lockcnt)) { - QLIST_FOREACH_SAFE(ioh, &io_handlers, next, pioh) { - if (ioh->deleted) { - QLIST_REMOVE(ioh, next); - g_free(ioh); - } - } - qemu_lockcnt_unlock(&io_handlers_lockcnt); - } - -Again, the RCU primitives are used because new items can be added to the -list during the walk. QLIST_FOREACH_RCU ensures that the processor and -the compiler see the appropriate memory barriers. - -An alternative pattern uses qemu_lockcnt_dec_if_lock: - - qemu_lockcnt_inc(&io_handlers_lockcnt); - QLIST_FOREACH_SAFE_RCU(ioh, &io_handlers, next, pioh) { - if (ioh->deleted) { - if (qemu_lockcnt_dec_if_lock(&io_handlers_lockcnt)) { - QLIST_REMOVE(ioh, next); - g_free(ioh); - qemu_lockcnt_inc_and_unlock(&io_handlers_lockcnt); - } - } else { - if (ioh->revents & G_IO_OUT) { - ioh->fd_write(ioh->opaque); - } - } - } - qemu_lockcnt_dec(&io_handlers_lockcnt); - -Here you can use qemu_lockcnt_dec instead of qemu_lockcnt_dec_and_lock, -because there is no special task to do if the count goes from 1 to 0. diff --git a/docs/mach-virt-graphical.cfg b/docs/mach-virt-graphical.cfg deleted file mode 100644 index 0fdf6846dd..0000000000 --- a/docs/mach-virt-graphical.cfg +++ /dev/null @@ -1,281 +0,0 @@ -# mach-virt - VirtIO guest (graphical console) -# ========================================================= -# -# Usage: -# -# $ qemu-system-aarch64 \ -# -nodefaults \ -# -readconfig mach-virt-graphical.cfg \ -# -cpu host -# -# You will probably need to tweak the lines marked as -# CHANGE ME before being able to use this configuration! 
-# -# The guest will have a selection of VirtIO devices -# tailored towards optimal performance with modern guests, -# and will be accessed through a graphical console. -# -# --------------------------------------------------------- -# -# Using -nodefaults is required to have full control over -# the virtual hardware: when it's specified, QEMU will -# populate the board with only the builtin peripherals, -# such as the PL011 UART, plus a PCI Express Root Bus; the -# user will then have to explicitly add further devices. -# -# The PCI Express Root Bus shows up in the guest as: -# -# 00:00.0 Host bridge -# -# This configuration file adds a number of other useful -# devices, more specifically: -# -# 00:01.0 Display controller -# 00.1c.* PCI bridge (PCI Express Root Ports) -# 01:00.0 SCSI storage controller -# 02:00.0 Ethernet controller -# 03:00.0 USB controller -# -# More information about these devices is available below. - - -# Machine options -# ========================================================= -# -# We use the virt machine type and enable KVM acceleration -# for better performance. -# -# Using less than 1 GiB of memory is probably not going to -# yield good performance in the guest, and might even lead -# to obscure boot issues in some cases. -# -# Unfortunately, there is no way to configure the CPU model -# in this file, so it will have to be provided on the -# command line, but we can configure the guest to use the -# same GIC version as the host. - -[machine] - type = "virt" - accel = "kvm" - gic-version = "host" - -[memory] - size = "1024" - - -# Firmware configuration -# ========================================================= -# -# There are two parts to the firmware: a read-only image -# containing the executable code, which is shared between -# guests, and a read/write variable store that is owned -# by one specific guest, exclusively, and is used to -# record information such as the UEFI boot order. 
-# -# For any new guest, its permanent, private variable store -# should initially be copied from the template file -# provided along with the firmware binary. -# -# Depending on the OS distribution you're using on the -# host, the name of the package containing the firmware -# binary and variable store template, as well as the paths -# to the files themselves, will be different. For example: -# -# Fedora -# edk2-aarch64 (pkg) -# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) -# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) -# -# RHEL -# AAVMF (pkg) -# /usr/share/AAVMF/AAVMF_CODE.fd (bin) -# /usr/share/AAVMF/AAVMF_VARS.fd (var) -# -# Debian/Ubuntu -# qemu-efi (pkg) -# /usr/share/AAVMF/AAVMF_CODE.fd (bin) -# /usr/share/AAVMF/AAVMF_VARS.fd (var) - -[drive "uefi-binary"] - file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME - format = "raw" - if = "pflash" - unit = "0" - readonly = "on" - -[drive "uefi-varstore"] - file = "guest_VARS.fd" # CHANGE ME - format = "raw" - if = "pflash" - unit = "1" - - -# PCI bridge (PCI Express Root Ports) -# ========================================================= -# -# We create eight PCI Express Root Ports, and we plug them -# all into separate functions of the same slot. Some of -# them will be used by devices, the rest will remain -# available for hotplug. 
- -[device "pcie.1"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.0" - port = "1" - chassis = "1" - multifunction = "on" - -[device "pcie.2"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.1" - port = "2" - chassis = "2" - -[device "pcie.3"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.2" - port = "3" - chassis = "3" - -[device "pcie.4"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.3" - port = "4" - chassis = "4" - -[device "pcie.5"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.4" - port = "5" - chassis = "5" - -[device "pcie.6"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.5" - port = "6" - chassis = "6" - -[device "pcie.7"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.6" - port = "7" - chassis = "7" - -[device "pcie.8"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.7" - port = "8" - chassis = "8" - - -# SCSI storage controller (and storage) -# ========================================================= -# -# We use virtio-scsi here so that we can (hot)plug a large -# number of disks without running into issues; a SCSI disk, -# backed by a qcow2 disk image on the host's filesystem, is -# attached to it. -# -# We also create an optical disk, mostly for installation -# purposes: once the guest OS has been succesfully -# installed, the guest will no longer boot from optical -# media. If you don't want, or no longer want, to have an -# optical disk in the guest you can safely comment out -# all relevant sections below. 
- -[device "scsi"] - driver = "virtio-scsi-pci" - bus = "pcie.1" - addr = "00.0" - -[device "scsi-disk"] - driver = "scsi-hd" - bus = "scsi.0" - drive = "disk" - bootindex = "1" - -[drive "disk"] - file = "guest.qcow2" # CHANGE ME - format = "qcow2" - if = "none" - -[device "scsi-optical-disk"] - driver = "scsi-cd" - bus = "scsi.0" - drive = "optical-disk" - bootindex = "2" - -[drive "optical-disk"] - file = "install.iso" # CHANGE ME - format = "raw" - if = "none" - - -# Ethernet controller -# ========================================================= -# -# We use virtio-net for improved performance over emulated -# hardware; on the host side, we take advantage of user -# networking so that the QEMU process doesn't require any -# additional privileges. - -[netdev "hostnet"] - type = "user" - -[device "net"] - driver = "virtio-net-pci" - netdev = "hostnet" - bus = "pcie.2" - addr = "00.0" - - -# USB controller (and input devices) -# ========================================================= -# -# We add a virtualization-friendly USB 3.0 controller and -# a USB keyboard / USB tablet combo so that graphical -# guests can be controlled appropriately. - -[device "usb"] - driver = "nec-usb-xhci" - bus = "pcie.3" - addr = "00.0" - -[device "keyboard"] - driver = "usb-kbd" - bus = "usb.0" - -[device "tablet"] - driver = "usb-tablet" - bus = "usb.0" - - -# Display controller -# ========================================================= -# -# We use virtio-gpu because the legacy VGA framebuffer is -# very troublesome on aarch64, and virtio-gpu is the only -# video device that doesn't implement it. -# -# If you're running the guest on a remote, potentially -# headless host, you will probably want to append something -# like -# -# -display vnc=127.0.0.1:0 -# -# to the command line in order to prevent QEMU from -# creating a graphical display window on the host and -# enable remote access instead. 
- -[device "video"] - driver = "virtio-gpu" - bus = "pcie.0" - addr = "01.0" diff --git a/docs/mach-virt-serial.cfg b/docs/mach-virt-serial.cfg deleted file mode 100644 index aee9f1c5a1..0000000000 --- a/docs/mach-virt-serial.cfg +++ /dev/null @@ -1,243 +0,0 @@ -# mach-virt - VirtIO guest (serial console) -# ========================================================= -# -# Usage: -# -# $ qemu-system-aarch64 \ -# -nodefaults \ -# -readconfig mach-virt-serial.cfg \ -# -display none -serial mon:stdio \ -# -cpu host -# -# You will probably need to tweak the lines marked as -# CHANGE ME before being able to use this configuration! -# -# The guest will have a selection of VirtIO devices -# tailored towards optimal performance with modern guests, -# and will be accessed through the serial console. -# -# --------------------------------------------------------- -# -# Using -nodefaults is required to have full control over -# the virtual hardware: when it's specified, QEMU will -# populate the board with only the builtin peripherals, -# such as the PL011 UART, plus a PCI Express Root Bus; the -# user will then have to explicitly add further devices. -# -# The PCI Express Root Bus shows up in the guest as: -# -# 00:00.0 Host bridge -# -# This configuration file adds a number of other useful -# devices, more specifically: -# -# 00.1c.* PCI bridge (PCI Express Root Ports) -# 01:00.0 SCSI storage controller -# 02:00.0 Ethernet controller -# -# More information about these devices is available below. -# -# We use '-display none' to prevent QEMU from creating a -# graphical display window, which would serve no use in -# this specific configuration, and '-serial mon:stdio' to -# multiplex the guest's serial console and the QEMU monitor -# to the host's stdio; use 'Ctrl+A h' to learn how to -# switch between the two and more. 
- - -# Machine options -# ========================================================= -# -# We use the virt machine type and enable KVM acceleration -# for better performance. -# -# Using less than 1 GiB of memory is probably not going to -# yield good performance in the guest, and might even lead -# to obscure boot issues in some cases. -# -# Unfortunately, there is no way to configure the CPU model -# in this file, so it will have to be provided on the -# command line, but we can configure the guest to use the -# same GIC version as the host. - -[machine] - type = "virt" - accel = "kvm" - gic-version = "host" - -[memory] - size = "1024" - - -# Firmware configuration -# ========================================================= -# -# There are two parts to the firmware: a read-only image -# containing the executable code, which is shared between -# guests, and a read/write variable store that is owned -# by one specific guest, exclusively, and is used to -# record information such as the UEFI boot order. -# -# For any new guest, its permanent, private variable store -# should initially be copied from the template file -# provided along with the firmware binary. -# -# Depending on the OS distribution you're using on the -# host, the name of the package containing the firmware -# binary and variable store template, as well as the paths -# to the files themselves, will be different. 
For example: -# -# Fedora -# edk2-aarch64 (pkg) -# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) -# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) -# -# RHEL -# AAVMF (pkg) -# /usr/share/AAVMF/AAVMF_CODE.fd (bin) -# /usr/share/AAVMF/AAVMF_VARS.fd (var) -# -# Debian/Ubuntu -# qemu-efi (pkg) -# /usr/share/AAVMF/AAVMF_CODE.fd (bin) -# /usr/share/AAVMF/AAVMF_VARS.fd (var) - -[drive "uefi-binary"] - file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME - format = "raw" - if = "pflash" - unit = "0" - readonly = "on" - -[drive "uefi-varstore"] - file = "guest_VARS.fd" # CHANGE ME - format = "raw" - if = "pflash" - unit = "1" - - -# PCI bridge (PCI Express Root Ports) -# ========================================================= -# -# We create eight PCI Express Root Ports, and we plug them -# all into separate functions of the same slot. Some of -# them will be used by devices, the rest will remain -# available for hotplug. - -[device "pcie.1"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.0" - port = "1" - chassis = "1" - multifunction = "on" - -[device "pcie.2"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.1" - port = "2" - chassis = "2" - -[device "pcie.3"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.2" - port = "3" - chassis = "3" - -[device "pcie.4"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.3" - port = "4" - chassis = "4" - -[device "pcie.5"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.4" - port = "5" - chassis = "5" - -[device "pcie.6"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.5" - port = "6" - chassis = "6" - -[device "pcie.7"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.6" - port = "7" - chassis = "7" - -[device "pcie.8"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.7" - port = "8" - chassis = "8" - - -# SCSI storage controller (and storage) -# ========================================================= -# -# We use virtio-scsi here so that we 
can (hot)plug a large -# number of disks without running into issues; a SCSI disk, -# backed by a qcow2 disk image on the host's filesystem, is -# attached to it. -# -# We also create an optical disk, mostly for installation -# purposes: once the guest OS has been succesfully -# installed, the guest will no longer boot from optical -# media. If you don't want, or no longer want, to have an -# optical disk in the guest you can safely comment out -# all relevant sections below. - -[device "scsi"] - driver = "virtio-scsi-pci" - bus = "pcie.1" - addr = "00.0" - -[device "scsi-disk"] - driver = "scsi-hd" - bus = "scsi.0" - drive = "disk" - bootindex = "1" - -[drive "disk"] - file = "guest.qcow2" # CHANGE ME - format = "qcow2" - if = "none" - -[device "scsi-optical-disk"] - driver = "scsi-cd" - bus = "scsi.0" - drive = "optical-disk" - bootindex = "2" - -[drive "optical-disk"] - file = "install.iso" # CHANGE ME - format = "raw" - if = "none" - - -# Ethernet controller -# ========================================================= -# -# We use virtio-net for improved performance over emulated -# hardware; on the host side, we take advantage of user -# networking so that the QEMU process doesn't require any -# additional privileges. - -[netdev "hostnet"] - type = "user" - -[device "net"] - driver = "virtio-net-pci" - netdev = "hostnet" - bus = "pcie.2" - addr = "00.0" diff --git a/docs/memory.txt b/docs/memory.txt deleted file mode 100644 index 811b1bd3c5..0000000000 --- a/docs/memory.txt +++ /dev/null @@ -1,316 +0,0 @@ -The memory API -============== - -The memory API models the memory and I/O buses and controllers of a QEMU -machine. 
It attempts to allow modelling of: - - - ordinary RAM - - memory-mapped I/O (MMIO) - - memory controllers that can dynamically reroute physical memory regions - to different destinations - -The memory model provides support for - - - tracking RAM changes by the guest - - setting up coalesced memory for kvm - - setting up ioeventfd regions for kvm - -Memory is modelled as an acyclic graph of MemoryRegion objects. Sinks -(leaves) are RAM and MMIO regions, while other nodes represent -buses, memory controllers, and memory regions that have been rerouted. - -In addition to MemoryRegion objects, the memory API provides AddressSpace -objects for every root and possibly for intermediate MemoryRegions too. -These represent memory as seen from the CPU or a device's viewpoint. - -Types of regions ----------------- - -There are multiple types of memory regions (all represented by a single C type -MemoryRegion): - -- RAM: a RAM region is simply a range of host memory that can be made available - to the guest. - You typically initialize these with memory_region_init_ram(). Some special - purposes require the variants memory_region_init_resizeable_ram(), - memory_region_init_ram_from_file(), or memory_region_init_ram_ptr(). - -- MMIO: a range of guest memory that is implemented by host callbacks; - each read or write causes a callback to be called on the host. - You initialize these with memory_region_init_io(), passing it a - MemoryRegionOps structure describing the callbacks. - -- ROM: a ROM memory region works like RAM for reads (directly accessing - a region of host memory), and forbids writes. You initialize these with - memory_region_init_rom(). - -- ROM device: a ROM device memory region works like RAM for reads - (directly accessing a region of host memory), but like MMIO for - writes (invoking a callback). You initialize these with - memory_region_init_rom_device(). 
- -- IOMMU region: an IOMMU region translates addresses of accesses made to it - and forwards them to some other target memory region. As the name suggests, - these are only needed for modelling an IOMMU, not for simple devices. - You initialize these with memory_region_init_iommu(). - -- container: a container simply includes other memory regions, each at - a different offset. Containers are useful for grouping several regions - into one unit. For example, a PCI BAR may be composed of a RAM region - and an MMIO region. - - A container's subregions are usually non-overlapping. In some cases it is - useful to have overlapping regions; for example a memory controller that - can overlay a subregion of RAM with MMIO or ROM, or a PCI controller - that does not prevent card from claiming overlapping BARs. - - You initialize a pure container with memory_region_init(). - -- alias: a subsection of another region. Aliases allow a region to be - split apart into discontiguous regions. Examples of uses are memory banks - used when the guest address space is smaller than the amount of RAM - addressed, or a memory controller that splits main memory to expose a "PCI - hole". Aliases may point to any type of region, including other aliases, - but an alias may not point back to itself, directly or indirectly. - You initialize these with memory_region_init_alias(). - -- reservation region: a reservation region is primarily for debugging. - It claims I/O space that is not supposed to be handled by QEMU itself. - The typical use is to track parts of the address space which will be - handled by the host kernel when KVM is enabled. - You initialize these with memory_region_init_reservation(), or by - passing a NULL callback parameter to memory_region_init_io(). - -It is valid to add subregions to a region which is not a pure container -(that is, to an MMIO, RAM or ROM region). 
This means that the region -will act like a container, except that any addresses within the container's -region which are not claimed by any subregion are handled by the -container itself (ie by its MMIO callbacks or RAM backing). However -it is generally possible to achieve the same effect with a pure container -one of whose subregions is a low priority "background" region covering -the whole address range; this is often clearer and is preferred. -Subregions cannot be added to an alias region. - -Region names ------------- - -Regions are assigned names by the constructor. For most regions these are -only used for debugging purposes, but RAM regions also use the name to identify -live migration sections. This means that RAM region names need to have ABI -stability. - -Region lifecycle ----------------- - -A region is created by one of the memory_region_init*() functions and -attached to an object, which acts as its owner or parent. QEMU ensures -that the owner object remains alive as long as the region is visible to -the guest, or as long as the region is in use by a virtual CPU or another -device. For example, the owner object will not die between an -address_space_map operation and the corresponding address_space_unmap. - -After creation, a region can be added to an address space or a -container with memory_region_add_subregion(), and removed using -memory_region_del_subregion(). - -Various region attributes (read-only, dirty logging, coalesced mmio, -ioeventfd) can be changed during the region lifecycle. They take effect -as soon as the region is made visible. This can be immediately, later, -or never. - -Destruction of a memory region happens automatically when the owner -object dies. - -If however the memory region is part of a dynamically allocated data -structure, you should call object_unparent() to destroy the memory region -before the data structure is freed. For an example see VFIOMSIXInfo -and VFIOQuirk in hw/vfio/pci.c. 
- -You must not destroy a memory region as long as it may be in use by a -device or CPU. In order to do this, as a general rule do not create or -destroy memory regions dynamically during a device's lifetime, and only -call object_unparent() in the memory region owner's instance_finalize -callback. The dynamically allocated data structure that contains the -memory region then should obviously be freed in the instance_finalize -callback as well. - -If you break this rule, the following situation can happen: - -- the memory region's owner had a reference taken via memory_region_ref - (for example by address_space_map) - -- the region is unparented, and has no owner anymore - -- when address_space_unmap is called, the reference to the memory region's - owner is leaked. - - -There is an exception to the above rule: it is okay to call -object_unparent at any time for an alias or a container region. It is -therefore also okay to create or destroy alias and container regions -dynamically during a device's lifetime. - -This exceptional usage is valid because aliases and containers only help -QEMU building the guest's memory map; they are never accessed directly. -memory_region_ref and memory_region_unref are never called on aliases -or containers, and the above situation then cannot happen. Exploiting -this exception is rarely necessary, and therefore it is discouraged, -but nevertheless it is used in a few places. - -For regions that "have no owner" (NULL is passed at creation time), the -machine object is actually used as the owner. Since instance_finalize is -never called for the machine object, you must never call object_unparent -on regions that have no owner, unless they are aliases or containers. - - -Overlapping regions and priority --------------------------------- -Usually, regions may not overlap each other; a memory address decodes into -exactly one target. 
In some cases it is useful to allow regions to overlap, -and sometimes to control which of an overlapping regions is visible to the -guest. This is done with memory_region_add_subregion_overlap(), which -allows the region to overlap any other region in the same container, and -specifies a priority that allows the core to decide which of two regions at -the same address are visible (highest wins). -Priority values are signed, and the default value is zero. This means that -you can use memory_region_add_subregion_overlap() both to specify a region -that must sit 'above' any others (with a positive priority) and also a -background region that sits 'below' others (with a negative priority). - -If the higher priority region in an overlap is a container or alias, then -the lower priority region will appear in any "holes" that the higher priority -region has left by not mapping subregions to that area of its address range. -(This applies recursively -- if the subregions are themselves containers or -aliases that leave holes then the lower priority region will appear in these -holes too.) - -For example, suppose we have a container A of size 0x8000 with two subregions -B and C. B is a container mapped at 0x2000, size 0x4000, priority 2; C is -an MMIO region mapped at 0x0, size 0x6000, priority 1. B currently has two -of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at -offset 0x2000. As a diagram: - - 0 1000 2000 3000 4000 5000 6000 7000 8000 - |------|------|------|------|------|------|------|------| - A: [ ] - C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] - B: [ ] - D: [DDDDD] - E: [EEEEE] - -The regions that will be seen within this address range then are: - [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC] - -Since B has higher priority than C, its subregions appear in the flat map -even where they overlap with C. In ranges where B has not mapped anything -C's region appears. 
If B had provided its own MMIO operations (i.e. it was not a pure container) -then these would be used for any addresses in its range not handled by -D or E, and the result would be:
Otherwise - we continue with the next subregion in priority order -- if none of the subregions match the address then the search terminates - with no match found - -Example memory map ------------------- - -system_memory: container@0-2^48-1 - | - +---- lomem: alias@0-0xdfffffff ---> #ram (0-0xdfffffff) - | - +---- himem: alias@0x100000000-0x11fffffff ---> #ram (0xe0000000-0xffffffff) - | - +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff) - | (prio 1) - | - +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff) - -pci (0-2^32-1) - | - +--- vga-area: container@0xa0000-0xbffff - | | - | +--- alias@0x00000-0x7fff ---> #vram (0x010000-0x017fff) - | | - | +--- alias@0x08000-0xffff ---> #vram (0x020000-0x027fff) - | - +---- vram: ram@0xe1000000-0xe1ffffff - | - +---- vga-mmio: mmio@0xe2000000-0xe200ffff - -ram: ram@0x00000000-0xffffffff - -This is a (simplified) PC memory map. The 4GB RAM block is mapped into the -system address space via two aliases: "lomem" is a 1:1 mapping of the first -3.5GB; "himem" maps the last 0.5GB at address 4GB. This leaves 0.5GB for the -so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with -4GB of memory. - -The memory controller diverts addresses in the range 640K-768K to the PCI -address space. This is modelled using the "vga-window" alias, mapped at a -higher priority so it obscures the RAM at the same addresses. The vga window -can be removed by programming the memory controller; this is modelled by -removing the alias and exposing the RAM underneath. - -The pci address space is not a direct child of the system address space, since -we only want parts of it to be visible (we accomplish this using aliases). -It has two subregions: vga-area models the legacy vga window and is occupied -by two 32K memory banks pointing at two sections of the framebuffer. 
-In addition the vram is mapped as a BAR at address e1000000, and an additional -BAR containing MMIO registers is mapped after it. - -Note that if the guest maps a BAR outside the PCI hole, it would not be -visible as the pci-hole alias clips it to a 0.5GB range. - -MMIO Operations ---------------- - -MMIO regions are provided with ->read() and ->write() callbacks; in addition -various constraints can be supplied to control how these callbacks are called: - - - .valid.min_access_size, .valid.max_access_size define the access sizes - (in bytes) which the device accepts; accesses outside this range will - have device and bus specific behaviour (ignored, or machine check) - - .valid.unaligned specifies that the *device being modelled* supports - unaligned accesses; if false, unaligned accesses will invoke the - appropriate bus or CPU specific behaviour. - - .impl.min_access_size, .impl.max_access_size define the access sizes - (in bytes) supported by the *implementation*; other access sizes will be - emulated using the ones available. For example a 4-byte write will be - emulated using four 1-byte writes, if .impl.max_access_size = 1. - - .impl.unaligned specifies that the *implementation* supports unaligned - accesses; if false, unaligned accesses will be emulated by two aligned - accesses. - - .old_mmio eases the porting of code that was formerly using - cpu_register_io_memory(). It should not be used in new code. diff --git a/docs/migration.txt b/docs/migration.txt deleted file mode 100644 index 1b940a829b..0000000000 --- a/docs/migration.txt +++ /dev/null @@ -1,555 +0,0 @@ -= Migration = - -QEMU has code to load/save the state of the guest that it is running. -These are two complementary operations. Saving the state just does -that, saves the state for each device that the guest is running. -Restoring a guest is just the opposite operation: we need to load the -state of each device. 
- -For this to work, QEMU has to be launched with the same arguments the -two times. I.e. it can only restore the state in one guest that has -the same devices that the one it was saved (this last requirement can -be relaxed a bit, but for now we can consider that configuration has -to be exactly the same). - -Once that we are able to save/restore a guest, a new functionality is -requested: migration. This means that QEMU is able to start in one -machine and being "migrated" to another machine. I.e. being moved to -another machine. - -Next was the "live migration" functionality. This is important -because some guests run with a lot of state (specially RAM), and it -can take a while to move all state from one machine to another. Live -migration allows the guest to continue running while the state is -transferred. Only while the last part of the state is transferred has -the guest to be stopped. Typically the time that the guest is -unresponsive during live migration is the low hundred of milliseconds -(notice that this depends on a lot of things). - -=== Types of migration === - -Now that we have talked about live migration, there are several ways -to do migration: - -- tcp migration: do the migration using tcp sockets -- unix migration: do the migration using unix sockets -- exec migration: do the migration using the stdin/stdout through a process. -- fd migration: do the migration using an file descriptor that is - passed to QEMU. QEMU doesn't care how this file descriptor is opened. - -All these four migration protocols use the same infrastructure to -save/restore state devices. This infrastructure is shared with the -savevm/loadvm functionality. - -=== State Live Migration === - -This is used for RAM and block devices. It is not yet ported to vmstate. -<Fill more information here> - -=== What is the common infrastructure === - -QEMU uses a QEMUFile abstraction to be able to do migration. 
Any type -of migration that wants to use QEMU infrastructure has to create a -QEMUFile with: - -QEMUFile *qemu_fopen_ops(void *opaque, - QEMUFilePutBufferFunc *put_buffer, - QEMUFileGetBufferFunc *get_buffer, - QEMUFileCloseFunc *close); - -The functions have the following functionality: - -This function writes a chunk of data to a file at the given position. -The pos argument can be ignored if the file is only used for -streaming. The handler should try to write all of the data it can. - -typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, - int64_t pos, int size); - -Read a chunk of data from a file at the given position. The pos argument -can be ignored if the file is only be used for streaming. The number of -bytes actually read should be returned. - -typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, - int64_t pos, int size); - -Close a file and return an error code. - -typedef int (QEMUFileCloseFunc)(void *opaque); - -You can use any internal state that you need using the opaque void * -pointer that is passed to all functions. - -The important functions for us are put_buffer()/get_buffer() that -allow to write/read a buffer into the QEMUFile. - -=== How to save the state of one device === - -The state of a device is saved using intermediate buffers. There are -some helper functions to assist this saving. - -There is a new concept that we have to explain here: device state -version. When we migrate a device, we save/load the state as a series -of fields. Some times, due to bugs or new functionality, we need to -change the state to store more/different information. We use the -version to identify each time that we do a change. Each version is -associated with a series of fields saved. The save_state always saves -the state as the newer version. But load_state sometimes is able to -load state from an older version. - -=== Legacy way === - -This way is going to disappear as soon as all current users are ported to VMSTATE. 
- -Each device has to register two functions, one to save the state and -another to load the state back. - -int register_savevm(DeviceState *dev, - const char *idstr, - int instance_id, - int version_id, - SaveStateHandler *save_state, - LoadStateHandler *load_state, - void *opaque); - -typedef void SaveStateHandler(QEMUFile *f, void *opaque); -typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); - -The important functions for the device state format are the save_state -and load_state. Notice that load_state receives a version_id -parameter to know what state format is receiving. save_state doesn't -have a version_id parameter because it always uses the latest version. - -=== VMState === - -The legacy way of saving/loading state of the device had the problem -that we have to maintain two functions in sync. If we did one change -in one of them and not in the other, we would get a failed migration. - -VMState changed the way that state is saved/loaded. Instead of using -a function to save the state and another to load it, it was changed to -a declarative way of what the state consisted of. Now VMState is able -to interpret that definition to be able to load/save the state. As -the state is declared only once, it can't go out of sync in the -save/load functions. - -An example (from hw/input/pckbd.c) - -static const VMStateDescription vmstate_kbd = { - .name = "pckbd", - .version_id = 3, - .minimum_version_id = 3, - .fields = (VMStateField[]) { - VMSTATE_UINT8(write_cmd, KBDState), - VMSTATE_UINT8(status, KBDState), - VMSTATE_UINT8(mode, KBDState), - VMSTATE_UINT8(pending, KBDState), - VMSTATE_END_OF_LIST() - } -}; - -We are declaring the state with name "pckbd". -The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. -We registered this with: - - vmstate_register(NULL, 0, &vmstate_kbd, s); - -Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. 
- -You can search for VMSTATE_* macros for lots of types used in QEMU in -include/hw/hw.h. - -=== More about versions === - -Version numbers are intended for major incompatible changes to the -migration of a device, and using them breaks backwards-migration -compatibility; in general most changes can be made by adding Subsections -(see below) or _TEST macros (see below) which won't break compatibility. - -You can see that there are several version fields: - -- version_id: the maximum version_id supported by VMState for that device. -- minimum_version_id: the minimum version_id that VMState is able to understand - for that device. -- minimum_version_id_old: For devices that were not able to port to vmstate, we can - assign a function that knows how to read this old state. This field is - ignored if there is no load_state_old handler. - -So, VMState is able to read versions from minimum_version_id to -version_id. And the function load_state_old() (if present) is able to -load state from minimum_version_id_old to minimum_version_id. This -function is deprecated and will be removed when no more users are left. - -Saving state will always create a section with the 'version_id' value -and thus can't be loaded by any older QEMU. - -=== Massaging functions === - -Sometimes, it is not enough to be able to save the state directly -from one structure, we need to fill the correct values there. One -example is when we are using kvm. Before saving the cpu state, we -need to ask kvm to copy to QEMU the state that it is using. And the -opposite when we are loading the state, we need a way to tell kvm to -load the state for the cpu that we have just loaded from the QEMUFile. - -The functions to do that are inside a vmstate definition, and are called: - -- int (*pre_load)(void *opaque); - - This function is called before we load the state of one device. - -- int (*post_load)(void *opaque, int version_id); - - This function is called after we load the state of one device. 
- -- void (*pre_save)(void *opaque); - - This function is called before we save the state of one device. - -Example: You can look at hpet.c, that uses the three function to - massage the state that is transferred. - -If you use memory API functions that update memory layout outside -initialization (i.e., in response to a guest action), this is a strong -indication that you need to call these functions in a post_load callback. -Examples of such memory API functions are: - - - memory_region_add_subregion() - - memory_region_del_subregion() - - memory_region_set_readonly() - - memory_region_set_enabled() - - memory_region_set_address() - - memory_region_set_alias_offset() - -=== Subsections === - -The use of version_id allows to be able to migrate from older versions -to newer versions of a device. But not the other way around. This -makes very complicated to fix bugs in stable branches. If we need to -add anything to the state to fix a bug, we have to disable migration -to older versions that don't have that bug-fix (i.e. a new field). - -But sometimes, that bug-fix is only needed sometimes, not always. For -instance, if the device is in the middle of a DMA operation, it is -using a specific functionality, .... - -It is impossible to create a way to make migration from any version to -any other version to work. But we can do better than only allowing -migration from older versions to newer ones. For that fields that are -only needed sometimes, we add the idea of subsections. A subsection -is "like" a device vmstate, but with a particularity, it has a Boolean -function that tells if that values are needed to be sent or not. If -this functions returns false, the subsection is not sent. - -On the receiving side, if we found a subsection for a device that we -don't understand, we just fail the migration. If we understand all -the subsections, then we load the state with success. 
One important note is that the post_load() function is called "after" -loading all subsections, because a newer subsection could change the same -value that it uses.
- b) Add an entry to the HW_COMPAT_ for the previous version - that sets the property to false. - c) Add a static bool support_foo function that tests the property. - d) Add a subsection with a .needed set to the support_foo function - e) (potentially) Add a pre_load that sets up a default value for 'foo' - to be used if the subsection isn't loaded. - -Now that subsection will not be generated when using an older -machine type and the migration stream will be accepted by older -QEMU versions. pre-load functions can be used to initialise state -on the newer version so that they default to suitable values -when loading streams created by older QEMU versions that do not -generate the subsection. - -In some cases subsections are added for data that had been accidentally -omitted by earlier versions; if the missing data causes the migration -process to succeed but the guest to behave badly then it may be better -to send the subsection and cause the migration to explicitly fail -with the unknown subsection error. If the bad behaviour only happens -with certain data values, making the subsection conditional on -the data value (rather than the machine type) allows migrations to succeed -in most cases. In general the preference is to tie the subsection to -the machine type, and allow reliable migrations, unless the behaviour -from omission of the subsection is really bad. - -= Not sending existing elements = - -Sometimes members of the VMState are no longer needed; - removing them will break migration compatibility - making them version dependent and bumping the version will break backwards - migration compatibility. - -The best way is to: - a) Add a new property/compatibility/function in the same way for subsections - above. 
- b) replace the VMSTATE macro with the _TEST version of the macro, e.g.: - VMSTATE_UINT32(foo, barstruct) - becomes - VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz) - - Sometime in the future when we no longer care about the ancient -versions these can be killed off. - -= Return path = - -In most migration scenarios there is only a single data path that runs -from the source VM to the destination, typically along a single fd (although -possibly with another fd or similar for some fast way of throwing pages across). - -However, some uses need two way communication; in particular the Postcopy -destination needs to be able to request pages on demand from the source. - -For these scenarios there is a 'return path' from the destination to the source; -qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return -path. - - Source side - Forward path - written by migration thread - Return path - opened by main thread, read by return-path thread - - Destination side - Forward path - read by main thread - Return path - opened by main thread, written by main thread AND postcopy - thread (protected by rp_mutex) - -= Postcopy = -'Postcopy' migration is a way to deal with migrations that refuse to converge -(or take too long to converge) its plus side is that there is an upper bound on -the amount of migration traffic and time it takes, the down side is that during -the postcopy phase, a failure of *either* side or the network connection causes -the guest to be lost. - -In postcopy the destination CPUs are started before all the memory has been -transferred, and accesses to pages that are yet to be transferred cause -a fault that's translated by QEMU into a request to the source QEMU. - -Postcopy can be combined with precopy (i.e. normal migration) so that if precopy -doesn't finish in a given time the switch is made to postcopy. 
- -=== Enabling postcopy === - -To enable postcopy, issue this command on the monitor prior to the -start of migration: - -migrate_set_capability postcopy-ram on - -The normal commands are then used to start a migration, which is still -started in precopy mode. Issuing: - -migrate_start_postcopy - -will now cause the transition from precopy to postcopy. -It can be issued immediately after migration is started or any -time later on. Issuing it after the end of a migration is harmless. - -Note: During the postcopy phase, the bandwidth limits set using -migrate_set_speed is ignored (to avoid delaying requested pages that -the destination is waiting for). - -=== Postcopy device transfer === - -Loading of device data may cause the device emulation to access guest RAM -that may trigger faults that have to be resolved by the source, as such -the migration stream has to be able to respond with page data *during* the -device load, and hence the device data has to be read from the stream completely -before the device load begins to free the stream up. This is achieved by -'packaging' the device data into a blob that's read in one go. - -Source behaviour - -Until postcopy is entered the migration stream is identical to normal -precopy, except for the addition of a 'postcopy advise' command at -the beginning, to tell the destination that postcopy might happen. -When postcopy starts the source sends the page discard data and then -forms the 'package' containing: - - Command: 'postcopy listen' - The device state - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. RAM) - Command: 'postcopy run' - -The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the -contents are formatted in the same way as the main migration stream. 
- -During postcopy the source scans the list of dirty pages and sends them -to the destination without being requested (in much the same way as precopy), -however when a page request is received from the destination, the dirty page -scanning restarts from the requested location. This causes requested pages -to be sent quickly, and also causes pages directly after the requested page -to be sent quickly in the hope that those pages are likely to be used -by the destination soon. - -Destination behaviour - -Initially the destination looks the same as precopy, with a single thread -reading the migration stream; the 'postcopy advise' and 'discard' commands -are processed to change the way RAM is managed, but don't affect the stream -processing. - ------------------------------------------------------------------------------- - 1 2 3 4 5 6 7 -main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) -thread | | - | (page request) - | \___ - v \ -listen thread: --- page -- page -- page -- page -- page -- - - a b c ------------------------------------------------------------------------------- - -On receipt of CMD_PACKAGED (1) - All the data associated with the package - the ( ... ) section in the -diagram - is read into memory, and the main thread recurses into -qemu_loadvm_state_main to process the contents of the package (2) -which contains commands (3,6) and devices (4...) - -On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) -a new thread (a) is started that takes over servicing the migration stream, -while the main thread carries on loading the package. It loads normal -background page data (b) but if during a device load a fault happens (5) the -returned page (c) is loaded by the listen thread allowing the main threads -device load to carry on. - -The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination -CPUs start running. 
-At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour -and is no longer used by migration, while the listen thread carries -on servicing page data until the end of migration. - -=== Postcopy states === - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - Advise: Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - Discard: Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - Listen: The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - Running: POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - End: The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. 
- -=== Source side page maps === - -The source side keeps two bitmaps during postcopy; 'the migration bitmap' -and 'unsent map'. The 'migration bitmap' is basically the same as in -the precopy case, and holds a bit to indicate that page is 'dirty' - -i.e. needs sending. During the precopy phase this is updated as the CPU -dirties pages, however during postcopy the CPUs are stopped and nothing -should dirty anything any more. - -The 'unsent map' is used for the transition to postcopy. It is a bitmap that -has a bit cleared whenever a page is sent to the destination, however during -the transition to postcopy mode it is combined with the migration bitmap -to form a set of pages that: - a) Have been sent but then redirtied (which must be discarded) - b) Have not yet been sent - which also must be discarded to cause any - transparent huge pages built during precopy to be broken. - -Note that the contents of the unsentmap are sacrificed during the calculation -of the discard set and thus aren't valid once in postcopy. The dirtymap -is still valid and is used to ensure that no page is sent more than once. Any -request for a page that has already been sent is ignored. Duplicate requests -such as this can happen as a page is sent at about the same time the -destination accesses it. - -=== Postcopy with hugepages === - -Postcopy now works with hugetlbfs backed memory: - a) The linux kernel on the destination must support userfault on hugepages. - b) The huge-page configuration on the source and destination VMs must be - identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that -mem-path /dev/hugepages will fall back to allocating normal - RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using -mem-prealloc enforces the allocation using hugepages. 
- d) Care should be taken with the size of hugepage used; postcopy with 2MB - hugepages works well, however 1GB hugepages are likely to be problematic - since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, - and until the full page is transferred the destination thread is blocked. diff --git a/docs/multi-thread-tcg.txt b/docs/multi-thread-tcg.txt deleted file mode 100644 index a99b4564c6..0000000000 --- a/docs/multi-thread-tcg.txt +++ /dev/null @@ -1,350 +0,0 @@ -Copyright (c) 2015-2016 Linaro Ltd. - -This work is licensed under the terms of the GNU GPL, version 2 or -later. See the COPYING file in the top-level directory. - -Introduction -============ - -This document outlines the design for multi-threaded TCG system-mode -emulation. The current user-mode emulation mirrors the thread -structure of the translated executable. Some of the work will be -applicable to both system and linux-user emulation. - -The original system-mode TCG implementation was single threaded and -dealt with multiple CPUs with simple round-robin scheduling. This -simplified a lot of things but became increasingly limited as systems -being emulated gained additional cores and per-core performance gains -for host systems started to level off. - -vCPU Scheduling -=============== - -We introduce a new running mode where each vCPU will run on its own -user-space thread. This will be enabled by default for all FE/BE -combinations that have had the required work done to support this -safely. - -In the general case of running translated code there should be no -inter-vCPU dependencies and all vCPUs should be able to run at full -speed. Synchronisation will only be required while accessing internal -shared data structures or when the emulated architecture requires a -coherent representation of the emulated machine state. 
- -Shared Data Structures -====================== - -Main Run Loop -------------- - -Even when there is no code being generated there are a number of -structures associated with the hot-path through the main run-loop. -These are associated with looking up the next translation block to -execute. These include: - - tb_jmp_cache (per-vCPU, cache of recent jumps) - tb_ctx.htable (global hash table, phys address->tb lookup) - -As TB linking only occurs when blocks are in the same page this code -is critical to performance as looking up the next TB to execute is the -most common reason to exit the generated code. - -DESIGN REQUIREMENT: Make access to lookup structures safe with -multiple reader/writer threads. Minimise any lock contention to do it. - -The hot-path avoids using locks where possible. The tb_jmp_cache is -updated with atomic accesses to ensure consistent results. The fall -back QHT based hash table is also designed for lockless lookups. Locks -are only taken when code generation is required or TranslationBlocks -have their block-to-block jumps patched. - -Global TCG State ----------------- - -We need to protect the entire code generation cycle including any post -generation patching of the translated code. This also implies a shared -translation buffer which contains code running on all cores. Any -execution path that comes to the main run loop will need to hold a -mutex for code generation. This also includes times when we need flush -code or entries from any shared lookups/caches. Structures held on a -per-vCPU basis won't need locking unless other vCPUs will need to -modify them. - -DESIGN REQUIREMENT: Add locking around all code generation and TB -patching. - -(Current solution) - -Mainly as part of the linux-user work all code generation is -serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the -place of mmap_lock() in linux-user. 
- -Translation Blocks ------------------- - -Currently the whole system shares a single code generation buffer -which when full will force a flush of all translations and start from -scratch again. Some operations also force a full flush of translations -including: - - - debugging operations (breakpoint insertion/removal) - - some CPU helper functions - -This is done with the async_safe_run_on_cpu() mechanism to ensure all -vCPUs are quiescent when changes are being made to shared global -structures. - -More granular translation invalidation events are typically due -to a change of the state of a physical page: - - - code modification (self modify code, patching code) - - page changes (new page mapping in linux-user mode) - -While setting the invalid flag in a TranslationBlock will stop it -being used when looked up in the hot-path there are a number of other -book-keeping structures that need to be safely cleared. - -Any TranslationBlocks which have been patched to jump directly to the -now invalid blocks need the jump patches reversing so they will return -to the C code. - -There are a number of look-up caches that need to be properly updated -including the: - - - jump lookup cache - - the physical-to-tb lookup hash table - - the global page table - -The global page table (l1_map) which provides a multi-level look-up -for PageDesc structures which contain pointers to the start of a -linked list of all Translation Blocks in that page (see page_next). - -Both the jump patching and the page cache involve linked lists that -the invalidated TranslationBlock needs to be removed from. - -DESIGN REQUIREMENT: Safely handle invalidation of TBs - - safely patch/revert direct jumps - - remove central PageDesc lookup entries - - ensure lookup caches/hashes are safely updated - -(Current solution) - -The direct jump themselves are updated atomically by the TCG -tb_set_jmp_target() code. 
Modifications to the linked lists that allow -searching for linked pages are done under the protection of the -tb_lock(). - -The global page table is protected by the tb_lock() in system-mode and -mmap_lock() in linux-user mode. - -The lookup caches are updated atomically and the lookup hash uses QHT -which is designed for concurrent safe lookup. - - -Memory maps and TLBs --------------------- - -The memory handling code is fairly critical to the speed of memory -access in the emulated system. The SoftMMU code is designed so the -hot-path can be handled entirely within translated code. This is -handled with a per-vCPU TLB structure which once populated will allow -a series of accesses to the page to occur without exiting the -translated code. It is possible to set flags in the TLB address which -will ensure the slow-path is taken for each access. This can be done -to support: - - - Memory regions (dividing up access to PIO, MMIO and RAM) - - Dirty page tracking (for code gen, SMC detection, migration and display) - - Virtual TLB (for translating guest address->real address) - -When the TLB tables are updated by a vCPU thread other than their own -we need to ensure it is done in a safe way so no inconsistent state is -seen by the vCPU thread. - -Some operations require updating a number of vCPUs' TLBs at the same -time in a synchronised manner.
- -DESIGN REQUIREMENTS: - - - TLB Flush All/Page - - can be across-vCPUs - - cross vCPU TLB flush may need other vCPU brought to halt - - change may need to be visible to the calling vCPU immediately - - TLB Flag Update - - usually cross-vCPU - - want change to be visible as soon as possible - - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs) - - This is a per-vCPU table - by definition can't race - - updated by its own thread when the slow-path is forced - -(Current solution) - -We have updated cputlb.c to defer operations when a cross-vCPU -operation is required, using async_run_on_cpu(), which ensures each vCPU sees a -coherent state when it next runs its work (in a few instructions -time). - -A new set of operations (tlb_flush_*_all_cpus) take an additional flag -which when set will force synchronisation by setting the source vCPU's -work as "safe work" and exiting the cpu run loop. This ensures that by the -time execution restarts all flush operations have completed. - -TLB flag updates are all done atomically and are also protected by the -tb_lock() which is used by the functions that update the TLB in bulk. - -(Known limitation) - -Not really a limitation but the wait mechanism is overly strict for -some architectures which only need flushes completed by a barrier -instruction. This could be a future optimisation. - -Emulated hardware state ------------------------ - -Currently thanks to KVM work any access to IO memory is automatically -protected by the global iothread mutex, also known as the BQL (Big -Qemu Lock). Any IO region that doesn't use global mutex is expected to -do its own locking. - -However IO memory isn't the only way emulated hardware state can be -modified. Some architectures have model specific registers that -trigger hardware emulation features. Generally any translation helper -that needs to update more than a single vCPU's state should take the -BQL.
- -As the BQL, or global iothread mutex is shared across the system we -push the use of the lock as far down into the TCG code as possible to -minimise contention. - -(Current solution) - -MMIO access automatically serialises hardware emulation by way of the -BQL. Currently ARM targets serialise all ARM_CP_IO register accesses -and also defer the reset/startup of vCPUs to the vCPU context by way -of async_run_on_cpu(). - -Updates to interrupt state are also protected by the BQL as they can -often be cross vCPU. - -Memory Consistency -================== - -Between emulated guests and host systems there are a range of memory -consistency models. Even emulating weakly ordered systems on strongly -ordered hosts needs to ensure things like store-after-load re-ordering -can be prevented when the guest wants to. - -Memory Barriers ---------------- - -Barriers (sometimes known as fences) provide a mechanism for software -to enforce a particular ordering of memory operations from the point -of view of external observers (e.g. another processor core). They can -apply to any memory operations as well as just loads or stores. - -The Linux kernel has an excellent write-up on the various forms of -memory barrier and the guarantees they can provide [1]. - -Barriers are often wrapped around synchronisation primitives to -provide explicit memory ordering semantics. However they can be used -by themselves to provide safe lockless access by ensuring for example -a change to a signal flag will only be visible once the changes to -payload are. - -DESIGN REQUIREMENT: Add a new tcg_memory_barrier op - -This would enforce a strong load/store ordering so all loads/stores -complete at the memory barrier. On single-core non-SMP strongly -ordered backends this could become a NOP. - -Aside from explicit standalone memory barrier instructions there are -also implicit memory ordering semantics which comes with each guest -memory access instruction. 
For example all x86 load/stores come with -fairly strong guarantees of sequential consistency whereas ARM has -special variants of load/store instructions that imply acquire/release -semantics. - -In the case of a strongly ordered guest architecture being emulated on -a weakly ordered host the scope for a heavy performance impact is -quite high. - -DESIGN REQUIREMENTS: Be efficient with use of memory barriers - - host systems with stronger implied guarantees can skip some barriers - - merge consecutive barriers to the strongest one - -(Current solution) - -The system currently has a tcg_gen_mb() which will add memory barrier -operations if code generation is being done in a parallel context. The -tcg_optimize() function attempts to merge barriers up to their -strongest form before any load/store operations. The solution was -originally developed and tested for linux-user based systems. All -backends have been converted to emit fences when required. So far the -following front-ends have been updated to emit fences when required: - - - target-i386 - - target-arm - - target-aarch64 - - target-alpha - - target-mips - -Memory Control and Maintenance ------------------------------- - -This includes a class of instructions for controlling system cache -behaviour. While QEMU doesn't model cache behaviour these instructions -are often seen when code modification has taken place to ensure the -changes take effect. - -Synchronisation Primitives --------------------------- - -There are two broad types of synchronisation primitives found in -modern ISAs: atomic instructions and exclusive regions. - -The first type offers a simple atomic instruction which will guarantee -some sort of test and conditional store will be truly atomic w.r.t. -other cores sharing access to the memory. The classic example is the -x86 cmpxchg instruction.
- -The second type offers a pair of load/store instructions which offer a -guarantee that a region of memory has not been touched between the -load and store instructions. An example of this is ARM's ldrex/strex -pair where the strex instruction will return a flag indicating a -successful store only if no other CPU has accessed the memory region -since the ldrex. - -Traditionally TCG has generated a series of operations that work -because they are within the context of a single translation block so -will have completed before another CPU is scheduled. However with -the ability to have multiple threads running to emulate multiple CPUs -we will need to explicitly expose these semantics. - -DESIGN REQUIREMENTS: - - Support classic atomic instructions - - Support load/store exclusive (or load link/store conditional) pairs - - Generic enough infrastructure to support all guest architectures -CURRENT OPEN QUESTIONS: - - How problematic is the ABA problem in general? - -(Current solution) - -The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which -can be used directly or combined to emulate other instructions like -ARM's ldrex/strex instructions. While they are susceptible to the ABA -problem so far common guests have not implemented patterns where -this may be a problem - typically presenting a locking ABI which -assumes cmpxchg like semantics. - -The code also includes a fall-back for cases where multi-threaded TCG -ops can't work (e.g. guest atomic width > host atomic width). In this -case an EXCP_ATOMIC exit occurs and the instruction is emulated with -an exclusive lock which ensures all emulation is serialised. - -While the atomic helpers look good enough for now there may be a need -to look at solutions that can more closely model the guest -architecture's semantics.
- -========== - -[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt deleted file mode 100644 index e4d340bbb7..0000000000 --- a/docs/multiple-iothreads.txt +++ /dev/null @@ -1,137 +0,0 @@ -Copyright (c) 2014 Red Hat Inc. - -This work is licensed under the terms of the GNU GPL, version 2 or later. See -the COPYING file in the top-level directory. - - -This document explains the IOThread feature and how to write code that runs -outside the QEMU global mutex. - -The main loop and IOThreads ---------------------------- -QEMU is an event-driven program that can do several things at once using an -event loop. The VNC server and the QMP monitor are both processed from the -same event loop, which monitors their file descriptors until they become -readable and then invokes a callback. - -The default event loop is called the main loop (see main-loop.c). It is -possible to create additional event loop threads using -object -iothread,id=my-iothread. - -Side note: The main loop and IOThread are both event loops but their code is -not shared completely. Sometimes it is useful to remember that although they -are conceptually similar they are currently not interchangeable. - -Why IOThreads are useful ------------------------- -IOThreads allow the user to control the placement of work. The main loop is a -scalability bottleneck on hosts with many CPUs. Work can be spread across -several IOThreads instead of just one main loop. When set up correctly this -can improve I/O latency and reduce jitter seen by the guest. - -The main loop is also deeply associated with the QEMU global mutex, which is a -scalability bottleneck in itself. vCPU threads and the main loop use the QEMU -global mutex to serialize execution of QEMU code. This mutex is necessary -because a lot of QEMU's code historically was not thread-safe. 
- -The fact that all I/O processing is done in a single main loop and that the -QEMU global mutex is contended by all vCPU threads and the main loop explain -why it is desirable to place work into IOThreads. - -The experimental virtio-blk data-plane implementation has been benchmarked and -shows these effects: -ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf - -How to program for IOThreads ----------------------------- -The main difference between legacy code and new code that can run in an -IOThread is dealing explicitly with the event loop object, AioContext -(see include/block/aio.h). Code that only works in the main loop -implicitly uses the main loop's AioContext. Code that supports running -in IOThreads must be aware of its AioContext. - -AioContext supports the following services: - * File descriptor monitoring (read/write/error on POSIX hosts) - * Event notifiers (inter-thread signalling) - * Timers - * Bottom Halves (BH) deferred callbacks - -There are several old APIs that use the main loop AioContext: - * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor - * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier - * LEGACY timer_new_ms() - create a timer - * LEGACY qemu_bh_new() - create a BH - * LEGACY qemu_aio_wait() - run an event loop iteration - -Since they implicitly work on the main loop they cannot be used in code that -runs in an IOThread. They might cause a crash or deadlock if called from an -IOThread since the QEMU global mutex is not held. - -Instead, use the AioContext functions directly (see include/block/aio.h): - * aio_set_fd_handler() - monitor a file descriptor - * aio_set_event_notifier() - monitor an event notifier - * aio_timer_new() - create a timer - * aio_bh_new() - create a BH - * aio_poll() - run an event loop iteration - -The AioContext can be obtained from the IOThread using -iothread_get_aio_context() or for the main loop using qemu_get_aio_context(). 
-Code that takes an AioContext argument works both in IOThreads or the main -loop, depending on which AioContext instance the caller passes in. - -How to synchronize with an IOThread ------------------------------------ -AioContext is not thread-safe so some rules must be followed when using file -descriptors, event notifiers, timers, or BHs across threads: - -1. AioContext functions can always be called safely. They handle their -own locking internally. - -2. Other threads wishing to access the AioContext must use -aio_context_acquire()/aio_context_release() for mutual exclusion. Once the -context is acquired no other thread can access it or run event loop iterations -in this AioContext. - -aio_context_acquire()/aio_context_release() calls may be nested. This -means you can call them if you're not sure whether #2 applies. - -There is currently no lock ordering rule if a thread needs to acquire multiple -AioContexts simultaneously. Therefore, it is only safe for code holding the -QEMU global mutex to acquire other AioContexts. - -Side note: the best way to schedule a function call across threads is to call -aio_bh_schedule_oneshot(). No acquire/release or locking is needed. - -AioContext and the block layer ------------------------------- -The AioContext originates from the QEMU block layer, even though nowadays -AioContext is a generic event loop that can be used by any QEMU subsystem. - -The block layer has support for AioContext integrated. Each BlockDriverState -is associated with an AioContext using bdrv_set_aio_context() and -bdrv_get_aio_context(). This allows block layer code to process I/O inside the -right AioContext. Other subsystems may wish to follow a similar approach. - -Block layer code must therefore expect to run in an IOThread and avoid using -old APIs that implicitly use the main loop. See the "How to program for -IOThreads" above for information on how to do that. 
- -If main loop code such as a QMP function wishes to access a BlockDriverState -it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure -that callbacks in the IOThread do not run in parallel. - -Code running in the monitor typically needs to ensure that past -requests from the guest are completed. When a block device is running -in an IOThread, the IOThread can also process requests from the guest -(via ioeventfd). To achieve both objects, wrap the code between -bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained -section". The functions must be called between aio_context_acquire() -and aio_context_release(). You can freely release and re-acquire the -AioContext within a drained section. - -Long-running jobs (usually in the form of coroutines) are best scheduled in -the BlockDriverState's AioContext to avoid the need to acquire/release around -each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier, -or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends, -can be used to get a notification whenever bdrv_set_aio_context() moves a -BlockDriverState to a different AioContext. diff --git a/docs/q35-emulated.cfg b/docs/q35-emulated.cfg deleted file mode 100644 index c6416d6545..0000000000 --- a/docs/q35-emulated.cfg +++ /dev/null @@ -1,288 +0,0 @@ -# q35 - Emulated guest (graphical console) -# ========================================================= -# -# Usage: -# -# $ qemu-system-x86_64 \ -# -nodefaults \ -# -readconfig q35-emulated.cfg -# -# You will probably need to tweak the lines marked as -# CHANGE ME before being able to use this configuration! -# -# The guest will have a selection of emulated devices that -# closely resembles that of a physical machine, and will be -# accessed through a graphical console. 
-# -# --------------------------------------------------------- -# -# Using -nodefaults is required to have full control over -# the virtual hardware: when it's specified, QEMU will -# populate the board with only the builtin peripherals -# plus a small selection of core PCI devices and -# controllers; the user will then have to explicitly add -# further devices. -# -# The core PCI devices show up in the guest as: -# -# 00:00.0 Host bridge -# 00:1f.0 ISA bridge / LPC -# 00:1f.2 SATA (AHCI) controller -# 00:1f.3 SMBus controller -# -# This configuration file adds a number of devices that -# are pretty much guaranteed to be present in every single -# physical machine based on q35, more specifically: -# -# 00:01.0 VGA compatible controller -# 00:19.0 Ethernet controller -# 00:1a.* USB controller (#2) -# 00:1b.0 Audio device -# 00:1c.* PCI bridge (PCI Express Root Ports) -# 00:1d.* USB Controller (#1) -# 00:1e.0 PCI bridge (legacy PCI bridge) -# -# More information about these devices is available below. - - -# Machine options -# ========================================================= -# -# We use the q35 machine type and enable KVM acceleration -# for better performance. -# -# Using less than 1 GiB of memory is probably not going to -# yield good performance in the guest, and might even lead -# to obscure boot issues in some cases. -# -# Unfortunately, there is no way to configure the CPU model -# in this file, so it will have to be provided on the -# command line. - -[machine] - type = "q35" - accel = "kvm" - -[memory] - size = "1024" - - -# PCI bridge (PCI Express Root Ports) -# ========================================================= -# -# We add four PCI Express Root Ports, all sharing the same -# slot on the PCI Express Root Bus. These ports support -# hotplug. 
- -[device "ich9-pcie-port-1"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.0" - port = "1" - chassis = "1" - -[device "ich9-pcie-port-2"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.1" - port = "2" - chassis = "2" - -[device "ich9-pcie-port-3"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.2" - port = "3" - chassis = "3" - -[device "ich9-pcie-port-4"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.3" - port = "4" - chassis = "4" - - -# PCI bridge (legacy PCI bridge) -# ========================================================= -# -# This bridge can be used to build an independent topology -# for legacy PCI devices. PCI Express devices should be -# plugged into PCI Express slots instead, so ideally there -# will be no devices connected to this bridge. - -[device "ich9-pci-bridge"] - driver = "i82801b11-bridge" - bus = "pcie.0" - addr = "1e.0" - - -# SATA storage -# ========================================================= -# -# An implicit SATA controller is created automatically for -# every single q35 guest; here we create a disk, backed by -# a qcow2 disk image on the host's filesystem, and attach -# it to that controller so that the guest can use it. -# -# We also create an optical disk, mostly for installation -# purposes: once the guest OS has been successfully -# installed, the guest will no longer boot from optical -# media. If you don't want, or no longer want, to have an -# optical disk in the guest you can safely comment out -# all relevant sections below.
- -[device "sata-disk"] - driver = "ide-hd" - bus = "ide.0" - drive = "disk" - bootindex = "1" - -[drive "disk"] - file = "guest.qcow2" # CHANGE ME - format = "qcow2" - if = "none" - -[device "sata-optical-disk"] - driver = "ide-cd" - bus = "ide.1" - drive = "optical-disk" - bootindex = "2" - -[drive "optical-disk"] - file = "install.iso" # CHANGE ME - format = "raw" - if = "none" - - -# USB controller (#1) -# ========================================================= -# -# EHCI controller + UHCI companion controllers. - -[device "ich9-ehci-1"] - driver = "ich9-usb-ehci1" - multifunction = "on" - bus = "pcie.0" - addr = "1d.7" - -[device "ich9-uhci-1"] - driver = "ich9-usb-uhci1" - multifunction = "on" - bus = "pcie.0" - addr = "1d.0" - masterbus = "ich9-ehci-1.0" - firstport = "0" - -[device "ich9-uhci-2"] - driver = "ich9-usb-uhci2" - multifunction = "on" - bus = "pcie.0" - addr = "1d.1" - masterbus = "ich9-ehci-1.0" - firstport = "2" - -[device "ich9-uhci-3"] - driver = "ich9-usb-uhci3" - multifunction = "on" - bus = "pcie.0" - addr = "1d.2" - masterbus = "ich9-ehci-1.0" - firstport = "4" - - -# USB controller (#2) -# ========================================================= -# -# EHCI controller + UHCI companion controllers. 
- -[device "ich9-ehci-2"] - driver = "ich9-usb-ehci2" - multifunction = "on" - bus = "pcie.0" - addr = "1a.7" - -[device "ich9-uhci-4"] - driver = "ich9-usb-uhci4" - multifunction = "on" - bus = "pcie.0" - addr = "1a.0" - masterbus = "ich9-ehci-2.0" - firstport = "0" - -[device "ich9-uhci-5"] - driver = "ich9-usb-uhci5" - multifunction = "on" - bus = "pcie.0" - addr = "1a.1" - masterbus = "ich9-ehci-2.0" - firstport = "2" - -[device "ich9-uhci-6"] - driver = "ich9-usb-uhci6" - multifunction = "on" - bus = "pcie.0" - addr = "1a.2" - masterbus = "ich9-ehci-2.0" - firstport = "4" - - -# Ethernet controller -# ========================================================= -# -# We add a Gigabit Ethernet interface to the guest; on the -# host side, we take advantage of user networking so that -# the QEMU process doesn't require any additional -# privileges. - -[netdev "hostnet"] - type = "user" - -[device "net"] - driver = "e1000" - netdev = "hostnet" - bus = "pcie.0" - addr = "19.0" - - -# VGA compatible controller -# ========================================================= -# -# We use stdvga instead of Cirrus as it supports more video -# modes and is closer to what actual hardware looks like. -# -# If you're running the guest on a remote, potentially -# headless host, you will probably want to append something -# like -# -# -display vnc=127.0.0.1:0 -# -# to the command line in order to prevent QEMU from -# creating a graphical display window on the host and -# enable remote access instead. - -[device "video"] - driver = "VGA" - bus = "pcie.0" - addr = "01.0" - - -# Audio device -# ========================================================= -# -# The sound card is a legacy PCI device that is plugged -# directly into the PCI Express Root Bus. 
- -[device "ich9-hda-audio"] - driver = "ich9-intel-hda" - bus = "pcie.0" - addr = "1b.0" - -[device "ich9-hda-duplex"] - driver = "hda-duplex" - bus = "ich9-hda-audio.0" - cad = "0" diff --git a/docs/q35-virtio-graphical.cfg b/docs/q35-virtio-graphical.cfg deleted file mode 100644 index 28bde2fc57..0000000000 --- a/docs/q35-virtio-graphical.cfg +++ /dev/null @@ -1,248 +0,0 @@ -# q35 - VirtIO guest (graphical console) -# ========================================================= -# -# Usage: -# -# $ qemu-system-x86_64 \ -# -nodefaults \ -# -readconfig q35-virtio-graphical.cfg -# -# You will probably need to tweak the lines marked as -# CHANGE ME before being able to use this configuration! -# -# The guest will have a selection of VirtIO devices -# tailored towards optimal performance with modern guests, -# and will be accessed through a graphical console. -# -# --------------------------------------------------------- -# -# Using -nodefaults is required to have full control over -# the virtual hardware: when it's specified, QEMU will -# populate the board with only the builtin peripherals -# plus a small selection of core PCI devices and -# controllers; the user will then have to explicitly add -# further devices. -# -# The core PCI devices show up in the guest as: -# -# 00:00.0 Host bridge -# 00:1f.0 ISA bridge / LPC -# 00:1f.2 SATA (AHCI) controller -# 00:1f.3 SMBus controller -# -# This configuration file adds a number of other useful -# devices, more specifically: -# -# 00:01.0 VGA compatible controller -# 00:1b.0 Audio device -# 00.1c.* PCI bridge (PCI Express Root Ports) -# 01:00.0 SCSI storage controller -# 02:00.0 Ethernet controller -# 03:00.0 USB controller -# -# More information about these devices is available below. - - -# Machine options -# ========================================================= -# -# We use the q35 machine type and enable KVM acceleration -# for better performance. 
-# -# Using less than 1 GiB of memory is probably not going to -# yield good performance in the guest, and might even lead -# to obscure boot issues in some cases. - -[machine] - type = "q35" - accel = "kvm" - -[memory] - size = "1024" - - -# PCI bridge (PCI Express Root Ports) -# ========================================================= -# -# We create eight PCI Express Root Ports, and we plug them -# all into separate functions of the same slot. Some of -# them will be used by devices, the rest will remain -# available for hotplug. - -[device "pcie.1"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.0" - port = "1" - chassis = "1" - multifunction = "on" - -[device "pcie.2"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.1" - port = "2" - chassis = "2" - -[device "pcie.3"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.2" - port = "3" - chassis = "3" - -[device "pcie.4"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.3" - port = "4" - chassis = "4" - -[device "pcie.5"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.4" - port = "5" - chassis = "5" - -[device "pcie.6"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.5" - port = "6" - chassis = "6" - -[device "pcie.7"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.6" - port = "7" - chassis = "7" - -[device "pcie.8"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.7" - port = "8" - chassis = "8" - - -# SCSI storage controller (and storage) -# ========================================================= -# -# We use virtio-scsi here so that we can (hot)plug a large -# number of disks without running into issues; a SCSI disk, -# backed by a qcow2 disk image on the host's filesystem, is -# attached to it. -# -# We also create an optical disk, mostly for installation -# purposes: once the guest OS has been succesfully -# installed, the guest will no longer boot from optical -# media. 
If you don't want, or no longer want, to have an -# optical disk in the guest you can safely comment out -# all relevant sections below. - -[device "scsi"] - driver = "virtio-scsi-pci" - bus = "pcie.1" - addr = "00.0" - -[device "scsi-disk"] - driver = "scsi-hd" - bus = "scsi.0" - drive = "disk" - bootindex = "1" - -[drive "disk"] - file = "guest.qcow2" # CHANGE ME - format = "qcow2" - if = "none" - -[device "scsi-optical-disk"] - driver = "scsi-cd" - bus = "scsi.0" - drive = "optical-disk" - bootindex = "2" - -[drive "optical-disk"] - file = "install.iso" # CHANGE ME - format = "raw" - if = "none" - - -# Ethernet controller -# ========================================================= -# -# We use virtio-net for improved performance over emulated -# hardware; on the host side, we take advantage of user -# networking so that the QEMU process doesn't require any -# additional privileges. - -[netdev "hostnet"] - type = "user" - -[device "net"] - driver = "virtio-net-pci" - netdev = "hostnet" - bus = "pcie.2" - addr = "00.0" - - -# USB controller (and input devices) -# ========================================================= -# -# We add a virtualization-friendly USB 3.0 controller and -# a USB tablet so that graphical guests can be controlled -# appropriately. A USB keyboard is not needed, as q35 -# guests get a PS/2 one added automatically. - -[device "usb"] - driver = "nec-usb-xhci" - bus = "pcie.3" - addr = "00.0" - -[device "tablet"] - driver = "usb-tablet" - bus = "usb.0" - - -# VGA compatible controller -# ========================================================= -# -# We plug the QXL video card directly into the PCI Express -# Root Bus as it is a legacy PCI device; this way, we can -# reduce the number of PCI Express controllers in the -# guest. 
-# -# If you're running the guest on a remote, potentially -# headless host, you will probably want to append something -# like -# -# -display vnc=127.0.0.1:0 -# -# to the command line in order to prevent QEMU from -# creating a graphical display window on the host and -# enable remote access instead. - -[device "video"] - driver = "qxl-vga" - bus = "pcie.0" - addr = "01.0" - - -# Audio device -# ========================================================= -# -# Like the video card, the sound card is a legacy PCI -# device and as such can be plugged directly into the PCI -# Express Root Bus. - -[device "sound"] - driver = "ich9-intel-hda" - bus = "pcie.0" - addr = "1b.0" - -[device "duplex"] - driver = "hda-duplex" - bus = "sound.0" - cad = "0" diff --git a/docs/q35-virtio-serial.cfg b/docs/q35-virtio-serial.cfg deleted file mode 100644 index c33c9cc07a..0000000000 --- a/docs/q35-virtio-serial.cfg +++ /dev/null @@ -1,193 +0,0 @@ -# q35 - VirtIO guest (serial console) -# ========================================================= -# -# Usage: -# -# $ qemu-system-x86_64 \ -# -nodefaults \ -# -readconfig q35-virtio-serial.cfg \ -# -display none -serial mon:stdio -# -# You will probably need to tweak the lines marked as -# CHANGE ME before being able to use this configuration! -# -# The guest will have a selection of VirtIO devices -# tailored towards optimal performance with modern guests, -# and will be accessed through the serial console. -# -# --------------------------------------------------------- -# -# Using -nodefaults is required to have full control over -# the virtual hardware: when it's specified, QEMU will -# populate the board with only the builtin peripherals -# plus a small selection of core PCI devices and -# controllers; the user will then have to explicitly add -# further devices. 
-# -# The core PCI devices show up in the guest as: -# -# 00:00.0 Host bridge -# 00:1f.0 ISA bridge / LPC -# 00:1f.2 SATA (AHCI) controller -# 00:1f.3 SMBus controller -# -# This configuration file adds a number of other useful -# devices, more specifically: -# -# 00:1c.* PCI bridge (PCI Express Root Ports) -# 01:00.0 SCSI storage controller -# 02:00.0 Ethernet controller -# -# More information about these devices is available below. -# -# We use '-display none' to prevent QEMU from creating a -# graphical display window, which would serve no use in -# this specific configuration, and '-serial mon:stdio' to -# multiplex the guest's serial console and the QEMU monitor -# to the host's stdio; use 'Ctrl+A h' to learn how to -# switch between the two and more. - - -# Machine options -# ========================================================= -# -# We use the q35 machine type and enable KVM acceleration -# for better performance. -# -# Using less than 1 GiB of memory is probably not going to -# yield good performance in the guest, and might even lead -# to obscure boot issues in some cases. - -[machine] - type = "q35" - accel = "kvm" - -[memory] - size = "1024" - - -# PCI bridge (PCI Express Root Ports) -# ========================================================= -# -# We create eight PCI Express Root Ports, and we plug them -# all into separate functions of the same slot. Some of -# them will be used by devices, the rest will remain -# available for hotplug.
- -[device "pcie.1"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.0" - port = "1" - chassis = "1" - multifunction = "on" - -[device "pcie.2"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.1" - port = "2" - chassis = "2" - -[device "pcie.3"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.2" - port = "3" - chassis = "3" - -[device "pcie.4"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.3" - port = "4" - chassis = "4" - -[device "pcie.5"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.4" - port = "5" - chassis = "5" - -[device "pcie.6"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.5" - port = "6" - chassis = "6" - -[device "pcie.7"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.6" - port = "7" - chassis = "7" - -[device "pcie.8"] - driver = "pcie-root-port" - bus = "pcie.0" - addr = "1c.7" - port = "8" - chassis = "8" - - -# SCSI storage controller (and storage) -# ========================================================= -# -# We use virtio-scsi here so that we can (hot)plug a large -# number of disks without running into issues; a SCSI disk, -# backed by a qcow2 disk image on the host's filesystem, is -# attached to it. -# -# We also create an optical disk, mostly for installation -# purposes: once the guest OS has been successfully -# installed, the guest will no longer boot from optical -# media. If you don't want, or no longer want, to have an -# optical disk in the guest you can safely comment out -# all relevant sections below.
- -[device "scsi"] - driver = "virtio-scsi-pci" - bus = "pcie.1" - addr = "00.0" - -[device "scsi-disk"] - driver = "scsi-hd" - bus = "scsi.0" - drive = "disk" - bootindex = "1" - -[drive "disk"] - file = "guest.qcow2" # CHANGE ME - format = "qcow2" - if = "none" - -[device "scsi-optical-disk"] - driver = "scsi-cd" - bus = "scsi.0" - drive = "optical-disk" - bootindex = "2" - -[drive "optical-disk"] - file = "install.iso" # CHANGE ME - format = "raw" - if = "none" - - -# Ethernet controller -# ========================================================= -# -# We use virtio-net for improved performance over emulated -# hardware; on the host side, we take advantage of user -# networking so that the QEMU process doesn't require any -# additional privileges. - -[netdev "hostnet"] - type = "user" - -[device "net"] - driver = "virtio-net-pci" - netdev = "hostnet" - bus = "pcie.2" - addr = "00.0" diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt deleted file mode 100644 index 52e3874efe..0000000000 --- a/docs/qapi-code-gen.txt +++ /dev/null @@ -1,1310 +0,0 @@ -= How to use the QAPI code generator = - -Copyright IBM Corp. 2011 -Copyright (C) 2012-2016 Red Hat, Inc. - -This work is licensed under the terms of the GNU GPL, version 2 or -later. See the COPYING file in the top-level directory. - -== Introduction == - -QAPI is a native C API within QEMU which provides management-level -functionality to internal and external users. For external -users/processes, this interface is made available by a JSON-based wire -format for the QEMU Monitor Protocol (QMP) for controlling qemu, as -well as the QEMU Guest Agent (QGA) for communicating with the guest. -The remainder of this document uses "Client JSON Protocol" when -referring to the wire contents of a QMP or QGA connection. 
- -To map Client JSON Protocol interfaces to the native C QAPI -implementations, a JSON-based schema is used to define types and -function signatures, and a set of scripts is used to generate types, -signatures, and marshaling/dispatch code. This document will describe -how the schemas, scripts, and resulting code are used. - - -== QMP/Guest agent schema == - -A QAPI schema file is designed to be loosely based on JSON -(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style -and the use of comments; a QAPI schema file is then parsed by a python -code generation program. A valid QAPI schema consists of a series of -top-level expressions, with no commas between them. Where -dictionaries (JSON objects) are used, they are parsed as python -OrderedDicts so that ordering is preserved (for predictable layout of -generated C structs and parameter lists). Ordering doesn't matter -between top-level expressions or the keys within an expression, but -does matter within dictionary values for 'data' and 'returns' members -of a single expression. QAPI schema input is written using 'single -quotes' instead of JSON's "double quotes" (in contrast, Client JSON -Protocol uses no comments, and while input accepts 'single quotes' as -an extension, output is strict JSON using only "double quotes"). As -in JSON, trailing commas are not permitted in arrays or dictionaries. -Input must be ASCII (although QMP supports full Unicode strings, the -QAPI parser does not). At present, there is no place where a QAPI -schema requires the use of JSON numbers or null. - - -=== Comments === - -Comments are allowed; anything between an unquoted # and the following -newline is ignored. - -A multi-line comment that starts and ends with a '##' line is a -documentation comment. These are parsed by the documentation -generator, which recognizes certain markup detailed below. 
- - -==== Documentation markup ==== - -Comment text starting with '=' is a section title: - - # = Section title - -Double the '=' for a subsection title: - - # == Subsection title - -'|' denotes examples: - - # | Text of the example, may span - # | multiple lines - -'*' starts an itemized list: - - # * First item, may span - # multiple lines - # * Second item - -You can also use '-' instead of '*'. - -A decimal number followed by '.' starts a numbered list: - - # 1. First item, may span - # multiple lines - # 2. Second item - -The actual number doesn't matter. You could even use '*' instead of -'2.' for the second item. - -Lists can't be nested. Blank lines are currently not supported within -lists. - -Additional whitespace between the initial '#' and the comment text is -permitted. - -*foo* and _foo_ are for strong and emphasis styles respectively (they -do not work over multiple lines). @foo is used to reference a name in -the schema. - -Example: - -## -# = Section -# == Subsection -# -# Some text foo with *strong* and _emphasis_ -# 1. with a list -# 2. like that -# -# And some code: -# | $ echo foo -# | -> do this -# | <- get that -# -## - - -==== Expression documentation ==== - -Each expression that isn't an include directive may be preceded by a -documentation block. Such blocks are called expression documentation -blocks. - -When documentation is required (see pragma 'doc-required'), expression -documentation blocks are mandatory. - -The documentation block consists of a first line naming the -expression, an optional overview, a description of each argument (for -commands and events) or member (for structs, unions and alternates), -and optional tagged sections. - -FIXME: the parser accepts these things in almost any order. - -Extensions added after the expression was first released carry a -'(since x.y.z)' comment. - -A tagged section starts with one of the following words: -"Note:"/"Notes:", "Since:", "Example"/"Examples", "Returns:", "TODO:".
-The section ends with the start of a new section. - -A 'Since: x.y.z' tagged section lists the release that introduced the -expression. - -For example: - -## -# @BlockStats: -# -# Statistics of a virtual block device or a block backing device. -# -# @device: If the stats are for a virtual block device, the name -# corresponding to the virtual block device. -# -# @node-name: The node name of the device. (since 2.3) -# -# ... more members ... -# -# Since: 0.14.0 -## -{ 'struct': 'BlockStats', - 'data': {'*device': 'str', '*node-name': 'str', - ... more members ... } } - -## -# @query-blockstats: -# -# Query the @BlockStats for all virtual block devices. -# -# @query-nodes: If true, the command will query all the -# block nodes ... explain, explain ... (since 2.3) -# -# Returns: A list of @BlockStats for each virtual block device. -# -# Since: 0.14.0 -# -# Example: -# -# -> { "execute": "query-blockstats" } -# <- { -# ... lots of output ... -# } -# -## -{ 'command': 'query-blockstats', - 'data': { '*query-nodes': 'bool' }, - 'returns': ['BlockStats'] } - -==== Free-form documentation ==== - -A documentation block that isn't an expression documentation block is -a free-form documentation block. These may be used to provide -additional text and structuring content. - - -=== Schema overview === - -The schema sets up a series of types, as well as commands and events -that will use those types. Forward references are allowed: the parser -scans in two passes, where the first pass learns all type names, and -the second validates the schema and generates the code. This allows -the definition of complex structs that can have mutually recursive -types, and allows for indefinite nesting of Client JSON Protocol that -satisfies the schema. A type name should not be defined more than -once. It is permissible for the schema to contain additional types -not used by any commands or events in the Client JSON Protocol, for -the side effect of generated C code used internally.
- -There are eight top-level expressions recognized by the parser: -'include', 'pragma', 'command', 'struct', 'enum', 'union', -'alternate', and 'event'. There are several groups of types: simple -types (a number of built-in types, such as 'int' and 'str'; as well as -enumerations), complex types (structs and two flavors of unions), and -alternate types (a choice between other types). The 'command' and -'event' expressions can refer to existing types by name, or list an -anonymous type as a dictionary. Listing a type name inside an array -refers to a single-dimension array of that type; multi-dimension -arrays are not directly supported (although an array of a complex -struct that contains an array member is possible). - -All names must begin with a letter, and contain only ASCII letters, -digits, hyphen, and underscore. There are two exceptions: enum values -may start with a digit, and names that are downstream extensions (see -section Downstream extensions) start with underscore. - -Names beginning with 'q_' are reserved for the generator, which uses -them for munging QMP names that resemble C keywords or other -problematic strings. For example, a member named "default" in qapi -becomes "q_default" in the generated C code. - -Types, commands, and events share a common namespace. Therefore, -generally speaking, type definitions should always use CamelCase for -user-defined type names, while built-in types are lowercase. - -Type names ending with 'Kind' or 'List' are reserved for the -generator, which uses them for implicit union enums and array types, -respectively. - -Command names, and member names within a type, should be all lower -case with words separated by a hyphen. However, some existing older -commands and complex types use underscore; when extending such -expressions, consistency is preferred over blindly avoiding -underscore. - -Event names should be ALL_CAPS with words separated by underscore. 
- -Member names starting with 'has-' or 'has_' are reserved for the -generator, which uses them for tracking optional members. - -Any name (command, event, type, member, or enum value) beginning with -"x-" is marked experimental, and may be withdrawn or changed -incompatibly in a future release. - -Pragma 'name-case-whitelist' lets you violate the rules on use of -upper and lower case. Use for new code is strongly discouraged. - -In the rest of this document, usage lines are given for each -expression type, with literal strings written in lower case and -placeholders written in capitals. If a literal string includes a -prefix of '*', that key/value pair can be omitted from the expression. -For example, a usage statement that includes '*base':STRUCT-NAME -means that an expression has an optional key 'base', which if present -must have a value that forms a struct name. - - -=== Built-in Types === - -The following types are predefined, and map to C as follows: - - Schema C JSON - str char * any JSON string, UTF-8 - number double any JSON number - int int64_t a JSON number without fractional part - that fits into the C integer type - int8 int8_t likewise - int16 int16_t likewise - int32 int32_t likewise - int64 int64_t likewise - uint8 uint8_t likewise - uint16 uint16_t likewise - uint32 uint32_t likewise - uint64 uint64_t likewise - size uint64_t like uint64_t, except StringInputVisitor - accepts size suffixes - bool bool JSON true or false - any QObject * any JSON value - QType QType JSON string matching enum QType values - - -=== Include directives === - -Usage: { 'include': STRING } - -The QAPI schema definitions can be modularized using the 'include' directive: - - { 'include': 'path/to/file.json' } - -The directive is evaluated recursively, and include paths are relative to the -file using the directive. Multiple includes of the same file are -idempotent. No other keys should appear in the expression, and the include -value should be a string. 
- -As a matter of style, it is a good idea to have all files be -self-contained, but at the moment, nothing prevents an included file -from making a forward reference to a type that is only introduced by -an outer file. The parser may be made stricter in the future to -prevent incomplete include files. - - -=== Pragma directives === - -Usage: { 'pragma': DICT } - -The pragma directive lets you control optional generator behavior. -The dictionary's entries are pragma names and values. - -Pragma's scope is currently the complete schema. Setting the same -pragma to different values in parts of the schema doesn't work. - -Pragma 'doc-required' takes a boolean value. If true, documentation -is required. Default is false. - -Pragma 'returns-whitelist' takes a list of command names that may -violate the rules on permitted return types. Default is none. - -Pragma 'name-case-whitelist' takes a list of names that may violate -rules on use of upper- vs. lower-case letters. Default is none. - - -=== Struct types === - -Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME } - -A struct is a dictionary containing a single 'data' key whose value is -a dictionary; the dictionary may be empty. This corresponds to a -struct in C or an Object in JSON. Each value of the 'data' dictionary -must be the name of a type, or a one-element array containing a type -name. An example of a struct is: - - { 'struct': 'MyType', - 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } } - -The use of '*' as a prefix to the name means the member is optional in -the corresponding JSON protocol usage. - -The default initialization value of an optional argument should not be changed -between versions of QEMU unless the new default maintains backward -compatibility to the user-visible behavior of the old default. 
- -With proper documentation, this policy still allows some flexibility; for -example, documenting that a default of 0 picks an optimal buffer size allows -one release to declare the optimal size at 512 while another release declares -the optimal size at 4096 - the user-visible behavior is not the bytes used by -the buffer, but the fact that the buffer was optimal size. - -On input structures (only mentioned in the 'data' side of a command), changing -from mandatory to optional is safe (older clients will supply the option, and -newer clients can benefit from the default); changing from optional to -mandatory is backwards incompatible (older clients may be omitting the option, -and must continue to work). - -On output structures (only mentioned in the 'returns' side of a command), -changing from mandatory to optional is in general unsafe (older clients may be -expecting the member, and could crash if it is missing), although it -can be done if the only way that the optional argument will be omitted -is when it is triggered by the presence of a new input flag to the -command that older clients don't know to send. Changing from optional -to mandatory is safe. - -A structure that is used in both input and output of various commands -must consider the backwards compatibility constraints of both directions -of use. - -A struct definition can specify another struct as its base. -In this case, the members of the base type are included as top-level members -of the new struct's dictionary in the Client JSON Protocol wire -format. 
An example definition is: - - { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } - { 'struct': 'BlockdevOptionsGenericCOWFormat', - 'base': 'BlockdevOptionsGenericFormat', - 'data': { '*backing': 'str' } } - -An example BlockdevOptionsGenericCOWFormat object on the wire could use -both members like this: - - { "file": "/some/place/my-image", - "backing": "/some/place/my-backing-file" } - - -=== Enumeration types === - -Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING } - { 'enum': STRING, '*prefix': STRING, 'data': ARRAY-OF-STRING } - -An enumeration type is a dictionary containing a single 'data' key -whose value is a list of strings. An example enumeration is: - - { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] } - -Nothing prevents an empty enumeration, although it is probably not -useful. The list of strings should be lower case; if an enum name -represents multiple words, use '-' between words. The string 'max' is -not allowed as an enum value, and values should not be repeated. - -The enum constants will be named by using a heuristic to turn the -type name into a set of underscore separated words. For the example -above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name -of 'MY_ENUM_VALUE1' for the first value. If the default heuristic -does not result in a desirable name, the optional 'prefix' member -can be used when defining the enum. - -The enumeration values are passed as strings over the Client JSON -Protocol, but are encoded as C enum integral values in generated code. -While the C code starts numbering at 0, it is better to use explicit -comparisons to enum values than implicit comparisons to 0; the C code -will also include a generated enum member ending in _MAX for tracking -the size of the enum, useful when using common functions for -converting between strings and enum values. 
Since the wire format -always passes by name, it is acceptable to reorder or add new -enumeration members in any location without breaking clients of Client -JSON Protocol; however, removing enum values would break -compatibility. For any struct that has a member that will only contain -a finite set of string values, using an enum type for that member is -better than open-coding the member to be type 'str'. - - -=== Union types === - -Usage: { 'union': STRING, 'data': DICT } -or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME-OR-DICT, - 'discriminator': ENUM-MEMBER-OF-BASE } - -Union types are used to let the user choose between several different -variants for an object. There are two flavors: simple (no -discriminator or base), and flat (both discriminator and base). A union -type is defined using a data dictionary as explained in the following -paragraphs. The data dictionary for either type of union must not -be empty. - -A simple union type defines a mapping from automatic discriminator -values to data types like in this example: - - { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str' } } - { 'struct': 'BlockdevOptionsQcow2', - 'data': { 'backing': 'str', '*lazy-refcounts': 'bool' } } - - { 'union': 'BlockdevOptionsSimple', - 'data': { 'file': 'BlockdevOptionsFile', - 'qcow2': 'BlockdevOptionsQcow2' } } - -In the Client JSON Protocol, a simple union is represented by a -dictionary that contains the 'type' member as a discriminator, and a -'data' member that is of the specified data type corresponding to the -discriminator value, as in these examples: - - { "type": "file", "data": { "filename": "/some/place/my-image" } } - { "type": "qcow2", "data": { "backing": "/some/place/my-image", - "lazy-refcounts": true } } - -The generated C code uses a struct containing a union. Additionally, -an implicit C enum 'NameKind' is created, corresponding to the union -'Name', for accessing the various branches of the union. 
No branch of -the union can be named 'max', as this would collide with the implicit -enum. The value for each branch can be of any type. - -A flat union definition avoids nesting on the wire, and specifies a -set of common members that occur in all variants of the union. The -'base' key must specify either a type name (the type must be a -struct, not a union), or a dictionary representing an anonymous type. -All branches of the union must be complex types, and the top-level -members of the union dictionary on the wire will be a combination of -members from both the base type and the appropriate branch type (when -merging two dictionaries, there must be no keys in common). The -'discriminator' member must be the name of a non-optional enum-typed -member of the base struct. - -The following example enhances the above simple union example by -adding an optional common member 'read-only', renaming the -discriminator to something more applicable than the simple union's -default of 'type', and reducing the number of {} required on the wire: - - { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] } - { 'union': 'BlockdevOptions', - 'base': { 'driver': 'BlockdevDriver', '*read-only': 'bool' }, - 'discriminator': 'driver', - 'data': { 'file': 'BlockdevOptionsFile', - 'qcow2': 'BlockdevOptionsQcow2' } } - -Resulting in these JSON objects: - - { "driver": "file", "read-only": true, - "filename": "/some/place/my-image" } - { "driver": "qcow2", "read-only": false, - "backing": "/some/place/my-image", "lazy-refcounts": true } - -Notice that in a flat union, the discriminator name is controlled by -the user, but because it must map to a base member with enum type, the -code generator can ensure that branches exist for all values of the -enum (although the order of the keys need not match the declaration of -the enum).
In the resulting generated C data types, a flat union is -represented as a struct with the base members included directly, and -then a union of structures for each branch of the struct. - -A simple union can always be re-written as a flat union where the base -class has a single member named 'type', and where each branch of the -union has a struct with a single member named 'data'. That is, - - { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } } - -is identical on the wire to: - - { 'enum': 'Enum', 'data': ['one', 'two'] } - { 'struct': 'Branch1', 'data': { 'data': 'str' } } - { 'struct': 'Branch2', 'data': { 'data': 'int' } } - { 'union': 'Flat', 'base': { 'type': 'Enum' }, 'discriminator': 'type', - 'data': { 'one': 'Branch1', 'two': 'Branch2' } } - - -=== Alternate types === - -Usage: { 'alternate': STRING, 'data': DICT } - -An alternate type is one that allows a choice between two or more JSON -data types (string, integer, number, or object, but currently not -array) on the wire. The definition is similar to a simple union type, -where each branch of the union names a QAPI type. For example: - - { 'alternate': 'BlockdevRef', - 'data': { 'definition': 'BlockdevOptions', - 'reference': 'str' } } - -Unlike a union, the discriminator string is never passed on the wire -for the Client JSON Protocol. Instead, the value's JSON type serves -as an implicit discriminator, which in turn means that an alternate -can only express a choice between types represented differently in -JSON. If a branch is typed as the 'bool' built-in, the alternate -accepts true and false; if it is typed as any of the various numeric -built-ins, it accepts a JSON number; if it is typed as a 'str' -built-in or named enum type, it accepts a JSON string; and if it is -typed as a complex type (struct or union), it accepts a JSON object. -Two different complex types, for instance, aren't permitted, because -both are represented as a JSON object.
- -The example alternate declaration above allows using both of the -following example objects: - - { "file": "my_existing_block_device_id" } - { "file": { "driver": "file", - "read-only": false, - "filename": "/tmp/mydisk.qcow2" } } - - -=== Commands === - -Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, - '*returns': TYPE-NAME, '*boxed': true, - '*gen': false, '*success-response': false } - -Commands are defined by using a dictionary containing several members, -where three members are most common. The 'command' member is a -mandatory string, and determines the "execute" value passed in a -Client JSON Protocol command exchange. - -The 'data' argument maps to the "arguments" dictionary passed in as -part of a Client JSON Protocol command. The 'data' member is optional -and defaults to {} (an empty dictionary). If present, it must be the -string name of a complex type, or a dictionary that declares an -anonymous type with the same semantics as a 'struct' expression. - -The 'returns' member describes what will appear in the "return" member -of a Client JSON Protocol reply on successful completion of a command. -The member is optional from the command declaration; if absent, the -"return" member will be an empty dictionary. If 'returns' is present, -it must be the string name of a complex or built-in type, or a -one-element array containing the name of a complex or built-in type. -To return anything else, you have to list the command in pragma -'returns-whitelist'. If you do this, the command cannot be extended -to return additional information in the future. Use of -'returns-whitelist' for new commands is strongly discouraged. - -All commands in Client JSON Protocol use a dictionary to report -failure, with no way to specify that in QAPI.
Where the error return -is different than the usual GenericError class in order to help the -client react differently to certain error conditions, it is worth -documenting this in the comments before the command declaration. - -Some example commands: - - { 'command': 'my-first-command', - 'data': { 'arg1': 'str', '*arg2': 'str' } } - { 'struct': 'MyType', 'data': { '*value': 'str' } } - { 'command': 'my-second-command', - 'returns': [ 'MyType' ] } - -which would validate this Client JSON Protocol transaction: - - => { "execute": "my-first-command", - "arguments": { "arg1": "hello" } } - <= { "return": { } } - => { "execute": "my-second-command" } - <= { "return": [ { "value": "one" }, { } ] } - -The generator emits a prototype for the user's function implementing -the command. Normally, 'data' is a dictionary for an anonymous type, -or names a struct type (possibly empty, but not a union), and its -members are passed as separate arguments to this function. If the -command definition includes a key 'boxed' with the boolean value true, -then 'data' is instead the name of any non-empty complex type -(struct, union, or alternate), and a pointer to that QAPI type is -passed as a single argument. - -The generator also emits a marshalling function that extracts -arguments for the user's function out of an input QDict, calls the -user's function, and if it succeeded, builds an output QObject from -its return value. - -In rare cases, QAPI cannot express a type-safe representation of a -corresponding Client JSON Protocol command. You then have to suppress -generation of a marshalling function by including a key 'gen' with -boolean value false, and instead write your own function. Please try -to avoid adding new commands that rely on this, and instead use -type-safe unions. 
For an example of this usage: - - { 'command': 'netdev_add', - 'data': {'type': 'str', 'id': 'str'}, - 'gen': false } - -Normally, the QAPI schema is used to describe synchronous exchanges, -where a response is expected. But in some cases, the action of a -command is expected to change state in a way that a successful -response is not possible (although the command will still return a -normal dictionary error on failure). When a successful reply is not -possible, the command expression should include the optional key -'success-response' with boolean value false. So far, only QGA makes -use of this member. - - -=== Events === - -Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, - '*boxed': true } - -Events are defined with the keyword 'event'. It is not allowed to -name an event 'MAX', since the generator also produces a C enumeration -of all event names with a generated _MAX value at the end. When -'data' is also specified, additional info will be included in the -event, with similar semantics to a 'struct' expression. Finally there -will be C API generated in qapi-event.h; when called by QEMU code, a -message with timestamp will be emitted on the wire. - -An example event is: - -{ 'event': 'EVENT_C', - 'data': { '*a': 'int', 'b': 'str' } } - -Resulting in this JSON object: - -{ "event": "EVENT_C", - "data": { "b": "test string" }, - "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } - -The generator emits a function to send the event. Normally, 'data' is -a dictionary for an anonymous type, or names a struct type (possibly -empty, but not a union), and its members are passed as separate -arguments to this function. If the event definition includes a key -'boxed' with the boolean value true, then 'data' is instead the name of -any non-empty complex type (struct, union, or alternate), and a -pointer to that QAPI type is passed as a single argument. 
- - -=== Downstream extensions === - -QAPI schema names that are externally visible, say in the Client JSON -Protocol, need to be managed with care. Names starting with a -downstream prefix of the form __RFQDN_ are reserved for the downstream -who controls the valid, reverse fully qualified domain name RFQDN. -RFQDN may only contain ASCII letters, digits, hyphen and period. - -Example: Red Hat, Inc. controls redhat.com, and may therefore add a -downstream command __com.redhat_drive-mirror. - - -== Client JSON Protocol introspection == - -Clients of a Client JSON Protocol commonly need to figure out what -exactly the server (QEMU) supports. - -For this purpose, QMP provides introspection via command -query-qmp-schema. QGA currently doesn't support introspection. - -While Client JSON Protocol wire compatibility should be maintained -between qemu versions, we cannot make the same guarantees for -introspection stability. For example, one version of qemu may provide -a non-variant optional member of a struct, and a later version rework -the member to instead be non-optional and associated with a variant. -Likewise, one version of qemu may list a member with open-ended type -'str', and a later version could convert it to a finite set of strings -via an enum type; or a member may be converted from a specific type to -an alternate that represents a choice between the original type and -something else. - -query-qmp-schema returns a JSON array of SchemaInfo objects. These -objects together describe the wire ABI, as defined in the QAPI schema. -There is no specified order to the SchemaInfo objects returned; a -client must search for a particular name throughout the entire array -to learn more about that name, but is at least guaranteed that there -will be no collisions between type, command, and event names. - -However, the SchemaInfo can't reflect all the rules and restrictions -that apply to QMP. 
It's interface introspection (figuring out what's -there), not interface specification. The specification is in the QAPI -schema. To understand how QMP is to be used, you need to study the -QAPI schema. - -Like any other command, query-qmp-schema is itself defined in the QAPI -schema, along with the SchemaInfo type. This text attempts to give an -overview how things work. For details you need to consult the QAPI -schema. - -SchemaInfo objects have common members "name" and "meta-type", and -additional variant members depending on the value of meta-type. - -Each SchemaInfo object describes a wire ABI entity of a certain -meta-type: a command, event or one of several kinds of type. - -SchemaInfo for commands and events have the same name as in the QAPI -schema. - -Command and event names are part of the wire ABI, but type names are -not. Therefore, the SchemaInfo for types have auto-generated -meaningless names. For readability, the examples in this section use -meaningful type names instead. - -To examine a type, start with a command or event using it, then follow -references by name. - -QAPI schema definitions not reachable that way are omitted. - -The SchemaInfo for a command has meta-type "command", and variant -members "arg-type" and "ret-type". On the wire, the "arguments" -member of a client's "execute" command must conform to the object type -named by "arg-type". The "return" member that the server passes in a -success response conforms to the type named by "ret-type". - -If the command takes no arguments, "arg-type" names an object type -without members. Likewise, if the command returns nothing, "ret-type" -names an object type without members. - -Example: the SchemaInfo for command query-qmp-schema - - { "name": "query-qmp-schema", "meta-type": "command", - "arg-type": "q_empty", "ret-type": "SchemaInfoList" } - - Type "q_empty" is an automatic object type without members, and type - "SchemaInfoList" is the array of SchemaInfo type. 
- -The SchemaInfo for an event has meta-type "event", and variant member -"arg-type". On the wire, a "data" member that the server passes in an -event conforms to the object type named by "arg-type". - -If the event carries no additional information, "arg-type" names an -object type without members. The event may not have a data member on -the wire then. - -Each command or event defined with dictionary-valued 'data' in the -QAPI schema implicitly defines an object type. - -Example: the SchemaInfo for EVENT_C from section Events - - { "name": "EVENT_C", "meta-type": "event", - "arg-type": "q_obj-EVENT_C-arg" } - - Type "q_obj-EVENT_C-arg" is an implicitly defined object type with - the two members from the event's definition. - -The SchemaInfo for struct and union types has meta-type "object". - -The SchemaInfo for a struct type has variant member "members". - -The SchemaInfo for a union type additionally has variant members "tag" -and "variants". - -"members" is a JSON array describing the object's common members, if -any. Each element is a JSON object with members "name" (the member's -name), "type" (the name of its type), and optionally "default". The -member is optional if "default" is present. Currently, "default" can -only have value null. Other values are reserved for future -extensions. The "members" array is in no particular order; clients -must search the entire object when learning whether a particular -member is supported. - -Example: the SchemaInfo for MyType from section Struct types - - { "name": "MyType", "meta-type": "object", - "members": [ - { "name": "member1", "type": "str" }, - { "name": "member2", "type": "int" }, - { "name": "member3", "type": "str", "default": null } ] } - -"tag" is the name of the common member serving as type tag. -"variants" is a JSON array describing the object's variant members. 
-Each element is a JSON object with members "case" (the value of type -tag this element applies to) and "type" (the name of an object type -that provides the variant members for this type tag value). The -"variants" array is in no particular order, and is not guaranteed to -list cases in the same order as the corresponding "tag" enum type. - -Example: the SchemaInfo for flat union BlockdevOptions from section -Union types - - { "name": "BlockdevOptions", "meta-type": "object", - "members": [ - { "name": "driver", "type": "BlockdevDriver" }, - { "name": "read-only", "type": "bool", "default": null } ], - "tag": "driver", - "variants": [ - { "case": "file", "type": "BlockdevOptionsFile" }, - { "case": "qcow2", "type": "BlockdevOptionsQcow2" } ] } - -Note that base types are "flattened": its members are included in the -"members" array. - -A simple union implicitly defines an enumeration type for its implicit -discriminator (called "type" on the wire, see section Union types). - -A simple union implicitly defines an object type for each of its -variants. - -Example: the SchemaInfo for simple union BlockdevOptionsSimple from section -Union types - - { "name": "BlockdevOptionsSimple", "meta-type": "object", - "members": [ - { "name": "type", "type": "BlockdevOptionsSimpleKind" } ], - "tag": "type", - "variants": [ - { "case": "file", "type": "q_obj-BlockdevOptionsFile-wrapper" }, - { "case": "qcow2", "type": "q_obj-BlockdevOptionsQcow2-wrapper" } ] } - - Enumeration type "BlockdevOptionsSimpleKind" and the object types - "q_obj-BlockdevOptionsFile-wrapper", "q_obj-BlockdevOptionsQcow2-wrapper" - are implicitly defined. - -The SchemaInfo for an alternate type has meta-type "alternate", and -variant member "members". "members" is a JSON array. Each element is -a JSON object with member "type", which names a type. Values of the -alternate type conform to exactly one of its member types. There is -no guarantee on the order in which "members" will be listed. 
- -Example: the SchemaInfo for BlockdevRef from section Alternate types - - { "name": "BlockdevRef", "meta-type": "alternate", - "members": [ - { "type": "BlockdevOptions" }, - { "type": "str" } ] } - -The SchemaInfo for an array type has meta-type "array", and variant -member "element-type", which names the array's element type. Array -types are implicitly defined. For convenience, the array's name may -resemble the element type; however, clients should examine member -"element-type" instead of making assumptions based on parsing member -"name". - -Example: the SchemaInfo for ['str'] - - { "name": "[str]", "meta-type": "array", - "element-type": "str" } - -The SchemaInfo for an enumeration type has meta-type "enum" and -variant member "values". The values are listed in no particular -order; clients must search the entire enum when learning whether a -particular value is supported. - -Example: the SchemaInfo for MyEnum from section Enumeration types - - { "name": "MyEnum", "meta-type": "enum", - "values": [ "value1", "value2", "value3" ] } - -The SchemaInfo for a built-in type has the same name as the type in -the QAPI schema (see section Built-in Types), with one exception -detailed below. It has variant member "json-type" that shows how -values of this type are encoded on the wire. - -Example: the SchemaInfo for str - - { "name": "str", "meta-type": "builtin", "json-type": "string" } - -The QAPI schema supports a number of integer types that only differ in -how they map to C. They are identical as far as SchemaInfo is -concerned. Therefore, they get all mapped to a single type "int" in -SchemaInfo. - -As explained above, type names are not part of the wire ABI. Not even -the names of built-in types. Clients should examine member -"json-type" instead of hard-coding names of built-in types. 
- - -== Code generation == - -Schemas are fed into five scripts to generate all the code/files that, -paired with the core QAPI libraries, comprise everything required to -take JSON commands read in by a Client JSON Protocol server, unmarshal -the arguments into the underlying C types, call into the corresponding -C function, map the response back to a Client JSON Protocol response -to be returned to the user, and introspect the commands. - -As an example, we'll use the following schema, which describes a -single complex user-defined type, along with command which takes a -list of that type as a parameter, and returns a single element of that -type. The user is responsible for writing the implementation of -qmp_my_command(); everything else is produced by the generator. - - $ cat example-schema.json - { 'struct': 'UserDefOne', - 'data': { 'integer': 'int', '*string': 'str' } } - - { 'command': 'my-command', - 'data': { 'arg1': ['UserDefOne'] }, - 'returns': 'UserDefOne' } - - { 'event': 'MY_EVENT' } - -For a more thorough look at generated code, the testsuite includes -tests/qapi-schema/qapi-schema-tests.json that covers more examples of -what the generator will accept, and compiles the resulting C code as -part of 'make check-unit'. - -=== scripts/qapi-types.py === - -Used to generate the C types defined by a schema, along with -supporting code. The following files are created: - -$(prefix)qapi-types.h - C types corresponding to types defined in - the schema you pass in -$(prefix)qapi-types.c - Cleanup functions for the above C types - -The $(prefix) is an optional parameter used as a namespace to keep the -generated code from one schema/code-generation separated from others so code -can be generated/used from multiple schemas without clobbering previously -created code. 
- -Example: - - $ python scripts/qapi-types.py --output-dir="qapi-generated" \ - --prefix="example-" example-schema.json - $ cat qapi-generated/example-qapi-types.h -[Uninteresting stuff omitted...] - - #ifndef EXAMPLE_QAPI_TYPES_H - #define EXAMPLE_QAPI_TYPES_H - -[Built-in types omitted...] - - typedef struct UserDefOne UserDefOne; - - typedef struct UserDefOneList UserDefOneList; - - struct UserDefOne { - int64_t integer; - bool has_string; - char *string; - }; - - void qapi_free_UserDefOne(UserDefOne *obj); - - struct UserDefOneList { - UserDefOneList *next; - UserDefOne *value; - }; - - void qapi_free_UserDefOneList(UserDefOneList *obj); - - #endif - $ cat qapi-generated/example-qapi-types.c -[Uninteresting stuff omitted...] - - void qapi_free_UserDefOne(UserDefOne *obj) - { - Visitor *v; - - if (!obj) { - return; - } - - v = qapi_dealloc_visitor_new(); - visit_type_UserDefOne(v, NULL, &obj, NULL); - visit_free(v); - } - - void qapi_free_UserDefOneList(UserDefOneList *obj) - { - Visitor *v; - - if (!obj) { - return; - } - - v = qapi_dealloc_visitor_new(); - visit_type_UserDefOneList(v, NULL, &obj, NULL); - visit_free(v); - } - -=== scripts/qapi-visit.py === - -Used to generate the visitor functions used to walk through and -convert between a native QAPI C data structure and some other format -(such as QObject); the generated functions are named visit_type_FOO() -and visit_type_FOO_members(). - -The following files are generated: - -$(prefix)qapi-visit.c: visitor function for a particular C type, used - to automagically convert QObjects into the - corresponding C type and vice-versa, as well - as for deallocating memory for an existing C - type - -$(prefix)qapi-visit.h: declarations for previously mentioned visitor - functions - -Example: - - $ python scripts/qapi-visit.py --output-dir="qapi-generated" - --prefix="example-" example-schema.json - $ cat qapi-generated/example-qapi-visit.h -[Uninteresting stuff omitted...] 
- - #ifndef EXAMPLE_QAPI_VISIT_H - #define EXAMPLE_QAPI_VISIT_H - -[Visitors for built-in types omitted...] - - void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp); - void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp); - void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp); - - #endif - $ cat qapi-generated/example-qapi-visit.c -[Uninteresting stuff omitted...] - - void visit_type_UserDefOne_members(Visitor *v, UserDefOne *obj, Error **errp) - { - Error *err = NULL; - - visit_type_int(v, "integer", &obj->integer, &err); - if (err) { - goto out; - } - if (visit_optional(v, "string", &obj->has_string)) { - visit_type_str(v, "string", &obj->string, &err); - if (err) { - goto out; - } - } - - out: - error_propagate(errp, err); - } - - void visit_type_UserDefOne(Visitor *v, const char *name, UserDefOne **obj, Error **errp) - { - Error *err = NULL; - - visit_start_struct(v, name, (void **)obj, sizeof(UserDefOne), &err); - if (err) { - goto out; - } - if (!*obj) { - goto out_obj; - } - visit_type_UserDefOne_members(v, *obj, &err); - if (err) { - goto out_obj; - } - visit_check_struct(v, &err); - out_obj: - visit_end_struct(v, (void **)obj); - if (err && visit_is_input(v)) { - qapi_free_UserDefOne(*obj); - *obj = NULL; - } - out: - error_propagate(errp, err); - } - - void visit_type_UserDefOneList(Visitor *v, const char *name, UserDefOneList **obj, Error **errp) - { - Error *err = NULL; - UserDefOneList *tail; - size_t size = sizeof(**obj); - - visit_start_list(v, name, (GenericList **)obj, size, &err); - if (err) { - goto out; - } - - for (tail = *obj; tail; - tail = (UserDefOneList *)visit_next_list(v, (GenericList *)tail, size)) { - visit_type_UserDefOne(v, NULL, &tail->value, &err); - if (err) { - break; - } - } - - visit_end_list(v, (void **)obj); - if (err && visit_is_input(v)) { - qapi_free_UserDefOneList(*obj); - *obj = NULL; - } - out: - 
error_propagate(errp, err); - } - -=== scripts/qapi-commands.py === - -Used to generate the marshaling/dispatch functions for the commands -defined in the schema. The generated code implements -qmp_marshal_COMMAND() (registered automatically), and declares -qmp_COMMAND() that the user must implement. The following files are -generated: - -$(prefix)qmp-marshal.c: command marshal/dispatch functions for each - QMP command defined in the schema. Functions - generated by qapi-visit.py are used to - convert QObjects received from the wire into - function parameters, and uses the same - visitor functions to convert native C return - values to QObjects from transmission back - over the wire. - -$(prefix)qmp-commands.h: Function prototypes for the QMP commands - specified in the schema. - -Example: - - $ python scripts/qapi-commands.py --output-dir="qapi-generated" - --prefix="example-" example-schema.json - $ cat qapi-generated/example-qmp-commands.h -[Uninteresting stuff omitted...] - - #ifndef EXAMPLE_QMP_COMMANDS_H - #define EXAMPLE_QMP_COMMANDS_H - - #include "example-qapi-types.h" - #include "qapi/qmp/qdict.h" - #include "qapi/error.h" - - UserDefOne *qmp_my_command(UserDefOneList *arg1, Error **errp); - - #endif - $ cat qapi-generated/example-qmp-marshal.c -[Uninteresting stuff omitted...] 
- - static void qmp_marshal_output_UserDefOne(UserDefOne *ret_in, QObject **ret_out, Error **errp) - { - Error *err = NULL; - Visitor *v; - - v = qobject_output_visitor_new(ret_out); - visit_type_UserDefOne(v, "unused", &ret_in, &err); - if (!err) { - visit_complete(v, ret_out); - } - error_propagate(errp, err); - visit_free(v); - v = qapi_dealloc_visitor_new(); - visit_type_UserDefOne(v, "unused", &ret_in, NULL); - visit_free(v); - } - - static void qmp_marshal_my_command(QDict *args, QObject **ret, Error **errp) - { - Error *err = NULL; - UserDefOne *retval; - Visitor *v; - UserDefOneList *arg1 = NULL; - - v = qobject_input_visitor_new(QOBJECT(args)); - visit_start_struct(v, NULL, NULL, 0, &err); - if (err) { - goto out; - } - visit_type_UserDefOneList(v, "arg1", &arg1, &err); - if (!err) { - visit_check_struct(v, &err); - } - visit_end_struct(v, NULL); - if (err) { - goto out; - } - - retval = qmp_my_command(arg1, &err); - if (err) { - goto out; - } - - qmp_marshal_output_UserDefOne(retval, ret, &err); - - out: - error_propagate(errp, err); - visit_free(v); - v = qapi_dealloc_visitor_new(); - visit_start_struct(v, NULL, NULL, 0, NULL); - visit_type_UserDefOneList(v, "arg1", &arg1, NULL); - visit_end_struct(v, NULL); - visit_free(v); - } - - static void qmp_init_marshal(void) - { - qmp_register_command("my-command", qmp_marshal_my_command, QCO_NO_OPTIONS); - } - - qapi_init(qmp_init_marshal); - -=== scripts/qapi-event.py === - -Used to generate the event-related C code defined by a schema, with -implementations for qapi_event_send_FOO(). The following files are -created: - -$(prefix)qapi-event.h - Function prototypes for each event type, plus an - enumeration of all event names -$(prefix)qapi-event.c - Implementation of functions to send an event - -Example: - - $ python scripts/qapi-event.py --output-dir="qapi-generated" - --prefix="example-" example-schema.json - $ cat qapi-generated/example-qapi-event.h -[Uninteresting stuff omitted...] 
- - #ifndef EXAMPLE_QAPI_EVENT_H - #define EXAMPLE_QAPI_EVENT_H - - #include "qapi/error.h" - #include "qapi/qmp/qdict.h" - #include "example-qapi-types.h" - - - void qapi_event_send_my_event(Error **errp); - - typedef enum example_QAPIEvent { - EXAMPLE_QAPI_EVENT_MY_EVENT = 0, - EXAMPLE_QAPI_EVENT__MAX = 1, - } example_QAPIEvent; - - extern const char *const example_QAPIEvent_lookup[]; - - #endif - $ cat qapi-generated/example-qapi-event.c -[Uninteresting stuff omitted...] - - void qapi_event_send_my_event(Error **errp) - { - QDict *qmp; - Error *err = NULL; - QMPEventFuncEmit emit; - emit = qmp_event_get_func_emit(); - if (!emit) { - return; - } - - qmp = qmp_event_build_dict("MY_EVENT"); - - emit(EXAMPLE_QAPI_EVENT_MY_EVENT, qmp, &err); - - error_propagate(errp, err); - QDECREF(qmp); - } - - const char *const example_QAPIEvent_lookup[] = { - [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT", - [EXAMPLE_QAPI_EVENT__MAX] = NULL, - }; - -=== scripts/qapi-introspect.py === - -Used to generate the introspection C code for a schema. The following -files are created: - -$(prefix)qmp-introspect.c - Defines a string holding a JSON - description of the schema. -$(prefix)qmp-introspect.h - Declares the above string. - -Example: - - $ python scripts/qapi-introspect.py --output-dir="qapi-generated" - --prefix="example-" example-schema.json - $ cat qapi-generated/example-qmp-introspect.h -[Uninteresting stuff omitted...] - - #ifndef EXAMPLE_QMP_INTROSPECT_H - #define EXAMPLE_QMP_INTROSPECT_H - - extern const char example_qmp_schema_json[]; - - #endif - $ cat qapi-generated/example-qmp-introspect.c -[Uninteresting stuff omitted...] 
- - const char example_qmp_schema_json[] = "[" - "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, " - "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, " - "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, " - "{\"members\": [{\"name\": \"arg1\", \"type\": \"[2]\"}], \"meta-type\": \"object\", \"name\": \"1\"}, " - "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"default\": null, \"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, " - "{\"element-type\": \"2\", \"meta-type\": \"array\", \"name\": \"[2]\"}, " - "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, " - "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]"; diff --git a/docs/rcu.txt b/docs/rcu.txt deleted file mode 100644 index c84e7f42b2..0000000000 --- a/docs/rcu.txt +++ /dev/null @@ -1,390 +0,0 @@ -Using RCU (Read-Copy-Update) for synchronization -================================================ - -Read-copy update (RCU) is a synchronization mechanism that is used to -protect read-mostly data structures. RCU is very efficient and scalable -on the read side (it is wait-free), and thus can make the read paths -extremely fast. - -RCU supports concurrency between a single writer and multiple readers, -thus it is not used alone. Typically, the write-side will use a lock to -serialize multiple updates, but other approaches are possible (e.g., -restricting updates to a single task). In QEMU, when a lock is used, -this will often be the "iothread mutex", also known as the "big QEMU -lock" (BQL). Also, restricting updates to a single task is done in -QEMU using the "bottom half" API. - -RCU is fundamentally a "wait-to-finish" mechanism. 
The read side marks -sections of code with "critical sections", and the update side will wait -for the execution of all *currently running* critical sections before -proceeding, or before asynchronously executing a callback. - -The key point here is that only the currently running critical sections -are waited for; critical sections that are started _after_ the beginning -of the wait do not extend the wait, despite running concurrently with -the updater. This is the reason why RCU is more scalable than, -for example, reader-writer locks. It is so much more scalable that -the system will have a single instance of the RCU mechanism; a single -mechanism can be used for an arbitrary number of "things", without -having to worry about things such as contention or deadlocks. - -How is this possible? The basic idea is to split updates in two phases, -"removal" and "reclamation". During removal, we ensure that subsequent -readers will not be able to get a reference to the old data. After -removal has completed, a critical section will not be able to access -the old data. Therefore, critical sections that begin after removal -do not matter; as soon as all previous critical sections have finished, -there cannot be any readers who hold references to the data structure, -and these can now be safely reclaimed (e.g., freed or unref'ed). - -Here is a picture: - - thread 1 thread 2 thread 3 - ------------------- ------------------------ ------------------- - enter RCU crit.sec. - | finish removal phase - | begin wait - | | enter RCU crit.sec. - exit RCU crit.sec | | - complete wait | - begin reclamation phase | - exit RCU crit.sec. - - -Note how thread 3 is still executing its critical section when thread 2 -starts reclaiming data. This is possible, because the old version of the -data structure was not accessible at the time thread 3 began executing -that critical section. 
- - -RCU API -======= - -The core RCU API is small: - - void rcu_read_lock(void); - - Used by a reader to inform the reclaimer that the reader is - entering an RCU read-side critical section. - - void rcu_read_unlock(void); - - Used by a reader to inform the reclaimer that the reader is - exiting an RCU read-side critical section. Note that RCU - read-side critical sections may be nested and/or overlapping. - - void synchronize_rcu(void); - - Blocks until all pre-existing RCU read-side critical sections - on all threads have completed. This marks the end of the removal - phase and the beginning of reclamation phase. - - Note that it would be valid for another update to come while - synchronize_rcu is running. Because of this, it is better that - the updater releases any locks it may hold before calling - synchronize_rcu. If this is not possible (for example, because - the updater is protected by the BQL), you can use call_rcu. - - void call_rcu1(struct rcu_head * head, - void (*func)(struct rcu_head *head)); - - This function invokes func(head) after all pre-existing RCU - read-side critical sections on all threads have completed. This - marks the end of the removal phase, with func taking care - asynchronously of the reclamation phase. - - The foo struct needs to have an rcu_head structure added, - perhaps as follows: - - struct foo { - struct rcu_head rcu; - int a; - char b; - long c; - }; - - so that the reclaimer function can fetch the struct foo address - and free it: - - call_rcu1(&foo.rcu, foo_reclaim); - - void foo_reclaim(struct rcu_head *rp) - { - struct foo *fp = container_of(rp, struct foo, rcu); - g_free(fp); - } - - For the common case where the rcu_head member is the first of the - struct, you can use the following macro. 
- - void call_rcu(T *p, - void (*func)(T *p), - field-name); - void g_free_rcu(T *p, - field-name); - - call_rcu1 is typically used through these macros, in the common case - where the "struct rcu_head" is the first field in the struct. If - the callback function is g_free, in particular, g_free_rcu can be - used. In the above case, one could have written simply: - - g_free_rcu(&foo, rcu); - - typeof(*p) atomic_rcu_read(p); - - atomic_rcu_read() is similar to atomic_mb_read(), but it makes - some assumptions on the code that calls it. This allows a more - optimized implementation. - - atomic_rcu_read assumes that whenever a single RCU critical - section reads multiple shared data, these reads are either - data-dependent or need no ordering. This is almost always the - case when using RCU, because read-side critical sections typically - navigate one or more pointers (the pointers that are changed on - every update) until reaching a data structure of interest, - and then read from there. - - RCU read-side critical sections must use atomic_rcu_read() to - read data, unless concurrent writes are prevented by another - synchronization mechanism. - - Furthermore, RCU read-side critical sections should traverse the - data structure in a single direction, opposite to the direction - in which the updater initializes it. - - void atomic_rcu_set(p, typeof(*p) v); - - atomic_rcu_set() is also similar to atomic_mb_set(), and it also - makes assumptions on the code that calls it in order to allow a more - optimized implementation. - - In particular, atomic_rcu_set() suffices for synchronization - with readers, if the updater never mutates a field within a - data item that is already accessible to readers. This is the - case when initializing a new copy of the RCU-protected data - structure; just ensure that initialization of *p is carried out - before atomic_rcu_set() makes the data item visible to readers.
- If this rule is observed, writes will happen in the opposite - order as reads in the RCU read-side critical sections (or if - there is just one update), and there will be no need for other - synchronization mechanisms to coordinate the accesses. - -The following APIs must be used before RCU is used in a thread: - - void rcu_register_thread(void); - - Mark a thread as taking part in the RCU mechanism. Such a thread - will have to report quiescent points regularly, either manually - or through the QemuCond/QemuSemaphore/QemuEvent APIs. - - void rcu_unregister_thread(void); - - Mark a thread as not taking part anymore in the RCU mechanism. - It is not a problem if such a thread reports quiescent points, - either manually or by using the QemuCond/QemuSemaphore/QemuEvent - APIs. - -Note that these APIs are relatively heavyweight, and should _not_ be -nested. - - -DIFFERENCES WITH LINUX -====================== - -- Waiting on a mutex is possible, though discouraged, within an RCU critical - section. This is because spinlocks are rarely (if ever) used in userspace - programming; not allowing this would prevent upgrading an RCU read-side - critical section to become an updater. - -- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and - rcu_assign_pointer. They take a _pointer_ to the variable being accessed. - -- call_rcu is a macro that has an extra argument (the name of the first - field in the struct, which must be a struct rcu_head), and expects the - type of the callback's argument to be the type of the first argument. - call_rcu1 is the same as Linux's call_rcu. - - -RCU PATTERNS -============ - -Many patterns using reader-writer locks translate directly to RCU, with -the advantages of higher scalability and deadlock immunity. - -In general, RCU can be used whenever it is possible to create a new -"version" of a data structure every time the updater runs.
This may -sound like a very strict restriction, however: - -- the updater does not mean "everything that writes to a data structure", - but rather "everything that involves a reclamation step". See the - array example below - -- in some cases, creating a new version of a data structure may actually - be very cheap. For example, modifying the "next" pointer of a singly - linked list is effectively creating a new version of the list. - -Here are some frequently-used RCU idioms that are worth noting. - - -RCU list processing -------------------- - -TBD (not yet used in QEMU) - - -RCU reference counting ----------------------- - -Because grace periods are not allowed to complete while there is an RCU -read-side critical section in progress, the RCU read-side primitives -may be used as a restricted reference-counting mechanism. For example, -consider the following code fragment: - - rcu_read_lock(); - p = atomic_rcu_read(&foo); - /* do something with p. */ - rcu_read_unlock(); - -The RCU read-side critical section ensures that the value of "p" remains -valid until after the rcu_read_unlock(). In some sense, it is acquiring -a reference to p that is later released when the critical section ends. -The write side looks simply like this (with appropriate locking): - - qemu_mutex_lock(&foo_mutex); - old = foo; - atomic_rcu_set(&foo, new); - qemu_mutex_unlock(&foo_mutex); - synchronize_rcu(); - free(old); - -If the processing cannot be done purely within the critical section, it -is possible to combine this idiom with a "real" reference count: - - rcu_read_lock(); - p = atomic_rcu_read(&foo); - foo_ref(p); - rcu_read_unlock(); - /* do something with p. 
*/ - foo_unref(p); - -The write side can be like this: - - qemu_mutex_lock(&foo_mutex); - old = foo; - atomic_rcu_set(&foo, new); - qemu_mutex_unlock(&foo_mutex); - synchronize_rcu(); - foo_unref(old); - -or with call_rcu: - - qemu_mutex_lock(&foo_mutex); - old = foo; - atomic_rcu_set(&foo, new); - qemu_mutex_unlock(&foo_mutex); - call_rcu(old, foo_unref, rcu); - -In both cases, the write side only performs removal. Reclamation -happens when the last reference to a "foo" object is dropped. -Using synchronize_rcu() is undesirably expensive, because the -last reference may be dropped on the read side. Hence you can -use call_rcu() instead: - - foo_unref(struct foo *p) { - if (atomic_fetch_dec(&p->refcount) == 1) { - call_rcu(p, foo_destroy, rcu); - } - } - - -Note that the same idioms would be possible with reader/writer -locks: - - read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); - p = foo; p = foo; - /* do something with p. */ foo = new; - read_unlock(&foo_rwlock); free(p); - write_mutex_unlock(&foo_rwlock); - free(p); - - ------------------------------------------------------------------ - - read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); - p = foo; old = foo; - foo_ref(p); foo = new; - read_unlock(&foo_rwlock); foo_unref(old); - /* do something with p. */ write_mutex_unlock(&foo_rwlock); - read_lock(&foo_rwlock); - foo_unref(p); - read_unlock(&foo_rwlock); - -foo_unref could use a mechanism such as bottom halves to move deallocation -out of the write-side critical section. - - -RCU resizable arrays --------------------- - -Resizable arrays can be used with RCU. The expensive RCU synchronization -(or call_rcu) only needs to take place when the array is resized. -The two items to take care of are: - -- ensuring that the old version of the array is available between removal - and reclamation; - -- avoiding mismatches in the read side between the array data and the - array size. - -The first problem is avoided simply by not using realloc.
Instead, -each resize will allocate a new array and copy the old data into it. -The second problem would arise if the size and the data pointers were -two members of a larger struct: - - struct mystuff { - ... - int data_size; - int data_alloc; - T *data; - ... - }; - -Instead, we store the size of the array with the array itself: - - struct arr { - int size; - int alloc; - T data[]; - }; - struct arr *global_array; - - read side: - rcu_read_lock(); - struct arr *array = atomic_rcu_read(&global_array); - x = i < array->size ? array->data[i] : -1; - rcu_read_unlock(); - return x; - - write side (running under a lock): - if (global_array->size == global_array->alloc) { - /* Creating a new version. */ - new_array = g_malloc(sizeof(struct arr) + - global_array->alloc * 2 * sizeof(T)); - new_array->size = global_array->size; - new_array->alloc = global_array->alloc * 2; - memcpy(new_array->data, global_array->data, - global_array->alloc * sizeof(T)); - - /* Removal phase. */ - old_array = global_array; - atomic_rcu_set(&global_array, new_array); - synchronize_rcu(); - - /* Reclamation phase. */ - free(old_array); - } - - -SOURCES -======= - -* Documentation/RCU/ from the Linux kernel diff --git a/docs/spin/aio_notify.promela b/docs/spin/aio_notify.promela new file mode 100644 index 0000000000..fccc7ee1c3 --- /dev/null +++ b/docs/spin/aio_notify.promela @@ -0,0 +1,93 @@ +/* + * This model describes the interaction between ctx->notify_me + * and aio_notify(). + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do.
+ * + * To simulate it: + * spin -p docs/spin/aio_notify.promela + * + * To verify it: + * spin -a docs/spin/aio_notify.promela + * gcc -O2 pan.c + * ./a.out -a + * + * To verify it (with a bug planted in the model): + * spin -a -DBUG docs/spin/aio_notify.promela + * gcc -O2 pan.c + * ./a.out -a + */ + +#define MAX 4 +#define LAST (1 << (MAX - 1)) +#define FINAL ((LAST << 1) - 1) + +bool notify_me; +bool event; + +int req; +int done; + +active proctype waiter() +{ + int fetch; + + do + :: true -> { + notify_me++; + + if +#ifndef BUG + :: (req > 0) -> skip; +#endif + :: else -> + // Wait for a nudge from the other side + do + :: event == 1 -> { event = 0; break; } + od; + fi; + + notify_me--; + + atomic { fetch = req; req = 0; } + done = done | fetch; + } + od +} + +active proctype notifier() +{ + int next = 1; + + do + :: next <= LAST -> { + // generate a request + req = req | next; + next = next << 1; + + // aio_notify + if + :: notify_me == 1 -> event = 1; + :: else -> printf("Skipped event_notifier_set\n"); skip; + fi; + + // Test both synchronous and asynchronous delivery + if + :: 1 -> do + :: req == 0 -> break; + od; + :: 1 -> skip; + fi; + } + od; +} + +never { /* [] done < FINAL */ +accept_init: + do + :: done < FINAL -> skip; + od; +} diff --git a/docs/spin/aio_notify_accept.promela b/docs/spin/aio_notify_accept.promela new file mode 100644 index 0000000000..9cef2c955d --- /dev/null +++ b/docs/spin/aio_notify_accept.promela @@ -0,0 +1,152 @@ +/* + * This model describes the interaction between ctx->notified + * and ctx->notifier. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. 
+ * + * To verify the buggy version: + * spin -a -DBUG1 docs/spin/aio_notify_accept.promela + * gcc -O2 pan.c + * ./a.out -a -f + * (or -DBUG2) + * + * To verify the fixed version: + * spin -a docs/spin/aio_notify_accept.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * Add -DCHECK_REQ to test an alternative invariant and the + * "notify_me" optimization. + */ + +int notify_me; +bool notified; +bool event; +bool req; +bool notifier_done; + +#ifdef CHECK_REQ +#define USE_NOTIFY_ME 1 +#else +#define USE_NOTIFY_ME 0 +#endif + +#ifdef BUG +#error Please define BUG1 or BUG2 instead. +#endif + +active proctype notifier() +{ + do + :: true -> { + req = 1; + if + :: !USE_NOTIFY_ME || notify_me -> +#if defined BUG1 + /* CHECK_REQ does not detect this bug! */ + notified = 1; + event = 1; +#elif defined BUG2 + if + :: !notified -> event = 1; + :: else -> skip; + fi; + notified = 1; +#else + event = 1; + notified = 1; +#endif + :: else -> skip; + fi + } + :: true -> break; + od; + notifier_done = 1; +} + +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + atomic { old = notified; notified = 0; } \ + if \ + :: old -> event = 0; \ + :: else -> skip; \ + fi; \ + \ + req = 0; + +active proctype waiter() +{ + bool old; + + do + :: true -> AIO_POLL; + od; +} + +/* Same as waiter(), but disappears after a while. */ +active proctype temporary_waiter() +{ + bool old; + + do + :: true -> AIO_POLL; + :: true -> break; + od; +} + +#ifdef CHECK_REQ +never { + do + :: req -> goto accept_if_req_not_eventually_false; + :: true -> skip; + od; + +accept_if_req_not_eventually_false: + if + :: req -> goto accept_if_req_not_eventually_false; + fi; + assert(0); +} + +#else +/* There must be infinitely many transitions of event as long + * as the notifier does not exit. + * + * If event stayed always true, the waiters would be busy looping. 
+ * If event stayed always false, the waiters would be sleeping + * forever. + */ +never { + do + :: !event -> goto accept_if_event_not_eventually_true; + :: event -> goto accept_if_event_not_eventually_false; + :: true -> skip; + od; + +accept_if_event_not_eventually_true: + if + :: !event && notifier_done -> do :: true -> skip; od; + :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; + fi; + assert(0); + +accept_if_event_not_eventually_false: + if + :: event -> goto accept_if_event_not_eventually_false; + fi; + assert(0); +} +#endif diff --git a/docs/spin/aio_notify_bug.promela b/docs/spin/aio_notify_bug.promela new file mode 100644 index 0000000000..b3bfca1ca4 --- /dev/null +++ b/docs/spin/aio_notify_bug.promela @@ -0,0 +1,140 @@ +/* + * This model describes a bug in aio_notify. If ctx->notifier is + * cleared too late, a wakeup could be lost. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify the buggy version: + * spin -a -DBUG docs/spin/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * To verify the fixed version: + * spin -a docs/spin/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * Add -DCHECK_REQ to test an alternative invariant and the + * "notify_me" optimization. 
+ */ + +int notify_me; +bool event; +bool req; +bool notifier_done; + +#ifdef CHECK_REQ +#define USE_NOTIFY_ME 1 +#else +#define USE_NOTIFY_ME 0 +#endif + +active proctype notifier() +{ + do + :: true -> { + req = 1; + if + :: !USE_NOTIFY_ME || notify_me -> event = 1; + :: else -> skip; + fi + } + :: true -> break; + od; + notifier_done = 1; +} + +#ifdef BUG +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + req = 0; \ + event = 0; +#else +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + event = 0; \ + req = 0; +#endif + +active proctype waiter() +{ + do + :: true -> AIO_POLL; + od; +} + +/* Same as waiter(), but disappears after a while. */ +active proctype temporary_waiter() +{ + do + :: true -> AIO_POLL; + :: true -> break; + od; +} + +#ifdef CHECK_REQ +never { + do + :: req -> goto accept_if_req_not_eventually_false; + :: true -> skip; + od; + +accept_if_req_not_eventually_false: + if + :: req -> goto accept_if_req_not_eventually_false; + fi; + assert(0); +} + +#else +/* There must be infinitely many transitions of event as long + * as the notifier does not exit. + * + * If event stayed always true, the waiters would be busy looping. + * If event stayed always false, the waiters would be sleeping + * forever. 
+ */ +never { + do + :: !event -> goto accept_if_event_not_eventually_true; + :: event -> goto accept_if_event_not_eventually_false; + :: true -> skip; + od; + +accept_if_event_not_eventually_true: + if + :: !event && notifier_done -> do :: true -> skip; od; + :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; + fi; + assert(0); + +accept_if_event_not_eventually_false: + if + :: event -> goto accept_if_event_not_eventually_false; + fi; + assert(0); +} +#endif diff --git a/docs/spin/tcg-exclusive.promela b/docs/spin/tcg-exclusive.promela new file mode 100644 index 0000000000..c91cfca9f7 --- /dev/null +++ b/docs/spin/tcg-exclusive.promela @@ -0,0 +1,225 @@ +/* + * This model describes the implementation of exclusive sections in + * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start, + * cpu_exec_end). + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify it: + * spin -a docs/spin/tcg-exclusive.promela + * gcc pan.c -O2 + * ./a.out -a + * + * Tunable preprocessor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX, + * TEST_EXPENSIVE. 
+ */ + +// Define the missing parameters for the model +#ifndef N_CPUS +#define N_CPUS 2 +#warning defaulting to 2 CPU processes +#endif + +// the expensive test is not so expensive for <= 2 CPUs +// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs +// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM +#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX) +#define TEST_EXPENSIVE +#endif + +#ifndef N_EXCLUSIVE +# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE +# define N_EXCLUSIVE 2 +# warning defaulting to 2 concurrent exclusive sections +# else +# define N_EXCLUSIVE 1 +# warning defaulting to 1 concurrent exclusive sections +# endif +#endif +#ifndef N_CYCLES +# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE +# define N_CYCLES 2 +# warning defaulting to 2 CPU cycles +# else +# define N_CYCLES 1 +# warning defaulting to 1 CPU cycles +# endif +#endif + + +// synchronization primitives. condition variables require a +// process-local "cond_t saved;" variable. 
+ +#define mutex_t byte +#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 } +#define MUTEX_UNLOCK(m) m = 0 + +#define cond_t int +#define COND_WAIT(c, m) { \ + saved = c; \ + MUTEX_UNLOCK(m); \ + c != saved -> MUTEX_LOCK(m); \ + } +#define COND_BROADCAST(c) c++ + +// this is the logic from cpus-common.c + +mutex_t mutex; +cond_t exclusive_cond; +cond_t exclusive_resume; +byte pending_cpus; + +byte running[N_CPUS]; +byte has_waiter[N_CPUS]; + +#define exclusive_idle() \ + do \ + :: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \ + :: else -> break; \ + od + +#define start_exclusive() \ + MUTEX_LOCK(mutex); \ + exclusive_idle(); \ + pending_cpus = 1; \ + \ + i = 0; \ + do \ + :: i < N_CPUS -> { \ + if \ + :: running[i] -> has_waiter[i] = 1; pending_cpus++; \ + :: else -> skip; \ + fi; \ + i++; \ + } \ + :: else -> break; \ + od; \ + \ + do \ + :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \ + :: else -> break; \ + od; \ + MUTEX_UNLOCK(mutex); + +#define end_exclusive() \ + MUTEX_LOCK(mutex); \ + pending_cpus = 0; \ + COND_BROADCAST(exclusive_resume); \ + MUTEX_UNLOCK(mutex); + +#ifdef USE_MUTEX +// Simple version using mutexes +#define cpu_exec_start(id) \ + MUTEX_LOCK(mutex); \ + exclusive_idle(); \ + running[id] = 1; \ + MUTEX_UNLOCK(mutex); + +#define cpu_exec_end(id) \ + MUTEX_LOCK(mutex); \ + running[id] = 0; \ + if \ + :: pending_cpus -> { \ + pending_cpus--; \ + if \ + :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ + :: else -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); +#else +// Wait-free fast path, only needs mutex when concurrent with +// an exclusive section +#define cpu_exec_start(id) \ + running[id] = 1; \ + if \ + :: pending_cpus -> { \ + MUTEX_LOCK(mutex); \ + if \ + :: !has_waiter[id] -> { \ + running[id] = 0; \ + exclusive_idle(); \ + running[id] = 1; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); \ + } \ + :: else -> skip; \ + fi; + +#define cpu_exec_end(id) \ + running[id] = 0; 
\ + if \ + :: pending_cpus -> { \ + MUTEX_LOCK(mutex); \ + if \ + :: has_waiter[id] -> { \ + has_waiter[id] = 0; \ + pending_cpus--; \ + if \ + :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ + :: else -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); \ + } \ + :: else -> skip; \ + fi +#endif + +// Promela processes + +byte done_cpu; +byte in_cpu; +active[N_CPUS] proctype cpu() +{ + byte id = _pid % N_CPUS; + byte cycles = 0; + cond_t saved; + + do + :: cycles == N_CYCLES -> break; + :: else -> { + cycles++; + cpu_exec_start(id) + in_cpu++; + done_cpu++; + in_cpu--; + cpu_exec_end(id) + } + od; +} + +byte done_exclusive; +byte in_exclusive; +active[N_EXCLUSIVE] proctype exclusive() +{ + cond_t saved; + byte i; + + start_exclusive(); + in_exclusive = 1; + done_exclusive++; + in_exclusive = 0; + end_exclusive(); +} + +#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE) +#define SAFETY !(in_exclusive && in_cpu) + +never { /* ! ([] SAFETY && <> [] LIVENESS) */ + do + // once the liveness property is satisfied, this is not executable + // and the never clause is not accepted + :: ! LIVENESS -> accept_liveness: skip + :: 1 -> assert(SAFETY) + od; +} diff --git a/docs/spin/win32-qemu-event.promela b/docs/spin/win32-qemu-event.promela new file mode 100644 index 0000000000..c446a71555 --- /dev/null +++ b/docs/spin/win32-qemu-event.promela @@ -0,0 +1,98 @@ +/* + * This model describes the implementation of QemuEvent in + * util/qemu-thread-win32.c. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. 
+ * + * To verify it: + * spin -a docs/spin/win32-qemu-event.promela + * gcc -O2 pan.c -DSAFETY + * ./a.out + */ + +bool event; +int value; + +/* Primitives for a Win32 event */ +#define RAW_RESET event = false +#define RAW_SET event = true +#define RAW_WAIT do :: event -> break; od + +#if 0 +/* Basic sanity checking: test the Win32 event primitives */ +#define RESET RAW_RESET +#define SET RAW_SET +#define WAIT RAW_WAIT +#else +/* Full model: layer a userspace-only fast path on top of the RAW_* + * primitives. SET/RESET/WAIT have exactly the same semantics as + * RAW_SET/RAW_RESET/RAW_WAIT, but try to avoid invoking them. + */ +#define EV_SET 0 +#define EV_FREE 1 +#define EV_BUSY -1 + +int state = EV_FREE; + +int xchg_result; +#define SET if :: state != EV_SET -> \ + atomic { /* xchg_result=xchg(state, EV_SET) */ \ + xchg_result = state; \ + state = EV_SET; \ + } \ + if :: xchg_result == EV_BUSY -> RAW_SET; \ + :: else -> skip; \ + fi; \ + :: else -> skip; \ + fi + +#define RESET if :: state == EV_SET -> atomic { state = state | EV_FREE; } \ + :: else -> skip; \ + fi + +int tmp1, tmp2; +#define WAIT tmp1 = state; \ + if :: tmp1 != EV_SET -> \ + if :: tmp1 == EV_FREE -> \ + RAW_RESET; \ + atomic { /* tmp2=cas(state, EV_FREE, EV_BUSY) */ \ + tmp2 = state; \ + if :: tmp2 == EV_FREE -> state = EV_BUSY; \ + :: else -> skip; \ + fi; \ + } \ + if :: tmp2 == EV_SET -> tmp1 = EV_SET; \ + :: else -> tmp1 = EV_BUSY; \ + fi; \ + :: else -> skip; \ + fi; \ + assert(tmp1 != EV_FREE); \ + if :: tmp1 == EV_BUSY -> RAW_WAIT; \ + :: else -> skip; \ + fi; \ + :: else -> skip; \ + fi +#endif + +active proctype waiter() +{ + if + :: !value -> + RESET; + if + :: !value -> WAIT; + :: else -> skip; + fi; + :: else -> skip; + fi; + assert(value); +} + +active proctype notifier() +{ + value = true; + SET; +} diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela deleted file mode 100644 index c91cfca9f7..0000000000 --- a/docs/tcg-exclusive.promela +++ /dev/null @@ -1,225 +0,0 @@ -/* - * This 
model describes the implementation of exclusive sections in - * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start, - * cpu_exec_end). - * - * Author: Paolo Bonzini <pbonzini@redhat.com> - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. - * - * To verify it: - * spin -a docs/tcg-exclusive.promela - * gcc pan.c -O2 - * ./a.out -a - * - * Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX, - * TEST_EXPENSIVE. - */ - -// Define the missing parameters for the model -#ifndef N_CPUS -#define N_CPUS 2 -#warning defaulting to 2 CPU processes -#endif - -// the expensive test is not so expensive for <= 2 CPUs -// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs -// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM -#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX) -#define TEST_EXPENSIVE -#endif - -#ifndef N_EXCLUSIVE -# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE -# define N_EXCLUSIVE 2 -# warning defaulting to 2 concurrent exclusive sections -# else -# define N_EXCLUSIVE 1 -# warning defaulting to 1 concurrent exclusive sections -# endif -#endif -#ifndef N_CYCLES -# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE -# define N_CYCLES 2 -# warning defaulting to 2 CPU cycles -# else -# define N_CYCLES 1 -# warning defaulting to 1 CPU cycles -# endif -#endif - - -// synchronization primitives. condition variables require a -// process-local "cond_t saved;" variable. 
- -#define mutex_t byte -#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 } -#define MUTEX_UNLOCK(m) m = 0 - -#define cond_t int -#define COND_WAIT(c, m) { \ - saved = c; \ - MUTEX_UNLOCK(m); \ - c != saved -> MUTEX_LOCK(m); \ - } -#define COND_BROADCAST(c) c++ - -// this is the logic from cpus-common.c - -mutex_t mutex; -cond_t exclusive_cond; -cond_t exclusive_resume; -byte pending_cpus; - -byte running[N_CPUS]; -byte has_waiter[N_CPUS]; - -#define exclusive_idle() \ - do \ - :: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \ - :: else -> break; \ - od - -#define start_exclusive() \ - MUTEX_LOCK(mutex); \ - exclusive_idle(); \ - pending_cpus = 1; \ - \ - i = 0; \ - do \ - :: i < N_CPUS -> { \ - if \ - :: running[i] -> has_waiter[i] = 1; pending_cpus++; \ - :: else -> skip; \ - fi; \ - i++; \ - } \ - :: else -> break; \ - od; \ - \ - do \ - :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \ - :: else -> break; \ - od; \ - MUTEX_UNLOCK(mutex); - -#define end_exclusive() \ - MUTEX_LOCK(mutex); \ - pending_cpus = 0; \ - COND_BROADCAST(exclusive_resume); \ - MUTEX_UNLOCK(mutex); - -#ifdef USE_MUTEX -// Simple version using mutexes -#define cpu_exec_start(id) \ - MUTEX_LOCK(mutex); \ - exclusive_idle(); \ - running[id] = 1; \ - MUTEX_UNLOCK(mutex); - -#define cpu_exec_end(id) \ - MUTEX_LOCK(mutex); \ - running[id] = 0; \ - if \ - :: pending_cpus -> { \ - pending_cpus--; \ - if \ - :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ - :: else -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - MUTEX_UNLOCK(mutex); -#else -// Wait-free fast path, only needs mutex when concurrent with -// an exclusive section -#define cpu_exec_start(id) \ - running[id] = 1; \ - if \ - :: pending_cpus -> { \ - MUTEX_LOCK(mutex); \ - if \ - :: !has_waiter[id] -> { \ - running[id] = 0; \ - exclusive_idle(); \ - running[id] = 1; \ - } \ - :: else -> skip; \ - fi; \ - MUTEX_UNLOCK(mutex); \ - } \ - :: else -> skip; \ - fi; - -#define cpu_exec_end(id) \ - running[id] = 0; 
\ - if \ - :: pending_cpus -> { \ - MUTEX_LOCK(mutex); \ - if \ - :: has_waiter[id] -> { \ - has_waiter[id] = 0; \ - pending_cpus--; \ - if \ - :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ - :: else -> skip; \ - fi; \ - } \ - :: else -> skip; \ - fi; \ - MUTEX_UNLOCK(mutex); \ - } \ - :: else -> skip; \ - fi -#endif - -// Promela processes - -byte done_cpu; -byte in_cpu; -active[N_CPUS] proctype cpu() -{ - byte id = _pid % N_CPUS; - byte cycles = 0; - cond_t saved; - - do - :: cycles == N_CYCLES -> break; - :: else -> { - cycles++; - cpu_exec_start(id) - in_cpu++; - done_cpu++; - in_cpu--; - cpu_exec_end(id) - } - od; -} - -byte done_exclusive; -byte in_exclusive; -active[N_EXCLUSIVE] proctype exclusive() -{ - cond_t saved; - byte i; - - start_exclusive(); - in_exclusive = 1; - done_exclusive++; - in_exclusive = 0; - end_exclusive(); -} - -#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE) -#define SAFETY !(in_exclusive && in_cpu) - -never { /* ! ([] SAFETY && <> [] LIVENESS) */ - do - // once the liveness property is satisfied, this is not executable - // and the never clause is not accepted - :: ! LIVENESS -> accept_liveness: skip - :: 1 -> assert(SAFETY) - od; -} diff --git a/docs/tracing.txt b/docs/tracing.txt deleted file mode 100644 index 8c0029beca..0000000000 --- a/docs/tracing.txt +++ /dev/null @@ -1,442 +0,0 @@ -= Tracing = - -== Introduction == - -This document describes the tracing infrastructure in QEMU and how to use it -for debugging, profiling, and observing execution. - -== Quickstart == - -1. Build with the 'simple' trace backend: - - ./configure --enable-trace-backends=simple - make - -2. Create a file with the events you want to trace: - - echo bdrv_aio_readv > /tmp/events - echo bdrv_aio_writev >> /tmp/events - -3. Run the virtual machine to produce a trace file: - - qemu -trace events=/tmp/events ... # your normal QEMU invocation - -4. 
Pretty-print the binary trace file: - - ./scripts/simpletrace.py trace-events-all trace-* # Override * with QEMU <pid> - -== Trace events == - -=== Sub-directory setup === - -Each directory in the source tree can declare a set of static trace events -in a local "trace-events" file. All directories which contain "trace-events" -files must be listed in the "trace-events-subdirs" make variable in the top -level Makefile.objs. During build, the "trace-events" file in each listed -subdirectory will be processed by the "tracetool" script to generate code for -the trace events. - -The individual "trace-events" files are merged into a "trace-events-all" file, -which is also installed into "/usr/share/qemu" with the name "trace-events". -This merged file is to be used by the "simpletrace.py" script to later analyse -traces in the simpletrace data format. - -In the sub-directory the following files will be automatically generated - - - trace.c - the trace event state declarations - - trace.h - the trace event enums and probe functions - - trace-dtrace.h - DTrace event probe specification - - trace-dtrace.dtrace - DTrace event probe helper declaration - - trace-dtrace.o - binary DTrace provider (generated by dtrace) - - trace-ust.h - UST event probe helper declarations - -Source files in the sub-directory should #include the local 'trace.h' file, -without any sub-directory path prefix. eg io/channel-buffer.c would do - - #include "trace.h" - -To access the 'io/trace.h' file. While it is possible to include a trace.h -file from outside a source files' own sub-directory, this is discouraged in -general. It is strongly preferred that all events be declared directly in -the sub-directory that uses them. The only exception is where there are some -shared trace events defined in the top level directory trace-events file. -The top level directory generates trace files with a filename prefix of -"trace-root" instead of just "trace". 
This is to avoid ambiguity between -a trace.h in the current directory, vs the top level directory. - -=== Using trace events === - -Trace events are invoked directly from source code like this: - - #include "trace.h" /* needed for trace event prototype */ - - void *qemu_vmalloc(size_t size) - { - void *ptr; - size_t align = QEMU_VMALLOC_ALIGN; - - if (size < align) { - align = getpagesize(); - } - ptr = qemu_memalign(align, size); - trace_qemu_vmalloc(size, ptr); - return ptr; - } - -=== Declaring trace events === - -The "tracetool" script produces the trace.h header file which is included by -every source file that uses trace events. Since many source files include -trace.h, it uses a minimum of types and other header files included to keep the -namespace clean and compile times and dependencies down. - -Trace events should use types as follows: - - * Use stdint.h types for fixed-size types. Most offsets and guest memory - addresses are best represented with uint32_t or uint64_t. Use fixed-size - types over primitive types whose size may change depending on the host - (32-bit versus 64-bit) so trace events don't truncate values or break - the build. - - * Use void * for pointers to structs or for arrays. The trace.h header - cannot include all user-defined struct declarations and it is therefore - necessary to use void * for pointers to structs. - - * For everything else, use primitive scalar types (char, int, long) with the - appropriate signedness. - -Format strings should reflect the types defined in the trace event. Take -special care to use PRId64 and PRIu64 for int64_t and uint64_t types, -respectively. This ensures portability between 32- and 64-bit platforms. - -Each event declaration will start with the event name, then its arguments, -finally a format string for pretty-printing. For example: - - qemu_vmalloc(size_t size, void *ptr) "size %zu ptr %p" - qemu_vfree(void *ptr) "ptr %p" - - -=== Hints for adding new trace events === - -1. 
Trace state changes in the code. Interesting points in the code usually - involve a state change like starting, stopping, allocating, freeing. State - changes are good trace events because they can be used to understand the - execution of the system. - -2. Trace guest operations. Guest I/O accesses like reading device registers - are good trace events because they can be used to understand guest - interactions. - -3. Use correlator fields so the context of an individual line of trace output - can be understood. For example, trace the pointer returned by malloc and - used as an argument to free. This way mallocs and frees can be matched up. - Trace events with no context are not very useful. - -4. Name trace events after their function. If there are multiple trace events - in one function, append a unique distinguisher at the end of the name. - -== Generic interface and monitor commands == - -You can programmatically query and control the state of trace events through a -backend-agnostic interface provided by the header "trace/control.h". - -Note that some of the backends do not provide an implementation for some parts -of this interface, in which case QEMU will just print a warning (please refer to -header "trace/control.h" to see which routines are backend-dependent). - -The state of events can also be queried and modified through monitor commands: - -* info trace-events - View available trace events and their state. State 1 means enabled, state 0 - means disabled. - -* trace-event NAME on|off - Enable/disable a given trace event or a group of events (using wildcards). - -The "-trace events=<file>" command line argument can be used to enable the -events listed in <file> from the very beginning of the program. This file must -contain one event name per line. - -If a line in the "-trace events=<file>" file begins with a '-', the trace event -will be disabled instead of enabled. 
This is useful when a wildcard was used -to enable an entire family of events but one noisy event needs to be disabled. - -Wildcard matching is supported in both the monitor command "trace-event" and the -events list file. That means you can enable/disable the events having a common -prefix in a batch. For example, virtio-blk trace events could be enabled using -the following monitor command: - - trace-event virtio_blk_* on - -== Trace backends == - -The "tracetool" script automates tedious trace event code generation and also -keeps the trace event declarations independent of the trace backend. The trace -events are not tightly coupled to a specific trace backend, such as LTTng or -SystemTap. Support for trace backends can be added by extending the "tracetool" -script. - -The trace backends are chosen at configure time: - - ./configure --enable-trace-backends=simple - -For a list of supported trace backends, try ./configure --help or see below. -If multiple backends are enabled, the trace is sent to them all. - -If no backends are explicitly selected, configure will default to the -"log" backend. - -The following subsections describe the supported trace backends. - -=== Nop === - -The "nop" backend generates empty trace event functions so that the compiler -can optimize out trace events completely. This imposes no performance -penalty. - -Note that regardless of the selected trace backend, events with the "disable" -property will be generated with the "nop" backend. - -=== Log === - -The "log" backend sends trace events directly to standard error. This -effectively turns trace events into debug printfs. - -This is the simplest backend and can be used together with existing code that -uses DPRINTF(). - -=== Simpletrace === - -The "simple" backend supports common use cases and comes as part of the QEMU -source tree. It may not be as powerful as platform-specific or third-party -trace backends but it is portable. 
This is the recommended trace backend -unless you have specific needs for more advanced backends. - -=== Ftrace === - -The "ftrace" backend writes trace data to ftrace marker. This effectively -sends trace events to ftrace ring buffer, and you can compare qemu trace -data and kernel(especially kvm.ko when using KVM) trace data. - -if you use KVM, enable kvm events in ftrace: - - # echo 1 > /sys/kernel/debug/tracing/events/kvm/enable - -After running qemu by root user, you can get the trace: - - # cat /sys/kernel/debug/tracing/trace - -Restriction: "ftrace" backend is restricted to Linux only. - -=== Syslog === - -The "syslog" backend sends trace events using the POSIX syslog API. The log -is opened specifying the LOG_DAEMON facility and LOG_PID option (so events -are tagged with the pid of the particular QEMU process that generated -them). All events are logged at LOG_INFO level. - -NOTE: syslog may squash duplicate consecutive trace events and apply rate - limiting. - -Restriction: "syslog" backend is restricted to POSIX compliant OS. - -==== Monitor commands ==== - -* trace-file on|off|flush|set <path> - Enable/disable/flush the trace file or set the trace file name. - -==== Analyzing trace files ==== - -The "simple" backend produces binary trace files that can be formatted with the -simpletrace.py script. The script takes the "trace-events-all" file and the -binary trace: - - ./scripts/simpletrace.py trace-events-all trace-12345 - -You must ensure that the same "trace-events-all" file was used to build QEMU, -otherwise trace event declarations may have changed and output will not be -consistent. - -=== LTTng Userspace Tracer === - -The "ust" backend uses the LTTng Userspace Tracer library. There are no -monitor commands built into QEMU, instead UST utilities should be used to list, -enable/disable, and dump traces. - -Package lttng-tools is required for userspace tracing. 
You must ensure that the -current user belongs to the "tracing" group, or manually launch the -lttng-sessiond daemon for the current user prior to running any instance of -QEMU. - -While running an instrumented QEMU, LTTng should be able to list all available -events: - - lttng list -u - -Create tracing session: - - lttng create mysession - -Enable events: - - lttng enable-event qemu:g_malloc -u - -Where the events can either be a comma-separated list of events, or "-a" to -enable all tracepoint events. Start and stop tracing as needed: - - lttng start - lttng stop - -View the trace: - - lttng view - -Destroy tracing session: - - lttng destroy - -Babeltrace can be used at any later time to view the trace: - - babeltrace $HOME/lttng-traces/mysession-<date>-<time> - -=== SystemTap === - -The "dtrace" backend uses DTrace sdt probes but has only been tested with -SystemTap. When SystemTap support is detected a .stp file with wrapper probes -is generated to make use in scripts more convenient. This step can also be -performed manually after a build in order to change the binary name in the .stp -probes: - - scripts/tracetool.py --backends=dtrace --format=stap \ - --binary path/to/qemu-binary \ - --target-type system \ - --target-name x86_64 \ - <trace-events-all >qemu.stp - -== Trace event properties == - -Each event in the "trace-events-all" file can be prefixed with a space-separated -list of zero or more of the following event properties. - -=== "disable" === - -If a specific trace event is going to be invoked a huge number of times, this -might have a noticeable performance impact even when the event is -programmatically disabled. - -In this case you should declare such event with the "disable" property. This -will effectively disable the event at compile time (by using the "nop" backend), -thus having no performance impact at all on regular builds (i.e., unless you -edit the "trace-events-all" file). 
- -In addition, there might be cases where relatively complex computations must be -performed to generate values that are only used as arguments for a trace -function. In these cases you can use the macro 'TRACE_${EVENT_NAME}_ENABLED' to -guard such computations and avoid its compilation when the event is disabled: - - #include "trace.h" /* needed for trace event prototype */ - - void *qemu_vmalloc(size_t size) - { - void *ptr; - size_t align = QEMU_VMALLOC_ALIGN; - - if (size < align) { - align = getpagesize(); - } - ptr = qemu_memalign(align, size); - if (TRACE_QEMU_VMALLOC_ENABLED) { /* preprocessor macro */ - void *complex; - /* some complex computations to produce the 'complex' value */ - trace_qemu_vmalloc(size, ptr, complex); - } - return ptr; - } - -You can check both if the event has been disabled and is dynamically enabled at -the same time using the 'trace_event_get_state' routine (see header -"trace/control.h" for more information). - -=== "tcg" === - -Guest code generated by TCG can be traced by defining an event with the "tcg" -event property. Internally, this property generates two events: -"<eventname>_trans" to trace the event at translation time, and -"<eventname>_exec" to trace the event at execution time. - -Instead of using these two events, you should instead use the function -"trace_<eventname>_tcg" during translation (TCG code generation). This function -will automatically call "trace_<eventname>_trans", and will generate the -necessary TCG code to call "trace_<eventname>_exec" during guest code execution. - -Events with the "tcg" property can be declared in the "trace-events" file with a -mix of native and TCG types, and "trace_<eventname>_tcg" will gracefully forward -them to the "<eventname>_trans" and "<eventname>_exec" events. Since TCG values -are not known at translation time, these are ignored by the "<eventname>_trans" -event. 
Because of this, the entry in the "trace-events" file needs two printing -formats (separated by a comma): - - tcg foo(uint8_t a1, TCGv_i32 a2) "a1=%d", "a1=%d a2=%d" - -For example: - - #include "trace-tcg.h" - - void some_disassembly_func (...) - { - uint8_t a1 = ...; - TCGv_i32 a2 = ...; - trace_foo_tcg(a1, a2); - } - -This will immediately call: - - void trace_foo_trans(uint8_t a1); - -and will generate the TCG code to call: - - void trace_foo(uint8_t a1, uint32_t a2); - -=== "vcpu" === - -Identifies events that trace vCPU-specific information. It implicitly adds a -"CPUState*" argument, and extends the tracing print format to show the vCPU -information. If used together with the "tcg" property, it adds a second -"TCGv_env" argument that must point to the per-target global TCG register that -points to the vCPU when guest code is executed (usually the "cpu_env" variable). - -The "tcg" and "vcpu" properties are currently only honored in the root -./trace-events file. - -The following example events: - - foo(uint32_t a) "a=%x" - vcpu bar(uint32_t a) "a=%x" - tcg vcpu baz(uint32_t a) "a=%x", "a=%x" - -Can be used as: - - #include "trace-tcg.h" - - CPUArchState *env; - TCGv_ptr cpu_env; - - void some_disassembly_func(...) 
- { - /* trace emitted at this point */ - trace_foo(0xd1); - /* trace emitted at this point */ - trace_bar(ENV_GET_CPU(env), 0xd2); - /* trace emitted at this point (env) and when guest code is executed (cpu_env) */ - trace_baz_tcg(ENV_GET_CPU(env), cpu_env, 0xd3); - } - -If the translating vCPU has address 0xc1 and code is later executed by vCPU -0xc2, this would be an example output: - - // at guest code translation - foo a=0xd1 - bar cpu=0xc1 a=0xd2 - baz_trans cpu=0xc1 a=0xd3 - // at guest code execution - baz_exec cpu=0xc2 a=0xd3 diff --git a/docs/virtio-migration.txt b/docs/virtio-migration.txt deleted file mode 100644 index 98a6b0ffb5..0000000000 --- a/docs/virtio-migration.txt +++ /dev/null @@ -1,108 +0,0 @@ -Virtio devices and migration -============================ - -Copyright 2015 IBM Corp. - -This work is licensed under the terms of the GNU GPL, version 2 or later. See -the COPYING file in the top-level directory. - -Saving and restoring the state of virtio devices is a bit of a twisty maze, -for several reasons: -- state is distributed between several parts: - - virtio core, for common fields like features, number of queues, ... - - virtio transport (pci, ccw, ...), for the different proxy devices and - transport specific state (msix vectors, indicators, ...) - - virtio device (net, blk, ...), for the different device types and their - state (mac address, request queue, ...) -- most fields are saved via the stream interface; subsequently, subsections - have been added to make cross-version migration possible - -This file attempts to document the current procedure and point out some -caveats. 
- - -Save state procedure -==================== - -virtio core virtio transport virtio device ------------ ---------------- ------------- - - save() function registered - via VMState wrapper on - device class -virtio_save() <---------- - ------> save_config() - - save proxy device - - save transport-specific - device fields -- save common device - fields -- save common virtqueue - fields - ------> save_queue() - - save transport-specific - virtqueue fields - ------> save_device() - - save device-specific - fields -- save subsections - - device endianness, - if changed from - default endianness - - 64 bit features, if - any high feature bit - is set - - virtio-1 virtqueue - fields, if VERSION_1 - is set - - -Load state procedure -==================== - -virtio core virtio transport virtio device ------------ ---------------- ------------- - - load() function registered - via VMState wrapper on - device class -virtio_load() <---------- - ------> load_config() - - load proxy device - - load transport-specific - device fields -- load common device - fields -- load common virtqueue - fields - ------> load_queue() - - load transport-specific - virtqueue fields -- notify guest - ------> load_device() - - load device-specific - fields -- load subsections - - device endianness - - 64 bit features - - virtio-1 virtqueue - fields -- sanitize endianness -- sanitize features -- virtqueue index sanity - check - - feature-dependent setup - - -Implications of this setup -========================== - -Devices need to be careful in their state processing during load: The -load_device() procedure is invoked by the core before subsections have -been loaded. Any code that depends on information transmitted in subsections -therefore has to be invoked in the device's load() function _after_ -virtio_load() returned (like e.g. code depending on features). - -Any extension of the state being migrated should be done in subsections -added to the core for compatibility reasons. 
If transport or device specific -state is added, core needs to invoke a callback from the new subsection. diff --git a/docs/win32-qemu-event.promela b/docs/win32-qemu-event.promela deleted file mode 100644 index c446a71555..0000000000 --- a/docs/win32-qemu-event.promela +++ /dev/null @@ -1,98 +0,0 @@ -/* - * This model describes the implementation of QemuEvent in - * util/qemu-thread-win32.c. - * - * Author: Paolo Bonzini <pbonzini@redhat.com> - * - * This file is in the public domain. If you really want a license, - * the WTFPL will do. - * - * To verify it: - * spin -a docs/win32-qemu-event.promela - * gcc -O2 pan.c -DSAFETY - * ./a.out - */ - -bool event; -int value; - -/* Primitives for a Win32 event */ -#define RAW_RESET event = false -#define RAW_SET event = true -#define RAW_WAIT do :: event -> break; od - -#if 0 -/* Basic sanity checking: test the Win32 event primitives */ -#define RESET RAW_RESET -#define SET RAW_SET -#define WAIT RAW_WAIT -#else -/* Full model: layer a userspace-only fast path on top of the RAW_* - * primitives. SET/RESET/WAIT have exactly the same semantics as - * RAW_SET/RAW_RESET/RAW_WAIT, but try to avoid invoking them.
- */ -#define EV_SET 0 -#define EV_FREE 1 -#define EV_BUSY -1 - -int state = EV_FREE; - -int xchg_result; -#define SET if :: state != EV_SET -> \ - atomic { /* xchg_result=xchg(state, EV_SET) */ \ - xchg_result = state; \ - state = EV_SET; \ - } \ - if :: xchg_result == EV_BUSY -> RAW_SET; \ - :: else -> skip; \ - fi; \ - :: else -> skip; \ - fi - -#define RESET if :: state == EV_SET -> atomic { state = state | EV_FREE; } \ - :: else -> skip; \ - fi - -int tmp1, tmp2; -#define WAIT tmp1 = state; \ - if :: tmp1 != EV_SET -> \ - if :: tmp1 == EV_FREE -> \ - RAW_RESET; \ - atomic { /* tmp2=cas(state, EV_FREE, EV_BUSY) */ \ - tmp2 = state; \ - if :: tmp2 == EV_FREE -> state = EV_BUSY; \ - :: else -> skip; \ - fi; \ - } \ - if :: tmp2 == EV_SET -> tmp1 = EV_SET; \ - :: else -> tmp1 = EV_BUSY; \ - fi; \ - :: else -> skip; \ - fi; \ - assert(tmp1 != EV_FREE); \ - if :: tmp1 == EV_BUSY -> RAW_WAIT; \ - :: else -> skip; \ - fi; \ - :: else -> skip; \ - fi -#endif - -active proctype waiter() -{ - if - :: !value -> - RESET; - if - :: !value -> WAIT; - :: else -> skip; - fi; - :: else -> skip; - fi; - assert(value); -} - -active proctype notifier() -{ - value = true; - SET; -} diff --git a/docs/writing-qmp-commands.txt b/docs/writing-qmp-commands.txt deleted file mode 100644 index 1e6375495b..0000000000 --- a/docs/writing-qmp-commands.txt +++ /dev/null @@ -1,607 +0,0 @@ -= How to write QMP commands using the QAPI framework = - -This document is a step-by-step guide on how to write new QMP commands using -the QAPI framework. It also shows how to implement new style HMP commands. - -This document doesn't discuss QMP protocol level details, nor does it dive -into the QAPI framework implementation. - -For an in-depth introduction to the QAPI framework, please refer to -docs/qapi-code-gen.txt. For documentation about the QMP protocol, -start with docs/qmp-intro.txt. - -== Overview == - -Generally speaking, the following steps should be taken in order to write a -new QMP command. 
- -1. Write the command's and type(s) specification in the QAPI schema file - (qapi-schema.json in the root source directory) - -2. Write the QMP command itself, which is a regular C function. Preferably, - the command should be exported by some QEMU subsystem. But it can also be - added to the qmp.c file - -3. At this point the command can be tested under the QMP protocol - -4. Write the HMP command equivalent. This is not required and should only be - done if it does make sense to have the functionality in HMP. The HMP command - is implemented in terms of the QMP command - -The following sections will demonstrate each of the steps above. We will start -very simple and get more complex as we progress. - -=== Testing === - -For all the examples in the next sections, the test setup is the same and is -shown here. - -First, QEMU should be started as: - -# /path/to/your/source/qemu [...] \ - -chardev socket,id=qmp,port=4444,host=localhost,server \ - -mon chardev=qmp,mode=control,pretty=on - -Then, in a different terminal: - -$ telnet localhost 4444 -Trying 127.0.0.1... -Connected to localhost. -Escape character is '^]'. -{ - "QMP": { - "version": { - "qemu": { - "micro": 50, - "minor": 15, - "major": 0 - }, - "package": "" - }, - "capabilities": [ - ] - } -} - -The above output is the QMP server saying you're connected. The server is -actually in capabilities negotiation mode. To enter in command mode type: - -{ "execute": "qmp_capabilities" } - -Then the server should respond: - -{ - "return": { - } -} - -Which is QMP's way of saying "the latest command executed OK and didn't return -any data". Now you're ready to enter the QMP example commands as explained in -the following sections. - -== Writing a command that doesn't return data == - -That's the most simple QMP command that can be written. Usually, this kind of -command carries some meaningful action in QEMU but here it will just print -"Hello, world" to the standard output. 
- -Our command will be called "hello-world". It takes no arguments, nor does it -return any data. - -The first step is to add the following line to the bottom of the -qapi-schema.json file: - -{ 'command': 'hello-world' } - -The "command" keyword defines a new QMP command. It's a JSON object. All -schema entries are JSON objects. The line above will instruct the QAPI to -generate any prototypes and the necessary code to marshal and unmarshal -protocol data. - -The next step is to write the "hello-world" implementation. As explained -earlier, it's preferable for commands to live in QEMU subsystems. But -"hello-world" doesn't pertain to any, so we put its implementation in qmp.c: - -void qmp_hello_world(Error **errp) -{ - printf("Hello, world!\n"); -} - -There are a few things to be noticed: - -1. QMP command implementation functions must be prefixed with "qmp_" -2. qmp_hello_world() returns void, this is in accordance with the fact that the - command doesn't return any data -3. It takes an "Error **" argument. This is required. Later we will see how to - return errors and take additional arguments. The Error argument should not - be touched if the command doesn't return errors -4. We won't add the function's prototype. That's automatically done by the QAPI -5. Printing to the terminal is discouraged for QMP commands, we do it here - because it's the easiest way to demonstrate a QMP command - -You're done. Now build qemu, run it as suggested in the "Testing" section, -and then type the following QMP command: - -{ "execute": "hello-world" } - -Then check the terminal running qemu and look for the "Hello, world" string. If -you don't see it then something went wrong. - -=== Arguments === - -Let's add an argument called "message" to our "hello-world" command. The new -argument will contain the string to be printed to stdout. It's an optional -argument, if it's not present we print our default "Hello, world" string.
- -The first change we have to make is to modify the command specification in the -schema file to the following: - -{ 'command': 'hello-world', 'data': { '*message': 'str' } } - -Notice the new 'data' member in the schema. It's a JSON object in which each -element is an argument to the command in question. Also notice the asterisk, -it's used to mark the argument optional (that means that you shouldn't use it -for mandatory arguments). Finally, 'str' is the argument's type, which -stands for "string". The QAPI also supports integers, booleans, enumerations -and user defined types. - -Now, let's update our C implementation in qmp.c: - -void qmp_hello_world(bool has_message, const char *message, Error **errp) -{ - if (has_message) { - printf("%s\n", message); - } else { - printf("Hello, world\n"); - } -} - -There are two important details to be noticed: - -1. All optional arguments are accompanied by a 'has_' boolean, which is true - if the optional argument is present and false otherwise -2. The C implementation signature must follow the schema's argument ordering, - which is defined by the "data" member - -Time to test our new version of the "hello-world" command. Build qemu, run it as -described in the "Testing" section and then send two commands: - -{ "execute": "hello-world" } -{ - "return": { - } -} - -{ "execute": "hello-world", "arguments": { "message": "We love qemu" } } -{ - "return": { - } -} - -You should see "Hello, world" and "We love qemu" in the terminal running qemu, -if you don't see these strings, then something went wrong. - -=== Errors === - -QMP commands should use the error interface exported by the error.h header -file. Basically, most errors are set by calling the error_setg() function. - -Let's say we don't accept the string "message" to contain the word "love".
If -it does contain it, we want the "hello-world" command to return an error: - -void qmp_hello_world(bool has_message, const char *message, Error **errp) -{ - if (has_message) { - if (strstr(message, "love")) { - error_setg(errp, "the word 'love' is not allowed"); - return; - } - printf("%s\n", message); - } else { - printf("Hello, world\n"); - } -} - -The first argument to the error_setg() function is the Error pointer -to pointer, which is passed to all QMP functions. The next argument is a human -description of the error, this is a free-form printf-like string. - -Let's test the example above. Build qemu, run it as defined in the "Testing" -section, and then issue the following command: - -{ "execute": "hello-world", "arguments": { "message": "all you need is love" } } - -The QMP server's response should be: - -{ - "error": { - "class": "GenericError", - "desc": "the word 'love' is not allowed" - } -} - -As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR -(done by default when using error_setg()). There are two exceptions to -this rule: - - 1. A non-generic ErrorClass value exists* for the failure you want to report - (eg. DeviceNotFound) - - 2. Management applications have to take special action on the failure you - want to report, hence you have to add a new ErrorClass value so that they - can check for it - -If the failure you want to report falls into one of the two cases above, -use error_set() with a second argument of an ErrorClass value. - - * All existing ErrorClass values are defined in the qapi-schema.json file - -=== Command Documentation === - -There's only one step missing to make "hello-world"'s implementation complete, -and that's its documentation in the schema file. - -This is very important. No QMP command will be accepted in QEMU without proper -documentation. 
- -There are many examples of such documentation in the schema file already, but -here goes "hello-world"'s new entry for the qapi-schema.json file: - -## -# @hello-world -# -# Print a client provided string to the standard output stream. -# -# @message: string to be printed -# -# Returns: Nothing on success. -# -# Notes: if @message is not provided, the "Hello, world" string will -# be printed instead -# -# Since: <next qemu stable release, eg. 1.0> -## -{ 'command': 'hello-world', 'data': { '*message': 'str' } } - -Please, note that the "Returns" clause is optional if a command doesn't return -any data nor any errors. - -=== Implementing the HMP command === - -Now that the QMP command is in place, we can also make it available in the human -monitor (HMP). - -With the introduction of the QAPI, HMP commands make QMP calls. Most of the -time HMP commands are simple wrappers. All HMP commands implementation exist in -the hmp.c file. - -Here's the implementation of the "hello-world" HMP command: - -void hmp_hello_world(Monitor *mon, const QDict *qdict) -{ - const char *message = qdict_get_try_str(qdict, "message"); - Error *err = NULL; - - qmp_hello_world(!!message, message, &err); - if (err) { - monitor_printf(mon, "%s\n", error_get_pretty(err)); - error_free(err); - return; - } -} - -Also, you have to add the function's prototype to the hmp.h file. - -There are three important points to be noticed: - -1. The "mon" and "qdict" arguments are mandatory for all HMP functions. The - former is the monitor object. The latter is how the monitor passes - arguments entered by the user to the command implementation -2. hmp_hello_world() performs error checking. In this example we just print - the error description to the user, but we could do more, like taking - different actions depending on the error qmp_hello_world() returns -3. 
The "err" variable must be initialized to NULL before performing the - QMP call - -There's one last step to actually make the command available to monitor users, -we should add it to the hmp-commands.hx file: - - { - .name = "hello-world", - .args_type = "message:s?", - .params = "hello-world [message]", - .help = "Print message to the standard output", - .cmd = hmp_hello_world, - }, - -STEXI -@item hello_world @var{message} -@findex hello_world -Print message to the standard output -ETEXI - -To test this you have to open a user monitor and issue the "hello-world" -command. It might be instructive to check the command's documentation with -HMP's "help" command. - -Please, check the "-monitor" command-line option to know how to open a user -monitor. - -== Writing a command that returns data == - -A QMP command is capable of returning any data the QAPI supports like integers, -strings, booleans, enumerations and user defined types. - -In this section we will focus on user defined types. Please, check the QAPI -documentation for information about the other types. - -=== User Defined Types === - -FIXME This example needs to be redone after commit 6d32717 - -For this example we will write the query-alarm-clock command, which returns -information about QEMU's timer alarm. For more information about it, please -check the "-clock" command-line option. - -We want to return two pieces of information. The first one is the alarm clock's -name. The second one is when the next alarm will fire. The former information is -returned as a string, the latter is an integer in nanoseconds (which is not -very useful in practice, as the timer has probably already fired when the -information reaches the client). - -The best way to return that data is to create a new QAPI type, as shown below: - -## -# @QemuAlarmClock -# -# QEMU alarm clock information. -# -# @clock-name: The alarm clock method's name. -# -# @next-deadline: The time (in nanoseconds) the next alarm will fire. 
-# -# Since: 1.0 -## -{ 'type': 'QemuAlarmClock', - 'data': { 'clock-name': 'str', '*next-deadline': 'int' } } - -The "type" keyword defines a new QAPI type. Its "data" member contains the -type's members. In this example our members are the "clock-name" and the -"next-deadline" one, which is optional. - -Now let's define the query-alarm-clock command: - -## -# @query-alarm-clock -# -# Return information about QEMU's alarm clock. -# -# Returns a @QemuAlarmClock instance describing the alarm clock method -# being currently used by QEMU (this is usually set by the '-clock' -# command-line option). -# -# Since: 1.0 -## -{ 'command': 'query-alarm-clock', 'returns': 'QemuAlarmClock' } - -Notice the "returns" keyword. As its name suggests, it's used to define the -data returned by a command. - -It's time to implement the qmp_query_alarm_clock() function, you can put it -in the qemu-timer.c file: - -QemuAlarmClock *qmp_query_alarm_clock(Error **errp) -{ - QemuAlarmClock *clock; - int64_t deadline; - - clock = g_malloc0(sizeof(*clock)); - - deadline = qemu_next_alarm_deadline(); - if (deadline > 0) { - clock->has_next_deadline = true; - clock->next_deadline = deadline; - } - clock->clock_name = g_strdup(alarm_timer->name); - - return clock; -} - -There are a number of things to be noticed: - -1. The QemuAlarmClock type is automatically generated by the QAPI framework, - its members correspond to the type's specification in the schema file -2. As specified in the schema file, the function returns a QemuAlarmClock - instance and takes no arguments (besides the "errp" one, which is mandatory - for all QMP functions) -3. The "clock" variable (which will point to our QAPI type instance) is - allocated by the regular g_malloc0() function. Note that we chose to - initialize the memory to zero. This is recommended for all QAPI types, as - it helps avoiding bad surprises (specially with booleans) -4. Remember that "next_deadline" is optional? 
All optional members have a - 'has_TYPE_NAME' member that should be properly set by the implementation, - as shown above -5. Even static strings, such as "alarm_timer->name", should be dynamically - allocated by the implementation. This is so because the QAPI also generates - a function to free its types and it cannot distinguish between dynamically - or statically allocated strings -6. You have to include the "qmp-commands.h" header file in qemu-timer.c, - otherwise qemu won't build - -Time to test the new command. Build qemu, run it as described in the "Testing" -section and try this: - -{ "execute": "query-alarm-clock" } -{ - "return": { - "next-deadline": 2368219, - "clock-name": "dynticks" - } -} - -==== The HMP command ==== - -Here's the HMP counterpart of the query-alarm-clock command: - -void hmp_info_alarm_clock(Monitor *mon) -{ - QemuAlarmClock *clock; - Error *err = NULL; - - clock = qmp_query_alarm_clock(&err); - if (err) { - monitor_printf(mon, "Could not query alarm clock information\n"); - error_free(err); - return; - } - - monitor_printf(mon, "Alarm clock method in use: '%s'\n", clock->clock_name); - if (clock->has_next_deadline) { - monitor_printf(mon, "Next alarm will fire in %" PRId64 " nanoseconds\n", - clock->next_deadline); - } - - qapi_free_QemuAlarmClock(clock); -} - -It's important to notice that hmp_info_alarm_clock() calls -qapi_free_QemuAlarmClock() to free the data returned by qmp_query_alarm_clock(). -For user defined types, the QAPI will generate a qapi_free_QAPI_TYPE_NAME() -function and that's what you have to use to free the types you define and -qapi_free_QAPI_TYPE_NAMEList() for list types (explained in the next section). -If the QMP call returns a string, then you should g_free() to free it. - -Also note that hmp_info_alarm_clock() performs error handling. That's not -strictly required if you're sure the QMP function doesn't return errors, but -it's good practice to always check for errors. 
- -Another important detail is that HMP's "info" commands don't go into the -hmp-commands.hx. Instead, they go into the info_cmds[] table, which is defined -in the monitor.c file. The entry for the "info alarmclock" follows: - - { - .name = "alarmclock", - .args_type = "", - .params = "", - .help = "show information about the alarm clock", - .cmd = hmp_info_alarm_clock, - }, - -To test this, run qemu and type "info alarmclock" in the user monitor. - -=== Returning Lists === - -For this example, we're going to return all available methods for the timer -alarm, which is pretty much what the command-line option "-clock ?" does, -except that we're also going to inform which method is in use. - -This first step is to define a new type: - -## -# @TimerAlarmMethod -# -# Timer alarm method information. -# -# @method-name: The method's name. -# -# @current: true if this alarm method is currently in use, false otherwise -# -# Since: 1.0 -## -{ 'type': 'TimerAlarmMethod', - 'data': { 'method-name': 'str', 'current': 'bool' } } - -The command will be called "query-alarm-methods", here is its schema -specification: - -## -# @query-alarm-methods -# -# Returns information about available alarm methods. -# -# Returns: a list of @TimerAlarmMethod for each method -# -# Since: 1.0 -## -{ 'command': 'query-alarm-methods', 'returns': ['TimerAlarmMethod'] } - -Notice the syntax for returning lists "'returns': ['TimerAlarmMethod']", this -should be read as "returns a list of TimerAlarmMethod instances". 
- -The C implementation follows: - -TimerAlarmMethodList *qmp_query_alarm_methods(Error **errp) -{ - TimerAlarmMethodList *method_list = NULL; - const struct qemu_alarm_timer *p; - bool current = true; - - for (p = alarm_timers; p->name; p++) { - TimerAlarmMethodList *info = g_malloc0(sizeof(*info)); - info->value = g_malloc0(sizeof(*info->value)); - info->value->method_name = g_strdup(p->name); - info->value->current = current; - - current = false; - - info->next = method_list; - method_list = info; - } - - return method_list; -} - -The most important difference from the previous examples is the -TimerAlarmMethodList type, which is automatically generated by the QAPI from -the TimerAlarmMethod type. - -Each list node is represented by a TimerAlarmMethodList instance. We have to -allocate it, and that's done inside the for loop: the "info" pointer points to -an allocated node. We also have to allocate the node's contents, which are -stored in its "value" member. In our example, the "value" member is a pointer -to a TimerAlarmMethod instance. - -Notice that the "current" variable is used as "true" only in the first -iteration of the loop. That's because the alarm timer method in use is the -first element of the alarm_timers array. Also notice that QAPI lists are handled -by hand and we return the head of the list.
- -Now build qemu, run it as explained in the "Testing" section and try our new -command: - -{ "execute": "query-alarm-methods" } -{ - "return": [ - { - "current": false, - "method-name": "unix" - }, - { - "current": true, - "method-name": "dynticks" - } - ] -} - -The HMP counterpart is a bit more complex than previous examples because it -has to traverse the list; it's shown below for reference: - -void hmp_info_alarm_methods(Monitor *mon) -{ - TimerAlarmMethodList *method_list, *method; - Error *err = NULL; - - method_list = qmp_query_alarm_methods(&err); - if (err) { - monitor_printf(mon, "Could not query alarm methods\n"); - error_free(err); - return; - } - - for (method = method_list; method; method = method->next) { - monitor_printf(mon, "%c %s\n", method->value->current ? '*' : ' ', - method->value->method_name); - } - - qapi_free_TimerAlarmMethodList(method_list); -}