migration: implementation of background snapshot thread

author Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>

Fri, 29 Jan 2021 10:14:06 +0000 (13:14 +0300)

committer Dr. David Alan Gilbert <dgilbert@redhat.com>

Mon, 8 Feb 2021 11:19:51 +0000 (11:19 +0000)
author Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
Fri, 29 Jan 2021 10:14:06 +0000 (13:14 +0300)
committer Dr. David Alan Gilbert <dgilbert@redhat.com>
Mon, 8 Feb 2021 11:19:51 +0000 (11:19 +0000)
diff --git a/migration/migration.c b/migration/migration.c

index 2262f348af4e5fd9d6b0f1c2733e84c73373ac3f..ecb4115d68db2cee98b1410dfcabab8d8b92ec01 100644 (file)
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2000,6 +2000,7 @@ void migrate_init(MigrationState *s)
       * locks.
       */
      s->cleanup_bh = 0;
+    s->vm_start_bh = 0;
      s->to_dst_file = NULL;
      s->state = MIGRATION_STATUS_NONE;
      s->rp_state.from_dst_file = NULL;
@@ -3217,6 +3218,50 @@ fail:
                        MIGRATION_STATUS_FAILED);
  }
  
+/**
+ * bg_migration_completion: Used by bg_migration_thread when after all the
+ *   RAM has been saved. The caller 'breaks' the loop when this returns.
+ *
+ * @s: Current migration state
+ */
+static void bg_migration_completion(MigrationState *s)
+{
+    int current_active_state = s->state;
+
+    /*
+     * Stop tracking RAM writes - un-protect memory, un-register UFFD
+     * memory ranges, flush kernel wait queues and wake up threads
+     * waiting for write fault to be resolved.
+     */
+    ram_write_tracking_stop();
+
+    if (s->state == MIGRATION_STATUS_ACTIVE) {
+        /*
+         * By this moment we have RAM content saved into the migration stream.
+         * The next step is to flush the non-RAM content (device state)
+         * right after the ram content. The device state has been stored into
+         * the temporary buffer before RAM saving started.
+         */
+        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
+        qemu_fflush(s->to_dst_file);
+    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
+        goto fail;
+    }
+
+    if (qemu_file_get_error(s->to_dst_file)) {
+        trace_migration_completion_file_err();
+        goto fail;
+    }
+
+    migrate_set_state(&s->state, current_active_state,
+                      MIGRATION_STATUS_COMPLETED);
+    return;
+
+fail:
+    migrate_set_state(&s->state, current_active_state,
+                      MIGRATION_STATUS_FAILED);
+}
+
  bool migrate_colo_enabled(void)
  {
      MigrationState *s = migrate_get_current();
@@ -3557,6 +3602,47 @@ static void migration_iteration_finish(MigrationState *s)
      qemu_mutex_unlock_iothread();
  }
  
+static void bg_migration_iteration_finish(MigrationState *s)
+{
+    qemu_mutex_lock_iothread();
+    switch (s->state) {
+    case MIGRATION_STATUS_COMPLETED:
+        migration_calculate_complete(s);
+        break;
+
+    case MIGRATION_STATUS_ACTIVE:
+    case MIGRATION_STATUS_FAILED:
+    case MIGRATION_STATUS_CANCELLED:
+    case MIGRATION_STATUS_CANCELLING:
+        break;
+
+    default:
+        /* Should not reach here, but if so, forgive the VM. */
+        error_report("%s: Unknown ending state %d", __func__, s->state);
+        break;
+    }
+
+    migrate_fd_cleanup_schedule(s);
+    qemu_mutex_unlock_iothread();
+}
+
+/*
+ * Return true if continue to the next iteration directly, false
+ * otherwise.
+ */
+static MigIterateState bg_migration_iteration_run(MigrationState *s)
+{
+    int res;
+
+    res = qemu_savevm_state_iterate(s->to_dst_file, false);
+    if (res > 0) {
+        bg_migration_completion(s);
+        return MIG_ITERATE_BREAK;
+    }
+
+    return MIG_ITERATE_RESUME;
+}
+
  void migration_make_urgent_request(void)
  {
      qemu_sem_post(&migrate_get_current()->rate_limit_sem);
@@ -3704,6 +3790,165 @@ static void *migration_thread(void *opaque)
      return NULL;
  }
  
+static void bg_migration_vm_start_bh(void *opaque)
+{
+    MigrationState *s = opaque;
+
+    qemu_bh_delete(s->vm_start_bh);
+    s->vm_start_bh = NULL;
+
+    vm_start();
+    s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
+}
+
+/**
+ * Background snapshot thread, based on live migration code.
+ * This is an alternative implementation of live migration mechanism
+ * introduced specifically to support background snapshots.
+ *
+ * It takes advantage of userfault_fd write protection mechanism introduced
+ * in v5.7 kernel. Compared to existing dirty page logging migration much
+ * lesser stream traffic is produced resulting in smaller snapshot images,
+ * simply cause of no page duplicates can get into the stream.
+ *
+ * Another key point is that generated vmstate stream reflects machine state
+ * 'frozen' at the beginning of snapshot creation compared to dirty page logging
+ * mechanism, which effectively results in that saved snapshot is the state of VM
+ * at the end of the process.
+ */
+static void *bg_migration_thread(void *opaque)
+{
+    MigrationState *s = opaque;
+    int64_t setup_start;
+    MigThrError thr_error;
+    QEMUFile *fb;
+    bool early_fail = true;
+
+    rcu_register_thread();
+    object_ref(OBJECT(s));
+
+    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+
+    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    /*
+     * We want to save vmstate for the moment when migration has been
+     * initiated but also we want to save RAM content while VM is running.
+     * The RAM content should appear first in the vmstate. So, we first
+     * stash the non-RAM part of the vmstate to the temporary buffer,
+     * then write RAM part of the vmstate to the migration stream
+     * with vCPUs running and, finally, write stashed non-RAM part of
+     * the vmstate from the buffer to the migration stream.
+     */
+    s->bioc = qio_channel_buffer_new(128 * 1024);
+    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
+    fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
+    object_unref(OBJECT(s->bioc));
+
+    update_iteration_initial_status(s);
+
+    qemu_savevm_state_header(s->to_dst_file);
+    qemu_savevm_state_setup(s->to_dst_file);
+
+    if (qemu_savevm_state_guest_unplug_pending()) {
+        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+                          MIGRATION_STATUS_WAIT_UNPLUG);
+
+        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
+               qemu_savevm_state_guest_unplug_pending()) {
+            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+        }
+
+        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
+                          MIGRATION_STATUS_ACTIVE);
+    } else {
+        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+                MIGRATION_STATUS_ACTIVE);
+    }
+    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+
+    trace_migration_thread_setup_complete();
+    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+    qemu_mutex_lock_iothread();
+
+    /*
+     * If VM is currently in suspended state, then, to make a valid runstate
+     * transition in vm_stop_force_state() we need to wakeup it up.
+     */
+    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
+    s->vm_was_running = runstate_is_running();
+
+    if (global_state_store()) {
+        goto fail;
+    }
+    /* Forcibly stop VM before saving state of vCPUs and devices */
+    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
+        goto fail;
+    }
+    /*
+     * Put vCPUs in sync with shadow context structures, then
+     * save their state to channel-buffer along with devices.
+     */
+    cpu_synchronize_all_states();
+    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
+        goto fail;
+    }
+    /* Now initialize UFFD context and start tracking RAM writes */
+    if (ram_write_tracking_start()) {
+        goto fail;
+    }
+    early_fail = false;
+
+    /*
+     * Start VM from BH handler to avoid write-fault lock here.
+     * UFFD-WP protection for the whole RAM is already enabled so
+     * calling VM state change notifiers from vm_start() would initiate
+     * writes to virtio VQs memory which is in write-protected region.
+     */
+    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
+    qemu_bh_schedule(s->vm_start_bh);
+
+    qemu_mutex_unlock_iothread();
+
+    while (migration_is_active(s)) {
+        MigIterateState iter_state = bg_migration_iteration_run(s);
+        if (iter_state == MIG_ITERATE_SKIP) {
+            continue;
+        } else if (iter_state == MIG_ITERATE_BREAK) {
+            break;
+        }
+
+        /*
+         * Try to detect any kind of failures, and see whether we
+         * should stop the migration now.
+         */
+        thr_error = migration_detect_error(s);
+        if (thr_error == MIG_THR_ERR_FATAL) {
+            /* Stop migration */
+            break;
+        }
+
+        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
+    }
+
+    trace_migration_thread_after_loop();
+
+fail:
+    if (early_fail) {
+        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
+                MIGRATION_STATUS_FAILED);
+        qemu_mutex_unlock_iothread();
+    }
+
+    bg_migration_iteration_finish(s);
+
+    qemu_fclose(fb);
+    object_unref(OBJECT(s));
+    rcu_unregister_thread();
+
+    return NULL;
+}
+
  void migrate_fd_connect(MigrationState *s, Error *error_in)
  {
      Error *local_err = NULL;
@@ -3767,8 +4012,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
          migrate_fd_cleanup(s);
          return;
      }
-    qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
-                       QEMU_THREAD_JOINABLE);
+
+    if (migrate_background_snapshot()) {
+        qemu_thread_create(&s->thread, "bg_snapshot",
+                bg_migration_thread, s, QEMU_THREAD_JOINABLE);
+    } else {
+        qemu_thread_create(&s->thread, "live_migration",
+                migration_thread, s, QEMU_THREAD_JOINABLE);
+    }
      s->migration_thread_running = true;
  }
  
diff --git a/migration/migration.h b/migration/migration.h

index f40338cfbf21c18b234edc3b6d3000d4b2c5cf5d..0723955cd787b345e4a8dc0b2f3fca67c6e429c6 100644 (file)
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -20,6 +20,7 @@
  #include "qemu/thread.h"
  #include "qemu/coroutine_int.h"
  #include "io/channel.h"
+#include "io/channel-buffer.h"
  #include "net/announce.h"
  #include "qom/object.h"
  
@@ -147,8 +148,10 @@ struct MigrationState {
  
      /*< public >*/
      QemuThread thread;
+    QEMUBH *vm_start_bh;
      QEMUBH *cleanup_bh;
      QEMUFile *to_dst_file;
+    QIOChannelBuffer *bioc;
      /*
       * Protects to_dst_file pointer.  We need to make sure we won't
       * yield or hang during the critical section, since this lock will
diff --git a/migration/savevm.c b/migration/savevm.c

index d1e6aaed6054968b48fc7c0f3a6843a43e69feb8..d5bf53388fe3a736b0b98664f1d95ccb85f536ce 100644 (file)
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1378,7 +1378,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
      return 0;
  }
  
-static
  int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                      bool in_postcopy,
                                                      bool inactivate_disks)
diff --git a/migration/savevm.h b/migration/savevm.h

index ba64a7e27115907092d60ca871f464c9ce8591d9..aaee2528ed0101abe5e9917be92b161bbe204a51 100644 (file)
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -64,5 +64,7 @@ int qemu_loadvm_state(QEMUFile *f);
  void qemu_loadvm_state_cleanup(void);
  int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
  int qemu_load_device_state(QEMUFile *f);
+int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
+        bool in_postcopy, bool inactivate_disks);
  
  #endif
author	Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
	Fri, 29 Jan 2021 10:14:06 +0000 (13:14 +0300)
committer	Dr. David Alan Gilbert <dgilbert@redhat.com>
	Mon, 8 Feb 2021 11:19:51 +0000 (11:19 +0000)
migration/migration.c		patch \| blob \| history
migration/migration.h		patch \| blob \| history
migration/savevm.c		patch \| blob \| history
migration/savevm.h		patch \| blob \| history