        ulong scratch1;
        u8 in_guest;
        u8 restore_hid5;
+       u8 napping;
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        struct kvm_vcpu *kvm_vcpu;
 
  */
 struct kvmppc_vcore {
        int n_runnable;
-       int n_blocked;
+       int n_busy;
        int num_threads;
        int entry_exit_count;
        int n_woken;
        int nap_count;
+       int napping_threads;
        u16 pcpu;
-       u8 vcore_running;
+       u8 vcore_state;
        u8 in_guest;
        struct list_head runnable_threads;
        spinlock_t lock;
+       wait_queue_head_t wq;
 };
 
 #define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
 #define VCORE_EXIT_COUNT(vc)   ((vc)->entry_exit_count >> 8)
 
+/* Values for vcore_state */
+#define VCORE_INACTIVE 0
+#define VCORE_RUNNING  1
+#define VCORE_EXITING  2
+#define VCORE_SLEEPING 3
+
 struct kvmppc_pte {
        ulong eaddr;
        u64 vpage;
        struct dtl *dtl;
        struct dtl *dtl_end;
 
+       wait_queue_head_t *wqp;
        struct kvmppc_vcore *vcore;
        int ret;
        int trap;
        int state;
        int ptid;
+       bool timer_running;
        wait_queue_head_t cpu_run;
 
        struct kvm_vcpu_arch_shared *shared;
 #endif
 };
 
-#define KVMPPC_VCPU_BUSY_IN_HOST       0
-#define KVMPPC_VCPU_BLOCKED            1
+/* Values for vcpu->arch.state */
+#define KVMPPC_VCPU_STOPPED            0
+#define KVMPPC_VCPU_BUSY_IN_HOST       1
 #define KVMPPC_VCPU_RUNNABLE           2
 
 #endif /* __POWERPC_KVM_HOST_H__ */
 
 #include <asm/compat.h>
 #include <asm/mmu.h>
 #include <asm/hvcall.h>
+#include <asm/xics.h>
 #endif
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
        DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
        DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
        DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
+       DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
+       DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
        DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
        DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
        DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
        DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
        DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
+       DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
        DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
                           offsetof(struct kvmppc_vcpu_book3s, vcpu));
        DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
        HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
        HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
        HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
+       HSTATE_FIELD(HSTATE_NAPPING, napping);
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
        HSTATE_FIELD(HSTATE_DSCR, host_dscr);
        HSTATE_FIELD(HSTATE_DABR, dabr);
        HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+       DEFINE(IPI_PRIORITY, IPI_PRIORITY);
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
 #else /* CONFIG_PPC_BOOK3S */
 
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
 
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        local_paca->kvm_hstate.kvm_vcpu = vcpu;
 {
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
-
-void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
-{
-       u64 now;
-       unsigned long dec_nsec;
-
-       now = get_tb();
-       if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
-               kvmppc_core_queue_dec(vcpu);
-       if (vcpu->arch.pending_exceptions)
-               return;
-       if (vcpu->arch.dec_expires != ~(u64)0) {
-               dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
-                       tb_ticks_per_sec;
-               hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
-                             HRTIMER_MODE_REL);
-       }
-
-       kvmppc_vcpu_blocked(vcpu);
-
-       kvm_vcpu_block(vcpu);
-       vcpu->stat.halt_wakeup++;
-
-       if (vcpu->arch.dec_expires != ~(u64)0)
-               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-
-       kvmppc_vcpu_unblocked(vcpu);
-}
-
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
        vcpu->arch.shregs.msr = msr;
+       kvmppc_end_cede(vcpu);
 }
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
 
        switch (req) {
        case H_CEDE:
-               vcpu->arch.shregs.msr |= MSR_EE;
-               vcpu->arch.ceded = 1;
-               smp_mb();
-               if (!vcpu->arch.prodded)
-                       kvmppc_vcpu_block(vcpu);
-               else
-                       vcpu->arch.prodded = 0;
-               smp_mb();
-               vcpu->arch.ceded = 0;
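+               /* the cede itself is now handled in real mode (kvmppc_h_cede) */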
                break;
        case H_PROD:
                target = kvmppc_get_gpr(vcpu, 4);
                break;
        }
 
-
-       if (!(r & RESUME_HOST)) {
-               /* To avoid clobbering exit_reason, only check for signals if
-                * we aren't already exiting to userspace for some other
-                * reason. */
-               if (signal_pending(tsk)) {
-                       vcpu->stat.signal_exits++;
-                       run->exit_reason = KVM_EXIT_INTR;
-                       r = -EINTR;
-               } else {
-                       kvmppc_core_deliver_interrupts(vcpu);
-               }
-       }
-
        return r;
 }
 
        kvmppc_mmu_book3s_hv_init(vcpu);
 
        /*
-        * Some vcpus may start out in stopped state.  If we initialize
-        * them to busy-in-host state they will stop other vcpus in the
-        * vcore from running.  Instead we initialize them to blocked
-        * state, effectively considering them to be stopped until we
-        * see the first run ioctl for them.
+        * We consider the vcpu stopped until we see the first run ioctl for it.
         */
-       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+       vcpu->arch.state = KVMPPC_VCPU_STOPPED;
 
        init_waitqueue_head(&vcpu->arch.cpu_run);
 
                if (vcore) {
                        INIT_LIST_HEAD(&vcore->runnable_threads);
                        spin_lock_init(&vcore->lock);
+                       init_waitqueue_head(&vcore->wq);
                }
                kvm->arch.vcores[core] = vcore;
        }
 
        spin_lock(&vcore->lock);
        ++vcore->num_threads;
-       ++vcore->n_blocked;
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;
 
        kfree(vcpu);
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long dec_nsec, now;
 
-       spin_lock(&vc->lock);
-       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
-       ++vc->n_blocked;
-       if (vc->n_runnable > 0 &&
-           vc->n_runnable + vc->n_blocked == vc->num_threads) {
-               vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-                                       arch.run_list);
-               wake_up(&vcpu->arch.cpu_run);
+       now = get_tb();
+       if (now > vcpu->arch.dec_expires) {
+               /* decrementer has already gone negative */
+               kvmppc_core_queue_dec(vcpu);
+               kvmppc_core_deliver_interrupts(vcpu);
+               return;
        }
-       spin_unlock(&vc->lock);
+       dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
+                  / tb_ticks_per_sec;
+       hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+                     HRTIMER_MODE_REL);
+       vcpu->arch.timer_running = 1;
 }
 
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
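+/*
+ * Cancel a pending cede: clear the ceded flag and cancel any hrtimer
+ * that was armed to wake the vcpu when its decrementer expires.
+ */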
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
-
-       spin_lock(&vc->lock);
-       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
-       --vc->n_blocked;
-       spin_unlock(&vc->lock);
+       vcpu->arch.ceded = 0;
+       if (vcpu->arch.timer_running) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               vcpu->arch.timer_running = 0;
+       }
 }
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
                return;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
        --vc->n_runnable;
+       ++vc->n_busy;
        /* decrement the physical thread id of each following vcpu */
        v = vcpu;
        list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
        struct paca_struct *tpaca;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+       if (vcpu->arch.timer_running) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               vcpu->arch.timer_running = 0;
+       }
        cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.kvm_vcore = vc;
+       tpaca->kvm_hstate.napping = 0;
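+       /* record the physical cpu so interrupt delivery can IPI it */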
+       vcpu->cpu = vc->pcpu;
        smp_wmb();
 #ifdef CONFIG_PPC_ICP_NATIVE
        if (vcpu->arch.ptid) {
                tpaca->cpu_start = 0x80;
-               tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
                wmb();
                xics_wake_cpu(cpu);
                ++vc->n_woken;
  */
 static int kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu, *vcpu0, *vnext;
        long ret;
        u64 now;
+       int ptid;
 
        /* don't start if any threads have a signal pending */
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                goto out;
        }
 
+       /*
+        * Assign physical thread IDs, first to non-ceded vcpus
+        * and then to ceded ones.
+        */
+       ptid = 0;
+       vcpu0 = NULL;
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+               if (!vcpu->arch.ceded) {
+                       if (!ptid)
+                               vcpu0 = vcpu;
+                       vcpu->arch.ptid = ptid++;
+               }
+       }
+       if (!vcpu0)
+               return 0;               /* nothing to run */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               if (vcpu->arch.ceded)
+                       vcpu->arch.ptid = ptid++;
+
        vc->n_woken = 0;
        vc->nap_count = 0;
        vc->entry_exit_count = 0;
-       vc->vcore_running = 1;
+       vc->vcore_state = VCORE_RUNNING;
        vc->in_guest = 0;
        vc->pcpu = smp_processor_id();
+       vc->napping_threads = 0;
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
                kvmppc_start_thread(vcpu);
-       vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-                               arch.run_list);
 
+       preempt_disable();
        spin_unlock(&vc->lock);
 
-       preempt_disable();
        kvm_guest_enter();
-       __kvmppc_vcore_entry(NULL, vcpu);
+       __kvmppc_vcore_entry(NULL, vcpu0);
 
-       /* wait for secondary threads to finish writing their state to memory */
        spin_lock(&vc->lock);
+       /* disable sending of IPIs on virtual external irqs */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               vcpu->cpu = -1;
+       /* wait for secondary threads to finish writing their state to memory */
        if (vc->nap_count < vc->n_woken)
                kvmppc_wait_for_nap(vc);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
-       vc->vcore_running = 2;
+       vc->vcore_state = VCORE_EXITING;
        spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
                        kvmppc_core_dequeue_dec(vcpu);
-               if (!vcpu->arch.trap) {
-                       if (signal_pending(vcpu->arch.run_task)) {
-                               vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
-                               vcpu->arch.ret = -EINTR;
-                       }
-                       continue;               /* didn't get to run */
-               }
-               ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
-                                        vcpu->arch.run_task);
+
+               ret = RESUME_GUEST;
+               if (vcpu->arch.trap)
+                       ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+                                                vcpu->arch.run_task);
+
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
+
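+               /*
+                * If the vcpu is still ceded, either arm the hrtimer to wake
+                * it when its decrementer expires (if we are resuming the
+                * guest) or cancel the cede (if we are exiting to the host).
+                */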
+               if (vcpu->arch.ceded) {
+                       if (ret != RESUME_GUEST)
+                               kvmppc_end_cede(vcpu);
+                       else
+                               kvmppc_set_timer(vcpu);
+               }
        }
 
        spin_lock(&vc->lock);
  out:
-       vc->vcore_running = 0;
+       vc->vcore_state = VCORE_INACTIVE;
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
                if (vcpu->arch.ret != RESUME_GUEST) {
        return 1;
 }
 
-static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+/*
+ * Wait for some other vcpu thread to execute us, and
+ * wake us up when we need to handle something in the host.
+ */
+static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
 {
-       int ptid;
-       int wait_state;
-       struct kvmppc_vcore *vc;
        DEFINE_WAIT(wait);
 
-       /* No need to go into the guest when all we do is going out */
-       if (signal_pending(current)) {
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               return -EINTR;
+       prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+               schedule();
+       finish_wait(&vcpu->arch.cpu_run, &wait);
+}
+
+/*
+ * All the vcpus in this vcore are idle, so wait for a decrementer
+ * or external interrupt to one of the vcpus.  vc->lock is held.
+ */
+static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
+{
+       DEFINE_WAIT(wait);
+       struct kvm_vcpu *v;
+       int all_idle = 1;
+
+       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       vc->vcore_state = VCORE_SLEEPING;
+       spin_unlock(&vc->lock);
+       list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+               if (!v->arch.ceded || v->arch.pending_exceptions) {
+                       all_idle = 0;
+                       break;
+               }
        }
+       if (all_idle)
+               schedule();
+       finish_wait(&vc->wq, &wait);
+       spin_lock(&vc->lock);
+       vc->vcore_state = VCORE_INACTIVE;
+}
 
-       /* On PPC970, check that we have an RMA region */
-       if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
-               return -EPERM;
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int n_ceded;
+       int prev_state;
+       struct kvmppc_vcore *vc;
+       struct kvm_vcpu *v, *vn;
 
        kvm_run->exit_reason = 0;
        vcpu->arch.ret = RESUME_GUEST;
        vcpu->arch.trap = 0;
 
-       flush_fp_to_thread(current);
-       flush_altivec_to_thread(current);
-       flush_vsx_to_thread(current);
-
        /*
         * Synchronize with other threads in this virtual core
         */
        vc = vcpu->arch.vcore;
        spin_lock(&vc->lock);
-       /* This happens the first time this is called for a vcpu */
-       if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
-               --vc->n_blocked;
-       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
-       ptid = vc->n_runnable;
+       vcpu->arch.ceded = 0;
        vcpu->arch.run_task = current;
        vcpu->arch.kvm_run = kvm_run;
-       vcpu->arch.ptid = ptid;
+       prev_state = vcpu->arch.state;
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
        list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
        ++vc->n_runnable;
 
-       wait_state = TASK_INTERRUPTIBLE;
-       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
-               if (signal_pending(current)) {
-                       if (!vc->vcore_running) {
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               vcpu->arch.ret = -EINTR;
-                               break;
-                       }
-                       /* have to wait for vcore to stop executing guest */
-                       wait_state = TASK_UNINTERRUPTIBLE;
-                       smp_send_reschedule(vc->pcpu);
+       /*
+        * This happens the first time this is called for a vcpu.
+        * If the vcore is already running, we may be able to start
+        * this thread straight away and have it join in.
+        */
+       if (prev_state == KVMPPC_VCPU_STOPPED) {
+               if (vc->vcore_state == VCORE_RUNNING &&
+                   VCORE_EXIT_COUNT(vc) == 0) {
+                       vcpu->arch.ptid = vc->n_runnable - 1;
+                       kvmppc_start_thread(vcpu);
                }
 
-               if (!vc->vcore_running &&
-                   vc->n_runnable + vc->n_blocked == vc->num_threads) {
-                       /* we can run now */
-                       if (kvmppc_run_core(vc))
-                               continue;
-               }
+       } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
+               --vc->n_busy;
 
-               if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
-                       kvmppc_start_thread(vcpu);
+       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+              !signal_pending(current)) {
+               if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+                       spin_unlock(&vc->lock);
+                       kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
+                       spin_lock(&vc->lock);
+                       continue;
+               }
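+               /*
+                * If all the runnable vcpus have ceded, put the vcore to
+                * sleep; otherwise run the whole vcore in the guest.
+                */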
+               n_ceded = 0;
+               list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+                       n_ceded += v->arch.ceded;
+               if (n_ceded == vc->n_runnable)
+                       kvmppc_vcore_blocked(vc);
+               else
+                       kvmppc_run_core(vc);
+
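+               /*
+                * Update pending interrupts for each vcpu, and take any vcpu
+                * whose task has a signal pending off the runnable list so
+                * its task can return to userspace.
+                */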
+               list_for_each_entry_safe(v, vn, &vc->runnable_threads,
+                                        arch.run_list) {
+                       kvmppc_core_deliver_interrupts(v);
+                       if (signal_pending(v->arch.run_task)) {
+                               kvmppc_remove_runnable(vc, v);
+                               v->stat.signal_exits++;
+                               v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+                               v->arch.ret = -EINTR;
+                               wake_up(&v->arch.cpu_run);
+                       }
+               }
+       }
 
-               /* wait for other threads to come in, or wait for vcore */
-               prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-               spin_unlock(&vc->lock);
-               schedule();
-               finish_wait(&vcpu->arch.cpu_run, &wait);
-               spin_lock(&vc->lock);
+       if (signal_pending(current)) {
+               if (vc->vcore_state == VCORE_RUNNING ||
+                   vc->vcore_state == VCORE_EXITING) {
+                       spin_unlock(&vc->lock);
+                       kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+                       spin_lock(&vc->lock);
+               }
+               if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       vcpu->stat.signal_exits++;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       vcpu->arch.ret = -EINTR;
+               }
        }
 
-       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
-               kvmppc_remove_runnable(vc, vcpu);
        spin_unlock(&vc->lock);
-
        return vcpu->arch.ret;
 }
 
                return -EINVAL;
        }
 
+       /* No need to go into the guest when all we'll do is come back out */
+       if (signal_pending(current)) {
+               run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* On PPC970, check that we have an RMA region */
+       if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+               return -EPERM;
+
+       flush_fp_to_thread(current);
+       flush_altivec_to_thread(current);
+       flush_vsx_to_thread(current);
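+       /* HV guest vcpus sleep on their vcore's wait queue, not vcpu->wq */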
+       vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+
        do {
                r = kvmppc_run_vcpu(run, vcpu);
 
 
        b       .
 
 /*
- * Call kvmppc_handler_trampoline_enter in real mode.
+ * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
  *
  * Input Registers:
 kvm_start_guest:
        ld      r1,PACAEMERGSP(r13)
        subi    r1,r1,STACK_FRAME_OVERHEAD
+       ld      r2,PACATOC(r13)
+
+       /* were we napping due to cede? */
+       lbz     r0,HSTATE_NAPPING(r13)
+       cmpwi   r0,0
+       bne     kvm_end_cede
 
        /* get vcpu pointer */
        ld      r4, HSTATE_KVM_VCPU(r13)
        cmpwi   r0,0
        beq     20b
 
-       /* Set LPCR.  Set the MER bit if there is a pending external irq. */
+       /* Set LPCR and RMOR. */
 10:    ld      r8,KVM_LPCR(r9)
-       ld      r0,VCPU_PENDING_EXC(r4)
-       li      r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
-       oris    r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
-       and.    r0,r0,r7
-       beq     11f
-       ori     r8,r8,LPCR_MER
-11:    mtspr   SPRN_LPCR,r8
+       mtspr   SPRN_LPCR,r8
        ld      r8,KVM_RMOR(r9)
        mtspr   SPRN_RMOR,r8
        isync
        mtctr   r6
        mtxer   r7
 
-       /* Move SRR0 and SRR1 into the respective regs */
+kvmppc_cede_reentry:           /* r4 = vcpu, r13 = paca */
        ld      r6, VCPU_SRR0(r4)
        ld      r7, VCPU_SRR1(r4)
-       mtspr   SPRN_SRR0, r6
-       mtspr   SPRN_SRR1, r7
-
        ld      r10, VCPU_PC(r4)
+       ld      r11, VCPU_MSR(r4)       /* r11 = vcpu->arch.msr & ~MSR_HV */
 
-       ld      r11, VCPU_MSR(r4)       /* r10 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
        ori     r11, r11, MSR_ME
 
+       /* Check if we can deliver an external or decrementer interrupt now */
+       ld      r0,VCPU_PENDING_EXC(r4)
+       li      r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+       oris    r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       and     r0,r0,r8
+       cmpdi   cr1,r0,0
+       andi.   r0,r11,MSR_EE
+       beq     cr1,11f
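+       /*
+        * An external irq is pending: on POWER7 set LPCR_MER so the guest
+        * sees it as soon as it sets MSR_EE, and deliver it directly below
+        * if MSR_EE is already set.
+        */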
+BEGIN_FTR_SECTION
+       mfspr   r8,SPRN_LPCR
+       ori     r8,r8,LPCR_MER
+       mtspr   SPRN_LPCR,r8
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+       beq     5f
+       li      r0,BOOK3S_INTERRUPT_EXTERNAL
+12:    mr      r6,r10
+       mr      r10,r0
+       mr      r7,r11
+       li      r11,(MSR_ME << 1) | 1   /* synthesize MSR_SF | MSR_ME */
+       rotldi  r11,r11,63
+       b       5f
+11:    beq     5f
+       mfspr   r0,SPRN_DEC
+       cmpwi   r0,0
+       li      r0,BOOK3S_INTERRUPT_DECREMENTER
+       blt     12b
+
+       /* Move SRR0 and SRR1 into the respective regs */
+5:     mtspr   SPRN_SRR0, r6
+       mtspr   SPRN_SRR1, r7
+       li      r0,0
+       stb     r0,VCPU_CEDED(r4)       /* cancel cede */
+
 fast_guest_return:
        mtspr   SPRN_HSRR0,r10
        mtspr   SPRN_HSRR1,r11
        /* See if this is something we can handle in real mode */
        cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
        beq     hcall_try_real_mode
-hcall_real_cont:
 
        /* Check for mediated interrupts (could be done earlier really ...) */
 BEGIN_FTR_SECTION
        cmpwi   r12,BOOK3S_INTERRUPT_EXTERNAL
        bne+    1f
-       ld      r5,VCPU_KVM(r9)
-       ld      r5,KVM_LPCR(r5)
        andi.   r0,r11,MSR_EE
        beq     1f
+       mfspr   r5,SPRN_LPCR
        andi.   r0,r5,LPCR_MER
        bne     bounce_ext_interrupt
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
+hcall_real_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
        /* Save DEC */
        mfspr   r5,SPRN_DEC
        mftb    r6
        slbia
        ptesync
 
-hdec_soon:
+hdec_soon:                     /* r9 = vcpu, r12 = trap, r13 = paca */
 BEGIN_FTR_SECTION
        b       32f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        addi    r0,r3,0x100
        stwcx.  r0,0,r6
        bne     41b
+       lwsync
 
        /*
         * At this point we have an interrupt that we have to pass
         * interrupt, since the other threads will already be on their
         * way here in that case.
         */
+       cmpwi   r3,0x100        /* Are we the first here? */
+       bge     43f
+       cmpwi   r3,1            /* Are any other threads in the guest? */
+       ble     43f
        cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
        beq     40f
-       cmpwi   r3,0x100        /* Are we the first here? */
-       bge     40f
-       cmpwi   r3,1
-       ble     40f
        li      r0,0
        mtspr   SPRN_HDEC,r0
 40:
+       /*
+        * Send an IPI to any napping threads, since an HDEC interrupt
+        * doesn't wake CPUs up from nap.
+        */
+       lwz     r3,VCORE_NAPPING_THREADS(r5)
+       lwz     r4,VCPU_PTID(r9)
+       li      r0,1
+       sld     r0,r0,r4
+       andc.   r3,r3,r0                /* no sense IPI'ing ourselves */
+       beq     43f
+       mulli   r4,r4,PACA_SIZE         /* get paca for thread 0 */
+       subf    r6,r4,r13
+42:    andi.   r0,r3,1
+       beq     44f
+       ld      r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
+       li      r0,IPI_PRIORITY
+       li      r7,XICS_QIRR
+       stbcix  r0,r7,r8                /* trigger the IPI */
+44:    srdi.   r3,r3,1
+       addi    r6,r6,PACA_SIZE
+       bne     42b
 
        /* Secondary threads wait for primary to do partition switch */
-       ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
+43:    ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
        ld      r5,HSTATE_KVM_VCORE(r13)
        lwz     r3,VCPU_PTID(r9)
        cmpwi   r3,0
 hcall_real_fallback:
        li      r12,BOOK3S_INTERRUPT_SYSCALL
        ld      r9, HSTATE_KVM_VCPU(r13)
-       ld      r11, VCPU_MSR(r9)
 
        b       hcall_real_cont
 
        .long   0               /* 0xd4 */
        .long   0               /* 0xd8 */
        .long   0               /* 0xdc */
-       .long   0               /* 0xe0 */
+       .long   .kvmppc_h_cede - hcall_real_table
        .long   0               /* 0xe4 */
        .long   0               /* 0xe8 */
        .long   0               /* 0xec */
        mtspr   SPRN_SRR0,r10
        mtspr   SPRN_SRR1,r11
        li      r10,BOOK3S_INTERRUPT_EXTERNAL
-       LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+       li      r11,(MSR_ME << 1) | 1   /* synthesize MSR_SF | MSR_ME */
+       rotldi  r11,r11,63
        b       fast_guest_return
 
 _GLOBAL(kvmppc_h_set_dabr)
        li      r3,0
        blr
 
+_GLOBAL(kvmppc_h_cede)
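+       /*
+        * The guest has done an H_CEDE hcall: set MSR_EE in the guest MSR,
+        * mark the vcpu as ceded and, unless it has already been prodded,
+        * try to nap this hardware thread (on POWER7) until it has
+        * something to do.
+        * r3 = vcpu pointer, r11 = guest MSR.
+        */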
+       ori     r11,r11,MSR_EE
+       std     r11,VCPU_MSR(r3)
+       li      r0,1
+       stb     r0,VCPU_CEDED(r3)
+       sync                    /* order setting ceded vs. testing prodded */
+       lbz     r5,VCPU_PRODDED(r3)
+       cmpwi   r5,0
+       bne     1f
+       li      r0,0            /* set trap to 0 to say hcall is handled */
+       stw     r0,VCPU_TRAP(r3)
+       li      r0,H_SUCCESS
+       std     r0,VCPU_GPR(r3)(r3)
+BEGIN_FTR_SECTION
+       b       2f              /* just send it up to host on 970 */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
+
+       /*
+        * Set our bit in the bitmask of napping threads unless all the
+        * other threads are already napping, in which case we send this
+        * up to the host.
+        */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       lwz     r6,VCPU_PTID(r3)
+       lwz     r8,VCORE_ENTRY_EXIT(r5)
+       clrldi  r8,r8,56
+       li      r0,1
+       sld     r0,r0,r6
+       addi    r6,r5,VCORE_NAPPING_THREADS
+31:    lwarx   r4,0,r6
+       or      r4,r4,r0
+       popcntw r7,r4
+       cmpw    r7,r8
+       bge     2f
+       stwcx.  r4,0,r6
+       bne     31b
+       li      r0,1
+       stb     r0,HSTATE_NAPPING(r13)
+       /* order napping_threads update vs testing entry_exit_count */
+       lwsync
+       mr      r4,r3
+       lwz     r7,VCORE_ENTRY_EXIT(r5)
+       cmpwi   r7,0x100
+       bge     33f             /* another thread already exiting */
+
+/*
+ * Although not specifically required by the architecture, POWER7
+ * preserves the following registers in nap mode, even if an SMT mode
+ * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
+ * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
+ */
+       /* Save non-volatile GPRs */
+       std     r14, VCPU_GPR(r14)(r3)
+       std     r15, VCPU_GPR(r15)(r3)
+       std     r16, VCPU_GPR(r16)(r3)
+       std     r17, VCPU_GPR(r17)(r3)
+       std     r18, VCPU_GPR(r18)(r3)
+       std     r19, VCPU_GPR(r19)(r3)
+       std     r20, VCPU_GPR(r20)(r3)
+       std     r21, VCPU_GPR(r21)(r3)
+       std     r22, VCPU_GPR(r22)(r3)
+       std     r23, VCPU_GPR(r23)(r3)
+       std     r24, VCPU_GPR(r24)(r3)
+       std     r25, VCPU_GPR(r25)(r3)
+       std     r26, VCPU_GPR(r26)(r3)
+       std     r27, VCPU_GPR(r27)(r3)
+       std     r28, VCPU_GPR(r28)(r3)
+       std     r29, VCPU_GPR(r29)(r3)
+       std     r30, VCPU_GPR(r30)(r3)
+       std     r31, VCPU_GPR(r31)(r3)
+
+       /* save FP state */
+       bl      .kvmppc_save_fp
+
+       /*
+        * Take a nap until a decrementer or external interrupt occurs,
+        * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
+        */
+       li      r0,0x80
+       stb     r0,PACAPROCSTART(r13)
+       mfspr   r5,SPRN_LPCR
+       ori     r5,r5,LPCR_PECE0 | LPCR_PECE1
+       mtspr   SPRN_LPCR,r5
+       isync
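+       /*
+        * Standard pre-nap sequence: a store, ptesync, then a load and
+        * compare dependent on that store, so that prior stores are
+        * performed before the thread naps.
+        */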
+       li      r0, 0
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .
+
+kvm_end_cede:
+       /* Woken by external or decrementer interrupt */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+
+       /* If we're a secondary thread and we got here by an IPI, ack it */
+       ld      r4,HSTATE_KVM_VCPU(r13)
+       lwz     r3,VCPU_PTID(r4)
+       cmpwi   r3,0
+       beq     27f
+       mfspr   r3,SPRN_SRR1
+       rlwinm  r3,r3,44-31,0x7         /* extract wake reason field */
+       cmpwi   r3,4                    /* was it an external interrupt? */
+       bne     27f
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r0,0xff
+       li      r6,XICS_QIRR
+       li      r7,XICS_XIRR
+       lwzcix  r8,r5,r7                /* ack the interrupt */
+       sync
+       stbcix  r0,r5,r6                /* clear it */
+       stwcix  r8,r5,r7                /* EOI it */
+27:
+       /* load up FP state */
+       bl      kvmppc_load_fp
+
+       /* Load NV GPRS */
+       ld      r14, VCPU_GPR(r14)(r4)
+       ld      r15, VCPU_GPR(r15)(r4)
+       ld      r16, VCPU_GPR(r16)(r4)
+       ld      r17, VCPU_GPR(r17)(r4)
+       ld      r18, VCPU_GPR(r18)(r4)
+       ld      r19, VCPU_GPR(r19)(r4)
+       ld      r20, VCPU_GPR(r20)(r4)
+       ld      r21, VCPU_GPR(r21)(r4)
+       ld      r22, VCPU_GPR(r22)(r4)
+       ld      r23, VCPU_GPR(r23)(r4)
+       ld      r24, VCPU_GPR(r24)(r4)
+       ld      r25, VCPU_GPR(r25)(r4)
+       ld      r26, VCPU_GPR(r26)(r4)
+       ld      r27, VCPU_GPR(r27)(r4)
+       ld      r28, VCPU_GPR(r28)(r4)
+       ld      r29, VCPU_GPR(r29)(r4)
+       ld      r30, VCPU_GPR(r30)(r4)
+       ld      r31, VCPU_GPR(r31)(r4)
+
+       /* clear our bit in vcore->napping_threads */
+33:    ld      r5,HSTATE_KVM_VCORE(r13)
+       lwz     r3,VCPU_PTID(r4)
+       li      r0,1
+       sld     r0,r0,r3
+       addi    r6,r5,VCORE_NAPPING_THREADS
+32:    lwarx   r7,0,r6
+       andc    r7,r7,r0
+       stwcx.  r7,0,r6
+       bne     32b
+       li      r0,0
+       stb     r0,HSTATE_NAPPING(r13)
+
+       /* see if any other thread is already exiting */
+       lwz     r0,VCORE_ENTRY_EXIT(r5)
+       cmpwi   r0,0x100
+       blt     kvmppc_cede_reentry     /* if not go back to guest */
+
+       /* some threads are exiting, so go to the guest exit path */
+       b       hcall_real_fallback
+
+       /* cede when the vcpu has already been prodded */
+1:     li      r0,0
+       stb     r0,VCPU_PRODDED(r3)
+       sync                    /* order testing prodded vs. clearing ceded */
+       stb     r0,VCPU_CEDED(r3)
+       li      r3,H_SUCCESS
+       blr
+
+       /* we've ceded but we want to give control to the host */
+2:     li      r3,H_TOO_HARD
+       blr
+
 secondary_too_late:
        ld      r5,HSTATE_KVM_VCORE(r13)
        HMT_LOW
        slbmte  r6,r5
 1:     addi    r11,r11,16
        .endr
-       b       50f
 
 secondary_nap:
-       /* Clear any pending IPI */
-50:    ld      r5, HSTATE_XICS_PHYS(r13)
+       /* Clear any pending IPI - assume we're a secondary thread */
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r7, XICS_XIRR
+       lwzcix  r3, r5, r7              /* ack any pending interrupt */
+       rlwinm. r0, r3, 0, 0xffffff     /* any pending? */
+       beq     37f
+       sync
        li      r0, 0xff
        li      r6, XICS_QIRR
-       stbcix  r0, r5, r6
+       stbcix  r0, r5, r6              /* clear the IPI */
+       stwcix  r3, r5, r7              /* EOI it */
+37:    sync
 
        /* increment the nap count and then go to nap mode */
        ld      r4, HSTATE_KVM_VCORE(r13)
        addi    r3, r3, 1
        stwcx.  r3, 0, r4
        bne     51b
-       isync
 
+       li      r3, LPCR_PECE0
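+       /* wake on external interrupt only (PECE0), not on the decrementer */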
        mfspr   r4, SPRN_LPCR
-       li      r0, LPCR_PECE
-       andc    r4, r4, r0
-       ori     r4, r4, LPCR_PECE0      /* exit nap on interrupt */
+       rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
        mtspr   SPRN_LPCR, r4
+       isync
        li      r0, 0
        std     r0, HSTATE_SCRATCH0(r13)
        ptesync
 
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-#ifndef CONFIG_KVM_BOOK3S_64_HV
        return !(v->arch.shared->msr & MSR_WE) ||
               !!(v->arch.pending_exceptions);
-#else
-       return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
-#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 {
        struct kvm_vcpu *vcpu;
        vcpu = kvmppc_core_vcpu_create(kvm, id);
-       if (!IS_ERR(vcpu))
+       if (!IS_ERR(vcpu)) {
+               vcpu->arch.wqp = &vcpu->wq;
                kvmppc_create_vcpu_debugfs(vcpu, id);
+       }
        return vcpu;
 
        kvmppc_core_queue_dec(vcpu);
 
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
+       if (waitqueue_active(vcpu->arch.wqp)) {
+               wake_up_interruptible(vcpu->arch.wqp);
                vcpu->stat.halt_wakeup++;
        }
 }
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
-       if (irq->irq == KVM_INTERRUPT_UNSET)
+       if (irq->irq == KVM_INTERRUPT_UNSET) {
                kvmppc_core_dequeue_external(vcpu, irq);
-       else
-               kvmppc_core_queue_external(vcpu, irq);
+               return 0;
+       }
+
+       kvmppc_core_queue_external(vcpu, irq);
 
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
+       if (waitqueue_active(vcpu->arch.wqp)) {
+               wake_up_interruptible(vcpu->arch.wqp);
                vcpu->stat.halt_wakeup++;
        } else if (vcpu->cpu != -1) {
                smp_send_reschedule(vcpu->cpu);