 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/bvec.h>
 #include <linux/net.h>
                unsigned                cached_sq_head;
                unsigned                sq_entries;
                unsigned                sq_mask;
+               unsigned                sq_thread_idle; /* SQPOLL idle time before sleeping, in jiffies */
                struct io_uring_sqe     *sq_sqes;
        } ____cacheline_aligned_in_smp;
 
        /* IO offload */
        struct workqueue_struct *sqo_wq;
+       struct task_struct      *sqo_thread;    /* if using sq thread polling */
        struct mm_struct        *sqo_mm;
+       wait_queue_head_t       sqo_wait;       /* SQPOLL thread waits here when idle */
+       unsigned                sqo_stop;       /* set to tell the SQPOLL thread to exit */
 
        struct {
                /* CQ ring */
        unsigned short                  index;
        bool                            has_user;
        bool                            needs_lock;
+       bool                            needs_fixed_file;       /* only fixed (registered) files allowed */
 };
 
 struct io_kiocb {
 
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
+       if (waitqueue_active(&ctx->sqo_wait))
+               wake_up(&ctx->sqo_wait);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
        return false;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
                      bool force_nonblock, struct io_submit_state *state)
 {
+       const struct io_uring_sqe *sqe = s->sqe;
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw;
        unsigned ioprio, flags;
                kiocb->ki_filp = ctx->user_files[fd];
                req->flags |= REQ_F_FIXED_FILE;
        } else {
+               if (s->needs_fixed_file)
+                       return -EBADF;
                kiocb->ki_filp = io_file_get(state, fd);
                if (unlikely(!kiocb->ki_filp))
                        return -EBADF;
        struct file *file;
        ssize_t ret;
 
-       ret = io_prep_rw(req, s->sqe, force_nonblock, state);
+       ret = io_prep_rw(req, s, force_nonblock, state);
        if (ret)
                return ret;
        file = kiocb->ki_filp;
        struct file *file;
        ssize_t ret;
 
-       ret = io_prep_rw(req, s->sqe, force_nonblock, state);
+       ret = io_prep_rw(req, s, force_nonblock, state);
        if (ret)
                return ret;
        /* Hold on to the file for -EAGAIN */
        return false;
 }
 
+static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
+                         unsigned int nr, bool has_user, bool mm_fault)
+{
+       struct io_submit_state state, *statep = NULL;
+       int ret, i, submitted = 0;
+
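+       /* Batched submit state (plugging etc.) is only worth it for larger counts */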
+       if (nr > IO_PLUG_THRESHOLD) {
+               io_submit_state_start(&state, ctx, nr);
+               statep = &state;
+       }
+
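+       /*
+        * Submit in ring order. If grabbing the mm faulted, fail each SQE
+        * with -EFAULT; errors are reported back through the CQ ring.
+        */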
+       for (i = 0; i < nr; i++) {
+               if (unlikely(mm_fault)) {
+                       ret = -EFAULT;
+               } else {
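+                       /*
+                        * The SQ thread doesn't share the submitting task's
+                        * file table, so only fixed (registered) files may
+                        * be used from here.
+                        */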
+                       sqes[i].has_user = has_user;
+                       sqes[i].needs_lock = true;
+                       sqes[i].needs_fixed_file = true;
+                       ret = io_submit_sqe(ctx, &sqes[i], statep);
+               }
+               if (!ret) {
+                       submitted++;
+                       continue;
+               }
+
+               io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
+       }
+
+       if (statep)
+               io_submit_state_end(&state);
+
+       return submitted;
+}
+
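+/*
+ * Submission-side polling thread, used with IORING_SETUP_SQPOLL. It pulls
+ * SQEs off the SQ ring and submits them on behalf of the application,
+ * reaps completions for polled IO, and goes to sleep (after setting
+ * IORING_SQ_NEED_WAKEUP) once it has been idle for sq_thread_idle.
+ */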
+static int io_sq_thread(void *data)
+{
+       struct sqe_submit sqes[IO_IOPOLL_BATCH];
+       struct io_ring_ctx *ctx = data;
+       struct mm_struct *cur_mm = NULL;
+       mm_segment_t old_fs;
+       DEFINE_WAIT(wait);
+       unsigned inflight;
+       unsigned long timeout;
+
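+       /*
+        * Kernel threads run with KERNEL_DS; switch to USER_DS so the user
+        * space pointers in the SQEs get the usual access checks.
+        */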
+       old_fs = get_fs();
+       set_fs(USER_DS);
+
+       timeout = inflight = 0;
+       while (!kthread_should_stop() && !ctx->sqo_stop) {
+               bool all_fixed, mm_fault = false;
+               int i;
+
+               if (inflight) {
+                       unsigned nr_events = 0;
+
+                       if (ctx->flags & IORING_SETUP_IOPOLL) {
+                               /*
+                                * We disallow the app entering submit/complete
+                                * with polling, but we still need to lock the
+                                * ring to prevent racing with polled issue
+                                * that got punted to a workqueue.
+                                */
+                               mutex_lock(&ctx->uring_lock);
+                               io_iopoll_check(ctx, &nr_events, 0);
+                               mutex_unlock(&ctx->uring_lock);
+                       } else {
+                               /*
+                                * Normal IO, just pretend everything completed.
+                                * We don't have to poll completions for that.
+                                */
+                               nr_events = inflight;
+                       }
+
+                       inflight -= nr_events;
+                       if (!inflight)
+                               timeout = jiffies + ctx->sq_thread_idle;
+               }
+
+               if (!io_get_sqring(ctx, &sqes[0])) {
+                       /*
+                        * We're polling. If we're within the defined idle
+                        * period, then let us spin without work before going
+                        * to sleep.
+                        */
+                       if (inflight || !time_after(jiffies, timeout)) {
+                               cpu_relax();
+                               continue;
+                       }
+
+                       /*
+                        * Drop cur_mm before scheduling, we can't hold it for
+                        * long periods (or over schedule()). Do this before
+                        * adding ourselves to the waitqueue, as the unuse/drop
+                        * may sleep.
+                        */
+                       if (cur_mm) {
+                               unuse_mm(cur_mm);
+                               mmput(cur_mm);
+                               cur_mm = NULL;
+                       }
+
+                       prepare_to_wait(&ctx->sqo_wait, &wait,
+                                               TASK_INTERRUPTIBLE);
+
+                       /* Tell userspace we may need a wakeup call */
+                       ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+                       /* order the flag write before re-reading the SQ tail */
+                       smp_mb();
+
+                       if (!io_get_sqring(ctx, &sqes[0])) {
+                               if (kthread_should_stop()) {
+                                       finish_wait(&ctx->sqo_wait, &wait);
+                                       break;
+                               }
+                               if (signal_pending(current))
+                                       flush_signals(current);
+                               schedule();
+                               finish_wait(&ctx->sqo_wait, &wait);
+
+                               ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+                               smp_wmb();
+                               continue;
+                       }
+                       finish_wait(&ctx->sqo_wait, &wait);
+
+                       ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+                       smp_wmb();
+               }
+
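+               /*
+                * Fill the local batch with as many SQEs as are available,
+                * up to the array size, noting whether any of them need
+                * access to user memory.
+                */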
+               i = 0;
+               all_fixed = true;
+               do {
+                       if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
+                               all_fixed = false;
+
+                       i++;
+                       if (i == ARRAY_SIZE(sqes))
+                               break;
+               } while (io_get_sqring(ctx, &sqes[i]));
+
+               /* Unless all new commands use FIXED (pre-registered) buffers, grab mm */
+               if (!all_fixed && !cur_mm) {
+                       mm_fault = !mmget_not_zero(ctx->sqo_mm);
+                       if (!mm_fault) {
+                               use_mm(ctx->sqo_mm);
+                               cur_mm = ctx->sqo_mm;
+                       }
+               }
+
+               inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
+                                               mm_fault);
+
+               /* Commit SQ ring head once we've consumed all SQEs */
+               io_commit_sqring(ctx);
+       }
+
+       set_fs(old_fs);
+       if (cur_mm) {
+               unuse_mm(cur_mm);
+               mmput(cur_mm);
+       }
+       return 0;
+}
+
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
        struct io_submit_state state, *statep = NULL;
 
                s.has_user = true;
                s.needs_lock = false;
+               s.needs_fixed_file = false;
 
                ret = io_submit_sqe(ctx, &s, statep);
                if (ret) {
        return 0;
 }
 
+static void io_sq_thread_stop(struct io_ring_ctx *ctx)
+{
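+       /*
+        * Flag the thread to stop and wait for it to exit; kthread_stop()
+        * also wakes it if it is currently sleeping.
+        */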
+       if (ctx->sqo_thread) {
+               ctx->sqo_stop = 1;
+               mb();
+               kthread_stop(ctx->sqo_thread);
+               ctx->sqo_thread = NULL;
+       }
+}
+
 static void io_finish_async(struct io_ring_ctx *ctx)
 {
+       io_sq_thread_stop(ctx);
+
        if (ctx->sqo_wq) {
                destroy_workqueue(ctx->sqo_wq);
                ctx->sqo_wq = NULL;
        return ret;
 }
 
-static int io_sq_offload_start(struct io_ring_ctx *ctx)
+static int io_sq_offload_start(struct io_ring_ctx *ctx,
+                              struct io_uring_params *p)
 {
        int ret;
 
+       init_waitqueue_head(&ctx->sqo_wait);
        mmgrab(current->mm);
        ctx->sqo_mm = current->mm;
 
+       ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
+       if (!ctx->sq_thread_idle)
+               ctx->sq_thread_idle = HZ;
+
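+       /* sq_thread_cpu must name a possible CPU */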
+       ret = -EINVAL;
+       if (!cpu_possible(p->sq_thread_cpu))
+               goto err;
+
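+       /*
+        * With SQPOLL, create the submission thread up front; bind it to
+        * the requested CPU if SQ_AFF is also set.
+        */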
+       if (ctx->flags & IORING_SETUP_SQPOLL) {
+               if (p->flags & IORING_SETUP_SQ_AFF) {
+                       int cpu;
+
+                       cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
+                       ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
+                                                       ctx, cpu,
+                                                       "io_uring-sq");
+               } else {
+                       ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
+                                                       "io_uring-sq");
+               }
+               if (IS_ERR(ctx->sqo_thread)) {
+                       ret = PTR_ERR(ctx->sqo_thread);
+                       ctx->sqo_thread = NULL;
+                       goto err;
+               }
+               wake_up_process(ctx->sqo_thread);
+       } else if (p->flags & IORING_SETUP_SQ_AFF) {
+               /* Can't have SQ_AFF without SQPOLL */
+               ret = -EINVAL;
+               goto err;
+       }
+
        /* Do QD, or 2 * CPUS, whatever is smallest */
        ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
                        min(ctx->sq_entries - 1, 2 * num_online_cpus()));
 
        return 0;
 err:
+       io_sq_thread_stop(ctx);
        mmdrop(ctx->sqo_mm);
        ctx->sqo_mm = NULL;
        return ret;
        int submitted = 0;
        struct fd f;
 
-       if (flags & ~IORING_ENTER_GETEVENTS)
+       if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
                return -EINVAL;
 
        f = fdget(fd);
        if (!percpu_ref_tryget(&ctx->refs))
                goto out_fput;
 
+       /*
+        * For SQ polling, the thread will do all submissions and completions.
+        * Just return the requested submit count, and wake the thread if
+        * we were asked to.
+        */
+       if (ctx->flags & IORING_SETUP_SQPOLL) {
+               if (flags & IORING_ENTER_SQ_WAKEUP)
+                       wake_up(&ctx->sqo_wait);
+               submitted = to_submit;
+               goto out_ctx;
+       }
+
        ret = 0;
        if (to_submit) {
                to_submit = min(to_submit, ctx->sq_entries);
        if (ret)
                goto err;
 
-       ret = io_sq_offload_start(ctx);
+       ret = io_sq_offload_start(ctx, p);
        if (ret)
                goto err;
 
                        return -EINVAL;
        }
 
-       if (p.flags & ~IORING_SETUP_IOPOLL)
+       if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
+                       IORING_SETUP_SQ_AFF))
                return -EINVAL;
 
        ret = io_uring_create(entries, &p);