 #define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
 #define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
 #define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
+#define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
+                                IORING_REGISTER_LAST + IORING_OP_LAST)
 
 struct io_uring {
        u32 head ____cacheline_aligned_in_smp;
        u32 tail ____cacheline_aligned_in_smp;
 };
 
+struct io_restriction {
+       DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+       DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+       u8 sqe_flags_allowed;
+       u8 sqe_flags_required;
+};
+
 struct io_ring_ctx {
        struct {
                struct percpu_ref       refs;
                unsigned int            cq_overflow_flushed: 1;
                unsigned int            drain_next: 1;
                unsigned int            eventfd_async: 1;
+               unsigned int            restricted: 1;
 
                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
        struct llist_head               file_put_llist;
 
        struct work_struct              exit_work;
+       struct io_restriction           restrictions;
 };
 
 /*
        ctx->cached_sq_head++;
 }
 
+/*
+ * Check SQE restrictions (opcode and flags).
+ *
+ * Returns 'true' if SQE is allowed, 'false' otherwise.
+ */
+static inline bool io_check_restriction(struct io_ring_ctx *ctx,
+                                       struct io_kiocb *req,
+                                       unsigned int sqe_flags)
+{
+       if (!ctx->restricted)
+               return true;
+
+       if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+               return false;
+
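+       /* All flags in sqe_flags_required must be present on the SQE */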
+       if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+           ctx->restrictions.sqe_flags_required)
+               return false;
+
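+       /* Reject any flag outside the allowed and required sets */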
+       if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+                         ctx->restrictions.sqe_flags_required))
+               return false;
+
+       return true;
+}
+
 #define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
                                IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
                                IOSQE_BUFFER_SELECT)
        if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
                return -EINVAL;
 
+       if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+               return -EACCES;
+
        if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
            !io_op_defs[req->opcode].buffer_select)
                return -EOPNOTSUPP;
        return -EINVAL;
 }
 
+static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
+                                   unsigned int nr_args)
+{
+       struct io_uring_restriction *res;
+       size_t size;
+       int i, ret;
+
+       /* We allow only a single restrictions registration */
+       if (ctx->restricted)
+               return -EBUSY;
+
+       if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
+               return -EINVAL;
+
+       size = array_size(nr_args, sizeof(*res));
+       if (size == SIZE_MAX)
+               return -EOVERFLOW;
+
+       res = memdup_user(arg, size);
+       if (IS_ERR(res))
+               return PTR_ERR(res);
+
+       ret = 0;
+
+       for (i = 0; i < nr_args; i++) {
+               switch (res[i].opcode) {
+               case IORING_RESTRICTION_REGISTER_OP:
+                       if (res[i].register_op >= IORING_REGISTER_LAST) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       __set_bit(res[i].register_op,
+                                 ctx->restrictions.register_op);
+                       break;
+               case IORING_RESTRICTION_SQE_OP:
+                       if (res[i].sqe_op >= IORING_OP_LAST) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+                       break;
+               case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
+                       ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+                       break;
+               case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
+                       ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+                       break;
+               default:
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+out:
+       /* Reset all restrictions if an error happened */
+       if (ret != 0)
+               memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+       else
+               ctx->restricted = 1;
+
+       kfree(res);
+       return ret;
+}
+
 static bool io_register_op_must_quiesce(int op)
 {
        switch (op) {
                if (ret) {
                        percpu_ref_resurrect(&ctx->refs);
                        ret = -EINTR;
+                       goto out_quiesce;
+               }
+       }
+
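+       /*
+        * With restrictions registered, only explicitly allowed
+        * io_uring_register(2) opcodes are accepted on this ring.
+        */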
+       if (ctx->restricted) {
+               if (opcode >= IORING_REGISTER_LAST) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               if (!test_bit(opcode, ctx->restrictions.register_op)) {
+                       ret = -EACCES;
                        goto out;
                }
        }
                        break;
                ret = io_unregister_personality(ctx, nr_args);
                break;
+       case IORING_REGISTER_RESTRICTIONS:
+               ret = io_register_restrictions(ctx, arg, nr_args);
+               break;
        default:
                ret = -EINVAL;
                break;
        }
 
+out:
        if (io_register_op_must_quiesce(opcode)) {
                /* bring the ctx back to life */
                percpu_ref_reinit(&ctx->refs);
-out:
+out_quiesce:
                reinit_completion(&ctx->ref_comp);
        }
        return ret;
 
        IORING_REGISTER_PROBE                   = 8,
        IORING_REGISTER_PERSONALITY             = 9,
        IORING_UNREGISTER_PERSONALITY           = 10,
+       IORING_REGISTER_RESTRICTIONS            = 11,
 
        /* this goes last */
        IORING_REGISTER_LAST
        struct io_uring_probe_op ops[0];
 };
 
+struct io_uring_restriction {
+       __u16 opcode;
+       union {
+               __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+               __u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
+               __u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
+       };
+       __u8 resv;
+       __u32 resv2[3];
+};
+
+/*
+ * io_uring_restriction->opcode values
+ */
+enum {
+       /* Allow an io_uring_register(2) opcode */
+       IORING_RESTRICTION_REGISTER_OP          = 0,
+
+       /* Allow an sqe opcode */
+       IORING_RESTRICTION_SQE_OP               = 1,
+
+       /* Allow sqe flags */
+       IORING_RESTRICTION_SQE_FLAGS_ALLOWED    = 2,
+
+       /* Require sqe flags (these flags must be set on each submission) */
+       IORING_RESTRICTION_SQE_FLAGS_REQUIRED   = 3,
+
+       IORING_RESTRICTION_LAST
+};
+
 #endif
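
For reference, a minimal userspace sketch of the new interface (not part of this patch): it assumes the uapi additions above are available via <linux/io_uring.h> and uses the raw io_uring_register(2) syscall; the helper name restrict_ring() and the particular opcode choices are purely illustrative. It would limit the ring to IORING_OP_READV/IORING_OP_WRITEV submissions and to IORING_REGISTER_BUFFERS as the only further registration.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative only: allow READV/WRITEV submissions and buffer registration */
static int restrict_ring(int ring_fd)
{
        struct io_uring_restriction res[] = {
                { .opcode = IORING_RESTRICTION_SQE_OP,
                  .sqe_op = IORING_OP_READV },
                { .opcode = IORING_RESTRICTION_SQE_OP,
                  .sqe_op = IORING_OP_WRITEV },
                { .opcode = IORING_RESTRICTION_REGISTER_OP,
                  .register_op = IORING_REGISTER_BUFFERS },
        };

        /* Only one restrictions registration is accepted per ring */
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_RESTRICTIONS, res,
                       sizeof(res) / sizeof(res[0]));
}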