enum bpf_dynptr_type type;
u32 id;
} initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
u64 mem_size;
};
[STACK_MISC] = 'm',
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
};
static void print_liveness(struct bpf_verifier_env *env,
return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
}
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
static const char *kernel_type_name(const struct btf* btf, u32 id)
{
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
}
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
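As a side note, here is a standalone illustration (not part of the patch) of the pointer arithmetic above, assuming ITER_PREFIX is the "bpf_iter_" naming prefix used for iterator state types: sizeof() counts the terminating NUL, so subtracting 1 gives the prefix length and the returned pointer lands on the type-specific suffix.

#include <stdio.h>

#define ITER_PREFIX "bpf_iter_"	/* assumed value of the kernel's ITER_PREFIX */

int main(void)
{
	const char *full_name = "bpf_iter_num";	/* BTF type name of the numbers iterator state */
	const char *suffix = full_name + sizeof(ITER_PREFIX) - 1;

	printf("%s -> iter_%s\n", full_name, suffix);	/* prints: bpf_iter_num -> iter_num */
	return 0;
}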
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
env->scratched_regs |= 1U << regno;
}
}
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
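A user-space sketch (assumptions: 8-byte stack slots and a hypothetical 16-byte iterator state starting at fp-16) of the slot layout the loop above produces: every covered slot is typed STACK_ITER, but only the first ("main") slot carries the acquired ref_obj_id.

#include <stdio.h>

#define SLOT_SIZE 8	/* stands in for BPF_REG_SIZE */

int main(void)
{
	int iter_size = 16;			/* hypothetical sizeof(struct bpf_iter_foo) */
	int nr_slots = iter_size / SLOT_SIZE;	/* 2 slots */
	int spi = 1;				/* slot index of the object start, as iter_get_spi() would return */
	int ref_id = 42;			/* id handed out by acquire_reference_state() */

	for (int i = 0; i < nr_slots; i++)
		printf("slot %d (fp-%d): STACK_ITER, ref_obj_id=%d\n",
		       spi - i, (spi - i + 1) * SLOT_SIZE,
		       i == 0 ? ref_id : 0);	/* only the main slot holds the reference */
	return 0;
}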
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return false;
+ if (i != 0 && st->ref_obj_id)
+ return false;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return false;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
if (reg->ref_obj_id)
verbose(env, "(ref_id=%d)", reg->ref_obj_id);
break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ reg = &state->stack[i].spilled_ptr;
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
case STACK_MISC:
case STACK_ZERO:
default:
state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
}
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (is_spilled_reg(&state->stack[spi]))
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
return err;
}
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ return arg == 0 && is_iter_kfunc(meta);
+}
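For context, a sketch of how iterator kfuncs would be tagged so that the helpers above see the KF_ITER_* bits in meta->kfunc_flags; this fragment belongs inside a BTF_SET8_START()/BTF_SET8_END() kfunc ID set and is modeled on the numbers iterator added alongside this series (exact names are assumptions here, not part of this patch).

BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)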
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ const struct btf_param *arg;
+ int spi, err, i, nr_slots;
+ u32 btf_id;
+
+ /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+ arg = &btf_params(meta->func_proto)[0];
+ t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
+ t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0 && spi != -ERANGE)
+ return spi;
+
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy() expect an initialized iter state */
+ if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
+ if (err)
+ return err;
+
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
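A minimal BPF-side sketch of the contract process_iter_arg() enforces, assuming the bpf_iter_num_* kfuncs introduced alongside this series and the usual vmlinux.h/libbpf setup: _new() must see uninitialized on-stack state, _next()/_destroy() must see initialized state, and the reference taken by _new() is released by _destroy().

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* kfunc declarations as typically emitted for this series (assumed here) */
extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;

SEC("raw_tp/sys_enter")
int sum_first_ten(void *ctx)
{
	struct bpf_iter_num it;		/* uninitialized stack slots: what _new() expects */
	int *v, sum = 0;

	bpf_iter_num_new(&it, 0, 10);		/* marks slots STACK_ITER, acquires a reference */
	while ((v = bpf_iter_num_next(&it)))	/* requires initialized iter state */
		sum += *v;
	bpf_iter_num_destroy(&it);		/* releases the reference, invalidates the slots */
	return sum;
}

char _license[] SEC("license") = "GPL";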
+
+/* process_iter_next_call() is called when the verifier gets to the iterator's
+ * next "method" call (e.g., bpf_iter_num_next() for the numbers iterator). We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * Under the assumption of such a contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate an infinite number of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * call (it could be at exactly the same location or at some other instruction,
+ * it doesn't matter, and we don't make any unnecessary assumptions about this:
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we will either come
+ * to iter_next() with an equivalent state and can conclude that the next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle; grok
+ * precision tracking for a more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+ }
+
+ /* switch to DRAINED state (depth unchanged) and assume iter_next() returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+ return 0;
+}
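A toy user-space model (not verifier code) of the fork performed above: the current path is switched to DRAINED with r0 == NULL and verified first, while a queued copy stays ACTIVE with its depth bumped and a non-NULL r0, matching the NULL-first ordering the comment insists on for precision propagation.

#include <stdio.h>
#include <stdbool.h>

enum sim_iter_state { SIM_ITER_ACTIVE, SIM_ITER_DRAINED };

struct sim_state {
	enum sim_iter_state iter;
	int depth;
	bool r0_nonnull;
};

int main(void)
{
	struct sim_state cur = { SIM_ITER_ACTIVE, 0, false }, queued;

	/* queued copy: keep iterating, one level deeper, r0 assumed non-NULL */
	queued = cur;
	queued.depth++;
	queued.r0_nonnull = true;

	/* current path: explored first, iterator drained, r0 == NULL (exit branch) */
	cur.iter = SIM_ITER_DRAINED;
	cur.r0_nonnull = false;

	printf("current: %s depth=%d r0_nonnull=%d\n",
	       cur.iter == SIM_ITER_DRAINED ? "drained" : "active", cur.depth, cur.r0_nonnull);
	printf("queued : %s depth=%d r0_nonnull=%d\n",
	       queued.iter == SIM_ITER_DRAINED ? "drained" : "active", queued.depth, queued.r0_nonnull);
	return 0;
}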
+
static bool arg_type_is_mem_size(enum bpf_arg_type type)
{
return type == ARG_CONST_SIZE ||
KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
KF_ARG_PTR_TO_LIST_HEAD,
KF_ARG_PTR_TO_LIST_NODE,
KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
+ if (is_kfunc_arg_iter(meta, argno))
+ return KF_ARG_PTR_TO_ITER;
+
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_LIST_HEAD;
break;
case KF_ARG_PTR_TO_KPTR:
case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
case KF_ARG_PTR_TO_RB_ROOT:
break;
}
+ case KF_ARG_PTR_TO_ITER:
+ ret = process_iter_arg(env, regno, insn_idx, meta);
+ if (ret < 0)
+ return ret;
+ break;
case KF_ARG_PTR_TO_LIST_HEAD:
if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
desc_btf = meta.btf;
insn_aux = &env->insn_aux_data[insn_idx];
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
return -EACCES;
mark_btf_func_reg_size(env, regno, t->size);
}
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
return 0;
}
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta))
+ mark_prune_point(env, t);
+ }
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
return false;
break;
case STACK_DYNPTR:
- {
- const struct bpf_reg_state *old_reg, *cur_reg;
-
old_reg = &old->stack[spi].spilled_ptr;
cur_reg = &cur->stack[spi].spilled_ptr;
if (old_reg->dynptr.type != cur_reg->dynptr.type ||
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
- }
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "infinite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in the way).
+ *
+ * When we come to 1: for the first time, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows us to keep the infinite loop heuristic even in the face
+ * of an active iterator. E.g., the C snippet below is and will be detected as
+ * infinitely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&it))) {
+ * x = *p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
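For contrast with the infinite snippet in the comment above, a hedged BPF-side sketch (same assumed bpf_iter_num_* kfunc declarations as the earlier example): the depth bump only exempts states reached in different iterations of the iterator loop, while a statically bounded inner loop like this one is still handled by the regular bounded-loop logic and should converge.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;

SEC("raw_tp/sys_enter")
int bounded_inner_loop(void *ctx)
{
	struct bpf_iter_num it;
	int *p, x, work = 0;

	bpf_iter_num_new(&it, 0, 10);
	while ((p = bpf_iter_num_next(&it))) {
		x = *p;
		for (int i = 0; i < x && i < 16; i++)	/* statically bounded inner loop */
			work++;
	}
	bpf_iter_num_destroy(&it);
	return work;
}

char _license[] SEC("license") = "GPL";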
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
* Since the verifier still needs to catch infinite loops
* inside async callbacks.
*/
- } else if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for an overview of the logic. When the current state and one of its parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that the iterator loop will finish, taking into
+ * account the iter_next() contract of eventually returning a
+ * sticky NULL result.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur) &&
+ !iter_active_depths_differ(&sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
+skip_inf_loop_check:
if (!env->test_state_freq &&
env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
goto miss;
}
if (states_equal(env, &sl->state, cur)) {
+hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.