diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 36bfd96d4563..05b9fe98b8f8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -220,10 +220,61 @@ enum bpf_stack_slot_type {
 	STACK_DYNPTR,
 	STACK_ITER,
 	STACK_IRQ_FLAG,
+	STACK_POISON,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
+/* 4-byte stack slot granularity for liveness analysis */
+#define BPF_HALF_REG_SIZE	4
+#define STACK_SLOT_SZ		4
+#define STACK_SLOTS		(MAX_BPF_STACK / BPF_HALF_REG_SIZE)	/* 128 */
+
+typedef struct {
+	u64 v[2];	/* bitmap, one bit per 4-byte stack slot: 2 * 64 == STACK_SLOTS */
+} spis_t;
+
+#define SPIS_ZERO	((spis_t){})
+#define SPIS_ALL	((spis_t){{ U64_MAX, U64_MAX }})
+
+static inline bool spis_is_zero(spis_t s)
+{
+	return s.v[0] == 0 && s.v[1] == 0;
+}
+
+static inline bool spis_equal(spis_t a, spis_t b)
+{
+	return a.v[0] == b.v[0] && a.v[1] == b.v[1];
+}
+
+static inline spis_t spis_or(spis_t a, spis_t b)
+{
+	return (spis_t){{ a.v[0] | b.v[0], a.v[1] | b.v[1] }};
+}
+
+static inline spis_t spis_and(spis_t a, spis_t b)
+{
+	return (spis_t){{ a.v[0] & b.v[0], a.v[1] & b.v[1] }};
+}
+
+static inline spis_t spis_not(spis_t s)
+{
+	return (spis_t){{ ~s.v[0], ~s.v[1] }};
+}
+
+static inline bool spis_test_bit(spis_t s, u32 slot)
+{
+	return slot < STACK_SLOTS && (s.v[slot / 64] & BIT_ULL(slot % 64)); /* OOB slot => false */
+}
+
+static inline void spis_or_range(spis_t *mask, u32 lo, u32 hi)
+{
+	u32 w;	/* slot index being set */
+
+	for (w = lo; w <= hi && w < STACK_SLOTS; w++) /* hi inclusive, clamped to STACK_SLOTS */
+		mask->v[w / 64] |= BIT_ULL(w % 64);
+}
+
 #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \
 			  (1 << BPF_REG_3) | (1 << BPF_REG_4) | \
 			  (1 << BPF_REG_5))
@@ -424,7 +475,6 @@ struct bpf_verifier_state {
 
 	bool speculative;
 	bool in_sleepable;
-	bool cleaned;
 
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
@@ -664,7 +714,7 @@ enum priv_stack_mode {
 };
 
 struct bpf_subprog_info {
-	/* 'start' has to be the first field otherwise find_subprog() won't work */
+	const char *name; /* name extracted from BTF */
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u32 postorder_start; /* The idx to the env->cfg.insn_postorder */
@@ -819,6 +869,8 @@ struct bpf_verifier_env {
 	} cfg;
 	struct backtrack_state bt;
 	struct bpf_jmp_history_entry *cur_hist_ent;
+	/* Per-callsite copy of parent's converged at_stack_in for cross-frame fills. */
+	struct arg_track **callsite_at_stack;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
@@ -1121,12 +1173,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
 			  u32 frameno, bool print_all);
 void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
 		      u32 frameno);
+u32 bpf_vlog_alignment(u32 pos);
 
 struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off);
 int bpf_jmp_offset(struct bpf_insn *insn);
 struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx);
 void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask);
 bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx);
+bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog);
 
 int bpf_find_subprog(struct bpf_verifier_env *env, int off);
 int bpf_compute_const_regs(struct bpf_verifier_env *env);
@@ -1144,16 +1198,11 @@ s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env,
 s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env,
 				 struct bpf_insn *insn, int arg,
 				 int insn_idx);
+int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env);
 
 int bpf_stack_liveness_init(struct bpf_verifier_env *env);
 void bpf_stack_liveness_free(struct bpf_verifier_env *env);
-int bpf_update_live_stack(struct bpf_verifier_env *env);
-int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask);
-void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask);
-int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx);
-int bpf_commit_stack_write_marks(struct bpf_verifier_env *env);
 int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st);
 bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi);
-void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env);
 
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
index 998986853c61..9bb6574d73fe 100644
--- a/kernel/bpf/liveness.c
+++ b/kernel/bpf/liveness.c
@@ -2,217 +2,119 @@
 /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
 
 #include <linux/bpf_verifier.h>
+#include <linux/btf.h>
 #include <linux/hashtable.h>
 #include <linux/jhash.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
 
-/*
- * This file implements live stack slots analysis. After accumulating
- * stack usage data, the analysis answers queries about whether a
- * particular stack slot may be read by an instruction or any of it's
- * successors.  This data is consumed by the verifier states caching
- * mechanism to decide which stack slots are important when looking for a
- * visited state corresponding to the current state.
- *
- * The analysis is call chain sensitive, meaning that data is collected
- * and queried for tuples (call chain, subprogram instruction index).
- * Such sensitivity allows identifying if some subprogram call always
- * leads to writes in the caller's stack.
- *
- * The basic idea is as follows:
- * - As the verifier accumulates a set of visited states, the analysis instance
- *   accumulates a conservative estimate of stack slots that can be read
- *   or must be written for each visited tuple (call chain, instruction index).
- * - If several states happen to visit the same instruction with the same
- *   call chain, stack usage information for the corresponding tuple is joined:
- *   - "may_read" set represents a union of all possibly read slots
- *     (any slot in "may_read" set might be read at or after the instruction);
- *   - "must_write" set represents an intersection of all possibly written slots
- *     (any slot in "must_write" set is guaranteed to be written by the instruction).
- * - The analysis is split into two phases:
- *   - read and write marks accumulation;
- *   - read and write marks propagation.
- * - The propagation phase is a textbook live variable data flow analysis:
- *
- *     state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
- *     state[cc, i].live_before =
- *       (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
- *
- *   Where:
- *   - `U`  stands for set union
- *   - `/`  stands for set difference;
- *   - `cc` stands for a call chain;
- *   - `i` and `s` are instruction indexes;
- *
- *   The above equations are computed for each call chain and instruction
- *   index until state stops changing.
- * - Additionally, in order to transfer "must_write" information from a
- *   subprogram to call instructions invoking this subprogram,
- *   the "must_write_acc" set is tracked for each (cc, i) tuple.
- *   A set of stack slots that are guaranteed to be written by this
- *   instruction or any of its successors (within the subprogram).
- *   The equation for "must_write_acc" propagation looks as follows:
- *
- *     state[cc, i].must_write_acc =
- *       ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
- *       U state[cc, i].must_write
- *
- *   (An intersection of all "must_write_acc" for instruction successors
- *    plus all "must_write" slots for the instruction itself).
- * - After the propagation phase completes for a subprogram, information from
- *   (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
- *   - "must_write_acc" set is intersected with the call site's "must_write" set;
- *   - "may_read" set is added to the call site's "may_read" set.
- * - Any live stack queries must be taken after the propagation phase.
- * - Accumulation and propagation phases can be entered multiple times,
- *   at any point in time:
- *   - "may_read" set only grows;
- *   - "must_write" set only shrinks;
- *   - for each visited verifier state with zero branches, all relevant
- *     read and write marks are already recorded by the analysis instance.
- *
- * Technically, the analysis is facilitated by the following data structures:
- * - Call chain: for given verifier state, the call chain is a tuple of call
- *   instruction indexes leading to the current subprogram plus the subprogram
- *   entry point index.
- * - Function instance: for a given call chain, for each instruction in
- *   the current subprogram, a mapping between instruction index and a
- *   set of "may_read", "must_write" and other marks accumulated for this
- *   instruction.
- * - A hash table mapping call chains to function instances.
- */
-
-struct callchain {
-	u32 callsites[MAX_CALL_FRAMES];	/* instruction pointer for each frame */
-	/* cached subprog_info[*].start for functions owning the frames:
-	 * - sp_starts[curframe] used to get insn relative index within current function;
-	 * - sp_starts[0..current-1] used for fast callchain_frame_up().
-	 */
-	u32 sp_starts[MAX_CALL_FRAMES];
-	u32 curframe;			/* depth of callsites and sp_starts arrays */
-};
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
 
 struct per_frame_masks {
-	u64 may_read;		/* stack slots that may be read by this instruction */
-	u64 must_write;		/* stack slots written by this instruction */
-	u64 must_write_acc;	/* stack slots written by this instruction and its successors */
-	u64 live_before;	/* stack slots that may be read by this insn and its successors */
+	spis_t may_read;	/* stack slots that may be read by this instruction */
+	spis_t must_write;	/* stack slots written by this instruction */
+	spis_t live_before;	/* stack slots that may be read by this insn and its successors */
 };
 
 /*
- * A function instance created for a specific callchain.
+ * A function instance keyed by (callsite, depth).
  * Encapsulates read and write marks for each instruction in the function.
- * Marks are tracked for each frame in the callchain.
+ * Marks are tracked for each frame up to @depth.
  */
 struct func_instance {
 	struct hlist_node hl_node;
-	struct callchain callchain;
+	u32 callsite;		/* call insn that invoked this subprog (subprog_start for depth 0) */
+	u32 depth;		/* call depth (0 = entry subprog) */
+	u32 subprog;		/* subprog index */
+	u32 subprog_start;	/* cached env->subprog_info[subprog].start */
 	u32 insn_cnt;		/* cached number of insns in the function */
-	bool updated;
-	bool must_write_dropped;
 	/* Per frame, per instruction masks, frames allocated lazily. */
 	struct per_frame_masks *frames[MAX_CALL_FRAMES];
-	/* For each instruction a flag telling if "must_write" had been initialized for it. */
-	bool *must_write_set;
+	bool must_write_initialized;
 };
 
 struct live_stack_query {
 	struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+	u32 callsites[MAX_CALL_FRAMES]; /* callsites[i] = insn calling frame i+1, i < curframe */
 	u32 curframe;
 	u32 insn_idx;
 };
 
 struct bpf_liveness {
-	DECLARE_HASHTABLE(func_instances, 8);		/* maps callchain to func_instance */
+	DECLARE_HASHTABLE(func_instances, 8);		/* maps (depth, callsite) to func_instance */
 	struct live_stack_query live_stack_query;	/* cache to avoid repetitive ht lookups */
-	/* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
-	struct func_instance *cur_instance;
-	/*
-	 * Below fields are used to accumulate stack write marks for instruction at
-	 * @write_insn_idx before submitting the marks to @cur_instance.
-	 */
-	u64 write_masks_acc[MAX_CALL_FRAMES];
-	u32 write_insn_idx;
+	u32 subprog_calls;				/* analyze_subprog() invocations */
 };
 
-/* Compute callchain corresponding to state @st at depth @frameno */
-static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
-			      struct callchain *callchain, u32 frameno)
-{
-	struct bpf_subprog_info *subprog_info = env->subprog_info;
-	u32 i;
-
-	memset(callchain, 0, sizeof(*callchain));
-	for (i = 0; i <= frameno; i++) {
-		callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
-		if (i < st->curframe)
-			callchain->callsites[i] = st->frame[i + 1]->callsite;
-	}
-	callchain->curframe = frameno;
-	callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
-}
-
-static u32 hash_callchain(struct callchain *callchain)
-{
-	return jhash2(callchain->callsites, callchain->curframe, 0);
-}
-
-static bool same_callsites(struct callchain *a, struct callchain *b)
-{
-	int i;
-
-	if (a->curframe != b->curframe)
-		return false;
-	for (i = a->curframe; i >= 0; i--)
-		if (a->callsites[i] != b->callsites[i])
-			return false;
-	return true;
-}
-
 /*
- * Find existing or allocate new function instance corresponding to @callchain.
- * Instances are accumulated in env->liveness->func_instances and persist
- * until the end of the verification process.
+ * Hash/compare key for func_instance: (depth, callsite).
+ * For depth == 0 (entry subprog), @callsite is the subprog start insn.
+ * For depth > 0, @callsite is the call instruction index that invoked the subprog.
  */
-static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
-					       struct callchain *callchain)
+static u32 instance_hash(u32 callsite, u32 depth)
+{
+	u32 key[2] = { depth, callsite };
+
+	return jhash2(key, 2, 0);
+}
+
+static struct func_instance *find_instance(struct bpf_verifier_env *env,
+					   u32 callsite, u32 depth)
 {
 	struct bpf_liveness *liveness = env->liveness;
-	struct bpf_subprog_info *subprog;
-	struct func_instance *result;
-	u32 subprog_sz, size, key;
+	struct func_instance *f;
+	u32 key = instance_hash(callsite, depth);
 
-	key = hash_callchain(callchain);
-	hash_for_each_possible(liveness->func_instances, result, hl_node, key)
-		if (same_callsites(&result->callchain, callchain))
-			return result;
+	hash_for_each_possible(liveness->func_instances, f, hl_node, key)
+		if (f->depth == depth && f->callsite == callsite)
+			return f;
+	return NULL;
+}
 
-	subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
-	subprog_sz = (subprog + 1)->start - subprog->start;
-	size = sizeof(struct func_instance);
-	result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
-	if (!result)
+static struct func_instance *call_instance(struct bpf_verifier_env *env,
+					   struct func_instance *caller,
+					   u32 callsite, int subprog)
+{
+	u32 depth = caller ? caller->depth + 1 : 0;
+	u32 subprog_start = env->subprog_info[subprog].start;
+	u32 lookup_key = depth > 0 ? callsite : subprog_start;
+	struct func_instance *f;
+	u32 hash;
+
+	f = find_instance(env, lookup_key, depth);
+	if (f)
+		return f;
+
+	f = kvzalloc(sizeof(*f), GFP_KERNEL_ACCOUNT);
+	if (!f)
 		return ERR_PTR(-ENOMEM);
-	result->must_write_set = kvzalloc_objs(*result->must_write_set,
-					       subprog_sz, GFP_KERNEL_ACCOUNT);
-	if (!result->must_write_set) {
-		kvfree(result);
-		return ERR_PTR(-ENOMEM);
-	}
-	memcpy(&result->callchain, callchain, sizeof(*callchain));
-	result->insn_cnt = subprog_sz;
-	hash_add(liveness->func_instances, &result->hl_node, key);
-	return result;
+	f->callsite = lookup_key;
+	f->depth = depth;
+	f->subprog = subprog;
+	f->subprog_start = subprog_start;
+	f->insn_cnt = (env->subprog_info + subprog + 1)->start - subprog_start;
+	hash = instance_hash(lookup_key, depth);
+	hash_add(env->liveness->func_instances, &f->hl_node, hash);
+	return f;
 }
 
 static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
 					     struct bpf_verifier_state *st,
 					     u32 frameno)
 {
-	struct callchain callchain;
+	u32 callsite, subprog_start;
+	struct func_instance *f;
+	u32 key, depth;
 
-	compute_callchain(env, st, &callchain, frameno);
-	return __lookup_instance(env, &callchain);
+	subprog_start = env->subprog_info[st->frame[frameno]->subprogno].start;
+	callsite = frameno > 0 ? st->frame[frameno]->callsite : subprog_start;
+
+	for (depth = frameno; ; depth--) { /* retry at shallower depths until a match */
+		key = depth > 0 ? callsite : subprog_start;
+		f = find_instance(env, key, depth);
+		if (f || depth == 0)
+			return f; /* NULL (not ERR_PTR) when nothing matched down to depth 0 */
+	}
 }
 
 int bpf_stack_liveness_init(struct bpf_verifier_env *env)
@@ -233,9 +135,8 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env)
 	if (!env->liveness)
 		return;
 	hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
-		for (i = 0; i <= instance->callchain.curframe; i++)
+		for (i = 0; i <= instance->depth; i++)
 			kvfree(instance->frames[i]);
-		kvfree(instance->must_write_set);
 		kvfree(instance);
 	}
 	kvfree(env->liveness);
@@ -247,7 +148,7 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env)
  */
 static int relative_idx(struct func_instance *instance, u32 insn_idx)
 {
-	return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+	return insn_idx - instance->subprog_start;
 }
 
 static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
@@ -259,8 +160,7 @@ static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
 	return &instance->frames[frame][relative_idx(instance, insn_idx)];
 }
 
-static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
-						 struct func_instance *instance,
+static struct per_frame_masks *alloc_frame_masks(struct func_instance *instance,
 						 u32 frame, u32 insn_idx)
 {
 	struct per_frame_masks *arr;
@@ -275,167 +175,29 @@ static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
 	return get_frame_masks(instance, frame, insn_idx);
 }
 
-void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
-{
-	env->liveness->cur_instance = NULL;
-}
-
-/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
-static int ensure_cur_instance(struct bpf_verifier_env *env)
-{
-	struct bpf_liveness *liveness = env->liveness;
-	struct func_instance *instance;
-
-	if (liveness->cur_instance)
-		return 0;
-
-	instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
-	if (IS_ERR(instance))
-		return PTR_ERR(instance);
-
-	liveness->cur_instance = instance;
-	return 0;
-}
-
 /* Accumulate may_read masks for @frame at @insn_idx */
-static int mark_stack_read(struct bpf_verifier_env *env,
-			   struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+static int mark_stack_read(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask)
 {
 	struct per_frame_masks *masks;
-	u64 new_may_read;
 
-	masks = alloc_frame_masks(env, instance, frame, insn_idx);
+	masks = alloc_frame_masks(instance, frame, insn_idx);
 	if (IS_ERR(masks))
 		return PTR_ERR(masks);
-	new_may_read = masks->may_read | mask;
-	if (new_may_read != masks->may_read &&
-	    ((new_may_read | masks->live_before) != masks->live_before))
-		instance->updated = true;
-	masks->may_read |= mask;
+	masks->may_read = spis_or(masks->may_read, mask);
 	return 0;
 }
 
-int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
+static int mark_stack_write(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask)
 {
-	int err;
-
-	err = ensure_cur_instance(env);
-	err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
-	return err;
-}
-
-static void reset_stack_write_marks(struct bpf_verifier_env *env,
-				    struct func_instance *instance, u32 insn_idx)
-{
-	struct bpf_liveness *liveness = env->liveness;
-	int i;
-
-	liveness->write_insn_idx = insn_idx;
-	for (i = 0; i <= instance->callchain.curframe; i++)
-		liveness->write_masks_acc[i] = 0;
-}
-
-int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
-{
-	struct bpf_liveness *liveness = env->liveness;
-	int err;
-
-	err = ensure_cur_instance(env);
-	if (err)
-		return err;
-
-	reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
-	return 0;
-}
-
-void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
-{
-	env->liveness->write_masks_acc[frame] |= mask;
-}
-
-static int commit_stack_write_marks(struct bpf_verifier_env *env,
-				    struct func_instance *instance)
-{
-	struct bpf_liveness *liveness = env->liveness;
-	u32 idx, frame, curframe, old_must_write;
 	struct per_frame_masks *masks;
-	u64 mask;
 
-	if (!instance)
-		return 0;
-
-	curframe = instance->callchain.curframe;
-	idx = relative_idx(instance, liveness->write_insn_idx);
-	for (frame = 0; frame <= curframe; frame++) {
-		mask = liveness->write_masks_acc[frame];
-		/* avoid allocating frames for zero masks */
-		if (mask == 0 && !instance->must_write_set[idx])
-			continue;
-		masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
-		if (IS_ERR(masks))
-			return PTR_ERR(masks);
-		old_must_write = masks->must_write;
-		/*
-		 * If instruction at this callchain is seen for a first time, set must_write equal
-		 * to @mask. Otherwise take intersection with the previous value.
-		 */
-		if (instance->must_write_set[idx])
-			mask &= old_must_write;
-		if (old_must_write != mask) {
-			masks->must_write = mask;
-			instance->updated = true;
-		}
-		if (old_must_write & ~mask)
-			instance->must_write_dropped = true;
-	}
-	instance->must_write_set[idx] = true;
-	liveness->write_insn_idx = 0;
+	masks = alloc_frame_masks(instance, frame, insn_idx);
+	if (IS_ERR(masks))
+		return PTR_ERR(masks);
+	masks->must_write = spis_or(masks->must_write, mask);
 	return 0;
 }
 
-/*
- * Merge stack writes marks in @env->liveness->write_masks_acc
- * with information already in @env->liveness->cur_instance.
- */
-int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
-{
-	return commit_stack_write_marks(env, env->liveness->cur_instance);
-}
-
-static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
-{
-	char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
-	char *buf = env->tmp_str_buf;
-	int i;
-
-	buf += snprintf(buf, buf_end - buf, "(");
-	for (i = 0; i <= callchain->curframe; i++)
-		buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
-	snprintf(buf, buf_end - buf, ")");
-	return env->tmp_str_buf;
-}
-
-static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
-			    char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
-{
-	u64 changed_bits = old ^ new;
-	u64 new_ones = new & changed_bits;
-	u64 new_zeros = ~new & changed_bits;
-
-	if (!changed_bits)
-		return;
-	bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
-	if (new_ones) {
-		bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
-		bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
-	}
-	if (new_zeros) {
-		bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
-		bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
-	}
-	bpf_log(&env->log, "\n");
-}
-
 int bpf_jmp_offset(struct bpf_insn *insn)
 {
 	u8 code = insn->code;
@@ -507,62 +269,11 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
 
 __diag_pop();
 
-static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
-						struct func_instance *instance)
-{
-	struct callchain callchain = instance->callchain;
-
-	/* Adjust @callchain to represent callchain one frame up */
-	callchain.callsites[callchain.curframe] = 0;
-	callchain.sp_starts[callchain.curframe] = 0;
-	callchain.curframe--;
-	callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
-	return __lookup_instance(env, &callchain);
-}
-
-static u32 callchain_subprog_start(struct callchain *callchain)
-{
-	return callchain->sp_starts[callchain->curframe];
-}
-
-/*
- * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
- * to the call instruction in function instance calling @instance.
- */
-static int propagate_to_outer_instance(struct bpf_verifier_env *env,
-				       struct func_instance *instance)
-{
-	struct callchain *callchain = &instance->callchain;
-	u32 this_subprog_start, callsite, frame;
-	struct func_instance *outer_instance;
-	struct per_frame_masks *insn;
-	int err;
-
-	this_subprog_start = callchain_subprog_start(callchain);
-	outer_instance = get_outer_instance(env, instance);
-	if (IS_ERR(outer_instance))
-		return PTR_ERR(outer_instance);
-	callsite = callchain->callsites[callchain->curframe - 1];
-
-	reset_stack_write_marks(env, outer_instance, callsite);
-	for (frame = 0; frame < callchain->curframe; frame++) {
-		insn = get_frame_masks(instance, frame, this_subprog_start);
-		if (!insn)
-			continue;
-		bpf_mark_stack_write(env, frame, insn->must_write_acc);
-		err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
-		if (err)
-			return err;
-	}
-	commit_stack_write_marks(env, outer_instance);
-	return 0;
-}
 
 static inline bool update_insn(struct bpf_verifier_env *env,
 			       struct func_instance *instance, u32 frame, u32 insn_idx)
 {
-	struct bpf_insn_aux_data *aux = env->insn_aux_data;
-	u64 new_before, new_after, must_write_acc;
+	spis_t new_before, new_after;
 	struct per_frame_masks *insn, *succ_insn;
 	struct bpf_iarray *succ;
 	u32 s;
@@ -574,69 +285,34 @@ static inline bool update_insn(struct bpf_verifier_env *env,
 
 	changed = false;
 	insn = get_frame_masks(instance, frame, insn_idx);
-	new_before = 0;
-	new_after = 0;
-	/*
-	 * New "must_write_acc" is an intersection of all "must_write_acc"
-	 * of successors plus all "must_write" slots of instruction itself.
-	 */
-	must_write_acc = U64_MAX;
+	new_before = SPIS_ZERO;
+	new_after = SPIS_ZERO;
 	for (s = 0; s < succ->cnt; ++s) {
 		succ_insn = get_frame_masks(instance, frame, succ->items[s]);
-		new_after |= succ_insn->live_before;
-		must_write_acc &= succ_insn->must_write_acc;
+		new_after = spis_or(new_after, succ_insn->live_before);
 	}
-	must_write_acc |= insn->must_write;
 	/*
 	 * New "live_before" is a union of all "live_before" of successors
 	 * minus slots written by instruction plus slots read by instruction.
+	 * new_before = (new_after & ~insn->must_write) | insn->may_read
 	 */
-	new_before = (new_after & ~insn->must_write) | insn->may_read;
-	changed |= new_before != insn->live_before;
-	changed |= must_write_acc != insn->must_write_acc;
-	if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
-	    (insn->may_read || insn->must_write ||
-	     insn_idx == callchain_subprog_start(&instance->callchain) ||
-	     aux[insn_idx].prune_point)) {
-		log_mask_change(env, &instance->callchain, "live",
-				frame, insn_idx, insn->live_before, new_before);
-		log_mask_change(env, &instance->callchain, "written",
-				frame, insn_idx, insn->must_write_acc, must_write_acc);
-	}
+	new_before = spis_or(spis_and(new_after, spis_not(insn->must_write)),
+			     insn->may_read);
+	changed |= !spis_equal(new_before, insn->live_before);
 	insn->live_before = new_before;
-	insn->must_write_acc = must_write_acc;
 	return changed;
 }
 
-/* Fixed-point computation of @live_before and @must_write_acc marks */
-static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+/* Fixed-point computation of @live_before marks */
+static void update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
 {
-	u32 i, frame, po_start, po_end, cnt, this_subprog_start;
-	struct callchain *callchain = &instance->callchain;
+	u32 i, frame, po_start, po_end, cnt;
 	int *insn_postorder = env->cfg.insn_postorder;
 	struct bpf_subprog_info *subprog;
-	struct per_frame_masks *insn;
 	bool changed;
-	int err;
 
-	this_subprog_start = callchain_subprog_start(callchain);
-	/*
-	 * If must_write marks were updated must_write_acc needs to be reset
-	 * (to account for the case when new must_write sets became smaller).
-	 */
-	if (instance->must_write_dropped) {
-		for (frame = 0; frame <= callchain->curframe; frame++) {
-			if (!instance->frames[frame])
-				continue;
-
-			for (i = 0; i < instance->insn_cnt; i++) {
-				insn = get_frame_masks(instance, frame, this_subprog_start + i);
-				insn->must_write_acc = 0;
-			}
-		}
-	}
-
-	subprog = bpf_find_containing_subprog(env, this_subprog_start);
+	instance->must_write_initialized = true;
+	subprog = &env->subprog_info[instance->subprog];
 	po_start = subprog->postorder_start;
 	po_end = (subprog + 1)->postorder_start;
 	cnt = 0;
@@ -644,7 +320,7 @@ static int update_instance(struct bpf_verifier_env *env, struct func_instance *i
 	do {
 		cnt++;
 		changed = false;
-		for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+		for (frame = 0; frame <= instance->depth; frame++) {
 			if (!instance->frames[frame])
 				continue;
 
@@ -652,57 +328,14 @@ static int update_instance(struct bpf_verifier_env *env, struct func_instance *i
 				changed |= update_insn(env, instance, frame, insn_postorder[i]);
 		}
 	} while (changed);
-
-	if (env->log.level & BPF_LOG_LEVEL2)
-		bpf_log(&env->log, "%s live stack update done in %d iterations\n",
-			fmt_callchain(env, callchain), cnt);
-
-	/* transfer marks accumulated for outer frames to outer func instance (caller) */
-	if (callchain->curframe > 0) {
-		err = propagate_to_outer_instance(env, instance);
-		if (err)
-			return err;
-	}
-
-	return 0;
 }
 
-/*
- * Prepare all callchains within @env->cur_state for querying.
- * This function should be called after each verifier.c:pop_stack()
- * and whenever verifier.c:do_check_insn() processes subprogram exit.
- * This would guarantee that visited verifier states with zero branches
- * have their bpf_mark_stack_{read,write}() effects propagated in
- * @env->liveness.
- */
-int bpf_update_live_stack(struct bpf_verifier_env *env)
-{
-	struct func_instance *instance;
-	int err, frame;
-
-	bpf_reset_live_stack_callchain(env);
-	for (frame = env->cur_state->curframe; frame >= 0; --frame) {
-		instance = lookup_instance(env, env->cur_state, frame);
-		if (IS_ERR(instance))
-			return PTR_ERR(instance);
-
-		if (instance->updated) {
-			err = update_instance(env, instance);
-			if (err)
-				return err;
-			instance->updated = false;
-			instance->must_write_dropped = false;
-		}
-	}
-	return 0;
-}
-
-static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 half_spi)
 {
 	struct per_frame_masks *masks;
 
 	masks = get_frame_masks(instance, frameno, insn_idx);
-	return masks && (masks->live_before & BIT(spi));
+	return masks && spis_test_bit(masks->live_before, half_spi);
 }
 
 int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
@@ -714,41 +347,1611 @@ int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_
 	memset(q, 0, sizeof(*q));
 	for (frame = 0; frame <= st->curframe; frame++) {
 		instance = lookup_instance(env, st, frame);
-		if (IS_ERR(instance))
-			return PTR_ERR(instance);
-		q->instances[frame] = instance;
+		if (IS_ERR_OR_NULL(instance))
+			q->instances[frame] = NULL;
+		else
+			q->instances[frame] = instance;
+		if (frame < st->curframe)
+			q->callsites[frame] = st->frame[frame + 1]->callsite;
 	}
 	q->curframe = st->curframe;
 	q->insn_idx = st->insn_idx;
 	return 0;
 }
 
-bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 half_spi)
 {
 	/*
-	 * Slot is alive if it is read before q->st->insn_idx in current func instance,
+	 * Slot is alive if it is read before q->insn_idx in current func instance,
 	 * or if for some outer func instance:
 	 * - alive before callsite if callsite calls callback, otherwise
 	 * - alive after callsite
 	 */
 	struct live_stack_query *q = &env->liveness->live_stack_query;
 	struct func_instance *instance, *curframe_instance;
-	u32 i, callsite;
-	bool alive;
+	u32 i, callsite, rel;
+	int cur_delta, delta;
+	bool alive = false;
 
 	curframe_instance = q->instances[q->curframe];
-	if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+	if (!curframe_instance)
+		return true;
+	cur_delta = (int)curframe_instance->depth - (int)q->curframe;
+	rel = frameno + cur_delta;
+	if (rel <= curframe_instance->depth)
+		alive = is_live_before(curframe_instance, q->insn_idx, rel, half_spi);
+
+	if (alive)
 		return true;
 
 	for (i = frameno; i < q->curframe; i++) {
-		callsite = curframe_instance->callchain.callsites[i];
 		instance = q->instances[i];
+		if (!instance)
+			return true;
+		/* Map actual frameno to frame index within this instance */
+		delta = (int)instance->depth - (int)i;
+		rel = frameno + delta;
+		if (rel > instance->depth)
+			return true;
+
+		/* Get callsite from verifier state, not from instance callchain */
+		callsite = q->callsites[i];
+
 		alive = bpf_calls_callback(env, callsite)
-			? is_live_before(instance, callsite, frameno, spi)
-			: is_live_before(instance, callsite + 1, frameno, spi);
+			? is_live_before(instance, callsite, rel, half_spi)
+			: is_live_before(instance, callsite + 1, rel, half_spi);
 		if (alive)
 			return true;
 	}
 
 	return false;
 }
+
+static char *fmt_subprog(struct bpf_verifier_env *env, int subprog)
+{
+	const char *name = env->subprog_info[subprog].name; /* NULL-checked below */
+
+	snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf),
+		 "subprog#%d%s%s", subprog, name ? " " : "", name ? name : "");
+	return env->tmp_str_buf; /* shared scratch buffer, reused by other fmt_*() helpers */
+}
+
+static char *fmt_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+	snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf),
+		 "(d%d,cs%d)", instance->depth, instance->callsite);
+	return env->tmp_str_buf; /* shared scratch buffer, reused by other fmt_*() helpers */
+}
+
+static int spi_off(int spi)
+{
+	return -(spi + 1) * BPF_REG_SIZE; /* spi 0 -> -8, spi 1 -> -16, ... */
+}
+
+/*
+ * Format @spis for @frame into env->tmp_str_buf as a space-separated list.
+ * Both halves of an 8-byte SPI set: "fp0-8"; one half set: "fp0-4h"/"fp0-8h".
+ * Runs of 3+ consecutive fully-set SPIs are collapsed: "fp0-8..-24"
+ */
+static char *fmt_spis_mask(struct bpf_verifier_env *env, int frame, bool first, spis_t spis)
+{
+	int buf_sz = sizeof(env->tmp_str_buf);
+	char *buf = env->tmp_str_buf;
+	int spi, n, run_start;
+
+	buf[0] = '\0';
+
+	for (spi = 0; spi < STACK_SLOTS / 2 && buf_sz > 0; spi++) {
+		bool lo = spis_test_bit(spis, spi * 2);
+		bool hi = spis_test_bit(spis, spi * 2 + 1);
+		const char *space = first ? "" : " ";
+
+		if (!lo && !hi)
+			continue;
+
+		if (!lo || !hi) {
+			/* half-spi */
+			n = scnprintf(buf, buf_sz, "%sfp%d%d%s",
+				      space, frame, spi_off(spi) + (lo ? STACK_SLOT_SZ : 0), "h");
+		} else if (spi + 2 < STACK_SLOTS / 2 &&
+			   spis_test_bit(spis, spi * 2 + 2) &&
+			   spis_test_bit(spis, spi * 2 + 3) &&
+			   spis_test_bit(spis, spi * 2 + 4) &&
+			   spis_test_bit(spis, spi * 2 + 5)) {
+			/* 3+ consecutive full spis */
+			run_start = spi;
+			while (spi + 1 < STACK_SLOTS / 2 &&
+			       spis_test_bit(spis, (spi + 1) * 2) &&
+			       spis_test_bit(spis, (spi + 1) * 2 + 1))
+				spi++; /* extend the run; spi ends on its last full spi */
+			n = scnprintf(buf, buf_sz, "%sfp%d%d..%d",
+				      space, frame, spi_off(run_start), spi_off(spi));
+		} else {
+			/* just a full spi */
+			n = scnprintf(buf, buf_sz, "%sfp%d%d", space, frame, spi_off(spi));
+		}
+		first = false;
+		buf += n;
+		buf_sz -= n;
+	}
+	return env->tmp_str_buf;
+}
+
+static void print_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+	int start = env->subprog_info[instance->subprog].start;
+	struct bpf_insn *insns = env->prog->insnsi;
+	struct per_frame_masks *masks;
+	int len = instance->insn_cnt;
+	int insn_idx, frame, i;
+	bool has_use, has_def;
+	u64 pos, insn_pos;
+
+	if (!(env->log.level & BPF_LOG_LEVEL2))
+		return; /* nothing is printed below log level 2 */
+
+	verbose(env, "stack use/def %s ", fmt_subprog(env, instance->subprog));
+	verbose(env, "%s:\n", fmt_instance(env, instance));
+	for (i = 0; i < len; i++) {
+		insn_idx = start + i;
+		has_use = false;
+		has_def = false;
+		pos = env->log.end_pos;
+		verbose(env, "%3d: ", insn_idx);
+		bpf_verbose_insn(env, &insns[insn_idx]);
+		bpf_vlog_reset(&env->log, env->log.end_pos - 1); /* remove \n */
+		insn_pos = env->log.end_pos; /* log position right after the insn text */
+		verbose(env, "%*c;", bpf_vlog_alignment(insn_pos - pos), ' ');
+		pos = env->log.end_pos;
+		verbose(env, " use: ");
+		for (frame = instance->depth; frame >= 0; --frame) {
+			masks = get_frame_masks(instance, frame, insn_idx);
+			if (!masks || spis_is_zero(masks->may_read))
+				continue;
+			verbose(env, "%s", fmt_spis_mask(env, frame, !has_use, masks->may_read));
+			has_use = true;
+		}
+		if (!has_use)
+			bpf_vlog_reset(&env->log, pos); /* drop the empty " use: " label */
+		pos = env->log.end_pos;
+		verbose(env, " def: ");
+		for (frame = instance->depth; frame >= 0; --frame) {
+			masks = get_frame_masks(instance, frame, insn_idx);
+			if (!masks || spis_is_zero(masks->must_write))
+				continue;
+			verbose(env, "%s", fmt_spis_mask(env, frame, !has_def, masks->must_write));
+			has_def = true;
+		}
+		if (!has_def) /* drop " def: "; with no use either, drop padding and ';' too */
+			bpf_vlog_reset(&env->log, has_use ? pos : insn_pos);
+		verbose(env, "\n");
+		if (bpf_is_ldimm64(&insns[insn_idx]))
+			i++; /* skip the second insn slot of ldimm64 */
+	}
+}
+
+static int cmp_instances(const void *pa, const void *pb)
+{
+	struct func_instance *a = *(struct func_instance **)pa;
+	struct func_instance *b = *(struct func_instance **)pb;
+	int dcallsite = (int)a->callsite - b->callsite;
+	int ddepth = (int)a->depth - b->depth;
+
+	if (dcallsite) /* primary sort key: callsite */
+		return dcallsite;
+	if (ddepth) /* secondary sort key: depth */
+		return ddepth;
+	return 0;
+}
+
+/* Print use/def slots of all instances sorted by callsite, then depth; returns 0 or -ENOMEM */
+static int print_instances(struct bpf_verifier_env *env)
+{
+	struct func_instance *instance, **sorted_instances;
+	struct bpf_liveness *liveness = env->liveness;
+	int i, bkt, cnt;
+
+	cnt = 0;
+	hash_for_each(liveness->func_instances, bkt, instance, hl_node)
+		cnt++; /* first pass: count instances to size the array */
+	sorted_instances = kvmalloc_objs(*sorted_instances, cnt, GFP_KERNEL_ACCOUNT);
+	if (!sorted_instances)
+		return -ENOMEM;
+	cnt = 0;
+	hash_for_each(liveness->func_instances, bkt, instance, hl_node)
+		sorted_instances[cnt++] = instance; /* second pass: collect pointers */
+	sort(sorted_instances, cnt, sizeof(*sorted_instances), cmp_instances, NULL);
+	for (i = 0; i < cnt; i++)
+		print_instance(env, sorted_instances[i]);
+	kvfree(sorted_instances);
+	return 0;
+}
+
+/*
+ * Per-register tracking state for compute_subprog_args().
+ * Tracks which frame's FP a value is derived from
+ * and the byte offset from that frame's FP.
+ *
+ * The .frame field forms a lattice with three levels of precision:
+ *
+ *   precise {frame=N, off_cnt=1..4, off[]} -- known frame index and byte offset(s)
+ *        |
+ *   offset-imprecise {frame=N, off_cnt=0}
+ *        |                        -- known frame identity, unknown offset
+ *   fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask}
+ *                                 -- unknown frame identity; .mask is a
+ *                                    bitmask of which frame indices might be
+ *                                    involved
+ *
+ * At CFG merge points, arg_track_join() moves down the lattice:
+ *   - same frame + same offset  -> precise
+ *   - same frame + different offsets -> merged set; offset-imprecise if > MAX_ARG_OFFSETS
+ *   - different frames          -> fully-imprecise (bitmask OR)
+ *
+ * At memory access sites (LDX/STX/ST), offset-imprecise marks only
+ * the known frame's access mask as SPIS_ALL, while fully-imprecise
+ * iterates bits in the bitmask and routes each frame to its target.
+ */
+#define MAX_ARG_OFFSETS 4
+
+struct arg_track {
+	union {
+		s16 off[MAX_ARG_OFFSETS]; /* byte offsets; off_cnt says how many */
+		u16 mask;	/* frame bitmask when frame == ARG_IMPRECISE */
+	};
+	s8 frame;	/* absolute frame index, or enum arg_track_state */
+	s8 off_cnt;	/* 0 = offset-imprecise, 1-4 = # of precise offsets */
+};
+
+enum arg_track_state {
+	ARG_NONE	= -1,	/* not derived from any frame pointer */
+	ARG_UNVISITED	= -2,	/* not yet reached by dataflow */
+	ARG_IMPRECISE	= -3,	/* lost frame identity; .mask is frame bitmask */
+};
+
+#define OFF_IMPRECISE	S16_MIN	/* sentinel offset: unknown or not representable in s16 */
+
+/* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */
+#define MAX_ARG_SPILL_SLOTS 64
+
+static bool arg_is_visited(const struct arg_track *at)
+{
+	return at->frame != ARG_UNVISITED; /* any state other than "not yet reached" */
+}
+
+static bool arg_is_fp(const struct arg_track *at)
+{
+	return at->frame >= 0 || at->frame == ARG_IMPRECISE; /* known frame or frame bitmask */
+}
+
+/*
+ * Reset to ARG_NONE every tracked callee stack slot overlapping the byte range
+ * [off, off+sz-1] (off is FP-relative); off == OFF_IMPRECISE resets all slots.
+ */
+static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz)
+{
+	struct arg_track none = { .frame = ARG_NONE };
+
+	if (off == OFF_IMPRECISE) {
+		for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++)
+			at_stack[i] = none;
+		return;
+	}
+	for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
+		int slot_start = -((i + 1) * 8); /* slot i covers [slot_start, slot_end) */
+		int slot_end = slot_start + 8;
+
+		if (slot_start < off + (int)sz && slot_end > off)
+			at_stack[i] = none;
+	}
+}
+
+static void verbose_arg_track(struct bpf_verifier_env *env, struct arg_track *at)
+{
+	int i;
+
+	switch (at->frame) {
+	case ARG_NONE:      verbose(env, "_");                          break;
+	case ARG_UNVISITED: verbose(env, "?");                          break;
+	case ARG_IMPRECISE: verbose(env, "IMP%x", at->mask);            break;
+	default:
+		/* frame >= 0: absolute frame index */
+		if (at->off_cnt == 0) { /* offset-imprecise */
+			verbose(env, "fp%d ?", at->frame);
+		} else {
+			for (i = 0; i < at->off_cnt; i++) {
+				if (i)
+					verbose(env, "|"); /* separator between candidate offsets */
+				verbose(env, "fp%d%+d", at->frame, at->off[i]);
+			}
+		}
+		break;
+	}
+}
+
+static bool arg_track_eq(const struct arg_track *a, const struct arg_track *b)
+{
+	int i;
+
+	if (a->frame != b->frame)
+		return false;
+	if (a->frame == ARG_IMPRECISE)
+		return a->mask == b->mask;
+	if (a->frame < 0) /* ARG_NONE / ARG_UNVISITED: nothing else is compared */
+		return true;
+	if (a->off_cnt != b->off_cnt)
+		return false;
+	for (i = 0; i < a->off_cnt; i++)
+		if (a->off[i] != b->off[i]) /* element-wise, so offset order matters */
+			return false;
+	return true;
+}
+
+static struct arg_track arg_single(s8 arg, s16 off)
+{
+	struct arg_track at = {};
+
+	at.frame = arg;
+	at.off[0] = off;
+	at.off_cnt = 1; /* exactly one precise offset */
+	return at;
+}
+
+/*
+ * Merge two sorted offset arrays, deduplicate.
+ * Returns offset-imprecise (off_cnt=0) if the union exceeds MAX_ARG_OFFSETS.
+ * Both args must have the same frame and off_cnt > 0.
+ */
+static struct arg_track arg_merge_offsets(struct arg_track a, struct arg_track b)
+{
+	struct arg_track result = { .frame = a.frame };
+	struct arg_track imp = { .frame = a.frame }; /* off_cnt == 0: offset-imprecise */
+	int i = 0, j = 0, k = 0;
+
+	while (i < a.off_cnt && j < b.off_cnt) {
+		s16 v;
+
+		if (a.off[i] <= b.off[j]) {
+			v = a.off[i++];
+			if (v == b.off[j])
+				j++; /* same offset in both inputs: consume it from b too */
+		} else {
+			v = b.off[j++];
+		}
+		if (k > 0 && result.off[k - 1] == v)
+			continue;
+		if (k >= MAX_ARG_OFFSETS)
+			return imp;
+		result.off[k++] = v;
+	}
+	while (i < a.off_cnt) {
+		if (k >= MAX_ARG_OFFSETS)
+			return imp;
+		result.off[k++] = a.off[i++];
+	}
+	while (j < b.off_cnt) {
+		if (k >= MAX_ARG_OFFSETS)
+			return imp;
+		result.off[k++] = b.off[j++];
+	}
+	result.off_cnt = k;
+	return result;
+}
+
+/*
+ * Merge two arg_tracks into ARG_IMPRECISE, collecting the frame
+ * bits from both operands. Known frame indices (frame >= 0)
+ * contribute a single bit; existing ARG_IMPRECISE values contribute
+ * their full bitmask; ARG_NONE/ARG_UNVISITED contribute nothing.
+ */
+static struct arg_track arg_join_imprecise(struct arg_track a, struct arg_track b)
+{
+	u32 m = 0;
+
+	if (a.frame >= 0)
+		m |= BIT(a.frame);
+	else if (a.frame == ARG_IMPRECISE)
+		m |= a.mask;
+
+	if (b.frame >= 0)
+		m |= BIT(b.frame);
+	else if (b.frame == ARG_IMPRECISE)
+		m |= b.mask;
+
+	return (struct arg_track){ .mask = m, .frame = ARG_IMPRECISE };
+}
+
+/* Join two arg_track values at merge points; ARG_UNVISITED acts as the identity */
+static struct arg_track __arg_track_join(struct arg_track a, struct arg_track b)
+{
+	if (!arg_is_visited(&b))
+		return a;
+	if (!arg_is_visited(&a))
+		return b;
+	if (a.frame == b.frame && a.frame >= 0) {
+		/* Either side offset-imprecise: result is offset-imprecise */
+		if (a.off_cnt == 0 || b.off_cnt == 0)
+			return (struct arg_track){ .frame = a.frame };
+		/* Merge offset sets; falls back to off_cnt=0 if >4 */
+		return arg_merge_offsets(a, b);
+	}
+
+	/*
+	 * args are different, but one of them is known
+	 * arg + none -> arg
+	 * none + arg -> arg
+	 *
+	 * none + none -> none
+	 */
+	if (a.frame == ARG_NONE && b.frame == ARG_NONE)
+		return a;
+	if (a.frame >= 0 && b.frame == ARG_NONE) {
+		/*
+		 * When joining single fp-N add fake fp+0 to
+		 * keep stack_use and prevent stack_def
+		 */
+		if (a.off_cnt == 1)
+			return arg_merge_offsets(a, arg_single(a.frame, 0));
+		return a;
+	}
+	if (b.frame >= 0 && a.frame == ARG_NONE) {
+		if (b.off_cnt == 1) /* same fake fp+0 handling as the mirrored case above */
+			return arg_merge_offsets(b, arg_single(b.frame, 0));
+		return b;
+	}
+
+	return arg_join_imprecise(a, b);
+}
+
+static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, int r,
+			   struct arg_track *in, struct arg_track out)
+{
+	struct arg_track old = *in;
+	struct arg_track new_val = __arg_track_join(old, out);
+
+	if (arg_track_eq(&new_val, &old))
+		return false; /* *in is left unchanged */
+
+	*in = new_val;
+	if (!(env->log.level & BPF_LOG_LEVEL2) || !arg_is_visited(&old))
+		return true; /* changed, but the first visit of a state is not logged */
+
+	verbose(env, "arg JOIN insn %d -> %d ", idx, target);
+	if (r >= 0)
+		verbose(env, "r%d: ", r);
+	else
+		verbose(env, "fp%+d: ", r * 8); /* negative r is printed as stack offset r * 8 */
+	verbose_arg_track(env, &old);
+	verbose(env, " + ");
+	verbose_arg_track(env, &out);
+	verbose(env, " => ");
+	verbose_arg_track(env, &new_val);
+	verbose(env, "\n");
+	return true;
+}
+
+/*
+ * Compute the result when an ALU op destroys offset precision.
+ * If a single frame is identifiable, keep it as offset-imprecise (off_cnt = 0).
+ * If two different frames are involved or one is already ARG_IMPRECISE,
+ * the result is fully ARG_IMPRECISE.
+ */
+static void arg_track_alu64(struct arg_track *dst, const struct arg_track *src)
+{
+	WARN_ON_ONCE(!arg_is_visited(dst));
+	WARN_ON_ONCE(!arg_is_visited(src));
+
+	if (dst->frame >= 0 && (src->frame == ARG_NONE || src->frame == dst->frame)) {
+		/*
+		 * rX += rY where rY is not arg derived
+		 * rX += rX
+		 */
+		dst->off_cnt = 0;
+		return;
+	}
+	if (src->frame >= 0 && dst->frame == ARG_NONE) {
+		/*
+		 * rX += rY where rX is not arg derived
+		 * rY identity leaks into rX
+		 */
+		dst->off_cnt = 0;
+		dst->frame = src->frame;
+		return;
+	}
+
+	if (dst->frame == ARG_NONE && src->frame == ARG_NONE)
+		return; /* neither operand is FP-derived: dst stays ARG_NONE */
+
+	*dst = arg_join_imprecise(*dst, *src);
+}
+
+static s16 arg_add(s16 off, s64 delta)
+{
+	s64 res;
+
+	if (off == OFF_IMPRECISE)
+		return OFF_IMPRECISE; /* imprecision is sticky */
+	res = (s64)off + delta;
+	if (res < S16_MIN + 1 || res > S16_MAX) /* S16_MIN itself is the OFF_IMPRECISE sentinel */
+		return OFF_IMPRECISE;
+	return res;
+}
+
+static void arg_padd(struct arg_track *at, s64 delta)
+{
+	int i;
+
+	if (at->off_cnt == 0) /* already offset-imprecise */
+		return;
+	for (i = 0; i < at->off_cnt; i++) {
+		s16 new_off = arg_add(at->off[i], delta);
+
+		if (new_off == OFF_IMPRECISE) {
+			at->off_cnt = 0; /* one unrepresentable offset drops the whole set */
+			return;
+		}
+		at->off[i] = new_off;
+	}
+}
+
+/*
+ * Convert a byte offset from FP to a callee stack slot index.
+ * Returns -1 if out of range or not 8-byte aligned.
+ * Slot 0 = fp-8, slot 1 = fp-16, ..., slot 7 = fp-64, ....
+ */
+static int fp_off_to_slot(s16 off)
+{
+	if (off == OFF_IMPRECISE)
+		return -1;
+	if (off >= 0 || off < -(int)(MAX_ARG_SPILL_SLOTS * 8)) /* outside fp-512 .. fp-1 */
+		return -1;
+	if (off % 8)
+		return -1;
+	return (-off) / 8 - 1;
+}
+
+static struct arg_track fill_from_stack(struct bpf_insn *insn,
+					struct arg_track *at_out, int reg,
+					struct arg_track *at_stack_out,
+					int depth)
+{
+	struct arg_track imp = {
+		.mask = (1u << (depth + 1)) - 1, /* bits for frames 0..depth */
+		.frame = ARG_IMPRECISE
+	};
+	struct arg_track result = { .frame = ARG_NONE };
+	int cnt, i;
+
+	if (reg == BPF_REG_FP) {
+		int slot = fp_off_to_slot(insn->off);
+
+		return slot >= 0 ? at_stack_out[slot] : imp;
+	}
+	cnt = at_out[reg].off_cnt;
+	if (cnt == 0) /* offset unknown: cannot tell which slot is loaded */
+		return imp;
+
+	for (i = 0; i < cnt; i++) {
+		s16 fp_off = arg_add(at_out[reg].off[i], insn->off);
+		int slot = fp_off_to_slot(fp_off);
+
+		if (slot < 0) /* candidate is out of range or unaligned */
+			return imp;
+		result = __arg_track_join(result, at_stack_out[slot]);
+	}
+	return result;
+}
+
+/*
+ * Spill @val to all possible stack slots indicated by the FP offsets in @reg.
+ * For an 8-byte store, a single candidate slot gets @val; multiple candidates are joined.
+ * A sub-8-byte store spills ARG_NONE in place of @val.
+ * When the offset is unknown, conservatively join the value into every slot of at_stack_out.
+ */
+static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out,
+			   int reg, struct arg_track *at_stack_out,
+			   struct arg_track *val, u32 sz)
+{
+	struct arg_track none = { .frame = ARG_NONE };
+	struct arg_track new_val = sz == 8 ? *val : none;
+	int cnt, i;
+
+	if (reg == BPF_REG_FP) {
+		int slot = fp_off_to_slot(insn->off);
+
+		if (slot >= 0)
+			at_stack_out[slot] = new_val;
+		return;
+	}
+	cnt = at_out[reg].off_cnt;
+	if (cnt == 0) {
+		for (int slot = 0; slot < MAX_ARG_SPILL_SLOTS; slot++)
+			at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val);
+		return;
+	}
+	for (i = 0; i < cnt; i++) {
+		s16 fp_off = arg_add(at_out[reg].off[i], insn->off);
+		int slot = fp_off_to_slot(fp_off);
+
+		if (slot < 0) /* untracked or unaligned candidate: nothing to update */
+			continue;
+		if (cnt == 1)
+			at_stack_out[slot] = new_val;
+		else
+			at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val);
+	}
+}
+
+/*
+ * Clear stack slots overlapping any possible FP offset in @reg (all slots if offset unknown).
+ */
+static void clear_stack_for_all_offs(struct bpf_insn *insn,
+				     struct arg_track *at_out, int reg,
+				     struct arg_track *at_stack_out, u32 sz)
+{
+	int cnt, i;
+
+	if (reg == BPF_REG_FP) {
+		clear_overlapping_stack_slots(at_stack_out, insn->off, sz);
+		return;
+	}
+	cnt = at_out[reg].off_cnt;
+	if (cnt == 0) {
+		clear_overlapping_stack_slots(at_stack_out, OFF_IMPRECISE, sz);
+		return;
+	}
+	for (i = 0; i < cnt; i++) {
+		s16 fp_off = arg_add(at_out[reg].off[i], insn->off);
+
+		clear_overlapping_stack_slots(at_stack_out, fp_off, sz);
+	}
+}
+
+static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, int idx,
+			  struct arg_track *at_in, struct arg_track *at_stack_in,
+			  struct arg_track *at_out, struct arg_track *at_stack_out)
+{
+	bool printed = false;
+	int i;
+
+	if (!(env->log.level & BPF_LOG_LEVEL2))
+		return;
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		if (arg_track_eq(&at_out[i], &at_in[i]))
+			continue; /* only registers changed by this insn are printed */
+		if (!printed) {
+			verbose(env, "%3d: ", idx);
+			bpf_verbose_insn(env, insn);
+			bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+			printed = true;
+		}
+		verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]);
+		verbose(env, " -> "); verbose_arg_track(env, &at_out[i]);
+	}
+	for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) {
+		if (arg_track_eq(&at_stack_out[i], &at_stack_in[i]))
+			continue; /* only stack slots changed by this insn are printed */
+		if (!printed) {
+			verbose(env, "%3d: ", idx);
+			bpf_verbose_insn(env, insn);
+			bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+			printed = true;
+		}
+		verbose(env, "\tfp%+d: ", -(i + 1) * 8); verbose_arg_track(env, &at_stack_in[i]);
+		verbose(env, " -> "); verbose_arg_track(env, &at_stack_out[i]);
+	}
+	if (printed)
+		verbose(env, "\n");
+}
+
+/*
+ * Pure dataflow transfer function for arg_track state.
+ * Updates at_out[] (registers) and at_stack_out[] (spill slots) in place for @insn.
+ * Tracks spill/fill, but not other memory accesses.
+ */
+static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			   int insn_idx,
+			   struct arg_track *at_out, struct arg_track *at_stack_out,
+			   struct func_instance *instance,
+			   u32 *callsites)
+{
+	int depth = instance->depth;
+	u8 class = BPF_CLASS(insn->code);
+	u8 code = BPF_OP(insn->code);
+	struct arg_track *dst = &at_out[insn->dst_reg];
+	struct arg_track *src = &at_out[insn->src_reg];
+	struct arg_track none = { .frame = ARG_NONE };
+	int r;
+
+	if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) {
+		if (code == BPF_MOV) {
+			*dst = none;
+		} else if (dst->frame >= 0) {
+			if (code == BPF_ADD)
+				arg_padd(dst, insn->imm);
+			else if (code == BPF_SUB)
+				arg_padd(dst, -(s64)insn->imm);
+			else
+				/* Any other 64-bit alu on the pointer makes it imprecise */
+				dst->off_cnt = 0;
+		} /* else if dst->frame is imprecise it stays so */
+	} else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_X) {
+		if (code == BPF_MOV) {
+			if (insn->off == 0) {
+				*dst = *src;
+			} else {
+				/* addr_space_cast destroys a pointer */
+				*dst = none;
+			}
+		} else {
+			arg_track_alu64(dst, src);
+		}
+	} else if (class == BPF_ALU) {
+		/*
+		 * 32-bit alu destroys the pointer.
+		 * If src was a pointer it cannot leak into dst
+		 */
+		*dst = none;
+	} else if (class == BPF_JMP && code == BPF_CALL) {
+		/*
+		 * at_stack_out[slot] is not cleared by the helper and subprog calls.
+		 * The fill_from_stack() may return the stale spill — which is an FP-derived arg_track
+		 * (the value that was originally spilled there). The loaded register then carries
+		 * a phantom FP-derived identity that doesn't correspond to what's actually in the slot.
+		 * This phantom FP pointer propagates forward, and wherever it's subsequently used
+		 * (as a helper argument, another store, etc.), it sets stack liveness bits.
+		 * Those bits correspond to stack accesses that don't actually happen.
+		 * So the effect is over-reporting stack liveness — marking slots as live that aren't
+		 * actually accessed. The verifier preserves more state than necessary across calls,
+		 * which is conservative.
+		 *
+		 * helpers can scratch stack slots, but they won't make a valid pointer out of it.
+		 * subprogs are allowed to write into parent slots, but they cannot write
+		 * _any_ FP-derived pointer into it (either their own or parent's FP).
+		 */
+		for (r = BPF_REG_0; r <= BPF_REG_5; r++)
+			at_out[r] = none; /* R0-R5 are reset to ARG_NONE across any call */
+	} else if (class == BPF_LDX) {
+		u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+		bool src_is_local_fp = insn->src_reg == BPF_REG_FP || src->frame == depth ||
+				       (src->frame == ARG_IMPRECISE && (src->mask & BIT(depth)));
+
+		/*
+		 * Reload from callee stack: if src is current-frame FP-derived
+		 * and the load is an 8-byte BPF_MEM, try to restore the spill
+		 * identity.  For imprecise sources fill_from_stack() returns
+		 * ARG_IMPRECISE (off_cnt == 0).
+		 */
+		if (src_is_local_fp && BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+			*dst = fill_from_stack(insn, at_out, insn->src_reg, at_stack_out, depth);
+		} else if (src->frame >= 0 && src->frame < depth &&
+			   BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+			struct arg_track *parent_stack =
+				env->callsite_at_stack[callsites[src->frame]];
+
+			*dst = fill_from_stack(insn, at_out, insn->src_reg,
+					       parent_stack, src->frame);
+		} else if (src->frame == ARG_IMPRECISE &&
+			   !(src->mask & BIT(depth)) && src->mask &&
+			   BPF_MODE(insn->code) == BPF_MEM && sz == 8) {
+			/*
+			 * Imprecise src with only parent-frame bits:
+			 * conservative fallback.
+			 */
+			*dst = *src;
+		} else {
+			*dst = none; /* non-stack source, sub-8-byte load or non-BPF_MEM mode */
+		}
+	} else if (class == BPF_LD && BPF_MODE(insn->code) == BPF_IMM) {
+		*dst = none;
+	} else if (class == BPF_STX) {
+		u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+		bool dst_is_local_fp;
+
+		/* Track spills to current-frame FP-derived callee stack */
+		dst_is_local_fp = insn->dst_reg == BPF_REG_FP || dst->frame == depth;
+		if (dst_is_local_fp && BPF_MODE(insn->code) == BPF_MEM)
+			spill_to_stack(insn, at_out, insn->dst_reg,
+				       at_stack_out, src, sz);
+
+		if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+			if (dst_is_local_fp && insn->imm != BPF_LOAD_ACQ)
+				clear_stack_for_all_offs(insn, at_out, insn->dst_reg,
+							 at_stack_out, sz);
+
+			if (insn->imm == BPF_CMPXCHG)
+				at_out[BPF_REG_0] = none; /* cmpxchg: R0 is reset */
+			else if (insn->imm == BPF_LOAD_ACQ)
+				*dst = none; /* load-acquire: dst_reg is the loaded register */
+			else if (insn->imm & BPF_FETCH)
+				*src = none; /* other fetch variants: src_reg is reset */
+		}
+	} else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) {
+		u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+		bool dst_is_local_fp = insn->dst_reg == BPF_REG_FP || dst->frame == depth;
+
+		/* BPF_ST to FP-derived dst: clear overlapping stack slots */
+		if (dst_is_local_fp)
+			clear_stack_for_all_offs(insn, at_out, insn->dst_reg,
+						 at_stack_out, sz);
+	}
+}
+
+/*
+ * Record access_bytes at FP-relative offset fp_off, from helper/kfunc or load/store insn.
+ *   access_bytes > 0:      stack read, every touched 4-byte slot is marked
+ *   access_bytes < 0:      stack write, only fully covered 4-byte slots are marked
+ *   access_bytes == S64_MIN: unknown   — conservative, mark [0..slot] as read
+ *   access_bytes == 0:      no access
+ * Returns 0 or the error from mark_stack_read()/mark_stack_write().
+ */
+static int record_stack_access_off(struct func_instance *instance, s64 fp_off,
+				   s64 access_bytes, u32 frame, u32 insn_idx)
+{
+	s32 slot_hi, slot_lo;
+	spis_t mask;
+
+	if (fp_off >= 0)
+		/*
+		 * out of bounds stack access doesn't contribute
+		 * into actual stack liveness. It will be rejected
+		 * by the main verifier pass later.
+		 */
+		return 0;
+	if (access_bytes == S64_MIN) {
+		/* helper/kfunc read unknown amount of bytes from fp_off until fp+0 */
+		slot_hi = (-fp_off - 1) / STACK_SLOT_SZ;
+		mask = SPIS_ZERO;
+		spis_or_range(&mask, 0, slot_hi);
+		return mark_stack_read(instance, frame, insn_idx, mask);
+	}
+	if (access_bytes > 0) {
+		/* Mark any touched slot as use */
+		slot_hi = (-fp_off - 1) / STACK_SLOT_SZ;
+		slot_lo = max_t(s32, (-fp_off - access_bytes) / STACK_SLOT_SZ, 0);
+		mask = SPIS_ZERO;
+		spis_or_range(&mask, slot_lo, slot_hi);
+		return mark_stack_read(instance, frame, insn_idx, mask);
+	} else if (access_bytes < 0) {
+		/* Mark only fully covered slots as def */
+		access_bytes = -access_bytes;
+		slot_hi = (-fp_off) / STACK_SLOT_SZ - 1;
+		slot_lo = max_t(s32, (-fp_off - access_bytes + STACK_SLOT_SZ - 1) / STACK_SLOT_SZ, 0);
+		if (slot_lo <= slot_hi) { /* empty range: no slot is fully covered */
+			mask = SPIS_ZERO;
+			spis_or_range(&mask, slot_lo, slot_hi);
+			return mark_stack_write(instance, frame, insn_idx, mask);
+		}
+	}
+	return 0;
+}
+
+/*
+ * 'arg' is FP-derived argument to helper/kfunc or load/store that reads (positive) or
+ * writes (negative) 'access_bytes'. Unknown offset: reads mark SPIS_ALL, writes mark nothing.
+ */
+static int record_stack_access(struct func_instance *instance,
+			       const struct arg_track *arg,
+			       s64 access_bytes, u32 frame, u32 insn_idx)
+{
+	int i, err;
+
+	if (access_bytes == 0)
+		return 0;
+	if (arg->off_cnt == 0) {
+		if (access_bytes > 0 || access_bytes == S64_MIN)
+			return mark_stack_read(instance, frame, insn_idx, SPIS_ALL);
+		return 0;
+	}
+	if (access_bytes != S64_MIN && access_bytes < 0 && arg->off_cnt != 1)
+		/* multi-offset write cannot set stack_def */
+		return 0;
+
+	for (i = 0; i < arg->off_cnt; i++) {
+		err = record_stack_access_off(instance, arg->off[i], access_bytes, frame, insn_idx);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * When a pointer is ARG_IMPRECISE, conservatively mark every frame in
+ * the bitmask (up to instance->depth) as fully read (SPIS_ALL).
+ */
+static int record_imprecise(struct func_instance *instance, u32 mask, u32 insn_idx)
+{
+	int depth = instance->depth;
+	int f, err;
+
+	for (f = 0; mask; f++, mask >>= 1) {
+		if (!(mask & 1))
+			continue;
+		if (f <= depth) { /* bits above depth are ignored */
+			err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/* Record load/store access for a given 'at' state of 'insn'; sz < 0 below denotes a write. */
+static int record_load_store_access(struct bpf_verifier_env *env,
+				    struct func_instance *instance,
+				    struct arg_track *at, int insn_idx)
+{
+	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+	int depth = instance->depth;
+	s32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code));
+	u8 class = BPF_CLASS(insn->code);
+	struct arg_track resolved, *ptr;
+	int oi;
+
+	switch (class) {
+	case BPF_LDX:
+		ptr = &at[insn->src_reg];
+		break;
+	case BPF_STX:
+		if (BPF_MODE(insn->code) == BPF_ATOMIC) { /* only STORE_REL is recorded as write */
+			if (insn->imm == BPF_STORE_REL)
+				sz = -sz;
+			if (insn->imm == BPF_LOAD_ACQ)
+				ptr = &at[insn->src_reg]; /* load-acquire addresses via src_reg */
+			else
+				ptr = &at[insn->dst_reg];
+		} else {
+			ptr = &at[insn->dst_reg];
+			sz = -sz;
+		}
+		break;
+	case BPF_ST:
+		ptr = &at[insn->dst_reg];
+		sz = -sz;
+		break;
+	default:
+		return 0;
+	}
+
+	/* Resolve offsets: fold insn->off into arg_track */
+	if (ptr->off_cnt > 0) {
+		resolved.off_cnt = ptr->off_cnt;
+		resolved.frame = ptr->frame;
+		for (oi = 0; oi < ptr->off_cnt; oi++) {
+			resolved.off[oi] = arg_add(ptr->off[oi], insn->off);
+			if (resolved.off[oi] == OFF_IMPRECISE) {
+				resolved.off_cnt = 0; /* degrade to offset-imprecise */
+				break;
+			}
+		}
+		ptr = &resolved;
+	}
+
+	if (ptr->frame >= 0 && ptr->frame <= depth)
+		return record_stack_access(instance, ptr, sz, ptr->frame, insn_idx);
+	if (ptr->frame == ARG_IMPRECISE)
+		return record_imprecise(instance, ptr->mask, insn_idx);
+	/* ARG_NONE: not derived from any frame pointer, skip */
+	return 0;
+}
+
+/* Record stack access for a given 'at' state of helper/kfunc 'insn'; subprog calls are skipped */
+static int record_call_access(struct bpf_verifier_env *env,
+			      struct func_instance *instance,
+			      struct arg_track *at,
+			      int insn_idx)
+{
+	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+	int depth = instance->depth;
+	struct bpf_call_summary cs;
+	int r, err = 0, num_params = 5; /* default: inspect R1-R5 */
+
+	if (bpf_pseudo_call(insn))
+		return 0;
+
+	if (bpf_get_call_summary(env, insn, &cs))
+		num_params = cs.num_params;
+
+	for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) {
+		int frame = at[r].frame;
+		s64 bytes;
+
+		if (!arg_is_fp(&at[r]))
+			continue;
+
+		if (bpf_helper_call(insn)) {
+			bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx);
+		} else if (bpf_pseudo_kfunc_call(insn)) {
+			bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx);
+		} else { /* neither helper nor kfunc: mark frames 0..depth as fully read */
+			for (int f = 0; f <= depth; f++) {
+				err = mark_stack_read(instance, f, insn_idx, SPIS_ALL);
+				if (err)
+					return err;
+			}
+			return 0;
+		}
+		if (bytes == 0)
+			continue;
+
+		if (frame >= 0 && frame <= depth)
+			err = record_stack_access(instance, &at[r], bytes, frame, insn_idx);
+		else if (frame == ARG_IMPRECISE)
+			err = record_imprecise(instance, at[r].mask, insn_idx);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * For a calls_callback helper, return the callback subprog and the caller -> callback register
+ * mapping for FP passthrough. Returns -1 for other insns, -2 if cb reg lacks a const subprog.
+ */
+static int find_callback_subprog(struct bpf_verifier_env *env,
+				 struct bpf_insn *insn, int insn_idx,
+				 int *caller_reg, int *callee_reg)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+	int cb_reg = -1;
+
+	*caller_reg = -1;
+	*callee_reg = -1;
+
+	if (!bpf_helper_call(insn))
+		return -1;
+	switch (insn->imm) {
+	case BPF_FUNC_loop:
+		/* bpf_loop(nr, cb, ctx, flags): cb=R2, R3->cb R2 */
+		cb_reg = BPF_REG_2;
+		*caller_reg = BPF_REG_3;
+		*callee_reg = BPF_REG_2;
+		break;
+	case BPF_FUNC_for_each_map_elem:
+		/* for_each_map_elem(map, cb, ctx, flags): cb=R2, R3->cb R4 */
+		cb_reg = BPF_REG_2;
+		*caller_reg = BPF_REG_3;
+		*callee_reg = BPF_REG_4;
+		break;
+	case BPF_FUNC_find_vma:
+		/* find_vma(task, addr, cb, ctx, flags): cb=R3, R4->cb R3 */
+		cb_reg = BPF_REG_3;
+		*caller_reg = BPF_REG_4;
+		*callee_reg = BPF_REG_3;
+		break;
+	case BPF_FUNC_user_ringbuf_drain:
+		/* user_ringbuf_drain(map, cb, ctx, flags): cb=R2, R3->cb R2 */
+		cb_reg = BPF_REG_2;
+		*caller_reg = BPF_REG_3;
+		*callee_reg = BPF_REG_2;
+		break;
+	default:
+		return -1; /* helper without a known callback argument */
+	}
+
+	if (!(aux->const_reg_subprog_mask & BIT(cb_reg)))
+		return -2; /* *caller_reg / *callee_reg are already set at this point */
+
+	return aux->const_reg_vals[cb_reg];
+}
+
+/* Per-subprog converged arg-tracking state, kept alive across analysis phases */
+struct subprog_at_info {
+	struct arg_track (*at_in)[MAX_BPF_REG];	/* per-insn register state on entry, 'len' rows */
+	int len;				/* number of insns covered by at_in */
+};
+
+/* At BPF_LOG_LEVEL2, dump 'subprog' insns annotated with FP-derived regs and spill slots */
+static void print_subprog_arg_access(struct bpf_verifier_env *env,
+				     int subprog,
+				     struct subprog_at_info *info,
+				     struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS])
+{
+	struct bpf_insn *insns = env->prog->insnsi;
+	int start = env->subprog_info[subprog].start;
+	int len = info->len;
+	int i, r;
+
+	if (!(env->log.level & BPF_LOG_LEVEL2))
+		return;
+
+	verbose(env, "%s:\n", fmt_subprog(env, subprog));
+	for (i = 0; i < len; i++) {
+		int idx = start + i;
+		bool has_extra = false;
+		u8 cls = BPF_CLASS(insns[idx].code);
+		bool is_ldx_stx_call = cls == BPF_LDX || cls == BPF_STX ||
+				       insns[idx].code == (BPF_JMP | BPF_CALL);
+		/* at_in is only consulted when present and this insn was visited */
+		bool show_regs = is_ldx_stx_call && info->at_in &&
+				 arg_is_visited(&info->at_in[i][0]);
+
+		verbose(env, "%3d: ", idx);
+		bpf_verbose_insn(env, &insns[idx]);
+
+		/* Collect what needs printing */
+		if (show_regs) {
+			for (r = 0; r < MAX_BPF_REG - 1; r++)
+				if (arg_is_fp(&info->at_in[i][r]))
+					has_extra = true;
+		}
+		if (is_ldx_stx_call) {
+			for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+				if (arg_is_fp(&at_stack_in[i][r]))
+					has_extra = true;
+		}
+
+		if (!has_extra) {
+			if (bpf_is_ldimm64(&insns[idx]))
+				i++;
+			continue;
+		}
+
+		/* Drop the last logged character so the annotation follows the insn */
+		bpf_vlog_reset(&env->log, env->log.end_pos - 1);
+		verbose(env, " //");
+		if (show_regs) {
+			for (r = 0; r < MAX_BPF_REG - 1; r++) {
+				if (!arg_is_fp(&info->at_in[i][r]))
+					continue;
+				verbose(env, " r%d=", r);
+				verbose_arg_track(env, &info->at_in[i][r]);
+			}
+		}
+		if (is_ldx_stx_call) {
+			for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) {
+				if (!arg_is_fp(&at_stack_in[i][r]))
+					continue;
+				verbose(env, " fp%+d=", -(r + 1) * 8);
+				verbose_arg_track(env, &at_stack_in[i][r]);
+			}
+		}
+		verbose(env, "\n");
+		if (bpf_is_ldimm64(&insns[idx]))
+			i++;
+	}
+}
+
+/*
+ * Compute arg tracking dataflow for a single subprog instance.
+ * Runs forward fixed-point with arg_track_xfer(), then records
+ * memory accesses in a single linear pass over converged state.
+ * Returns 0, -ENOMEM, or the error of a failed record_*() call.
+ *
+ * @callee_entry: pre-populated entry state for R1-R5, NULL for a root.
+ * @info:         receives at_in (caller must kvfree() it) and len.
+ */
+static int compute_subprog_args(struct bpf_verifier_env *env,
+				struct subprog_at_info *info,
+				struct arg_track *callee_entry,
+				struct func_instance *instance,
+				u32 *callsites)
+{
+	int subprog = instance->subprog;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int depth = instance->depth;
+	int start = env->subprog_info[subprog].start;
+	int po_start = env->subprog_info[subprog].postorder_start;
+	int end = env->subprog_info[subprog + 1].start;
+	int po_end = env->subprog_info[subprog + 1].postorder_start;
+	int len = end - start;
+	struct arg_track (*at_in)[MAX_BPF_REG] = NULL;
+	struct arg_track at_out[MAX_BPF_REG];
+	struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL;
+	struct arg_track *at_stack_out = NULL;
+	struct arg_track unvisited = { .frame = ARG_UNVISITED };
+	struct arg_track none = { .frame = ARG_NONE };
+	bool changed;
+	int i, p, r, err = -ENOMEM;	/* err is preset for the allocation failures below */
+
+	at_in = kvmalloc_objs(*at_in, len, GFP_KERNEL_ACCOUNT);
+	if (!at_in)
+		goto err_free;
+
+	at_stack_in = kvmalloc_objs(*at_stack_in, len, GFP_KERNEL_ACCOUNT);
+	if (!at_stack_in)
+		goto err_free;
+
+	at_stack_out = kvmalloc_objs(*at_stack_out, MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT);
+	if (!at_stack_out)
+		goto err_free;
+
+	for (i = 0; i < len; i++) {
+		for (r = 0; r < MAX_BPF_REG; r++)
+			at_in[i][r] = unvisited;
+		for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+			at_stack_in[i][r] = unvisited;
+	}
+
+	for (r = 0; r < MAX_BPF_REG; r++)
+		at_in[0][r] = none;
+
+	/* Entry: R10 is always precisely the current frame's FP */
+	at_in[0][BPF_REG_FP] = arg_single(depth, 0);
+
+	/* R1-R5: copied from the caller's state, left ARG_NONE when callee_entry is NULL */
+	if (callee_entry) {
+		for (r = BPF_REG_1; r <= BPF_REG_5; r++)
+			at_in[0][r] = callee_entry[r];
+	}
+
+	/* Entry: all stack slots are ARG_NONE */
+	for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+		at_stack_in[0][r] = none;
+
+	if (env->log.level & BPF_LOG_LEVEL2)
+		verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth);
+
+	/* Forward fixed point: sweep in reverse post order until no join changes state */
+redo:
+	changed = false;
+	for (p = po_end - 1; p >= po_start; p--) {
+		int idx = env->cfg.insn_postorder[p];
+		int i = idx - start;
+		struct bpf_insn *insn = &insns[idx];
+		struct bpf_iarray *succ;
+
+		if (!arg_is_visited(&at_in[i][0]) && !arg_is_visited(&at_in[i][1]))
+			continue;	/* presumably: no state has reached this insn yet */
+
+		memcpy(at_out, at_in[i], sizeof(at_out));
+		memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out));
+
+		arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites);
+		arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out);
+
+		/* Join this insn's out-state into each successor within this subprogram */
+		succ = bpf_insn_successors(env, idx);
+		for (int s = 0; s < succ->cnt; s++) {
+			int target = succ->items[s];
+			int ti;
+
+			/* Filter: stay within the subprogram's range */
+			if (target < start || target >= end)
+				continue;
+			ti = target - start;
+
+			for (r = 0; r < MAX_BPF_REG; r++)
+				changed |= arg_track_join(env, idx, target, r,
+							  &at_in[ti][r], at_out[r]);
+
+			for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++)
+				changed |= arg_track_join(env, idx, target, -r - 1,
+							  &at_stack_in[ti][r], at_stack_out[r]);
+		}
+	}
+	if (changed)
+		goto redo;
+
+	/* Record memory accesses using converged at_in (RPO skips dead code) */
+	for (p = po_end - 1; p >= po_start; p--) {
+		int idx = env->cfg.insn_postorder[p];
+		int i = idx - start;
+		struct bpf_insn *insn = &insns[idx];
+
+		err = record_load_store_access(env, instance, at_in[i], idx);
+		if (err)
+			goto err_free;
+
+		if (insn->code == (BPF_JMP | BPF_CALL)) {
+			err = record_call_access(env, instance, at_in[i], idx);
+			if (err)
+				goto err_free;
+		}
+
+		if (bpf_pseudo_call(insn) || bpf_calls_callback(env, idx)) {
+			kvfree(env->callsite_at_stack[idx]);	/* replace any earlier snapshot */
+			env->callsite_at_stack[idx] =
+				kvmalloc_objs(*env->callsite_at_stack[idx],
+					      MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT);
+			if (!env->callsite_at_stack[idx]) {
+				err = -ENOMEM;
+				goto err_free;
+			}
+			memcpy(env->callsite_at_stack[idx],
+			       at_stack_in[i], sizeof(struct arg_track) * MAX_ARG_SPILL_SLOTS);
+		}
+	}
+
+	info->at_in = at_in;
+	at_in = NULL;	/* now owned by info, so the kvfree() below must not release it */
+	info->len = len;
+	print_subprog_arg_access(env, subprog, info, at_stack_in);
+	err = 0;
+
+err_free:
+	kvfree(at_stack_out);
+	kvfree(at_stack_in);
+	kvfree(at_in);
+	return err;
+}
+
+/* Return true if any of R1-R5 in 'args' has a frame other than ARG_NONE. */
+static bool has_fp_args(struct arg_track *args)
+{
+	for (struct arg_track *a = &args[BPF_REG_1]; a <= &args[BPF_REG_5]; a++)
+		if (a->frame != ARG_NONE)
+			return true;
+	return false;
+}
+
+/*
+ * Fold a freshly analyzed instance @src into the original @dst, per frame.
+ * may_read: union (any pass might read the slot).
+ * must_write: intersection (only slots written on ALL passes are guaranteed).
+ * A frame present on only one side therefore ends up with empty must_write;
+ * src's masks are moved into @dst when @dst has none for that frame.
+ * live_before is recomputed by a subsequent update_instance() on @dst.
+ */
+static void merge_instances(struct func_instance *dst, struct func_instance *src)
+{
+	for (int f = 0; f <= dst->depth; f++) {
+		bool merge = dst->frames[f] && src->frames[f];
+
+		/* Frame untouched by the previous passes: adopt src's masks */
+		if (!dst->frames[f] && src->frames[f]) {
+			dst->frames[f] = src->frames[f];
+			src->frames[f] = NULL;
+		}
+		if (!dst->frames[f])
+			continue;
+
+		for (int i = 0; i < dst->insn_cnt; i++) {
+			if (!merge) {
+				/* One side has no marks: the intersection is empty */
+				dst->frames[f][i].must_write = SPIS_ZERO;
+				continue;
+			}
+			dst->frames[f][i].may_read =
+				spis_or(dst->frames[f][i].may_read,
+					src->frames[f][i].may_read);
+			dst->frames[f][i].must_write =
+				spis_and(dst->frames[f][i].must_write,
+					 src->frames[f][i].must_write);
+		}
+	}
+}
+
+/* Allocate a zeroed instance sharing @src's callsite, depth, subprog and insn range */
+static struct func_instance *fresh_instance(struct func_instance *src)
+{
+	struct func_instance *inst = kvzalloc_obj(*inst, GFP_KERNEL_ACCOUNT);
+
+	if (!inst)
+		return ERR_PTR(-ENOMEM);
+	inst->subprog = src->subprog;
+	inst->subprog_start = src->subprog_start;
+	inst->insn_cnt = src->insn_cnt;
+	inst->callsite = src->callsite;
+	inst->depth = src->depth;
+	return inst;
+}
+
+/* Release the per-frame arrays of frames 0..depth and then the instance itself */
+static void free_instance(struct func_instance *instance)
+{
+	for (int f = instance->depth; f >= 0; f--)
+		kvfree(instance->frames[f]);
+
+	kvfree(instance);
+}
+
+/*
+ * Recursively analyze one instance of a subprog with specific 'entry_args',
+ * descending into each callee/callback that is passed a non-ARG_NONE arg.
+ * Args are recomputed for each call because the dataflow result at_in[]
+ * depends on the entry args and frame depth. Consider: A->C->D and B->C->D
+ * Callsites in A and B pass different args into C, so C is recomputed.
+ * Then within C the same callsite passes different args into D.
+ * Returns 0, -E2BIG past 10000 calls, -EINVAL at the frame limit, or an error.
+ */
+static int analyze_subprog(struct bpf_verifier_env *env,
+			   struct arg_track *entry_args,
+			   struct subprog_at_info *info,
+			   struct func_instance *instance,
+			   u32 *callsites)
+{
+	int subprog = instance->subprog;
+	int depth = instance->depth;
+	struct bpf_insn *insns = env->prog->insnsi;
+	int start = env->subprog_info[subprog].start;
+	int po_start = env->subprog_info[subprog].postorder_start;
+	int po_end = env->subprog_info[subprog + 1].postorder_start;
+	struct func_instance *prev_instance = NULL;
+	int j, err;
+
+	if (++env->liveness->subprog_calls > 10000) {
+		verbose(env, "liveness analysis exceeded complexity limit (%d calls)\n",
+			env->liveness->subprog_calls);
+		return -E2BIG;
+	}
+
+	if (need_resched())
+		cond_resched();
+
+
+	/*
+	 * When an instance is reused (must_write_initialized == true),
+	 * record into a fresh instance and merge afterward.  This avoids
+	 * stale must_write marks for instructions not reached in this pass.
+	 */
+	if (instance->must_write_initialized) {
+		struct func_instance *fresh = fresh_instance(instance);
+
+		if (IS_ERR(fresh))
+			return PTR_ERR(fresh);
+		prev_instance = instance;
+		instance = fresh;
+	}
+
+	/* Free the at_in[] of a prior analysis if this subprog was already visited */
+	kvfree(info[subprog].at_in);
+	info[subprog].at_in = NULL;
+
+	err = compute_subprog_args(env, &info[subprog], entry_args, instance, callsites);
+	if (err)
+		goto out_free;
+
+	/* For each reachable call site in the subprog, recurse into callees */
+	for (int p = po_start; p < po_end; p++) {
+		int idx = env->cfg.insn_postorder[p];
+		struct arg_track callee_args[BPF_REG_5 + 1];
+		struct arg_track none = { .frame = ARG_NONE };
+		struct bpf_insn *insn = &insns[idx];
+		struct func_instance *callee_instance;
+		int callee, target;
+		int caller_reg, cb_callee_reg;
+
+		j = idx - start; /* relative index within this subprog */
+
+		if (bpf_pseudo_call(insn)) {
+			target = idx + insn->imm + 1;
+			callee = bpf_find_subprog(env, target);
+			if (callee < 0)
+				continue;
+
+			/* Build entry args: R1-R5 from at_in at call site */
+			for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+				callee_args[r] = info[subprog].at_in[j][r];
+		} else if (bpf_calls_callback(env, idx)) {
+			callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg);
+			if (callee == -2) {
+				/*
+				 * e.g. the same bpf_loop() calls two different callbacks:
+				 * if a tracked arg is passed, mark all frames fully read
+				 */
+				if (info[subprog].at_in[j][caller_reg].frame == ARG_NONE)
+					continue;
+				for (int f = 0; f <= depth; f++) {
+					err = mark_stack_read(instance, f, idx, SPIS_ALL);
+					if (err)
+						goto out_free;
+				}
+				continue;
+			}
+			if (callee < 0)
+				continue;
+
+			for (int r = BPF_REG_1; r <= BPF_REG_5; r++)
+				callee_args[r] = none;
+			callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg];
+		} else {
+			continue;
+		}
+
+		if (!has_fp_args(callee_args))
+			continue;
+
+		if (depth == MAX_CALL_FRAMES - 1) {
+			err = -EINVAL;
+			goto out_free;
+		}
+
+		callee_instance = call_instance(env, instance, idx, callee);
+		if (IS_ERR(callee_instance)) {
+			err = PTR_ERR(callee_instance);
+			goto out_free;
+		}
+		callsites[depth] = idx;
+		err = analyze_subprog(env, callee_args, info, callee_instance, callsites);
+		if (err)
+			goto out_free;
+
+		/* Pull callee's entry liveness of the frames below it back to this callsite */
+		{
+			u32 callee_start = callee_instance->subprog_start;
+			struct per_frame_masks *entry;
+
+			for (int f = 0; f < callee_instance->depth; f++) {
+				entry = get_frame_masks(callee_instance, f, callee_start);
+				if (!entry)
+					continue;
+				err = mark_stack_read(instance, f, idx, entry->live_before);
+				if (err)
+					goto out_free;
+			}
+		}
+	}
+
+	if (prev_instance) {
+		merge_instances(prev_instance, instance);
+		free_instance(instance);
+		instance = prev_instance;
+	}
+	update_instance(env, instance);
+	return 0;
+
+out_free:
+	if (prev_instance)
+		free_instance(instance);	/* only the fresh copy is freed here */
+	return err;
+}
+
+/*
+ * Analyze the main prog, then the remaining root subprogs; frees all temporaries.
+ */
+int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env)
+{
+	u32 callsites[MAX_CALL_FRAMES] = {};
+	struct func_instance *instance;
+	struct subprog_at_info *info;
+	int nr_insns = env->prog->len;
+	int i, err;
+
+	info = kvzalloc_objs(*info, env->subprog_cnt, GFP_KERNEL_ACCOUNT);
+	if (!info)
+		return -ENOMEM;
+
+	env->callsite_at_stack = kvzalloc_objs(*env->callsite_at_stack, nr_insns,
+					       GFP_KERNEL_ACCOUNT);
+	if (!env->callsite_at_stack) {
+		kvfree(info);
+		return -ENOMEM;
+	}
+
+	/* Start with the main program (subprog 0) */
+	instance = call_instance(env, NULL, 0, 0);
+	err = PTR_ERR_OR_ZERO(instance);
+	if (!err)
+		err = analyze_subprog(env, NULL, info, instance, callsites);
+	if (err)
+		goto out;
+
+	/*
+	 * Subprogs and callbacks that receive no FP-derived arguments cannot
+	 * access ancestor stack frames, so the recursive walk above skipped
+	 * them.  Async callbacks (timer, workqueue) are not reachable from
+	 * the main program's call graph either.  Analyze every subprog not
+	 * visited yet as an independent root at depth 0.
+	 *
+	 * Walk in reverse topological order (callers before callees) so that
+	 * each subprog is analyzed before its callees, which lets the
+	 * recursive walk inside analyze_subprog() naturally reach nested
+	 * callees that also lack FP-derived args.
+	 */
+	for (i = env->subprog_cnt - 1; i >= 0; i--) {
+		int sub = env->subprog_topo_order[i];
+
+		if (info[sub].at_in && !bpf_subprog_is_global(env, sub))
+			continue;
+		instance = call_instance(env, NULL, 0, sub);
+		err = PTR_ERR_OR_ZERO(instance);
+		if (!err)
+			err = analyze_subprog(env, NULL, info, instance, callsites);
+		if (err)
+			goto out;
+	}
+
+	if (env->log.level & BPF_LOG_LEVEL2)
+		err = print_instances(env);
+
+out:
+	for (i = 0; i < nr_insns; i++)
+		kvfree(env->callsite_at_stack[i]);
+	kvfree(env->callsite_at_stack);
+	env->callsite_at_stack = NULL;
+	for (i = 0; i < env->subprog_cnt; i++)
+		kvfree(info[i].at_in);
+	kvfree(info);
+	return err;
+}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 6fd030fd6eeb..011e4ec25acd 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -501,7 +501,8 @@ static char slot_type_char[] = {
 	[STACK_ZERO]	= '0',
 	[STACK_DYNPTR]	= 'd',
 	[STACK_ITER]	= 'i',
-	[STACK_IRQ_FLAG] = 'f'
+	[STACK_IRQ_FLAG] = 'f',
+	[STACK_POISON]	= 'p',
 };
 
 #define UNUM_MAX_DECIMAL U16_MAX
@@ -738,7 +739,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
 
 		for (j = 0; j < BPF_REG_SIZE; j++) {
 			slot_type = state->stack[i].slot_type[j];
-			if (slot_type != STACK_INVALID)
+			if (slot_type != STACK_INVALID && slot_type != STACK_POISON)
 				valid = true;
 			types_buf[j] = slot_type_char[slot_type];
 		}
@@ -806,7 +807,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
 		mark_verifier_state_clean(env);
 }
 
-static inline u32 vlog_alignment(u32 pos)
+u32 bpf_vlog_alignment(u32 pos)
 {
 	return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
 			BPF_LOG_MIN_ALIGNMENT) - pos - 1;
@@ -818,7 +819,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st
 	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
 		/* remove new line character */
 		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
-		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+		verbose(env, "%*c;", bpf_vlog_alignment(env->prev_insn_print_pos), ' ');
 	} else {
 		verbose(env, "%d:", env->insn_idx);
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9c1135d373e2..566311dd4fba 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -423,7 +423,7 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 	return rec;
 }
 
-static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
 
@@ -830,8 +830,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 		state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
 	}
 
-	bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
-
 	return 0;
 }
 
@@ -846,8 +844,6 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat
 
 	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
 	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
-
-	bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
 }
 
 static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
@@ -984,8 +980,6 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
 	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
 
-	bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
-
 	return 0;
 }
 
@@ -1111,7 +1105,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 		for (j = 0; j < BPF_REG_SIZE; j++)
 			slot->slot_type[j] = STACK_ITER;
 
-		bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
 		mark_stack_slot_scratched(env, spi - i);
 	}
 
@@ -1140,7 +1133,6 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
 		for (j = 0; j < BPF_REG_SIZE; j++)
 			slot->slot_type[j] = STACK_INVALID;
 
-		bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
 		mark_stack_slot_scratched(env, spi - i);
 	}
 
@@ -1230,7 +1222,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
 	slot = &state->stack[spi];
 	st = &slot->spilled_ptr;
 
-	bpf_mark_stack_write(env, reg->frameno, BIT(spi));
 	__mark_reg_known_zero(st);
 	st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
 	st->ref_obj_id = id;
@@ -1286,8 +1277,6 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
 
 	__mark_reg_not_init(env, st);
 
-	bpf_mark_stack_write(env, reg->frameno, BIT(spi));
-
 	for (i = 0; i < BPF_REG_SIZE; i++)
 		slot->slot_type[i] = STACK_INVALID;
 
@@ -1359,6 +1348,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
 	case STACK_IRQ_FLAG:
 		return true;
 	case STACK_INVALID:
+	case STACK_POISON:
 	case STACK_MISC:
 	case STACK_ZERO:
 		return false;
@@ -1388,9 +1378,11 @@ static bool is_spilled_scalar_after(const struct bpf_stack_state *stack, int im)
 	       stack->spilled_ptr.type == SCALAR_VALUE;
 }
 
-/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
- * case they are equivalent, or it's STACK_ZERO, in which case we preserve
- * more precise STACK_ZERO.
+/*
+ * Mark stack slot as STACK_MISC, unless it is already:
+ * - STACK_INVALID, in which case they are equivalent.
+ * - STACK_ZERO, in which case we preserve more precise STACK_ZERO.
+ * - STACK_POISON, which truly forbids access to the slot.
  * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
  * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
  * unnecessary as both are considered equivalent when loading data and pruning,
@@ -1401,14 +1393,14 @@ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
 {
 	if (*stype == STACK_ZERO)
 		return;
-	if (*stype == STACK_INVALID)
+	if (*stype == STACK_INVALID || *stype == STACK_POISON)
 		return;
 	*stype = STACK_MISC;
 }
 
 static void scrub_spilled_slot(u8 *stype)
 {
-	if (*stype != STACK_INVALID)
+	if (*stype != STACK_INVALID && *stype != STACK_POISON)
 		*stype = STACK_MISC;
 }
 
@@ -1801,7 +1793,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		return err;
 	dst_state->speculative = src->speculative;
 	dst_state->in_sleepable = src->in_sleepable;
-	dst_state->cleaned = src->cleaned;
 	dst_state->curframe = src->curframe;
 	dst_state->branches = src->branches;
 	dst_state->parent = src->parent;
@@ -3864,14 +3855,10 @@ static int sort_subprogs_topo(struct bpf_verifier_env *env)
 static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				    int spi, int nr_slots)
 {
-	int err, i;
+	int i;
 
-	for (i = 0; i < nr_slots; i++) {
-		err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
-		if (err)
-			return err;
+	for (i = 0; i < nr_slots; i++)
 		mark_stack_slot_scratched(env, spi - i);
-	}
 	return 0;
 }
 
@@ -4631,7 +4618,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 			if (subprog < 0)
 				return -EFAULT;
 
-			if (subprog_is_global(env, subprog)) {
+			if (bpf_subprog_is_global(env, subprog)) {
 				/* check that jump history doesn't have any
 				 * extra instructions from subprog; the next
 				 * instruction after call to global subprog
@@ -5422,18 +5409,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	if (err)
 		return err;
 
-	if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
-		/* only mark the slot as written if all 8 bytes were written
-		 * otherwise read propagation may incorrectly stop too soon
-		 * when stack slots are partially written.
-		 * This heuristic means that read propagation will be
-		 * conservative, since it will add reg_live_read marks
-		 * to stack slots all the way to first state when programs
-		 * writes+reads less than 8 bytes
-		 */
-		bpf_mark_stack_write(env, state->frameno, BIT(spi));
-	}
-
 	check_fastcall_stack_contract(env, state, insn_idx, off);
 	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
@@ -5614,8 +5589,10 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
 		 * For privileged programs, we will accept such reads to slots
 		 * that may or may not be written because, if we're reject
 		 * them, the error would be too confusing.
+		 * Conservatively, treat STACK_POISON in a similar way.
 		 */
-		if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+		if ((*stype == STACK_INVALID || *stype == STACK_POISON) &&
+		    !env->allow_uninit_stack) {
 			verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
 					insn_idx, i);
 			return -EINVAL;
@@ -5690,16 +5667,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg;
 	u8 *stype, type;
 	int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
-	int err;
 
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
 
 	mark_stack_slot_scratched(env, spi);
 	check_fastcall_stack_contract(env, state, env->insn_idx, off);
-	err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
-	if (err)
-		return err;
 
 	if (is_spilled_reg(&reg_state->stack[spi])) {
 		u8 spill_size = 1;
@@ -5755,8 +5728,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 					}
 					if (type == STACK_INVALID && env->allow_uninit_stack)
 						continue;
-					verbose(env, "invalid read from stack off %d+%d size %d\n",
-						off, i, size);
+					if (type == STACK_POISON) {
+						verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
+							off, i, size);
+					} else {
+						verbose(env, "invalid read from stack off %d+%d size %d\n",
+							off, i, size);
+					}
 					return -EACCES;
 				}
 
@@ -5805,8 +5783,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 				continue;
 			if (type == STACK_INVALID && env->allow_uninit_stack)
 				continue;
-			verbose(env, "invalid read from stack off %d+%d size %d\n",
-				off, i, size);
+			if (type == STACK_POISON) {
+				verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
+					off, i, size);
+			} else {
+				verbose(env, "invalid read from stack off %d+%d size %d\n",
+					off, i, size);
+			}
 			return -EACCES;
 		}
 		if (dst_regno >= 0)
@@ -7032,7 +7015,7 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
 		if (subprog[idx].has_tail_call)
 			tail_call_reachable = true;
 
-		frame = subprog_is_global(env, idx) ? 0 : frame + 1;
+		frame = bpf_subprog_is_global(env, idx) ? 0 : frame + 1;
 		if (frame >= MAX_CALL_FRAMES) {
 			verbose(env, "the call stack of %d frames is too deep !\n",
 				frame);
@@ -8409,16 +8392,22 @@ static int check_stack_range_initialized(
 	/* Some accesses can write anything into the stack, others are
 	 * read-only.
 	 */
-	bool clobber = false;
+	bool clobber = type == BPF_WRITE;
+	/*
+	 * Negative access_size signals global subprog/kfunc arg check where
+	 * STACK_POISON slots are acceptable. static stack liveness
+	 * might have determined that subprog doesn't read them,
+	 * but BTF based global subprog validation isn't accurate enough.
+	 */
+	bool allow_poison = access_size < 0 || clobber;
+
+	access_size = abs(access_size);
 
 	if (access_size == 0 && !zero_size_allowed) {
 		verbose(env, "invalid zero-sized read\n");
 		return -EACCES;
 	}
 
-	if (type == BPF_WRITE)
-		clobber = true;
-
 	err = check_stack_access_within_bounds(env, regno, off, access_size, type);
 	if (err)
 		return err;
@@ -8517,7 +8506,12 @@ static int check_stack_range_initialized(
 			goto mark;
 		}
 
-		if (tnum_is_const(reg->var_off)) {
+		if (*stype == STACK_POISON) {
+			if (allow_poison)
+				goto mark;
+			verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n",
+				regno, min_off, i - min_off, access_size);
+		} else if (tnum_is_const(reg->var_off)) {
 			verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
 				regno, min_off, i - min_off, access_size);
 		} else {
@@ -8529,17 +8523,7 @@ static int check_stack_range_initialized(
 		}
 		return -EACCES;
 mark:
-		/* reading any byte out of 8-byte 'spill_slot' will cause
-		 * the whole slot to be marked as 'read'
-		 */
-		err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
-		if (err)
-			return err;
-		/* We do not call bpf_mark_stack_write(), as we can not
-		 * be sure that whether stack slot is written to or not. Hence,
-		 * we must still conservatively propagate reads upwards even if
-		 * helper may write to the entire memory range.
-		 */
+		;
 	}
 	return 0;
 }
@@ -8704,8 +8688,10 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg
 		mark_ptr_not_null_reg(reg);
 	}
 
-	err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
-	err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
+	int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size;
+
+	err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL);
+	err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL);
 
 	if (may_be_null)
 		*reg = saved_reg;
@@ -11107,7 +11093,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	err = btf_check_subprog_call(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
-	if (subprog_is_global(env, subprog)) {
+	if (bpf_subprog_is_global(env, subprog)) {
 		const char *sub_name = subprog_name(env, subprog);
 
 		if (env->cur_state->active_locks) {
@@ -11159,8 +11145,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	/* and go analyze first insn of the callee */
 	*insn_idx = env->subprog_info[subprog].start - 1;
 
-	bpf_reset_live_stack_callchain(env);
-
 	if (env->log.level & BPF_LOG_LEVEL) {
 		verbose(env, "caller:\n");
 		print_verifier_state(env, state, caller->frameno, true);
@@ -11445,10 +11429,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	bool in_callback_fn;
 	int err;
 
-	err = bpf_update_live_stack(env);
-	if (err)
-		return err;
-
 	callee = state->frame[state->curframe];
 	r0 = &callee->regs[BPF_REG_0];
 	if (r0->type == PTR_TO_STACK) {
@@ -19797,6 +19777,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			goto err_free;
 		}
 
+		env->subprog_info[i].name = btf_name_by_offset(btf, type->name_off);
 		bpfptr_add(&urecord, urec_size);
 	}
 
@@ -20160,11 +20141,10 @@ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 	return check_ids(old_id, cur_id, idmap);
 }
 
-static void clean_func_state(struct bpf_verifier_env *env,
-			     struct bpf_func_state *st,
-			     u32 ip)
+static void __clean_func_state(struct bpf_verifier_env *env,
+			       struct bpf_func_state *st,
+			       u16 live_regs, int frame)
 {
-	u16 live_regs = env->insn_aux_data[ip].live_regs_before;
 	int i, j;
 
 	for (i = 0; i < BPF_REG_FP; i++) {
@@ -20176,58 +20156,82 @@ static void clean_func_state(struct bpf_verifier_env *env,
 			__mark_reg_not_init(env, &st->regs[i]);
 	}
 
+	/*
+	 * Clean dead 4-byte halves within each SPI independently.
+	 * half_spi 2*i   → lower half: slot_type[0..3] (closer to FP)
+	 * half_spi 2*i+1 → upper half: slot_type[4..7] (farther from FP)
+	 */
 	for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
-		if (!bpf_stack_slot_alive(env, st->frameno, i)) {
-			__mark_reg_not_init(env, &st->stack[i].spilled_ptr);
-			for (j = 0; j < BPF_REG_SIZE; j++)
-				st->stack[i].slot_type[j] = STACK_INVALID;
+		bool lo_live = bpf_stack_slot_alive(env, frame, i * 2);
+		bool hi_live = bpf_stack_slot_alive(env, frame, i * 2 + 1);
+
+		if (!hi_live || !lo_live) {
+			int start = !lo_live ? 0 : BPF_REG_SIZE / 2;
+			int end = !hi_live ? BPF_REG_SIZE : BPF_REG_SIZE / 2;
+			u8 stype = st->stack[i].slot_type[7];
+
+			/*
+			 * Don't clear special slots.
+			 * destroy_if_dynptr_stack_slot() needs STACK_DYNPTR to
+			 * detect overwrites and invalidate associated data slices.
+			 * is_iter_reg_valid_uninit() and is_irq_flag_reg_valid_uninit()
+			 * check for their respective slot types to detect double-create.
+			 */
+			if (stype == STACK_DYNPTR || stype == STACK_ITER ||
+			    stype == STACK_IRQ_FLAG)
+				continue;
+
+			/*
+			 * Only destroy spilled_ptr when hi half is dead.
+			 * If hi half is still live with STACK_SPILL, the
+			 * spilled_ptr metadata is needed for correct state
+			 * comparison in stacksafe().
+			 * is_spilled_reg() uses slot_type[7], but
+			 * is_spilled_scalar_after() checks either slot_type[0] or [4].
+			 */
+			if (!hi_live) {
+				struct bpf_reg_state *spill = &st->stack[i].spilled_ptr;
+
+				if (lo_live && stype == STACK_SPILL) {
+					u8 val = STACK_MISC;
+
+					/*
+					 * 8 byte spill of scalar 0 where half slot is dead
+					 * should become STACK_ZERO in lo 4 bytes.
+					 */
+					if (register_is_null(spill))
+						val = STACK_ZERO;
+					for (j = 0; j < 4; j++) {
+						u8 *t = &st->stack[i].slot_type[j];
+
+						if (*t == STACK_SPILL)
+							*t = val;
+					}
+				}
+				__mark_reg_not_init(env, spill);
+			}
+			for (j = start; j < end; j++)
+				st->stack[i].slot_type[j] = STACK_POISON;
 		}
 	}
 }
 
-static void clean_verifier_state(struct bpf_verifier_env *env,
+static int clean_verifier_state(struct bpf_verifier_env *env,
 				 struct bpf_verifier_state *st)
 {
-	int i, ip;
+	int i, err;
 
-	bpf_live_stack_query_init(env, st);
-	st->cleaned = true;
+	err = bpf_live_stack_query_init(env, st);
+	if (err)
+		return err;
 	for (i = 0; i <= st->curframe; i++) {
-		ip = frame_insn_idx(st, i);
-		clean_func_state(env, st->frame[i], ip);
-	}
-}
+		u32 ip = frame_insn_idx(st, i);
+		u16 live_regs = env->insn_aux_data[ip].live_regs_before;
 
-/* the parentage chains form a tree.
- * the verifier states are added to state lists at given insn and
- * pushed into state stack for future exploration.
- * when the verifier reaches bpf_exit insn some of the verifier states
- * stored in the state lists have their final liveness state already,
- * but a lot of states will get revised from liveness point of view when
- * the verifier explores other branches.
- * Example:
- * 1: *(u64)(r10 - 8) = 1
- * 2: if r1 == 100 goto pc+1
- * 3: *(u64)(r10 - 8) = 2
- * 4: r0 = *(u64)(r10 - 8)
- * 5: exit
- * when the verifier reaches exit insn the stack slot -8 in the state list of
- * insn 2 is not yet marked alive. Then the verifier pops the other_branch
- * of insn 2 and goes exploring further. After the insn 4 read, liveness
- * analysis would propagate read mark for -8 at insn 2.
- *
- * Since the verifier pushes the branch states as it sees them while exploring
- * the program the condition of walking the branch instruction for the second
- * time means that all states below this branch were already explored and
- * their final liveness marks are already propagated.
- * Hence when the verifier completes the search of state list in is_state_visited()
- * we can call this clean_live_states() function to clear dead the registers and stack
- * slots to simplify state merging.
- *
- * Important note here that walking the same branch instruction in the callee
- * doesn't meant that the states are DONE. The verifier has to compare
- * the callsites
- */
+		__clean_func_state(env, st->frame[i], live_regs, i);
+	}
+	return 0;
+}
 
 /* Find id in idset and increment its count, or add new entry */
 static void idset_cnt_inc(struct bpf_idset *idset, u32 id)
@@ -20292,29 +20296,6 @@ static void clear_singular_ids(struct bpf_verifier_env *env,
 	}));
 }
 
-static void clean_live_states(struct bpf_verifier_env *env, int insn,
-			      struct bpf_verifier_state *cur)
-{
-	struct bpf_verifier_state_list *sl;
-	struct list_head *pos, *head;
-
-	head = explored_state(env, insn);
-	list_for_each(pos, head) {
-		sl = container_of(pos, struct bpf_verifier_state_list, node);
-		if (sl->state.branches)
-			continue;
-		if (sl->state.insn_idx != insn ||
-		    !same_callsites(&sl->state, cur))
-			continue;
-		if (sl->state.cleaned)
-			/* all regs in this state in all frames were already marked */
-			continue;
-		if (incomplete_read_marks(env, &sl->state))
-			continue;
-		clean_verifier_state(env, &sl->state);
-	}
-}
-
 static bool regs_exact(const struct bpf_reg_state *rold,
 		       const struct bpf_reg_state *rcur,
 		       struct bpf_idmap *idmap)
@@ -20499,7 +20480,8 @@ static bool is_stack_misc_after(struct bpf_verifier_env *env,
 
 	for (i = im; i < ARRAY_SIZE(stack->slot_type); ++i) {
 		if ((stack->slot_type[i] == STACK_MISC) ||
-		    (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
+		    ((stack->slot_type[i] == STACK_INVALID || stack->slot_type[i] == STACK_POISON) &&
+		     env->allow_uninit_stack))
 			continue;
 		return false;
 	}
@@ -20535,13 +20517,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 
 		spi = i / BPF_REG_SIZE;
 
-		if (exact == EXACT &&
-		    (i >= cur->allocated_stack ||
-		     old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
-		     cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
-			return false;
+		if (exact == EXACT) {
+			u8 old_type = old->stack[spi].slot_type[i % BPF_REG_SIZE];
+			u8 cur_type = i < cur->allocated_stack ?
+				      cur->stack[spi].slot_type[i % BPF_REG_SIZE] : STACK_INVALID;
 
-		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+			/* STACK_INVALID and STACK_POISON are equivalent for pruning */
+			if (old_type == STACK_POISON)
+				old_type = STACK_INVALID;
+			if (cur_type == STACK_POISON)
+				cur_type = STACK_INVALID;
+			if (i >= cur->allocated_stack || old_type != cur_type)
+				return false;
+		}
+
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID ||
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_POISON)
 			continue;
 
 		if (env->allow_uninit_stack &&
@@ -20639,6 +20630,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 		case STACK_MISC:
 		case STACK_ZERO:
 		case STACK_INVALID:
+		case STACK_POISON:
 			continue;
 		/* Ensure that new unhandled slot types return false by default */
 		default:
@@ -21015,7 +21007,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	    env->insn_processed - env->prev_insn_processed >= 8)
 		add_new_state = true;
 
-	clean_live_states(env, insn_idx, cur);
+	/* keep cleaning the current state as registers/stack become dead */
+	err = clean_verifier_state(env, cur);
+	if (err)
+		return err;
 
 	loop = false;
 	head = explored_state(env, insn_idx);
@@ -21783,7 +21778,7 @@ static int do_check(struct bpf_verifier_env *env)
 	for (;;) {
 		struct bpf_insn *insn;
 		struct bpf_insn_aux_data *insn_aux;
-		int err, marks_err;
+		int err;
 
 		/* reset current history entry on each new instruction */
 		env->cur_hist_ent = NULL;
@@ -21897,15 +21892,7 @@ static int do_check(struct bpf_verifier_env *env)
 		if (state->speculative && insn_aux->nospec)
 			goto process_bpf_exit;
 
-		err = bpf_reset_stack_write_marks(env, env->insn_idx);
-		if (err)
-			return err;
 		err = do_check_insn(env, &do_print_state);
-		if (err >= 0 || error_recoverable_with_nospec(err)) {
-			marks_err = bpf_commit_stack_write_marks(env);
-			if (marks_err)
-				return marks_err;
-		}
 		if (error_recoverable_with_nospec(err) && state->speculative) {
 			/* Prevent this speculative path from ever reaching the
 			 * insn that would have been unsafe to execute.
@@ -21946,9 +21933,6 @@ static int do_check(struct bpf_verifier_env *env)
 process_bpf_exit:
 			mark_verifier_state_scratched(env);
 			err = update_branch_counts(env, env->cur_state);
-			if (err)
-				return err;
-			err = bpf_update_live_stack(env);
 			if (err)
 				return err;
 			err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
@@ -25299,7 +25283,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 again:
 	new_cnt = 0;
 	for (i = 1; i < env->subprog_cnt; i++) {
-		if (!subprog_is_global(env, i))
+		if (!bpf_subprog_is_global(env, i))
 			continue;
 
 		sub_aux = subprog_aux(env, i);
@@ -26338,6 +26322,11 @@ static int compute_live_registers(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; ++i)
 		compute_insn_live_regs(env, &insns[i], &state[i]);
 
+	/* Forward pass: resolve stack access through FP-derived pointers */
+	err = bpf_compute_subprog_arg_access(env);
+	if (err)
+		goto out;
+
 	changed = true;
 	while (changed) {
 		changed = false;
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 169cf7fbf40f..a96b25ebff23 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -54,6 +54,7 @@
 #include "verifier_leak_ptr.skel.h"
 #include "verifier_linked_scalars.skel.h"
 #include "verifier_live_stack.skel.h"
+#include "verifier_liveness_exp.skel.h"
 #include "verifier_load_acquire.skel.h"
 #include "verifier_loops1.skel.h"
 #include "verifier_lwt.skel.h"
@@ -202,6 +203,7 @@ void test_verifier_ldsx(void)                  { RUN(verifier_ldsx); }
 void test_verifier_leak_ptr(void)             { RUN(verifier_leak_ptr); }
 void test_verifier_linked_scalars(void)       { RUN(verifier_linked_scalars); }
 void test_verifier_live_stack(void)           { RUN(verifier_live_stack); }
+void test_verifier_liveness_exp(void)         { RUN(verifier_liveness_exp); }
 void test_verifier_loops1(void)               { RUN(verifier_loops1); }
 void test_verifier_lwt(void)                  { RUN(verifier_lwt); }
 void test_verifier_map_in_map(void)           { RUN(verifier_map_in_map); }
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_log.c b/tools/testing/selftests/bpf/prog_tests/verifier_log.c
index aaa2854974c0..c01c0114af1b 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier_log.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier_log.c
@@ -25,10 +25,10 @@ static bool check_prog_load(int prog_fd, bool expect_err, const char *tag)
 
 static struct {
 	/* strategically placed before others to avoid accidental modification by kernel */
-	char filler[1024];
-	char buf[1024];
+	char filler[16384];
+	char buf[16384];
 	/* strategically placed after buf[] to catch more accidental corruptions */
-	char reference[1024];
+	char reference[16384];
 } logs;
 static const struct bpf_insn *insns;
 static size_t insn_cnt;
diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c
index 2027cfcb748c..e4abf4172fca 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_assert.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c
@@ -59,7 +59,7 @@ check_assert(s64, >=, ge_neg, INT_MIN);
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R0=0 R1=ctx() R2=scalar(smin=0xffffffff80000002,smax=smax32=0x7ffffffd,smin32=0x80000002) R10=fp0")
+__msg(": R1=ctx() R2=scalar(smin=0xffffffff80000002,smax=smax32=0x7ffffffd,smin32=0x80000002) R10=fp0")
 int check_assert_range_s64(struct __sk_buff *ctx)
 {
 	struct bpf_sock *sk = ctx->sk;
@@ -86,7 +86,7 @@ int check_assert_range_u64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R0=0 R1=ctx() R2=4096 R10=fp0")
+__msg(": R1=ctx() R2=4096 R10=fp0")
 int check_assert_single_range_s64(struct __sk_buff *ctx)
 {
 	struct bpf_sock *sk = ctx->sk;
@@ -114,7 +114,7 @@ int check_assert_single_range_u64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R1=pkt(r=64,imm=64) R2=pkt_end() R6=pkt(r=64) R10=fp0")
+__msg(": R6=pkt(r=64) R10=fp0")
 int check_assert_generic(struct __sk_buff *ctx)
 {
 	u8 *data_end = (void *)(long)ctx->data_end;
diff --git a/tools/testing/selftests/bpf/progs/uninit_stack.c b/tools/testing/selftests/bpf/progs/uninit_stack.c
index 046a204c8fc6..5db02323c89c 100644
--- a/tools/testing/selftests/bpf/progs/uninit_stack.c
+++ b/tools/testing/selftests/bpf/progs/uninit_stack.c
@@ -76,6 +76,7 @@ __naked int helper_uninit_to_misc(void *ctx)
 		 * thus showing the stack state, matched by __msg().		\
 		 */					\
 		call %[dummy];				\
+		r1 = *(u64*)(r10 - 104);		\
 		r0 = 0;					\
 		exit;					\
 "
diff --git a/tools/testing/selftests/bpf/progs/verifier_align.c b/tools/testing/selftests/bpf/progs/verifier_align.c
index 24553ce62881..3e52686515ca 100644
--- a/tools/testing/selftests/bpf/progs/verifier_align.c
+++ b/tools/testing/selftests/bpf/progs/verifier_align.c
@@ -131,7 +131,7 @@ LBL ":"							\
 SEC("tc")
 __success __log_level(2)
 __flag(BPF_F_ANY_ALIGNMENT)
-__msg("6: R0=pkt(r=8,imm=8)")
+__msg("6: {{.*}} R2=pkt(r=8)")
 __msg("6: {{.*}} R3={{[^)]*}}var_off=(0x0; 0xff)")
 __msg("7: {{.*}} R3={{[^)]*}}var_off=(0x0; 0x1fe)")
 __msg("8: {{.*}} R3={{[^)]*}}var_off=(0x0; 0x3fc)")
@@ -205,7 +205,7 @@ __success __log_level(2)
 __msg("2: {{.*}} R5=pkt(r=0)")
 __msg("4: {{.*}} R5=pkt(r=0,imm=14)")
 __msg("5: {{.*}} R4=pkt(r=0,imm=14)")
-__msg("9: {{.*}} R2=pkt(r=18)")
+__msg("9: {{.*}} R5=pkt(r=18,imm=14)")
 __msg("10: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xff){{.*}} R5=pkt(r=18,imm=14)")
 __msg("13: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xffff)")
 __msg("14: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xffff)")
@@ -254,7 +254,7 @@ __msg("11: {{.*}} R5=pkt(id=1,{{[^)]*}},var_off=(0x2; 0x7fc)")
  * offset is considered using reg->aux_off_align which
  * is 4 and meets the load's requirements.
  */
-__msg("15: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)")
+__msg("15: {{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)")
 /* Variable offset is added to R5 packet pointer,
  * resulting in auxiliary alignment of 4. To avoid BPF
  * verifier's precision backtracking logging
@@ -273,7 +273,7 @@ __msg("19: {{.*}} R5=pkt(id=2,{{[^)]*}}var_off=(0x2; 0x7fc)")
  * aligned, so the total offset is 4-byte aligned and
  * meets the load's requirements.
  */
-__msg("24: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)")
+__msg("24: {{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)")
 /* Constant offset is added to R5 packet pointer,
  * resulting in reg->off value of 14.
  */
@@ -296,7 +296,7 @@ __msg("31: {{.*}} R4={{[^)]*}}var_off=(0x2; 0xffc){{.*}} R5={{[^)]*}}var_off=(0x
  * the total offset is 4-byte aligned and meets the
  * load's requirements.
  */
-__msg("35: {{.*}} R4={{[^)]*}}var_off=(0x2; 0xffc){{.*}} R5={{[^)]*}}var_off=(0x2; 0xffc)")
+__msg("35: {{.*}} R5={{[^)]*}}var_off=(0x2; 0xffc)")
 __naked void packet_variable_offset(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c
index 4672af0b3268..e814a054d69a 100644
--- a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c
@@ -36,7 +36,7 @@ l0_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("UDIV32, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("w1 /= w2 {{.*}}; R1=0 R2=0")
+__msg("w1 /= w2 {{.*}}; R1=0")
 __naked void udiv32_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -81,7 +81,7 @@ l0_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("UDIV64, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("r1 /= r2 {{.*}}; R1=0 R2=0")
+__msg("r1 /= r2 {{.*}}; R1=0")
 __naked void udiv64_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -242,7 +242,7 @@ l1_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("SDIV32, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("w1 s/= w2 {{.*}}; R1=0 R2=0")
+__msg("w1 s/= w2 {{.*}}; R1=0")
 __naked void sdiv32_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -275,6 +275,7 @@ __naked void sdiv32_overflow_1(void)
 	w2 += 10;					\
 	if w1 s> w2 goto l0_%=;				\
 	w1 s/= -1;					\
+	r2 = r1;					\
 l0_%=:	r0 = 0;						\
 	exit;						\
 "	:
@@ -443,7 +444,7 @@ l1_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("SDIV64, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("r1 s/= r2 {{.*}}; R1=0 R2=0")
+__msg("r1 s/= r2 {{.*}}; R1=0")
 __naked void sdiv64_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -476,6 +477,7 @@ __naked void sdiv64_overflow_1(void)
 	r2 += 10;					\
 	if r1 s> r2 goto l0_%=;				\
 	r1 s/= -1;					\
+	r2 = r1;					\
 l0_%=:	r0 = 0;						\
 	exit;						\
 "	:
@@ -553,7 +555,7 @@ l0_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("UMOD32, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0")
+__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))")
 __naked void umod32_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -624,7 +626,7 @@ l0_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("UMOD64, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0")
+__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))")
 __naked void umod64_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -833,7 +835,7 @@ l1_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("SMOD32, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff)) R2=0")
+__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))")
 __naked void smod32_zero_divisor(void)
 {
 	asm volatile ("					\
@@ -1084,7 +1086,7 @@ l1_%=:	r0 = *(u64 *)(r1 + 0);				\
 SEC("socket")
 __description("SMOD64, zero divisor")
 __success __retval(0) __log_level(2)
-__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10) R2=0")
+__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)")
 __naked void smod64_zero_divisor(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_live_stack.c b/tools/testing/selftests/bpf/progs/verifier_live_stack.c
index 2de105057bbc..b7a9fa10e84d 100644
--- a/tools/testing/selftests/bpf/progs/verifier_live_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_live_stack.c
@@ -3,8 +3,10 @@
 
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "../../../include/linux/filter.h"
 #include "bpf_misc.h"
 
+char _license[] SEC("license") = "GPL";
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
 	__uint(max_entries, 1);
@@ -12,14 +14,20 @@ struct {
 	__type(value, long long);
 } map SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, __u64);
+} array_map_8b SEC(".maps");
+
+const char snprintf_u64_fmt[] = "%llu";
+
 SEC("socket")
 __log_level(2)
-__msg("(0) frame 0 insn 2 +written -8")
-__msg("(0) frame 0 insn 1 +live -24")
-__msg("(0) frame 0 insn 1 +written -8")
-__msg("(0) frame 0 insn 0 +live -8,-24")
-__msg("(0) frame 0 insn 0 +written -8")
-__msg("(0) live stack update done in 2 iterations")
+__msg("0: (79) r1 = *(u64 *)(r10 -8)        ; use: fp0-8")
+__msg("1: (79) r2 = *(u64 *)(r10 -24)       ; use: fp0-24")
+__msg("2: (7b) *(u64 *)(r10 -8) = r1        ; def: fp0-8")
 __naked void simple_read_simple_write(void)
 {
 	asm volatile (
@@ -33,12 +41,8 @@ __naked void simple_read_simple_write(void)
 
 SEC("socket")
 __log_level(2)
-__msg("(0) frame 0 insn 1 +live -8")
-__not_msg("(0) frame 0 insn 1 +written")
-__msg("(0) live stack update done in 2 iterations")
-__msg("(0) frame 0 insn 1 +live -16")
-__msg("(0) frame 0 insn 1 +written -32")
-__msg("(0) live stack update done in 2 iterations")
+__msg("2: (79) r0 = *(u64 *)(r10 -8)        ; use: fp0-8")
+__msg("6: (79) r0 = *(u64 *)(r10 -16)       ; use: fp0-16")
 __naked void read_write_join(void)
 {
 	asm volatile (
@@ -58,13 +62,9 @@ __naked void read_write_join(void)
 
 SEC("socket")
 __log_level(2)
-__msg("2: (25) if r0 > 0x2a goto pc+1")
-__msg("7: (95) exit")
-__msg("(0) frame 0 insn 2 +written -16")
-__msg("(0) live stack update done in 2 iterations")
-__msg("7: (95) exit")
-__not_msg("(0) frame 0 insn 2")
-__msg("(0) live stack update done in 1 iterations")
+__msg("stack use/def subprog#0 must_write_not_same_slot (d0,cs0):")
+__msg("6: (7b) *(u64 *)(r2 +0) = r0{{$}}")
+__msg("Live regs before insn:")
 __naked void must_write_not_same_slot(void)
 {
 	asm volatile (
@@ -83,10 +83,8 @@ __naked void must_write_not_same_slot(void)
 
 SEC("socket")
 __log_level(2)
-__msg("(0) frame 0 insn 0 +written -8,-16")
-__msg("(0) live stack update done in 2 iterations")
-__msg("(0) frame 0 insn 0 +written -8")
-__msg("(0) live stack update done in 2 iterations")
+__msg("0: (7a) *(u64 *)(r10 -8) = 0         ; def: fp0-8")
+__msg("5: (85) call bpf_map_lookup_elem#1   ; use: fp0-8h")
 __naked void must_write_not_same_type(void)
 {
 	asm volatile (
@@ -110,10 +108,11 @@ __naked void must_write_not_same_type(void)
 
 SEC("socket")
 __log_level(2)
-__msg("(2,4) frame 0 insn 4 +written -8")
-__msg("(2,4) live stack update done in 2 iterations")
-__msg("(0) frame 0 insn 2 +written -8")
-__msg("(0) live stack update done in 2 iterations")
+/* Callee writes fp[0]-8: stack_use at call site has slots 0,1 live */
+__msg("stack use/def subprog#0 caller_stack_write (d0,cs0):")
+__msg("2: (85) call pc+1{{$}}")
+__msg("stack use/def subprog#1 write_first_param (d1,cs2):")
+__msg("4: (7a) *(u64 *)(r1 +0) = 7          ; def: fp0-8")
 __naked void caller_stack_write(void)
 {
 	asm volatile (
@@ -135,23 +134,15 @@ static __used __naked void write_first_param(void)
 
 SEC("socket")
 __log_level(2)
-/* caller_stack_read() function */
-__msg("2: .12345.... (85) call pc+4")
-__msg("5: .12345.... (85) call pc+1")
-__msg("6: 0......... (95) exit")
-/* read_first_param() function */
-__msg("7: .1........ (79) r0 = *(u64 *)(r1 +0)")
-__msg("8: 0......... (95) exit")
-/* update for callsite at (2) */
-__msg("(2,7) frame 0 insn 7 +live -8")
-__msg("(2,7) live stack update done in 2 iterations")
-__msg("(0) frame 0 insn 2 +live -8")
-__msg("(0) live stack update done in 2 iterations")
-/* update for callsite at (5) */
-__msg("(5,7) frame 0 insn 7 +live -16")
-__msg("(5,7) live stack update done in 2 iterations")
-__msg("(0) frame 0 insn 5 +live -16")
-__msg("(0) live stack update done in 2 iterations")
+__msg("stack use/def subprog#0 caller_stack_read (d0,cs0):")
+__msg("2: (85) call pc+{{.*}}                   ; use: fp0-8{{$}}")
+__msg("5: (85) call pc+{{.*}}                   ; use: fp0-16{{$}}")
+__msg("stack use/def subprog#1 read_first_param (d1,cs2):")
+__msg("7: (79) r0 = *(u64 *)(r1 +0)         ; use: fp0-8{{$}}")
+__msg("8: (95) exit")
+__msg("stack use/def subprog#1 read_first_param (d1,cs5):")
+__msg("7: (79) r0 = *(u64 *)(r1 +0)         ; use: fp0-16{{$}}")
+__msg("8: (95) exit")
 __naked void caller_stack_read(void)
 {
 	asm volatile (
@@ -173,21 +164,49 @@ static __used __naked void read_first_param(void)
 	::: __clobber_all);
 }
 
+SEC("socket")
+__success
+__naked void arg_track_join_convergence(void)
+{
+	asm volatile (
+	"r1 = 1;"
+	"r2 = 2;"
+	"call arg_track_join_convergence_subprog;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void arg_track_join_convergence_subprog(void)
+{
+	asm volatile (
+	"if r1 == 0 goto 1f;"
+	"r0 = r1;"
+	"goto 2f;"
+"1:"
+	"r0 = r2;"
+"2:"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
 SEC("socket")
 __flag(BPF_F_TEST_STATE_FREQ)
 __log_level(2)
-/* read_first_param2() function */
-__msg(" 9: .1........ (79) r0 = *(u64 *)(r1 +0)")
-__msg("10: .......... (b7) r0 = 0")
-__msg("11: 0......... (05) goto pc+0")
-__msg("12: 0......... (95) exit")
+/* fp0-8 consumed at insn 9, dead by insn 11. stack_def at insn 4 kills slots 0,1. */
+__msg("4: (7b) *(u64 *)(r10 -8) = r0        ; def: fp0-8")
+/* stack_use at call site: callee reads fp0-8, slots 0,1 live */
+__msg("7: (85) call pc+{{.*}}               ; use: fp0-8")
+/* read_first_param2: no caller stack live inside callee after first read */
+__msg("9: (79) r0 = *(u64 *)(r1 +0)         ; use: fp0-8")
+__msg("10: (b7) r0 = 0{{$}}")
+__msg("11: (05) goto pc+0{{$}}")
+__msg("12: (95) exit")
 /*
- * The purpose of the test is to check that checkpoint in
- * read_first_param2() stops path traversal. This will only happen if
- * verifier understands that fp[0]-8 at insn (12) is not alive.
+ * Checkpoint at goto +0 fires because fp0-8 is dead → state pruning.
  */
 __msg("12: safe")
-__msg("processed 20 insns")
 __naked void caller_stack_pruning(void)
 {
 	asm volatile (
@@ -342,3 +361,2289 @@ static __used __naked unsigned long write_tail_call(void)
           __imm_addr(map_array)
         : __clobber_all);
 }
+
+/* Test precise subprog stack access analysis.
+ * Caller passes fp-32 (SPI 3) to callee that only accesses arg+0 and arg+8
+ * (SPIs 3 and 2). SPIs 0 and 1 should NOT be live at the call site.
+ *
+ * Insn layout:
+ *   0: *(u64*)(r10 - 8) = 0      write SPI 0
+ *   1: *(u64*)(r10 - 16) = 0     write SPI 1
+ *   2: *(u64*)(r10 - 24) = 0     write SPI 2
+ *   3: *(u64*)(r10 - 32) = 0     write SPI 3
+ *   4: r1 = r10
+ *   5: r1 += -32
+ *   6: call precise_read_two      passes fp-32 (SPI 3)
+ *   7: r0 = 0
+ *   8: exit
+ *
+ * At insn 6 only SPIs 2,3 should be live (slots 4-7, 0xf0).
+ * SPIs 0,1 are written but never read → dead.
+ */
+SEC("socket")
+__log_level(2)
+__msg("6: (85) call pc+{{.*}}                   ; use: fp0-24 fp0-32{{$}}")
+__naked void subprog_precise_stack_access(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"
+	"*(u64 *)(r10 - 16) = 0;"
+	"*(u64 *)(r10 - 24) = 0;"
+	"*(u64 *)(r10 - 32) = 0;"
+	"r1 = r10;"
+	"r1 += -32;"
+	"call precise_read_two;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Callee reads only at arg+0 (SPI 3) and arg+8 (SPI 2) */
+static __used __naked void precise_read_two(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"
+	"r2 = *(u64 *)(r1 + 8);"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that multi-level subprog calls (callee passes arg-derived ptr
+ * to another BPF subprog) are analyzed precisely.
+ *
+ * Caller passes fp-32 (SPI 3). The callee forwards it to inner_callee.
+ * inner_callee only reads at offset 0 from the pointer.
+ * The analysis recurses into forward_to_inner -> inner_callee and
+ * determines only SPI 3 is accessed (slots 6-7, 0xc0), not all of SPIs 0-3.
+ *
+ * Insn layout:
+ *   0: *(u64*)(r10 - 8) = 0      write SPI 0
+ *   1: *(u64*)(r10 - 16) = 0     write SPI 1
+ *   2: *(u64*)(r10 - 24) = 0     write SPI 2
+ *   3: *(u64*)(r10 - 32) = 0     write SPI 3
+ *   4: r1 = r10
+ *   5: r1 += -32
+ *   6: call forward_to_inner      passes fp-32 (SPI 3)
+ *   7: r0 = 0
+ *   8: exit
+ */
+SEC("socket")
+__log_level(2)
+__msg("6: (85) call pc+{{.*}}                   ; use: fp0-32{{$}}")
+__naked void subprog_multilevel_conservative(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"
+	"*(u64 *)(r10 - 16) = 0;"
+	"*(u64 *)(r10 - 24) = 0;"
+	"*(u64 *)(r10 - 32) = 0;"
+	"r1 = r10;"
+	"r1 += -32;"
+	"call forward_to_inner;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Forwards arg to another subprog */
+static __used __naked void forward_to_inner(void)
+{
+	asm volatile (
+	"call inner_callee;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void inner_callee(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test multi-frame precision loss: callee consumes caller stack early,
+ * but static liveness keeps it live at pruning points inside callee.
+ *
+ * Caller stores map_ptr or scalar(42) at fp-8, then calls
+ * consume_and_call_inner. The callee reads fp0-8 at entry (consuming
+ * the slot), then calls do_nothing2. After do_nothing2 returns (a
+ * pruning point), fp-8 should be dead -- the read already happened.
+ * But because the call instruction's stack_use includes SPI 0, the
+ * static live_stack_before at insn 7 is 0x3, keeping fp-8 live inside
+ * the callee and preventing state pruning between the two paths.
+ *
+ * Insn layout:
+ *   0: call bpf_get_prandom_u32
+ *   1: if r0 == 42 goto pc+2    -> insn 4
+ *   2: r0 = map ll (ldimm64 part1)
+ *   3: (ldimm64 part2)
+ *   4: *(u64)(r10 - 8) = r0     fp-8 = map_ptr OR scalar(42)
+ *   5: r1 = r10
+ *   6: r1 += -8
+ *   7: call consume_and_call_inner
+ *   8: r0 = 0
+ *   9: exit
+ *
+ * At insn 7, live_stack_before = 0x3 (slots 0-1 live due to stack_use).
+ * At insn 8, live_stack_before = 0x0 (SPI 0 dead, caller doesn't need it).
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__log_level(2)
+__success
+__msg(" 7: (85) call pc+{{.*}}                   ; use: fp0-8")
+__msg(" 8: {{.*}} (b7)")
+__naked void callee_consumed_caller_stack(void)
+{
+	asm volatile (
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 42 goto 1f;"
+	"r0 = %[map] ll;"
+"1:"
+	"*(u64 *)(r10 - 8) = r0;"
+	"r1 = r10;"
+	"r1 += -8;"
+	"call consume_and_call_inner;"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+static __used __naked void consume_and_call_inner(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"	/* read fp[0]-8 into caller-saved r0 */
+	"call do_nothing2;"		/* inner call clobbers r0 */
+	"r0 = 0;"
+	"goto +0;"			/* checkpoint */
+	"r0 = 0;"
+	"goto +0;"			/* checkpoint */
+	"r0 = 0;"
+	"goto +0;"			/* checkpoint */
+	"r0 = 0;"
+	"goto +0;"			/* checkpoint */
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void do_nothing2(void)
+{
+	asm volatile (
+	"r0 = 0;"
+	"r0 = 0;"
+	"r0 = 0;"
+	"r0 = 0;"
+	"r0 = 0;"
+	"r0 = 0;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Reproducer for unsound pruning when clean_verifier_state() promotes
+ * live STACK_ZERO bytes to STACK_MISC.
+ *
+ * Program shape:
+ * - Build key at fp-4:
+ *   - path A keeps key byte as STACK_ZERO;
+ *   - path B writes unknown byte making it STACK_MISC.
+ * - Branches merge at a prune point before map_lookup.
+ * - map_lookup on ARRAY map is value-sensitive to constant zero key:
+ *   - path A: const key 0 => PTR_TO_MAP_VALUE (non-NULL);
+ *   - path B: non-const key => PTR_TO_MAP_VALUE_OR_NULL.
+ * - Dereference lookup result without null check.
+ *
+ * Note this behavior won't trigger at fp-8, since the verifier will
+ * track 32-bit scalar spill differently as spilled_ptr.
+ *
+ * Correct verifier behavior: reject (path B unsafe).
+ * With blanket STACK_ZERO->STACK_MISC promotion on live slots, cached path A
+ * state can be generalized and incorrectly prune path B, letting the program load.
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'map_value_or_null'")
+__naked void stack_zero_to_misc_unsound_array_lookup(void)
+{
+	asm volatile (
+	/* key at fp-4: all bytes STACK_ZERO */
+	"*(u32 *)(r10 - 4) = 0;"
+	"call %[bpf_get_prandom_u32];"	/* r0 selects path A or path B */
+	/* fall-through (path A) explored first */
+	"if r0 != 0 goto l_nonconst%=;"
+	/* path A: keep key constant zero */
+	"goto l_lookup%=;"
+"l_nonconst%=:"
+	/* path B: key byte turns to STACK_MISC, key no longer const */
+	"*(u8 *)(r10 - 4) = r0;"
+"l_lookup%=:"
+	/* value-sensitive lookup */
+	"r2 = r10;"
+	"r2 += -4;"			/* r2 = &key (fp-4) */
+	"r1 = %[array_map_8b] ll;"	/* r1 = map */
+	"call %[bpf_map_lookup_elem];"
+	/* unsafe when lookup result is map_value_or_null */
+	"r0 = *(u64 *)(r0 + 0);"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm(bpf_map_lookup_elem),
+	  __imm_addr(array_map_8b)
+	: __clobber_all);
+}
+
+/*
+ * Subprog variant of stack_zero_to_misc_unsound_array_lookup.
+ *
+ * Check unsound pruning when a callee modifies the caller's
+ * stack through a pointer argument.
+ *
+ * Program shape:
+ *   main:
+ *     *(u32)(fp - 4) = 0            key = 0 (all bytes STACK_ZERO)
+ *     r1 = fp - 4
+ *     call maybe_clobber_key        may overwrite key[0] with scalar
+ *     <-- prune point: two states meet here -->
+ *     r2 = fp - 4
+ *     r1 = array_map_8b
+ *     call bpf_map_lookup_elem      value-sensitive on const-zero key
+ *     r0 = *(u64)(r0 + 0)           deref without null check
+ *     exit
+ *
+ *   maybe_clobber_key(r1):
+ *     r6 = r1                       save &key
+ *     call bpf_get_prandom_u32
+ *     if r0 == 0 goto skip          path A: key stays STACK_ZERO
+ *     *(u8)(r6 + 0) = r0            path B: key[0] becomes STACK_MISC
+ *   skip:
+ *     r0 = 0
+ *     exit
+ *
+ * Path A: const-zero key => array lookup => PTR_TO_MAP_VALUE => deref OK.
+ * Path B: non-const key  => array lookup => PTR_TO_MAP_VALUE_OR_NULL => UNSAFE.
+ *
+ * If the cleaner collapses STACK_ZERO -> STACK_MISC for the live key
+ * slot, path A's cached state matches path B, pruning the unsafe path.
+ *
+ * Correct verifier behaviour: reject.
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'map_value_or_null'")
+__naked void subprog_stack_zero_to_misc_unsound(void)
+{
+	asm volatile (
+	/* key at fp-4: all bytes STACK_ZERO */
+	"*(u32 *)(r10 - 4) = 0;"
+	/* subprog may clobber key[0] with a scalar byte */
+	"r1 = r10;"
+	"r1 += -4;"			/* r1 = &key (fp-4), the callee's only argument */
+	"call maybe_clobber_key;"
+	/* value-sensitive array lookup */
+	"r2 = r10;"
+	"r2 += -4;"			/* r2 = &key (fp-4) */
+	"r1 = %[array_map_8b] ll;"	/* r1 = map */
+	"call %[bpf_map_lookup_elem];"
+	/* unsafe when result is map_value_or_null (path B) */
+	"r0 = *(u64 *)(r0 + 0);"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm_addr(array_map_8b)
+	: __clobber_all);
+}
+
+static __used __naked void maybe_clobber_key(void)
+{
+	asm volatile (
+	"r6 = r1;"			/* keep &key in r6 across the helper call */
+	"call %[bpf_get_prandom_u32];"
+	/* path A (r0==0): key stays STACK_ZERO, explored first */
+	"if r0 == 0 goto 1f;"
+	/* path B (r0!=0): overwrite key[0] with scalar */
+	"*(u8 *)(r6 + 0) = r0;"
+	"1:"
+	"r0 = 0;"			/* both paths return 0 */
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * Demonstrate that subprog arg spill/reload breaks arg tracking,
+ * inflating caller stack liveness and preventing state pruning.
+ *
+ * modifier2(fp-24) has two paths: one writes a scalar to *(r1+8)
+ * = caller fp-16, the other leaves it as zero.  After modifier2
+ * returns, fp-16 is never read again — it is dead.
+ *
+ * spill_reload_reader2(fp-24) only reads caller fp-8 via
+ * *(r1+16), but it spills r1 across a helper call.  This
+ * breaks compute_subprog_arg_access(): the reload from callee
+ * stack cannot be connected back to arg1, so arg1 access goes
+ * "all (conservative)".  At the call site (r1 = fp-24, slot 5)
+ * apply_callee_stack_access() marks slots 0..5 as stack_use —
+ * pulling fp-16 (slots 2-3) into live_stack_before even though
+ * the reader never touches it.
+ *
+ * Result: at modifier2's return point two states with different
+ * fp-16 values cannot be pruned.
+ *
+ * With correct (or old dynamic) liveness fp-16 is dead at that
+ * point and the states prune → "6: safe" appears in the log.
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__log_level(2)
+__success
+__msg("6: safe")
+__naked void spill_reload_inflates_stack_liveness(void)
+{
+	asm volatile (
+	/* struct at fp-24: { ctx; ptr; tail; } */
+	"*(u64 *)(r10 - 24) = r1;"		/* fp-24 = ctx */
+	"*(u64 *)(r10 - 16) = r1;"		/* fp-16 = ctx (STACK_SPILL ptr) */
+	"*(u64 *)(r10 - 8) = 0;"		/* fp-8  = tail */
+	/* modifier2 writes different values to fp-16 on two paths */
+	"r1 = r10;"
+	"r1 += -24;"
+	"call modifier2;"			/* may store to fp-16 through *(r1 + 8) */
+	/* insn 6: prune point — two states with different fp-16
+	 * path A: fp-16 = STACK_MISC  (scalar overwrote pointer)
+	 * path B: fp-16 = STACK_SPILL (original ctx pointer)
+	 * STACK_MISC does NOT subsume STACK_SPILL(ptr),
+	 * so pruning fails unless fp-16 is cleaned (dead).
+	 */
+	"r1 = r10;"
+	"r1 += -24;"
+	"call spill_reload_reader2;"		/* reads fp-8 via *(r1+16) */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Two paths: one writes a scalar to *(r1+8) = caller fp-16,
+ * the other leaves it unchanged.  Both return 0 via separate
+ * exits to prevent pruning inside the subprog at the merge.
+ */
+static __used __naked void modifier2(void)
+{
+	asm volatile (
+	"r6 = r1;"				/* keep the argument in r6 across the helper call */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 0 goto 1f;"			/* r0 == 0: skip the store (path B) */
+	"*(u64 *)(r6 + 8) = r0;"		/* fp-16 = random */
+	"r0 = 0;"
+	"exit;"					/* path A exit */
+	"1:"
+	"r0 = 0;"
+	"exit;"					/* path B exit */
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Receives r1 = caller fp-24.  Only reads *(r1+16) = fp-8.
+ * Spills r1 across a helper call → arg tracking goes conservative →
+ * slots 0..5 all appear used instead of just slots 0-1 (fp-8).
+ */
+static __used __naked void spill_reload_reader2(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = r1;"		/* spill arg1 */
+	"call %[bpf_get_prandom_u32];"		/* clobbers r1-r5 */
+	"r1 = *(u64 *)(r10 - 8);"		/* reload arg1 */
+	"r0 = *(u64 *)(r1 + 16);"		/* read caller fp-8 */
+	"r0 = 0;"				/* loaded value is discarded; return 0 */
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* BTF FUNC records are not generated for kfuncs referenced
+ * from inline assembly. These records are necessary for
+ * libbpf to link the program. The function below is a hack
+ * to ensure that BTF FUNC records are generated.
+ */
+void __kfunc_btf_root(void)
+{
+	bpf_iter_num_new(0, 0, 0);	/* all-zero arguments: the call exists only to reference the kfunc */
+	bpf_iter_num_next(0);		/* same for the remaining iterator kfuncs used from asm */
+	bpf_iter_num_destroy(0);
+}
+
+/* Test that open-coded iterator kfunc arguments get precise stack
+ * liveness tracking. struct bpf_iter_num is 8 bytes (1 SPI).
+ *
+ * Insn layout:
+ *   0: *(u64*)(r10 - 8) = 0      write SPI 0 (dead)
+ *   1: *(u64*)(r10 - 16) = 0     write SPI 1 (dead)
+ *   2: r1 = r10
+ *   3: r1 += -24                 iter state at fp-24 (SPI 2)
+ *   4: r2 = 0
+ *   5: r3 = 10
+ *   6: call bpf_iter_num_new     defines SPI 2 (KF_ITER_NEW) → 0x0
+ *   7-8: r1 = fp-24
+ *   9: call bpf_iter_num_next    uses SPI 2 → 0x30
+ *  10: if r0 == 0 goto 2f
+ *  11: goto 1b
+ *  12-13: r1 = fp-24
+ *  14: call bpf_iter_num_destroy uses SPI 2 → 0x30
+ *  15: r0 = 0
+ *  16: exit
+ *
+ * At insn 6, SPI 2 is defined (KF_ITER_NEW initializes, doesn't read),
+ * so it kills liveness from successors. live_stack_before = 0x0.
+ * At insns 9 and 14, SPI 2 is used (iter_next/destroy read the state),
+ * so live_stack_before = 0x30.
+ */
+SEC("socket")
+__success __log_level(2)
+__msg(" 6: (85) call bpf_iter_num_new{{.*}}          ; def: fp0-24{{$}}")
+__msg(" 9: (85) call bpf_iter_num_next{{.*}}         ; use: fp0-24{{$}}")
+__msg("14: (85) call bpf_iter_num_destroy{{.*}}      ; use: fp0-24{{$}}")
+__naked void kfunc_iter_stack_liveness(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"	/* SPI 0 - dead */
+	"*(u64 *)(r10 - 16) = 0;"	/* SPI 1 - dead */
+	"r1 = r10;"
+	"r1 += -24;"			/* r1 = iterator state at fp-24 */
+	"r2 = 0;"
+	"r3 = 10;"
+	"call %[bpf_iter_num_new];"	/* insn 6 */
+"1:"
+	"r1 = r10;"
+	"r1 += -24;"
+	"call %[bpf_iter_num_next];"	/* insn 9 */
+	"if r0 == 0 goto 2f;"		/* leave the loop once next returns 0 */
+	"goto 1b;"			/* otherwise call next again */
+"2:"
+	"r1 = r10;"
+	"r1 += -24;"
+	"call %[bpf_iter_num_destroy];"	/* insn 14 */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_iter_num_new),
+	   __imm(bpf_iter_num_next),
+	   __imm(bpf_iter_num_destroy)
+	: __clobber_all);
+}
+
+/*
+ * Test for soundness bug in static stack liveness analysis.
+ *
+ * The static pre-pass tracks FP-derived register offsets to determine
+ * which stack slots are accessed. When a PTR_TO_STACK is spilled to
+ * the stack and later reloaded, the reload (BPF_LDX) kills FP-derived
+ * tracking, making subsequent accesses through the reloaded pointer
+ * invisible to the static analysis.
+ *
+ * This causes the analysis to incorrectly mark SPI 0 as dead at the
+ * merge point. clean_verifier_state() zeros it in the cached state,
+ * and stacksafe() accepts the new state against STACK_INVALID,
+ * enabling incorrect pruning.
+ *
+ * Path A (verified first): stores PTR_TO_MAP_VALUE in SPI 0
+ * Path B (verified second): stores scalar 42 in SPI 0
+ * After merge: reads SPI 0 through spilled/reloaded PTR_TO_STACK
+ * and dereferences the result as a pointer.
+ *
+ * Correct behavior: reject (path B dereferences a scalar)
+ * Bug behavior: accept (path B is incorrectly pruned)
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'scalar'")
+__naked void spill_ptr_liveness_type_confusion(void)
+{
+	asm volatile (
+	/* Map lookup to get PTR_TO_MAP_VALUE */
+	"r1 = %[map] ll;"
+	"*(u32 *)(r10 - 32) = 0;"	/* lookup key lives at fp-32 */
+	"r2 = r10;"
+	"r2 += -32;"
+	"call %[bpf_map_lookup_elem];"
+	"if r0 == 0 goto l_exit%=;"	/* failed lookup: nothing to test */
+	/* r6 = PTR_TO_MAP_VALUE (callee-saved) */
+	"r6 = r0;"
+	/* Branch: fall-through (path A) verified first */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 != 0 goto l_scalar%=;"
+	/* Path A: store map value ptr at SPI 0 */
+	"*(u64 *)(r10 - 8) = r6;"
+	"goto l_merge%=;"
+"l_scalar%=:"
+	/* Path B: store scalar at SPI 0 */
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+"l_merge%=:"
+	/*
+	 * Spill PTR_TO_STACK{off=-8} to SPI 1, then reload.
+	 * Reload kills FP-derived tracking, hiding the
+	 * subsequent SPI 0 access from the static analysis.
+	 */
+	"r1 = r10;"
+	"r1 += -8;"
+	"*(u64 *)(r10 - 16) = r1;"
+	"goto +0;"			/* checkpoint */
+	"goto +0;"			/* checkpoint */
+	"goto +0;"			/* checkpoint */
+	"r1 = *(u64 *)(r10 - 16);"
+	/* Read SPI 0 through reloaded pointer */
+	"r0 = *(u64 *)(r1 + 0);"
+	/* Dereference: safe for map value (path A),
+	 * unsafe for scalar (path B).
+	 */
+	"r0 = *(u64 *)(r0 + 0);"
+	"exit;"
+"l_exit%=:"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm(bpf_get_prandom_u32),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+/* === Tests for 4-byte stack slot liveness granularity === */
+
+/* Test that a 4-byte aligned write is stack_def and kills liveness.
+ *
+ *   0: *(u64 *)(r10 - 8) = 0      def slots 0,1 (full SPI 0)
+ *   1: *(u32 *)(r10 - 8) = 0      def slot 1 (4-byte write kills slot 1)
+ *   2: r0 = *(u64 *)(r10 - 8)     use slots 0,1
+ *   3: r0 = 0
+ *   4: exit
+ *
+ * At insn 1, the 4-byte write defines slot 1. Slot 0 still flows
+ * backward from insn 2's read: live_stack_before = 0x1.
+ */
+SEC("socket")
+__log_level(2)
+__msg("1: (62) *(u32 *)(r10 -8) = 0         ; def: fp0-8h")
+__naked void four_byte_write_kills_slot(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"	/* insn 0: 8-byte store, both halves of fp-8 */
+	"*(u32 *)(r10 - 8) = 0;"	/* insn 1: 4-byte store, the line matched by __msg */
+	"r0 = *(u64 *)(r10 - 8);"	/* insn 2: 8-byte load of the same location */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that a write to the upper half of an SPI is dead when only
+ * the lower half is read. This was impossible at SPI granularity
+ * where any read of the SPI kept the entire SPI live.
+ *
+ *   0: *(u32 *)(r10 - 8) = 0      def slot 1 (DEAD: never read)
+ *   1: *(u32 *)(r10 - 4) = 0      def slot 0
+ *   2: r0 = *(u32 *)(r10 - 4)     use slot 0 only
+ *   3: r0 = 0
+ *   4: exit
+ *
+ * At insn 0, nothing is live (0x0). Previously at SPI granularity,
+ * the read at insn 2 would mark the full SPI 0 as live and the
+ * 4-byte writes wouldn't count as def, so insn 0 would have had
+ * SPI 0 live (0x1).
+ */
+SEC("socket")
+__log_level(2)
+__msg("0: (62) *(u32 *)(r10 -8) = 0         ; def: fp0-8h")
+__msg("2: (61) r0 = *(u32 *)(r10 -4)        ; use: fp0-4h")
+__naked void dead_half_spi_write(void)
+{
+	asm volatile (
+	"*(u32 *)(r10 - 8) = 0;"	/* insn 0: 4-byte store at fp-8, never read back */
+	"*(u32 *)(r10 - 4) = 0;"	/* insn 1: 4-byte store at fp-4 */
+	"r0 = *(u32 *)(r10 - 4);"	/* insn 2: 4-byte load at fp-4 only */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that a 4-byte read from the upper half of SPI 0 makes only
+ * slot 1 live (0x2), not the full SPI (0x3).
+ *
+ *   0: *(u64 *)(r10 - 8) = 0      def slots 0,1
+ *   1: r0 = *(u32 *)(r10 - 8)     use slot 1 only (upper half)
+ *   2: r0 = 0
+ *   3: exit
+ *
+ * At insn 1, live_stack_before = 0x2 (slot 1 only).
+ */
+SEC("socket")
+__log_level(2)
+__msg("1: (61) r0 = *(u32 *)(r10 -8)        ; use: fp0-8h")
+__naked void four_byte_read_upper_half(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"	/* insn 0: 8-byte store at fp-8 */
+	"r0 = *(u32 *)(r10 - 8);"	/* insn 1: 4-byte load at fp-8, matched by __msg */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that a 2-byte write does NOT count as stack_def.
+ * Sub-4-byte writes don't fully cover a 4-byte slot,
+ * so liveness passes through.
+ *
+ *   0: *(u64 *)(r10 - 8) = 0      def slots 0,1
+ *   1: *(u16 *)(r10 - 4) = 0      NOT stack_def (2 < 4 bytes)
+ *   2: r0 = *(u32 *)(r10 - 4)     use slot 0
+ *   3: r0 = 0
+ *   4: exit
+ *
+ * At insn 1, slot 0 still live (0x1) because 2-byte write
+ * didn't kill it.
+ */
+SEC("socket")
+__log_level(2)
+__msg("0: (7a) *(u64 *)(r10 -8) = 0         ; def: fp0-8")
+__msg("1: (6a) *(u16 *)(r10 -4) = 0{{$}}")	/* {{$}}: the line must end here, no "; def:" suffix */
+__msg("2: (61) r0 = *(u32 *)(r10 -4)        ; use: fp0-4h")
+__naked void two_byte_write_no_kill(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"	/* insn 0: 8-byte store at fp-8 */
+	"*(u16 *)(r10 - 4) = 0;"	/* insn 1: 2-byte store at fp-4 */
+	"r0 = *(u32 *)(r10 - 4);"	/* insn 2: 4-byte load at fp-4 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that a 1-byte write does NOT count as stack_def.
+ *
+ *   0: *(u64 *)(r10 - 8) = 0      def slots 0,1
+ *   1: *(u8 *)(r10 - 4) = 0       NOT stack_def (1 < 4 bytes)
+ *   2: r0 = *(u32 *)(r10 - 4)     use slot 0
+ *   3: r0 = 0
+ *   4: exit
+ *
+ * At insn 1, slot 0 still live (0x1).
+ */
+SEC("socket")
+__log_level(2)
+__msg("0: (7a) *(u64 *)(r10 -8) = 0         ; def: fp0-8")
+__msg("1: (72) *(u8 *)(r10 -4) = 0{{$}}")	/* {{$}}: the line must end here, no "; def:" suffix */
+__msg("2: (61) r0 = *(u32 *)(r10 -4)        ; use: fp0-4h")
+__naked void one_byte_write_no_kill(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"	/* insn 0: 8-byte store at fp-8 */
+	"*(u8 *)(r10 - 4) = 0;"		/* insn 1: 1-byte store at fp-4 */
+	"r0 = *(u32 *)(r10 - 4);"	/* insn 2: 4-byte load at fp-4 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test stack access beyond fp-256 exercising the second bitmask word.
+ * fp-264 is SPI 32, slots 64-65, which are bits 0-1 of live_stack[1].
+ *
+ *   0: *(u64 *)(r10 - 264) = 0     def slots 64,65
+ *   1: r0 = *(u64 *)(r10 - 264)    use slots 64,65
+ *   2: r0 = 0
+ *   3: exit
+ *
+ * At insn 1, live_stack high word has bits 0,1 set: 0x3:0x0.
+ */
+SEC("socket")
+__log_level(2)
+__msg("1: (79) r0 = *(u64 *)(r10 -264)      ; use: fp0-264")
+__naked void high_stack_second_bitmask_word(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 264) = 0;"	/* insn 0: 8-byte store at fp-264 */
+	"r0 = *(u64 *)(r10 - 264);"	/* insn 1: 8-byte load, matched by __msg */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that two separate 4-byte writes to each half of an SPI
+ * together kill liveness for the full SPI.
+ *
+ *   0: *(u32 *)(r10 - 8) = 0      def slot 1 (upper half)
+ *   1: *(u32 *)(r10 - 4) = 0      def slot 0 (lower half)
+ *   2: r0 = *(u64 *)(r10 - 8)     use slots 0,1
+ *   3: r0 = 0
+ *   4: exit
+ *
+ * At insn 0: live_stack_before = 0x0 (both slots killed by insns 0,1).
+ * At insn 1: live_stack_before = 0x2 (slot 1 still live, slot 0 killed here).
+ */
+SEC("socket")
+__log_level(2)
+__msg("0: (62) *(u32 *)(r10 -8) = 0         ; def: fp0-8h")
+__msg("1: (62) *(u32 *)(r10 -4) = 0         ; def: fp0-4h")
+__naked void two_four_byte_writes_kill_full_spi(void)
+{
+	asm volatile (
+	"*(u32 *)(r10 - 8) = 0;"	/* insn 0: 4-byte store at fp-8 */
+	"*(u32 *)(r10 - 4) = 0;"	/* insn 1: 4-byte store at fp-4 */
+	"r0 = *(u64 *)(r10 - 8);"	/* insn 2: 8-byte load spanning both stores */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Test that 4-byte writes on both branches kill a slot at the
+ * join point. Previously at SPI granularity, a 4-byte write was
+ * not stack_def, so liveness would flow backward through the
+ * branch that only had a 4-byte write.
+ *
+ *   0: call bpf_get_prandom_u32
+ *   1: if r0 != 0 goto 1f
+ *   2: *(u64 *)(r10 - 8) = 0       path A: def slots 0,1
+ *   3: goto 2f
+ * 1:4: *(u32 *)(r10 - 4) = 0       path B: def slot 0
+ * 2:5: r0 = *(u32 *)(r10 - 4)      use slot 0
+ *   6: r0 = 0
+ *   7: exit
+ *
+ * Both paths define slot 0 before the read. At insn 1 (branch),
+ * live_stack_before = 0x0 because slot 0 is killed on both paths.
+ */
+SEC("socket")
+__log_level(2)
+__msg("1: (55) if r0 != 0x0 goto pc+2")
+__msg("2: (7a) *(u64 *)(r10 -8) = 0         ; def: fp0-8")
+__msg("3: (05) goto pc+1")
+__msg("4: (62) *(u32 *)(r10 -4) = 0         ; def: fp0-4h")
+__msg("5: (61) r0 = *(u32 *)(r10 -4)        ; use: fp0-4h")
+__naked void both_branches_kill_slot(void)
+{
+	asm volatile (
+	"call %[bpf_get_prandom_u32];"	/* insn 0: r0 selects the branch */
+	"if r0 != 0 goto 1f;"
+	"*(u64 *)(r10 - 8) = 0;"	/* path A: 8-byte store covering fp-4 */
+	"goto 2f;"
+"1:"
+	"*(u32 *)(r10 - 4) = 0;"	/* path B: 4-byte store at fp-4 */
+"2:"
+	"r0 = *(u32 *)(r10 - 4);"	/* join: 4-byte load at fp-4 */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Soundness: cleaning the dead upper half of an SPI must not
+ * affect the live lower half's type information for pruning.
+ *
+ * Both halves of SPI 0 are written separately. Only the lower
+ * half (slot 0) is used as a 4-byte map key. The upper half
+ * (slot 1) is dead and cleaned to STACK_INVALID.
+ *
+ * Path A: key stays 0 (STACK_ZERO) → non-null array lookup
+ * Path B: key byte turns STACK_MISC → may-null array lookup
+ * Deref without null check: safe for A, unsafe for B.
+ *
+ * If half-SPI cleaning incorrectly corrupted the live half's
+ * type info, path A's cached state could generalize and unsoundly
+ * prune path B.
+ *
+ * Expected: reject (path B unsafe).
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'map_value_or_null'")
+__naked void half_spi_clean_preserves_stack_zero(void)
+{
+	asm volatile (
+	"*(u32 *)(r10 - 4) = 0;"           /* slot 0: STACK_ZERO */
+	"*(u32 *)(r10 - 8) = 0;"           /* slot 1: STACK_ZERO (dead) */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 != 0 goto l_nonconst%=;"    /* fall-through is path A */
+	"goto l_lookup%=;"                 /* path A: key left untouched */
+"l_nonconst%=:"
+	"*(u8 *)(r10 - 4) = r0;"           /* slot 0: STACK_MISC */
+"l_lookup%=:"
+	"r2 = r10;"
+	"r2 += -4;"                        /* r2 = &key (fp-4) */
+	"r1 = %[array_map_8b] ll;"         /* r1 = map */
+	"call %[bpf_map_lookup_elem];"
+	"r0 = *(u64 *)(r0 + 0);"           /* unsafe if null */
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm(bpf_map_lookup_elem),
+	  __imm_addr(array_map_8b)
+	: __clobber_all);
+}
+
+/*
+ * Model of scx_lavd's pick_idle_cpu_at_cpdom iat block:
+ * conditional block with helper call and temporary stack spill,
+ * spill dead after merge.
+ *
+ * Path A (fall-through): spill r6 to fp-8 across helper call
+ * Path B (branch taken): skip the block entirely
+ * At merge (insn 6): fp-8 is dead (never read after merge)
+ *
+ * Static liveness marks fp-8 dead at merge. clean_verifier_state()
+ * converts path A's STACK_SPILL to STACK_INVALID. Path B has
+ * STACK_INVALID. stacksafe() matches -> path B pruned -> "6: safe".
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__success
+__log_level(2)
+__msg("6: safe")
+__naked void dead_spill_at_merge_enables_pruning(void)
+{
+	asm volatile (
+	"call %[bpf_get_prandom_u32];"
+	"r6 = 7;"			/* r6 holds 7 on both paths */
+	"if r0 != 0 goto l_skip%=;"	/* path B jumps over the block */
+	/* conditional block: spill, call, reload */
+	"*(u64 *)(r10 - 8) = r6;"
+	"call %[bpf_get_prandom_u32];"
+	"r6 = *(u64 *)(r10 - 8);"	/* last read of fp-8 in the program */
+"l_skip%=:"
+	/* fp-8 dead. Path B pruned here -> "6: safe" */
+	"r0 = r6;"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * FP-offset tracking loses precision on second ADD, killing all liveness.
+ *
+ * fp_off_insn_xfer() handles "FP itself + negative imm" precisely
+ * (e.g. r6 = r10; r6 += -24 -> slot 5).  But any subsequent ADD/SUB
+ * on a register that already has non-zero spis falls through to
+ * spis_set_all(), because the code only handles the FP-itself case.
+ *
+ * A write through this imprecise register enters the non-zero-spis
+ * branch of set_indirect_stack_access(), which OR's the all-ones
+ * mask into stack_def.  The backward liveness equation
+ *
+ *   stack_in = (stack_out & ~stack_def) | stack_use
+ *
+ * sees ~ALL = 0, killing ALL slot liveness at that instruction.
+ *
+ * At the merge pruning point, live_stack_before is empty.
+ * clean_verifier_state() marks fp-8 as STACK_INVALID.
+ * stacksafe() skips STACK_INVALID (line "continue"), so pruning
+ * succeeds regardless of the current state's fp-8 value.
+ * Path B is pruned, its null deref is never explored.
+ *
+ * Correct behavior: reject (path B dereferences NULL).
+ * Bug behavior: accept (path B pruned away).
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R1 invalid mem access 'scalar'")
+__naked void fp_add_loses_precision_kills_liveness(void)
+{
+	asm volatile (
+	"call %[bpf_get_prandom_u32];"	/* r0 selects path A or path B */
+	"if r0 != 0 goto l_pathB%=;"
+
+	/* Path A (fall-through, explored first): fp-8 = 0 */
+	"r1 = 0;"
+	"*(u64 *)(r10 - 8) = r1;"
+	"goto l_merge%=;"
+
+"l_pathB%=:"
+	/* Path B (explored second): fp-8 = 42 */
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+
+"l_merge%=:"
+	/*
+	 * Create imprecise FP-derived register.
+	 * r6 = r10 - 24 gets precise slot 5.
+	 * r6 += 8 hits the else branch (spis non-zero, delta > 0)
+	 * and sets spis to ALL.  r6 is actually r10-16.
+	 */
+	"r6 = r10;"
+	"r6 += -24;"
+	"r6 += 8;"
+
+	/*
+	 * Write through imprecise r6.  Actually writes to fp-16
+	 * (does NOT touch fp-8), but liveness marks ALL slots
+	 * as stack_def, killing fp-8's liveness.
+	 */
+	"r7 = 0;"
+	"*(u64 *)(r6 + 0) = r7;"
+
+	/* Read fp-8: liveness says dead, but value is needed. */
+	"r2 = *(u64 *)(r10 - 8);"
+	"if r2 == 42 goto l_danger%=;"
+
+	/* r2 != 42 (path A: r2 == 0): safe exit */
+	"r0 = 0;"
+	"exit;"
+
+"l_danger%=:"
+	/* Only reachable from path B (r2 == 42): null deref */
+	"r1 = 0;"
+	"r0 = *(u64 *)(r1 + 0);"	/* load through r1 == 0: the expected failure */
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R1 invalid mem access 'scalar'")
+__naked void fp_spill_loses_precision_kills_liveness(void)
+{
+	asm volatile (
+	"call %[bpf_get_prandom_u32];"	/* r0 selects path A or path B */
+	"if r0 != 0 goto l_pathB%=;"
+
+	"r1 = 0;"			/* path A (fall-through): fp-8 = 0 */
+	"*(u64 *)(r10 - 8) = r1;"
+	"goto l_merge%=;"
+
+"l_pathB%=:"
+	"r1 = 42;"			/* path B: fp-8 = 42 */
+	"*(u64 *)(r10 - 8) = r1;"
+
+"l_merge%=:"
+	"r6 = r10;"
+	"r6 += -64;"			/* r6 = fp-64 */
+	"*(u64 *)(r10 - 160) = r6;"	/* spill the stack pointer to fp-160 ... */
+	"r6 = *(u64 *)(r10 - 160);"	/* ... and reload it straight back */
+
+	"r7 = 0;"
+	"*(u64 *)(r6 + 0) = r7;"	/* store through reloaded r6: hits fp-64, not fp-8 */
+
+	"r2 = *(u64 *)(r10 - 8);"	/* fp-8 value is still needed here */
+	"if r2 == 42 goto l_danger%=;"
+
+	"r0 = *(u64 *)(r10 - 56);"	/* NOTE(review): fp-56 is never stored to in this program - confirm offset */
+	"exit;"
+
+"l_danger%=:"
+	"r1 = 0;"			/* reached only when fp-8 held 42 (path B) */
+	"r0 = *(u64 *)(r1 + 0);"	/* load through r1 == 0: the expected failure */
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* === Tests for frame-based AT_FP tracking === */
+
+/*
+ * Test 1: conditional_stx_in_subprog
+ * Subprog conditionally writes caller's slot.
+ * Verify slot stays live (backward pass handles conditional def via CFG).
+ *
+ * Main writes fp-8=42, calls cond_writer(fp-8), reads fp-8.
+ * cond_writer only writes on one path → parent_def only on that path.
+ * The backward parent_live correctly keeps fp-8 live at entry
+ * (conditional write doesn't kill liveness at the join).
+ */
+SEC("socket")
+__log_level(2)
+/* fp-8 live at call (callee conditionally writes → slot not killed) */
+__msg("1: (7b) *(u64 *)(r10 -8) = r1        ; def: fp0-8")
+__msg("4: (85) call pc+2{{$}}")
+__msg("5: (79) r0 = *(u64 *)(r10 -8)        ; use: fp0-8")
+__naked void conditional_stx_in_subprog(void)
+{
+	asm volatile (
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"	/* insn 1: fp-8 = 42 */
+	"r1 = r10;"
+	"r1 += -8;"			/* r1 = &fp-8, argument for cond_writer */
+	"call cond_writer;"		/* insn 4 */
+	"r0 = *(u64 *)(r10 - 8);"	/* insn 5: fp-8 read after the call */
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Conditionally writes to *(r1+0) */
+static __used __naked void cond_writer(void)
+{
+	asm volatile (
+	"r6 = r1;"			/* keep the argument in r6 across the helper call */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 0 goto 1f;"		/* r0 == 0: skip the store */
+	"*(u64 *)(r6 + 0) = r0;"	/* store through the argument pointer */
+	"1:"
+	"r0 = 0;"			/* both paths return 0 */
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__log_level(2)
+__msg("4: (85) call pc+{{.*}}                   ; use: fp0-16")
+__msg("7: (85) call pc+{{.*}}                   ; use: fp0-32")
+__naked void multiple_callsites_different_offsets(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 16) = 0;"
+	"*(u64 *)(r10 - 32) = 0;"
+	"r1 = r10;"
+	"r1 += -16;"			/* first call site: r1 = fp-16 */
+	"call read_first_param;"	/* insn 4 */
+	"r1 = r10;"
+	"r1 += -32;"			/* second call site: r1 = fp-32 */
+	"call read_first_param;"	/* insn 7: same callee, different pointer */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Test 3: nested_fp_passthrough
+ * main→A→B, main's FP forwarded to B. B accesses main's stack.
+ * Verify liveness propagates through.
+ *
+ * Main passes fp-32 to outer_forwarder, which passes it to inner_reader.
+ * inner_reader reads at arg+0 (= main's fp-32).
+ * parent_live propagates transitively: inner→outer→main.
+ */
+SEC("socket")
+__log_level(2)
+/* At call to outer_forwarder: main's fp-32 (slots 6,7) should be live */
+__msg("6: (85) call pc+{{.*}}                   ; use: fp0-32")
+__naked void nested_fp_passthrough(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"
+	"*(u64 *)(r10 - 16) = 0;"
+	"*(u64 *)(r10 - 24) = 0;"
+	"*(u64 *)(r10 - 32) = 0;"
+	"r1 = r10;"
+	"r1 += -32;"			/* r1 = fp-32, forwarded down the call chain */
+	"call outer_forwarder;"		/* insn 6 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Forwards arg to inner_reader */
+static __used __naked void outer_forwarder(void)
+{
+	asm volatile (
+	"call inner_reader;"		/* r1 is passed on unmodified */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void inner_reader(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"	/* 8-byte load through the forwarded pointer */
+	"r0 = 0;"			/* loaded value is discarded; return 0 */
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Test 4: callee_must_write_before_read
+ * Callee unconditionally writes parent slot before reading.
+ * Verify slot is NOT live at call site (parent_def kills it).
+ */
+SEC("socket")
+__log_level(2)
+/* fp-8 NOT live at call: callee writes before reading (parent_def kills it) */
+__msg("2: .12345.... (85) call pc+")	/* NOTE(review): no "; use:"/"; def:" suffix is matched, unlike sibling tests - confirm */
+__naked void callee_must_write_before_read(void)
+{
+	asm volatile (
+	"r1 = r10;"
+	"r1 += -8;"			/* r1 = &fp-8; fp-8 is not written before the call */
+	"call write_then_read;"		/* insn 2 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Unconditionally writes *(r1+0), then reads it back */
+static __used __naked void write_then_read(void)
+{
+	asm volatile (
+	"r6 = r1;"			/* r6 = argument pointer */
+	"r7 = 99;"
+	"*(u64 *)(r6 + 0) = r7;"	/* store first ... */
+	"r0 = *(u64 *)(r6 + 0);"	/* ... then load the same location */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Test 5: return_site_liveness_bleeding
+ * Main calls subprog twice. Slot used after one call but not the other.
+ * Context-insensitive: slot conservatively live at both.
+ *
+ * After first call: read fp-8.
+ * After second call: don't read fp-8.
+ * Since parent_live is per-subprog (not per call-site),
+ * fp-8 is live at both call sites.
+ */
+SEC("socket")
+__log_level(2)
+/* Both calls have fp-8 live due to context-insensitive parent_live */
+__msg("3: (85) call pc+{{.*}}                   ; use: fp0-8")
+__msg("7: (85) call pc+{{.*}}                   ; use: fp0-8")
+__naked void return_site_liveness_bleeding(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"
+	"r1 = r10;"
+	"r1 += -8;"
+	"call read_first_param;"	/* insn 3: first call */
+	"r0 = *(u64 *)(r10 - 8);"	/* fp-8 is read after the first call ... */
+	"r1 = r10;"
+	"r1 += -8;"
+	"call read_first_param;"	/* insn 7: second call */
+	"r0 = 0;"			/* ... but not after the second one */
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__log_level(2)
+__msg("9: (85) call bpf_loop#181            ; use: fp0-16")
+__naked void callback_conditional_read_beyond_ctx(void)
+{
+	asm volatile (
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+	"*(u64 *)(r10 - 16) = r1;"	/* fp-16 sits just below the ctx pointer */
+	"r1 = 2;"
+	"r2 = cb_cond_read ll;"		/* callback address */
+	"r3 = r10;"
+	"r3 += -8;"			/* r3 = ctx = fp-8 */
+	"r4 = 0;"
+	"call %[bpf_loop];"		/* insn 9 */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_loop)
+	: __clobber_all);
+}
+
+/* Callback conditionally reads *(ctx - 8) = caller fp-16 */
+static __used __naked void cb_cond_read(void)
+{
+	asm volatile (
+	"r6 = r2;"			/* r6 = ctx pointer, kept across the helper call */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 0 goto 1f;"		/* r0 == 0: skip the load */
+	"r0 = *(u64 *)(r6 - 8);"	/* load 8 bytes below ctx */
+	"1:"
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__log_level(2)
+__msg("14: (7b) *(u64 *)(r6 -8) = r7         ; def: fp0-16")
+__msg("15: (79) r0 = *(u64 *)(r6 -8)         ; use: fp0-16")
+__naked void callback_write_before_read_kills(void)
+{
+	asm volatile (
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+	"*(u64 *)(r10 - 16) = r1;"
+	"r1 = 2;"
+	"r2 = cb_write_read ll;"	/* callback address */
+	"r3 = r10;"
+	"r3 += -8;"			/* r3 = ctx = fp-8 */
+	"r4 = 0;"
+	"call %[bpf_loop];"		/* both __msg lines refer to insns inside the callback */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_loop)
+	: __clobber_all);
+}
+
+/* Callback unconditionally writes *(ctx-8), then reads it back.
+ * The write (parent_def) kills liveness before entry.
+ */
+static __used __naked void cb_write_read(void)
+{
+	asm volatile (
+	"r6 = r2;"			/* r6 = ctx pointer */
+	"r7 = 99;"
+	"*(u64 *)(r6 - 8) = r7;"	/* store 8 bytes below ctx ... */
+	"r0 = *(u64 *)(r6 - 8);"	/* ... then load the same location */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * bpf_loop callback conditionally writes fp-16 then unconditionally
+ * reads it. The conditional write does NOT kill liveness.
+ */
+SEC("socket")
+__log_level(2)
+__msg("9: (85) call bpf_loop#181            ; use: fp0-16")
+__naked void callback_conditional_write_preserves(void)
+{
+	asm volatile (
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+	"*(u64 *)(r10 - 16) = r1;"
+	"r1 = 2;"
+	"r2 = cb_cond_write_read ll;"	/* callback address */
+	"r3 = r10;"
+	"r3 += -8;"			/* r3 = ctx = fp-8 */
+	"r4 = 0;"
+	"call %[bpf_loop];"		/* insn 9 */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_loop)
+	: __clobber_all);
+}
+
+static __used __naked void cb_cond_write_read(void)
+{
+	asm volatile (
+	"r6 = r2;"			/* r6 = ctx pointer, kept across the helper call */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 0 goto 1f;"		/* r0 == 0: skip the store */
+	"*(u64 *)(r6 - 8) = r0;"	/* store 8 bytes below ctx on one path only */
+	"1:"
+	"r0 = *(u64 *)(r6 - 8);"	/* load of the same location on both paths */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * Two bpf_loop calls with the same callback but different ctx pointers.
+ *
+ * First call: ctx=fp-8, second call: ctx=fp-24.
+ */
+SEC("socket")
+__log_level(2)
+__msg(" 8: (85) call bpf_loop{{.*}}            ; use: fp0-8")
+__msg("15: (85) call bpf_loop{{.*}}            ; use: fp0-24")
+__naked void callback_two_calls_different_ctx(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"
+	"*(u64 *)(r10 - 24) = 0;"
+	"r1 = 1;"
+	"r2 = cb_read_ctx ll;"		/* callback address */
+	"r3 = r10;"
+	"r3 += -8;"			/* first call: ctx = fp-8 */
+	"r4 = 0;"
+	"call %[bpf_loop];"		/* insn 8 */
+	"r1 = 1;"
+	"r2 = cb_read_ctx ll;"		/* same callback again */
+	"r3 = r10;"
+	"r3 += -24;"			/* second call: ctx = fp-24 */
+	"r4 = 0;"
+	"call %[bpf_loop];"		/* insn 15 */
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_loop)
+	: __clobber_all);
+}
+
+/* Callback reads at ctx+0 unconditionally */
+static __used __naked void cb_read_ctx(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r2 + 0);"	/* 8-byte load at the ctx pointer held in r2 */
+	"r0 = 0;"			/* loaded value is discarded; return 0 */
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Reproducer for unsound pruning in refined_caller_live_stack().
+ *
+ * Three-level call chain: main → mid_fwd → grandchild_deref.
+ * Main passes &fp-8 to mid_fwd, which forwards R1 to grandchild_deref.
+ * grandchild_deref reads main's fp-8 through the forwarded pointer
+ * and dereferences the result.
+ *
+ * refined_caller_live_stack() has a callee_offset++ when mid_fwd
+ * (frame 1) is mid-call. This drops the transitive parent_live
+ * contribution at mid_fwd's call instruction — the only place
+ * where grandchild_deref's read of main's fp-8 is recorded.
+ * As a result, main's fp-8 is cleaned to STACK_INVALID at the
+ * pruning point inside grandchild_deref, and path B is
+ * incorrectly pruned against path A.
+ *
+ * Path A: main stores PTR_TO_MAP_VALUE at fp-8
+ * Path B: main stores scalar 42 at fp-8
+ *
+ * Correct behavior: reject (path B dereferences scalar)
+ * Bug behavior: accept (path B pruned against cleaned path A)
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'scalar'")
+__naked void transitive_parent_stack_read_unsound(void)
+{
+	asm volatile (
+	/* Map lookup to get PTR_TO_MAP_VALUE */
+	"r1 = %[map] ll;"
+	"*(u32 *)(r10 - 32) = 0;"	/* lookup key lives at fp-32 */
+	"r2 = r10;"
+	"r2 += -32;"
+	"call %[bpf_map_lookup_elem];"
+	"if r0 == 0 goto l_exit%=;"	/* failed lookup: nothing to test */
+	"r6 = r0;"			/* keep the map value pointer across the helper call */
+	/* Branch: path A (fall-through) explored first */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 != 0 goto l_scalar%=;"
+	/* Path A: fp-8 = PTR_TO_MAP_VALUE */
+	"*(u64 *)(r10 - 8) = r6;"
+	"goto l_merge%=;"
+"l_scalar%=:"
+	/* Path B: fp-8 = scalar 42 */
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+"l_merge%=:"
+	/* Pass &fp-8 to mid_fwd → grandchild_deref */
+	"r1 = r10;"
+	"r1 += -8;"
+	"call mid_fwd;"
+	"r0 = 0;"
+	"exit;"
+"l_exit%=:"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm(bpf_get_prandom_u32),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+/* Middle frame: calls grandchild_deref with R1 (ptr to main's fp-8) left unmodified */
+static __used __naked void mid_fwd(void)
+{
+	asm volatile (
+	"call grandchild_deref;"	/* R1 is not written before this call */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Loads main's fp-8 through the pointer forwarded in R1, then dereferences the loaded value */
+static __used __naked void grandchild_deref(void)
+{
+	asm volatile (
+	"goto +0;"				/* checkpoint */
+	"goto +0;"				/* checkpoint */
+	/* read main's fp-8: map_ptr (path A) or scalar (path B) */
+	"r0 = *(u64 *)(r1 + 0);"
+	/* dereference: safe for map_ptr, unsafe for scalar */
+	"r0 = *(u64 *)(r0 + 0);"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__log_level(2)
+__success
+__msg("14: (79) r1 = *(u64 *)(r10 -8) // r6=fp0-8 r7=fp1-16 fp-8=fp1-16 fp-16=fp0-8")
+__msg("15: (79) r0 = *(u64 *)(r1 +0) // r1=fp1-16 r6=fp0-8 r7=fp1-16 fp-8=fp1-16 fp-16=fp0-8")
+__msg("stack use/def subprog#1 mid_two_fp_threshold (d1,cs2):")
+__msg("14: (79) r1 = *(u64 *)(r10 -8)        ; use: fp1-8")
+__msg("15: (79) r0 = *(u64 *)(r1 +0)         ; use: fp1-16")
+__naked void two_fp_clear_stack_threshold(void)
+{
+	asm volatile (
+	"r1 = r10;"
+	"r1 += -8;"				/* R1 = &fp-8 of this (main) frame */
+	"call mid_two_fp_threshold;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void mid_two_fp_threshold(void)
+{
+	asm volatile (
+	"r6 = r1;"				/* r6 = pointer received from caller */
+	"r7 = r10;"
+	"r7 += -16;"				/* r7 = &fp-16 of this frame */
+	"*(u64 *)(r10 - 8) = r7;"		/* fp-8 = &fp-16 */
+	"*(u64 *)(r10 - 16) = r6;"		/* fp-16 = caller's pointer */
+	"r1 = r10;"
+	"r1 += -8;"				/* R1 = &fp-8 (local) */
+	"r2 = r6;"				/* R2 = caller's pointer: two FP-derived args */
+	"call inner_nop_fptest;"
+	"r1 = *(u64 *)(r10 - 8);"		/* reload the pointer spilled at fp-8 */
+	"r0 = *(u64 *)(r1 + 0);"		/* read through the reloaded pointer */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void inner_nop_fptest(void)
+{
+	asm volatile (
+	"r0 = 0;"				/* no memory access: ignore args, return 0 */
+	"exit;"
+	::: __clobber_all);
+}
+
+SEC("socket")
+__log_level(2)
+__success
+__msg("13: (79) r1 = *(u64 *)(r10 -8) // r6=fp0-8 r7=fp1-16 fp-8=fp1-16 fp-16=fp0-8")
+__msg("14: (79) r0 = *(u64 *)(r1 +0) // r1=fp1-16 r6=fp0-8 r7=fp1-16 fp-8=fp1-16 fp-16=fp0-8")
+__msg("stack use/def subprog#1 mid_one_fp_threshold (d1,cs2):")
+__msg("13: (79) r1 = *(u64 *)(r10 -8)        ; use: fp1-8")
+__msg("14: (79) r0 = *(u64 *)(r1 +0)         ; use: fp1-16")
+__naked void one_fp_clear_stack_threshold(void)
+{
+	asm volatile (
+	"r1 = r10;"
+	"r1 += -8;"				/* R1 = &fp-8 of this (main) frame */
+	"call mid_one_fp_threshold;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void mid_one_fp_threshold(void)
+{
+	asm volatile (
+	"r6 = r1;"				/* r6 = pointer received from caller */
+	"r7 = r10;"
+	"r7 += -16;"				/* r7 = &fp-16 of this frame */
+	"*(u64 *)(r10 - 8) = r7;"		/* fp-8 = &fp-16 */
+	"*(u64 *)(r10 - 16) = r6;"		/* fp-16 = caller's pointer */
+	"r1 = r10;"
+	"r1 += -8;"				/* R1 = &fp-8 (local); R2 is not set here */
+	"call inner_nop_fptest;"
+	"r1 = *(u64 *)(r10 - 8);"		/* reload the pointer spilled at fp-8 */
+	"r0 = *(u64 *)(r1 + 0);"		/* read through the reloaded pointer */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Reproducer for unsound pruning when a subprog forwards a parent
+ * stack pointer (AT_PARENT) to a helper with a memory argument.
+ *
+ * set_call_stack_access_at() previously only tracked AT_CURRENT args,
+ * skipping AT_PARENT entirely. This meant helper reads through parent
+ * stack pointers did not set parent_use, letting the slot appear dead
+ * at pruning checkpoints inside the subprog.
+ *
+ * Program shape:
+ *   main:
+ *     *(u32)(fp-4) = 0             key = STACK_ZERO (const 0)
+ *     call bpf_get_prandom_u32
+ *     if r0 != 0 goto clobber      path A (fall-through) first
+ *     goto merge
+ *   clobber:
+ *     *(u8)(fp-4) = r0             path B: key[0] = STACK_MISC
+ *   merge:
+ *     r1 = fp - 4
+ *     call fwd_parent_key_to_helper
+ *     r0 = 0
+ *     exit
+ *
+ *   fwd_parent_key_to_helper(r1 = &caller_fp-4):
+ *     goto +0                      checkpoint
+ *     r2 = r1                      R2 = AT_PARENT ptr to caller fp-4
+ *     r1 = array_map_8b ll         R1 = array map
+ *     call bpf_map_lookup_elem     reads key_size(4) from parent fp-4
+ *     r0 = *(u64 *)(r0 + 0)        deref without null check
+ *     r0 = 0
+ *     exit
+ *
+ * Path A: STACK_ZERO key = const 0 -> array lookup -> PTR_TO_MAP_VALUE
+ *         (non-NULL for in-bounds const key) -> deref OK.
+ * Path B: STACK_MISC key = unknown -> array lookup ->
+ *         PTR_TO_MAP_VALUE_OR_NULL -> deref UNSAFE.
+ *
+ * Bug: AT_PARENT R2 arg to bpf_map_lookup_elem skipped -> parent_use
+ *      not set -> fp-4 cleaned at checkpoint -> STACK_ZERO collapses
+ *      to STACK_INVALID -> path B pruned -> deref never checked.
+ *
+ * Correct verifier behavior: reject (path B deref of map_value_or_null).
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'map_value_or_null'")
+__naked void helper_parent_stack_read_unsound(void)
+{
+	asm volatile (
+	/* key at fp-4: all bytes STACK_ZERO */
+	"*(u32 *)(r10 - 4) = 0;"
+	"call %[bpf_get_prandom_u32];"
+	/* fall-through (path A) explored first */
+	"if r0 != 0 goto l_clobber%=;"
+	/* path A: key stays constant zero */
+	"goto l_merge%=;"
+"l_clobber%=:"
+	/* path B: key[0] becomes STACK_MISC, key no longer const */
+	"*(u8 *)(r10 - 4) = r0;"
+"l_merge%=:"
+	"r1 = r10;"
+	"r1 += -4;"				/* R1 = &fp-4 (the key) */
+	"call fwd_parent_key_to_helper;"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * Subprog forwards parent stack pointer to bpf_map_lookup_elem as key
+ * on an array map, then dereferences the result without a null check.
+ * R1 = &parent_fp-4 (AT_PARENT in this frame).
+ *
+ * The helper reads key_size(4) bytes from parent stack.  The deref of
+ * R0 reads the map value, NOT parent stack, so record_insn_mem_accesses
+ * does not set parent_use for it.  The ONLY parent stack access is
+ * through the helper's R2 arg.
+ */
+static __used __naked void fwd_parent_key_to_helper(void)
+{
+	asm volatile (
+	"goto +0;"				/* checkpoint */
+	"r2 = r1;"				/* R2 = parent ptr (AT_PARENT) */
+	"r1 = %[array_map_8b] ll;"		/* R1 = array map */
+	"call %[bpf_map_lookup_elem];"		/* reads 4 bytes from parent fp-4 */
+	/* deref without null check: safe for PTR_TO_MAP_VALUE,
+	 * unsafe for PTR_TO_MAP_VALUE_OR_NULL
+	 */
+	"r0 = *(u64 *)(r0 + 0);"
+	"r0 = 0;"				/* loaded value is discarded */
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm_addr(array_map_8b)
+	: __clobber_all);
+}
+
+/*
+ * Regression for keeping later helper args after a whole-stack fallback
+ * on an earlier local arg.  The first bpf_snprintf() arg is a local
+ * frame-derived pointer with offset-imprecise tracking (`fp1 ?`), which
+ * conservatively marks the whole local stack live.  The fourth arg still
+ * forwards &parent_fp-8 and must contribute nonlocal_use[0]=0:3.
+ */
+SEC("socket")
+__log_level(2)
+__success
+__msg("call bpf_snprintf{{.*}}        ; use: fp1-8..-512 fp0-8")
+__naked void helper_arg_fallback_keeps_scanning(void)
+{
+	asm volatile (
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"		/* fp-8 = 42 */
+	"r1 = r10;"
+	"r1 += -8;"				/* R1 = &fp-8 */
+	"call helper_snprintf_parent_after_local_fallback;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void helper_snprintf_parent_after_local_fallback(void)
+{
+	asm volatile (
+	"r6 = r1;"				/* save &parent_fp-8 */
+	"call %[bpf_get_prandom_u32];"
+	"r0 &= 8;"				/* r0 is now either 0 or 8 */
+	"r1 = r10;"
+	"r1 += -16;"				/* R1 = &fp-16 (local frame) */
+	"r1 += r0;"				/* local fp, offset-imprecise */
+	"r2 = 8;"				/* R2 = 8 */
+	"r3 = %[snprintf_u64_fmt] ll;"		/* R3 = address of snprintf_u64_fmt */
+	"r4 = r6;"				/* later arg: parent fp-8 */
+	"r5 = 8;"				/* R5 = 8 */
+	"call %[bpf_snprintf];"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_get_prandom_u32),
+	  __imm(bpf_snprintf),
+	  __imm_addr(snprintf_u64_fmt)
+	: __clobber_all);
+}
+
+/*
+ * Test that propagate_callee_ancestor() correctly chains ancestor
+ * liveness across sequential calls within a single frame.
+ *
+ * main → mid_seq_touch → {nop_callee, deref_ancestor}
+ *
+ * mid_seq_touch receives two pointers: R1 = &main_fp-8 (forwarded to
+ * deref_ancestor) and R2 = &main_fp-16 (read directly by mid_seq_touch).
+ * The direct read of fp-16 forces ensure_anc_arrays() to allocate
+ * ancestor_live[0] for mid_seq_touch, so refined_caller_live_stack()
+ * uses the refined path (not the conservative fallback).
+ *
+ * mid_seq_touch calls nop_callee first (no-op, creates a pruning point),
+ * then calls deref_ancestor which reads main's fp-8 and dereferences it.
+ *
+ * propagate_callee_ancestor() propagates deref_ancestor's entry
+ * ancestor_live[0] into mid_seq_touch's anc_use[0] at the call-to-deref
+ * instruction.  mid_seq_touch's backward pass flows this backward so
+ * ancestor_live[0] includes fp-8 at the pruning point between the calls.
+ *
+ * Without propagation, mid_seq_touch's ancestor_live[0] only has fp-16
+ * (from the direct read) — fp-8 is missing.  refined_caller_live_stack()
+ * Term 1 says fp-8 is dead, the verifier cleans it, and path B
+ * (scalar 42) is incorrectly pruned against path A (MAP_VALUE).
+ *
+ * Path A: main stores PTR_TO_MAP_VALUE at fp-8  → deref succeeds
+ * Path B: main stores scalar 42 at fp-8         → deref must fail
+ *
+ * Correct: reject (path B dereferences scalar)
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("R0 invalid mem access 'scalar'")
+__naked void propagate_callee_ancestor_chain(void)
+{
+	asm volatile (
+	/* Map lookup to get PTR_TO_MAP_VALUE */
+	"r1 = %[map] ll;"
+	"*(u32 *)(r10 - 32) = 0;"		/* zero 4 bytes at fp-32 */
+	"r2 = r10;"
+	"r2 += -32;"				/* R2 = &fp-32 */
+	"call %[bpf_map_lookup_elem];"
+	"if r0 == 0 goto l_exit%=;"		/* NULL result: skip the test body */
+	"r6 = r0;"				/* keep the lookup result in r6 */
+	/* Branch: path A (fall-through) explored first */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 != 0 goto l_scalar%=;"
+	/* Path A: fp-8 = PTR_TO_MAP_VALUE */
+	"*(u64 *)(r10 - 8) = r6;"
+	"goto l_merge%=;"
+"l_scalar%=:"
+	/* Path B: fp-8 = scalar 42 */
+	"r1 = 42;"
+	"*(u64 *)(r10 - 8) = r1;"
+"l_merge%=:"
+	/* fp-16 = dummy value (mid_seq_touch reads it directly) */
+	"r1 = 99;"
+	"*(u64 *)(r10 - 16) = r1;"
+	/* R1 = &fp-8 (for deref_ancestor), R2 = &fp-16 (for mid_seq_touch) */
+	"r1 = r10;"
+	"r1 += -8;"
+	"r2 = r10;"
+	"r2 += -16;"
+	"call mid_seq_touch;"
+	"r0 = 0;"
+	"exit;"
+"l_exit%=:"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm(bpf_get_prandom_u32),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+/*
+ * R1 = &main_fp-8 (forwarded to deref_ancestor)
+ * R2 = &main_fp-16 (read directly here → allocates ancestor_live[0])
+ *
+ * Reads main's fp-16 to force ancestor_live[0] allocation, then
+ * calls nop_callee (pruning point), then deref_ancestor.
+ */
+static __used __naked void mid_seq_touch(void)
+{
+	asm volatile (
+	"r6 = r1;"			/* save &main_fp-8 in callee-saved */
+	"r0 = *(u64 *)(r2 + 0);"	/* read main's fp-16: triggers anc_use[0] */
+	"call nop_callee;"		/* no-op callee; creates a pruning point after the call */
+	"r1 = r6;"			/* restore ptr to &main_fp-8 */
+	"call deref_ancestor;"		/* reads main's fp-8, dereferences */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void nop_callee(void)
+{
+	asm volatile (
+	"r0 = 0;"			/* no memory access: just return 0 */
+	"exit;"
+	::: __clobber_all);
+}
+
+/* Loads main's fp-8 through the pointer forwarded in R1, then dereferences the loaded value */
+static __used __naked void deref_ancestor(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"	/* read main's fp-8 */
+	"r0 = *(u64 *)(r0 + 0);"	/* deref: safe for map_ptr, unsafe for scalar */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Test: callee loads an fp-derived pointer from caller's stack, then
+ * reads through it to access another caller stack slot.
+ *
+ * main stores PTR_TO_MAP_VALUE at fp-24, stores &fp-24 (an fp-derived
+ * pointer) at fp-8, passes &fp-8 through mid_fwd_spilled_ptr to
+ * load_ptr_deref_grandchild.  The leaf loads the pointer from main's
+ * fp-8, then reads main's fp-24 through the loaded pointer.
+ *
+ * fill_from_stack() in arg_track_xfer() only handles local-frame
+ * FP-derived loads (src_is_local_fp check requires frame == depth).
+ * When a callee loads from a parent-frame pointer (frame < depth),
+ * the loaded value gets ARG_NONE instead of being recognized as
+ * fp-derived.  Subsequent reads through that loaded pointer are
+ * invisible to liveness — nonlocal_use is never set for fp-24.
+ *
+ * clean_live_states() cleans the current state at every prune point.
+ * Because liveness misses fp-24, refined_caller_live_stack() tells
+ * __clean_func_state() that fp-24 is dead, which destroys the
+ * PTR_TO_MAP_VALUE spill before the grandchild can read it.
+ * The grandchild then reads STACK_INVALID → scalar, and the deref
+ * is rejected with "R0 invalid mem access 'scalar'" — even though
+ * fp-24 is genuinely live and holds a valid map pointer.
+ *
+ * This is a false positive: a valid program incorrectly rejected.
+ */
+SEC("socket")
+__flag(BPF_F_TEST_STATE_FREQ)
+__success
+__naked void spilled_fp_cross_frame_deref(void)
+{
+	asm volatile (
+	/* Map lookup to get PTR_TO_MAP_VALUE */
+	"r1 = %[map] ll;"
+	"*(u32 *)(r10 - 32) = 0;"		/* zero 4 bytes at fp-32 */
+	"r2 = r10;"
+	"r2 += -32;"				/* R2 = &fp-32 */
+	"call %[bpf_map_lookup_elem];"
+	"if r0 == 0 goto l_exit%=;"		/* NULL result: skip the test body */
+	/* fp-24 = PTR_TO_MAP_VALUE */
+	"*(u64 *)(r10 - 24) = r0;"
+	/* Store pointer to fp-24 at fp-8 */
+	"r1 = r10;"
+	"r1 += -24;"
+	"*(u64 *)(r10 - 8) = r1;"
+	/* R1 = &fp-8: pointer to the spilled ptr */
+	"r1 = r10;"
+	"r1 += -8;"
+	"call mid_fwd_spilled_ptr;"
+	"r0 = 0;"
+	"exit;"
+"l_exit%=:"
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+/* Middle frame: passes R1 (ptr to main's fp-8, which holds &main_fp-24) unmodified to the leaf */
+static __used __naked void mid_fwd_spilled_ptr(void)
+{
+	asm volatile (
+	"call load_ptr_deref_grandchild;"	/* R1 is not written before this call */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * R1 = &main_fp-8 (where main stored ptr to fp-24)
+ * Loads the ptr from main's fp-8, reads main's fp-24 through it,
+ * then dereferences the result.
+ */
+static __used __naked void load_ptr_deref_grandchild(void)
+{
+	asm volatile (
+	/* Load ptr from main's fp-8 → r2 = &main_fp-24 */
+	"r2 = *(u64 *)(r1 + 0);"
+	/* Read main's fp-24 through loaded ptr */
+	"r0 = *(u64 *)(r2 + 0);"
+	/* Dereference: safe for map_ptr */
+	"r0 = *(u64 *)(r0 + 0);"
+	"r0 = 0;"				/* loaded value is discarded */
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Exercise merge_nonlocal_live().
+ *
+ * merge_shared_mid is analyzed twice (once from each wrapper), so the
+ * callsite within merge_shared_mid that calls merge_leaf_read gets its
+ * nonlocal_live info merged twice via merge_nonlocal_live().
+ */
+SEC("socket")
+__log_level(2)
+__success
+__msg("14: (85) call pc+2	r1: fp0-16")
+__msg("17: (79) r0 = *(u64 *)(r1 +0) // r1=fp0-16")
+__msg("14: (85) call pc+2	r1: fp0-8")
+__msg("17: (79) r0 = *(u64 *)(r1 +0) // r1=fp0-8")
+__msg("5: (85) call pc+{{.*}}                   ; use: fp0-8 fp0-16")
+__naked void test_merge_nonlocal_live(void)
+{
+	asm volatile (
+	"r1 = 0;"
+	"*(u64 *)(r10 - 8) = r1;"		/* fp-8 = 0 */
+	"*(u64 *)(r10 - 16) = r1;"		/* fp-16 = 0 */
+	"r1 = r10;"
+	"r1 += -8;"				/* first call: R1 = &fp-8 */
+	"call merge_wrapper_a;"
+	"r1 = r10;"
+	"r1 += -16;"				/* second call: R1 = &fp-16 */
+	"call merge_wrapper_b;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void merge_wrapper_a(void)
+{
+	asm volatile (
+	"call merge_shared_mid;"		/* R1 is passed through unmodified */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void merge_wrapper_b(void)
+{
+	asm volatile (
+	"call merge_shared_mid;"		/* same callee as merge_wrapper_a; R1 unmodified */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void merge_shared_mid(void)
+{
+	asm volatile (
+	"call merge_leaf_read;"			/* single callsite reached from both wrappers */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void merge_leaf_read(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r1 + 0);"		/* read 8 bytes through the forwarded R1 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/* The same bpf_loop call instruction invokes dyn_cb_a or dyn_cb_b depending on a branch. */
+SEC("socket")
+__log_level(2)
+__success
+__msg("call bpf_loop#181            ; use: fp2-8..-512 fp1-8..-512 fp0-8..-512")
+__naked void bpf_loop_two_callbacks(void)
+{
+	asm volatile (
+	"r1 = 0;"
+	"*(u64 *)(r10 - 8) = r1;"		/* fp-8 = 0 */
+	"*(u64 *)(r10 - 16) = r1;"		/* fp-16 = 0 */
+	"r1 = r10;"
+	"r1 += -8;"				/* first call: R1 = &fp-8 */
+	"call dyn_wrapper_a;"
+	"r1 = r10;"
+	"r1 += -16;"				/* second call: R1 = &fp-16 */
+	"call dyn_wrapper_b;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void dyn_wrapper_a(void)
+{
+	asm volatile (
+	"call mid_dynamic_cb;"			/* R1 is passed through unmodified */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void dyn_wrapper_b(void)
+{
+	asm volatile (
+	"call mid_dynamic_cb;"			/* same callee as dyn_wrapper_a; R1 unmodified */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void mid_dynamic_cb(void)
+{
+	asm volatile (
+	"r6 = r1;"				/* keep the incoming pointer across the helper call */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 0 goto 1f;"			/* branch selects which callback goes in R2 */
+	"r2 = dyn_cb_a ll;"
+	"goto 2f;"
+	"1:"
+	"r2 = dyn_cb_b ll;"
+	"2:"
+	"r1 = 1;"				/* R1 = 1 */
+	"r3 = r6;" /* ctx = fp-derived ptr from parent */
+	"r4 = 0;"				/* R4 = 0 */
+	"call %[bpf_loop];"
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_get_prandom_u32),
+	   __imm(bpf_loop)
+	: __clobber_all);
+}
+
+/* Callbacks A and B (dyn_cb_a, dyn_cb_b): each reads parent stack through ctx in R2 */
+static __used __naked void dyn_cb_a(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r2 + 0);"		/* read 8 bytes through R2 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void dyn_cb_b(void)
+{
+	asm volatile (
+	"r0 = *(u64 *)(r2 + 0);"		/* same body as dyn_cb_a: read 8 bytes through R2 */
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Path A: r0 = map_lookup result (non-FP, ARG_NONE for stack tracking)
+ * Path B: r0 = fp-8 (FP-derived, frame=0, off=-8)
+ * At the join: r0 is not guaranteed to be a frame pointer.
+ */
+SEC("socket")
+__log_level(2)	/* NOTE(review): no explicit __success/__failure here, unlike neighbouring tests */
+__msg("10: (79) r0 = *(u64 *)(r10 -8) // r0=fp0-8|fp0+0")
+__naked void stack_or_non_stack_write(void)
+{
+	asm volatile (
+	/* initial write to fp-8 */
+	"*(u64 *)(r10 - 8) = 0;"
+	/* map lookup to get a non-FP pointer */
+	"r2 = r10;"
+	"r2 += -4;"				/* R2 = &fp-4, inside the slot zeroed above */
+	"r1 = %[map] ll;"
+	"call %[bpf_map_lookup_elem];"
+	/* r0 = map_value (ARG_NONE) */
+	"if r0 != 0 goto 1f;"
+	/* path B: r0 = fp-8 */
+	"r0 = r10;"
+	"r0 += -8;"
+"1:"
+	/* join: the write is not a def for fp[0]-8 */
+	"*(u64 *)(r0 + 0) = 7;"
+	/* read fp-8: should be non-poisoned */
+	"r0 = *(u64 *)(r10 - 8);"
+	"exit;"
+	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm_addr(map)
+	: __clobber_all);
+}
+
+SEC("socket")
+__log_level(2)
+__flag(BPF_F_TEST_STATE_FREQ)
+__msg("subprog#2 write_first_read_second:")
+__msg("17: (7a) *(u64 *)(r1 +0) = 42{{$}}")
+__msg("18: (79) r0 = *(u64 *)(r2 +0) // r1=fp0-8 r2=fp0-16{{$}}")
+__msg("stack use/def subprog#2 write_first_read_second (d2,cs15):")
+__msg("17: (7a) *(u64 *)(r1 +0) = 42{{$}}")
+__msg("18: (79) r0 = *(u64 *)(r2 +0)         ; use: fp0-8 fp0-16")
+__naked void shared_instance_must_write_overwrite(void)
+{
+	asm volatile (
+	"r1 = 1;"
+	"*(u64 *)(r10 - 8) = r1;"		/* fp-8 = 1 */
+	"*(u64 *)(r10 - 16) = r1;"		/* fp-16 = 1 */
+	/* Call 1: forwarding_rw -> write_first_read_second(&fp[-8], &fp[-16]) */
+	"r1 = r10;"
+	"r1 += -8;"
+	"r2 = r10;"
+	"r2 += -16;"
+	"call forwarding_rw;"
+	/* Call 2: forwarding_rw -> write_first_read_second(&fp[-16], &fp[-8]) */
+	"r1 = r10;"
+	"r1 += -16;"
+	"r2 = r10;"
+	"r2 += -8;"
+	"call forwarding_rw;"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void forwarding_rw(void)
+{
+	asm volatile (
+	"call write_first_read_second;"		/* R1 and R2 are passed through unmodified */
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void write_first_read_second(void)
+{
+	asm volatile (
+	"*(u64 *)(r1 + 0) = 42;"		/* store 42 through the first pointer arg */
+	"r0 = *(u64 *)(r2 + 0);"		/* load through the second pointer arg */
+	"exit;"
+	::: __clobber_all);
+}
+
+/*
+ * Shared must_write when (callsite, depth) instance is reused.
+ * Main calls fwd_to_stale_wr at two sites. fwd_to_stale_wr calls
+ * stale_wr_leaf at a single internal callsite. Both calls share
+ * stale_wr_leaf's (callsite, depth) instance.
+ *
+ * Call 1: stale_wr_leaf(map_value, fp-8) writes map, reads fp-8.
+ * Call 2: stale_wr_leaf(fp-8, fp-8) writes fp-8, reads fp-8.
+ *
+ * The analysis can't presume that stale_wr_leaf() always writes fp-8,
+ * it must conservatively join must_write masks computed for both calls.
+ */
+SEC("socket")
+__success
+__naked void stale_must_write_cross_callsite(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"		/* fp-8 = 0 */
+	/* Call 1: map_value write, fp-8 read (processed second in PO) */
+	"*(u32 *)(r10 - 16) = 0;"		/* zero 4 bytes at fp-16 */
+	"r1 = %[map] ll;"
+	"r2 = r10;"
+	"r2 += -16;"				/* R2 = &fp-16 */
+	"call %[bpf_map_lookup_elem];"
+	"if r0 == 0 goto 1f;"			/* NULL result: skip both calls */
+	"r1 = r0;"				/* R1 = lookup result */
+	"r2 = r10;"
+	"r2 += -8;"				/* R2 = &fp-8 */
+	"call fwd_to_stale_wr;"
+	/* Call 2: fp-8 write, fp-8 read (processed first in PO) */
+	"r1 = r10;"
+	"r1 += -8;"
+	"r2 = r1;"				/* R1 and R2 both = &fp-8 */
+	"call fwd_to_stale_wr;"
+"1:"
+	"r0 = 0;"
+	"exit;"
+	:: __imm_addr(map),
+	   __imm(bpf_map_lookup_elem)
+	: __clobber_all);
+}
+
+static __used __naked void fwd_to_stale_wr(void)
+{
+	asm volatile (
+	"call stale_wr_leaf;"			/* R1 and R2 are passed through unmodified */
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void stale_wr_leaf(void)
+{
+	asm volatile (
+	"*(u64 *)(r1 + 0) = 42;"		/* store 42 through the first pointer arg */
+	"r0 = *(u64 *)(r2 + 0);"		/* load through the second pointer arg */
+	"exit;"
+	::: __clobber_all);
+}
+
+#ifdef CAN_USE_LOAD_ACQ_STORE_REL
+
+SEC("socket")
+__log_level(2)
+__success
+__msg("*(u64 *)(r0 +0) = 42         ; def: fp0-16")
+__naked void load_acquire_dont_clear_dst(void)
+{
+	asm volatile (
+	"r0 = r10;"
+	"r0 += -16;"			/* r0 = &fp-16 */
+	"*(u64 *)(r0 + 0) = r0;"	/* fp[-16] == &fp[-16] */
+	".8byte %[load_acquire_insn];"	/* load_acquire is a special case for BPF_STX, */
+	"r0 = *(u64 *)(r10 - 16);"	/* it shouldn't clear tracking info for */
+	"*(u64 *)(r0 + 0) = 42;"	/* dst register, r0 in this case. */
+	"r0 = 0;"
+	"exit;"
+	:
+	: __imm_insn(load_acquire_insn,
+		     BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_0, 0))
+	: __clobber_all);
+}
+
+#endif /* CAN_USE_LOAD_ACQ_STORE_REL */
+
+SEC("socket")
+__success
+__naked void imprecise_fill_loses_cross_frame(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 8) = 0;"		/* fp-8 = 0 */
+	"r1 = r10;"
+	"r1 += -8;"				/* R1 = &fp-8 */
+	"call imprecise_fill_cross_frame;"
+	"exit;"
+	::: __clobber_all);
+}
+
+static __used __naked void imprecise_fill_cross_frame(void)
+{
+	asm volatile (
+	/* spill &caller_fp-8 to callee's fp-8 */
+	"*(u64 *)(r10 - 8) = r1;"
+	/* imprecise FP pointer in r1 */
+	"r1 = r10;"
+	"r2 = -8;"
+	"r1 += r2;"				/* offset added via register, not immediate */
+	/* load from imprecise offset. fill_from_stack returns
+	 * ARG_IMPRECISE{mask=BIT(1)}, losing frame 0
+	 */
+	"r1 = *(u64 *)(r1 + 0);"
+	/* read caller's fp-8 through loaded pointer, should mark fp0-8 live */
+	"r0 = *(u64 *)(r1 + 0);"
+	"r0 = 0;"
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)	/* NOTE(review): not referenced by the asm above */
+	: __clobber_all);
+}
+
+/* Test that spill_to_stack with multi-offset dst (sz=8) joins instead
+ * of overwriting. r1 has offsets [-8, -16]. Both slots hold FP-derived
+ * pointers. Writing through r1 should join *val with existing values,
+ * not destroy them.
+ *
+ *   fp-8  = &fp-24
+ *   fp-16 = &fp-32
+ *   r1 = fp-8 or fp-16 (two offsets from branch)
+ *   *(u64 *)(r1 + 0) = &fp-24   -- writes to one slot, other untouched
+ *   r0 = *(u64 *)(r10 - 16)     -- fill from fp-16
+ *   r0 = *(u64 *)(r0 + 0)       -- deref: should produce use
+ */
+SEC("socket")
+__log_level(2)
+__success
+__msg("20: (79) r0 = *(u64 *)(r10 -16)")
+__msg("21: (79) r0 = *(u64 *)(r0 +0)         ; use: fp0-24 fp0-32")
+__naked void spill_join_with_multi_off(void)
+{
+	asm volatile (
+	/* fp-8 = &fp-24, fp-16 = &fp-32 (different pointers) */
+	"*(u64 *)(r10 - 24) = 0;"
+	"*(u64 *)(r10 - 32) = 0;"
+	"r1 = r10;"
+	"r1 += -24;"
+	"*(u64 *)(r10 - 8) = r1;"
+	"r1 = r10;"
+	"r1 += -32;"
+	"*(u64 *)(r10 - 16) = r1;"
+	/* create r1 with two candidate offsets: fp-8 or fp-16 */
+	"call %[bpf_get_prandom_u32];"
+	"if r0 == 0 goto 1f;"			/* branch selects which slot r1 points to */
+	"r1 = r10;"
+	"r1 += -8;"
+	"goto 2f;"
+"1:"
+	"r1 = r10;"
+	"r1 += -16;"
+"2:"
+	/* write &fp-24 through multi-offset r1: hits one slot, other untouched */
+	"r2 = r10;"
+	"r2 += -24;"
+	"*(u64 *)(r1 + 0) = r2;"
+	/* read back *fp-8 and *fp-16 */
+	"r0 = *(u64 *)(r10 - 8);"
+	"r0 = *(u64 *)(r0 + 0);"
+	"r0 = *(u64 *)(r10 - 16);"
+	"r0 = *(u64 *)(r0 + 0);"
+	"exit;"
+	:: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Test that spill_to_stack with imprecise dst (off_cnt == 0, sz=8)
+ * joins instead of overwriting. Use "r2 = -8; r1 += r2" to make
+ * arg tracking lose offset precision while the main verifier keeps
+ * r1 as PTR_TO_STACK with fixed offset. Both slots hold FP-derived
+ * pointers. Writing through r1 should join *val with existing
+ * values, not destroy them.
+ *
+ *   fp-8  = &fp-24
+ *   fp-16 = &fp-32
+ *   r1 = fp-8 (imprecise to arg tracking)
+ *   *(u64 *)(r1 + 0) = &fp-24   -- since r1 is imprecise, this adds &fp-24
+ *                                  to the set of possible values for all slots,
+ *                                  hence the values at fp-16 become [fp-24, fp-32]
+ *   r0 = *(u64 *)(r10 - 16)
+ *   r0 = *(u64 *)(r0 + 0)       -- deref: should produce use of fp-24 or fp-32
+ */
+SEC("socket")
+__log_level(2)
+__success
+__msg("15: (79) r0 = *(u64 *)(r0 +0)         ; use: fp0-24 fp0-32")
+__naked void spill_join_with_imprecise_off(void)
+{
+	asm volatile (
+	"*(u64 *)(r10 - 24) = 0;"
+	"*(u64 *)(r10 - 32) = 0;"
+	"r1 = r10;"
+	"r1 += -24;"
+	"*(u64 *)(r10 - 8) = r1;"		/* fp-8 = &fp-24 */
+	"r1 = r10;"
+	"r1 += -32;"
+	"*(u64 *)(r10 - 16) = r1;"		/* fp-16 = &fp-32 */
+	/* r1 = fp-8 but arg tracking sees off_cnt == 0 */
+	"r1 = r10;"
+	"r2 = -8;"
+	"r1 += r2;"
+	/* write through imprecise r1 */
+	"r3 = r10;"
+	"r3 += -24;"
+	"*(u64 *)(r1 + 0) = r3;"
+	/* read back fp-16: at_stack should still track &fp-32 */
+	"r0 = *(u64 *)(r10 - 16);"
+	/* deref: should produce use for fp-24 and fp-32 (joined values) */
+	"r0 = *(u64 *)(r0 + 0);"
+	"r0 = 0;"
+	"exit;"
+	::: __clobber_all);
+}
diff --git a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c
new file mode 100644
index 000000000000..b058de623200
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+/*
+ * Exponential complexity in analyze_subprog() liveness analysis.
+ *
+ * analyze_subprog() recurses into each call site that passes FP-derived
+ * arguments, creating a unique func_instance per (callsite, depth).
+ * There is no memoization for callees reached with equivalent entry args.
+ * Even if memoization were added, it can be defeated by passing a distinct
+ * FP offset at each call site.  arg_track keys on (frame, off[]), so
+ * r1=fp-8, r1=fp-16, ... r1=fp-400 produce 50 unique cache keys per level.
+ *
+ * This test chains 8 subprograms (the MAX_CALL_FRAMES limit).  Each
+ * intermediate function calls the next one 50 times, each time with a
+ * different FP-relative offset in r1.
+ *
+ * Without complexity limits in analyze_subprog() the resulting 50^7 ~ 7.8 * 10^11
+ * recursive analyze_subprog() calls will cause a CPU soft lockup or OOM.
+ *
+ * The BPF program itself is ~1100 instructions and perfectly valid.
+ */
+
+char _license[] SEC("license") = "GPL";
+
+/* Call fn with r1 = r10 - off (a unique FP-derived arg per call site) */
+#define C(fn, off)	"r1 = r10;"		\
+			"r1 += -" #off ";"	\
+			"call " #fn ";"
+
+/* 50 C() call sequences into fn, each with a distinct FP offset: -8, -16, ... -400 */
+#define CALLS_50(fn)							\
+	C(fn,   8) C(fn,  16) C(fn,  24) C(fn,  32) C(fn,  40)		\
+	C(fn,  48) C(fn,  56) C(fn,  64) C(fn,  72) C(fn,  80)		\
+	C(fn,  88) C(fn,  96) C(fn, 104) C(fn, 112) C(fn, 120)		\
+	C(fn, 128) C(fn, 136) C(fn, 144) C(fn, 152) C(fn, 160)		\
+	C(fn, 168) C(fn, 176) C(fn, 184) C(fn, 192) C(fn, 200)		\
+	C(fn, 208) C(fn, 216) C(fn, 224) C(fn, 232) C(fn, 240)		\
+	C(fn, 248) C(fn, 256) C(fn, 264) C(fn, 272) C(fn, 280)		\
+	C(fn, 288) C(fn, 296) C(fn, 304) C(fn, 312) C(fn, 320)		\
+	C(fn, 328) C(fn, 336) C(fn, 344) C(fn, 352) C(fn, 360)		\
+	C(fn, 368) C(fn, 376) C(fn, 384) C(fn, 392) C(fn, 400)
+
+/* Leaf (depth 7): makes no further calls, just returns 0 */
+__naked __noinline __used
+static unsigned long exp_sub7(void)
+{
+	asm volatile (
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/* depth 6 -> calls exp_sub7 x50, each call with a distinct FP offset in r1 */
+__naked __noinline __used
+static unsigned long exp_sub6(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub7)
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/* depth 5 -> calls exp_sub6 x50, each call with a distinct FP offset in r1 */
+__naked __noinline __used
+static unsigned long exp_sub5(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub6)
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/* depth 4 -> calls exp_sub5 x50, each call with a distinct FP offset in r1 */
+__naked __noinline __used
+static unsigned long exp_sub4(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub5)
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/* depth 3 -> calls exp_sub4 x50, each call with a distinct FP offset in r1 */
+__naked __noinline __used
+static unsigned long exp_sub3(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub4)
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/* depth 2 -> calls exp_sub3 x50, each call with a distinct FP offset in r1 */
+__naked __noinline __used
+static unsigned long exp_sub2(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub3)
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/* depth 1 -> calls exp_sub2 x50, each call with a distinct FP offset in r1 */
+__naked __noinline __used
+static unsigned long exp_sub1(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub2)
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
+
+/*
+ * Entry: depth 0.  Calls exp_sub1 50 times, each with a distinct
+ * FP offset in r1.  Every call site produces a unique arg_track,
+ * defeating any memoization keyed on entry args.
+ */
+SEC("?raw_tp")
+__failure __log_level(2)
+__msg("liveness analysis exceeded complexity limit")
+__naked int liveness_exponential_complexity(void)
+{
+	asm volatile (
+		CALLS_50(exp_sub1)	/* 50 calls into the depth-1 subprog */
+		"r0 = 0;"
+		"exit;"
+		::: __clobber_all);
+}
diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
index a5b8753ce52c..70ae14d6084f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
+++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
@@ -264,13 +264,13 @@ void precision_many_frames__bar(void)
  */
 SEC("socket")
 __success __log_level(2)
-__msg("11: (0f) r2 += r1")
+__msg("12: (0f) r2 += r1")
 /* foo frame */
-__msg("frame1: regs=r1 stack= before 10: (bf) r2 = r10")
-__msg("frame1: regs=r1 stack= before 9: (25) if r1 > 0x7 goto pc+0")
-__msg("frame1: regs=r1 stack=-8,-16 before 8: (7b) *(u64 *)(r10 -16) = r1")
-__msg("frame1: regs=r1 stack=-8 before 7: (7b) *(u64 *)(r10 -8) = r1")
-__msg("frame1: regs=r1 stack= before 4: (85) call pc+2")
+__msg("frame1: regs=r1 stack= before 11: (bf) r2 = r10")
+__msg("frame1: regs=r1 stack= before 10: (25) if r1 > 0x7 goto pc+0")
+__msg("frame1: regs=r1 stack=-8,-16 before 9: (7b) *(u64 *)(r10 -16) = r1")
+__msg("frame1: regs=r1 stack=-8 before 8: (7b) *(u64 *)(r10 -8) = r1")
+__msg("frame1: regs=r1 stack= before 4: (85) call pc+3")
 /* main frame */
 __msg("frame0: regs=r1 stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r1")
 __msg("frame0: regs=r1 stack= before 2: (bf) r1 = r0")
@@ -286,6 +286,7 @@ __naked void precision_stack(void)
 	"r1 = r0;"
 	"*(u64*)(r10 - 8) = r1;"
 	"call precision_stack__foo;"
+	"r0 = *(u64*)(r10 - 8);"
 	"r0 = 0;"
 	"exit;"
 	:
@@ -309,6 +310,8 @@ void precision_stack__foo(void)
 	 */
 	"r2 = r10;"
 	"r2 += r1;"
+	"r0 = *(u64*)(r10 - 8);"
+	"r0 = *(u64*)(r10 - 16);"
 	"exit"
 	::: __clobber_all);
 }
@@ -802,9 +805,9 @@ __success __log_level(2)
 /* The exit instruction should be reachable from two states,
  * use two matches and "processed .. insns" to ensure this.
  */
-__msg("15: (95) exit")
-__msg("15: (95) exit")
-__msg("processed 20 insns")
+__msg("16: (95) exit")
+__msg("16: (95) exit")
+__msg("processed 22 insns")
 __flag(BPF_F_TEST_STATE_FREQ)
 __naked void two_old_ids_one_cur_id(void)
 {
@@ -835,6 +838,11 @@ __naked void two_old_ids_one_cur_id(void)
 	"r2 = r10;"
 	"r2 += r6;"
 	"r2 += r7;"
+	/*
+	 * keep r8 and r9 live, otherwise r6->id and r7->id
+	 * will become singular and reset to zero before if r6 > r7
+	 */
+	"r9 += r8;"
 	"exit;"
 	:
 	: __imm(bpf_ktime_get_ns)
diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
index 672e4446181e..6bc721accbae 100644
--- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
+++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
@@ -650,7 +650,7 @@ __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1")
 __msg("mark_precise: frame0: regs=r2 stack= before 8: (79) r2 = *(u64 *)(r10 -8)")
 __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6")
 /* note, fp-8 is precise, fp-16 is not yet precise, we'll get there */
-__msg("mark_precise: frame0: parent state regs= stack=-8:  R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1")
+__msg("mark_precise: frame0: parent state regs= stack=-8:  R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1")
 __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7")
 __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0")
 __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0")
@@ -668,7 +668,7 @@ __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2")
 __msg("mark_precise: frame0: regs= stack=-16 before 8: (79) r2 = *(u64 *)(r10 -8)")
 __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6")
 /* now both fp-8 and fp-16 are precise, very good */
-__msg("mark_precise: frame0: parent state regs= stack=-16:  R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1")
+__msg("mark_precise: frame0: parent state regs= stack=-16:  R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1")
 __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7")
 __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0")
 __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0")
@@ -726,7 +726,7 @@ __msg("9: (0f) r1 += r2")
 __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1")
 __msg("mark_precise: frame0: regs=r2 stack= before 8: (61) r2 = *(u32 *)(r10 -8)")
 __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6")
-__msg("mark_precise: frame0: parent state regs= stack=-8:  R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1")
+__msg("mark_precise: frame0: parent state regs= stack=-8:  R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1")
 __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7")
 __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0")
 __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0")
@@ -743,7 +743,7 @@ __msg("mark_precise: frame0: regs= stack=-16 before 10: (73) *(u8 *)(r1 +0) = r2
 __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2")
 __msg("mark_precise: frame0: regs= stack=-16 before 8: (61) r2 = *(u32 *)(r10 -8)")
 __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6")
-__msg("mark_precise: frame0: parent state regs= stack=-16:  R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1")
+__msg("mark_precise: frame0: parent state regs= stack=-16:  R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1")
 __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7")
 __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0")
 __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0")
@@ -780,6 +780,8 @@ __naked void stack_load_preserves_const_precision_subreg(void)
 		"r1 += r2;"
 		"*(u8 *)(r1 + 0) = r2;" /* this should be fine */
 
+		"r2 = *(u64 *)(r10 -8);" /* keep slots alive */
+		"r2 = *(u64 *)(r10 -16);"
 		"r0 = 0;"
 		"exit;"
 	:
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index 61886ed554de..d21d32f6a676 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -282,7 +282,7 @@ __msg("mark_precise: frame0: regs=r0,r6 stack= before 10: (bf) r6 = r0")
 __msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop")
 /* State entering callback body popped from states stack */
 __msg("from 9 to 17: frame1:")
-__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("17: frame1: R10=fp0 cb")
 __msg("17: (b7) r0 = 0")
 __msg("18: (95) exit")
 __msg("returning from callee:")
@@ -411,7 +411,7 @@ __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1")
 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
 /* State entering callback body popped from states stack */
 __msg("from 9 to 15: frame1:")
-__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("15: frame1: R10=fp0 cb")
 __msg("15: (b7) r0 = 0")
 __msg("16: (95) exit")
 __msg("returning from callee:")
@@ -567,7 +567,7 @@ __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6
 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
 /* State entering callback body popped from states stack */
 __msg("from 10 to 17: frame1:")
-__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("17: frame1: R10=fp0 cb")
 __msg("17: (b7) r0 = 0")
 __msg("18: (95) exit")
 __msg("returning from callee:")
@@ -681,7 +681,7 @@ __msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1")
 __msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8")
 __msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4")
 __msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)")
-__msg("mark_precise: frame0: parent state regs= stack=-8:  R0=2 R6=1 R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1")
+__msg("mark_precise: frame0: parent state regs= stack=-8:  R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1")
 __msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7")
 __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
 __msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2")