mirror of
https://github.com/torvalds/linux.git
synced 2026-05-12 16:18:45 +02:00
sched/fair: Fix overflow in vruntime_eligible()
Zhan Xusheng reported sporadically running into an s64 multiplication overflow in
vruntime_eligible().
When constructing a worst case scenario:
If you have cgroups, then you can have an entity of weight 2 (per
calc_group_shares()), and its vlag should then be bounded by: (slice+TICK_NSEC)
* NICE_0_LOAD, which is around 44 bits as per the comment on entity_key().
The other extreme is 100*NICE_0_LOAD, thus you get:
{key, weight}[] := {
puny: { (slice + TICK_NSEC) * NICE_0_LOAD, 2 },
max: { 0, 100*NICE_0_LOAD },
}
The avg_vruntime() would end up being very close to 0 (which is
zero_vruntime), so no real help making that more accurate.
vruntime_eligible(puny) ends up with:
avg = 2 * puny.key (+ 0)
load = 2 + 100 * NICE_0_LOAD
avg >= puny.key * load
And that is: (slice + TICK_NSEC) * NICE_0_LOAD * NICE_0_LOAD * 100, which will
overflow s64.
Zhan suggested using __builtin_mul_overflow(), however after staring at
compiler output for various architectures using godbolt, it seems that using an
__int128 multiplication often results in better code.
Specifically, a number of architectures already compute the __int128 product to
determine the overflow. Eg. arm64 already has the 'smulh' instruction used. By
explicitly doing an __int128 multiply, it will emit the 'mul; smulh' pattern,
which modern cores can fuse (armv8-a clang-22.1.0). x86_64 has fewer branches
(no OF handling).
Since Linux has ARCH_SUPPORTS_INT128 to gate __int128 usage, also provide the
__builtin_mul_overflow() variant as a fallback.
[peterz: Changelog and __int128 bits]
Fixes: 556146ce5e ("sched/fair: Avoid overflow in enqueue_entity()")
Reported-by: Zhan Xusheng <zhanxusheng1024@gmail.com>
Closes: https://patch.msgid.link/20260415145742.10359-1-zhanxusheng%40xiaomi.com
Signed-off-by: Zhan Xusheng <zhanxusheng@xiaomi.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260505103155.GN3102924%40noisy.programming.kicks-ass.net
This commit is contained in:
parent
e744060076
commit
b6eee96843
|
|
@ -882,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
*
|
||||
* lag_i >= 0 -> V >= v_i
|
||||
*
|
||||
* \Sum (v_i - v)*w_i
|
||||
* V = ------------------ + v
|
||||
* \Sum (v_i - v0)*w_i
|
||||
* V = ------------------- + v0
|
||||
* \Sum w_i
|
||||
*
|
||||
* lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
|
||||
* lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i)
|
||||
*
|
||||
* Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
|
||||
* to the loss in precision caused by the division.
|
||||
|
|
@ -894,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
|
||||
{
|
||||
struct sched_entity *curr = cfs_rq->curr;
|
||||
s64 avg = cfs_rq->sum_w_vruntime;
|
||||
s64 key, avg = cfs_rq->sum_w_vruntime;
|
||||
long load = cfs_rq->sum_weight;
|
||||
|
||||
if (curr && curr->on_rq) {
|
||||
|
|
@ -904,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
|
|||
load += weight;
|
||||
}
|
||||
|
||||
return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
|
||||
key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
|
||||
|
||||
/*
|
||||
* The worst case term for @key includes 'TICK_NSEC * NICE_0_LOAD'
|
||||
* and @load obviously includes NICE_0_LOAD. TICK_NSEC is around 24
|
||||
* bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise.
|
||||
*
|
||||
* This gives that on 64bit the product will be at least 64bit which
|
||||
* overflows s64, while on 32bit it will only be 44bits and should fit
|
||||
* comfortably.
|
||||
*/
|
||||
#ifdef CONFIG_64BIT
|
||||
#ifdef CONFIG_ARCH_SUPPORTS_INT128
|
||||
/* This often results in simpler code than __builtin_mul_overflow(). */
|
||||
return avg >= (__int128)key * load;
|
||||
#else
|
||||
s64 rhs;
|
||||
/*
|
||||
* On overflow, the sign of key tells us the correct answer: a large
|
||||
* positive key means vruntime >> V, so not eligible; a large negative
|
||||
* key means vruntime << V, so eligible.
|
||||
*/
|
||||
if (check_mul_overflow(key, load, &rhs))
|
||||
return key <= 0;
|
||||
|
||||
return avg >= rhs;
|
||||
#endif
|
||||
#else /* 32bit */
|
||||
return avg >= key * load;
|
||||
#endif
|
||||
}
|
||||
|
||||
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user