From 8c4840277b6daffe09dea0338f3fce1eb4319a43 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Jan 2025 18:28:44 +0100
Subject: [PATCH 1/5] signal/posixtimers: Handle ignore/blocked sequences
 correctly

syzbot triggered the warning in posixtimer_send_sigqueue(), which warns
about a non-ignored signal being already queued on the ignored list.

The warning is actually bogus, as the following sequence causes this:

    signal($SIG, SIGIGN);
    timer_settime(...);			// arm periodic timer

      timer fires, signal is ignored and queued on ignored list

    sigprocmask(SIG_BLOCK, ...);        // block the signal
    timer_settime(...);			// re-arm periodic timer

      timer fires, signal is not ignored because it is blocked
        ---> Warning triggers as signal is on the ignored list

Ideally timer_settime() could remove the signal, but that's racy and
incomplete vs. other scenarios and requires a full reevaluation of the
pending signal list.

Instead of adding more complexity, handle it gracefully by removing the
warning and requeueing the signal to the pending list. That's correct
versus:

  1) sig[timed]wait() as that does not check for SIGIGN and only relies on
     dequeue_signal() -> posixtimers_deliver_signal() to check whether the
     pending signal is still valid.

  2) Unblocking of the signal.

     - If the unblocking happens before SIGIGN is replaced by a signal
       handler, then the timer is rearmed in dequeue_signal(), but
       get_signal() will ignore it. The next timer expiry will move it back
       to the ignored list.

     - If SIGIGN was replaced before unblocking, then the signal will be
       delivered and a subsequent expiry will queue a signal on the pending
       list again.

There is a related scenario to trigger the complementary warning in the
signal ignored path, which does not expect the signal to be on the pending
list when it is ignored. That can be triggered even before the above change
via:

task1			task2

signal($SIG, SIGIGN);
			sigprocmask(SIG_BLOCK, ...);

timer_create();		// Signal target is task2
timer_settime(...);	// arm periodic timer

   timer fires, signal is not ignored because it is blocked
   and queued on the pending list of task2

       	      	     	syscall()
			   // Sets the pending flag
			   sigprocmask(SIG_UNBLOCK, ...);

			-> preemption, task2 cannot dequeue the signal

timer_settime(...);	// re-arm periodic timer

   timer fires, signal is ignored
        ---> Warning triggers as signal is on task2's pending list
	     and the thread group is not exiting

Consequently, remove that warning too and just keep the signal on the
pending list.

The following attempt to deliver the signal on return to user space of
task2 will ignore the signal and a subsequent expiry will bring it back to
the ignored list, if it did not get blocked or un-ignored before that.

Fixes: df7a996b4dab ("signal: Queue ignored posixtimers on ignore list")
Reported-by: syzbot+3c2e3cc60665d71de2f7@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/all/87ikqhcnjn.ffs@tglx
---
 kernel/signal.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 989b1cc9116a..a2afd54303f0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
 
 		if (!list_empty(&q->list)) {
 			/*
-			 * If task group is exiting with the signal already pending,
-			 * wait for __exit_signal() to do its job. Otherwise if
-			 * ignored, it's not supposed to be queued. Try to survive.
+			 * The signal was ignored and blocked. The timer
+			 * expiry queued it because blocked signals are
+			 * queued independent of the ignored state.
+			 *
+			 * The unblocking set SIGPENDING, but the signal
+			 * was not yet dequeued from the pending list.
+			 * So prepare_signal() sees unblocked and ignored,
+			 * which ends up here. Leave it queued like a
+			 * regular signal.
+			 *
+			 * The same happens when the task group is exiting
+			 * and the signal is already queued.
+			 * prepare_signal() treats SIGNAL_GROUP_EXIT as
+			 * ignored independent of its queued state. This
+			 * gets cleaned up in __exit_signal().
 			 */
-			WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
 			goto out;
 		}
 
@@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
 		goto out;
 	}
 
-	/* This should never happen and leaks a reference count */
-	if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
-		hlist_del_init(&tmr->ignored_list);
-
 	if (unlikely(!list_empty(&q->list))) {
 		/* This holds a reference count already */
 		result = TRACE_SIGNAL_ALREADY_PENDING;
 		goto out;
 	}
 
-	posixtimer_sigqueue_getref(q);
+	/*
+	 * If the signal is on the ignore list, it got blocked after it was
+	 * ignored earlier. But nothing lifted the ignore. Move it back to
+	 * the pending list to be consistent with the regular signal
+	 * handling. This already holds a reference count.
+	 *
+	 * If it's not on the ignore list acquire a reference count.
+	 */
+	if (likely(hlist_unhashed(&tmr->ignored_list)))
+		posixtimer_sigqueue_getref(q);
+	else
+		hlist_del_init(&tmr->ignored_list);
+
 	posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
 	result = TRACE_SIGNAL_DELIVERED;
 out:

From b729cc1ec21a5899b7879ccfbe1786664928d597 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 15 Jan 2025 00:15:04 +0100
Subject: [PATCH 2/5] timers/migration: Fix another race between hotplug and
 idle entry/exit

Commit 10a0e6f3d3db ("timers/migration: Move hierarchy setup into
cpuhotplug prepare callback") fixed a race between idle exit and CPU
hotplug up leading to a wrong "0" value migrator assigned to the top
level. However there is still a situation that remains unhandled:

         [GRP0:0]
        migrator  = TMIGR_NONE
        active    = NONE
        groupmask = 0
        /     \      \
       0       1     2..7
     idle      idle   idle

0) The system is fully idle.

         [GRP0:0]
        migrator  = CPU 0
        active    = CPU 0
        groupmask = 0
        /     \      \
       0       1     2..7
     active   idle   idle

1) CPU 0 is activating. It has done the cmpxchg on the top's ->migr_state
but it hasn't yet returned to __walk_groups().

         [GRP0:0]
        migrator  = CPU 0
        active    = CPU 0, CPU 1
        groupmask = 0
        /     \      \
       0       1     2..7
     active  active  idle

2) CPU 1 is activating. CPU 0 stays the migrator (still stuck in
__walk_groups(), delayed by #VMEXIT for example).

                 [GRP1:0]
              migrator = TMIGR_NONE
              active   = NONE
              groupmask = 0
              /                  \
        [GRP0:0]                      [GRP0:1]
       migrator  = CPU 0           migrator = TMIGR_NONE
       active    = CPU 0, CPU1     active   = NONE
       groupmask = 2               groupmask = 1
       /     \      \
      0       1     2..7                   8
    active  active  idle              !online

3) CPU 8 is preparing to boot. CPUHP_TMIGR_PREPARE is being ran by CPU 1
which has created the GRP0:1 and the new top GRP1:0 connected to GRP0:1
and GRP0:0. The groupmask of GRP0:0 is now 2. CPU 1 hasn't yet
propagated its activation up to GRP1:0.

                 [GRP1:0]
              migrator = 0 (!!!)
              active   = NONE
              groupmask = 0
              /                  \
        [GRP0:0]                  [GRP0:1]
       migrator  = CPU 0           migrator = TMIGR_NONE
       active    = CPU 0, CPU1     active   = NONE
       groupmask = 2               groupmask = 1
       /     \      \
      0       1     2..7                   8
    active  active  idle                !online

4) CPU 0 finally resumed after its #VMEXIT. It's in __walk_groups()
returning from tmigr_cpu_active(). The new top GRP1:0 is visible and
fetched but the freshly updated groupmask of GRP0:0 may not be visible
due to lack of ordering! As a result tmigr_active_up() is called to
GRP0:0 with a child's groupmask of "0". This buggy "0" groupmask then
becomes the migrator for GRP1:0 forever. As a result, timers on a fully
idle system get ignored.

One possible fix would be to define TMIGR_NONE as "0" so that such a
race would have no effect. And after all TMIGR_NONE doesn't need to be
anything else. However this would leave an uncomfortable state machine
where gears happen not to break by chance but are vulnerable to future
modifications.

Keep TMIGR_NONE as is instead and pre-initialize to "1" the groupmask of
any newly created top level. This groupmask is guaranteed to be visible
upon fetching the corresponding group for the 1st time:

_ By the upcoming CPU thanks to CPU hotplug synchronization between the
  control CPU (BP) and the booting one (AP).

_ By the control CPU since the groupmask and parent pointers are
  initialized locally.

_ By all CPUs belonging to the same group than the control CPU because
  they must wait for it to ever become idle before needing to walk to
  the new top. The cmpcxhg() on ->migr_state then makes sure its
  groupmask is visible.

With this pre-initialization, it is guaranteed that if a future top level
is linked to an old one, it is walked through with a valid groupmask.

Fixes: 10a0e6f3d3db ("timers/migration: Move hierarchy setup into cpuhotplug prepare callback")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250114231507.21672-2-frederic@kernel.org
---
 kernel/time/timer_migration.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 8d57f7686bb0..c8a8ea2e5b98 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1487,6 +1487,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	s.seq = 0;
 	atomic_set(&group->migr_state, s.state);
 
+	/*
+	 * If this is a new top-level, prepare its groupmask in advance.
+	 * This avoids accidents where yet another new top-level is
+	 * created in the future and made visible before the current groupmask.
+	 */
+	if (list_empty(&tmigr_level_list[lvl])) {
+		group->groupmask = BIT(0);
+		/*
+		 * The previous top level has prepared its groupmask already,
+		 * simply account it as the first child.
+		 */
+		if (lvl > 0)
+			group->num_children = 1;
+	}
+
 	timerqueue_init_head(&group->events);
 	timerqueue_init(&group->groupevt.nextevt);
 	group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1565,20 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	raw_spin_lock_irq(&child->lock);
 	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
+	if (activate) {
+		/*
+		 * @child is the old top and @parent the new one. In this
+		 * case groupmask is pre-initialized and @child already
+		 * accounted, along with its new sibling corresponding to the
+		 * CPU going up.
+		 */
+		WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+	} else {
+		/* Adding @child for the CPU going up to @parent. */
+		child->groupmask = BIT(parent->num_children++);
+	}
+
 	child->parent = parent;
-	child->groupmask = BIT(parent->num_children++);
 
 	raw_spin_unlock(&parent->lock);
 	raw_spin_unlock_irq(&child->lock);

From de3ced72a79280fefd680e5e101d8b9f03cfa1d7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 15 Jan 2025 00:15:05 +0100
Subject: [PATCH 3/5] timers/migration: Enforce group initialization visibility
 to tree walkers

Commit 2522c84db513 ("timers/migration: Fix another race between hotplug
and idle entry/exit") fixed yet another race between idle exit and CPU
hotplug up leading to a wrong "0" value migrator assigned to the top
level. However there is yet another situation that remains unhandled:

         [GRP0:0]
      migrator  = TMIGR_NONE
      active    = NONE
      groupmask = 1
      /     \      \
     0       1     2..7
   idle      idle   idle

0) The system is fully idle.

         [GRP0:0]
      migrator  = CPU 0
      active    = CPU 0
      groupmask = 1
      /     \      \
     0       1     2..7
   active   idle   idle

1) CPU 0 is activating. It has done the cmpxchg on the top's ->migr_state
but it hasn't yet returned to __walk_groups().

         [GRP0:0]
      migrator  = CPU 0
      active    = CPU 0, CPU 1
      groupmask = 1
      /     \      \
     0       1     2..7
   active  active  idle

2) CPU 1 is activating. CPU 0 stays the migrator (still stuck in
__walk_groups(), delayed by #VMEXIT for example).

                    [GRP1:0]
                migrator = TMIGR_NONE
                active   = NONE
                groupmask = 1
             /                   \
         [GRP0:0]                  [GRP0:1]
      migrator  = CPU 0           migrator = TMIGR_NONE
      active    = CPU 0, CPU1     active   = NONE
      groupmask = 1               groupmask = 2
      /     \      \
     0       1     2..7                   8
   active  active  idle                !online

3) CPU 8 is preparing to boot. CPUHP_TMIGR_PREPARE is being ran by CPU 1
which has created the GRP0:1 and the new top GRP1:0 connected to GRP0:1
and GRP0:0. CPU 1 hasn't yet propagated its activation up to GRP1:0.

                    [GRP1:0]
               migrator = GRP0:0
               active   = GRP0:0
               groupmask = 1
             /                   \
         [GRP0:0]                  [GRP0:1]
     migrator  = CPU 0           migrator = TMIGR_NONE
     active    = CPU 0, CPU1     active   = NONE
     groupmask = 1               groupmask = 2
     /     \      \
    0       1     2..7                   8
  active  active  idle                !online

4) CPU 0 finally resumed after its #VMEXIT. It's in __walk_groups()
returning from tmigr_cpu_active(). The new top GRP1:0 is visible and
fetched and the pre-initialized groupmask of GRP0:0 is also visible.
As a result tmigr_active_up() is called to GRP1:0 with GRP0:0 as active
and migrator. CPU 0 is returning to __walk_groups() but suffers again
a #VMEXIT.

                    [GRP1:0]
               migrator = GRP0:0
               active   = GRP0:0
               groupmask = 1
             /                   \
         [GRP0:0]                  [GRP0:1]
     migrator  = CPU 0           migrator = TMIGR_NONE
     active    = CPU 0, CPU1     active   = NONE
     groupmask = 1               groupmask = 2
     /     \      \
    0       1     2..7                   8
  active  active  idle                 !online

5) CPU 1 propagates its activation of GRP0:0 to GRP1:0. This has no
   effect since CPU 0 did it already.

                    [GRP1:0]
               migrator = GRP0:0
               active   = GRP0:0, GRP0:1
               groupmask = 1
             /                   \
         [GRP0:0]                  [GRP0:1]
     migrator  = CPU 0           migrator = CPU 8
     active    = CPU 0, CPU1     active   = CPU 8
     groupmask = 1               groupmask = 2
     /     \      \                     \
    0       1     2..7                   8
  active  active  idle                 active

6) CPU 1 links CPU 8 to its group. CPU 8 boots and goes through
   CPUHP_AP_TMIGR_ONLINE which propagates activation.

                                   [GRP2:0]
                              migrator = TMIGR_NONE
                              active   = NONE
                              groupmask = 1
                             /                \
                    [GRP1:0]                    [GRP1:1]
               migrator = GRP0:0              migrator = TMIGR_NONE
               active   = GRP0:0, GRP0:1      active   = NONE
               groupmask = 1                  groupmask = 2
             /                   \
         [GRP0:0]                  [GRP0:1]                [GRP0:2]
     migrator  = CPU 0           migrator = CPU 8        migrator = TMIGR_NONE
     active    = CPU 0, CPU1     active   = CPU 8        active   = NONE
     groupmask = 1               groupmask = 2           groupmask = 0
     /     \      \                     \
    0       1     2..7                   8                  64
  active  active  idle                 active             !online

7) CPU 64 is booting. CPUHP_TMIGR_PREPARE is being ran by CPU 1
which has created the GRP1:1, GRP0:2 and the new top GRP2:0 connected to
GRP1:1 and GRP1:0. CPU 1 hasn't yet propagated its activation up to
GRP2:0.

                                   [GRP2:0]
                              migrator = 0 (!!!)
                              active   = NONE
                              groupmask = 1
                             /                \
                    [GRP1:0]                    [GRP1:1]
               migrator = GRP0:0              migrator = TMIGR_NONE
               active   = GRP0:0, GRP0:1      active   = NONE
               groupmask = 1                  groupmask = 2
             /                   \
         [GRP0:0]                  [GRP0:1]                [GRP0:2]
     migrator  = CPU 0           migrator = CPU 8        migrator = TMIGR_NONE
     active    = CPU 0, CPU1     active   = CPU 8        active   = NONE
     groupmask = 1               groupmask = 2           groupmask = 0
     /     \      \                     \
    0       1     2..7                   8                  64
  active  active  idle                 active             !online

8) CPU 0 finally resumed after its #VMEXIT. It's in __walk_groups()
returning from tmigr_cpu_active(). The new top GRP2:0 is visible and
fetched but the pre-initialized groupmask of GRP1:0 is not because no
ordering made its initialization visible. As a result tmigr_active_up()
may be called to GRP2:0 with a "0" child's groumask. Leaving the timers
ignored for ever when the system is fully idle.

The race is highly theoretical and perhaps impossible in practice but
the groupmask of the child is not the only concern here as the whole
initialization of the child is not guaranteed to be visible to any
tree walker racing against hotplug (idle entry/exit, remote handling,
etc...). Although the current code layout seem to be resilient to such
hazards, this doesn't tell much about the future.

Fix this with enforcing address dependency between group initialization
and the write/read to the group's parent's pointer. Fortunately that
doesn't involve any barrier addition in the fast paths.

Fixes: 10a0e6f3d3db ("timers/migration: Move hierarchy setup into cpuhotplug prepare callback")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250114231507.21672-3-frederic@kernel.org
---
 kernel/time/timer_migration.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index c8a8ea2e5b98..371a62a749aa 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 			break;
 
 		child = group;
-		group = group->parent;
+		/*
+		 * Pairs with the store release on group connection
+		 * to make sure group initialization is visible.
+		 */
+		group = READ_ONCE(group->parent);
 		data->childmask = child->groupmask;
+		WARN_ON_ONCE(!data->childmask);
 	} while (group);
 }
 
@@ -1578,7 +1583,12 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 		child->groupmask = BIT(parent->num_children++);
 	}
 
-	child->parent = parent;
+	/*
+	 * Make sure parent initialization is visible before publishing it to a
+	 * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+	 * address dependency that pairs with the READ_ONCE() in __walk_groups().
+	 */
+	smp_store_release(&child->parent, parent);
 
 	raw_spin_unlock(&parent->lock);
 	raw_spin_unlock_irq(&child->lock);

From 922efd298bb2636880408c00942dbd54d8bf6e0d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 15 Jan 2025 00:15:06 +0100
Subject: [PATCH 4/5] timers/migration: Annotate accesses to ignore flag

The group's ignore flag is:

_ read under the group's lock (idle entry, remote expiry)
_ turned on/off under the group's lock (idle entry, remote expiry)
_ turned on locklessly on idle exit

When idle entry or remote expiry clear the "ignore" flag of a group, the
operation must be synchronized against other concurrent idle entry or
remote expiry to make sure the related group timer is never missed. To
enforce this synchronization, both "ignore" clear and read are
performed under the group lock.

On the contrary, whether idle entry or remote expiry manage to observe
the "ignore" flag turned on by a CPU exiting idle is a matter of
optimization. If that flag set is missed or cleared concurrently, the
worst outcome is a migrator wasting time remotely handling a "ghost"
timer. This is why the ignore flag can be set locklessly.

Unfortunately, the related lockless accesses are bare and miss
appropriate annotations. KCSAN rightfully complains:

		 BUG: KCSAN: data-race in __tmigr_cpu_activate / print_report

		 write to 0xffff88842fc28004 of 1 bytes by task 0 on cpu 0:
		 __tmigr_cpu_activate
		 tmigr_cpu_activate
		 timer_clear_idle
		 tick_nohz_restart_sched_tick
		 tick_nohz_idle_exit
		 do_idle
		 cpu_startup_entry
		 kernel_init
		 do_initcalls
		 clear_bss
		 reserve_bios_regions
		 common_startup_64

		 read to 0xffff88842fc28004 of 1 bytes by task 0 on cpu 1:
		 print_report
		 kcsan_report_known_origin
		 kcsan_setup_watchpoint
		 tmigr_next_groupevt
		 tmigr_update_events
		 tmigr_inactive_up
		 __walk_groups+0x50/0x77
		 walk_groups
		 __tmigr_cpu_deactivate
		 tmigr_cpu_deactivate
		 __get_next_timer_interrupt
		 timer_base_try_to_set_idle
		 tick_nohz_stop_tick
		 tick_nohz_idle_stop_tick
		 cpuidle_idle_call
		 do_idle

Although the relevant accesses could be marked as data_race(), the
"ignore" flag being read several times within the same
tmigr_update_events() function is confusing and error prone. Prefer
reading it once in that function and make use of similar/paired accesses
elsewhere with appropriate comments when necessary.

Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250114231507.21672-4-frederic@kernel.org
Closes: https://lore.kernel.org/oe-lkp/202501031612.62e0c498-lkp@intel.com
---
 kernel/time/timer_migration.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 371a62a749aa..066c9ddca4ec 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -569,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
 	while ((node = timerqueue_getnext(&group->events))) {
 		evt = container_of(node, struct tmigr_event, nextevt);
 
-		if (!evt->ignore) {
+		if (!READ_ONCE(evt->ignore)) {
 			WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
 			return evt;
 		}
@@ -665,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
 	 * lock is held while updating the ignore flag in idle path. So this
 	 * state change will not be lost.
 	 */
-	group->groupevt.ignore = true;
+	WRITE_ONCE(group->groupevt.ignore, true);
 
 	return walk_done;
 }
@@ -726,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	union tmigr_state childstate, groupstate;
 	bool remote = data->remote;
 	bool walk_done = false;
+	bool ignore;
 	u64 nextexp;
 
 	if (child) {
@@ -744,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		nextexp = child->next_expiry;
 		evt = &child->groupevt;
 
-		evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+		/*
+		 * This can race with concurrent idle exit (activate).
+		 * If the current writer wins, a useless remote expiration may
+		 * be scheduled. If the activate wins, the event is properly
+		 * ignored.
+		 */
+		ignore = (nextexp == KTIME_MAX) ? true : false;
+		WRITE_ONCE(evt->ignore, ignore);
 	} else {
 		nextexp = data->nextexp;
 
 		first_childevt = evt = data->evt;
+		ignore = evt->ignore;
 
 		/*
 		 * Walking the hierarchy is required in any case when a
@@ -774,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 		 * first event information of the group is updated properly and
 		 * also handled properly, so skip this fast return path.
 		 */
-		if (evt->ignore && !remote && group->parent)
+		if (ignore && !remote && group->parent)
 			return true;
 
 		raw_spin_lock(&group->lock);
@@ -788,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 	 * queue when the expiry time changed only or when it could be ignored.
 	 */
 	if (timerqueue_node_queued(&evt->nextevt)) {
-		if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+		if ((evt->nextevt.expires == nextexp) && !ignore) {
 			/* Make sure not to miss a new CPU event with the same expiry */
 			evt->cpu = first_childevt->cpu;
 			goto check_toplvl;
@@ -798,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 			WRITE_ONCE(group->next_expiry, KTIME_MAX);
 	}
 
-	if (evt->ignore) {
+	if (ignore) {
 		/*
 		 * When the next child event could be ignored (nextexp is
 		 * KTIME_MAX) and there was no remote timer handling before or

From 2f8dea1692eef2b7ba6a256246ed82c365fdc686 Mon Sep 17 00:00:00 2001
From: Koichiro Den <koichiro.den@canonical.com>
Date: Fri, 20 Dec 2024 22:44:21 +0900
Subject: [PATCH 5/5] hrtimers: Handle CPU state correctly on hotplug

Consider a scenario where a CPU transitions from CPUHP_ONLINE to halfway
through a CPU hotunplug down to CPUHP_HRTIMERS_PREPARE, and then back to
CPUHP_ONLINE:

Since hrtimers_prepare_cpu() does not run, cpu_base.hres_active remains set
to 1 throughout. However, during a CPU unplug operation, the tick and the
clockevents are shut down at CPUHP_AP_TICK_DYING. On return to the online
state, for instance CFS incorrectly assumes that the hrtick is already
active, and the chance of the clockevent device to transition to oneshot
mode is also lost forever for the CPU, unless it goes back to a lower state
than CPUHP_HRTIMERS_PREPARE once.

This round-trip reveals another issue; cpu_base.online is not set to 1
after the transition, which appears as a WARN_ON_ONCE in enqueue_hrtimer().

Aside of that, the bulk of the per CPU state is not reset either, which
means there are dangling pointers in the worst case.

Address this by adding a corresponding startup() callback, which resets the
stale per CPU state and sets the online flag.

[ tglx: Make the new callback unconditionally available, remove the online
  	modification in the prepare() callback and clear the remaining
  	state in the starting callback instead of the prepare callback ]

Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Signed-off-by: Koichiro Den <koichiro.den@canonical.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/20241220134421.3809834-1-koichiro.den@canonical.com
---
 include/linux/hrtimer.h |  1 +
 kernel/cpu.c            |  2 +-
 kernel/time/hrtimer.c   | 11 ++++++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 7ef5f7ef31a9..f7bfdcf0dda3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -386,6 +386,7 @@ extern void __init hrtimers_init(void);
 extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
+int hrtimers_cpu_starting(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_cpu_dying(unsigned int cpu);
 #else
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b605334f8ee6..0509a9733745 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	},
 	[CPUHP_AP_HRTIMERS_DYING] = {
 		.name			= "hrtimers:dying",
-		.startup.single		= NULL,
+		.startup.single		= hrtimers_cpu_starting,
 		.teardown.single	= hrtimers_cpu_dying,
 	},
 	[CPUHP_AP_TICK_DYING] = {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 80fe3749d2db..030426c8c944 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2202,6 +2202,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	}
 
 	cpu_base->cpu = cpu;
+	hrtimer_cpu_base_init_expiry_lock(cpu_base);
+	return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+	/* Clear out any left over state from a CPU down operation */
 	cpu_base->active_bases = 0;
 	cpu_base->hres_active = 0;
 	cpu_base->hang_detected = 0;
@@ -2210,7 +2219,6 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
 	cpu_base->online = 1;
-	hrtimer_cpu_base_init_expiry_lock(cpu_base);
 	return 0;
 }
 
@@ -2286,5 +2294,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 void __init hrtimers_init(void)
 {
 	hrtimers_prepare_cpu(smp_processor_id());
+	hrtimers_cpu_starting(smp_processor_id());
 	open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
 }