From 66067c3c8a1ee097e9c30e8bbd643b12ba54a6b0 Mon Sep 17 00:00:00 2001
From: Brian Norris
Date: Thu, 22 May 2025 14:08:01 -0700
Subject: [PATCH 1/5] genirq: Add kunit tests for depth counts

There have been a few bugs and/or misunderstandings about the reference
counting and startup/shutdown behavior in the IRQ core and related CPU
hotplug code. These four test cases try to capture a few of the
interesting scenarios:

* irq_disable_depth_test: basic request/disable/enable sequence

* irq_free_disabled_test: request/disable/free/re-request sequence -
  this catches errors in previous revisions of this work

* irq_cpuhotplug_test: exercises a managed-affinity IRQ plus CPU
  hotplug. This captures a problematic case which was fixed recently.
  This test requires CONFIG_SMP and a hotpluggable CPU#1.

* irq_shutdown_depth_test: exercises behavior similar to
  irq_cpuhotplug_test, but uses the irq_*() APIs directly instead of
  going through CPU hotplug. This still requires CONFIG_SMP, because
  managed-affinity is stubbed out (and not all APIs are even present)
  without it.

Note the use of 'imply SMP': ARCH=um doesn't support SMP, and kunit is
often exercised there. Thus, 'imply' turns SMP on where possible (such
as ARCH=x86_64), but leaves it off where it's not supported.

Behavior on various SMP and ARCH configurations:

$ tools/testing/kunit/kunit.py run 'irq_test_cases*' --arch x86_64 --qemu_args '-smp 2'
[...]
[11:12:24] Testing complete. Ran 4 tests: passed: 4

$ tools/testing/kunit/kunit.py run 'irq_test_cases*' --arch x86_64
[...]
[11:13:27] [SKIPPED] irq_cpuhotplug_test
[11:13:27] ================= [PASSED] irq_test_cases ==================
[11:13:27] ============================================================
[11:13:27] Testing complete. Ran 4 tests: passed: 3, skipped: 1

# default: ARCH=um
$ tools/testing/kunit/kunit.py run 'irq_test_cases*'
[11:14:26] [SKIPPED] irq_shutdown_depth_test
[11:14:26] [SKIPPED] irq_cpuhotplug_test
[11:14:26] ================= [PASSED] irq_test_cases ==================
[11:14:26] ============================================================
[11:14:26] Testing complete. Ran 4 tests: passed: 2, skipped: 2

Without commit 788019eb559f ("genirq: Retain disable depth for managed
interrupts across CPU hotplug"), this fails as follows:

[11:18:55] =============== irq_test_cases (4 subtests) ================
[11:18:55] [PASSED] irq_disable_depth_test
[11:18:55] [PASSED] irq_free_disabled_test
[11:18:55] # irq_shutdown_depth_test: EXPECTATION FAILED at kernel/irq/irq_test.c:147
[11:18:55] Expected desc->depth == 1, but
[11:18:55]     desc->depth == 0 (0x0)
[11:18:55] ------------[ cut here ]------------
[11:18:55] Unbalanced enable for IRQ 26
[11:18:55] WARNING: CPU: 1 PID: 36 at kernel/irq/manage.c:792 __enable_irq+0x36/0x60
...
[11:18:55] [FAILED] irq_shutdown_depth_test
[11:18:55] #1
[11:18:55] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:202
[11:18:55] Expected irqd_is_activated(data) to be false, but is true
[11:18:55] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:203
[11:18:55] Expected irqd_is_started(data) to be false, but is true
[11:18:55] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:204
[11:18:55] Expected desc->depth == 1, but
[11:18:55]     desc->depth == 0 (0x0)
[11:18:55] ------------[ cut here ]------------
[11:18:55] Unbalanced enable for IRQ 27
[11:18:55] WARNING: CPU: 0 PID: 38 at kernel/irq/manage.c:792 __enable_irq+0x36/0x60
...
[11:18:55] [FAILED] irq_cpuhotplug_test
[11:18:55] # module: irq_test
[11:18:55] # irq_test_cases: pass:2 fail:2 skip:0 total:4
[11:18:55] # Totals: pass:2 fail:2 skip:0 total:4
[11:18:55] ================= [FAILED] irq_test_cases ==================
[11:18:55] ============================================================
[11:18:55] Testing complete. Ran 4 tests: passed: 2, failed: 2

Signed-off-by: Brian Norris
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250522210837.4135244-1-briannorris@chromium.org
---
 kernel/irq/Kconfig    |  11 ++
 kernel/irq/Makefile   |   1 +
 kernel/irq/irq_test.c | 229 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 241 insertions(+)
 create mode 100644 kernel/irq/irq_test.c

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3f02a0e45254..1da5e9d9da71 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -144,6 +144,17 @@ config GENERIC_IRQ_DEBUGFS
 config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
         bool
 
+config IRQ_KUNIT_TEST
+        bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
+        depends on KUNIT=y
+        default KUNIT_ALL_TESTS
+        imply SMP
+        help
+          This option enables KUnit tests for the IRQ subsystem API. These are
+          only for development and testing, not for regular kernel use cases.
+
+          If unsure, say N.
+
 endmenu
 
 config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index c0f44c06d69d..6ab3a4055667 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
 obj-$(CONFIG_SMP) += affinity.o
 obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
 obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
+obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
new file mode 100644
index 000000000000..5161b56a12f9
--- /dev/null
+++ b/kernel/irq/irq_test.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqdomain.h>
+#include <linux/platform_device.h>
+#include <kunit/test.h>
+
+#include "internals.h"
+
+static irqreturn_t noop_handler(int irq, void *data)
+{
+        return IRQ_HANDLED;
+}
+
+static void noop(struct irq_data *data) { }
+static unsigned int noop_ret(struct irq_data *data) { return 0; }
+
+static int noop_affinity(struct irq_data *data, const struct cpumask *dest,
+                         bool force)
+{
+        irq_data_update_effective_affinity(data, dest);
+
+        return 0;
+}
+
+static struct irq_chip fake_irq_chip = {
+        .name = "fake",
+        .irq_startup = noop_ret,
+        .irq_shutdown = noop,
+        .irq_enable = noop,
+        .irq_disable = noop,
+        .irq_ack = noop,
+        .irq_mask = noop,
+        .irq_unmask = noop,
+        .irq_set_affinity = noop_affinity,
+        .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+        struct irq_desc *desc;
+        int virq, ret;
+
+        virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+        KUNIT_ASSERT_GE(test, virq, 0);
+
+        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+        desc = irq_to_desc(virq);
+        KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+        ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+        KUNIT_EXPECT_EQ(test, ret, 0);
+
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        disable_irq(virq);
+        KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+        enable_irq(virq);
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        free_irq(virq, NULL);
+}
+
+static void irq_free_disabled_test(struct kunit *test)
+{
+        struct irq_desc *desc;
+        int virq, ret;
+
+        virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+        KUNIT_ASSERT_GE(test, virq, 0);
+
+        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+        desc = irq_to_desc(virq);
+        KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+        ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+        KUNIT_EXPECT_EQ(test, ret, 0);
+
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        disable_irq(virq);
+        KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+        free_irq(virq, NULL);
+        KUNIT_EXPECT_GE(test, desc->depth, 1);
+
+        ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+        KUNIT_EXPECT_EQ(test, ret, 0);
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        free_irq(virq, NULL);
+}
+
+static void irq_shutdown_depth_test(struct kunit *test)
+{
+        struct irq_desc *desc;
+        struct irq_data *data;
+        int virq, ret;
+        struct irq_affinity_desc affinity = {
+                .is_managed = 1,
+                .mask = CPU_MASK_ALL,
+        };
+
+        if (!IS_ENABLED(CONFIG_SMP))
+                kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
+
+        virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
+        KUNIT_ASSERT_GE(test, virq, 0);
+
+        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+        desc = irq_to_desc(virq);
+        KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+        data = irq_desc_get_irq_data(desc);
+        KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+        ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+        KUNIT_EXPECT_EQ(test, ret, 0);
+
+        KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+        KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+        KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        disable_irq(virq);
+        KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+        irq_shutdown_and_deactivate(desc);
+
+        KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+        KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+
+        KUNIT_EXPECT_EQ(test, irq_activate(desc), 0);
+#ifdef CONFIG_SMP
+        irq_startup_managed(desc);
+#endif
+
+        KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+        enable_irq(virq);
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        free_irq(virq, NULL);
+}
+
+static void irq_cpuhotplug_test(struct kunit *test)
+{
+        struct irq_desc *desc;
+        struct irq_data *data;
+        int virq, ret;
+        struct irq_affinity_desc affinity = {
+                .is_managed = 1,
+        };
+
+        if (!IS_ENABLED(CONFIG_SMP))
+                kunit_skip(test, "requires CONFIG_SMP for CPU hotplug");
+        if (!get_cpu_device(1))
+                kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
+        if (!cpu_is_hotpluggable(1))
+                kunit_skip(test, "CPU 1 must be hotpluggable");
+
+        cpumask_copy(&affinity.mask, cpumask_of(1));
+
+        virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
+        KUNIT_ASSERT_GE(test, virq, 0);
+
+        irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+        desc = irq_to_desc(virq);
+        KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+        data = irq_desc_get_irq_data(desc);
+        KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+        ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+        KUNIT_EXPECT_EQ(test, ret, 0);
+
+        KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+        KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+        KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        disable_irq(virq);
+        KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+        KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
+        KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+        KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+        KUNIT_EXPECT_GE(test, desc->depth, 1);
+        KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
+
+        KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+        KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+        KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+        enable_irq(virq);
+        KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+        KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+        KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+        free_irq(virq, NULL);
+}
+
+static struct kunit_case irq_test_cases[] = {
+        KUNIT_CASE(irq_disable_depth_test),
+        KUNIT_CASE(irq_free_disabled_test),
+        KUNIT_CASE(irq_shutdown_depth_test),
+        KUNIT_CASE(irq_cpuhotplug_test),
+        {}
+};
+
+static struct kunit_suite irq_test_suite = {
+        .name = "irq_test_cases",
+        .test_cases = irq_test_cases,
+};
+
+kunit_test_suite(irq_test_suite);
+MODULE_DESCRIPTION("IRQ unit test suite");
+MODULE_LICENSE("GPL");

From 46958a7bac2d32fda43fd7cd1858aa414640fbd1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 18 Jul 2025 20:54:06 +0200
Subject: [PATCH 2/5] genirq: Remove pointless local variable

The variable is only used in one place, which can simply take the
constant as a function argument.

Signed-off-by: Thomas Gleixner
Tested-by: Liangyan
Link: https://lore.kernel.org/all/20250718185311.884314473@linutronix.de
---
 kernel/irq/chip.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2b274007e8ba..5bb26fc5368b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -466,13 +466,11 @@ static bool irq_check_poll(struct irq_desc *desc)
 
 static bool irq_can_handle_pm(struct irq_desc *desc)
 {
-        unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
-
         /*
          * If the interrupt is not in progress and is not an armed
          * wakeup interrupt, proceed.
          */
-        if (!irqd_has_set(&desc->irq_data, mask))
+        if (!irqd_has_set(&desc->irq_data, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
                 return true;
 
         /*

From 4e879dedd571128ed5aa4d5989ec0a1938804d20 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 18 Jul 2025 20:54:08 +0200
Subject: [PATCH 3/5] genirq: Move irq_wait_for_poll() to call site

Move it to the call site so that the wait for the INPROGRESS flag can
be reused by an upcoming mitigation for a potential live lock in the
edge type handler.

No functional change.

Signed-off-by: Thomas Gleixner
Tested-by: Liangyan
Reviewed-by: Jiri Slaby
Link: https://lore.kernel.org/all/20250718185311.948555026@linutronix.de
---
 kernel/irq/chip.c      | 31 +++++++++++++++++++++++--------
 kernel/irq/internals.h |  2 +-
 kernel/irq/spurious.c  | 37 +------------------------------------
 3 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5bb26fc5368b..290244ca28dd 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -457,11 +457,21 @@ void unmask_threaded_irq(struct irq_desc *desc)
         unmask_irq(desc);
 }
 
-static bool irq_check_poll(struct irq_desc *desc)
+/* Busy wait until INPROGRESS is cleared */
+static bool irq_wait_on_inprogress(struct irq_desc *desc)
 {
-        if (!(desc->istate & IRQS_POLL_INPROGRESS))
-                return false;
-        return irq_wait_for_poll(desc);
+        if (IS_ENABLED(CONFIG_SMP)) {
+                do {
+                        raw_spin_unlock(&desc->lock);
+                        while (irqd_irq_inprogress(&desc->irq_data))
+                                cpu_relax();
+                        raw_spin_lock(&desc->lock);
+                } while (irqd_irq_inprogress(&desc->irq_data));
+
+                /* Might have been disabled in meantime */
+                return !irqd_irq_disabled(&desc->irq_data) && desc->action;
+        }
+        return false;
 }
 
 static bool irq_can_handle_pm(struct irq_desc *desc)
@@ -481,10 +491,15 @@ static bool irq_can_handle_pm(struct irq_desc *desc)
         if (irq_pm_check_wakeup(desc))
                 return false;
 
-        /*
-         * Handle a potential concurrent poll on a different core.
-         */
-        return irq_check_poll(desc);
+        /* Check whether the interrupt is polled on another CPU */
+        if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) {
+                if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+                              "irq poll in progress on cpu %d for irq %d\n",
+                              smp_processor_id(), desc->irq_data.irq))
+                        return false;
+                return irq_wait_on_inprogress(desc);
+        }
+        return false;
 }
 
 static inline bool irq_can_handle_actions(struct irq_desc *desc)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index aebfe225c9a6..82b0d67022c4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,6 +20,7 @@
 #define istate core_internal_state__do_not_mess_with_it
 
 extern bool noirqdebug;
+extern int irq_poll_cpu;
 
 extern struct irqaction chained_action;
 
@@ -112,7 +113,6 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
 int check_irq_resend(struct irq_desc *desc, bool inject);
 void clear_irq_resend(struct irq_desc *desc);
 void irq_resend_init(struct irq_desc *desc);
-bool irq_wait_for_poll(struct irq_desc *desc);
 void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
 
 void wake_threads_waitq(struct irq_desc *desc);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 8f26982e7300..73280ccb74b0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -19,44 +19,9 @@ static int irqfixup __read_mostly;
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
 static void poll_spurious_irqs(struct timer_list *unused);
 static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
-static int irq_poll_cpu;
+int irq_poll_cpu;
 static atomic_t irq_poll_active;
 
-/*
- * We wait here for a poller to finish.
- *
- * If the poll runs on this CPU, then we yell loudly and return
- * false. That will leave the interrupt line disabled in the worst
- * case, but it should never happen.
- *
- * We wait until the poller is done and then recheck disabled and
- * action (about to be disabled). Only if it's still active, we return
- * true and let the handler run.
- */
-bool irq_wait_for_poll(struct irq_desc *desc)
-{
-        lockdep_assert_held(&desc->lock);
-
-        if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
-                      "irq poll in progress on cpu %d for irq %d\n",
-                      smp_processor_id(), desc->irq_data.irq))
-                return false;
-
-#ifdef CONFIG_SMP
-        do {
-                raw_spin_unlock(&desc->lock);
-                while (irqd_irq_inprogress(&desc->irq_data))
-                        cpu_relax();
-                raw_spin_lock(&desc->lock);
-        } while (irqd_irq_inprogress(&desc->irq_data));
-        /* Might have been disabled in meantime */
-        return !irqd_irq_disabled(&desc->irq_data) && desc->action;
-#else
-        return false;
-#endif
-}
-
-
 /*
  * Recovery handler for misrouted interrupts.
  */

From c609045abc778689ce42e8f5827a84179ace52c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 18 Jul 2025 20:54:10 +0200
Subject: [PATCH 4/5] genirq: Split up irq_pm_check_wakeup()

Let the calling code check for the IRQD_WAKEUP_ARMED flag to prepare
for a live lock mitigation in the edge type handler.

No functional change.

Signed-off-by: Thomas Gleixner
Tested-by: Liangyan
Link: https://lore.kernel.org/all/20250718185312.012392426@linutronix.de
---
 kernel/irq/chip.c      |  4 +++-
 kernel/irq/internals.h |  4 ++--
 kernel/irq/pm.c        | 16 ++++++----------
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 290244ca28dd..11ecf6c78c01 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -488,8 +488,10 @@ static bool irq_can_handle_pm(struct irq_desc *desc)
          * and suspended, disable it and notify the pm core about the
          * event.
          */
-        if (irq_pm_check_wakeup(desc))
+        if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) {
+                irq_pm_handle_wakeup(desc);
                 return false;
+        }
 
         /* Check whether the interrupt is polled on another CPU */
         if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 82b0d67022c4..0164ca48da59 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -277,11 +277,11 @@ static inline bool irq_is_nmi(struct irq_desc *desc)
 }
 
 #ifdef CONFIG_PM_SLEEP
-bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_handle_wakeup(struct irq_desc *desc);
 void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
 void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
 #else
-static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { }
 static inline void
 irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
 static inline void
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 445912d51033..f7394729cedc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -13,17 +13,13 @@
 
 #include "internals.h"
 
-bool irq_pm_check_wakeup(struct irq_desc *desc)
+void irq_pm_handle_wakeup(struct irq_desc *desc)
 {
-        if (irqd_is_wakeup_armed(&desc->irq_data)) {
-                irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
-                desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
-                desc->depth++;
-                irq_disable(desc);
-                pm_system_irq_wakeup(irq_desc_get_irq(desc));
-                return true;
-        }
-        return false;
+        irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+        desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+        desc->depth++;
+        irq_disable(desc);
+        pm_system_irq_wakeup(irq_desc_get_irq(desc));
 }
 
 /*

From 8d39d6ec4db5da9899993092227584a97c203fd3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 18 Jul 2025 20:54:12 +0200
Subject: [PATCH 5/5] genirq: Prevent migration live lock in handle_edge_irq()

Yicong reported and Liangyan debugged a live lock in handle_edge_irq()
related to interrupt migration.

If the interrupt affinity is moved to a new target CPU while an edge
type interrupt is currently handled on the previous target CPU, the
handler might get stuck on the previous target:

    CPU 0 (previous target)        CPU 1 (new target)

    handle_edge_irq()
   repeat:
       handle_event()              handle_edge_irq()
                                   if (INPROGRESS) {
                                       set(PENDING);
                                       mask();
                                       return;
                                   }
       if (PENDING) {
           clear(PENDING);
           unmask();
           goto repeat;
       }

The migration in software never completes and CPU 0 continues to handle
the pending events forever. This happens when the device raises
interrupts at a high rate and always before handle_event() completes
and before the CPU 0 handler can clear INPROGRESS, so that CPU 1 sets
the PENDING flag over and over. This has been observed in virtual
machines.

Prevent this by checking whether the CPU which observes the INPROGRESS
flag is the new affinity target. If that's the case, do not set the
PENDING flag and instead wait for the INPROGRESS flag to be cleared, so
that the new interrupt is handled on the new target CPU and the
previous CPU is released from the action.

This is restricted to the edge type handler and is only utilized on
systems which use single CPU targets for interrupt affinity.

Reported-by: Yicong Shen
Reported-by: Liangyan
Signed-off-by: Thomas Gleixner
Tested-by: Liangyan
Reviewed-by: Jiri Slaby
Link: https://lore.kernel.org/all/20250701163558.2588435-1-liangyan.peng@bytedance.com
Link: https://lore.kernel.org/all/20250718185312.076515034@linutronix.de
---
 kernel/irq/chip.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 11ecf6c78c01..624106e886ad 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -476,11 +476,14 @@ static bool irq_wait_on_inprogress(struct irq_desc *desc)
 
 static bool irq_can_handle_pm(struct irq_desc *desc)
 {
+        struct irq_data *irqd = &desc->irq_data;
+        const struct cpumask *aff;
+
         /*
          * If the interrupt is not in progress and is not an armed
          * wakeup interrupt, proceed.
          */
-        if (!irqd_has_set(&desc->irq_data, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
+        if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
                 return true;
 
         /*
@@ -501,7 +504,41 @@ static bool irq_can_handle_pm(struct irq_desc *desc)
                         return false;
                 return irq_wait_on_inprogress(desc);
         }
-        return false;
+
+        /* The below works only for single target interrupts */
+        if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) ||
+            !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq)
+                return false;
+
+        /*
+         * If the interrupt affinity was moved to this CPU and the
+         * interrupt is currently handled on the previous target CPU, then
+         * busy wait for INPROGRESS to be cleared. Otherwise for edge type
+         * interrupts the handler might get stuck on the previous target:
+         *
+         *    CPU 0                      CPU 1 (new target)
+         *    handle_edge_irq()
+         *   repeat:
+         *       handle_event()          handle_edge_irq()
+         *                               if (INPROGRESS) {
+         *                                   set(PENDING);
+         *                                   mask();
+         *                                   return;
+         *                               }
+         *       if (PENDING) {
+         *           clear(PENDING);
+         *           unmask();
+         *           goto repeat;
+         *       }
+         *
+         * This happens when the device raises interrupts with a high rate
+         * and always before handle_event() completes and the CPU 0 handler
+         * can clear INPROGRESS. This has been observed in virtual machines.
+         */
+        aff = irq_data_get_effective_affinity_mask(irqd);
+        if (cpumask_first(aff) != smp_processor_id())
+                return false;
+        return irq_wait_on_inprogress(desc);
 }
 
 static inline bool irq_can_handle_actions(struct irq_desc *desc)
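
To reproduce the kunit runs quoted in patch 1 on a tree with this
series applied, a small .kunitconfig fragment along the following lines
should be sufficient. This is only a sketch and not part of the series:
the file name 'irq.kunitconfig' is an arbitrary choice made here,
kunit.py merges the fragment on top of its default configuration, and
CONFIG_SMP is pulled in via 'imply SMP' only on architectures that
support it (not ARCH=um).

  $ cat irq.kunitconfig
  CONFIG_KUNIT=y
  CONFIG_IRQ_KUNIT_TEST=y

  $ tools/testing/kunit/kunit.py run 'irq_test_cases*' \
        --kunitconfig=irq.kunitconfig --arch x86_64 --qemu_args '-smp 2'

With fewer than two CPUs (or without CONFIG_SMP), the hotplug and
shutdown-depth cases skip, matching the outputs shown in patch 1.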