net: Extend NAPI threaded polling to allow kthread based busy polling

Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, set NAPI_STATE_THREADED
as well.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is cleared, NAPI_STATE_IN_BUSY_POLL
is cleared as well; napi_complete_done then also clears the
NAPI_STATE_SCHED_THREADED bit, which in turn makes the kthread go to
sleep.
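
The resulting flag choreography, summarized as a standalone sketch
(an illustration for this write-up only, not code from the patch; bit
positions are stand-ins, and plain bit ops stand in for the kernel's
atomic state transitions):

  #include <stdio.h>

  #define BIT(n) (1UL << (n))

  /* Stand-in bit positions; only the relationships matter here. */
  enum {
      NAPI_STATE_IN_BUSY_POLL,       /* do not rearm the interrupt */
      NAPI_STATE_THREADED,           /* poll from a dedicated kthread */
      NAPI_STATE_SCHED_THREADED,     /* kthread currently owns the NAPI */
      NAPI_STATE_THREADED_BUSY_POLL, /* kthread busy polls continuously */
  };

  /* Enabling threaded busy polling sets THREADED as well. */
  static void enable_busy_poll(unsigned long *state)
  {
      *state |= BIT(NAPI_STATE_THREADED) |
                BIT(NAPI_STATE_THREADED_BUSY_POLL);
  }

  /* Clearing THREADED_BUSY_POLL cascades: the kthread drops
   * IN_BUSY_POLL, napi_complete_done clears SCHED_THREADED, and the
   * kthread goes to sleep. The cascade is collapsed into one helper.
   */
  static void disable_busy_poll(unsigned long *state)
  {
      *state &= ~(BIT(NAPI_STATE_THREADED_BUSY_POLL) |
                  BIT(NAPI_STATE_IN_BUSY_POLL) |
                  BIT(NAPI_STATE_SCHED_THREADED));
  }

  int main(void)
  {
      unsigned long state = 0;

      enable_busy_poll(&state);
      printf("want busy poll: %d\n",
             !!(state & BIT(NAPI_STATE_THREADED_BUSY_POLL)));

      disable_busy_poll(&state);
      printf("want busy poll: %d\n",
             !!(state & BIT(NAPI_STATE_THREADED_BUSY_POLL)));
      return 0;
  }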

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml

@@ -88,7 +88,7 @@ definitions:
   -
     name: napi-threaded
     type: enum
-    entries: [disabled, enabled]
+    entries: [disabled, enabled, busy-poll]
 
 attribute-sets:
   -
@@ -291,7 +291,8 @@ attribute-sets:
         name: threaded
         doc: Whether the NAPI is configured to operate in threaded polling
              mode. If this is set to enabled then the NAPI context operates
-             in threaded polling mode.
+             in threaded polling mode. If this is set to busy-poll, then the
+             threaded polling mode also busy polls.
         type: u32
         enum: napi-threaded
       -

diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst

@@ -263,7 +263,9 @@ are not well known).
 Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
 selected sockets or using the global ``net.core.busy_poll`` and
 ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
-also exists.
+also exists. Threaded polling of NAPI also has a mode to busy poll for
+packets (:ref:`threaded busy polling<threaded_busy_poll>`) using the NAPI
+processing kthread.
 
 epoll-based busy polling
 ------------------------
@@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is
 the recommended usage, because otherwise setting ``irq-suspend-timeout``
 might not have any discernible effect.
 
+.. _threaded_busy_poll:
+
+Threaded NAPI busy polling
+--------------------------
+
+Threaded NAPI busy polling extends threaded NAPI and adds support for
+continuous busy polling of the NAPI. This can be useful for forwarding or
+AF_XDP applications.
+
+Threaded NAPI busy polling can be enabled on a per-NIC-queue basis using
+Netlink.
+
+For example, using the following script:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-set \
+        --json='{"id": 66, "threaded": "busy-poll"}'
+
+The kernel will create a kthread that busy polls on this NAPI.
+
+The user may elect to set the CPU affinity of this kthread to an unused CPU
+core to improve how often the NAPI is polled, at the expense of wasted CPU
+cycles. Note that this will keep the CPU core busy with 100% usage.
+
+Once threaded busy polling is enabled for a NAPI, the PID of the kthread can
+be retrieved using Netlink so that the affinity of the kthread can be set up.
+For example, the following script can be used to fetch the PID:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-get --json='{"id": 66}'
+
+This will output something like the following; the pid ``258`` is the PID of
+the kthread that is polling this NAPI:
+
+.. code-block:: bash
+
+  $ {'defer-hard-irqs': 0,
+     'gro-flush-timeout': 0,
+     'id': 66,
+     'ifindex': 2,
+     'irq-suspend-timeout': 0,
+     'pid': 258,
+     'threaded': 'busy-poll'}
+
 .. _threaded:
 
 Threaded NAPI
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

@@ -423,11 +423,12 @@ enum {
 	NAPI_STATE_NPSVC,		/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_LISTED,		/* NAPI added to system lists */
 	NAPI_STATE_NO_BUSY_POLL,	/* Do not add in napi_hash, no busy polling */
-	NAPI_STATE_IN_BUSY_POLL,	/* sk_busy_loop() owns this NAPI */
+	NAPI_STATE_IN_BUSY_POLL,	/* Do not rearm NAPI interrupt */
 	NAPI_STATE_PREFER_BUSY_POLL,	/* prefer busy-polling over softirq processing*/
 	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/
 	NAPI_STATE_SCHED_THREADED,	/* Napi is currently scheduled in threaded mode */
 	NAPI_STATE_HAS_NOTIFIER,	/* Napi has an IRQ notifier */
+	NAPI_STATE_THREADED_BUSY_POLL,	/* The threaded NAPI poller will busy poll */
 };
 
 enum {
@@ -442,6 +443,7 @@ enum {
 	NAPIF_STATE_THREADED		= BIT(NAPI_STATE_THREADED),
 	NAPIF_STATE_SCHED_THREADED	= BIT(NAPI_STATE_SCHED_THREADED),
 	NAPIF_STATE_HAS_NOTIFIER	= BIT(NAPI_STATE_HAS_NOTIFIER),
+	NAPIF_STATE_THREADED_BUSY_POLL	= BIT(NAPI_STATE_THREADED_BUSY_POLL),
 };
 
 enum gro_result {

diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h

@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
 enum netdev_napi_threaded {
 	NETDEV_NAPI_THREADED_DISABLED,
 	NETDEV_NAPI_THREADED_ENABLED,
+	NETDEV_NAPI_THREADED_BUSY_POLL,
 };
 
 enum {

diff --git a/net/core/dev.c b/net/core/dev.c

@@ -7089,7 +7089,8 @@ static void napi_stop_kthread(struct napi_struct *napi)
 	 */
 	if ((val & NAPIF_STATE_SCHED_THREADED) ||
 	    !(val & NAPIF_STATE_SCHED)) {
-		new = val & (~NAPIF_STATE_THREADED);
+		new = val & (~(NAPIF_STATE_THREADED |
+			       NAPIF_STATE_THREADED_BUSY_POLL));
 	} else {
 		msleep(20);
 		continue;
@@ -7113,6 +7114,16 @@ static void napi_stop_kthread(struct napi_struct *napi)
 	napi->thread = NULL;
 }
 
+static void napi_set_threaded_state(struct napi_struct *napi,
+				    enum netdev_napi_threaded threaded_mode)
+{
+	bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+	bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+	assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+	assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
 int napi_set_threaded(struct napi_struct *napi,
 		      enum netdev_napi_threaded threaded)
 {
@@ -7139,7 +7150,7 @@ int napi_set_threaded(struct napi_struct *napi,
 	} else {
 		/* Make sure kthread is created before THREADED bit is set. */
 		smp_mb__before_atomic();
-		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+		napi_set_threaded_state(napi, threaded);
 	}
 
 	return 0;
@@ -7531,7 +7542,9 @@ void napi_disable_locked(struct napi_struct *n)
 		}
 
 		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
-		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+		new &= ~(NAPIF_STATE_THREADED |
+			 NAPIF_STATE_THREADED_BUSY_POLL |
+			 NAPIF_STATE_PREFER_BUSY_POLL);
 	} while (!try_cmpxchg(&n->state, &val, new));
 
 	hrtimer_cancel(&n->timer);
@@ -7743,7 +7756,7 @@ static int napi_thread_wait(struct napi_struct *napi)
 	return -1;
 }
 
-static void napi_threaded_poll_loop(struct napi_struct *napi)
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
 {
 	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
 	struct softnet_data *sd;
@@ -7772,22 +7785,47 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
 		}
 		skb_defer_free_flush();
 		bpf_net_ctx_clear(bpf_net_ctx);
+
+		/* When busy poll is enabled, the old packets are not flushed in
+		 * napi_complete_done. So flush them here.
+		 */
+		if (busy_poll)
+			gro_flush_normal(&napi->gro, HZ >= 1000);
 		local_bh_enable();
 
+		/* Call cond_resched here to avoid watchdog warnings. */
+		if (repoll || busy_poll) {
+			rcu_softirq_qs_periodic(last_qs);
+			cond_resched();
+		}
+
 		if (!repoll)
 			break;
-
-		rcu_softirq_qs_periodic(last_qs);
-		cond_resched();
 	}
 }
 
 static int napi_threaded_poll(void *data)
 {
 	struct napi_struct *napi = data;
+	bool want_busy_poll;
+	bool in_busy_poll;
+	unsigned long val;
 
-	while (!napi_thread_wait(napi))
-		napi_threaded_poll_loop(napi);
+	while (!napi_thread_wait(napi)) {
+		val = READ_ONCE(napi->state);
+
+		want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+		in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
+
+		if (unlikely(val & NAPIF_STATE_DISABLE))
+			want_busy_poll = false;
+
+		if (want_busy_poll != in_busy_poll)
+			assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+				   want_busy_poll);
+
+		napi_threaded_poll_loop(napi, want_busy_poll);
+	}
 
 	return 0;
 }
@@ -13097,7 +13135,7 @@ static void run_backlog_napi(unsigned int cpu)
 {
 	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 
-	napi_threaded_poll_loop(&sd->backlog);
+	napi_threaded_poll_loop(&sd->backlog, false);
 }
 
 static void backlog_napi_setup(unsigned int cpu)

diff --git a/net/core/dev.h b/net/core/dev.h

@@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
 
 static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
 {
+	if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+		return NETDEV_NAPI_THREADED_BUSY_POLL;
+
 	if (test_bit(NAPI_STATE_THREADED, &n->state))
 		return NETDEV_NAPI_THREADED_ENABLED;
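
Ordering matters here: busy-poll mode keeps both THREADED and
THREADED_BUSY_POLL set (see napi_set_threaded_state above), so the
busy-poll bit must be tested first or the mode would read back as plain
enabled. A standalone sketch of the mapping (illustration only; bit
values are stand-ins):

  #include <stdio.h>

  #define THREADED           (1UL << 0)
  #define THREADED_BUSY_POLL (1UL << 1)

  enum mode { MODE_DISABLED, MODE_ENABLED, MODE_BUSY_POLL };

  /* Most specific bit wins, mirroring napi_get_threaded. */
  static enum mode get_mode(unsigned long state)
  {
      if (state & THREADED_BUSY_POLL)
          return MODE_BUSY_POLL;
      if (state & THREADED)
          return MODE_ENABLED;
      return MODE_DISABLED;
  }

  int main(void)
  {
      printf("%d\n", get_mode(THREADED | THREADED_BUSY_POLL)); /* 2 */
      printf("%d\n", get_mode(THREADED));                      /* 1 */
      printf("%d\n", get_mode(0));                             /* 0 */
      return 0;
  }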

diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c

@@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
 	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
 	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
 	[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
-	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1),
+	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
 };
 
 /* NETDEV_CMD_BIND_TX - do */
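
The bound in NLA_POLICY_MAX must track the highest netdev_napi_threaded
value (disabled=0, enabled=1, busy-poll=2). A compile-time guard along
these lines (a sketch for this write-up, not in the patch) would catch a
future enum addition that forgets to widen the policy:

  #include <assert.h>

  enum netdev_napi_threaded {
      NETDEV_NAPI_THREADED_DISABLED,
      NETDEV_NAPI_THREADED_ENABLED,
      NETDEV_NAPI_THREADED_BUSY_POLL,
  };

  /* Keep in sync with NLA_POLICY_MAX(NLA_U32, 2) for NETDEV_A_NAPI_THREADED. */
  static_assert(NETDEV_NAPI_THREADED_BUSY_POLL == 2,
                "netlink policy bound is stale");

  int main(void) { return 0; }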

diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h

@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
 enum netdev_napi_threaded {
	NETDEV_NAPI_THREADED_DISABLED,
	NETDEV_NAPI_THREADED_ENABLED,
+	NETDEV_NAPI_THREADED_BUSY_POLL,
 };
 
 enum {