diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 78047d0d9882..232baea90a1d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -398,6 +398,13 @@ enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, + + /* + * Per-node default cap on min_active. Unless explicitly set, min_active + * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see + * workqueue_struct->min_active definition. + */ + WQ_DFL_MIN_ACTIVE = 8, }; /* @@ -440,11 +447,33 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags - * @max_active: max in-flight work items per CPU, 0 for default + * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * - * Allocate a workqueue with the specified parameters. For detailed - * information on WQ_* flags, please refer to + * For a per-cpu workqueue, @max_active limits the number of in-flight work + * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be + * executing at most one work item for the workqueue. + * + * For unbound workqueues, @max_active limits the number of in-flight work items + * for the whole system. e.g. @max_active of 16 indicates that that there can be + * at most 16 work items executing for the workqueue in the whole system. + * + * As sharing the same active counter for an unbound workqueue across multiple + * NUMA nodes can be expensive, @max_active is distributed to each NUMA node + * according to the proportion of the number of online CPUs and enforced + * independently. + * + * Depending on online CPU distribution, a node may end up with per-node + * max_active which is significantly lower than @max_active, which can lead to + * deadlocks if the per-node concurrency limit is lower than the maximum number + * of interdependent work items for the workqueue. + * + * To guarantee forward progress regardless of online CPU distribution, the + * concurrency limit on every node is guaranteed to be equal to or greater than + * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means + * that the sum of per-node max_active's may be larger than @max_active. + * + * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8d465478adb9..903be39bd2d1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,6 +126,9 @@ enum wq_internal_consts { * * L: pool->lock protected. Access with pool->lock held. * + * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for + * reads. + * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -247,17 +250,18 @@ struct pool_workqueue { * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * - * All work items marked with WORK_STRUCT_INACTIVE do not participate - * in pwq->nr_active and all work items in pwq->inactive_works are - * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE - * work items are in pwq->inactive_works. Some of them are ready to - * run in pool->worklist or worker->scheduled. Those work itmes are - * only struct wq_barrier which is used for flush_work() and should - * not participate in pwq->nr_active. For non-barrier work item, it - * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + * All work items marked with WORK_STRUCT_INACTIVE do not participate in + * nr_active and all work items in pwq->inactive_works are marked with + * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are + * in pwq->inactive_works. Some of them are ready to run in + * pool->worklist or worker->scheduled. Those work itmes are only struct + * wq_barrier which is used for flush_work() and should not participate + * in nr_active. For non-barrier work item, it is marked with + * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ struct list_head inactive_works; /* L: inactive works */ + struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -289,9 +293,19 @@ struct wq_device; * on each CPU, in an unbound workqueue, max_active applies to the whole system. * As sharing a single nr_active across multiple sockets can be very expensive, * the counting and enforcement is per NUMA node. + * + * The following struct is used to enforce per-node max_active. When a pwq wants + * to start executing a work item, it should increment ->nr using + * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over + * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish + * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in + * round-robin order. */ struct wq_node_nr_active { - atomic_t nr; /* per-node nr_active count */ + int max; /* per-node max_active */ + atomic_t nr; /* per-node nr_active */ + raw_spinlock_t lock; /* nests inside pool locks */ + struct list_head pending_pwqs; /* LN: pwqs with inactive works */ }; /* @@ -314,8 +328,12 @@ struct workqueue_struct { struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ + + /* See alloc_workqueue() function comment for info on min/max_active */ int max_active; /* WO: max active works */ + int min_active; /* WO: min active works */ int saved_max_active; /* WQ: saved max_active */ + int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ @@ -667,6 +685,19 @@ static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) lockdep_is_held(&wq->mutex)); } +/** + * unbound_effective_cpumask - effective cpumask of an unbound workqueue + * @wq: workqueue of interest + * + * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which + * is masked with wq_unbound_cpumask to determine the effective cpumask. The + * default pwq is always mapped to the pool with the current effective cpumask. + */ +static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) +{ + return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1461,6 +1492,46 @@ static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, return wq->node_nr_active[node]; } +/** + * wq_update_node_max_active - Update per-node max_actives to use + * @wq: workqueue to update + * @off_cpu: CPU that's going down, -1 if a CPU is not going down + * + * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is + * distributed among nodes according to the proportions of numbers of online + * cpus. The result is always between @wq->min_active and max_active. + */ +static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) +{ + struct cpumask *effective = unbound_effective_cpumask(wq); + int min_active = READ_ONCE(wq->min_active); + int max_active = READ_ONCE(wq->max_active); + int total_cpus, node; + + lockdep_assert_held(&wq->mutex); + + if (!cpumask_test_cpu(off_cpu, effective)) + off_cpu = -1; + + total_cpus = cpumask_weight_and(effective, cpu_online_mask); + if (off_cpu >= 0) + total_cpus--; + + for_each_node(node) { + int node_cpus; + + node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); + if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) + node_cpus--; + + wq_node_nr_active(wq, node)->max = + clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), + min_active, max_active); + } + + wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active; +} + /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get @@ -1558,35 +1629,98 @@ static bool pwq_activate_work(struct pool_workqueue *pwq, return true; } +static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) +{ + int max = READ_ONCE(nna->max); + + while (true) { + int old, tmp; + + old = atomic_read(&nna->nr); + if (old >= max) + return false; + tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); + if (tmp == old) + return true; + } +} + /** * pwq_tryinc_nr_active - Try to increment nr_active for a pwq * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level * * Try to increment nr_active for @pwq. Returns %true if an nr_active count is * successfully obtained. %false otherwise. */ -static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) { struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); - bool obtained; + bool obtained = false; lockdep_assert_held(&pool->lock); - obtained = pwq->nr_active < READ_ONCE(wq->max_active); - - if (obtained) { - pwq->nr_active++; - if (nna) - atomic_inc(&nna->nr); + if (!nna) { + /* per-cpu workqueue, pwq->nr_active is sufficient */ + obtained = pwq->nr_active < READ_ONCE(wq->max_active); + goto out; } + + /* + * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is + * already waiting on $nna, pwq_dec_nr_active() will maintain the + * concurrency level. Don't jump the line. + * + * We need to ignore the pending test after max_active has increased as + * pwq_dec_nr_active() can only maintain the concurrency level but not + * increase it. This is indicated by @fill. + */ + if (!list_empty(&pwq->pending_node) && likely(!fill)) + goto out; + + obtained = tryinc_node_nr_active(nna); + if (obtained) + goto out; + + /* + * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs + * and try again. The smp_mb() is paired with the implied memory barrier + * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either + * we see the decremented $nna->nr or they see non-empty + * $nna->pending_pwqs. + */ + raw_spin_lock(&nna->lock); + + if (list_empty(&pwq->pending_node)) + list_add_tail(&pwq->pending_node, &nna->pending_pwqs); + else if (likely(!fill)) + goto out_unlock; + + smp_mb(); + + obtained = tryinc_node_nr_active(nna); + + /* + * If @fill, @pwq might have already been pending. Being spuriously + * pending in cold paths doesn't affect anything. Let's leave it be. + */ + if (obtained && likely(!fill)) + list_del_init(&pwq->pending_node); + +out_unlock: + raw_spin_unlock(&nna->lock); +out: + if (obtained) + pwq->nr_active++; return obtained; } /** * pwq_activate_first_inactive - Activate the first inactive work item on a pwq * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level * * Activate the first inactive work item of @pwq if available and allowed by * max_active limit. @@ -1594,13 +1728,13 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq) * Returns %true if an inactive work item has been activated. %false if no * inactive work item is found or max_active limit is reached. */ -static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) { struct work_struct *work = list_first_entry_or_null(&pwq->inactive_works, struct work_struct, entry); - if (work && pwq_tryinc_nr_active(pwq)) { + if (work && pwq_tryinc_nr_active(pwq, fill)) { __pwq_activate_work(pwq, work); return true; } else { @@ -1608,11 +1742,93 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq) } } +/** + * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active + * @nna: wq_node_nr_active to activate a pending pwq for + * @caller_pool: worker_pool the caller is locking + * + * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. + * @caller_pool may be unlocked and relocked to lock other worker_pools. + */ +static void node_activate_pending_pwq(struct wq_node_nr_active *nna, + struct worker_pool *caller_pool) +{ + struct worker_pool *locked_pool = caller_pool; + struct pool_workqueue *pwq; + struct work_struct *work; + + lockdep_assert_held(&caller_pool->lock); + + raw_spin_lock(&nna->lock); +retry: + pwq = list_first_entry_or_null(&nna->pending_pwqs, + struct pool_workqueue, pending_node); + if (!pwq) + goto out_unlock; + + /* + * If @pwq is for a different pool than @locked_pool, we need to lock + * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock + * / lock dance. For that, we also need to release @nna->lock as it's + * nested inside pool locks. + */ + if (pwq->pool != locked_pool) { + raw_spin_unlock(&locked_pool->lock); + locked_pool = pwq->pool; + if (!raw_spin_trylock(&locked_pool->lock)) { + raw_spin_unlock(&nna->lock); + raw_spin_lock(&locked_pool->lock); + raw_spin_lock(&nna->lock); + goto retry; + } + } + + /* + * $pwq may not have any inactive work items due to e.g. cancellations. + * Drop it from pending_pwqs and see if there's another one. + */ + work = list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + if (!work) { + list_del_init(&pwq->pending_node); + goto retry; + } + + /* + * Acquire an nr_active count and activate the inactive work item. If + * $pwq still has inactive work items, rotate it to the end of the + * pending_pwqs so that we round-robin through them. This means that + * inactive work items are not activated in queueing order which is fine + * given that there has never been any ordering across different pwqs. + */ + if (likely(tryinc_node_nr_active(nna))) { + pwq->nr_active++; + __pwq_activate_work(pwq, work); + + if (list_empty(&pwq->inactive_works)) + list_del_init(&pwq->pending_node); + else + list_move_tail(&pwq->pending_node, &nna->pending_pwqs); + + /* if activating a foreign pool, make sure it's running */ + if (pwq->pool != caller_pool) + kick_pool(pwq->pool); + } + +out_unlock: + raw_spin_unlock(&nna->lock); + if (locked_pool != caller_pool) { + raw_spin_unlock(&locked_pool->lock); + raw_spin_lock(&caller_pool->lock); + } +} + /** * pwq_dec_nr_active - Retire an active count * @pwq: pool_workqueue of interest * * Decrement @pwq's nr_active and try to activate the first inactive work item. + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. */ static void pwq_dec_nr_active(struct pool_workqueue *pwq) { @@ -1632,12 +1848,29 @@ static void pwq_dec_nr_active(struct pool_workqueue *pwq) * inactive work item on @pwq itself. */ if (!nna) { - pwq_activate_first_inactive(pwq); + pwq_activate_first_inactive(pwq, false); return; } - atomic_dec(&nna->nr); - pwq_activate_first_inactive(pwq); + /* + * If @pwq is for an unbound workqueue, it's more complicated because + * multiple pwqs and pools may be sharing the nr_active count. When a + * pwq needs to wait for an nr_active count, it puts itself on + * $nna->pending_pwqs. The following atomic_dec_return()'s implied + * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to + * guarantee that either we see non-empty pending_pwqs or they see + * decremented $nna->nr. + * + * $nna->max may change as CPUs come online/offline and @pwq->wq's + * max_active gets updated. However, it is guaranteed to be equal to or + * larger than @pwq->wq->min_active which is above zero unless freezing. + * This maintains the forward progress guarantee. + */ + if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) + return; + + if (!list_empty(&nna->pending_pwqs)) + node_activate_pending_pwq(nna, pool); } /** @@ -1965,7 +2198,7 @@ retry: * @work must also queue behind existing inactive work items to maintain * ordering when max_active changes. See wq_adjust_max_active(). */ - if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq)) { + if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) pool->watchdog_ts = jiffies; @@ -3200,7 +3433,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; - /* The barrier work item does not participate in pwq->nr_active. */ + /* The barrier work item does not participate in nr_active. */ work_flags |= WORK_STRUCT_INACTIVE; /* @@ -4116,6 +4349,8 @@ static void free_node_nr_active(struct wq_node_nr_active **nna_ar) static void init_node_nr_active(struct wq_node_nr_active *nna) { atomic_set(&nna->nr, 0); + raw_spin_lock_init(&nna->lock); + INIT_LIST_HEAD(&nna->pending_pwqs); } /* @@ -4355,6 +4590,15 @@ static void pwq_release_workfn(struct kthread_work *work) mutex_unlock(&wq_pool_mutex); } + if (!list_empty(&pwq->pending_node)) { + struct wq_node_nr_active *nna = + wq_node_nr_active(pwq->wq, pwq->pool->node); + + raw_spin_lock_irq(&nna->lock); + list_del_init(&pwq->pending_node); + raw_spin_unlock_irq(&nna->lock); + } + call_rcu(&pwq->rcu, rcu_free_pwq); /* @@ -4380,6 +4624,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->inactive_works); + INIT_LIST_HEAD(&pwq->pending_node); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); @@ -4587,6 +4832,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) ctx->pwq_tbl[cpu]); ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq); + /* update node_nr_active->max */ + wq_update_node_max_active(ctx->wq, -1); + mutex_unlock(&ctx->wq->mutex); } @@ -4850,24 +5098,35 @@ static int init_rescuer(struct workqueue_struct *wq) static void wq_adjust_max_active(struct workqueue_struct *wq) { bool activated; + int new_max, new_min; lockdep_assert_held(&wq->mutex); if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) { - WRITE_ONCE(wq->max_active, 0); - return; + new_max = 0; + new_min = 0; + } else { + new_max = wq->saved_max_active; + new_min = wq->saved_min_active; } - if (wq->max_active == wq->saved_max_active) + if (wq->max_active == new_max && wq->min_active == new_min) return; /* - * Update @wq->max_active and then kick inactive work items if more + * Update @wq->max/min_active and then kick inactive work items if more * active work items are allowed. This doesn't break work item ordering * because new work items are always queued behind existing inactive * work items if there are any. */ - WRITE_ONCE(wq->max_active, wq->saved_max_active); + WRITE_ONCE(wq->max_active, new_max); + WRITE_ONCE(wq->min_active, new_min); + + if (wq->flags & WQ_UNBOUND) + wq_update_node_max_active(wq, -1); + + if (new_max == 0) + return; /* * Round-robin through pwq's activating the first inactive work item @@ -4882,7 +5141,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) /* can be called during early boot w/ irq disabled */ raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq_activate_first_inactive(pwq)) { + if (pwq_activate_first_inactive(pwq, true)) { activated = true; kick_pool(pwq->pool); } @@ -4944,7 +5203,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, /* init wq */ wq->flags = flags; wq->max_active = max_active; - wq->saved_max_active = max_active; + wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE); + wq->saved_max_active = wq->max_active; + wq->saved_min_active = wq->min_active; mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); @@ -5110,7 +5371,8 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); * @wq: target workqueue * @max_active: new max_active value. * - * Set max_active of @wq to @max_active. + * Set max_active of @wq to @max_active. See the alloc_workqueue() function + * comment. * * CONTEXT: * Don't call from IRQ context. @@ -5127,6 +5389,9 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; + if (wq->flags & WQ_UNBOUND) + wq->saved_min_active = min(wq->saved_min_active, max_active); + wq_adjust_max_active(wq); mutex_unlock(&wq->mutex); @@ -5808,6 +6073,10 @@ int workqueue_online_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, true); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } } @@ -5836,6 +6105,10 @@ int workqueue_offline_cpu(unsigned int cpu) for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, false); + + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, cpu); + mutex_unlock(&wq->mutex); } } mutex_unlock(&wq_pool_mutex); @@ -7127,8 +7400,12 @@ void __init workqueue_init_topology(void) * combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) wq_update_pod(wq, cpu, cpu, true); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq->mutex); + wq_update_node_max_active(wq, -1); + mutex_unlock(&wq->mutex); } }