From 863a66cdb4df25fd146d9851c3289072298566d5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Mar 2022 09:27:08 +0800 Subject: [PATCH 1/6] lib/sbitmap: allocate sb->map via kvzalloc_node sbitmap has been used in scsi for replacing atomic operations on sdev->device_busy, so IOPS on some fast scsi storage can be improved. However, sdev->device_busy can be changed in fast path, so we have to allocate the sb->map statically. sdev->device_busy has been capped to 1024, but some drivers may configure the default depth as < 8, then cause each sbitmap word to hold only one bit. Finally 1024 * 128( sizeof(sbitmap_word)) bytes is needed for sb->map, given it is order 5 allocation, sometimes it may fail. Avoid the issue by using kvzalloc_node() for allocating sb->map. Cc: Ewan D. Milne Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220316012708.354668-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- lib/sbitmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index dffeb8281c2d..8f5a86e210b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -174,7 +174,7 @@ static inline unsigned int __map_depth(const struct sbitmap *sb, int index) static inline void sbitmap_free(struct sbitmap *sb) { free_percpu(sb->alloc_hint); - kfree(sb->map); + kvfree(sb->map); sb->map = NULL; } diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 2eb3de18ded3..ae4fd4de9ebe 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -110,7 +110,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, sb->alloc_hint = NULL; } - sb->map = kcalloc_node(sb->map_nr, sizeof(*sb->map), flags, node); + sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); if (!sb->map) { free_percpu(sb->alloc_hint); return -ENOMEM; From d578c770c85233af592e54537f93f3831bde7e9a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 23 Mar 2022 09:13:08 +0800 Subject: [PATCH 2/6] block: avoid calling blkg_free() in atomic context blkg_free() can currently be called in atomic context, either spin lock is held, or run in rcu callback. Meantime either request queue's release handler or ->pd_free_fn can sleep. Fix the issue by scheduling a work function for freeing blkcg_gq the instance. [ 148.553894] BUG: sleeping function called from invalid context at block/blk-sysfs.c:767 [ 148.557381] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 0, name: swapper/13 [ 148.560741] preempt_count: 101, expected: 0 [ 148.562577] RCU nest depth: 0, expected: 0 [ 148.564379] 1 lock held by swapper/13/0: [ 148.566127] #0: ffffffff82615f80 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire+0x0/0x1b [ 148.569640] Preemption disabled at: [ 148.569642] [] ___slab_alloc+0x554/0x661 [ 148.573559] CPU: 13 PID: 0 Comm: swapper/13 Kdump: loaded Not tainted 5.17.0_up+ #110 [ 148.576834] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1.fc33 04/01/2014 [ 148.579768] Call Trace: [ 148.580567] [ 148.581262] dump_stack_lvl+0x56/0x7c [ 148.582367] ? ___slab_alloc+0x554/0x661 [ 148.583526] __might_resched+0x1af/0x1c8 [ 148.584678] blk_release_queue+0x24/0x109 [ 148.585861] kobject_cleanup+0xc9/0xfe [ 148.586979] blkg_free+0x46/0x63 [ 148.587962] rcu_do_batch+0x1c5/0x3db [ 148.589057] rcu_core+0x14a/0x184 [ 148.590065] __do_softirq+0x14d/0x2c7 [ 148.591167] __irq_exit_rcu+0x7a/0xd4 [ 148.592264] sysvec_apic_timer_interrupt+0x82/0xa5 [ 148.593649] [ 148.594354] [ 148.595058] asm_sysvec_apic_timer_interrupt+0x12/0x20 Cc: Tejun Heo Fixes: 0a9a25ca7843 ("block: let blkcg_gq grab request queue's refcnt") Reported-by: Christoph Hellwig Link: https://lore.kernel.org/linux-block/20220322093322.GA27283@lst.de/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220323011308.2010380-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 32 ++++++++++++++++++++++---------- include/linux/blk-cgroup.h | 5 ++++- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d53b0d69dd73..6ed43fc0e6ab 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -65,19 +65,12 @@ static bool blkcg_policy_enabled(struct request_queue *q, return pol && test_bit(pol->plid, q->blkcg_pols); } -/** - * blkg_free - free a blkg - * @blkg: blkg to free - * - * Free @blkg which may be partially allocated. - */ -static void blkg_free(struct blkcg_gq *blkg) +static void blkg_free_workfn(struct work_struct *work) { + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + free_work); int i; - if (!blkg) - return; - for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); @@ -89,6 +82,25 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(blkg); } +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. + */ +static void blkg_free(struct blkcg_gq *blkg) +{ + if (!blkg) + return; + + /* + * Both ->pd_free_fn() and request queue's release handler may + * sleep, so free us by scheduling one work func + */ + INIT_WORK(&blkg->free_work, blkg_free_workfn); + schedule_work(&blkg->free_work); +} + static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index f2ad8ed8f777..652cd05b0924 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -95,7 +95,10 @@ struct blkcg_gq { spinlock_t async_bio_lock; struct bio_list async_bios; - struct work_struct async_bio_work; + union { + struct work_struct async_bio_work; + struct work_struct free_work; + }; atomic_t use_delay; atomic64_t delay_nsec; From 15583a563cd5a7358e975599b7de7caacd9e9ce9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 28 Mar 2022 10:59:28 +0200 Subject: [PATCH 3/6] block: restore the old set_task_ioprio() behaviour wrt PF_EXITING PF_EXITING tasks were silently ignored before the below commits. Continue doing so. Otherwise python-psutil tests fail: ERROR: psutil.tests.test_process.TestProcess.test_zombie_process ---------------------------------------------------------------------- Traceback (most recent call last): File "/home/abuild/rpmbuild/BUILD/psutil-5.9.0/build/lib.linux-x86_64-3.9/psutil/_pslinux.py", line 1661, in wrapper return fun(self, *args, **kwargs) File "/home/abuild/rpmbuild/BUILD/psutil-5.9.0/build/lib.linux-x86_64-3.9/psutil/_pslinux.py", line 2133, in ionice_set return cext.proc_ioprio_set(self.pid, ioclass, value) ProcessLookupError: [Errno 3] No such process During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/home/abuild/rpmbuild/BUILD/psutil-5.9.0/psutil/tests/test_process.py", line 1313, in test_zombie_process succeed_or_zombie_p_exc(fun) File "/home/abuild/rpmbuild/BUILD/psutil-5.9.0/psutil/tests/test_process.py", line 1288, in succeed_or_zombie_p_exc return fun() File "/home/abuild/rpmbuild/BUILD/psutil-5.9.0/build/lib.linux-x86_64-3.9/psutil/__init__.py", line 792, in ionice return self._proc.ionice_set(ioclass, value) File "/home/abuild/rpmbuild/BUILD/psutil-5.9.0/build/lib.linux-x86_64-3.9/psutil/_pslinux.py", line 1665, in wrapper raise NoSuchProcess(self.pid, self._name) psutil.NoSuchProcess: process no longer exists (pid=2057) Cc: Christoph Hellwig Cc: Jan Kara Cc: Jens Axboe Fixes: 5fc11eebb4 (block: open code create_task_io_context in set_task_ioprio) Fixes: a957b61254 (block: fix error in handling dead task for ioprio setting) Signed-off-by: Jiri Slaby Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220328085928.7899-1-jslaby@suse.cz Signed-off-by: Jens Axboe --- block/blk-ioc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 11f49f78db32..df9cfe4ca532 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -280,7 +280,6 @@ int set_task_ioprio(struct task_struct *task, int ioprio) task_lock(task); if (task->flags & PF_EXITING) { - err = -ESRCH; kmem_cache_free(iocontext_cachep, ioc); goto out; } @@ -292,7 +291,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) task->io_context->ioprio = ioprio; out: task_unlock(task); - return err; + return 0; } EXPORT_SYMBOL_GPL(set_task_ioprio); From d1868328dec5ae2cf210111025fcbc71f78dd5ca Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 26 Mar 2022 15:50:46 +0100 Subject: [PATCH 4/6] block: Fix the maximum minor value is blk_alloc_ext_minor() ida_alloc_range(..., min, max, ...) returns values from min to max, inclusive. So, NR_EXT_DEVT is a valid idx returned by blk_alloc_ext_minor(). This is an issue because in device_add_disk(), this value is used in: ddev->devt = MKDEV(disk->major, disk->first_minor); and NR_EXT_DEVT is '(1 << MINORBITS)'. So, should 'disk->first_minor' be NR_EXT_DEVT, it would overflow. Fixes: 22ae8ce8b892 ("block: simplify bdev/disk lookup in blkdev_get") Signed-off-by: Christophe JAILLET Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/cc17199798312406b90834e433d2cefe8266823d.1648306232.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 8244f09e058d..19b8a6e7a13b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -335,7 +335,7 @@ int blk_alloc_ext_minor(void) { int idx; - idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); if (idx == -ENOSPC) return -EBUSY; return idx; From 4a3b666e0ea977dd40adb56c37a91370f76fa19e Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 31 Mar 2022 11:12:18 +0200 Subject: [PATCH 5/6] block: use dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Signed-off-by: Jakob Koschel Link: https://lore.kernel.org/r/20220331091218.641532-1-jakobkoschel@gmail.com [axboe: move lookup to where return value is checked] Signed-off-by: Jens Axboe --- block/blk-mq.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 213bb5979bed..176b031510ff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4448,21 +4448,28 @@ static bool blk_mq_elv_switch_none(struct list_head *head, return true; } -static void blk_mq_elv_switch_back(struct list_head *head, - struct request_queue *q) +static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, + struct request_queue *q) { struct blk_mq_qe_pair *qe; - struct elevator_type *t = NULL; list_for_each_entry(qe, head, node) - if (qe->q == q) { - t = qe->type; - break; - } + if (qe->q == q) + return qe; - if (!t) + return NULL; +} + +static void blk_mq_elv_switch_back(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + struct elevator_type *t; + + qe = blk_lookup_qe_pair(head, q); + if (!qe) return; - + t = qe->type; list_del(&qe->node); kfree(qe); From 8d7829ebc1e48208b3c02c2a10c5f8856246033c Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 31 Mar 2022 11:54:58 -0700 Subject: [PATCH 6/6] blk-wbt: remove wbt_track stub cppcheck returns this warning [block/blk-wbt.h:104] -> [block/blk-wbt.c:592]: (warning) Function 'wbt_track' argument order different: declaration 'rq, flags, ' definition 'rqos, rq, bio' In commit c1c80384c8f4 ("block: remove external dependency on wbt_flags") wbt_track was removed for the real declaration, its stub should have been as well. Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20220331185458.3427454-1-trix@redhat.com Signed-off-by: Jens Axboe --- block/blk-wbt.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 2eb01becde8c..7e44eccc676d 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -101,9 +101,6 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline void wbt_track(struct request *rq, enum wbt_flags flags) -{ -} static inline int wbt_init(struct request_queue *q) { return -EINVAL;