linux-stable/drivers/block/null_blk/null_blk.h

175 lines
5.6 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BLK_NULL_BLK_H
#define __BLK_NULL_BLK_H
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
#include <linux/configfs.h>
#include <linux/badblocks.h>
#include <linux/fault-inject.h>
null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents the parallel execution on multiple queue of IO requests to different zones. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. >From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information. This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none schedulder is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-11-20 01:55:14 +00:00
#include <linux/spinlock.h>
#include <linux/mutex.h>
struct nullb_cmd {
blk_status_t error;
bool fake_timeout;
struct nullb_queue *nq;
struct hrtimer timer;
};
struct nullb_queue {
struct nullb_device *dev;
unsigned int requeue_selection;
struct list_head poll_list;
spinlock_t poll_lock;
};
null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents the parallel execution on multiple queue of IO requests to different zones. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. >From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information. This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none schedulder is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-11-20 01:55:14 +00:00
struct nullb_zone {
/*
* Zone lock to prevent concurrent modification of a zone write
* pointer position and condition: with memory backing, a write
* command execution may sleep on memory allocation. For this case,
* use mutex as the zone lock. Otherwise, use the spinlock for
* locking the zone.
*/
union {
spinlock_t spinlock;
struct mutex mutex;
};
enum blk_zone_type type;
enum blk_zone_cond cond;
sector_t start;
sector_t wp;
unsigned int len;
unsigned int capacity;
};
struct nullb_device {
struct nullb *nullb;
struct config_group group;
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
struct fault_config timeout_config;
struct fault_config requeue_config;
struct fault_config init_hctx_fault_config;
#endif
struct radix_tree_root data; /* data stored in the disk */
struct radix_tree_root cache; /* disk cache data */
unsigned long flags; /* device flags */
unsigned int curr_cache;
struct badblocks badblocks;
unsigned int nr_zones;
unsigned int nr_zones_imp_open;
unsigned int nr_zones_exp_open;
unsigned int nr_zones_closed;
unsigned int imp_close_zone_no;
null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents the parallel execution on multiple queue of IO requests to different zones. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. >From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information. This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none schedulder is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-11-20 01:55:14 +00:00
struct nullb_zone *zones;
sector_t zone_size_sects;
null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents the parallel execution on multiple queue of IO requests to different zones. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. >From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information. This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none schedulder is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-11-20 01:55:14 +00:00
bool need_zone_res_mgmt;
spinlock_t zone_res_lock;
unsigned long size; /* device size in MB */
unsigned long completion_nsec; /* time in ns to complete a request */
unsigned long cache_size; /* disk cache size in MB */
unsigned long zone_size; /* zone size in MB if device is zoned */
unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int zone_max_open; /* max number of open zones */
unsigned int zone_max_active; /* max number of active zones */
unsigned int submit_queues; /* number of submission queues */
null_blk: Fix handling of submit_queues and poll_queues attributes Commit 0a593fbbc245 ("null_blk: poll queue support") introduced the poll queue feature to null_blk. After this change, null_blk device has both submit queues and poll queues, and null_map_queues() callback maps the both queues for corresponding hardware contexts. The commit also added the device configuration attribute 'poll_queues' in same manner as the existing attribute 'submit_queues'. These attributes allow to modify the numbers of queues. However, when the new values are stored to these attributes, the values are just handled only for the corresponding queue. When number of submit_queue is updated, number of poll_queue is not counted, or vice versa. This caused inconsistent number of queues and queue mapping and resulted in null-ptr-dereference. This failure was observed in blktests block/029 and block/030. To avoid the inconsistency, fix the attribute updates to care both submit_queues and poll_queues. Introduce the helper function nullb_update_nr_hw_queues() to handle stores to the both two attributes. Add poll_queues field to the struct nullb_device to track the number in same manner as submit_queues. Add two more fields prev_submit_queues and prev_poll_queues to keep the previous values before change. In case the block layer failed to update the nr_hw_queues, refer the previous values in null_map_queues() to map queues in same manner as before change. Also add poll_queues value checks in nullb_update_nr_hw_queues() and null_validate_conf(). They ensure the poll_queues value of each device is within the range from 1 to module parameter value of poll_queues. Fixes: 0a593fbbc245 ("null_blk: poll queue support") Reported-by: Yi Zhang <yi.zhang@redhat.com> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Link: https://lore.kernel.org/r/20211029103926.845635-1-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-10-29 10:39:26 +00:00
unsigned int prev_submit_queues; /* number of submission queues before change */
unsigned int poll_queues; /* number of IOPOLL submission queues */
null_blk: Fix handling of submit_queues and poll_queues attributes Commit 0a593fbbc245 ("null_blk: poll queue support") introduced the poll queue feature to null_blk. After this change, null_blk device has both submit queues and poll queues, and null_map_queues() callback maps the both queues for corresponding hardware contexts. The commit also added the device configuration attribute 'poll_queues' in same manner as the existing attribute 'submit_queues'. These attributes allow to modify the numbers of queues. However, when the new values are stored to these attributes, the values are just handled only for the corresponding queue. When number of submit_queue is updated, number of poll_queue is not counted, or vice versa. This caused inconsistent number of queues and queue mapping and resulted in null-ptr-dereference. This failure was observed in blktests block/029 and block/030. To avoid the inconsistency, fix the attribute updates to care both submit_queues and poll_queues. Introduce the helper function nullb_update_nr_hw_queues() to handle stores to the both two attributes. Add poll_queues field to the struct nullb_device to track the number in same manner as submit_queues. Add two more fields prev_submit_queues and prev_poll_queues to keep the previous values before change. In case the block layer failed to update the nr_hw_queues, refer the previous values in null_map_queues() to map queues in same manner as before change. Also add poll_queues value checks in nullb_update_nr_hw_queues() and null_validate_conf(). They ensure the poll_queues value of each device is within the range from 1 to module parameter value of poll_queues. Fixes: 0a593fbbc245 ("null_blk: poll queue support") Reported-by: Yi Zhang <yi.zhang@redhat.com> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Link: https://lore.kernel.org/r/20211029103926.845635-1-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-10-29 10:39:26 +00:00
unsigned int prev_poll_queues; /* number of IOPOLL submission queues before change */
unsigned int home_node; /* home node for the device */
unsigned int queue_mode; /* block interface */
unsigned int blocksize; /* block size */
unsigned int max_sectors; /* Max sectors per command */
unsigned int irqmode; /* IRQ completion handler */
unsigned int hw_queue_depth; /* queue depth */
unsigned int index; /* index of the disk, only valid with a disk */
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
bool blocking; /* blocking blk-mq device */
bool use_per_node_hctx; /* use per-node allocation for hardware context */
bool power; /* power on/off the device */
bool memory_backed; /* if data is stored in memory */
bool discard; /* if support discard */
bool zoned; /* if device is zoned */
bool virt_boundary; /* virtual boundary on/off for the device */
bool no_sched; /* no IO scheduler for the device */
bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */
};
struct nullb {
struct nullb_device *dev;
struct list_head list;
unsigned int index;
struct request_queue *q;
struct gendisk *disk;
struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set;
atomic_long_t cur_bytes;
struct hrtimer bw_timer;
unsigned long cache_flush_pos;
spinlock_t lock;
struct nullb_queue *queues;
char disk_name[DISK_NAME_LEN];
};
blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector,
sector_t nr_sectors);
blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, unsigned int nr_sectors);
#ifdef CONFIG_BLK_DEV_ZONED
int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, sector_t nr_sectors);
size_t null_zone_valid_read_len(struct nullb *nullb,
sector_t sector, unsigned int len);
2022-12-01 06:10:36 +00:00
ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
size_t count, enum blk_zone_cond cond);
#else
static inline int null_init_zoned_dev(struct nullb_device *dev,
struct queue_limits *lim)
{
pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
return -EINVAL;
}
static inline int null_register_zoned_dev(struct nullb *nullb)
{
return -ENODEV;
}
static inline void null_free_zoned_dev(struct nullb_device *dev) {}
static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
enum req_op op, sector_t sector, sector_t nr_sectors)
{
return BLK_STS_NOTSUPP;
}
static inline size_t null_zone_valid_read_len(struct nullb *nullb,
sector_t sector,
unsigned int len)
{
return len;
}
2022-12-01 06:10:36 +00:00
static inline ssize_t zone_cond_store(struct nullb_device *dev,
const char *page, size_t count,
enum blk_zone_cond cond)
{
return -EOPNOTSUPP;
}
#define null_report_zones NULL
#endif /* CONFIG_BLK_DEV_ZONED */
#endif /* __NULL_BLK_H */