for-6.9/block-20240310

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmXuFO4QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpq33D/9hyNyBce2A9iyo026eK8EqLDoed6BPzuvB
 kLKj5tsGvX4YlfuswvP86M5dgibTASXclnfUK394TijW/JPOfJ3mNhi9gMnHzRoK
 ZaR1di0Lum56dY1FkpMmWiGmE4fB79PAtXYKtajOkuoIcNzylncEAAACUY4/Ouhg
 Cm+LMg2prcc+m9g8rKDNQ51pUFg4U21KAUTl35XLMUAaQk1ahW3EDEVYhweC/zwE
 V/5hJsv8UY72+oQGY2Dc/YgQk/Zj4ZDh7C+oHR9XeB/ro99kr3/Vopagu0gBMLZi
 Rq6qqz6PVMhVcuz8uN2rsTQKXmXhsBn9/adsl4AKtdxcW5D5moWb5BLq1P0WQylc
 nzMxa1d6cVcTKZpaUQQv3Rj6ZMrLuDwP277UYHfn5x1oPWYRZCG7FtHuOo1gNcpG
 DrSNwVG6BSDcbABqI+MIS2oD1JoUMyevjwT7e2hOXukZhc6GLO5F3ODWE5j3KnCR
 S/aGSAmcdR4fTcgavULqWdQVt7SYl4f1IxT8KrUirJGVhc2LgahaWj69ooklVHoU
 fPDFRiruwJ5YkH4RWCSDm9mi4kAz6eUf+f4yE06wZOFOb2fT8/1ZK2Snpz2KeXuZ
 INO0RejtFzT8L0OUlu7dBmF20y6rgAYt87lR8mIt71yuuATIrVhzlX1VdsvhdrAo
 VLHGV1Ncgw==
 =WlVL
 -----END PGP SIGNATURE-----

Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - MD pull requests via Song:
      - Clean up redundant checks (Yu Kuai)
      - Remove deprecated headers (Marc Zyngier, Song Liu)
      - Concurrency fixes (Li Lingfeng)
      - Memory leak fix (Li Nan)
      - Refactor raid1 read_balance (Yu Kuai, Paul Luse)
      - Clean up and fix for md_ioctl (Li Nan)
      - Other small fixes (Gui-Dong Han, Heming Zhao)
      - MD atomic limits (Christoph)

 - NVMe pull request via Keith:
      - RDMA target enhancements (Max)
      - Fabrics fixes (Max, Guixin, Hannes)
      - Atomic queue_limits usage (Christoph)
      - Const use for class_register (Ricardo)
      - Identification error handling fixes (Shin'ichiro, Keith)

 - Improvement and cleanup for cached request handling (Christoph)

 - Moving towards atomic queue limits. Core changes and driver bits so
   far (Christoph)

 - Fix UAF issues in aoeblk (Chun-Yi)

 - Zoned fix and cleanups (Damien)

 - s390 dasd cleanups and fixes (Jan, Miroslav)

 - Block issue timestamp caching (me)

 - noio scope guarding for zoned IO (Johannes)

 - block/nvme PI improvements (Kanchan)

 - Ability to terminate long running discard loop (Keith)

 - bdev revalidation fix (Li)

 - Get rid of old nr_queues hack for kdump kernels (Ming)

 - Support for async deletion of ublk (Ming)

 - Improve IRQ bio recycling (Pavel)

 - Factor in CPU capacity for remote vs local completion (Qais)

 - Add shared_tags configfs entry for null_blk (Shin'ichiro)

 - Fix for a regression in page refcounts introduced by the folio
   unification (Tony)

 - Misc fixes and cleanups (Arnd, Colin, John, Kunwu, Li, Navid,
   Ricardo, Roman, Tang, Uwe)

* tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux: (221 commits)
  block: partitions: only define function mac_fix_string for CONFIG_PPC_PMAC
  block/swim: Convert to platform remove callback returning void
  cdrom: gdrom: Convert to platform remove callback returning void
  block: remove disk_stack_limits
  md: remove mddev->queue
  md: don't initialize queue limits
  md/raid10: use the atomic queue limit update APIs
  md/raid5: use the atomic queue limit update APIs
  md/raid1: use the atomic queue limit update APIs
  md/raid0: use the atomic queue limit update APIs
  md: add queue limit helpers
  md: add a mddev_is_dm helper
  md: add a mddev_add_trace_msg helper
  md: add a mddev_trace_remap helper
  bcache: move calculation of stripe_size and io_opt into bcache_device_init
  virtio_blk: Do not use disk_set_max_open/active_zones()
  aoe: fix the potential use-after-free problem in aoecmd_cfg_pkts
  block: move capacity validation to blkpg_do_ioctl()
  block: prevent division by zero in blk_rq_stat_sum()
  drbd: atomically update queue limits in drbd_reconsider_queue_parameters
  ...
This commit is contained in:
Linus Torvalds 2024-03-11 11:43:44 -07:00
commit 1ddeeb2a05
138 changed files with 3571 additions and 3298 deletions

View File

@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = {
static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
{ {
struct queue_limits lim = {
.logical_block_size = bsize,
};
struct nfhd_device *dev; struct nfhd_device *dev;
int dev_id = id - NFHD_DEV_OFFSET; int dev_id = id - NFHD_DEV_OFFSET;
int err = -ENOMEM; int err = -ENOMEM;
@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->bsize = bsize; dev->bsize = bsize;
dev->bshift = ffs(bsize) - 10; dev->bshift = ffs(bsize) - 10;
dev->disk = blk_alloc_disk(NUMA_NO_NODE); dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (!dev->disk) if (IS_ERR(dev->disk)) {
err = PTR_ERR(dev->disk);
goto free_dev; goto free_dev;
}
dev->disk->major = major_num; dev->disk->major = major_num;
dev->disk->first_minor = dev_id * 16; dev->disk->first_minor = dev_id * 16;
@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->disk->private_data = dev; dev->disk->private_data = dev;
sprintf(dev->disk->disk_name, "nfhd%u", dev_id); sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
blk_queue_logical_block_size(dev->disk->queue, bsize);
err = add_disk(dev->disk); err = add_disk(dev->disk);
if (err) if (err)
goto out_cleanup_disk; goto out_cleanup_disk;

View File

@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data)
static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_lock);
static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
static int ubd_open(struct gendisk *disk, blk_mode_t mode);
static void ubd_release(struct gendisk *disk);
static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg); unsigned int cmd, unsigned long arg);
static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
static const struct block_device_operations ubd_blops = { static const struct block_device_operations ubd_blops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
.open = ubd_open,
.release = ubd_release,
.ioctl = ubd_ioctl, .ioctl = ubd_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = ubd_getgeo, .getgeo = ubd_getgeo,
}; };
/* Protected by ubd_lock */
static struct gendisk *ubd_gendisk[MAX_DEV];
#ifdef CONFIG_BLK_DEV_UBD_SYNC #ifdef CONFIG_BLK_DEV_UBD_SYNC
#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
.cl = 1 }) .cl = 1 })
@ -155,7 +148,6 @@ struct ubd {
* backing or the cow file. */ * backing or the cow file. */
char *file; char *file;
char *serial; char *serial;
int count;
int fd; int fd;
__u64 size; __u64 size;
struct openflags boot_openflags; struct openflags boot_openflags;
@ -165,7 +157,7 @@ struct ubd {
unsigned no_trim:1; unsigned no_trim:1;
struct cow cow; struct cow cow;
struct platform_device pdev; struct platform_device pdev;
struct request_queue *queue; struct gendisk *disk;
struct blk_mq_tag_set tag_set; struct blk_mq_tag_set tag_set;
spinlock_t lock; spinlock_t lock;
}; };
@ -181,7 +173,6 @@ struct ubd {
#define DEFAULT_UBD { \ #define DEFAULT_UBD { \
.file = NULL, \ .file = NULL, \
.serial = NULL, \ .serial = NULL, \
.count = 0, \
.fd = -1, \ .fd = -1, \
.size = -1, \ .size = -1, \
.boot_openflags = OPEN_FLAGS, \ .boot_openflags = OPEN_FLAGS, \
@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
ubd_dev->fd = fd; ubd_dev->fd = fd;
if(ubd_dev->cow.file != NULL){ if(ubd_dev->cow.file != NULL){
blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
err = -ENOMEM; err = -ENOMEM;
ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
if(ubd_dev->cow.bitmap == NULL){ if(ubd_dev->cow.bitmap == NULL){
@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
if(err < 0) goto error; if(err < 0) goto error;
ubd_dev->cow.fd = err; ubd_dev->cow.fd = err;
} }
if (ubd_dev->no_trim == 0) {
blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
return 0; return 0;
error: error:
os_close_file(ubd_dev->fd); os_close_file(ubd_dev->fd);
@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = {
NULL, NULL,
}; };
static int ubd_disk_register(int major, u64 size, int unit,
struct gendisk *disk)
{
disk->major = major;
disk->first_minor = unit << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
disk->fops = &ubd_blops;
set_capacity(disk, size / 512);
sprintf(disk->disk_name, "ubd%c", 'a' + unit);
ubd_devs[unit].pdev.id = unit;
ubd_devs[unit].pdev.name = DRIVER_NAME;
ubd_devs[unit].pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
platform_device_register(&ubd_devs[unit].pdev);
disk->private_data = &ubd_devs[unit];
disk->queue = ubd_devs[unit].queue;
return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
}
#define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
static const struct blk_mq_ops ubd_mq_ops = { static const struct blk_mq_ops ubd_mq_ops = {
@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = {
static int ubd_add(int n, char **error_out) static int ubd_add(int n, char **error_out)
{ {
struct ubd *ubd_dev = &ubd_devs[n]; struct ubd *ubd_dev = &ubd_devs[n];
struct queue_limits lim = {
.max_segments = MAX_SG,
.seg_boundary_mask = PAGE_SIZE - 1,
};
struct gendisk *disk; struct gendisk *disk;
int err = 0; int err = 0;
if(ubd_dev->file == NULL) if(ubd_dev->file == NULL)
goto out; goto out;
if (ubd_dev->cow.file)
lim.max_hw_sectors = 8 * sizeof(long);
if (!ubd_dev->no_trim) {
lim.max_hw_discard_sectors = UBD_MAX_REQUEST;
lim.max_write_zeroes_sectors = UBD_MAX_REQUEST;
}
err = ubd_file_size(ubd_dev, &ubd_dev->size); err = ubd_file_size(ubd_dev, &ubd_dev->size);
if(err < 0){ if(err < 0){
*error_out = "Couldn't determine size of device's file"; *error_out = "Couldn't determine size of device's file";
goto out; goto out;
} }
err = ubd_open_dev(ubd_dev);
if (err) {
pr_err("ubd%c: Can't open \"%s\": errno = %d\n",
'a' + n, ubd_dev->file, -err);
goto out;
}
ubd_dev->size = ROUND_BLOCK(ubd_dev->size); ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
ubd_dev->tag_set.ops = &ubd_mq_ops; ubd_dev->tag_set.ops = &ubd_mq_ops;
@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out)
err = blk_mq_alloc_tag_set(&ubd_dev->tag_set); err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
if (err) if (err)
goto out; goto out_close;
disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev);
if (IS_ERR(disk)) { if (IS_ERR(disk)) {
err = PTR_ERR(disk); err = PTR_ERR(disk);
goto out_cleanup_tags; goto out_cleanup_tags;
} }
ubd_dev->queue = disk->queue;
blk_queue_write_cache(ubd_dev->queue, true, false); blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_max_segments(ubd_dev->queue, MAX_SG); blk_queue_write_cache(disk->queue, true, false);
blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); disk->major = UBD_MAJOR;
err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); disk->first_minor = n << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
disk->fops = &ubd_blops;
set_capacity(disk, ubd_dev->size / 512);
sprintf(disk->disk_name, "ubd%c", 'a' + n);
disk->private_data = ubd_dev;
set_disk_ro(disk, !ubd_dev->openflags.w);
ubd_dev->pdev.id = n;
ubd_dev->pdev.name = DRIVER_NAME;
ubd_dev->pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev);
platform_device_register(&ubd_dev->pdev);
err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups);
if (err) if (err)
goto out_cleanup_disk; goto out_cleanup_disk;
ubd_gendisk[n] = disk;
return 0; return 0;
out_cleanup_disk: out_cleanup_disk:
put_disk(disk); put_disk(disk);
out_cleanup_tags: out_cleanup_tags:
blk_mq_free_tag_set(&ubd_dev->tag_set); blk_mq_free_tag_set(&ubd_dev->tag_set);
out_close:
ubd_close_dev(ubd_dev);
out: out:
return err; return err;
} }
@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out)
static int ubd_remove(int n, char **error_out) static int ubd_remove(int n, char **error_out)
{ {
struct gendisk *disk = ubd_gendisk[n];
struct ubd *ubd_dev; struct ubd *ubd_dev;
int err = -ENODEV; int err = -ENODEV;
@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out)
if(ubd_dev->file == NULL) if(ubd_dev->file == NULL)
goto out; goto out;
if (ubd_dev->disk) {
/* you cannot remove a open disk */ /* you cannot remove a open disk */
err = -EBUSY; err = -EBUSY;
if(ubd_dev->count > 0) if (disk_openers(ubd_dev->disk))
goto out; goto out;
ubd_gendisk[n] = NULL; del_gendisk(ubd_dev->disk);
if(disk != NULL){ ubd_close_dev(ubd_dev);
del_gendisk(disk); put_disk(ubd_dev->disk);
put_disk(disk);
} }
err = 0; err = 0;
@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){
device_initcall(ubd_driver_init); device_initcall(ubd_driver_init);
static int ubd_open(struct gendisk *disk, blk_mode_t mode)
{
struct ubd *ubd_dev = disk->private_data;
int err = 0;
mutex_lock(&ubd_mutex);
if(ubd_dev->count == 0){
err = ubd_open_dev(ubd_dev);
if(err){
printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
disk->disk_name, ubd_dev->file, -err);
goto out;
}
}
ubd_dev->count++;
set_disk_ro(disk, !ubd_dev->openflags.w);
out:
mutex_unlock(&ubd_mutex);
return err;
}
static void ubd_release(struct gendisk *disk)
{
struct ubd *ubd_dev = disk->private_data;
mutex_lock(&ubd_mutex);
if(--ubd_dev->count == 0)
ubd_close_dev(ubd_dev);
mutex_unlock(&ubd_mutex);
}
static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
__u64 *cow_offset, unsigned long *bitmap, __u64 *cow_offset, unsigned long *bitmap,
__u64 bitmap_offset, unsigned long *bitmap_words, __u64 bitmap_offset, unsigned long *bitmap_words,

View File

@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
struct proc_dir_entry *procdir) struct proc_dir_entry *procdir)
{ {
char tmp[2] = { '0' + which, 0 }; char tmp[2] = { '0' + which, 0 };
int err = -ENOMEM; int err;
dev->fd = -1; dev->fd = -1;
dev->filename = NULL; dev->filename = NULL;
spin_lock_init(&dev->lock); spin_lock_init(&dev->lock);
dev->users = 0; dev->users = 0;
dev->gd = blk_alloc_disk(NUMA_NO_NODE); dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (!dev->gd) if (IS_ERR(dev->gd)) {
err = PTR_ERR(dev->gd);
goto out; goto out;
}
dev->gd->major = simdisk_major; dev->gd->major = simdisk_major;
dev->gd->first_minor = which; dev->gd->first_minor = which;
dev->gd->minors = SIMDISK_MINORS; dev->gd->minors = SIMDISK_MINORS;

View File

@ -383,7 +383,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), SLAB_ACCOUNT|SLAB_PANIC),
init_once); init_once);
err = register_filesystem(&bd_type); err = register_filesystem(&bd_type);
if (err) if (err)

View File

@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
if (!bfqg_stats_waiting(stats)) if (!bfqg_stats_waiting(stats))
return; return;
now = ktime_get_ns(); now = blk_time_get_ns();
if (now > stats->start_group_wait_time) if (now > stats->start_group_wait_time)
bfq_stat_add(&stats->group_wait_time, bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time); now - stats->start_group_wait_time);
@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
return; return;
if (bfqg == curr_bfqg) if (bfqg == curr_bfqg)
return; return;
stats->start_group_wait_time = ktime_get_ns(); stats->start_group_wait_time = blk_time_get_ns();
bfqg_stats_mark_waiting(stats); bfqg_stats_mark_waiting(stats);
} }
@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
if (!bfqg_stats_empty(stats)) if (!bfqg_stats_empty(stats))
return; return;
now = ktime_get_ns(); now = blk_time_get_ns();
if (now > stats->start_empty_time) if (now > stats->start_empty_time)
bfq_stat_add(&stats->empty_time, bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time); now - stats->start_empty_time);
@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
if (bfqg_stats_empty(stats)) if (bfqg_stats_empty(stats))
return; return;
stats->start_empty_time = ktime_get_ns(); stats->start_empty_time = blk_time_get_ns();
bfqg_stats_mark_empty(stats); bfqg_stats_mark_empty(stats);
} }
@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
struct bfqg_stats *stats = &bfqg->stats; struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) { if (bfqg_stats_idling(stats)) {
u64 now = ktime_get_ns(); u64 now = blk_time_get_ns();
if (now > stats->start_idle_time) if (now > stats->start_idle_time)
bfq_stat_add(&stats->idle_time, bfq_stat_add(&stats->idle_time,
@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{ {
struct bfqg_stats *stats = &bfqg->stats; struct bfqg_stats *stats = &bfqg->stats;
stats->start_idle_time = ktime_get_ns(); stats->start_idle_time = blk_time_get_ns();
bfqg_stats_mark_idling(stats); bfqg_stats_mark_idling(stats);
} }
@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf) u64 io_start_time_ns, blk_opf_t opf)
{ {
struct bfqg_stats *stats = &bfqg->stats; struct bfqg_stats *stats = &bfqg->stats;
u64 now = ktime_get_ns(); u64 now = blk_time_get_ns();
if (now > io_start_time_ns) if (now > io_start_time_ns)
blkg_rwstat_add(&stats->service_time, opf, blkg_rwstat_add(&stats->service_time, opf,

View File

@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
rq = rq_entry_fifo(bfqq->fifo.next); rq = rq_entry_fifo(bfqq->fifo.next);
if (rq == last || ktime_get_ns() < rq->fifo_time) if (rq == last || blk_time_get_ns() < rq->fifo_time)
return NULL; return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* bfq_bfqq_update_budg_for_activation for * bfq_bfqq_update_budg_for_activation for
* details on the usage of the next variable. * details on the usage of the next variable.
*/ */
arrived_in_time = ktime_get_ns() <= arrived_in_time = blk_time_get_ns() <=
bfqq->ttime.last_end_request + bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3; bfqd->bfq_slice_idle * 3;
unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq)
struct request *next_rq, *prev; struct request *next_rq, *prev;
unsigned int old_wr_coeff = bfqq->wr_coeff; unsigned int old_wr_coeff = bfqq->wr_coeff;
bool interactive = false; bool interactive = false;
u64 now_ns = ktime_get_ns(); u64 now_ns = blk_time_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++; bfqq->queued[rq_is_sync(rq)]++;
@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq)
bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif + time_is_before_eq_jiffies(bfqq->decrease_time_jif +
msecs_to_jiffies(10))) { msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns(); bfqd->last_empty_occupied_ns = blk_time_get_ns();
/* /*
* Start the state machine for measuring the * Start the state machine for measuring the
* total service time of rq: setting * total service time of rq: setting
@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd,
else else
timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
bfqd->last_budget_start = ktime_get(); bfqd->last_budget_start = blk_time_get();
bfqq->budget_timeout = jiffies + bfqq->budget_timeout = jiffies +
bfqd->bfq_timeout * timeout_coeff; bfqd->bfq_timeout * timeout_coeff;
@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
else if (bfqq->wr_coeff > 1) else if (bfqq->wr_coeff > 1)
sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
bfqd->last_idling_start = ktime_get(); bfqd->last_idling_start = blk_time_get();
bfqd->last_idling_start_jiffies = jiffies; bfqd->last_idling_start_jiffies = jiffies;
hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
struct request *rq) struct request *rq)
{ {
if (rq != NULL) { /* new rq dispatch now, reset accordingly */ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns();
bfqd->peak_rate_samples = 1; bfqd->peak_rate_samples = 1;
bfqd->sequential_samples = 0; bfqd->sequential_samples = 0;
bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
@ -3590,7 +3590,7 @@ reset_computation:
*/ */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{ {
u64 now_ns = ktime_get_ns(); u64 now_ns = blk_time_get_ns();
if (bfqd->peak_rate_samples == 0) { /* first dispatch */ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (compensate) if (compensate)
delta_ktime = bfqd->last_idling_start; delta_ktime = bfqd->last_idling_start;
else else
delta_ktime = ktime_get(); delta_ktime = blk_time_get();
delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
delta_usecs = ktime_to_us(delta_ktime); delta_usecs = ktime_to_us(delta_ktime);
@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_io_cq *bic, pid_t pid, int is_sync, struct bfq_io_cq *bic, pid_t pid, int is_sync,
unsigned int act_idx) unsigned int act_idx)
{ {
u64 now_ns = ktime_get_ns(); u64 now_ns = blk_time_get_ns();
bfqq->actuator_idx = act_idx; bfqq->actuator_idx = act_idx;
RB_CLEAR_NODE(&bfqq->entity.rb_node); RB_CLEAR_NODE(&bfqq->entity.rb_node);
@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
*/ */
if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
return; return;
elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfq_add_request(rq); bfq_add_request(rq);
idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo); list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq); bfq_rq_enqueued(bfqd, bfqq, rq);
@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_weights_tree_remove(bfqq); bfq_weights_tree_remove(bfqq);
} }
now_ns = ktime_get_ns(); now_ns = blk_time_get_ns();
bfqq->ttime.last_end_request = now_ns; bfqq->ttime.last_end_request = now_ns;
@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_update_inject_limit(struct bfq_data *bfqd, static void bfq_update_inject_limit(struct bfq_data *bfqd,
struct bfq_queue *bfqq) struct bfq_queue *bfqq)
{ {
u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns;
unsigned int old_limit = bfqq->inject_limit; unsigned int old_limit = bfqq->inject_limit;
if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {

View File

@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.tuple_size = bi->tuple_size; iter.tuple_size = bi->tuple_size;
iter.seed = proc_iter->bi_sector; iter.seed = proc_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec); iter.prot_buf = bvec_virt(bip->bip_vec);
iter.pi_offset = bi->pi_offset;
__bio_for_each_segment(bv, bio, bviter, *proc_iter) { __bio_for_each_segment(bv, bio, bviter, *proc_iter) {
void *kaddr = bvec_kmap_local(&bv); void *kaddr = bvec_kmap_local(&bv);

View File

@ -16,7 +16,6 @@
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
#include <linux/xarray.h> #include <linux/xarray.h>
@ -763,29 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio)
struct bio_alloc_cache *cache; struct bio_alloc_cache *cache;
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
put_cpu(); goto out_free;
bio_free(bio);
return;
}
if (in_task()) {
bio_uninit(bio); bio_uninit(bio);
if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
bio->bi_next = cache->free_list; bio->bi_next = cache->free_list;
/* Not necessary but helps not to iopoll already freed bios */
bio->bi_bdev = NULL; bio->bi_bdev = NULL;
cache->free_list = bio; cache->free_list = bio;
cache->nr++; cache->nr++;
} else { } else if (in_hardirq()) {
unsigned long flags; lockdep_assert_irqs_disabled();
local_irq_save(flags); bio_uninit(bio);
bio->bi_next = cache->free_list_irq; bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio; cache->free_list_irq = bio;
cache->nr_irq++; cache->nr_irq++;
local_irq_restore(flags); } else {
goto out_free;
} }
put_cpu(); put_cpu();
return;
out_free:
put_cpu();
bio_free(bio);
} }
/** /**
@ -1154,7 +1155,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_folio_all(fi, bio) { bio_for_each_folio_all(fi, bio) {
struct page *page; struct page *page;
size_t done = 0; size_t nr_pages;
if (mark_dirty) { if (mark_dirty) {
folio_lock(fi.folio); folio_lock(fi.folio);
@ -1162,10 +1163,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_unlock(fi.folio); folio_unlock(fi.folio);
} }
page = folio_page(fi.folio, fi.offset / PAGE_SIZE); page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
fi.offset / PAGE_SIZE + 1;
do { do {
bio_release_page(bio, page++); bio_release_page(bio, page++);
done += PAGE_SIZE; } while (--nr_pages != 0);
} while (done < fi.length);
} }
} }
EXPORT_SYMBOL_GPL(__bio_release_pages); EXPORT_SYMBOL_GPL(__bio_release_pages);
@ -1371,21 +1373,12 @@ int submit_bio_wait(struct bio *bio)
{ {
DECLARE_COMPLETION_ONSTACK_MAP(done, DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map); bio->bi_bdev->bd_disk->lockdep_map);
unsigned long hang_check;
bio->bi_private = &done; bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio; bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC; bio->bi_opf |= REQ_SYNC;
submit_bio(bio); submit_bio(bio);
blk_wait_io(&done);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status); return blk_status_to_errno(bio->bi_status);
} }

View File

@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{ {
unsigned long pflags; unsigned long pflags;
bool clamp; bool clamp;
u64 now = ktime_to_ns(ktime_get()); u64 now = blk_time_get_ns();
u64 exp; u64 exp;
u64 delay_nsec = 0; u64 delay_nsec = 0;
int tok; int tok;

View File

@ -19,6 +19,7 @@
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/llist.h> #include <linux/llist.h>
#include "blk.h"
struct blkcg_gq; struct blkcg_gq;
struct blkg_policy_data; struct blkg_policy_data;

View File

@ -394,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work)
{ {
} }
struct request_queue *blk_alloc_queue(int node_id) struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{ {
struct request_queue *q; struct request_queue *q;
int error;
q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
node_id); node_id);
if (!q) if (!q)
return NULL; return ERR_PTR(-ENOMEM);
q->last_merge = NULL; q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0) if (q->id < 0) {
error = q->id;
goto fail_q; goto fail_q;
}
q->stats = blk_alloc_queue_stats(); q->stats = blk_alloc_queue_stats();
if (!q->stats) if (!q->stats) {
error = -ENOMEM;
goto fail_id; goto fail_id;
}
error = blk_set_default_limits(lim);
if (error)
goto fail_stats;
q->limits = *lim;
q->node = node_id; q->node = node_id;
@ -425,6 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id)
mutex_init(&q->debugfs_mutex); mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock); mutex_init(&q->sysfs_dir_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex); mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock); spin_lock_init(&q->queue_lock);
@ -435,12 +446,12 @@ struct request_queue *blk_alloc_queue(int node_id)
* Init percpu_ref in atomic mode so that it's faster to shutdown. * Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details. * See blk_register_queue() for details.
*/ */
if (percpu_ref_init(&q->q_usage_counter, error = percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release, blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
if (error)
goto fail_stats; goto fail_stats;
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ; q->nr_requests = BLKDEV_DEFAULT_RQ;
return q; return q;
@ -451,7 +462,7 @@ fail_id:
ida_free(&blk_queue_ida, q->id); ida_free(&blk_queue_ida, q->id);
fail_q: fail_q:
kmem_cache_free(blk_requestq_cachep, q); kmem_cache_free(blk_requestq_cachep, q);
return NULL; return ERR_PTR(error);
} }
/** /**
@ -1083,6 +1094,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
if (tsk->plug) if (tsk->plug)
return; return;
plug->cur_ktime = 0;
plug->mq_list = NULL; plug->mq_list = NULL;
plug->cached_rq = NULL; plug->cached_rq = NULL;
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
@ -1182,6 +1194,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
*/ */
if (unlikely(!rq_list_empty(plug->cached_rq))) if (unlikely(!rq_list_empty(plug->cached_rq)))
blk_mq_free_plug_rqs(plug); blk_mq_free_plug_rqs(plug);
current->flags &= ~PF_BLOCK_TS;
} }
/** /**
@ -1229,8 +1243,7 @@ int __init blk_dev_init(void)
if (!kblockd_workqueue) if (!kblockd_workqueue)
panic("Failed to create kblockd\n"); panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("request_queue", blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
blk_debugfs_root = debugfs_create_dir("block", NULL); blk_debugfs_root = debugfs_create_dir("block", NULL);

View File

@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq)
part_stat_lock(); part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]); part_stat_inc(part, ios[STAT_FLUSH]);
part_stat_add(part, nsecs[STAT_FLUSH], part_stat_add(part, nsecs[STAT_FLUSH],
ktime_get_ns() - rq->start_time_ns); blk_time_get_ns() - rq->start_time_ns);
part_stat_unlock(); part_stat_unlock();
} }

View File

@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->profile = template->profile ? template->profile : &nop_profile; bi->profile = template->profile ? template->profile : &nop_profile;
bi->tuple_size = template->tuple_size; bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size; bi->tag_size = template->tag_size;
bi->pi_offset = template->pi_offset;
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);

View File

@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
/* step up/down based on the vrate */ /* step up/down based on the vrate */
vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
now_ns = ktime_get_ns(); now_ns = blk_time_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
if (!ioc->autop_too_fast_at) if (!ioc->autop_too_fast_at)
@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
unsigned seq; unsigned seq;
u64 vrate; u64 vrate;
now->now_ns = ktime_get(); now->now_ns = blk_time_get_ns();
now->now = ktime_to_us(now->now_ns); now->now = ktime_to_us(now->now_ns);
vrate = atomic64_read(&ioc->vtime_rate); vrate = atomic64_read(&ioc->vtime_rate);
@ -2817,7 +2817,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
return; return;
} }
on_q_ns = ktime_get_ns() - rq->alloc_time_ns; on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
@ -2900,7 +2900,7 @@ static int blk_iocost_init(struct gendisk *disk)
ioc->vtime_base_rate = VTIME_PER_USEC; ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
ioc->period_at = ktime_to_us(ktime_get()); ioc->period_at = ktime_to_us(blk_time_get());
atomic64_set(&ioc->cur_period, 0); atomic64_set(&ioc->cur_period, 0);
atomic_set(&ioc->hweight_gen, 0); atomic_set(&ioc->hweight_gen, 0);

View File

@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
if (!iolat->blkiolat->enabled) if (!iolat->blkiolat->enabled)
return; return;
now = ktime_to_ns(ktime_get()); now = blk_time_get_ns();
while (blkg && blkg->parent) { while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg); iolat = blkg_to_lat(blkg);
if (!iolat) { if (!iolat) {
@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
struct cgroup_subsys_state *pos_css; struct cgroup_subsys_state *pos_css;
u64 now = ktime_to_ns(ktime_get()); u64 now = blk_time_get_ns();
rcu_read_lock(); rcu_read_lock();
blkg_for_each_descendant_pre(blkg, pos_css, blkg_for_each_descendant_pre(blkg, pos_css,
@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
struct blkcg_gq *blkg = lat_to_blkg(iolat); struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct rq_qos *rqos = iolat_rq_qos(blkg->q);
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
u64 now = ktime_to_ns(ktime_get()); u64 now = blk_time_get_ns();
int cpu; int cpu;
if (blk_queue_nonrot(blkg->q)) if (blk_queue_nonrot(blkg->q))

View File

@ -35,6 +35,26 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
} }
/*
 * End-I/O callback used by await_bio_chain(): signal the completion that
 * was stashed in ->bi_private, then call bio_put() on @bio.
 */
static void await_bio_endio(struct bio *bio)
{
	struct completion *waiter = bio->bi_private;

	complete(waiter);
	bio_put(bio);
}
/*
 * await_bio_chain - end @bio and wait for its end_io callback to run
 *
 * Hooks an on-stack completion up to @bio, calls bio_endio() on it and then
 * waits in blk_wait_io() until await_bio_endio() has signalled the
 * completion.
 *
 * NOTE(review): presumably the callback only runs once every bio chained to
 * @bio has completed; that depends on bio_endio() - confirm there.
 */
static void await_bio_chain(struct bio *bio)
{
	DECLARE_COMPLETION_ONSTACK_MAP(done,
			bio->bi_bdev->bd_disk->lockdep_map);

	/* Route the final completion of @bio to the waiter below. */
	bio->bi_end_io = await_bio_endio;
	bio->bi_private = &done;

	bio_endio(bio);
	blk_wait_io(&done);
}
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
{ {
@ -77,6 +97,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
* is disabled. * is disabled.
*/ */
cond_resched(); cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
} }
*biop = bio; *biop = bio;
@ -120,32 +144,33 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
struct bio **biop, unsigned flags) struct bio **biop, unsigned flags)
{ {
struct bio *bio = *biop; struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors; unsigned int max_sectors;
if (bdev_read_only(bdev)) if (bdev_read_only(bdev))
return -EPERM; return -EPERM;
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ /* Ensure that max_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); max_sectors = bdev_write_zeroes_sectors(bdev);
if (max_write_zeroes_sectors == 0) if (max_sectors == 0)
return -EOPNOTSUPP; return -EOPNOTSUPP;
while (nr_sects) { while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_sector = sector;
if (flags & BLKDEV_ZERO_NOUNMAP) if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP; bio->bi_opf |= REQ_NOUNMAP;
if (nr_sects > max_write_zeroes_sectors) { bio->bi_iter.bi_size = len << SECTOR_SHIFT;
bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; nr_sects -= len;
nr_sects -= max_write_zeroes_sectors; sector += len;
sector += max_write_zeroes_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
cond_resched(); cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
} }
*biop = bio; *biop = bio;
@ -190,6 +215,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
break; break;
} }
cond_resched(); cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
} }
*biop = bio; *biop = bio;
@ -280,7 +309,7 @@ retry:
bio_put(bio); bio_put(bio);
} }
blk_finish_plug(&plug); blk_finish_plug(&plug);
if (ret && try_write_zeroes) { if (ret && ret != -EINTR && try_write_zeroes) {
if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
try_write_zeroes = false; try_write_zeroes = false;
goto retry; goto retry;
@ -322,7 +351,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
return -EPERM; return -EPERM;
blk_start_plug(&plug); blk_start_plug(&plug);
for (;;) { while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors); unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp);
@ -331,12 +360,17 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector += len; sector += len;
nr_sects -= len; nr_sects -= len;
if (!nr_sects) { cond_resched();
ret = submit_bio_wait(bio); if (fatal_signal_pending(current)) {
bio_put(bio); await_bio_chain(bio);
ret = -EINTR;
bio = NULL;
break; break;
} }
cond_resched(); }
if (bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
} }
blk_finish_plug(&plug); blk_finish_plug(&plug);

View File

@ -21,7 +21,6 @@
#include <linux/llist.h> #include <linux/llist.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/cache.h> #include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h> #include <linux/sched/topology.h>
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <linux/delay.h> #include <linux/delay.h>
@ -322,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
RB_CLEAR_NODE(&rq->rb_node); RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG; rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns(); rq->start_time_ns = blk_time_get_ns();
rq->part = NULL; rq->part = NULL;
blk_crypto_rq_set_defaults(rq); blk_crypto_rq_set_defaults(rq);
} }
@ -332,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init);
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{ {
if (blk_mq_need_time_stamp(rq)) if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns(); rq->start_time_ns = blk_time_get_ns();
else else
rq->start_time_ns = 0; rq->start_time_ns = 0;
@ -443,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
/* alloc_time includes depth and tag waits */ /* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q)) if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns(); alloc_time_ns = blk_time_get_ns();
if (data->cmd_flags & REQ_NOWAIT) if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT; data->flags |= BLK_MQ_REQ_NOWAIT;
@ -628,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
/* alloc_time includes depth and tag waits */ /* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q)) if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns(); alloc_time_ns = blk_time_get_ns();
/* /*
* If the tag allocator sleeps we could get an allocation for a * If the tag allocator sleeps we could get an allocation for a
@ -1041,7 +1040,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
inline void __blk_mq_end_request(struct request *rq, blk_status_t error) inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{ {
if (blk_mq_need_time_stamp(rq)) if (blk_mq_need_time_stamp(rq))
__blk_mq_end_request_acct(rq, ktime_get_ns()); __blk_mq_end_request_acct(rq, blk_time_get_ns());
blk_mq_finish_request(rq); blk_mq_finish_request(rq);
@ -1084,7 +1083,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
u64 now = 0; u64 now = 0;
if (iob->need_ts) if (iob->need_ts)
now = ktime_get_ns(); now = blk_time_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) { while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
prefetch(rq->bio); prefetch(rq->bio);
@ -1167,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
if (force_irqthreads()) if (force_irqthreads())
return false; return false;
/* same CPU or cache domain? Complete locally */ /* same CPU or cache domain and capacity? Complete locally */
if (cpu == rq->mq_ctx->cpu || if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu))) cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
return false; return false;
/* don't try to IPI to an offline CPU */ /* don't try to IPI to an offline CPU */
@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq)
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
!blk_rq_is_passthrough(rq)) { !blk_rq_is_passthrough(rq)) {
rq->io_start_time_ns = ktime_get_ns(); rq->io_start_time_ns = blk_time_get_ns();
rq->stats_sectors = blk_rq_sectors(rq); rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS; rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq); rq_qos_issue(q, rq);
@ -1409,22 +1409,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
blk_mq_run_hw_queue(hctx, false); blk_mq_run_hw_queue(hctx, false);
if (blk_rq_is_poll(rq)) { if (blk_rq_is_poll(rq))
blk_rq_poll_completion(rq, &wait.done); blk_rq_poll_completion(rq, &wait.done);
} else {
/*
* Prevent hang_check timer from firing at us during very long
* I/O
*/
unsigned long hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait.done,
hang_check * (HZ/2)))
;
else else
wait_for_completion_io(&wait.done); blk_wait_io(&wait.done);
}
return wait.ret; return wait.ret;
} }
@ -2892,9 +2880,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
}; };
struct request *rq; struct request *rq;
if (blk_mq_attempt_bio_merge(q, bio, nsegs))
return NULL;
rq_qos_throttle(q, bio); rq_qos_throttle(q, bio);
if (plug) { if (plug) {
@ -2913,23 +2898,32 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
} }
/* /*
* Check if we can use the passed on request for submitting the passed in bio, * Check if there is a suitable cached request and return it.
* and remove it from the request list if it can be used.
*/ */
static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
struct request_queue *q, blk_opf_t opf)
{
enum hctx_type type = blk_mq_get_hctx_type(opf);
struct request *rq;
if (!plug)
return NULL;
rq = rq_list_peek(&plug->cached_rq);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
(type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT))
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
return rq;
}
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio) struct bio *bio)
{ {
enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
enum hctx_type hctx_type = rq->mq_hctx->type;
WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
if (type != hctx_type &&
!(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
return false;
if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
return false;
/* /*
* If any qos ->throttle() end up blocking, we will have flushed the * If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry * plug and hence killed the cached_rq list as well. Pop this entry
@ -2941,7 +2935,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
blk_mq_rq_time_init(rq, 0); blk_mq_rq_time_init(rq, 0);
rq->cmd_flags = bio->bi_opf; rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->queuelist);
return true;
} }
/** /**
@ -2963,50 +2956,43 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = blk_mq_plug(bio); struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf); const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
struct request *rq = NULL;
unsigned int nr_segs = 1; unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret; blk_status_t ret;
bio = blk_queue_bounce(bio, q); bio = blk_queue_bounce(bio, q);
if (plug) { /*
rq = rq_list_peek(&plug->cached_rq); * If the plug has a cached request for this queue, try use it.
if (rq && rq->q != q) *
rq = NULL; * The cached request already holds a q_usage_counter reference and we
} * don't have to acquire a new one if we use it.
if (rq) { */
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!rq) {
if (!bio)
return;
}
if (!bio_integrity_prep(bio))
return;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
if (blk_mq_use_cached_rq(rq, plug, bio))
goto done;
percpu_ref_get(&q->q_usage_counter);
} else {
if (unlikely(bio_queue_enter(bio))) if (unlikely(bio_queue_enter(bio)))
return; return;
}
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio) if (!bio)
goto fail; goto queue_exit;
} }
if (!bio_integrity_prep(bio)) if (!bio_integrity_prep(bio))
goto fail; goto queue_exit;
}
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq)) { if (unlikely(!rq))
fail: goto queue_exit;
blk_queue_exit(q); } else {
return; blk_mq_use_cached_rq(rq, plug, bio);
} }
done:
trace_block_getrq(bio); trace_block_getrq(bio);
rq_qos_track(q, rq, bio); rq_qos_track(q, rq, bio);
@ -3037,6 +3023,15 @@ done:
} else { } else {
blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
} }
return;
queue_exit:
/*
* Don't drop the queue reference if we were trying to use a cached
* request and thus didn't acquire one.
*/
if (!rq)
blk_queue_exit(q);
} }
#ifdef CONFIG_BLK_MQ_STACKING #ifdef CONFIG_BLK_MQ_STACKING
@ -3098,7 +3093,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
blk_mq_run_dispatch_ops(q, blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true)); ret = blk_mq_request_issue_directly(rq, true));
if (ret) if (ret)
blk_account_io_done(rq, ktime_get_ns()); blk_account_io_done(rq, blk_time_get_ns());
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(blk_insert_cloned_request); EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
@ -4078,15 +4073,16 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q); blk_mq_sysfs_deinit(q);
} }
static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
void *queuedata) struct queue_limits *lim, void *queuedata)
{ {
struct queue_limits default_lim = { };
struct request_queue *q; struct request_queue *q;
int ret; int ret;
q = blk_alloc_queue(set->numa_node); q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
if (!q) if (IS_ERR(q))
return ERR_PTR(-ENOMEM); return q;
q->queuedata = queuedata; q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q); ret = blk_mq_init_allocated_queue(set, q);
if (ret) { if (ret) {
@ -4095,20 +4091,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
} }
return q; return q;
} }
EXPORT_SYMBOL(blk_mq_alloc_queue);
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
/** /**
* blk_mq_destroy_queue - shutdown a request queue * blk_mq_destroy_queue - shutdown a request queue
* @q: request queue to shutdown * @q: request queue to shutdown
* *
* This shuts down a request queue allocated by blk_mq_init_queue(). All future * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
* requests will be failed with -ENODEV. The caller is responsible for dropping * requests will be failed with -ENODEV. The caller is responsible for dropping
* the reference from blk_mq_init_queue() by calling blk_put_queue(). * the reference from blk_mq_alloc_queue() by calling blk_put_queue().
* *
* Context: can sleep * Context: can sleep
*/ */
@ -4129,13 +4120,14 @@ void blk_mq_destroy_queue(struct request_queue *q)
} }
EXPORT_SYMBOL(blk_mq_destroy_queue); EXPORT_SYMBOL(blk_mq_destroy_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass) struct lock_class_key *lkclass)
{ {
struct request_queue *q; struct request_queue *q;
struct gendisk *disk; struct gendisk *disk;
q = blk_mq_init_queue_data(set, queuedata); q = blk_mq_alloc_queue(set, lim, queuedata);
if (IS_ERR(q)) if (IS_ERR(q))
return ERR_CAST(q); return ERR_CAST(q);
@ -4389,7 +4381,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
if (set->nr_maps == 1) if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
if (set->ops->map_queues && !is_kdump_kernel()) { if (set->ops->map_queues) {
int i; int i;
/* /*
@ -4488,14 +4480,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
/* /*
* If a crashdump is active, then we are potentially in a very * If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and * memory constrained environment. Limit us to 64 tags to prevent
* 64 tags to prevent using too much memory. * using too much memory.
*/ */
if (is_kdump_kernel()) { if (is_kdump_kernel())
set->nr_hw_queues = 1;
set->nr_maps = 1;
set->queue_depth = min(64U, set->queue_depth); set->queue_depth = min(64U, set->queue_depth);
}
/* /*
* There is no use for more h/w queues than cpus if we just have * There is no use for more h/w queues than cpus if we just have
* a single map * a single map
@ -4525,7 +4515,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
GFP_KERNEL, set->numa_node); GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map) if (!set->map[i].mq_map)
goto out_free_mq_map; goto out_free_mq_map;
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; set->map[i].nr_queues = set->nr_hw_queues;
} }
blk_mq_update_queue_map(set); blk_mq_update_queue_map(set);

View File

@ -25,53 +25,22 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
} }
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
/**
 * blk_set_default_limits - reset limits to default values
 * @lim:  the queue_limits structure to reset
 *
 * Description:
 *    Overwrites the fields of @lim listed below with their default values.
 */
void blk_set_default_limits(struct queue_limits *lim)
{
	/* Segment limits. */
	lim->max_segments = BLK_MAX_SEGMENTS;
	lim->max_discard_segments = 1;
	lim->max_integrity_segments = 0;
	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
	lim->virt_boundary_mask = 0;
	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;

	/* Transfer size limits. */
	lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
	lim->max_sectors = BLK_SAFE_MAX_SECTORS;
	lim->max_dev_sectors = 0;
	lim->max_user_sectors = 0;
	lim->chunk_sectors = 0;
	lim->max_write_zeroes_sectors = 0;
	lim->max_zone_append_sectors = 0;

	/* Discard and secure erase. */
	lim->max_discard_sectors = 0;
	lim->max_hw_discard_sectors = 0;
	lim->max_secure_erase_sectors = 0;
	lim->discard_granularity = 512;
	lim->discard_alignment = 0;
	lim->discard_misaligned = 0;

	/* Block sizes and I/O hints. */
	lim->io_min = 512;
	lim->physical_block_size = 512;
	lim->logical_block_size = 512;
	lim->bounce = BLK_BOUNCE_NONE;
	lim->alignment_offset = 0;
	lim->io_opt = 0;
	lim->misaligned = 0;

	/* Zoned settings and DMA alignment. */
	lim->zoned = false;
	lim->zone_write_granularity = 0;
	lim->dma_alignment = 511;
}
/** /**
* blk_set_stacking_limits - set default limits for stacking devices * blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset * @lim: the queue_limits structure to reset
* *
* Description: * Prepare queue limits for applying limits from underlying devices using
* Returns a queue_limit struct to its default state. Should be used * blk_stack_limits().
* by stacking drivers like DM that have no internal limits.
*/ */
void blk_set_stacking_limits(struct queue_limits *lim) void blk_set_stacking_limits(struct queue_limits *lim)
{ {
blk_set_default_limits(lim); memset(lim, 0, sizeof(*lim));
lim->logical_block_size = SECTOR_SIZE;
lim->physical_block_size = SECTOR_SIZE;
lim->io_min = SECTOR_SIZE;
lim->discard_granularity = SECTOR_SIZE;
lim->dma_alignment = SECTOR_SIZE - 1;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
/* Inherit limits from component devices */ /* Inherit limits from component devices */
lim->max_segments = USHRT_MAX; lim->max_segments = USHRT_MAX;
@ -82,9 +51,239 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
} }
EXPORT_SYMBOL(blk_set_stacking_limits); EXPORT_SYMBOL(blk_set_stacking_limits);
/*
 * Derive the read-ahead window (->ra_pages) and the maximum I/O size in
 * pages (->io_pages) of @bdi from the queue limits in @lim.
 */
static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
		struct queue_limits *lim)
{
	/*
	 * For read-ahead of large files to be effective, we need to read ahead
	 * at least twice the optimal I/O size.
	 *
	 * Divide by half a page rather than doubling io_opt first: the
	 * quotient is identical, but the intermediate value can no longer
	 * wrap around when io_opt lies in the upper half of its range.
	 */
	bdi->ra_pages = max(lim->io_opt / (PAGE_SIZE / 2), VM_READAHEAD_PAGES);

	/* max_sectors is in sectors; convert it to pages. */
	bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
/*
 * Validate the zone related fields of @lim.
 *
 * For a zoned configuration the write granularity is raised to at least the
 * logical block size and a non-zero zone append limit is capped.  For a
 * non-zoned configuration every zone specific field must be zero.
 *
 * Returns 0 if the limits are acceptable, -EINVAL otherwise.
 */
static int blk_validate_zoned_limits(struct queue_limits *lim)
{
	if (lim->zoned) {
		/* Zoned limits make no sense without zoned support built in. */
		if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
			return -EINVAL;

		if (lim->zone_write_granularity < lim->logical_block_size)
			lim->zone_write_granularity = lim->logical_block_size;

		/*
		 * The Zone Append size is limited by the maximum I/O size
		 * and the zone size given that it can't span zones.
		 */
		if (lim->max_zone_append_sectors)
			lim->max_zone_append_sectors =
				min3(lim->max_hw_sectors,
				     lim->max_zone_append_sectors,
				     lim->chunk_sectors);

		return 0;
	}

	/* Not zoned: any zone specific limit that is set is a caller bug. */
	if (WARN_ON_ONCE(lim->max_open_zones) ||
	    WARN_ON_ONCE(lim->max_active_zones) ||
	    WARN_ON_ONCE(lim->zone_write_granularity) ||
	    WARN_ON_ONCE(lim->max_zone_append_sectors))
		return -EINVAL;

	return 0;
}
/*
* Check that the limits in @lim are valid, initialize defaults for unset
* values, and cap values based on others where needed.
*
* @lim is modified in place.  Returns 0 if the limits are usable, or -EINVAL
* if one of the checks below (including the zoned checks at the end) fails.
*/
static int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
/*
* Unless otherwise specified, default to 512 byte logical blocks and a
* physical block size equal to the logical block size.
*/
if (!lim->logical_block_size)
lim->logical_block_size = SECTOR_SIZE;
if (lim->physical_block_size < lim->logical_block_size)
lim->physical_block_size = lim->logical_block_size;
/*
* The minimum I/O size defaults to the physical block size unless
* explicitly overridden.
*/
if (lim->io_min < lim->physical_block_size)
lim->io_min = lim->physical_block_size;
/*
* max_hw_sectors has a somewhat weird default for historical reasons,
* but drivers really should set their own instead of relying on this
* value.
*
* The block layer relies on the fact that every driver can
* handle at least a page worth of data per I/O, and needs the value
* aligned to the logical block size.
*/
if (!lim->max_hw_sectors)
lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS))
return -EINVAL;
lim->max_hw_sectors = round_down(lim->max_hw_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
/*
* The actual max_sectors value is a complex beast and also takes the
* max_dev_sectors value (set by SCSI ULPs) and a user configurable
* value into account.  The ->max_sectors value is always calculated
* from these, so directly setting it won't have any effect.
*/
max_hw_sectors = min_not_zero(lim->max_hw_sectors,
lim->max_dev_sectors);
/*
* A user supplied limit is rejected if it exceeds the hardware/device
* limit or is smaller than one page; without one, the default cap
* applies.
*/
if (lim->max_user_sectors) {
if (lim->max_user_sectors > max_hw_sectors ||
lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
return -EINVAL;
lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
} else {
lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}
lim->max_sectors = round_down(lim->max_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
/*
* Random default for the maximum number of segments.  Drivers should
* not rely on this and set their own.
*/
if (!lim->max_segments)
lim->max_segments = BLK_MAX_SEGMENTS;
/*
* The effective discard limit is the smaller of the hardware limit and
* the user limit.
*/
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
if (!lim->max_discard_segments)
lim->max_discard_segments = 1;
/* The discard granularity is at least one physical block. */
if (lim->discard_granularity < lim->physical_block_size)
lim->discard_granularity = lim->physical_block_size;
/*
* By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as
* that would break all the normal I/O patterns.
*/
if (!lim->seg_boundary_mask)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
return -EINVAL;
/*
* Devices that require a virtual boundary do not support scatter/gather
* I/O natively, but instead require a descriptor list entry for each
* page (which might not be identical to the Linux PAGE_SIZE).  Because
* of that they are not limited by our notion of "segment size".
*/
if (lim->virt_boundary_mask) {
if (WARN_ON_ONCE(lim->max_segment_size &&
lim->max_segment_size != UINT_MAX))
return -EINVAL;
lim->max_segment_size = UINT_MAX;
} else {
/*
* The maximum segment size has an odd historic 64k default that
* drivers probably should override.  Just like the I/O size we
* require drivers to at least handle a full page per segment.
*/
if (!lim->max_segment_size)
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
return -EINVAL;
}
/*
* We require drivers to at least do logical block aligned I/O, but
* historically could not check for that due to the separate calls
* to set the limits.  Once the transition is finished the check
* below should be narrowed down to check the logical block size.
*/
if (!lim->dma_alignment)
lim->dma_alignment = SECTOR_SIZE - 1;
if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
return -EINVAL;
/*
* Reduce a non-zero alignment offset by masking it with
* physical_block_size - 1, and clear the misaligned flag.
*/
if (lim->alignment_offset) {
lim->alignment_offset &= (lim->physical_block_size - 1);
lim->misaligned = 0;
}
/* The zoned checks run last and provide the final return value. */
return blk_validate_zoned_limits(lim);
}
/*
 * Fill in the default limits for a newly allocated queue.  @lim holds the
 * initial limits chosen by the driver; if the driver has none, all of its
 * fields are zero on entry.
 *
 * Returns the result of blk_validate_limits() on @lim.
 */
int blk_set_default_limits(struct queue_limits *lim)
{
	/*
	 * blk_validate_limits() derives most defaults by capping bounds.
	 * max_user_discard_sectors is the exception: it has to start out at
	 * the maximum value, so set it explicitly before validating.
	 */
	lim->max_user_discard_sectors = UINT_MAX;

	return blk_validate_limits(lim);
}
/**
 * queue_limits_commit_update - commit an atomic update of queue limits
 * @q:		queue to update
 * @lim:	limits to apply
 *
 * Validate the limits in @lim, which were obtained from
 * queue_limits_start_update() and then modified by the caller, and install
 * them in @q if they are valid.  When @q has a disk attached, its bdi is
 * updated from the new limits as well.
 *
 * @q->limits_lock is released before returning, whether or not validation
 * succeeded.
 *
 * Returns 0 if successful, else a negative error code.
 */
int queue_limits_commit_update(struct request_queue *q,
		struct queue_limits *lim)
	__releases(q->limits_lock)
{
	int ret;

	ret = blk_validate_limits(lim);
	if (ret)
		goto out_unlock;

	q->limits = *lim;
	if (q->disk)
		blk_apply_bdi_limits(q->disk->bdi, lim);

out_unlock:
	mutex_unlock(&q->limits_lock);
	return ret;
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
 * queue_limits_set - apply queue limits to queue
 * @q:		queue to update
 * @lim:	limits to apply
 *
 * Apply the freshly initialized limits in @lim to @q.  This takes
 * q->limits_lock itself; to modify limits that already exist, use
 * queue_limits_start_update() and queue_limits_commit_update() instead.
 *
 * Returns 0 if successful, else a negative error code.
 */
int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
{
	int ret;

	mutex_lock(&q->limits_lock);
	/* queue_limits_commit_update() drops q->limits_lock before returning. */
	ret = queue_limits_commit_update(q, lim);

	return ret;
}
EXPORT_SYMBOL_GPL(queue_limits_set);
/** /**
* blk_queue_bounce_limit - set bounce buffer limit for queue * blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device * @q: the request queue for the device
@ -177,8 +376,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors);
void blk_queue_max_discard_sectors(struct request_queue *q, void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors) unsigned int max_discard_sectors)
{ {
q->limits.max_hw_discard_sectors = max_discard_sectors; struct queue_limits *lim = &q->limits;
q->limits.max_discard_sectors = max_discard_sectors;
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_discard_sectors =
min(max_discard_sectors, lim->max_user_discard_sectors);
} }
EXPORT_SYMBOL(blk_queue_max_discard_sectors); EXPORT_SYMBOL(blk_queue_max_discard_sectors);
@ -393,15 +595,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
void disk_update_readahead(struct gendisk *disk) void disk_update_readahead(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue; blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*/
disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
} }
EXPORT_SYMBOL_GPL(disk_update_readahead); EXPORT_SYMBOL_GPL(disk_update_readahead);
@ -689,33 +883,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = max(t->zone_write_granularity, t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity); b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned); t->zoned = max(t->zoned, b->zoned);
if (!t->zoned) {
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
return ret; return ret;
} }
EXPORT_SYMBOL(blk_stack_limits); EXPORT_SYMBOL(blk_stack_limits);
/** /**
* disk_stack_limits - adjust queue limits for stacked drivers * queue_limits_stack_bdev - adjust queue_limits for stacked devices
* @disk: MD/DM gendisk (top) * @t: the stacking driver limits (top device)
* @bdev: the underlying block device (bottom) * @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device * @offset: offset to beginning of data within component device
* @pfx: prefix to use for warnings logged
* *
* Description: * Description:
* Merges the limits for a top level gendisk and a bottom level * This function is used by stacking drivers like MD and DM to ensure
* block_device. * that all component devices have compatible block sizes and
* alignments. The stacking driver must provide a queue_limits
* struct (top) and then iteratively call the stacking function for
* all component (bottom) devices. The stacking function will
* attempt to combine the values and ensure proper alignment.
*/ */
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
sector_t offset) sector_t offset, const char *pfx)
{ {
struct request_queue *t = disk->queue; if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + offset))
if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + (offset >> 9)) < 0)
pr_notice("%s: Warning: Device %pg is misaligned\n", pr_notice("%s: Warning: Device %pg is misaligned\n",
disk->disk_name, bdev); pfx, bdev);
disk_update_readahead(disk);
} }
EXPORT_SYMBOL(disk_stack_limits); EXPORT_SYMBOL_GPL(queue_limits_stack_bdev);
/** /**
* blk_queue_update_dma_pad - update pad mask * blk_queue_update_dma_pad - update pad mask

View File

@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat)
/* src is a per-cpu stat, mean isn't initialized */ /* src is a per-cpu stat, mean isn't initialized */
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{ {
if (!src->nr_samples) if (dst->nr_samples + src->nr_samples <= dst->nr_samples)
return; return;
dst->min = min(dst->min, src->min); dst->min = min(dst->min, src->min);

View File

@ -174,23 +174,29 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
static ssize_t queue_discard_max_store(struct request_queue *q, static ssize_t queue_discard_max_store(struct request_queue *q,
const char *page, size_t count) const char *page, size_t count)
{ {
unsigned long max_discard; unsigned long max_discard_bytes;
ssize_t ret = queue_var_store(&max_discard, page, count); struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0) if (ret < 0)
return ret; return ret;
if (max_discard & (q->limits.discard_granularity - 1)) if (max_discard_bytes & (q->limits.discard_granularity - 1))
return -EINVAL; return -EINVAL;
max_discard >>= 9; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
if (max_discard > UINT_MAX)
return -EINVAL; return -EINVAL;
if (max_discard > q->limits.max_hw_discard_sectors) blk_mq_freeze_queue(q);
max_discard = q->limits.max_hw_discard_sectors; lim = queue_limits_start_update(q);
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
err = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
q->limits.max_discard_sectors = max_discard; if (err)
return err;
return ret; return ret;
} }
@ -226,35 +232,22 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
static ssize_t static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{ {
unsigned long var; unsigned long max_sectors_kb;
unsigned int max_sectors_kb, struct queue_limits lim;
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, ssize_t ret;
page_kb = 1 << (PAGE_SHIFT - 10); int err;
ssize_t ret = queue_var_store(&var, page, count);
ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0) if (ret < 0)
return ret; return ret;
max_sectors_kb = (unsigned int)var; blk_mq_freeze_queue(q);
max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, lim = queue_limits_start_update(q);
q->limits.max_dev_sectors >> 1); lim.max_user_sectors = max_sectors_kb << 1;
if (max_sectors_kb == 0) { err = queue_limits_commit_update(q, &lim);
q->limits.max_user_sectors = 0; blk_mq_unfreeze_queue(q);
max_sectors_kb = min(max_hw_sectors_kb, if (err)
BLK_DEF_MAX_SECTORS_CAP >> 1); return err;
} else {
if (max_sectors_kb > max_hw_sectors_kb ||
max_sectors_kb < page_kb)
return -EINVAL;
q->limits.max_user_sectors = max_sectors_kb << 1;
}
spin_lock_irq(&q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
if (q->disk)
q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(&q->queue_lock);
return ret; return ret;
} }

View File

@ -1098,7 +1098,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
while ((bio = throtl_peek_queued(&sq->queued[READ])) && while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
tg_may_dispatch(tg, bio, NULL)) { tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_one_bio(tg, bio_data_dir(bio)); tg_dispatch_one_bio(tg, READ);
nr_reads++; nr_reads++;
if (nr_reads >= max_nr_reads) if (nr_reads >= max_nr_reads)
@ -1108,7 +1108,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
tg_may_dispatch(tg, bio, NULL)) { tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_one_bio(tg, bio_data_dir(bio)); tg_dispatch_one_bio(tg, WRITE);
nr_writes++; nr_writes++;
if (nr_writes >= max_nr_writes) if (nr_writes >= max_nr_writes)
@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
ret = tg->latency_target == DFL_LATENCY_TARGET || ret = tg->latency_target == DFL_LATENCY_TARGET ||
tg->idletime_threshold == DFL_IDLE_THRESHOLD || tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
(ktime_get_ns() >> 10) - tg->last_finish_time > time || (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
tg->avg_idletime > tg->idletime_threshold || tg->avg_idletime > tg->idletime_threshold ||
(tg->latency_target && tg->bio_cnt && (tg->latency_target && tg->bio_cnt &&
tg->bad_bio_cnt * 5 < tg->bio_cnt); tg->bad_bio_cnt * 5 < tg->bio_cnt);
@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
if (last_finish_time == 0) if (last_finish_time == 0)
return; return;
now = ktime_get_ns() >> 10; now = blk_time_get_ns() >> 10;
if (now <= last_finish_time || if (now <= last_finish_time ||
last_finish_time == tg->checked_last_finish_time) last_finish_time == tg->checked_last_finish_time)
return; return;
@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio)
if (!tg->td->limit_valid[LIMIT_LOW]) if (!tg->td->limit_valid[LIMIT_LOW])
return; return;
finish_time_ns = ktime_get_ns(); finish_time_ns = blk_time_get_ns();
tg->last_finish_time = finish_time_ns >> 10; tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10;

View File

@ -29,6 +29,7 @@
#include "blk-wbt.h" #include "blk-wbt.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
#include "elevator.h" #include "elevator.h"
#include "blk.h"
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <trace/events/wbt.h> #include <trace/events/wbt.h>
@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
static u64 rwb_sync_issue_lat(struct rq_wb *rwb) static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{ {
u64 now, issue = READ_ONCE(rwb->sync_issue); u64 issue = READ_ONCE(rwb->sync_issue);
if (!issue || !rwb->sync_cookie) if (!issue || !rwb->sync_cookie)
return 0; return 0;
now = ktime_to_ns(ktime_get()); return blk_time_get_ns() - issue;
return now - issue;
} }
static inline unsigned int wbt_inflight(struct rq_wb *rwb) static inline unsigned int wbt_inflight(struct rq_wb *rwb)

View File

@ -11,7 +11,6 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/mm.h> #include <linux/mm.h>
@ -177,8 +176,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
} }
} }
static int blkdev_zone_reset_all_emulated(struct block_device *bdev, static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
gfp_t gfp_mask)
{ {
struct gendisk *disk = bdev->bd_disk; struct gendisk *disk = bdev->bd_disk;
sector_t capacity = bdev_nr_sectors(bdev); sector_t capacity = bdev_nr_sectors(bdev);
@ -205,7 +203,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
} }
bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
gfp_mask); GFP_KERNEL);
bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_sector = sector;
sector += zone_sectors; sector += zone_sectors;
@ -223,7 +221,7 @@ out_free_need_reset:
return ret; return ret;
} }
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) static int blkdev_zone_reset_all(struct block_device *bdev)
{ {
struct bio bio; struct bio bio;
@ -238,7 +236,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* @sector: Start sector of the first zone to operate on * @sector: Start sector of the first zone to operate on
* @nr_sectors: Number of sectors, should be at least the length of one zone and * @nr_sectors: Number of sectors, should be at least the length of one zone and
* must be zone size aligned. * must be zone size aligned.
* @gfp_mask: Memory allocation flags (for bio_alloc)
* *
* Description: * Description:
* Perform the specified operation on the range of zones specified by * Perform the specified operation on the range of zones specified by
@ -248,7 +245,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* or finish request. * or finish request.
*/ */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) sector_t sector, sector_t nr_sectors)
{ {
struct request_queue *q = bdev_get_queue(bdev); struct request_queue *q = bdev_get_queue(bdev);
sector_t zone_sectors = bdev_zone_sectors(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev);
@ -285,12 +282,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
*/ */
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
if (!blk_queue_zone_resetall(q)) if (!blk_queue_zone_resetall(q))
return blkdev_zone_reset_all_emulated(bdev, gfp_mask); return blkdev_zone_reset_all_emulated(bdev);
return blkdev_zone_reset_all(bdev, gfp_mask); return blkdev_zone_reset_all(bdev);
} }
while (sector < end_sector) { while (sector < end_sector) {
bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_sector = sector;
sector += zone_sectors; sector += zone_sectors;
@ -419,8 +416,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY; return -ENOTTY;
} }
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
GFP_KERNEL);
fail: fail:
if (cmd == BLKRESETZONE) if (cmd == BLKRESETZONE)

View File

@ -4,6 +4,8 @@
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
#include <xen/xen.h> #include <xen/xen.h>
#include "blk-crypto-internal.h" #include "blk-crypto-internal.h"
@ -70,6 +72,18 @@ static inline int bio_queue_enter(struct bio *bio)
return __bio_queue_enter(q, bio); return __bio_queue_enter(q, bio);
} }
/*
 * Wait for @done to complete.
 *
 * The wait is split into slices of half the hung-task timeout
 * (sysctl_hung_task_timeout_secs multiplied by HZ, divided by two).  If that
 * works out to zero, a single wait without a timeout is used instead.
 */
static inline void blk_wait_io(struct completion *done)
{
	unsigned long slice = sysctl_hung_task_timeout_secs * HZ / 2;

	if (!slice) {
		wait_for_completion_io(done);
		return;
	}

	/* Prevent hang_check timer from firing at us during very long I/O */
	for (;;) {
		if (wait_for_completion_io_timeout(done, slice))
			return;
	}
}
#define BIO_INLINE_VECS 4 #define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask); gfp_t gfp_mask);
@ -329,7 +343,7 @@ void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio); bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
void blk_set_default_limits(struct queue_limits *lim); int blk_set_default_limits(struct queue_limits *lim);
int blk_dev_init(void); int blk_dev_init(void);
/* /*
@ -447,7 +461,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page)
unpin_user_page(page); unpin_user_page(page);
} }
struct request_queue *blk_alloc_queue(int node_id); struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
@ -516,8 +530,75 @@ static inline int req_ref_read(struct request *req)
return atomic_read(&req->ref); return atomic_read(&req->ref);
} }
/*
 * Return a timestamp in nanoseconds, cached in the current task's plug when
 * there is one.
 *
 * Without a plug this is simply ktime_get_ns().  With a plug, the first call
 * stores ktime_get_ns() in plug->cur_ktime and sets PF_BLOCK_TS in
 * current->flags; later calls return the stored value while it is non-zero.
 */
static inline u64 blk_time_get_ns(void)
{
	struct blk_plug *cur_plug = current->plug;

	if (!cur_plug)
		return ktime_get_ns();

	/*
	 * A stored value of 0 doubles as "nothing cached yet".  0 may well be
	 * a valid time, but instead of tracking validity separately we accept
	 * one extra ktime_get_ns() call if the current time happens to be 0.
	 */
	if (cur_plug->cur_ktime)
		return cur_plug->cur_ktime;

	cur_plug->cur_ktime = ktime_get_ns();
	current->flags |= PF_BLOCK_TS;

	return cur_plug->cur_ktime;
}
static inline ktime_t blk_time_get(void)
{
return ns_to_ktime(blk_time_get_ns());
}
/*
 * Layout of the 64-bit bio issue value, from most significant bit:
 * 1 bit: reserved for other usage, see below
 * 12 bits: original size of bio
 * 51 bits: issue time of bio
 */
/* Field widths; the issue time occupies the remaining 64 - 1 - 12 = 51 bits. */
#define BIO_ISSUE_RES_BITS 1
#define BIO_ISSUE_SIZE_BITS 12
/* Position of the lowest bit of each field: 63 for reserved, 51 for size. */
#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
/* Bits 0-50: issue time. */
#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
/* Bits 51-62: size. */
#define BIO_ISSUE_SIZE_MASK \
(((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
/* Bit 63: reserved. */
#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
/* Reserved bit for blk-throtl */
#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
/* Keep only the bits of @time covered by BIO_ISSUE_TIME_MASK. */
static inline u64 __bio_issue_time(u64 time)
{
	u64 masked = time & BIO_ISSUE_TIME_MASK;

	return masked;
}
/* Extract the issue-time field from the packed value in @issue. */
static inline u64 bio_issue_time(struct bio_issue *issue)
{
	u64 packed = issue->value;

	return __bio_issue_time(packed);
}
static inline sector_t bio_issue_size(struct bio_issue *issue)
{
return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
}
/*
 * Repack @issue->value: the reserved bit already stored there is preserved,
 * the time field is taken from blk_time_get_ns(), and the size field is set
 * to the low BIO_ISSUE_SIZE_BITS bits of @size.
 */
static inline void bio_issue_init(struct bio_issue *issue,
				       sector_t size)
{
	u64 res_bits, time_bits, size_bits;

	/* Anything in @size beyond BIO_ISSUE_SIZE_BITS bits is dropped. */
	size_bits = (u64)size & ((1ULL << BIO_ISSUE_SIZE_BITS) - 1);
	size_bits <<= BIO_ISSUE_SIZE_SHIFT;

	res_bits = issue->value & BIO_ISSUE_RES_MASK;
	time_bits = blk_time_get_ns() & BIO_ISSUE_TIME_MASK;

	issue->value = res_bits | time_bits | size_bits;
}
void bdev_release(struct file *bdev_file); void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops, struct file *bdev_file); const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
#endif /* BLK_INTERNAL_H */ #endif /* BLK_INTERNAL_H */

View File

@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
if (blk_mq_alloc_tag_set(set)) if (blk_mq_alloc_tag_set(set))
goto out_tag_set; goto out_tag_set;
q = blk_mq_init_queue(set); q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(q)) { if (IS_ERR(q)) {
ret = PTR_ERR(q); ret = PTR_ERR(q);
goto out_queue; goto out_queue;

View File

@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
} }
struct class block_class = { const struct class block_class = {
.name = "block", .name = "block",
.dev_uevent = block_uevent, .dev_uevent = block_uevent,
}; };
@ -1391,19 +1391,21 @@ out_free_disk:
return NULL; return NULL;
} }
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
struct lock_class_key *lkclass)
{ {
struct queue_limits default_lim = { };
struct request_queue *q; struct request_queue *q;
struct gendisk *disk; struct gendisk *disk;
q = blk_alloc_queue(node); q = blk_alloc_queue(lim ? lim : &default_lim, node);
if (!q) if (IS_ERR(q))
return NULL; return ERR_CAST(q);
disk = __alloc_disk_node(q, node, lkclass); disk = __alloc_disk_node(q, node, lkclass);
if (!disk) { if (!disk) {
blk_put_queue(q); blk_put_queue(q);
return NULL; return ERR_PTR(-ENOMEM);
} }
set_bit(GD_OWNS_QUEUE, &disk->state); set_bit(GD_OWNS_QUEUE, &disk->state);
return disk; return disk;

View File

@ -8,6 +8,8 @@ struct bd_holder_disk {
int refcnt; int refcnt;
}; };
static DEFINE_MUTEX(blk_holder_mutex);
static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct gendisk *disk) struct gendisk *disk)
{ {
@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
kobject_get(bdev->bd_holder_dir); kobject_get(bdev->bd_holder_dir);
mutex_unlock(&bdev->bd_disk->open_mutex); mutex_unlock(&bdev->bd_disk->open_mutex);
mutex_lock(&disk->open_mutex); mutex_lock(&blk_holder_mutex);
WARN_ON_ONCE(!bdev->bd_holder); WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk); holder = bd_find_holder_disk(bdev, disk);
@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
goto out_del_symlink; goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs); list_add(&holder->list, &disk->slave_bdevs);
mutex_unlock(&disk->open_mutex); mutex_unlock(&blk_holder_mutex);
return 0; return 0;
out_del_symlink: out_del_symlink:
@ -116,7 +118,7 @@ out_del_symlink:
out_free_holder: out_free_holder:
kfree(holder); kfree(holder);
out_unlock: out_unlock:
mutex_unlock(&disk->open_mutex); mutex_unlock(&blk_holder_mutex);
if (ret) if (ret)
kobject_put(bdev->bd_holder_dir); kobject_put(bdev->bd_holder_dir);
return ret; return ret;
@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
if (WARN_ON_ONCE(!disk->slave_dir)) if (WARN_ON_ONCE(!disk->slave_dir))
return; return;
mutex_lock(&disk->open_mutex); mutex_lock(&blk_holder_mutex);
holder = bd_find_holder_disk(bdev, disk); holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
del_symlink(disk->slave_dir, bdev_kobj(bdev)); del_symlink(disk->slave_dir, bdev_kobj(bdev));
@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
list_del_init(&holder->list); list_del_init(&holder->list);
kfree(holder); kfree(holder);
} }
mutex_unlock(&disk->open_mutex); mutex_unlock(&blk_holder_mutex);
} }
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);

View File

@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
{ {
struct gendisk *disk = bdev->bd_disk; struct gendisk *disk = bdev->bd_disk;
struct blkpg_partition p; struct blkpg_partition p;
sector_t start, length; sector_t start, length, capacity, end;
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return -EACCES; return -EACCES;
@ -41,6 +41,13 @@ static int blkpg_do_ioctl(struct block_device *bdev,
start = p.start >> SECTOR_SHIFT; start = p.start >> SECTOR_SHIFT;
length = p.length >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT;
capacity = get_capacity(disk);
if (check_add_overflow(start, length, &end))
return -EINVAL;
if (start >= capacity || end > capacity)
return -EINVAL;
switch (op) { switch (op) {
case BLKPG_ADD_PARTITION: case BLKPG_ADD_PARTITION:

View File

@ -419,21 +419,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start,
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length) sector_t length)
{ {
sector_t capacity = get_capacity(disk), end;
struct block_device *part; struct block_device *part;
int ret; int ret;
mutex_lock(&disk->open_mutex); mutex_lock(&disk->open_mutex);
if (check_add_overflow(start, length, &end)) {
ret = -EINVAL;
goto out;
}
if (start >= capacity || end > capacity) {
ret = -EINVAL;
goto out;
}
if (!disk_live(disk)) { if (!disk_live(disk)) {
ret = -ENXIO; ret = -ENXIO;
goto out; goto out;

View File

@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness);
* Code to understand MacOS partition tables. * Code to understand MacOS partition tables.
*/ */
#ifdef CONFIG_PPC_PMAC
static inline void mac_fix_string(char *stg, int len) static inline void mac_fix_string(char *stg, int len)
{ {
int i; int i;
@ -27,6 +28,7 @@ static inline void mac_fix_string(char *stg, int len)
for (i = len - 1; i >= 0 && stg[i] == ' '; i--) for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
stg[i] = 0; stg[i] = 0;
} }
#endif
int mac_partition(struct parsed_partitions *state) int mac_partition(struct parsed_partitions *state)
{ {

View File

@ -1212,7 +1212,7 @@ static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method)
static int start_opal_session_cont(struct opal_dev *dev) static int start_opal_session_cont(struct opal_dev *dev)
{ {
u32 hsn, tsn; u32 hsn, tsn;
int error = 0; int error;
error = parse_and_check_status(dev); error = parse_and_check_status(dev);
if (error) if (error)
@ -1354,7 +1354,7 @@ static int get_active_key_cont(struct opal_dev *dev)
{ {
const char *activekey; const char *activekey;
size_t keylen; size_t keylen;
int error = 0; int error;
error = parse_and_check_status(dev); error = parse_and_check_status(dev);
if (error) if (error)
@ -2157,7 +2157,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
u8 lr_buffer[OPAL_UID_LENGTH]; u8 lr_buffer[OPAL_UID_LENGTH];
struct opal_lock_unlock *lkul = data; struct opal_lock_unlock *lkul = data;
u8 read_locked = 1, write_locked = 1; u8 read_locked = 1, write_locked = 1;
int err = 0; int err;
if (build_locking_range(lr_buffer, sizeof(lr_buffer), if (build_locking_range(lr_buffer, sizeof(lr_buffer),
lkul->session.opal_key.lr) < 0) lkul->session.opal_key.lr) < 0)
@ -2580,7 +2580,7 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv)
const struct opal_step discovery0_step = { const struct opal_step discovery0_step = {
opal_discovery0, discv opal_discovery0, discv
}; };
int ret = 0; int ret;
mutex_lock(&dev->dev_lock); mutex_lock(&dev->dev_lock);
setup_opal_dev(dev); setup_opal_dev(dev);
@ -3069,7 +3069,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
{ {
struct opal_suspend_data *suspend; struct opal_suspend_data *suspend;
bool was_failure = false; bool was_failure = false;
int ret = 0; int ret;
if (!dev) if (!dev)
return false; return false;
@ -3112,10 +3112,9 @@ static int opal_read_table(struct opal_dev *dev,
{ read_table_data, rw_tbl }, { read_table_data, rw_tbl },
{ end_opal_session, } { end_opal_session, }
}; };
int ret = 0;
if (!rw_tbl->size) if (!rw_tbl->size)
return ret; return 0;
return execute_steps(dev, read_table_steps, return execute_steps(dev, read_table_steps,
ARRAY_SIZE(read_table_steps)); ARRAY_SIZE(read_table_steps));
@ -3129,10 +3128,9 @@ static int opal_write_table(struct opal_dev *dev,
{ write_table_data, rw_tbl }, { write_table_data, rw_tbl },
{ end_opal_session, } { end_opal_session, }
}; };
int ret = 0;
if (!rw_tbl->size) if (!rw_tbl->size)
return ret; return 0;
return execute_steps(dev, write_table_steps, return execute_steps(dev, write_table_steps,
ARRAY_SIZE(write_table_steps)); ARRAY_SIZE(write_table_steps));

View File

@ -12,14 +12,14 @@
#include <net/checksum.h> #include <net/checksum.h>
#include <asm/unaligned.h> #include <asm/unaligned.h>
typedef __be16 (csum_fn) (void *, unsigned int); typedef __be16 (csum_fn) (__be16, void *, unsigned int);
static __be16 t10_pi_crc_fn(void *data, unsigned int len) static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len)
{ {
return cpu_to_be16(crc_t10dif(data, len)); return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len));
} }
static __be16 t10_pi_ip_fn(void *data, unsigned int len) static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len)
{ {
return (__force __be16)ip_compute_csum(data, len); return (__force __be16)ip_compute_csum(data, len);
} }
@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
csum_fn *fn, enum t10_dif_type type) csum_fn *fn, enum t10_dif_type type)
{ {
u8 offset = iter->pi_offset;
unsigned int i; unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) { for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf; struct t10_pi_tuple *pi = iter->prot_buf + offset;
pi->guard_tag = fn(iter->data_buf, iter->interval); pi->guard_tag = fn(0, iter->data_buf, iter->interval);
if (offset)
pi->guard_tag = fn(pi->guard_tag, iter->prot_buf,
offset);
pi->app_tag = 0; pi->app_tag = 0;
if (type == T10_PI_TYPE1_PROTECTION) if (type == T10_PI_TYPE1_PROTECTION)
@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
csum_fn *fn, enum t10_dif_type type) csum_fn *fn, enum t10_dif_type type)
{ {
u8 offset = iter->pi_offset;
unsigned int i; unsigned int i;
BUG_ON(type == T10_PI_TYPE0_PROTECTION); BUG_ON(type == T10_PI_TYPE0_PROTECTION);
for (i = 0 ; i < iter->data_size ; i += iter->interval) { for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf; struct t10_pi_tuple *pi = iter->prot_buf + offset;
__be16 csum; __be16 csum;
if (type == T10_PI_TYPE1_PROTECTION || if (type == T10_PI_TYPE1_PROTECTION ||
@ -83,7 +88,9 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
goto next; goto next;
} }
csum = fn(iter->data_buf, iter->interval); csum = fn(0, iter->data_buf, iter->interval);
if (offset)
csum = fn(csum, iter->prot_buf, offset);
if (pi->guard_tag != csum) { if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \ pr_err("%s: guard tag error at sector %llu " \
@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
*/ */
static void t10_pi_type1_prepare(struct request *rq) static void t10_pi_type1_prepare(struct request *rq)
{ {
const int tuple_sz = rq->q->integrity.tuple_size; struct blk_integrity *bi = &rq->q->integrity;
const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq); u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio; struct bio *bio;
__rq_for_each_bio(bio, rq) { __rq_for_each_bio(bio, rq) {
@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq)
p = bvec_kmap_local(&iv); p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) { for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct t10_pi_tuple *pi = p; struct t10_pi_tuple *pi = p + offset;
if (be32_to_cpu(pi->ref_tag) == virt) if (be32_to_cpu(pi->ref_tag) == virt)
pi->ref_tag = cpu_to_be32(ref_tag); pi->ref_tag = cpu_to_be32(ref_tag);
@ -183,9 +192,11 @@ static void t10_pi_type1_prepare(struct request *rq)
*/ */
static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{ {
unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; struct blk_integrity *bi = &rq->q->integrity;
const int tuple_sz = rq->q->integrity.tuple_size; unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq); u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio; struct bio *bio;
__rq_for_each_bio(bio, rq) { __rq_for_each_bio(bio, rq) {
@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
p = bvec_kmap_local(&iv); p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct t10_pi_tuple *pi = p; struct t10_pi_tuple *pi = p + offset;
if (be32_to_cpu(pi->ref_tag) == ref_tag) if (be32_to_cpu(pi->ref_tag) == ref_tag)
pi->ref_tag = cpu_to_be32(virt); pi->ref_tag = cpu_to_be32(virt);
@ -280,20 +291,24 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
}; };
EXPORT_SYMBOL(t10_pi_type3_ip); EXPORT_SYMBOL(t10_pi_type3_ip);
static __be64 ext_pi_crc64(void *data, unsigned int len) static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{ {
return cpu_to_be64(crc64_rocksoft(data, len)); return cpu_to_be64(crc64_rocksoft_update(crc, data, len));
} }
static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
enum t10_dif_type type) enum t10_dif_type type)
{ {
u8 offset = iter->pi_offset;
unsigned int i; unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) { for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf; struct crc64_pi_tuple *pi = iter->prot_buf + offset;
pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval); pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag),
iter->prot_buf, offset);
pi->app_tag = 0; pi->app_tag = 0;
if (type == T10_PI_TYPE1_PROTECTION) if (type == T10_PI_TYPE1_PROTECTION)
@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag)
static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
enum t10_dif_type type) enum t10_dif_type type)
{ {
u8 offset = iter->pi_offset;
unsigned int i; unsigned int i;
for (i = 0; i < iter->data_size; i += iter->interval) { for (i = 0; i < iter->data_size; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf; struct crc64_pi_tuple *pi = iter->prot_buf + offset;
u64 ref, seed; u64 ref, seed;
__be64 csum; __be64 csum;
@ -343,7 +359,11 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
goto next; goto next;
} }
csum = ext_pi_crc64(iter->data_buf, iter->interval); csum = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf,
offset);
if (pi->guard_tag != csum) { if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \ pr_err("%s: guard tag error at sector %llu " \
"(rcvd %016llx, want %016llx)\n", "(rcvd %016llx, want %016llx)\n",
@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_prepare(struct request *rq)
{ {
const int tuple_sz = rq->q->integrity.tuple_size; struct blk_integrity *bi = &rq->q->integrity;
const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq); u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio; struct bio *bio;
__rq_for_each_bio(bio, rq) { __rq_for_each_bio(bio, rq) {
@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq)
p = bvec_kmap_local(&iv); p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) { for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct crc64_pi_tuple *pi = p; struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag); u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == virt) if (ref == virt)
@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq)
static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{ {
unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; struct blk_integrity *bi = &rq->q->integrity;
const int tuple_sz = rq->q->integrity.tuple_size; unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq); u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio; struct bio *bio;
__rq_for_each_bio(bio, rq) { __rq_for_each_bio(bio, rq) {
@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
p = bvec_kmap_local(&iv); p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct crc64_pi_tuple *pi = p; struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag); u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == ref_tag) if (ref == ref_tag)

View File

@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; }
#endif #endif
#ifdef CONFIG_BLOCK #ifdef CONFIG_BLOCK
extern struct class block_class; extern const struct class block_class;
static inline bool is_blockdev(struct device *dev) static inline bool is_blockdev(struct device *dev)
{ {
return dev->class == &block_class; return dev->class == &block_class;

View File

@ -1779,7 +1779,7 @@ static int fd_alloc_disk(int drive, int system)
struct gendisk *disk; struct gendisk *disk;
int err; int err;
disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL);
if (IS_ERR(disk)) if (IS_ERR(disk))
return PTR_ERR(disk); return PTR_ERR(disk);

View File

@ -24,8 +24,8 @@ static DEFINE_MUTEX(aoeblk_mutex);
static struct kmem_cache *buf_pool_cache; static struct kmem_cache *buf_pool_cache;
static struct dentry *aoe_debugfs_dir; static struct dentry *aoe_debugfs_dir;
/* GPFS needs a larger value than the default. */ /* random default picked from the historic block max_sectors cap */
static int aoe_maxsectors; static int aoe_maxsectors = 2560;
module_param(aoe_maxsectors, int, 0644); module_param(aoe_maxsectors, int, 0644);
MODULE_PARM_DESC(aoe_maxsectors, MODULE_PARM_DESC(aoe_maxsectors,
"When nonzero, set the maximum number of sectors per I/O request"); "When nonzero, set the maximum number of sectors per I/O request");
@ -334,6 +334,10 @@ aoeblk_gdalloc(void *vp)
mempool_t *mp; mempool_t *mp;
struct blk_mq_tag_set *set; struct blk_mq_tag_set *set;
sector_t ssize; sector_t ssize;
struct queue_limits lim = {
.max_hw_sectors = aoe_maxsectors,
.io_opt = SZ_2M,
};
ulong flags; ulong flags;
int late = 0; int late = 0;
int err; int err;
@ -371,7 +375,7 @@ aoeblk_gdalloc(void *vp)
goto err_mempool; goto err_mempool;
} }
gd = blk_mq_alloc_disk(set, d); gd = blk_mq_alloc_disk(set, &lim, d);
if (IS_ERR(gd)) { if (IS_ERR(gd)) {
pr_err("aoe: cannot allocate block queue for %ld.%d\n", pr_err("aoe: cannot allocate block queue for %ld.%d\n",
d->aoemajor, d->aoeminor); d->aoemajor, d->aoeminor);
@ -384,14 +388,9 @@ aoeblk_gdalloc(void *vp)
WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->flags & DEVFL_TKILL);
WARN_ON(d->gd); WARN_ON(d->gd);
WARN_ON(d->flags & DEVFL_UP); WARN_ON(d->flags & DEVFL_UP);
/* random number picked from the history block max_sectors cap */
blk_queue_max_hw_sectors(gd->queue, 2560u);
blk_queue_io_opt(gd->queue, SZ_2M);
d->bufpool = mp; d->bufpool = mp;
d->blkq = gd->queue; d->blkq = gd->queue;
d->gd = gd; d->gd = gd;
if (aoe_maxsectors)
blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors);
gd->major = AOE_MAJOR; gd->major = AOE_MAJOR;
gd->first_minor = d->sysminor; gd->first_minor = d->sysminor;
gd->minors = AOE_PARTITIONS; gd->minors = AOE_PARTITIONS;

View File

@ -419,13 +419,16 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
rcu_read_lock(); rcu_read_lock();
for_each_netdev_rcu(&init_net, ifp) { for_each_netdev_rcu(&init_net, ifp) {
dev_hold(ifp); dev_hold(ifp);
if (!is_aoe_netif(ifp)) if (!is_aoe_netif(ifp)) {
goto cont; dev_put(ifp);
continue;
}
skb = new_skb(sizeof *h + sizeof *ch); skb = new_skb(sizeof *h + sizeof *ch);
if (skb == NULL) { if (skb == NULL) {
printk(KERN_INFO "aoe: skb alloc failure\n"); printk(KERN_INFO "aoe: skb alloc failure\n");
goto cont; dev_put(ifp);
continue;
} }
skb_put(skb, sizeof *h + sizeof *ch); skb_put(skb, sizeof *h + sizeof *ch);
skb->dev = ifp; skb->dev = ifp;
@ -440,9 +443,6 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
h->major = cpu_to_be16(aoemajor); h->major = cpu_to_be16(aoemajor);
h->minor = aoeminor; h->minor = aoeminor;
h->cmd = AOECMD_CFG; h->cmd = AOECMD_CFG;
cont:
dev_put(ifp);
} }
rcu_read_unlock(); rcu_read_unlock();
} }

View File

@ -63,6 +63,7 @@ tx(int id) __must_hold(&txlock)
pr_warn("aoe: packet could not be sent on %s. %s\n", pr_warn("aoe: packet could not be sent on %s. %s\n",
ifp ? ifp->name : "netif", ifp ? ifp->name : "netif",
"consider increasing tx_queue_len"); "consider increasing tx_queue_len");
dev_put(ifp);
spin_lock_irq(&txlock); spin_lock_irq(&txlock);
} }
return 0; return 0;

View File

@ -1994,7 +1994,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
{ {
struct gendisk *disk; struct gendisk *disk;
disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL);
if (IS_ERR(disk)) if (IS_ERR(disk))
return PTR_ERR(disk); return PTR_ERR(disk);

View File

@ -318,6 +318,16 @@ static int brd_alloc(int i)
struct gendisk *disk; struct gendisk *disk;
char buf[DISK_NAME_LEN]; char buf[DISK_NAME_LEN];
int err = -ENOMEM; int err = -ENOMEM;
struct queue_limits lim = {
/*
* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
.physical_block_size = PAGE_SIZE,
};
list_for_each_entry(brd, &brd_devices, brd_list) list_for_each_entry(brd, &brd_devices, brd_list)
if (brd->brd_number == i) if (brd->brd_number == i)
@ -335,10 +345,11 @@ static int brd_alloc(int i)
debugfs_create_u64(buf, 0444, brd_debugfs_dir, debugfs_create_u64(buf, 0444, brd_debugfs_dir,
&brd->brd_nr_pages); &brd->brd_nr_pages);
disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (!disk) if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_free_dev; goto out_free_dev;
}
disk->major = RAMDISK_MAJOR; disk->major = RAMDISK_MAJOR;
disk->first_minor = i * max_part; disk->first_minor = i * max_part;
disk->minors = max_part; disk->minors = max_part;
@ -347,15 +358,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN); strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2); set_capacity(disk, rd_size * 2);
/*
* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
/* Tell the block layer that this is not a rotational device */ /* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);

View File

@ -2690,6 +2690,14 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
int id; int id;
int vnr = adm_ctx->volume; int vnr = adm_ctx->volume;
enum drbd_ret_code err = ERR_NOMEM; enum drbd_ret_code err = ERR_NOMEM;
struct queue_limits lim = {
/*
* Setting the max_hw_sectors to an odd value of 8kibyte here.
* This triggers a max_bio_size message upon first attach or
* connect.
*/
.max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8,
};
device = minor_to_device(minor); device = minor_to_device(minor);
if (device) if (device)
@ -2708,9 +2716,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device); drbd_init_set_defaults(device);
disk = blk_alloc_disk(NUMA_NO_NODE); disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (!disk) if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_no_disk; goto out_no_disk;
}
device->vdisk = disk; device->vdisk = disk;
device->rq_queue = disk->queue; device->rq_queue = disk->queue;
@ -2727,9 +2737,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
blk_queue_write_cache(disk->queue, true, true); blk_queue_write_cache(disk->queue, true, true);
/* Setting the max_hw_sectors to an odd value of 8kibyte here
This triggers a max_bio_size message upon first attach or connect */
blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8);
device->md_io.page = alloc_page(GFP_KERNEL); device->md_io.page = alloc_page(GFP_KERNEL);
if (!device->md_io.page) if (!device->md_io.page)

View File

@ -1189,9 +1189,31 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
return 0; return 0;
} }
static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) static unsigned int drbd_max_peer_bio_size(struct drbd_device *device)
{ {
q->limits.discard_granularity = granularity; /*
* We may ignore peer limits if the peer is modern enough. From 8.3.8
* onwards the peer can use multiple BIOs for a single peer_request.
*/
if (device->state.conn < C_WF_REPORT_PARAMS)
return device->peer_max_bio_size;
if (first_peer_device(device)->connection->agreed_pro_version < 94)
return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/*
* Correct old drbd (up to 8.3.7) if it believes it can do more than
* 32KiB.
*/
if (first_peer_device(device)->connection->agreed_pro_version == 94)
return DRBD_MAX_SIZE_H80_PACKET;
/*
* drbd 8.3.8 onwards, before 8.4.0
*/
if (first_peer_device(device)->connection->agreed_pro_version < 100)
return DRBD_MAX_BIO_SIZE_P95;
return DRBD_MAX_BIO_SIZE;
} }
static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
@ -1204,24 +1226,81 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
return AL_EXTENT_SIZE >> 9; return AL_EXTENT_SIZE >> 9;
} }
static void decide_on_discard_support(struct drbd_device *device, static bool drbd_discard_supported(struct drbd_connection *connection,
struct drbd_backing_dev *bdev) struct drbd_backing_dev *bdev)
{ {
struct drbd_connection *connection =
first_peer_device(device)->connection;
struct request_queue *q = device->rq_queue;
unsigned int max_discard_sectors;
if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
goto not_supported; return false;
if (connection->cstate >= C_CONNECTED && if (connection->cstate >= C_CONNECTED &&
!(connection->agreed_features & DRBD_FF_TRIM)) { !(connection->agreed_features & DRBD_FF_TRIM)) {
drbd_info(connection, drbd_info(connection,
"peer DRBD too old, does not support TRIM: disabling discards\n"); "peer DRBD too old, does not support TRIM: disabling discards\n");
goto not_supported; return false;
} }
return true;
}
/* This is the workaround for "bio would need to, but cannot, be split" */
static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device)
{
unsigned int max_segments;
rcu_read_lock();
max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
rcu_read_unlock();
if (!max_segments)
return BLK_MAX_SEGMENTS;
return max_segments;
}
void drbd_reconsider_queue_parameters(struct drbd_device *device,
struct drbd_backing_dev *bdev, struct o_qlim *o)
{
struct drbd_connection *connection =
first_peer_device(device)->connection;
struct request_queue * const q = device->rq_queue;
unsigned int now = queue_max_hw_sectors(q) << 9;
struct queue_limits lim;
struct request_queue *b = NULL;
unsigned int new;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
device->local_max_bio_size =
queue_max_hw_sectors(b) << SECTOR_SHIFT;
}
/*
* We may later detach and re-attach on a disconnected Primary. Avoid
* decreasing the value in this case.
*
* We want to store what we know the peer DRBD can handle, not what the
* peer IO backend can handle.
*/
new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size,
max(drbd_max_peer_bio_size(device), device->peer_max_bio_size));
if (new != now) {
if (device->state.role == R_PRIMARY && new < now)
drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n",
new, now);
drbd_info(device, "max BIO size = %u\n", new);
}
lim = queue_limits_start_update(q);
if (bdev) {
blk_set_stacking_limits(&lim);
lim.max_segments = drbd_backing_dev_max_segments(device);
} else {
lim.max_segments = BLK_MAX_SEGMENTS;
}
lim.max_hw_sectors = new >> SECTOR_SHIFT;
lim.seg_boundary_mask = PAGE_SIZE - 1;
/* /*
* We don't care for the granularity, really. * We don't care for the granularity, really.
* *
@ -1230,123 +1309,36 @@ static void decide_on_discard_support(struct drbd_device *device,
* problem, really. If you care, you need to use devices with similar * problem, really. If you care, you need to use devices with similar
* topology on all peers. * topology on all peers.
*/ */
blk_queue_discard_granularity(q, 512); if (drbd_discard_supported(connection, bdev)) {
max_discard_sectors = drbd_max_discard_sectors(connection); lim.discard_granularity = 512;
blk_queue_max_discard_sectors(q, max_discard_sectors); lim.max_hw_discard_sectors =
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); drbd_max_discard_sectors(connection);
return; } else {
lim.discard_granularity = 0;
not_supported: lim.max_hw_discard_sectors = 0;
blk_queue_discard_granularity(q, 0);
blk_queue_max_discard_sectors(q, 0);
} }
static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) if (bdev)
{ blk_stack_limits(&lim, &b->limits, 0);
/* Fixup max_write_zeroes_sectors after blk_stack_limits():
* if we can handle "zeroes" efficiently on the protocol, /*
* we want to do that, even if our backend does not announce * If we can handle "zeroes" efficiently on the protocol, we want to do
* max_write_zeroes_sectors itself. */ * that, even if our backend does not announce max_write_zeroes_sectors
struct drbd_connection *connection = first_peer_device(device)->connection; * itself.
/* If the peer announces WZEROES support, use it. Otherwise, rather */
* send explicit zeroes than rely on some discard-zeroes-data magic. */
if (connection->agreed_features & DRBD_FF_WZEROES) if (connection->agreed_features & DRBD_FF_WZEROES)
q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
else else
q->limits.max_write_zeroes_sectors = 0; lim.max_write_zeroes_sectors = 0;
if ((lim.discard_granularity >> SECTOR_SHIFT) >
lim.max_hw_discard_sectors) {
lim.discard_granularity = 0;
lim.max_hw_discard_sectors = 0;
} }
static void fixup_discard_support(struct drbd_device *device, struct request_queue *q) if (queue_limits_commit_update(q, &lim))
{ drbd_err(device, "setting new queue limits failed\n");
unsigned int max_discard = device->rq_queue->limits.max_discard_sectors;
unsigned int discard_granularity =
device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT;
if (discard_granularity > max_discard) {
blk_queue_discard_granularity(q, 0);
blk_queue_max_discard_sectors(q, 0);
}
}
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
unsigned int max_bio_size, struct o_qlim *o)
{
struct request_queue * const q = device->rq_queue;
unsigned int max_hw_sectors = max_bio_size >> 9;
unsigned int max_segments = 0;
struct request_queue *b = NULL;
struct disk_conf *dc;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
rcu_read_lock();
dc = rcu_dereference(device->ldev->disk_conf);
max_segments = dc->max_bio_bvecs;
rcu_read_unlock();
blk_set_stacking_limits(&q->limits);
}
blk_queue_max_hw_sectors(q, max_hw_sectors);
/* This is the workaround for "bio would need to, but cannot, be split" */
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
blk_queue_segment_boundary(q, PAGE_SIZE-1);
decide_on_discard_support(device, bdev);
if (b) {
blk_stack_limits(&q->limits, &b->limits, 0);
disk_update_readahead(device->vdisk);
}
fixup_write_zeroes(device, q);
fixup_discard_support(device, q);
}
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
{
unsigned int now, new, local, peer;
now = queue_max_hw_sectors(device->rq_queue) << 9;
local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
if (bdev) {
local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
device->local_max_bio_size = local;
}
local = min(local, DRBD_MAX_BIO_SIZE);
/* We may ignore peer limits if the peer is modern enough.
Because new from 8.3.8 onwards the peer can use multiple
BIOs for a single peer_request */
if (device->state.conn >= C_WF_REPORT_PARAMS) {
if (first_peer_device(device)->connection->agreed_pro_version < 94)
peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
else if (first_peer_device(device)->connection->agreed_pro_version == 94)
peer = DRBD_MAX_SIZE_H80_PACKET;
else if (first_peer_device(device)->connection->agreed_pro_version < 100)
peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
else
peer = DRBD_MAX_BIO_SIZE;
/* We may later detach and re-attach on a disconnected Primary.
* Avoid this setting to jump back in that case.
* We want to store what we know the peer DRBD can handle,
* not what the peer IO backend can handle. */
if (peer > device->peer_max_bio_size)
device->peer_max_bio_size = peer;
}
new = min(local, peer);
if (device->state.role == R_PRIMARY && new < now)
drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
if (new != now)
drbd_info(device, "max BIO size = %u\n", new);
drbd_setup_queue_param(device, bdev, new, o);
} }
/* Starts the worker thread */ /* Starts the worker thread */

View File

@ -1542,9 +1542,10 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
int notify_resource_state_change(struct sk_buff *skb, int notify_resource_state_change(struct sk_buff *skb,
unsigned int seq, unsigned int seq,
struct drbd_resource_state_change *resource_state_change, void *state_change,
enum drbd_notification_type type) enum drbd_notification_type type)
{ {
struct drbd_resource_state_change *resource_state_change = state_change;
struct drbd_resource *resource = resource_state_change->resource; struct drbd_resource *resource = resource_state_change->resource;
struct resource_info resource_info = { struct resource_info resource_info = {
.res_role = resource_state_change->role[NEW], .res_role = resource_state_change->role[NEW],
@ -1558,13 +1559,14 @@ int notify_resource_state_change(struct sk_buff *skb,
int notify_connection_state_change(struct sk_buff *skb, int notify_connection_state_change(struct sk_buff *skb,
unsigned int seq, unsigned int seq,
struct drbd_connection_state_change *connection_state_change, void *state_change,
enum drbd_notification_type type) enum drbd_notification_type type)
{ {
struct drbd_connection *connection = connection_state_change->connection; struct drbd_connection_state_change *p = state_change;
struct drbd_connection *connection = p->connection;
struct connection_info connection_info = { struct connection_info connection_info = {
.conn_connection_state = connection_state_change->cstate[NEW], .conn_connection_state = p->cstate[NEW],
.conn_role = connection_state_change->peer_role[NEW], .conn_role = p->peer_role[NEW],
}; };
return notify_connection_state(skb, seq, connection, &connection_info, type); return notify_connection_state(skb, seq, connection, &connection_info, type);
@ -1572,9 +1574,10 @@ int notify_connection_state_change(struct sk_buff *skb,
int notify_device_state_change(struct sk_buff *skb, int notify_device_state_change(struct sk_buff *skb,
unsigned int seq, unsigned int seq,
struct drbd_device_state_change *device_state_change, void *state_change,
enum drbd_notification_type type) enum drbd_notification_type type)
{ {
struct drbd_device_state_change *device_state_change = state_change;
struct drbd_device *device = device_state_change->device; struct drbd_device *device = device_state_change->device;
struct device_info device_info = { struct device_info device_info = {
.dev_disk_state = device_state_change->disk_state[NEW], .dev_disk_state = device_state_change->disk_state[NEW],
@ -1585,9 +1588,10 @@ int notify_device_state_change(struct sk_buff *skb,
int notify_peer_device_state_change(struct sk_buff *skb, int notify_peer_device_state_change(struct sk_buff *skb,
unsigned int seq, unsigned int seq,
struct drbd_peer_device_state_change *p, void *state_change,
enum drbd_notification_type type) enum drbd_notification_type type)
{ {
struct drbd_peer_device_state_change *p = state_change;
struct drbd_peer_device *peer_device = p->peer_device; struct drbd_peer_device *peer_device = p->peer_device;
struct peer_device_info peer_device_info = { struct peer_device_info peer_device_info = {
.peer_repl_state = p->repl_state[NEW], .peer_repl_state = p->repl_state[NEW],
@ -1605,8 +1609,8 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
bool resource_state_has_changed; bool resource_state_has_changed;
unsigned int n_device, n_connection, n_peer_device, n_peer_devices; unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
int (*last_func)(struct sk_buff *, unsigned int, void *, int (*last_func)(struct sk_buff *, unsigned int,
enum drbd_notification_type) = NULL; void *, enum drbd_notification_type) = NULL;
void *last_arg = NULL; void *last_arg = NULL;
#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) #define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
@ -1616,7 +1620,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
}) })
#define REMEMBER_STATE_CHANGE(func, arg, type) \ #define REMEMBER_STATE_CHANGE(func, arg, type) \
({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
last_func = (typeof(last_func))func; \ last_func = func; \
last_arg = arg; \ last_arg = arg; \
}) })

View File

@ -46,19 +46,19 @@ extern void forget_state_change(struct drbd_state_change *);
extern int notify_resource_state_change(struct sk_buff *, extern int notify_resource_state_change(struct sk_buff *,
unsigned int, unsigned int,
struct drbd_resource_state_change *, void *,
enum drbd_notification_type type); enum drbd_notification_type type);
extern int notify_connection_state_change(struct sk_buff *, extern int notify_connection_state_change(struct sk_buff *,
unsigned int, unsigned int,
struct drbd_connection_state_change *, void *,
enum drbd_notification_type type); enum drbd_notification_type type);
extern int notify_device_state_change(struct sk_buff *, extern int notify_device_state_change(struct sk_buff *,
unsigned int, unsigned int,
struct drbd_device_state_change *, void *,
enum drbd_notification_type type); enum drbd_notification_type type);
extern int notify_peer_device_state_change(struct sk_buff *, extern int notify_peer_device_state_change(struct sk_buff *,
unsigned int, unsigned int,
struct drbd_peer_device_state_change *, void *,
enum drbd_notification_type type); enum drbd_notification_type type);
#endif /* DRBD_STATE_CHANGE_H */ #endif /* DRBD_STATE_CHANGE_H */

View File

@ -530,14 +530,13 @@ static struct format_descr format_req;
static char *floppy_track_buffer; static char *floppy_track_buffer;
static int max_buffer_sectors; static int max_buffer_sectors;
typedef void (*done_f)(int);
static const struct cont_t { static const struct cont_t {
void (*interrupt)(void); void (*interrupt)(void);
/* this is called after the interrupt of the /* this is called after the interrupt of the
* main command */ * main command */
void (*redo)(void); /* this is called to retry the operation */ void (*redo)(void); /* this is called to retry the operation */
void (*error)(void); /* this is called to tally an error */ void (*error)(void); /* this is called to tally an error */
done_f done; /* this is called to say if the operation has void (*done)(int); /* this is called to say if the operation has
* succeeded/failed */ * succeeded/failed */
} *cont; } *cont;
@ -985,6 +984,10 @@ static void empty(void)
{ {
} }
static void empty_done(int result)
{
}
static void (*floppy_work_fn)(void); static void (*floppy_work_fn)(void);
static void floppy_work_workfn(struct work_struct *work) static void floppy_work_workfn(struct work_struct *work)
@ -1998,14 +2001,14 @@ static const struct cont_t wakeup_cont = {
.interrupt = empty, .interrupt = empty,
.redo = do_wakeup, .redo = do_wakeup,
.error = empty, .error = empty,
.done = (done_f)empty .done = empty_done,
}; };
static const struct cont_t intr_cont = { static const struct cont_t intr_cont = {
.interrupt = empty, .interrupt = empty,
.redo = process_fd_request, .redo = process_fd_request,
.error = empty, .error = empty,
.done = (done_f)empty .done = empty_done,
}; };
/* schedules handler, waiting for completion. May be interrupted, will then /* schedules handler, waiting for completion. May be interrupted, will then
@ -4513,13 +4516,15 @@ static bool floppy_available(int drive)
static int floppy_alloc_disk(unsigned int drive, unsigned int type) static int floppy_alloc_disk(unsigned int drive, unsigned int type)
{ {
struct queue_limits lim = {
.max_hw_sectors = 64,
};
struct gendisk *disk; struct gendisk *disk;
disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); disk = blk_mq_alloc_disk(&tag_sets[drive], &lim, NULL);
if (IS_ERR(disk)) if (IS_ERR(disk))
return PTR_ERR(disk); return PTR_ERR(disk);
blk_queue_max_hw_sectors(disk->queue, 64);
disk->major = FLOPPY_MAJOR; disk->major = FLOPPY_MAJOR;
disk->first_minor = TOMINOR(drive) | (type << 2); disk->first_minor = TOMINOR(drive) | (type << 2);
disk->minors = 1; disk->minors = 1;

View File

@ -750,12 +750,13 @@ static void loop_sysfs_exit(struct loop_device *lo)
&loop_attribute_group); &loop_attribute_group);
} }
static void loop_config_discard(struct loop_device *lo) static void loop_config_discard(struct loop_device *lo,
struct queue_limits *lim)
{ {
struct file *file = lo->lo_backing_file; struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host; struct inode *inode = file->f_mapping->host;
struct request_queue *q = lo->lo_queue; u32 granularity = 0, max_discard_sectors = 0;
u32 granularity, max_discard_sectors; struct kstatfs sbuf;
/* /*
* If the backing device is a block device, mirror its zeroing * If the backing device is a block device, mirror its zeroing
@ -775,29 +776,17 @@ static void loop_config_discard(struct loop_device *lo)
* We use punch hole to reclaim the free space used by the * We use punch hole to reclaim the free space used by the
* image a.k.a. discard. * image a.k.a. discard.
*/ */
} else if (!file->f_op->fallocate) { } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) {
max_discard_sectors = 0;
granularity = 0;
} else {
struct kstatfs sbuf;
max_discard_sectors = UINT_MAX >> 9; max_discard_sectors = UINT_MAX >> 9;
if (!vfs_statfs(&file->f_path, &sbuf))
granularity = sbuf.f_bsize; granularity = sbuf.f_bsize;
else
max_discard_sectors = 0;
} }
if (max_discard_sectors) { lim->max_hw_discard_sectors = max_discard_sectors;
q->limits.discard_granularity = granularity; lim->max_write_zeroes_sectors = max_discard_sectors;
blk_queue_max_discard_sectors(q, max_discard_sectors); if (max_discard_sectors)
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); lim->discard_granularity = granularity;
} else { else
q->limits.discard_granularity = 0; lim->discard_granularity = 0;
blk_queue_max_discard_sectors(q, 0);
blk_queue_max_write_zeroes_sectors(q, 0);
}
} }
struct loop_worker { struct loop_worker {
@ -986,6 +975,20 @@ loop_set_status_from_info(struct loop_device *lo,
return 0; return 0;
} }
static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize,
bool update_discard_settings)
{
struct queue_limits lim;
lim = queue_limits_start_update(lo->lo_queue);
lim.logical_block_size = bsize;
lim.physical_block_size = bsize;
lim.io_min = bsize;
if (update_discard_settings)
loop_config_discard(lo, &lim);
return queue_limits_commit_update(lo->lo_queue, &lim);
}
static int loop_configure(struct loop_device *lo, blk_mode_t mode, static int loop_configure(struct loop_device *lo, blk_mode_t mode,
struct block_device *bdev, struct block_device *bdev,
const struct loop_config *config) const struct loop_config *config)
@ -1083,11 +1086,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
else else
bsize = 512; bsize = 512;
blk_queue_logical_block_size(lo->lo_queue, bsize); error = loop_reconfigure_limits(lo, bsize, true);
blk_queue_physical_block_size(lo->lo_queue, bsize); if (WARN_ON_ONCE(error))
blk_queue_io_min(lo->lo_queue, bsize); goto out_unlock;
loop_config_discard(lo);
loop_update_rotational(lo); loop_update_rotational(lo);
loop_update_dio(lo); loop_update_dio(lo);
loop_sysfs_init(lo); loop_sysfs_init(lo);
@ -1154,9 +1156,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
lo->lo_offset = 0; lo->lo_offset = 0;
lo->lo_sizelimit = 0; lo->lo_sizelimit = 0;
memset(lo->lo_file_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE);
blk_queue_logical_block_size(lo->lo_queue, 512); loop_reconfigure_limits(lo, 512, false);
blk_queue_physical_block_size(lo->lo_queue, 512);
blk_queue_io_min(lo->lo_queue, 512);
invalidate_disk(lo->lo_disk); invalidate_disk(lo->lo_disk);
loop_sysfs_exit(lo); loop_sysfs_exit(lo);
/* let user-space know about this change */ /* let user-space know about this change */
@ -1488,9 +1488,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
invalidate_bdev(lo->lo_device); invalidate_bdev(lo->lo_device);
blk_mq_freeze_queue(lo->lo_queue); blk_mq_freeze_queue(lo->lo_queue);
blk_queue_logical_block_size(lo->lo_queue, arg); err = loop_reconfigure_limits(lo, arg, false);
blk_queue_physical_block_size(lo->lo_queue, arg);
blk_queue_io_min(lo->lo_queue, arg);
loop_update_dio(lo); loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue); blk_mq_unfreeze_queue(lo->lo_queue);
@ -1982,6 +1980,12 @@ static const struct blk_mq_ops loop_mq_ops = {
static int loop_add(int i) static int loop_add(int i)
{ {
struct queue_limits lim = {
/*
* Random number picked from the historic block max_sectors cap.
*/
.max_hw_sectors = 2560u,
};
struct loop_device *lo; struct loop_device *lo;
struct gendisk *disk; struct gendisk *disk;
int err; int err;
@ -2025,16 +2029,13 @@ static int loop_add(int i)
if (err) if (err)
goto out_free_idr; goto out_free_idr;
disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo);
if (IS_ERR(disk)) { if (IS_ERR(disk)) {
err = PTR_ERR(disk); err = PTR_ERR(disk);
goto out_cleanup_tags; goto out_cleanup_tags;
} }
lo->lo_queue = lo->lo_disk->queue; lo->lo_queue = lo->lo_disk->queue;
/* random number picked from the history block max_sectors cap */
blk_queue_max_hw_sectors(lo->lo_queue, 2560u);
/* /*
* By default, we do buffer IO, so it doesn't make sense to enable * By default, we do buffer IO, so it doesn't make sense to enable
* merge because the I/O submitted to backing file is handled page by * merge because the I/O submitted to backing file is handled page by

View File

@ -3401,6 +3401,12 @@ static const struct blk_mq_ops mtip_mq_ops = {
*/ */
static int mtip_block_initialize(struct driver_data *dd) static int mtip_block_initialize(struct driver_data *dd)
{ {
struct queue_limits lim = {
.physical_block_size = 4096,
.max_hw_sectors = 0xffff,
.max_segments = MTIP_MAX_SG,
.max_segment_size = 0x400000,
};
int rv = 0, wait_for_rebuild = 0; int rv = 0, wait_for_rebuild = 0;
sector_t capacity; sector_t capacity;
unsigned int index = 0; unsigned int index = 0;
@ -3431,7 +3437,7 @@ static int mtip_block_initialize(struct driver_data *dd)
goto block_queue_alloc_tag_error; goto block_queue_alloc_tag_error;
} }
dd->disk = blk_mq_alloc_disk(&dd->tags, dd); dd->disk = blk_mq_alloc_disk(&dd->tags, &lim, dd);
if (IS_ERR(dd->disk)) { if (IS_ERR(dd->disk)) {
dev_err(&dd->pdev->dev, dev_err(&dd->pdev->dev,
"Unable to allocate request queue\n"); "Unable to allocate request queue\n");
@ -3481,12 +3487,7 @@ skip_create_disk:
/* Set device limits. */ /* Set device limits. */
blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
blk_queue_physical_block_size(dd->queue, 4096);
blk_queue_max_hw_sectors(dd->queue, 0xffff);
blk_queue_max_segment_size(dd->queue, 0x400000);
dma_set_max_seg_size(&dd->pdev->dev, 0x400000); dma_set_max_seg_size(&dd->pdev->dev, 0x400000);
blk_queue_io_min(dd->queue, 4096);
/* Set the capacity of the device in 512 byte sectors. */ /* Set the capacity of the device in 512 byte sectors. */
if (!(mtip_hw_get_capacity(dd, &capacity))) { if (!(mtip_hw_get_capacity(dd, &capacity))) {

View File

@ -114,6 +114,10 @@ static const struct block_device_operations n64cart_fops = {
*/ */
static int __init n64cart_probe(struct platform_device *pdev) static int __init n64cart_probe(struct platform_device *pdev)
{ {
struct queue_limits lim = {
.physical_block_size = 4096,
.logical_block_size = 4096,
};
struct gendisk *disk; struct gendisk *disk;
int err = -ENOMEM; int err = -ENOMEM;
@ -131,9 +135,11 @@ static int __init n64cart_probe(struct platform_device *pdev)
if (IS_ERR(reg_base)) if (IS_ERR(reg_base))
return PTR_ERR(reg_base); return PTR_ERR(reg_base);
disk = blk_alloc_disk(NUMA_NO_NODE); disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (!disk) if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out; goto out;
}
disk->first_minor = 0; disk->first_minor = 0;
disk->flags = GENHD_FL_NO_PART; disk->flags = GENHD_FL_NO_PART;
@ -145,8 +151,6 @@ static int __init n64cart_probe(struct platform_device *pdev)
set_disk_ro(disk, 1); set_disk_ro(disk, 1);
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_physical_block_size(disk->queue, 4096);
blk_queue_logical_block_size(disk->queue, 4096);
err = add_disk(disk); err = add_disk(disk);
if (err) if (err)

View File

@ -316,9 +316,12 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0; nsock->sent = 0;
} }
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize) loff_t blksize)
{ {
struct queue_limits lim;
int error;
if (!blksize) if (!blksize)
blksize = 1u << NBD_DEF_BLKSIZE_BITS; blksize = 1u << NBD_DEF_BLKSIZE_BITS;
@ -334,10 +337,16 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
if (!nbd->pid) if (!nbd->pid)
return 0; return 0;
lim = queue_limits_start_update(nbd->disk->queue);
if (nbd->config->flags & NBD_FLAG_SEND_TRIM) if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); lim.max_hw_discard_sectors = UINT_MAX;
blk_queue_logical_block_size(nbd->disk->queue, blksize); else
blk_queue_physical_block_size(nbd->disk->queue, blksize); lim.max_hw_discard_sectors = 0;
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
if (error)
return error;
if (max_part) if (max_part)
set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
@ -346,6 +355,18 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
return 0; return 0;
} }
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
{
int error;
blk_mq_freeze_queue(nbd->disk->queue);
error = __nbd_set_size(nbd, bytesize, blksize);
blk_mq_unfreeze_queue(nbd->disk->queue);
return error;
}
static void nbd_complete_rq(struct request *req) static void nbd_complete_rq(struct request *req)
{ {
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
@ -1351,7 +1372,6 @@ static void nbd_config_put(struct nbd_device *nbd)
nbd->config = NULL; nbd->config = NULL;
nbd->tag_set.timeout = 0; nbd->tag_set.timeout = 0;
blk_queue_max_discard_sectors(nbd->disk->queue, 0);
mutex_unlock(&nbd->config_lock); mutex_unlock(&nbd->config_lock);
nbd_put(nbd); nbd_put(nbd);
@ -1783,6 +1803,12 @@ static const struct blk_mq_ops nbd_mq_ops = {
static struct nbd_device *nbd_dev_add(int index, unsigned int refs) static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{ {
struct queue_limits lim = {
.max_hw_sectors = 65536,
.max_user_sectors = 256,
.max_segments = USHRT_MAX,
.max_segment_size = UINT_MAX,
};
struct nbd_device *nbd; struct nbd_device *nbd;
struct gendisk *disk; struct gendisk *disk;
int err = -ENOMEM; int err = -ENOMEM;
@ -1823,7 +1849,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
if (err < 0) if (err < 0)
goto out_free_tags; goto out_free_tags;
disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
if (IS_ERR(disk)) { if (IS_ERR(disk)) {
err = PTR_ERR(disk); err = PTR_ERR(disk);
goto out_free_idr; goto out_free_idr;
@ -1843,11 +1869,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
* Tell the block layer that we are not a rotational device * Tell the block layer that we are not a rotational device
*/ */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_max_discard_sectors(disk->queue, 0);
blk_queue_max_segment_size(disk->queue, UINT_MAX);
blk_queue_max_segments(disk->queue, USHRT_MAX);
blk_queue_max_hw_sectors(disk->queue, 65536);
disk->queue->limits.max_sectors = 256;
mutex_init(&nbd->config_lock); mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0); refcount_set(&nbd->config_refs, 0);
@ -2433,6 +2454,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
} }
dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST); dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
if (!dev_list) {
nlmsg_free(reply);
ret = -EMSGSIZE;
goto out;
}
if (index == -1) { if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &status_cb, reply); ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
if (ret) { if (ret) {

View File

@ -115,6 +115,18 @@ module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
#endif #endif
/*
* Historic queue modes.
*
* These days nothing but NULL_Q_MQ is actually supported, but we keep it the
* enum for error reporting.
*/
enum {
NULL_Q_BIO = 0,
NULL_Q_RQ = 1,
NULL_Q_MQ = 2,
};
static int g_queue_mode = NULL_Q_MQ; static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max) static int null_param_store_val(const char *str, int *val, int min, int max)
@ -165,8 +177,8 @@ static bool g_blocking;
module_param_named(blocking, g_blocking, bool, 0444); module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
static bool shared_tags; static bool g_shared_tags;
module_param(shared_tags, bool, 0444); module_param_named(shared_tags, g_shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
static bool g_shared_tag_bitmap; static bool g_shared_tag_bitmap;
@ -426,6 +438,7 @@ NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page) static ssize_t nullb_device_power_show(struct config_item *item, char *page)
@ -571,6 +584,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_offline, &nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary, &nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched, &nullb_device_attr_no_sched,
&nullb_device_attr_shared_tags,
&nullb_device_attr_shared_tag_bitmap, &nullb_device_attr_shared_tag_bitmap,
NULL, NULL,
}; };
@ -653,10 +667,11 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
"badblocks,blocking,blocksize,cache_size," "badblocks,blocking,blocksize,cache_size,"
"completion_nsec,discard,home_node,hw_queue_depth," "completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched," "irqmode,max_sectors,mbps,memory_backed,no_sched,"
"poll_queues,power,queue_mode,shared_tag_bitmap,size," "poll_queues,power,queue_mode,shared_tag_bitmap,"
"submit_queues,use_per_node_hctx,virt_boundary,zoned," "shared_tags,size,submit_queues,use_per_node_hctx,"
"zone_capacity,zone_max_active,zone_max_open," "virt_boundary,zoned,zone_capacity,zone_max_active,"
"zone_nr_conv,zone_offline,zone_readonly,zone_size\n"); "zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
"zone_size\n");
} }
CONFIGFS_ATTR_RO(memb_group_, features); CONFIGFS_ATTR_RO(memb_group_, features);
@ -738,6 +753,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_max_active = g_zone_max_active; dev->zone_max_active = g_zone_max_active;
dev->virt_boundary = g_virt_boundary; dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched; dev->no_sched = g_no_sched;
dev->shared_tags = g_shared_tags;
dev->shared_tag_bitmap = g_shared_tag_bitmap; dev->shared_tag_bitmap = g_shared_tag_bitmap;
return dev; return dev;
} }
@ -752,98 +768,11 @@ static void null_free_dev(struct nullb_device *dev)
kfree(dev); kfree(dev);
} }
static void put_tag(struct nullb_queue *nq, unsigned int tag)
{
clear_bit_unlock(tag, nq->tag_map);
if (waitqueue_active(&nq->wait))
wake_up(&nq->wait);
}
static unsigned int get_tag(struct nullb_queue *nq)
{
unsigned int tag;
do {
tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
if (tag >= nq->queue_depth)
return -1U;
} while (test_and_set_bit_lock(tag, nq->tag_map));
return tag;
}
static void free_cmd(struct nullb_cmd *cmd)
{
put_tag(cmd->nq, cmd->tag);
}
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
{
struct nullb_cmd *cmd;
unsigned int tag;
tag = get_tag(nq);
if (tag != -1U) {
cmd = &nq->cmds[tag];
cmd->tag = tag;
cmd->error = BLK_STS_OK;
cmd->nq = nq;
if (nq->dev->irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
}
return cmd;
}
return NULL;
}
static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio)
{
struct nullb_cmd *cmd;
DEFINE_WAIT(wait);
do {
/*
* This avoids multiple return statements, multiple calls to
* __alloc_cmd() and a fast path call to prepare_to_wait().
*/
cmd = __alloc_cmd(nq);
if (cmd) {
cmd->bio = bio;
return cmd;
}
prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
io_schedule();
finish_wait(&nq->wait, &wait);
} while (1);
}
static void end_cmd(struct nullb_cmd *cmd)
{
int queue_mode = cmd->nq->dev->queue_mode;
switch (queue_mode) {
case NULL_Q_MQ:
blk_mq_end_request(cmd->rq, cmd->error);
return;
case NULL_Q_BIO:
cmd->bio->bi_status = cmd->error;
bio_endio(cmd->bio);
break;
}
free_cmd(cmd);
}
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{ {
end_cmd(container_of(timer, struct nullb_cmd, timer)); struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error);
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
@ -856,7 +785,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
static void null_complete_rq(struct request *rq) static void null_complete_rq(struct request *rq)
{ {
end_cmd(blk_mq_rq_to_pdu(rq)); struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
blk_mq_end_request(rq, cmd->error);
} }
static struct nullb_page *null_alloc_page(void) static struct nullb_page *null_alloc_page(void)
@ -1273,7 +1204,7 @@ static int null_transfer(struct nullb *nullb, struct page *page,
static int null_handle_rq(struct nullb_cmd *cmd) static int null_handle_rq(struct nullb_cmd *cmd)
{ {
struct request *rq = cmd->rq; struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb; struct nullb *nullb = cmd->nq->dev->nullb;
int err; int err;
unsigned int len; unsigned int len;
@ -1298,63 +1229,21 @@ static int null_handle_rq(struct nullb_cmd *cmd)
return 0; return 0;
} }
static int null_handle_bio(struct nullb_cmd *cmd)
{
struct bio *bio = cmd->bio;
struct nullb *nullb = cmd->nq->dev->nullb;
int err;
unsigned int len;
sector_t sector = bio->bi_iter.bi_sector;
struct bio_vec bvec;
struct bvec_iter iter;
spin_lock_irq(&nullb->lock);
bio_for_each_segment(bvec, bio, iter) {
len = bvec.bv_len;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(bio_op(bio)), sector,
bio->bi_opf & REQ_FUA);
if (err) {
spin_unlock_irq(&nullb->lock);
return err;
}
sector += len >> SECTOR_SHIFT;
}
spin_unlock_irq(&nullb->lock);
return 0;
}
static void null_stop_queue(struct nullb *nullb)
{
struct request_queue *q = nullb->q;
if (nullb->dev->queue_mode == NULL_Q_MQ)
blk_mq_stop_hw_queues(q);
}
static void null_restart_queue_async(struct nullb *nullb)
{
struct request_queue *q = nullb->q;
if (nullb->dev->queue_mode == NULL_Q_MQ)
blk_mq_start_stopped_hw_queues(q, true);
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{ {
struct nullb_device *dev = cmd->nq->dev; struct nullb_device *dev = cmd->nq->dev;
struct nullb *nullb = dev->nullb; struct nullb *nullb = dev->nullb;
blk_status_t sts = BLK_STS_OK; blk_status_t sts = BLK_STS_OK;
struct request *rq = cmd->rq; struct request *rq = blk_mq_rq_from_pdu(cmd);
if (!hrtimer_active(&nullb->bw_timer)) if (!hrtimer_active(&nullb->bw_timer))
hrtimer_restart(&nullb->bw_timer); hrtimer_restart(&nullb->bw_timer);
if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
null_stop_queue(nullb); blk_mq_stop_hw_queues(nullb->q);
/* race with timer */ /* race with timer */
if (atomic_long_read(&nullb->cur_bytes) > 0) if (atomic_long_read(&nullb->cur_bytes) > 0)
null_restart_queue_async(nullb); blk_mq_start_stopped_hw_queues(nullb->q, true);
/* requeue request */ /* requeue request */
sts = BLK_STS_DEV_RESOURCE; sts = BLK_STS_DEV_RESOURCE;
} }
@ -1381,37 +1270,29 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
sector_t nr_sectors) sector_t nr_sectors)
{ {
struct nullb_device *dev = cmd->nq->dev; struct nullb_device *dev = cmd->nq->dev;
int err;
if (op == REQ_OP_DISCARD) if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors); return null_handle_discard(dev, sector, nr_sectors);
return errno_to_blk_status(null_handle_rq(cmd));
if (dev->queue_mode == NULL_Q_BIO)
err = null_handle_bio(cmd);
else
err = null_handle_rq(cmd);
return errno_to_blk_status(err);
} }
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
{ {
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb_device *dev = cmd->nq->dev; struct nullb_device *dev = cmd->nq->dev;
struct bio *bio; struct bio *bio;
if (dev->memory_backed) if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) {
return; __rq_for_each_bio(bio, rq)
if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
zero_fill_bio(cmd->bio);
} else if (req_op(cmd->rq) == REQ_OP_READ) {
__rq_for_each_bio(bio, cmd->rq)
zero_fill_bio(bio); zero_fill_bio(bio);
} }
} }
static inline void nullb_complete_cmd(struct nullb_cmd *cmd) static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{ {
struct request *rq = blk_mq_rq_from_pdu(cmd);
/* /*
* Since root privileges are required to configure the null_blk * Since root privileges are required to configure the null_blk
* driver, it is fine that this driver does not initialize the * driver, it is fine that this driver does not initialize the
@ -1425,20 +1306,10 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
/* Complete IO by inline, softirq or timer */ /* Complete IO by inline, softirq or timer */
switch (cmd->nq->dev->irqmode) { switch (cmd->nq->dev->irqmode) {
case NULL_IRQ_SOFTIRQ: case NULL_IRQ_SOFTIRQ:
switch (cmd->nq->dev->queue_mode) { blk_mq_complete_request(rq);
case NULL_Q_MQ:
blk_mq_complete_request(cmd->rq);
break;
case NULL_Q_BIO:
/*
* XXX: no proper submitting cpu information available.
*/
end_cmd(cmd);
break;
}
break; break;
case NULL_IRQ_NONE: case NULL_IRQ_NONE:
end_cmd(cmd); blk_mq_end_request(rq, cmd->error);
break; break;
case NULL_IRQ_TIMER: case NULL_IRQ_TIMER:
null_cmd_end_timer(cmd); null_cmd_end_timer(cmd);
@ -1499,7 +1370,7 @@ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
null_restart_queue_async(nullb); blk_mq_start_stopped_hw_queues(nullb->q, true);
hrtimer_forward_now(&nullb->bw_timer, timer_interval); hrtimer_forward_now(&nullb->bw_timer, timer_interval);
@ -1516,26 +1387,6 @@ static void nullb_setup_bwtimer(struct nullb *nullb)
hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
} }
static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
{
int index = 0;
if (nullb->nr_queues != 1)
index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
return &nullb->queues[index];
}
static void null_submit_bio(struct bio *bio)
{
sector_t sector = bio->bi_iter.bi_sector;
sector_t nr_sectors = bio_sectors(bio);
struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
struct nullb_queue *nq = nullb_to_queue(nullb);
null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio));
}
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static bool should_timeout_request(struct request *rq) static bool should_timeout_request(struct request *rq)
@ -1655,7 +1506,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
blk_rq_sectors(req)); blk_rq_sectors(req));
if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
blk_mq_end_request_batch)) blk_mq_end_request_batch))
end_cmd(cmd); blk_mq_end_request(req, cmd->error);
nr++; nr++;
} }
@ -1711,7 +1562,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired; cmd->timer.function = null_cmd_timer_expired;
} }
cmd->rq = rq;
cmd->error = BLK_STS_OK; cmd->error = BLK_STS_OK;
cmd->nq = nq; cmd->nq = nq;
cmd->fake_timeout = should_timeout_request(rq) || cmd->fake_timeout = should_timeout_request(rq) ||
@ -1770,34 +1620,8 @@ static void null_queue_rqs(struct request **rqlist)
*rqlist = requeue_list; *rqlist = requeue_list;
} }
static void cleanup_queue(struct nullb_queue *nq)
{
bitmap_free(nq->tag_map);
kfree(nq->cmds);
}
static void cleanup_queues(struct nullb *nullb)
{
int i;
for (i = 0; i < nullb->nr_queues; i++)
cleanup_queue(&nullb->queues[i]);
kfree(nullb->queues);
}
static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
struct nullb_queue *nq = hctx->driver_data;
struct nullb *nullb = nq->dev->nullb;
nullb->nr_queues--;
}
static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{ {
init_waitqueue_head(&nq->wait);
nq->queue_depth = nullb->queue_depth;
nq->dev = nullb->dev; nq->dev = nullb->dev;
INIT_LIST_HEAD(&nq->poll_list); INIT_LIST_HEAD(&nq->poll_list);
spin_lock_init(&nq->poll_lock); spin_lock_init(&nq->poll_lock);
@ -1815,7 +1639,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
nq = &nullb->queues[hctx_idx]; nq = &nullb->queues[hctx_idx];
hctx->driver_data = nq; hctx->driver_data = nq;
null_init_queue(nullb, nq); null_init_queue(nullb, nq);
nullb->nr_queues++;
return 0; return 0;
} }
@ -1828,7 +1651,6 @@ static const struct blk_mq_ops null_mq_ops = {
.poll = null_poll, .poll = null_poll,
.map_queues = null_map_queues, .map_queues = null_map_queues,
.init_hctx = null_init_hctx, .init_hctx = null_init_hctx,
.exit_hctx = null_exit_hctx,
}; };
static void null_del_dev(struct nullb *nullb) static void null_del_dev(struct nullb *nullb)
@ -1849,21 +1671,20 @@ static void null_del_dev(struct nullb *nullb)
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
hrtimer_cancel(&nullb->bw_timer); hrtimer_cancel(&nullb->bw_timer);
atomic_long_set(&nullb->cur_bytes, LONG_MAX); atomic_long_set(&nullb->cur_bytes, LONG_MAX);
null_restart_queue_async(nullb); blk_mq_start_stopped_hw_queues(nullb->q, true);
} }
put_disk(nullb->disk); put_disk(nullb->disk);
if (dev->queue_mode == NULL_Q_MQ && if (nullb->tag_set == &nullb->__tag_set)
nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set); blk_mq_free_tag_set(nullb->tag_set);
cleanup_queues(nullb); kfree(nullb->queues);
if (null_cache_active(nullb)) if (null_cache_active(nullb))
null_free_device_storage(nullb->dev, true); null_free_device_storage(nullb->dev, true);
kfree(nullb); kfree(nullb);
dev->nullb = NULL; dev->nullb = NULL;
} }
static void null_config_discard(struct nullb *nullb) static void null_config_discard(struct nullb *nullb, struct queue_limits *lim)
{ {
if (nullb->dev->discard == false) if (nullb->dev->discard == false)
return; return;
@ -1880,43 +1701,14 @@ static void null_config_discard(struct nullb *nullb)
return; return;
} }
blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); lim->max_hw_discard_sectors = UINT_MAX >> 9;
} }
static const struct block_device_operations null_bio_ops = { static const struct block_device_operations null_ops = {
.owner = THIS_MODULE,
.submit_bio = null_submit_bio,
.report_zones = null_report_zones,
};
static const struct block_device_operations null_rq_ops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
.report_zones = null_report_zones, .report_zones = null_report_zones,
}; };
static int setup_commands(struct nullb_queue *nq)
{
struct nullb_cmd *cmd;
int i;
nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
if (!nq->cmds)
return -ENOMEM;
nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
if (!nq->tag_map) {
kfree(nq->cmds);
return -ENOMEM;
}
for (i = 0; i < nq->queue_depth; i++) {
cmd = &nq->cmds[i];
cmd->tag = -1U;
}
return 0;
}
static int setup_queues(struct nullb *nullb) static int setup_queues(struct nullb *nullb)
{ {
int nqueues = nr_cpu_ids; int nqueues = nr_cpu_ids;
@ -1929,101 +1721,66 @@ static int setup_queues(struct nullb *nullb)
if (!nullb->queues) if (!nullb->queues)
return -ENOMEM; return -ENOMEM;
nullb->queue_depth = nullb->dev->hw_queue_depth;
return 0; return 0;
} }
static int init_driver_queues(struct nullb *nullb) static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues)
{ {
struct nullb_queue *nq;
int i, ret = 0;
for (i = 0; i < nullb->dev->submit_queues; i++) {
nq = &nullb->queues[i];
null_init_queue(nullb, nq);
ret = setup_commands(nq);
if (ret)
return ret;
nullb->nr_queues++;
}
return 0;
}
static int null_gendisk_register(struct nullb *nullb)
{
sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
struct gendisk *disk = nullb->disk;
set_capacity(disk, size);
disk->major = null_major;
disk->first_minor = nullb->index;
disk->minors = 1;
if (queue_is_mq(nullb->q))
disk->fops = &null_rq_ops;
else
disk->fops = &null_bio_ops;
disk->private_data = nullb;
strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
if (nullb->dev->zoned) {
int ret = null_register_zoned_dev(nullb);
if (ret)
return ret;
}
return add_disk(disk);
}
static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
{
unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
int hw_queues, numa_node;
unsigned int queue_depth;
int poll_queues;
if (nullb) {
hw_queues = nullb->dev->submit_queues;
poll_queues = nullb->dev->poll_queues;
queue_depth = nullb->dev->hw_queue_depth;
numa_node = nullb->dev->home_node;
if (nullb->dev->no_sched)
flags |= BLK_MQ_F_NO_SCHED;
if (nullb->dev->shared_tag_bitmap)
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (nullb->dev->blocking)
flags |= BLK_MQ_F_BLOCKING;
} else {
hw_queues = g_submit_queues;
poll_queues = g_poll_queues;
queue_depth = g_hw_queue_depth;
numa_node = g_home_node;
if (g_no_sched)
flags |= BLK_MQ_F_NO_SCHED;
if (g_shared_tag_bitmap)
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (g_blocking)
flags |= BLK_MQ_F_BLOCKING;
}
set->ops = &null_mq_ops; set->ops = &null_mq_ops;
set->cmd_size = sizeof(struct nullb_cmd); set->cmd_size = sizeof(struct nullb_cmd);
set->flags = flags; set->timeout = 5 * HZ;
set->driver_data = nullb; set->nr_maps = 1;
set->nr_hw_queues = hw_queues;
set->queue_depth = queue_depth;
set->numa_node = numa_node;
if (poll_queues) { if (poll_queues) {
set->nr_hw_queues += poll_queues; set->nr_hw_queues += poll_queues;
set->nr_maps = 3; set->nr_maps += 2;
} else { }
set->nr_maps = 1; return blk_mq_alloc_tag_set(set);
} }
return blk_mq_alloc_tag_set(set); static int null_init_global_tag_set(void)
{
int error;
if (tag_set.ops)
return 0;
tag_set.nr_hw_queues = g_submit_queues;
tag_set.queue_depth = g_hw_queue_depth;
tag_set.numa_node = g_home_node;
tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
if (g_no_sched)
tag_set.flags |= BLK_MQ_F_NO_SCHED;
if (g_shared_tag_bitmap)
tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (g_blocking)
tag_set.flags |= BLK_MQ_F_BLOCKING;
error = null_init_tag_set(&tag_set, g_poll_queues);
if (error)
tag_set.ops = NULL;
return error;
}
static int null_setup_tagset(struct nullb *nullb)
{
if (nullb->dev->shared_tags) {
nullb->tag_set = &tag_set;
return null_init_global_tag_set();
}
nullb->tag_set = &nullb->__tag_set;
nullb->tag_set->driver_data = nullb;
nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues;
nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth;
nullb->tag_set->numa_node = nullb->dev->home_node;
nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
if (nullb->dev->no_sched)
nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED;
if (nullb->dev->shared_tag_bitmap)
nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (nullb->dev->blocking)
nullb->tag_set->flags |= BLK_MQ_F_BLOCKING;
return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues);
} }
static int null_validate_conf(struct nullb_device *dev) static int null_validate_conf(struct nullb_device *dev)
@ -2032,11 +1789,15 @@ static int null_validate_conf(struct nullb_device *dev)
pr_err("legacy IO path is no longer available\n"); pr_err("legacy IO path is no longer available\n");
return -EINVAL; return -EINVAL;
} }
if (dev->queue_mode == NULL_Q_BIO) {
pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n");
dev->queue_mode = NULL_Q_MQ;
}
dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { if (dev->use_per_node_hctx) {
if (dev->submit_queues != nr_online_nodes) if (dev->submit_queues != nr_online_nodes)
dev->submit_queues = nr_online_nodes; dev->submit_queues = nr_online_nodes;
} else if (dev->submit_queues > nr_cpu_ids) } else if (dev->submit_queues > nr_cpu_ids)
@ -2048,8 +1809,6 @@ static int null_validate_conf(struct nullb_device *dev)
if (dev->poll_queues > g_poll_queues) if (dev->poll_queues > g_poll_queues)
dev->poll_queues = g_poll_queues; dev->poll_queues = g_poll_queues;
dev->prev_poll_queues = dev->poll_queues; dev->prev_poll_queues = dev->poll_queues;
dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
/* Do memory allocation, so set blocking */ /* Do memory allocation, so set blocking */
@ -2060,9 +1819,6 @@ static int null_validate_conf(struct nullb_device *dev)
dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
dev->cache_size); dev->cache_size);
dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
/* can not stop a queue */
if (dev->queue_mode == NULL_Q_BIO)
dev->mbps = 0;
if (dev->zoned && if (dev->zoned &&
(!dev->zone_size || !is_power_of_2(dev->zone_size))) { (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
@ -2102,6 +1858,12 @@ static bool null_setup_fault(void)
static int null_add_dev(struct nullb_device *dev) static int null_add_dev(struct nullb_device *dev)
{ {
struct queue_limits lim = {
.logical_block_size = dev->blocksize,
.physical_block_size = dev->blocksize,
.max_hw_sectors = dev->max_sectors,
};
struct nullb *nullb; struct nullb *nullb;
int rv; int rv;
@ -2123,36 +1885,25 @@ static int null_add_dev(struct nullb_device *dev)
if (rv) if (rv)
goto out_free_nullb; goto out_free_nullb;
if (dev->queue_mode == NULL_Q_MQ) { rv = null_setup_tagset(nullb);
if (shared_tags) {
nullb->tag_set = &tag_set;
rv = 0;
} else {
nullb->tag_set = &nullb->__tag_set;
rv = null_init_tag_set(nullb, nullb->tag_set);
}
if (rv) if (rv)
goto out_cleanup_queues; goto out_cleanup_queues;
nullb->tag_set->timeout = 5 * HZ; if (dev->virt_boundary)
nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); lim.virt_boundary_mask = PAGE_SIZE - 1;
if (IS_ERR(nullb->disk)) { null_config_discard(nullb, &lim);
rv = PTR_ERR(nullb->disk); if (dev->zoned) {
rv = null_init_zoned_dev(dev, &lim);
if (rv)
goto out_cleanup_tags; goto out_cleanup_tags;
} }
nullb->q = nullb->disk->queue;
} else if (dev->queue_mode == NULL_Q_BIO) {
rv = -ENOMEM;
nullb->disk = blk_alloc_disk(nullb->dev->home_node);
if (!nullb->disk)
goto out_cleanup_queues;
nullb->q = nullb->disk->queue; nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb);
rv = init_driver_queues(nullb); if (IS_ERR(nullb->disk)) {
if (rv) rv = PTR_ERR(nullb->disk);
goto out_cleanup_disk; goto out_cleanup_zone;
} }
nullb->q = nullb->disk->queue;
if (dev->mbps) { if (dev->mbps) {
set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
@ -2164,12 +1915,6 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_write_cache(nullb->q, true, true); blk_queue_write_cache(nullb->q, true, true);
} }
if (dev->zoned) {
rv = null_init_zoned_dev(dev, nullb->q);
if (rv)
goto out_cleanup_disk;
}
nullb->q->queuedata = nullb; nullb->q->queuedata = nullb;
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
@ -2177,22 +1922,12 @@ static int null_add_dev(struct nullb_device *dev)
rv = ida_alloc(&nullb_indexes, GFP_KERNEL); rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
if (rv < 0) { if (rv < 0) {
mutex_unlock(&lock); mutex_unlock(&lock);
goto out_cleanup_zone; goto out_cleanup_disk;
} }
nullb->index = rv; nullb->index = rv;
dev->index = rv; dev->index = rv;
mutex_unlock(&lock); mutex_unlock(&lock);
blk_queue_logical_block_size(nullb->q, dev->blocksize);
blk_queue_physical_block_size(nullb->q, dev->blocksize);
if (dev->max_sectors)
blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
if (dev->virt_boundary)
blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);
null_config_discard(nullb);
if (config_item_name(&dev->group.cg_item)) { if (config_item_name(&dev->group.cg_item)) {
/* Use configfs dir name as the device name */ /* Use configfs dir name as the device name */
snprintf(nullb->disk_name, sizeof(nullb->disk_name), snprintf(nullb->disk_name, sizeof(nullb->disk_name),
@ -2201,7 +1936,22 @@ static int null_add_dev(struct nullb_device *dev)
sprintf(nullb->disk_name, "nullb%d", nullb->index); sprintf(nullb->disk_name, "nullb%d", nullb->index);
} }
rv = null_gendisk_register(nullb); set_capacity(nullb->disk,
((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT);
nullb->disk->major = null_major;
nullb->disk->first_minor = nullb->index;
nullb->disk->minors = 1;
nullb->disk->fops = &null_ops;
nullb->disk->private_data = nullb;
strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
if (nullb->dev->zoned) {
rv = null_register_zoned_dev(nullb);
if (rv)
goto out_ida_free;
}
rv = add_disk(nullb->disk);
if (rv) if (rv)
goto out_ida_free; goto out_ida_free;
@ -2220,10 +1970,10 @@ out_cleanup_zone:
out_cleanup_disk: out_cleanup_disk:
put_disk(nullb->disk); put_disk(nullb->disk);
out_cleanup_tags: out_cleanup_tags:
if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) if (nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set); blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues: out_cleanup_queues:
cleanup_queues(nullb); kfree(nullb->queues);
out_free_nullb: out_free_nullb:
kfree(nullb); kfree(nullb);
dev->nullb = NULL; dev->nullb = NULL;
@ -2299,7 +2049,7 @@ static int __init null_init(void)
return -EINVAL; return -EINVAL;
} }
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) { if (g_submit_queues != nr_online_nodes) {
pr_warn("submit_queues param is set to %u.\n", pr_warn("submit_queues param is set to %u.\n",
nr_online_nodes); nr_online_nodes);
@ -2311,18 +2061,12 @@ static int __init null_init(void)
g_submit_queues = 1; g_submit_queues = 1;
} }
if (g_queue_mode == NULL_Q_MQ && shared_tags) {
ret = null_init_tag_set(NULL, &tag_set);
if (ret)
return ret;
}
config_group_init(&nullb_subsys.su_group); config_group_init(&nullb_subsys.su_group);
mutex_init(&nullb_subsys.su_mutex); mutex_init(&nullb_subsys.su_mutex);
ret = configfs_register_subsystem(&nullb_subsys); ret = configfs_register_subsystem(&nullb_subsys);
if (ret) if (ret)
goto err_tagset; return ret;
mutex_init(&lock); mutex_init(&lock);
@ -2349,9 +2093,6 @@ err_dev:
unregister_blkdev(null_major, "nullb"); unregister_blkdev(null_major, "nullb");
err_conf: err_conf:
configfs_unregister_subsystem(&nullb_subsys); configfs_unregister_subsystem(&nullb_subsys);
err_tagset:
if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
return ret; return ret;
} }
@ -2370,7 +2111,7 @@ static void __exit null_exit(void)
} }
mutex_unlock(&lock); mutex_unlock(&lock);
if (g_queue_mode == NULL_Q_MQ && shared_tags) if (tag_set.ops)
blk_mq_free_tag_set(&tag_set); blk_mq_free_tag_set(&tag_set);
} }

View File

@ -16,11 +16,6 @@
#include <linux/mutex.h> #include <linux/mutex.h>
struct nullb_cmd { struct nullb_cmd {
union {
struct request *rq;
struct bio *bio;
};
unsigned int tag;
blk_status_t error; blk_status_t error;
bool fake_timeout; bool fake_timeout;
struct nullb_queue *nq; struct nullb_queue *nq;
@ -28,16 +23,11 @@ struct nullb_cmd {
}; };
struct nullb_queue { struct nullb_queue {
unsigned long *tag_map;
wait_queue_head_t wait;
unsigned int queue_depth;
struct nullb_device *dev; struct nullb_device *dev;
unsigned int requeue_selection; unsigned int requeue_selection;
struct list_head poll_list; struct list_head poll_list;
spinlock_t poll_lock; spinlock_t poll_lock;
struct nullb_cmd *cmds;
}; };
struct nullb_zone { struct nullb_zone {
@ -60,13 +50,6 @@ struct nullb_zone {
unsigned int capacity; unsigned int capacity;
}; };
/* Queue modes */
enum {
NULL_Q_BIO = 0,
NULL_Q_RQ = 1,
NULL_Q_MQ = 2,
};
struct nullb_device { struct nullb_device {
struct nullb *nullb; struct nullb *nullb;
struct config_group group; struct config_group group;
@ -119,6 +102,7 @@ struct nullb_device {
bool zoned; /* if device is zoned */ bool zoned; /* if device is zoned */
bool virt_boundary; /* virtual boundary on/off for the device */ bool virt_boundary; /* virtual boundary on/off for the device */
bool no_sched; /* no IO scheduler for the device */ bool no_sched; /* no IO scheduler for the device */
bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */ bool shared_tag_bitmap; /* use hostwide shared tags */
}; };
@ -130,14 +114,12 @@ struct nullb {
struct gendisk *disk; struct gendisk *disk;
struct blk_mq_tag_set *tag_set; struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set; struct blk_mq_tag_set __tag_set;
unsigned int queue_depth;
atomic_long_t cur_bytes; atomic_long_t cur_bytes;
struct hrtimer bw_timer; struct hrtimer bw_timer;
unsigned long cache_flush_pos; unsigned long cache_flush_pos;
spinlock_t lock; spinlock_t lock;
struct nullb_queue *queues; struct nullb_queue *queues;
unsigned int nr_queues;
char disk_name[DISK_NAME_LEN]; char disk_name[DISK_NAME_LEN];
}; };
@ -147,7 +129,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, unsigned int nr_sectors); sector_t sector, unsigned int nr_sectors);
#ifdef CONFIG_BLK_DEV_ZONED #ifdef CONFIG_BLK_DEV_ZONED
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb); int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev); void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector, int null_report_zones(struct gendisk *disk, sector_t sector,
@ -160,7 +142,7 @@ ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
size_t count, enum blk_zone_cond cond); size_t count, enum blk_zone_cond cond);
#else #else
static inline int null_init_zoned_dev(struct nullb_device *dev, static inline int null_init_zoned_dev(struct nullb_device *dev,
struct request_queue *q) struct queue_limits *lim)
{ {
pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
return -EINVAL; return -EINVAL;

View File

@ -41,10 +41,11 @@ TRACE_EVENT(nullb_zone_op,
__field(unsigned int, zone_cond) __field(unsigned int, zone_cond)
), ),
TP_fast_assign( TP_fast_assign(
__entry->op = req_op(cmd->rq); __entry->op = req_op(blk_mq_rq_from_pdu(cmd));
__entry->zone_no = zone_no; __entry->zone_no = zone_no;
__entry->zone_cond = zone_cond; __entry->zone_cond = zone_cond;
__assign_disk_name(__entry->disk, cmd->rq->q->disk); __assign_disk_name(__entry->disk,
blk_mq_rq_from_pdu(cmd)->q->disk);
), ),
TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s",
__print_disk_name(__entry->disk), __print_disk_name(__entry->disk),

View File

@ -58,7 +58,8 @@ static inline void null_unlock_zone(struct nullb_device *dev,
mutex_unlock(&zone->mutex); mutex_unlock(&zone->mutex);
} }
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) int null_init_zoned_dev(struct nullb_device *dev,
struct queue_limits *lim)
{ {
sector_t dev_capacity_sects, zone_capacity_sects; sector_t dev_capacity_sects, zone_capacity_sects;
struct nullb_zone *zone; struct nullb_zone *zone;
@ -151,27 +152,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
sector += dev->zone_size_sects; sector += dev->zone_size_sects;
} }
lim->zoned = true;
lim->chunk_sectors = dev->zone_size_sects;
lim->max_zone_append_sectors = dev->zone_size_sects;
lim->max_open_zones = dev->zone_max_open;
lim->max_active_zones = dev->zone_max_active;
return 0; return 0;
} }
int null_register_zoned_dev(struct nullb *nullb) int null_register_zoned_dev(struct nullb *nullb)
{ {
struct nullb_device *dev = nullb->dev;
struct request_queue *q = nullb->q; struct request_queue *q = nullb->q;
disk_set_zoned(nullb->disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
blk_queue_chunk_sectors(q, dev->zone_size_sects);
nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
disk_set_max_open_zones(nullb->disk, dev->zone_max_open);
disk_set_max_active_zones(nullb->disk, dev->zone_max_active);
if (queue_is_mq(q))
return blk_revalidate_disk_zones(nullb->disk, NULL); return blk_revalidate_disk_zones(nullb->disk, NULL);
return 0;
} }
void null_free_zoned_dev(struct nullb_device *dev) void null_free_zoned_dev(struct nullb_device *dev)
@ -394,10 +390,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
*/ */
if (append) { if (append) {
sector = zone->wp; sector = zone->wp;
if (dev->queue_mode == NULL_Q_MQ) blk_mq_rq_from_pdu(cmd)->__sector = sector;
cmd->rq->__sector = sector;
else
cmd->bio->bi_iter.bi_sector = sector;
} else if (sector != zone->wp) { } else if (sector != zone->wp) {
ret = BLK_STS_IOERR; ret = BLK_STS_IOERR;
goto unlock; goto unlock;

View File

@ -828,6 +828,12 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
*/ */
static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
{ {
/*
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
*/
bio->bi_opf |= REQ_NOMERGE;
spin_lock(&pd->iosched.lock); spin_lock(&pd->iosched.lock);
if (bio_data_dir(bio) == READ) if (bio_data_dir(bio) == READ)
bio_list_add(&pd->iosched.read_queue, bio); bio_list_add(&pd->iosched.read_queue, bio);
@ -2191,11 +2197,6 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
ret = pkt_open_write(pd); ret = pkt_open_write(pd);
if (ret) if (ret)
goto out_putdev; goto out_putdev;
/*
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
*/
blk_queue_max_hw_sectors(q, pd->settings.size);
set_bit(PACKET_WRITABLE, &pd->flags); set_bit(PACKET_WRITABLE, &pd->flags);
} else { } else {
pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
@ -2338,9 +2339,9 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
pkt_queue_bio(pd, cloned_bio); pkt_queue_bio(pd, cloned_bio);
} }
static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_make_request_write(struct bio *bio)
{ {
struct pktcdvd_device *pd = q->queuedata; struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
sector_t zone; sector_t zone;
struct packet_data *pkt; struct packet_data *pkt;
int was_empty, blocked_bio; int was_empty, blocked_bio;
@ -2432,7 +2433,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
static void pkt_submit_bio(struct bio *bio) static void pkt_submit_bio(struct bio *bio)
{ {
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
struct device *ddev = disk_to_dev(pd->disk); struct device *ddev = disk_to_dev(pd->disk);
struct bio *split; struct bio *split;
@ -2476,7 +2477,7 @@ static void pkt_submit_bio(struct bio *bio)
split = bio; split = bio;
} }
pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); pkt_make_request_write(split);
} while (split != bio); } while (split != bio);
return; return;
@ -2484,15 +2485,6 @@ end_io:
bio_io_error(bio); bio_io_error(bio);
} }
/*
 * Set up the request queue that belongs to the packet device's disk:
 * CD_FRAMESIZE becomes the logical block size, PACKET_MAX_SECTORS the
 * max hw sectors, and the queue's queuedata pointer is set to @pd.
 */
static void pkt_init_queue(struct pktcdvd_device *pd)
{
	/* Same call order as before: block size first, then the sector cap. */
	blk_queue_logical_block_size(pd->disk->queue, CD_FRAMESIZE);
	blk_queue_max_hw_sectors(pd->disk->queue, PACKET_MAX_SECTORS);

	pd->disk->queue->queuedata = pd;
}
static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
{ {
struct device *ddev = disk_to_dev(pd->disk); struct device *ddev = disk_to_dev(pd->disk);
@ -2536,8 +2528,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
pd->bdev_file = bdev_file; pd->bdev_file = bdev_file;
set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE); set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE);
pkt_init_queue(pd);
atomic_set(&pd->cdrw.pending_bios, 0); atomic_set(&pd->cdrw.pending_bios, 0);
pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name);
if (IS_ERR(pd->cdrw.thread)) { if (IS_ERR(pd->cdrw.thread)) {
@ -2634,6 +2624,10 @@ static const struct block_device_operations pktcdvd_ops = {
*/ */
static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
{ {
struct queue_limits lim = {
.max_hw_sectors = PACKET_MAX_SECTORS,
.logical_block_size = CD_FRAMESIZE,
};
int idx; int idx;
int ret = -ENOMEM; int ret = -ENOMEM;
struct pktcdvd_device *pd; struct pktcdvd_device *pd;
@ -2673,10 +2667,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
pd->write_congestion_on = write_congestion_on; pd->write_congestion_on = write_congestion_on;
pd->write_congestion_off = write_congestion_off; pd->write_congestion_off = write_congestion_off;
ret = -ENOMEM; disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
disk = blk_alloc_disk(NUMA_NO_NODE); if (IS_ERR(disk)) {
if (!disk) ret = PTR_ERR(disk);
goto out_mem; goto out_mem;
}
pd->disk = disk; pd->disk = disk;
disk->major = pktdev_major; disk->major = pktdev_major;
disk->first_minor = idx; disk->first_minor = idx;

View File

@ -382,6 +382,14 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
struct ps3disk_private *priv; struct ps3disk_private *priv;
int error; int error;
unsigned int devidx; unsigned int devidx;
struct queue_limits lim = {
.logical_block_size = dev->blk_size,
.max_hw_sectors = dev->bounce_size >> 9,
.max_segments = -1,
.max_segment_size = dev->bounce_size,
.dma_alignment = dev->blk_size - 1,
};
struct request_queue *queue; struct request_queue *queue;
struct gendisk *gendisk; struct gendisk *gendisk;
@ -431,7 +439,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
if (error) if (error)
goto fail_teardown; goto fail_teardown;
gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); gendisk = blk_mq_alloc_disk(&priv->tag_set, &lim, dev);
if (IS_ERR(gendisk)) { if (IS_ERR(gendisk)) {
dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n",
__func__, __LINE__); __func__, __LINE__);
@ -441,15 +449,8 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
queue = gendisk->queue; queue = gendisk->queue;
blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);
blk_queue_write_cache(queue, true, false); blk_queue_write_cache(queue, true, false);
blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
priv->gendisk = gendisk; priv->gendisk = gendisk;
gendisk->major = ps3disk_major; gendisk->major = ps3disk_major;
gendisk->first_minor = devidx * PS3DISK_MINORS; gendisk->first_minor = devidx * PS3DISK_MINORS;

View File

@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
ps3vram_proc_init(dev); ps3vram_proc_init(dev);
gendisk = blk_alloc_disk(NUMA_NO_NODE); gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (!gendisk) { if (IS_ERR(gendisk)) {
dev_err(&dev->core, "blk_alloc_disk failed\n"); dev_err(&dev->core, "blk_alloc_disk failed\n");
error = -ENOMEM; error = PTR_ERR(gendisk);
goto out_cache_cleanup; goto out_cache_cleanup;
} }

View File

@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = {
}; };
__ATTRIBUTE_GROUPS(rbd_bus); __ATTRIBUTE_GROUPS(rbd_bus);
static struct bus_type rbd_bus_type = { static const struct bus_type rbd_bus_type = {
.name = "rbd", .name = "rbd",
.bus_groups = rbd_bus_groups, .bus_groups = rbd_bus_groups,
}; };
@ -4952,6 +4952,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
struct request_queue *q; struct request_queue *q;
unsigned int objset_bytes = unsigned int objset_bytes =
rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
struct queue_limits lim = {
.max_hw_sectors = objset_bytes >> SECTOR_SHIFT,
.max_user_sectors = objset_bytes >> SECTOR_SHIFT,
.io_min = rbd_dev->opts->alloc_size,
.io_opt = rbd_dev->opts->alloc_size,
.max_segments = USHRT_MAX,
.max_segment_size = UINT_MAX,
};
int err; int err;
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
@ -4966,7 +4974,13 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (err) if (err)
return err; return err;
disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); if (rbd_dev->opts->trim) {
lim.discard_granularity = rbd_dev->opts->alloc_size;
lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT;
lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT;
}
disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev);
if (IS_ERR(disk)) { if (IS_ERR(disk)) {
err = PTR_ERR(disk); err = PTR_ERR(disk);
goto out_tag_set; goto out_tag_set;
@ -4987,19 +5001,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
q->limits.max_sectors = queue_max_hw_sectors(q);
blk_queue_max_segments(q, USHRT_MAX);
blk_queue_max_segment_size(q, UINT_MAX);
blk_queue_io_min(q, rbd_dev->opts->alloc_size);
blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
if (rbd_dev->opts->trim) {
q->limits.discard_granularity = rbd_dev->opts->alloc_size;
blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
}
if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);

View File

@ -1329,43 +1329,6 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
} }
} }
static void setup_request_queue(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp)
{
blk_queue_logical_block_size(dev->queue,
le16_to_cpu(rsp->logical_block_size));
blk_queue_physical_block_size(dev->queue,
le16_to_cpu(rsp->physical_block_size));
blk_queue_max_hw_sectors(dev->queue,
dev->sess->max_io_size / SECTOR_SIZE);
/*
* we don't support discards to "discontiguous" segments
* in on request
*/
blk_queue_max_discard_segments(dev->queue, 1);
blk_queue_max_discard_sectors(dev->queue,
le32_to_cpu(rsp->max_discard_sectors));
dev->queue->limits.discard_granularity =
le32_to_cpu(rsp->discard_granularity);
dev->queue->limits.discard_alignment =
le32_to_cpu(rsp->discard_alignment);
if (le16_to_cpu(rsp->secure_discard))
blk_queue_max_secure_erase_sectors(dev->queue,
le32_to_cpu(rsp->max_discard_sectors));
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
blk_queue_max_segments(dev->queue, dev->sess->max_segments);
blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
blk_queue_write_cache(dev->queue,
!!(rsp->cache_policy & RNBD_WRITEBACK),
!!(rsp->cache_policy & RNBD_FUA));
blk_queue_max_write_zeroes_sectors(dev->queue,
le32_to_cpu(rsp->max_write_zeroes_sectors));
}
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp, int idx) struct rnbd_msg_open_rsp *rsp, int idx)
{ {
@ -1403,18 +1366,41 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp) struct rnbd_msg_open_rsp *rsp)
{ {
struct queue_limits lim = {
.logical_block_size = le16_to_cpu(rsp->logical_block_size),
.physical_block_size = le16_to_cpu(rsp->physical_block_size),
.io_opt = dev->sess->max_io_size,
.max_hw_sectors = dev->sess->max_io_size / SECTOR_SIZE,
.max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors),
.discard_granularity = le32_to_cpu(rsp->discard_granularity),
.discard_alignment = le32_to_cpu(rsp->discard_alignment),
.max_segments = dev->sess->max_segments,
.virt_boundary_mask = SZ_4K - 1,
.max_write_zeroes_sectors =
le32_to_cpu(rsp->max_write_zeroes_sectors),
};
int idx = dev->clt_device_id; int idx = dev->clt_device_id;
dev->size = le64_to_cpu(rsp->nsectors) * dev->size = le64_to_cpu(rsp->nsectors) *
le16_to_cpu(rsp->logical_block_size); le16_to_cpu(rsp->logical_block_size);
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); if (rsp->secure_discard) {
lim.max_secure_erase_sectors =
le32_to_cpu(rsp->max_discard_sectors);
}
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev);
if (IS_ERR(dev->gd)) if (IS_ERR(dev->gd))
return PTR_ERR(dev->gd); return PTR_ERR(dev->gd);
dev->queue = dev->gd->queue; dev->queue = dev->gd->queue;
rnbd_init_mq_hw_queues(dev); rnbd_init_mq_hw_queues(dev);
setup_request_queue(dev, rsp); blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
blk_queue_write_cache(dev->queue,
!!(rsp->cache_policy & RNBD_WRITEBACK),
!!(rsp->cache_policy & RNBD_FUA));
return rnbd_clt_setup_gen_disk(dev, rsp, idx); return rnbd_clt_setup_gen_disk(dev, rsp, idx);
} }

View File

@ -784,6 +784,14 @@ static const struct blk_mq_ops vdc_mq_ops = {
static int probe_disk(struct vdc_port *port) static int probe_disk(struct vdc_port *port)
{ {
struct queue_limits lim = {
.physical_block_size = port->vdisk_phys_blksz,
.max_hw_sectors = port->max_xfer_size,
/* Each segment in a request is up to an aligned page in size. */
.seg_boundary_mask = PAGE_SIZE - 1,
.max_segment_size = PAGE_SIZE,
.max_segments = port->ring_cookies,
};
struct request_queue *q; struct request_queue *q;
struct gendisk *g; struct gendisk *g;
int err; int err;
@ -824,7 +832,7 @@ static int probe_disk(struct vdc_port *port)
if (err) if (err)
return err; return err;
g = blk_mq_alloc_disk(&port->tag_set, port); g = blk_mq_alloc_disk(&port->tag_set, &lim, port);
if (IS_ERR(g)) { if (IS_ERR(g)) {
printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
port->vio.name); port->vio.name);
@ -835,12 +843,6 @@ static int probe_disk(struct vdc_port *port)
port->disk = g; port->disk = g;
q = g->queue; q = g->queue;
/* Each segment in a request is up to an aligned page in size. */
blk_queue_segment_boundary(q, PAGE_SIZE - 1);
blk_queue_max_segment_size(q, PAGE_SIZE);
blk_queue_max_segments(q, port->ring_cookies);
blk_queue_max_hw_sectors(q, port->max_xfer_size);
g->major = vdc_major; g->major = vdc_major;
g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
g->minors = 1 << PARTITION_SHIFT; g->minors = 1 << PARTITION_SHIFT;
@ -872,8 +874,6 @@ static int probe_disk(struct vdc_port *port)
} }
} }
blk_queue_physical_block_size(q, port->vdisk_phys_blksz);
pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
g->disk_name, g->disk_name,
port->vdisk_size, (port->vdisk_size >> (20 - 9)), port->vdisk_size, (port->vdisk_size >> (20 - 9)),

View File

@ -820,7 +820,7 @@ static int swim_floppy_init(struct swim_priv *swd)
goto exit_put_disks; goto exit_put_disks;
swd->unit[drive].disk = swd->unit[drive].disk =
blk_mq_alloc_disk(&swd->unit[drive].tag_set, blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL,
&swd->unit[drive]); &swd->unit[drive]);
if (IS_ERR(swd->unit[drive].disk)) { if (IS_ERR(swd->unit[drive].disk)) {
blk_mq_free_tag_set(&swd->unit[drive].tag_set); blk_mq_free_tag_set(&swd->unit[drive].tag_set);
@ -916,7 +916,7 @@ out:
return ret; return ret;
} }
static int swim_remove(struct platform_device *dev) static void swim_remove(struct platform_device *dev)
{ {
struct swim_priv *swd = platform_get_drvdata(dev); struct swim_priv *swd = platform_get_drvdata(dev);
int drive; int drive;
@ -937,13 +937,11 @@ static int swim_remove(struct platform_device *dev)
release_mem_region(res->start, resource_size(res)); release_mem_region(res->start, resource_size(res));
kfree(swd); kfree(swd);
return 0;
} }
static struct platform_driver swim_driver = { static struct platform_driver swim_driver = {
.probe = swim_probe, .probe = swim_probe,
.remove = swim_remove, .remove_new = swim_remove,
.driver = { .driver = {
.name = CARDNAME, .name = CARDNAME,
}, },

View File

@ -1210,7 +1210,7 @@ static int swim3_attach(struct macio_dev *mdev,
if (rc) if (rc)
goto out_unregister; goto out_unregister;
disk = blk_mq_alloc_disk(&fs->tag_set, fs); disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs);
if (IS_ERR(disk)) { if (IS_ERR(disk)) {
rc = PTR_ERR(disk); rc = PTR_ERR(disk);
goto out_free_tag_set; goto out_free_tag_set;

View File

@ -246,21 +246,12 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
return 0; return 0;
} }
static int ublk_dev_param_zoned_apply(struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{ {
const struct ublk_param_zoned *p = &ub->params.zoned;
disk_set_zoned(ub->ub_disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
blk_queue_required_elevator_features(ub->ub_disk->queue, blk_queue_required_elevator_features(ub->ub_disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE); ELEVATOR_F_ZBD_SEQ_WRITE);
disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
return 0;
} }
/* Based on virtblk_alloc_report_buffer */ /* Based on virtblk_alloc_report_buffer */
@ -432,9 +423,8 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
static int ublk_dev_param_zoned_apply(struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{ {
return -EOPNOTSUPP;
} }
static int ublk_revalidate_disk_zones(struct ublk_device *ub) static int ublk_revalidate_disk_zones(struct ublk_device *ub)
@ -498,11 +488,6 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
struct request_queue *q = ub->ub_disk->queue; struct request_queue *q = ub->ub_disk->queue;
const struct ublk_param_basic *p = &ub->params.basic; const struct ublk_param_basic *p = &ub->params.basic;
blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
blk_queue_io_min(q, 1 << p->io_min_shift);
blk_queue_io_opt(q, 1 << p->io_opt_shift);
blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
p->attrs & UBLK_ATTR_FUA); p->attrs & UBLK_ATTR_FUA);
if (p->attrs & UBLK_ATTR_ROTATIONAL) if (p->attrs & UBLK_ATTR_ROTATIONAL)
@ -510,29 +495,12 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
else else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_max_hw_sectors(q, p->max_sectors);
blk_queue_chunk_sectors(q, p->chunk_sectors);
blk_queue_virt_boundary(q, p->virt_boundary_mask);
if (p->attrs & UBLK_ATTR_READ_ONLY) if (p->attrs & UBLK_ATTR_READ_ONLY)
set_disk_ro(ub->ub_disk, true); set_disk_ro(ub->ub_disk, true);
set_capacity(ub->ub_disk, p->dev_sectors); set_capacity(ub->ub_disk, p->dev_sectors);
} }
static void ublk_dev_param_discard_apply(struct ublk_device *ub)
{
struct request_queue *q = ub->ub_disk->queue;
const struct ublk_param_discard *p = &ub->params.discard;
q->limits.discard_alignment = p->discard_alignment;
q->limits.discard_granularity = p->discard_granularity;
blk_queue_max_discard_sectors(q, p->max_discard_sectors);
blk_queue_max_write_zeroes_sectors(q,
p->max_write_zeroes_sectors);
blk_queue_max_discard_segments(q, p->max_discard_segments);
}
static int ublk_validate_params(const struct ublk_device *ub) static int ublk_validate_params(const struct ublk_device *ub)
{ {
/* basic param is the only one which must be set */ /* basic param is the only one which must be set */
@ -576,20 +544,12 @@ static int ublk_validate_params(const struct ublk_device *ub)
return 0; return 0;
} }
static int ublk_apply_params(struct ublk_device *ub) static void ublk_apply_params(struct ublk_device *ub)
{ {
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
return -EINVAL;
ublk_dev_param_basic_apply(ub); ublk_dev_param_basic_apply(ub);
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
ublk_dev_param_discard_apply(ub);
if (ub->params.types & UBLK_PARAM_TYPE_ZONED) if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
return ublk_dev_param_zoned_apply(ub); ublk_dev_param_zoned_apply(ub);
return 0;
} }
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
@ -645,14 +605,16 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_NEED_GET_DATA; return ubq->flags & UBLK_F_NEED_GET_DATA;
} }
static struct ublk_device *ublk_get_device(struct ublk_device *ub) /* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{ {
if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
return ub; return ub;
return NULL; return NULL;
} }
static void ublk_put_device(struct ublk_device *ub) /* Called in slow path only, keep it noinline for trace purpose */
static noinline void ublk_put_device(struct ublk_device *ub)
{ {
put_device(&ub->cdev_dev); put_device(&ub->cdev_dev);
} }
@ -711,7 +673,7 @@ static void ublk_free_disk(struct gendisk *disk)
struct ublk_device *ub = disk->private_data; struct ublk_device *ub = disk->private_data;
clear_bit(UB_STATE_USED, &ub->state); clear_bit(UB_STATE_USED, &ub->state);
put_device(&ub->cdev_dev); ublk_put_device(ub);
} }
static void ublk_store_owner_uid_gid(unsigned int *owner_uid, static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
@ -2182,7 +2144,7 @@ static void ublk_remove(struct ublk_device *ub)
cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->stop_work);
cancel_work_sync(&ub->quiesce_work); cancel_work_sync(&ub->quiesce_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev); cdev_device_del(&ub->cdev, &ub->cdev_dev);
put_device(&ub->cdev_dev); ublk_put_device(ub);
ublks_added--; ublks_added--;
} }
@ -2205,12 +2167,47 @@ static struct ublk_device *ublk_get_device_from_id(int idx)
static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
{ {
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
const struct ublk_param_basic *p = &ub->params.basic;
int ublksrv_pid = (int)header->data[0]; int ublksrv_pid = (int)header->data[0];
struct queue_limits lim = {
.logical_block_size = 1 << p->logical_bs_shift,
.physical_block_size = 1 << p->physical_bs_shift,
.io_min = 1 << p->io_min_shift,
.io_opt = 1 << p->io_opt_shift,
.max_hw_sectors = p->max_sectors,
.chunk_sectors = p->chunk_sectors,
.virt_boundary_mask = p->virt_boundary_mask,
};
struct gendisk *disk; struct gendisk *disk;
int ret = -EINVAL; int ret = -EINVAL;
if (ublksrv_pid <= 0) if (ublksrv_pid <= 0)
return -EINVAL; return -EINVAL;
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
return -EINVAL;
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
const struct ublk_param_discard *pd = &ub->params.discard;
lim.discard_alignment = pd->discard_alignment;
lim.discard_granularity = pd->discard_granularity;
lim.max_hw_discard_sectors = pd->max_discard_sectors;
lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
lim.max_discard_segments = pd->max_discard_segments;
}
if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
const struct ublk_param_zoned *p = &ub->params.zoned;
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
return -EOPNOTSUPP;
lim.zoned = true;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones = p->max_open_zones;
lim.max_zone_append_sectors = p->max_zone_append_sectors;
}
if (wait_for_completion_interruptible(&ub->completion) != 0) if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR; return -EINTR;
@ -2222,7 +2219,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
goto out_unlock; goto out_unlock;
} }
disk = blk_mq_alloc_disk(&ub->tag_set, NULL); disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
if (IS_ERR(disk)) { if (IS_ERR(disk)) {
ret = PTR_ERR(disk); ret = PTR_ERR(disk);
goto out_unlock; goto out_unlock;
@ -2234,15 +2231,13 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
ub->dev_info.ublksrv_pid = ublksrv_pid; ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->ub_disk = disk; ub->ub_disk = disk;
ret = ublk_apply_params(ub); ublk_apply_params(ub);
if (ret)
goto out_put_disk;
/* don't probe partitions if any one ubq daemon is un-trusted */ /* don't probe partitions if any one ubq daemon is un-trusted */
if (ub->nr_privileged_daemon != ub->nr_queues_ready) if (ub->nr_privileged_daemon != ub->nr_queues_ready)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
get_device(&ub->cdev_dev); ublk_get_device(ub);
ub->dev_info.state = UBLK_S_DEV_LIVE; ub->dev_info.state = UBLK_S_DEV_LIVE;
if (ublk_dev_is_zoned(ub)) { if (ublk_dev_is_zoned(ub)) {
@ -2262,7 +2257,6 @@ out_put_cdev:
ub->dev_info.state = UBLK_S_DEV_DEAD; ub->dev_info.state = UBLK_S_DEV_DEAD;
ublk_put_device(ub); ublk_put_device(ub);
} }
out_put_disk:
if (ret) if (ret)
put_disk(disk); put_disk(disk);
out_unlock: out_unlock:
@ -2474,7 +2468,7 @@ static inline bool ublk_idr_freed(int id)
return ptr == NULL; return ptr == NULL;
} }
static int ublk_ctrl_del_dev(struct ublk_device **p_ub) static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{ {
struct ublk_device *ub = *p_ub; struct ublk_device *ub = *p_ub;
int idx = ub->ub_number; int idx = ub->ub_number;
@ -2508,7 +2502,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
* - the device number is freed already, we will not find this * - the device number is freed already, we will not find this
* device via ublk_get_device_from_id() * device via ublk_get_device_from_id()
*/ */
if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
return -EINTR; return -EINTR;
return 0; return 0;
} }
@ -2907,7 +2901,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
ret = ublk_ctrl_add_dev(cmd); ret = ublk_ctrl_add_dev(cmd);
break; break;
case UBLK_CMD_DEL_DEV: case UBLK_CMD_DEL_DEV:
ret = ublk_ctrl_del_dev(&ub); ret = ublk_ctrl_del_dev(&ub, true);
break;
case UBLK_U_CMD_DEL_DEV_ASYNC:
ret = ublk_ctrl_del_dev(&ub, false);
break; break;
case UBLK_CMD_GET_QUEUE_AFFINITY: case UBLK_CMD_GET_QUEUE_AFFINITY:
ret = ublk_ctrl_get_queue_affinity(ub, cmd); ret = ublk_ctrl_get_queue_affinity(ub, cmd);

View File

@ -720,25 +720,24 @@ fail_report:
return ret; return ret;
} }
static int virtblk_probe_zoned_device(struct virtio_device *vdev, static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
struct virtio_blk *vblk, struct queue_limits *lim)
struct request_queue *q)
{ {
struct virtio_device *vdev = vblk->vdev;
u32 v, wg; u32 v, wg;
dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
disk_set_zoned(vblk->disk); lim->zoned = true;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
virtio_cread(vdev, struct virtio_blk_config, virtio_cread(vdev, struct virtio_blk_config,
zoned.max_open_zones, &v); zoned.max_open_zones, &v);
disk_set_max_open_zones(vblk->disk, v); lim->max_open_zones = v;
dev_dbg(&vdev->dev, "max open zones = %u\n", v); dev_dbg(&vdev->dev, "max open zones = %u\n", v);
virtio_cread(vdev, struct virtio_blk_config, virtio_cread(vdev, struct virtio_blk_config,
zoned.max_active_zones, &v); zoned.max_active_zones, &v);
disk_set_max_active_zones(vblk->disk, v); lim->max_active_zones = v;
dev_dbg(&vdev->dev, "max active zones = %u\n", v); dev_dbg(&vdev->dev, "max active zones = %u\n", v);
virtio_cread(vdev, struct virtio_blk_config, virtio_cread(vdev, struct virtio_blk_config,
@ -747,8 +746,8 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
dev_warn(&vdev->dev, "zero write granularity reported\n"); dev_warn(&vdev->dev, "zero write granularity reported\n");
return -ENODEV; return -ENODEV;
} }
blk_queue_physical_block_size(q, wg); lim->physical_block_size = wg;
blk_queue_io_min(q, wg); lim->io_min = wg;
dev_dbg(&vdev->dev, "write granularity = %u\n", wg); dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
@ -764,13 +763,13 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
vblk->zone_sectors); vblk->zone_sectors);
return -ENODEV; return -ENODEV;
} }
blk_queue_chunk_sectors(q, vblk->zone_sectors); lim->chunk_sectors = vblk->zone_sectors;
dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors); dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors);
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
dev_warn(&vblk->vdev->dev, dev_warn(&vblk->vdev->dev,
"ignoring negotiated F_DISCARD for zoned device\n"); "ignoring negotiated F_DISCARD for zoned device\n");
blk_queue_max_discard_sectors(q, 0); lim->max_hw_discard_sectors = 0;
} }
virtio_cread(vdev, struct virtio_blk_config, virtio_cread(vdev, struct virtio_blk_config,
@ -785,25 +784,21 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
wg, v); wg, v);
return -ENODEV; return -ENODEV;
} }
blk_queue_max_zone_append_sectors(q, v); lim->max_zone_append_sectors = v;
dev_dbg(&vdev->dev, "max append sectors = %u\n", v); dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
return blk_revalidate_disk_zones(vblk->disk, NULL); return 0;
} }
#else #else
/* /*
* Zoned block device support is not configured in this kernel. * Zoned block device support is not configured in this kernel, host-managed
* Host-managed zoned devices can't be supported, but others are * zoned devices can't be supported.
* good to go as regular block devices.
*/ */
#define virtblk_report_zones NULL #define virtblk_report_zones NULL
static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk,
static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, struct queue_limits *lim)
struct virtio_blk *vblk, struct request_queue *q)
{ {
dev_err(&vdev->dev, dev_err(&vblk->vdev->dev,
"virtio_blk: zoned devices are not supported"); "virtio_blk: zoned devices are not supported");
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
@ -1248,31 +1243,17 @@ static const struct blk_mq_ops virtio_mq_ops = {
static unsigned int virtblk_queue_depth; static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
static int virtblk_probe(struct virtio_device *vdev) static int virtblk_read_limits(struct virtio_blk *vblk,
struct queue_limits *lim)
{ {
struct virtio_blk *vblk; struct virtio_device *vdev = vblk->vdev;
struct request_queue *q;
int err, index;
u32 v, blk_size, max_size, sg_elems, opt_io_size; u32 v, blk_size, max_size, sg_elems, opt_io_size;
u32 max_discard_segs = 0; u32 max_discard_segs = 0;
u32 discard_granularity = 0; u32 discard_granularity = 0;
u16 min_io_size; u16 min_io_size;
u8 physical_block_exp, alignment_offset; u8 physical_block_exp, alignment_offset;
unsigned int queue_depth;
size_t max_dma_size; size_t max_dma_size;
int err;
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
__func__);
return -EINVAL;
}
err = ida_alloc_range(&vd_index_ida, 0,
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
if (err < 0)
goto out;
index = err;
/* We need to know how many segments before we allocate. */ /* We need to know how many segments before we allocate. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX, err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
@ -1286,6 +1267,203 @@ static int virtblk_probe(struct virtio_device *vdev)
/* Prevent integer overflows and honor max vq size */ /* Prevent integer overflows and honor max vq size */
sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
/* We can handle whatever the host told us to handle. */
lim->max_segments = sg_elems;
/* No real sector limit. */
lim->max_hw_sectors = UINT_MAX;
max_dma_size = virtio_max_dma_size(vdev);
max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
/* Host can optionally specify maximum segment size and number of
* segments. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
struct virtio_blk_config, size_max, &v);
if (!err)
max_size = min(max_size, v);
lim->max_segment_size = max_size;
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
struct virtio_blk_config, blk_size,
&blk_size);
if (!err) {
err = blk_validate_block_size(blk_size);
if (err) {
dev_err(&vdev->dev,
"virtio_blk: invalid block size: 0x%x\n",
blk_size);
return err;
}
lim->logical_block_size = blk_size;
} else
blk_size = lim->logical_block_size;
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
&physical_block_exp);
if (!err && physical_block_exp)
lim->physical_block_size = blk_size * (1 << physical_block_exp);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, alignment_offset,
&alignment_offset);
if (!err && alignment_offset)
lim->alignment_offset = blk_size * alignment_offset;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, min_io_size,
&min_io_size);
if (!err && min_io_size)
lim->io_min = blk_size * min_io_size;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, opt_io_size,
&opt_io_size);
if (!err && opt_io_size)
lim->io_opt = blk_size * opt_io_size;
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
virtio_cread(vdev, struct virtio_blk_config,
discard_sector_alignment, &discard_granularity);
virtio_cread(vdev, struct virtio_blk_config,
max_discard_sectors, &v);
lim->max_hw_discard_sectors = v ? v : UINT_MAX;
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
&max_discard_segs);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
virtio_cread(vdev, struct virtio_blk_config,
max_write_zeroes_sectors, &v);
lim->max_write_zeroes_sectors = v ? v : UINT_MAX;
}
/* The discard and secure erase limits are combined since the Linux
* block layer uses the same limit for both commands.
*
* If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features
* are negotiated, we will use the minimum between the limits.
*
* discard sector alignment is set to the minimum between discard_sector_alignment
* and secure_erase_sector_alignment.
*
* max discard sectors is set to the minimum between max_discard_seg and
* max_secure_erase_seg.
*/
if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
virtio_cread(vdev, struct virtio_blk_config,
secure_erase_sector_alignment, &v);
/* secure_erase_sector_alignment should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: secure_erase_sector_alignment can't be 0\n");
return -EINVAL;
}
discard_granularity = min_not_zero(discard_granularity, v);
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_sectors, &v);
/* max_secure_erase_sectors should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_sectors can't be 0\n");
return -EINVAL;
}
lim->max_secure_erase_sectors = v;
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_seg, &v);
/* max_secure_erase_seg should not be zero, the device should set a
* valid number of segments
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_seg can't be 0\n");
return -EINVAL;
}
max_discard_segs = min_not_zero(max_discard_segs, v);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) ||
virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
/* max_discard_seg and discard_granularity will be 0 only
* if max_discard_seg and discard_sector_alignment fields in the virtio
* config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated.
* In this case, we use default values.
*/
if (!max_discard_segs)
max_discard_segs = sg_elems;
lim->max_discard_segments =
min(max_discard_segs, MAX_DISCARD_SEGMENTS);
if (discard_granularity)
lim->discard_granularity =
discard_granularity << SECTOR_SHIFT;
else
lim->discard_granularity = blk_size;
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
u8 model;
virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model);
switch (model) {
case VIRTIO_BLK_Z_NONE:
case VIRTIO_BLK_Z_HA:
/* treat host-aware devices as non-zoned */
return 0;
case VIRTIO_BLK_Z_HM:
err = virtblk_read_zoned_limits(vblk, lim);
if (err)
return err;
break;
default:
dev_err(&vdev->dev, "unsupported zone model %d\n", model);
return -EINVAL;
}
}
return 0;
}
static int virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
struct queue_limits lim = { };
int err, index;
unsigned int queue_depth;
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
__func__);
return -EINVAL;
}
err = ida_alloc_range(&vd_index_ida, 0,
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
if (err < 0)
goto out;
index = err;
vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
if (!vblk) { if (!vblk) {
err = -ENOMEM; err = -ENOMEM;
@ -1330,12 +1508,15 @@ static int virtblk_probe(struct virtio_device *vdev)
if (err) if (err)
goto out_free_vq; goto out_free_vq;
vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); err = virtblk_read_limits(vblk, &lim);
if (err)
goto out_free_tags;
vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk);
if (IS_ERR(vblk->disk)) { if (IS_ERR(vblk->disk)) {
err = PTR_ERR(vblk->disk); err = PTR_ERR(vblk->disk);
goto out_free_tags; goto out_free_tags;
} }
q = vblk->disk->queue;
virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@ -1353,164 +1534,6 @@ static int virtblk_probe(struct virtio_device *vdev)
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
set_disk_ro(vblk->disk, 1); set_disk_ro(vblk->disk, 1);
/* We can handle whatever the host told us to handle. */
blk_queue_max_segments(q, sg_elems);
/* No real sector limit. */
blk_queue_max_hw_sectors(q, UINT_MAX);
max_dma_size = virtio_max_dma_size(vdev);
max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
/* Host can optionally specify maximum segment size and number of
* segments. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
struct virtio_blk_config, size_max, &v);
if (!err)
max_size = min(max_size, v);
blk_queue_max_segment_size(q, max_size);
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
struct virtio_blk_config, blk_size,
&blk_size);
if (!err) {
err = blk_validate_block_size(blk_size);
if (err) {
dev_err(&vdev->dev,
"virtio_blk: invalid block size: 0x%x\n",
blk_size);
goto out_cleanup_disk;
}
blk_queue_logical_block_size(q, blk_size);
} else
blk_size = queue_logical_block_size(q);
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
&physical_block_exp);
if (!err && physical_block_exp)
blk_queue_physical_block_size(q,
blk_size * (1 << physical_block_exp));
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, alignment_offset,
&alignment_offset);
if (!err && alignment_offset)
blk_queue_alignment_offset(q, blk_size * alignment_offset);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, min_io_size,
&min_io_size);
if (!err && min_io_size)
blk_queue_io_min(q, blk_size * min_io_size);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, opt_io_size,
&opt_io_size);
if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
virtio_cread(vdev, struct virtio_blk_config,
discard_sector_alignment, &discard_granularity);
virtio_cread(vdev, struct virtio_blk_config,
max_discard_sectors, &v);
blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
&max_discard_segs);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
virtio_cread(vdev, struct virtio_blk_config,
max_write_zeroes_sectors, &v);
blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
}
/* The discard and secure erase limits are combined since the Linux
* block layer uses the same limit for both commands.
*
* If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features
* are negotiated, we will use the minimum between the limits.
*
* discard sector alignment is set to the minimum between discard_sector_alignment
* and secure_erase_sector_alignment.
*
* max discard sectors is set to the minimum between max_discard_seg and
* max_secure_erase_seg.
*/
if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
virtio_cread(vdev, struct virtio_blk_config,
secure_erase_sector_alignment, &v);
/* secure_erase_sector_alignment should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: secure_erase_sector_alignment can't be 0\n");
err = -EINVAL;
goto out_cleanup_disk;
}
discard_granularity = min_not_zero(discard_granularity, v);
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_sectors, &v);
/* max_secure_erase_sectors should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_sectors can't be 0\n");
err = -EINVAL;
goto out_cleanup_disk;
}
blk_queue_max_secure_erase_sectors(q, v);
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_seg, &v);
/* max_secure_erase_seg should not be zero, the device should set a
* valid number of segments
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_seg can't be 0\n");
err = -EINVAL;
goto out_cleanup_disk;
}
max_discard_segs = min_not_zero(max_discard_segs, v);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) ||
virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
/* max_discard_seg and discard_granularity will be 0 only
* if max_discard_seg and discard_sector_alignment fields in the virtio
* config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated.
* In this case, we use default values.
*/
if (!max_discard_segs)
max_discard_segs = sg_elems;
blk_queue_max_discard_segments(q,
min(max_discard_segs, MAX_DISCARD_SEGMENTS));
if (discard_granularity)
q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT;
else
q->limits.discard_granularity = blk_size;
}
virtblk_update_capacity(vblk, false); virtblk_update_capacity(vblk, false);
virtio_device_ready(vdev); virtio_device_ready(vdev);
@ -1518,27 +1541,11 @@ static int virtblk_probe(struct virtio_device *vdev)
* All steps that follow use the VQs therefore they need to be * All steps that follow use the VQs therefore they need to be
* placed after the virtio_device_ready() call above. * placed after the virtio_device_ready() call above.
*/ */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
u8 model; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk, NULL);
virtio_cread(vdev, struct virtio_blk_config, zoned.model,
&model);
switch (model) {
case VIRTIO_BLK_Z_NONE:
case VIRTIO_BLK_Z_HA:
/* Present the host-aware device as non-zoned */
break;
case VIRTIO_BLK_Z_HM:
err = virtblk_probe_zoned_device(vdev, vblk, q);
if (err) if (err)
goto out_cleanup_disk; goto out_cleanup_disk;
break;
default:
dev_err(&vdev->dev, "unsupported zone model %d\n",
model);
err = -EINVAL;
goto out_cleanup_disk;
}
} }
err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);

View File

@ -941,39 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = {
.complete = blkif_complete_rq, .complete = blkif_complete_rq,
}; };
static void blkif_set_queue_limits(struct blkfront_info *info) static void blkif_set_queue_limits(const struct blkfront_info *info,
struct queue_limits *lim)
{ {
struct request_queue *rq = info->rq;
struct gendisk *gd = info->gd;
unsigned int segments = info->max_indirect_segments ? : unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST; BLKIF_MAX_SEGMENTS_PER_REQUEST;
blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
if (info->feature_discard) { if (info->feature_discard) {
blk_queue_max_discard_sectors(rq, get_capacity(gd)); lim->max_hw_discard_sectors = UINT_MAX;
rq->limits.discard_granularity = info->discard_granularity ?: if (info->discard_granularity)
info->physical_sector_size; lim->discard_granularity = info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment; lim->discard_alignment = info->discard_alignment;
if (info->feature_secdiscard) if (info->feature_secdiscard)
blk_queue_max_secure_erase_sectors(rq, lim->max_secure_erase_sectors = UINT_MAX;
get_capacity(gd));
} }
/* Hard sector size and max sectors impersonate the equiv. hardware. */ /* Hard sector size and max sectors impersonate the equiv. hardware. */
blk_queue_logical_block_size(rq, info->sector_size); lim->logical_block_size = info->sector_size;
blk_queue_physical_block_size(rq, info->physical_sector_size); lim->physical_block_size = info->physical_sector_size;
blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512;
/* Each segment in a request is up to an aligned page in size. */ /* Each segment in a request is up to an aligned page in size. */
blk_queue_segment_boundary(rq, PAGE_SIZE - 1); lim->seg_boundary_mask = PAGE_SIZE - 1;
blk_queue_max_segment_size(rq, PAGE_SIZE); lim->max_segment_size = PAGE_SIZE;
/* Ensure a merged request will fit in a single I/O ring slot. */ /* Ensure a merged request will fit in a single I/O ring slot. */
blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); lim->max_segments = segments / GRANTS_PER_PSEG;
/* Make sure buffer addresses are sector-aligned. */ /* Make sure buffer addresses are sector-aligned. */
blk_queue_dma_alignment(rq, 511); lim->dma_alignment = 511;
} }
static const char *flush_info(struct blkfront_info *info) static const char *flush_info(struct blkfront_info *info)
@ -1070,6 +1066,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info, u16 sector_size, struct blkfront_info *info, u16 sector_size,
unsigned int physical_sector_size) unsigned int physical_sector_size)
{ {
struct queue_limits lim = {};
struct gendisk *gd; struct gendisk *gd;
int nr_minors = 1; int nr_minors = 1;
int err; int err;
@ -1136,11 +1133,13 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (err) if (err)
goto out_release_minors; goto out_release_minors;
gd = blk_mq_alloc_disk(&info->tag_set, info); blkif_set_queue_limits(info, &lim);
gd = blk_mq_alloc_disk(&info->tag_set, &lim, info);
if (IS_ERR(gd)) { if (IS_ERR(gd)) {
err = PTR_ERR(gd); err = PTR_ERR(gd);
goto out_free_tag_set; goto out_free_tag_set;
} }
blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue);
strcpy(gd->disk_name, DEV_NAME); strcpy(gd->disk_name, DEV_NAME);
ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
@ -1162,7 +1161,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->gd = gd; info->gd = gd;
info->sector_size = sector_size; info->sector_size = sector_size;
info->physical_sector_size = physical_sector_size; info->physical_sector_size = physical_sector_size;
blkif_set_queue_limits(info);
xlvbd_flush(info); xlvbd_flush(info);
@ -2006,18 +2004,19 @@ static int blkfront_probe(struct xenbus_device *dev,
static int blkif_recover(struct blkfront_info *info) static int blkif_recover(struct blkfront_info *info)
{ {
struct queue_limits lim;
unsigned int r_index; unsigned int r_index;
struct request *req, *n; struct request *req, *n;
int rc; int rc;
struct bio *bio; struct bio *bio;
unsigned int segs;
struct blkfront_ring_info *rinfo; struct blkfront_ring_info *rinfo;
lim = queue_limits_start_update(info->rq);
blkfront_gather_backend_features(info); blkfront_gather_backend_features(info);
/* Reset limits changed by blk_mq_update_nr_hw_queues(). */ blkif_set_queue_limits(info, &lim);
blkif_set_queue_limits(info); rc = queue_limits_commit_update(info->rq, &lim);
segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; if (rc)
blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG); return rc;
for_each_rinfo(info, rinfo, r_index) { for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo); rc = blkfront_setup_indirect(rinfo);
@ -2037,7 +2036,9 @@ static int blkif_recover(struct blkfront_info *info)
list_for_each_entry_safe(req, n, &info->requests, queuelist) { list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */ /* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist); list_del_init(&req->queuelist);
BUG_ON(req->nr_phys_segments > segs); BUG_ON(req->nr_phys_segments >
(info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST));
blk_mq_requeue_request(req, false); blk_mq_requeue_request(req, false);
} }
blk_mq_start_stopped_hw_queues(info->rq, true); blk_mq_start_stopped_hw_queues(info->rq, true);

View File

@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor)
struct gendisk *disk; struct gendisk *disk;
int err; int err;
disk = blk_mq_alloc_disk(&tag_set, NULL); disk = blk_mq_alloc_disk(&tag_set, NULL, NULL);
if (IS_ERR(disk)) if (IS_ERR(disk))
return PTR_ERR(disk); return PTR_ERR(disk);

View File

@ -2177,6 +2177,28 @@ ATTRIBUTE_GROUPS(zram_disk);
*/ */
static int zram_add(void) static int zram_add(void)
{ {
struct queue_limits lim = {
.logical_block_size = ZRAM_LOGICAL_BLOCK_SIZE,
/*
* To ensure that we always get PAGE_SIZE aligned and
* n*PAGE_SIZED sized I/O requests.
*/
.physical_block_size = PAGE_SIZE,
.io_min = PAGE_SIZE,
.io_opt = PAGE_SIZE,
.max_hw_discard_sectors = UINT_MAX,
/*
* zram_bio_discard() will clear all logical blocks if logical
* block size is identical with physical block size(PAGE_SIZE).
* But if it is different, we will skip discarding some parts of
* logical blocks in the part of the request range which isn't
* aligned to physical block size. So we can't ensure that all
* discarded logical blocks are zeroed.
*/
#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
.max_write_zeroes_sectors = UINT_MAX,
#endif
};
struct zram *zram; struct zram *zram;
int ret, device_id; int ret, device_id;
@ -2195,11 +2217,11 @@ static int zram_add(void)
#endif #endif
/* gendisk structure */ /* gendisk structure */
zram->disk = blk_alloc_disk(NUMA_NO_NODE); zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (!zram->disk) { if (IS_ERR(zram->disk)) {
pr_err("Error allocating disk structure for device %d\n", pr_err("Error allocating disk structure for device %d\n",
device_id); device_id);
ret = -ENOMEM; ret = PTR_ERR(zram->disk);
goto out_free_idr; goto out_free_idr;
} }
@ -2216,29 +2238,6 @@ static int zram_add(void)
/* zram devices sort of resembles non-rotational disks */ /* zram devices sort of resembles non-rotational disks */
blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
/*
* To ensure that we always get PAGE_SIZE aligned
* and n*PAGE_SIZED sized I/O requests.
*/
blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
blk_queue_logical_block_size(zram->disk->queue,
ZRAM_LOGICAL_BLOCK_SIZE);
blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
/*
* zram_bio_discard() will clear all logical blocks if logical block
* size is identical with physical block size(PAGE_SIZE). But if it is
* different, we will skip discarding some parts of logical blocks in
* the part of the request range which isn't aligned to physical block
* size. So we can't ensure that all discarded logical blocks are
* zeroed.
*/
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups); ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret) if (ret)

View File

@ -724,11 +724,6 @@ static void probe_gdrom_setupdisk(void)
static int probe_gdrom_setupqueue(void) static int probe_gdrom_setupqueue(void)
{ {
blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
/* using DMA so memory will need to be contiguous */
blk_queue_max_segments(gd.gdrom_rq, 1);
/* set a large max size to get most from DMA */
blk_queue_max_segment_size(gd.gdrom_rq, 0x40000);
gd.disk->queue = gd.gdrom_rq; gd.disk->queue = gd.gdrom_rq;
return gdrom_init_dma_mode(); return gdrom_init_dma_mode();
} }
@ -743,6 +738,13 @@ static const struct blk_mq_ops gdrom_mq_ops = {
*/ */
static int probe_gdrom(struct platform_device *devptr) static int probe_gdrom(struct platform_device *devptr)
{ {
struct queue_limits lim = {
.logical_block_size = GDROM_HARD_SECTOR,
/* using DMA so memory will need to be contiguous */
.max_segments = 1,
/* set a large max size to get most from DMA */
.max_segment_size = 0x40000,
};
int err; int err;
/* /*
@ -778,7 +780,7 @@ static int probe_gdrom(struct platform_device *devptr)
if (err) if (err)
goto probe_fail_free_cd_info; goto probe_fail_free_cd_info;
gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL); gd.disk = blk_mq_alloc_disk(&gd.tag_set, &lim, NULL);
if (IS_ERR(gd.disk)) { if (IS_ERR(gd.disk)) {
err = PTR_ERR(gd.disk); err = PTR_ERR(gd.disk);
goto probe_fail_free_tag_set; goto probe_fail_free_tag_set;
@ -829,7 +831,7 @@ probe_fail_no_mem:
return err; return err;
} }
static int remove_gdrom(struct platform_device *devptr) static void remove_gdrom(struct platform_device *devptr)
{ {
blk_mq_free_tag_set(&gd.tag_set); blk_mq_free_tag_set(&gd.tag_set);
free_irq(HW_EVENT_GDROM_CMD, &gd); free_irq(HW_EVENT_GDROM_CMD, &gd);
@ -840,13 +842,11 @@ static int remove_gdrom(struct platform_device *devptr)
unregister_cdrom(gd.cd_info); unregister_cdrom(gd.cd_info);
kfree(gd.cd_info); kfree(gd.cd_info);
kfree(gd.toc); kfree(gd.toc);
return 0;
} }
static struct platform_driver gdrom_driver = { static struct platform_driver gdrom_driver = {
.probe = probe_gdrom, .probe = probe_gdrom,
.remove = remove_gdrom, .remove_new = remove_gdrom,
.driver = { .driver = {
.name = GDROM_DEV_NAME, .name = GDROM_DEV_NAME,
}, },

View File

@ -900,9 +900,23 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q; struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX, const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t)); SIZE_MAX / sizeof(atomic_t));
struct queue_limits lim = {
.max_hw_sectors = UINT_MAX,
.max_sectors = UINT_MAX,
.max_segment_size = UINT_MAX,
.max_segments = BIO_MAX_VECS,
.max_hw_discard_sectors = UINT_MAX,
.io_min = block_size,
.logical_block_size = block_size,
.physical_block_size = block_size,
};
uint64_t n; uint64_t n;
int idx; int idx;
if (cached_bdev) {
d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT;
lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev));
}
if (!d->stripe_size) if (!d->stripe_size)
d->stripe_size = 1 << 31; d->stripe_size = 1 << 31;
else if (d->stripe_size < BCH_MIN_STRIPE_SZ) else if (d->stripe_size < BCH_MIN_STRIPE_SZ)
@ -935,8 +949,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
goto out_ida_remove; goto out_ida_remove;
d->disk = blk_alloc_disk(NUMA_NO_NODE); if (lim.logical_block_size > PAGE_SIZE && cached_bdev) {
if (!d->disk) /*
* This should only happen with BCACHE_SB_VERSION_BDEV.
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
idx, lim.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
lim.logical_block_size = bdev_logical_block_size(cached_bdev);
}
d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(d->disk))
goto out_bioset_exit; goto out_bioset_exit;
set_capacity(d->disk, sectors); set_capacity(d->disk, sectors);
@ -949,27 +976,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
d->disk->private_data = d; d->disk->private_data = d;
q = d->disk->queue; q = d->disk->queue;
q->limits.max_hw_sectors = UINT_MAX;
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_VECS;
blk_queue_max_discard_sectors(q, UINT_MAX);
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
/*
* This should only happen with BCACHE_SB_VERSION_BDEV.
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
d->disk->disk_name, q->limits.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
@ -1416,9 +1422,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
} }
dc->disk.stripe_size = q->limits.io_opt >> 9; if (bdev_io_opt(dc->bdev))
if (dc->disk.stripe_size)
dc->partial_stripes_expensive = dc->partial_stripes_expensive =
q->limits.raid_partial_stripes_expensive; q->limits.raid_partial_stripes_expensive;
@ -1428,9 +1432,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
if (ret) if (ret)
return ret; return ret;
blk_queue_io_opt(dc->disk.disk->queue,
max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
atomic_set(&dc->io_errors, 0); atomic_set(&dc->io_errors, 0);
dc->io_disable = false; dc->io_disable = false;
dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;

View File

@ -213,6 +213,7 @@ struct raid_dev {
#define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7 #define RT_FLAG_RS_RESYNCING 7
#define RT_FLAG_RS_GROW 8 #define RT_FLAG_RS_GROW 8
#define RT_FLAG_RS_FROZEN 9
/* Array elements of 64 bit needed for rebuild/failed disk bits */ /* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@ -3240,11 +3241,12 @@ size_check:
rs->md.ro = 1; rs->md.ro = 1;
rs->md.in_sync = 1; rs->md.in_sync = 1;
/* Keep array frozen until resume. */
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */ /* Has to be held on running the array */
mddev_suspend_and_lock_nointr(&rs->md); mddev_suspend_and_lock_nointr(&rs->md);
/* Keep array frozen until resume. */
md_frozen_sync_thread(&rs->md);
r = md_run(&rs->md); r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */ rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) { if (r) {
@ -3339,7 +3341,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
return DM_MAPIO_REQUEUE; return DM_MAPIO_REQUEUE;
md_handle_request(mddev, bio); if (unlikely(!md_handle_request(mddev, bio)))
return DM_MAPIO_REQUEUE;
return DM_MAPIO_SUBMITTED; return DM_MAPIO_SUBMITTED;
} }
@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
{ {
struct raid_set *rs = ti->private; struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
int ret = 0;
if (!mddev->pers || !mddev->pers->sync_request) if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL; return -EINVAL;
if (!strcasecmp(argv[0], "frozen")) if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) ||
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags))
else return -EBUSY;
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (!strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) { ret = mddev_lock(mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery); if (ret)
md_reap_sync_thread(mddev); return ret;
md_frozen_sync_thread(mddev);
mddev_unlock(mddev);
} else if (!strcasecmp(argv[0], "idle")) {
ret = mddev_lock(mddev);
if (ret)
return ret;
md_idle_sync_thread(mddev);
mddev_unlock(mddev);
} }
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
return -EBUSY; return -EBUSY;
else if (!strcasecmp(argv[0], "resync")) else if (!strcasecmp(argv[0], "resync"))
; /* MD_RECOVERY_NEEDED set below */ ; /* MD_RECOVERY_NEEDED set below */
@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
} }
static void raid_presuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
/*
* From now on, disallow raid_message() to change sync_thread until
* resume, raid_postsuspend() is too late.
*/
set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
if (!reshape_interrupted(mddev))
return;
/*
* For raid456, if reshape is interrupted, IO across reshape position
* will never make progress, while caller will wait for IO to be done.
* Inform raid456 to handle those IO to prevent deadlock.
*/
if (mddev->pers && mddev->pers->prepare_suspend)
mddev->pers->prepare_suspend(mddev);
}
static void raid_presuspend_undo(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
}
static void raid_postsuspend(struct dm_target *ti) static void raid_postsuspend(struct dm_target *ti)
{ {
struct raid_set *rs = ti->private; struct raid_set *rs = ti->private;
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
/* Writes have to be stopped before suspending to avoid deadlocks. */ /*
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) * sync_thread must be stopped during suspend, and writes have
* to be stopped before suspending to avoid deadlocks.
*/
md_stop_writes(&rs->md); md_stop_writes(&rs->md);
mddev_suspend(&rs->md, false); mddev_suspend(&rs->md, false);
} }
} }
@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti)
} }
/* Check for any resize/reshape on @rs and adjust/initiate */ /* Check for any resize/reshape on @rs and adjust/initiate */
/* Be prepared for mddev_resume() in raid_resume() */
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp; mddev->resync_min = mddev->recovery_cp;
@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti)
* Take this opportunity to check whether any failed * Take this opportunity to check whether any failed
* devices are reachable again. * devices are reachable again.
*/ */
mddev_lock_nointr(mddev);
attempt_restore_of_faulty_devices(rs); attempt_restore_of_faulty_devices(rs);
mddev_unlock(mddev);
} }
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti)
if (mddev->delta_disks < 0) if (mddev->delta_disks < 0)
rs_set_capacity(rs); rs_set_capacity(rs);
WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery));
WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
mddev_lock_nointr(mddev); mddev_lock_nointr(mddev);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0; mddev->ro = 0;
mddev->in_sync = 0; mddev->in_sync = 0;
md_unfrozen_sync_thread(mddev);
mddev_unlock_and_resume(mddev); mddev_unlock_and_resume(mddev);
} }
} }
@ -4074,6 +4123,8 @@ static struct target_type raid_target = {
.message = raid_message, .message = raid_message,
.iterate_devices = raid_iterate_devices, .iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints, .io_hints = raid_io_hints,
.presuspend = raid_presuspend,
.presuspend_undo = raid_presuspend_undo,
.postsuspend = raid_postsuspend, .postsuspend = raid_postsuspend,
.preresume = raid_preresume, .preresume = raid_preresume,
.resume = raid_resume, .resume = raid_resume,

View File

@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
bool wc = false, fua = false; bool wc = false, fua = false;
int r; int r;
/*
* Copy table's limits to the DM device's request_queue
*/
q->limits = *limits;
if (dm_table_supports_nowait(t)) if (dm_table_supports_nowait(t))
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
else else
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
if (!dm_table_supports_discards(t)) { if (!dm_table_supports_discards(t)) {
q->limits.max_discard_sectors = 0; limits->max_hw_discard_sectors = 0;
q->limits.max_hw_discard_sectors = 0; limits->discard_granularity = 0;
q->limits.discard_granularity = 0; limits->discard_alignment = 0;
q->limits.discard_alignment = 0; limits->discard_misaligned = 0;
q->limits.discard_misaligned = 0;
} }
if (!dm_table_supports_write_zeroes(t))
limits->max_write_zeroes_sectors = 0;
if (!dm_table_supports_secure_erase(t)) if (!dm_table_supports_secure_erase(t))
q->limits.max_secure_erase_sectors = 0; limits->max_secure_erase_sectors = 0;
r = queue_limits_set(q, limits);
if (r)
return r;
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
wc = true; wc = true;
@ -2007,9 +2008,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
if (!dm_table_supports_write_zeroes(t))
q->limits.max_write_zeroes_sectors = 0;
dm_table_verify_integrity(t); dm_table_verify_integrity(t);
/* /*
@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
} }
dm_update_crypto_profile(q, t); dm_update_crypto_profile(q, t);
disk_update_readahead(t->md->disk);
/* /*
* Check for request-based device is left to * Check for request-based device is left to

View File

@ -1655,10 +1655,13 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
struct dmz_dev *dev = zone->dev; struct dmz_dev *dev = zone->dev;
unsigned int noio_flag;
noio_flag = memalloc_noio_save();
ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
dmz_start_sect(zmd, zone), dmz_start_sect(zmd, zone),
zmd->zone_nr_sectors, GFP_NOIO); zmd->zone_nr_sectors);
memalloc_noio_restore(noio_flag);
if (ret) { if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d", dmz_dev_err(dev, "Reset zone %u failed %d",
zone->id, ret); zone->id, ret);

View File

@ -2101,8 +2101,8 @@ static struct mapped_device *alloc_dev(int minor)
* established. If request-based table is loaded: blk-mq will * established. If request-based table is loaded: blk-mq will
* override accordingly. * override accordingly.
*/ */
md->disk = blk_alloc_disk(md->numa_node_id); md->disk = blk_alloc_disk(NULL, md->numa_node_id);
if (!md->disk) if (IS_ERR(md->disk))
goto bad; goto bad;
md->queue = md->disk->queue; md->queue = md->disk->queue;

View File

@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
sector_t doff; sector_t doff;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
if (pg_index == store->file_pages - 1) { /* we compare length (page numbers), not page offset. */
if ((pg_index - store->sb_index) == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0) if (last_page_size == 0)
@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
struct page *page = store->filemap[pg_index]; struct page *page = store->filemap[pg_index];
if (mddev_is_clustered(bitmap->mddev)) { if (mddev_is_clustered(bitmap->mddev)) {
pg_index += bitmap->cluster_slot * /* go to node bitmap area starting point */
DIV_ROUND_UP(store->bytes, PAGE_SIZE); pg_index += store->sb_index;
} }
if (store->file) if (store->file)
@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk); unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0; unsigned long node_offset = 0;
index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev)) if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages; node_offset = bitmap->cluster_slot * store->file_pages;
@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk); unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0; unsigned long node_offset = 0;
index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev)) if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages; node_offset = bitmap->cluster_slot * store->file_pages;
@ -1043,8 +1046,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (dirty || need_write) { if (dirty || need_write) {
if (!writing) { if (!writing) {
md_bitmap_wait_writes(bitmap); md_bitmap_wait_writes(bitmap);
if (bitmap->mddev->queue) mddev_add_trace_msg(bitmap->mddev,
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_unplug"); "md bitmap_unplug");
} }
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
@ -1316,9 +1318,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
} }
bitmap->allclean = 1; bitmap->allclean = 1;
if (bitmap->mddev->queue) mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_daemon_work");
/* Any file-page which is PENDING now needs to be written. /* Any file-page which is PENDING now needs to be written.
* So set NEEDWRITE now, then after we make any last-minute changes * So set NEEDWRITE now, then after we make any last-minute changes

View File

@ -1,17 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINEAR_H
#define _LINEAR_H
struct dev_info {
struct md_rdev *rdev;
sector_t end_sector;
};
struct linear_conf
{
struct rcu_head rcu;
sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */
struct dev_info disks[] __counted_by(raid_disks);
};
#endif

View File

@ -1,32 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MULTIPATH_H
#define _MULTIPATH_H
struct multipath_info {
struct md_rdev *rdev;
};
struct mpconf {
struct mddev *mddev;
struct multipath_info *multipaths;
int raid_disks;
spinlock_t device_lock;
struct list_head retry_list;
mempool_t pool;
};
/*
* this is our 'private' 'collective' MULTIPATH buffer head.
* it contains information about what kind of IO operations were started
* for this MULTIPATH operation, and about their status:
*/
struct multipath_bh {
struct mddev *mddev;
struct bio *master_bio;
struct bio bio;
int path;
struct list_head retry_list;
};
#endif

View File

@ -65,7 +65,6 @@
#include <linux/percpu-refcount.h> #include <linux/percpu-refcount.h>
#include <linux/part_stat.h> #include <linux/part_stat.h>
#include <trace/events/block.h>
#include "md.h" #include "md.h"
#include "md-bitmap.h" #include "md-bitmap.h"
#include "md-cluster.h" #include "md-cluster.h"
@ -99,18 +98,6 @@ static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread); static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
enum md_ro_state {
MD_RDWR,
MD_RDONLY,
MD_AUTO_READ,
MD_MAX_STATE
};
static bool md_is_rdwr(struct mddev *mddev)
{
return (mddev->ro == MD_RDWR);
}
/* /*
* Default number of read corrections we'll attempt on an rdev * Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error * before ejecting it from the array. We divide the read error
@ -378,7 +365,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true; return true;
} }
void md_handle_request(struct mddev *mddev, struct bio *bio) bool md_handle_request(struct mddev *mddev, struct bio *bio)
{ {
check_suspended: check_suspended:
if (is_suspended(mddev, bio)) { if (is_suspended(mddev, bio)) {
@ -386,7 +373,7 @@ check_suspended:
/* Bail out if REQ_NOWAIT is set for the bio */ /* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) { if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); bio_wouldblock_error(bio);
return; return true;
} }
for (;;) { for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait, prepare_to_wait(&mddev->sb_wait, &__wait,
@ -402,10 +389,13 @@ check_suspended:
if (!mddev->pers->make_request(mddev, bio)) { if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io); percpu_ref_put(&mddev->active_io);
if (!mddev->gendisk && mddev->pers->prepare_suspend)
return false;
goto check_suspended; goto check_suspended;
} }
percpu_ref_put(&mddev->active_io); percpu_ref_put(&mddev->active_io);
return true;
} }
EXPORT_SYMBOL(md_handle_request); EXPORT_SYMBOL(md_handle_request);
@ -529,6 +519,24 @@ void mddev_resume(struct mddev *mddev)
} }
EXPORT_SYMBOL_GPL(mddev_resume); EXPORT_SYMBOL_GPL(mddev_resume);
/* sync bdev before setting device to readonly or stopping raid*/
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
mutex_lock(&mddev->open_mutex);
if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
mutex_unlock(&mddev->open_mutex);
sync_blockdev(mddev->gendisk->part0);
return 0;
}
/* /*
* Generic flush handling for md * Generic flush handling for md
*/ */
@ -2406,7 +2414,7 @@ int md_integrity_register(struct mddev *mddev)
if (list_empty(&mddev->disks)) if (list_empty(&mddev->disks))
return 0; /* nothing to do */ return 0; /* nothing to do */
if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */ return 0; /* shouldn't register, or already is */
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */ /* skip spares and non-functional disks */
@ -2459,7 +2467,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{ {
struct blk_integrity *bi_mddev; struct blk_integrity *bi_mddev;
if (!mddev->gendisk) if (mddev_is_dm(mddev))
return 0; return 0;
bi_mddev = blk_get_integrity(mddev->gendisk); bi_mddev = blk_get_integrity(mddev->gendisk);
@ -2566,6 +2574,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
fail: fail:
pr_warn("md: failed to register dev-%s for %s\n", pr_warn("md: failed to register dev-%s for %s\n",
b, mdname(mddev)); b, mdname(mddev));
mddev_destroy_serial_pool(mddev, rdev);
return err; return err;
} }
@ -2595,7 +2604,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
list_del_rcu(&rdev->same_set); list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev); pr_debug("md: unbind<%pg>\n", rdev->bdev);
mddev_destroy_serial_pool(rdev->mddev, rdev); mddev_destroy_serial_pool(rdev->mddev, rdev);
rdev->mddev = NULL; WRITE_ONCE(rdev->mddev, NULL);
sysfs_remove_link(&rdev->kobj, "block"); sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_state);
sysfs_put(rdev->sysfs_unack_badblocks); sysfs_put(rdev->sysfs_unack_badblocks);
@ -2851,8 +2860,7 @@ repeat:
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync); mdname(mddev), mddev->in_sync);
if (mddev->queue) mddev_add_trace_msg(mddev, "md md_update_sb");
blk_add_trace_msg(mddev->queue, "md md_update_sb");
rewrite: rewrite:
md_bitmap_update_sb(mddev->bitmap); md_bitmap_update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
@ -2933,7 +2941,6 @@ static int add_bound_rdev(struct md_rdev *rdev)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_new_event(); md_new_event();
md_wakeup_thread(mddev->thread);
return 0; return 0;
} }
@ -3048,10 +3055,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (err == 0) { if (err == 0) {
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
if (mddev->pers) { if (mddev->pers)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
}
md_new_event(); md_new_event();
} }
} }
@ -3081,7 +3086,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
clear_bit(BlockedBadBlocks, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait); wake_up(&rdev->blocked_wait);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0; err = 0;
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
@ -3119,7 +3123,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
!test_bit(Replacement, &rdev->flags)) !test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags); set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0; err = 0;
} else if (cmd_match(buf, "-want_replacement")) { } else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed. /* Clearing 'want_replacement' is always allowed.
@ -3249,7 +3252,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
return -EBUSY; return -EBUSY;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) { } else if (rdev->mddev->pers) {
/* Activating a spare .. or possibly reactivating /* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here. * if we ever get bitmaps working here.
@ -3343,8 +3345,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
if (kstrtoull(buf, 10, &new_offset) < 0) if (kstrtoull(buf, 10, &new_offset) < 0)
return -EINVAL; return -EINVAL;
if (mddev->sync_thread || if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
return -EBUSY; return -EBUSY;
if (new_offset == rdev->data_offset) if (new_offset == rdev->data_offset)
/* reset is always permitted */ /* reset is always permitted */
@ -3675,7 +3676,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct kernfs_node *kn = NULL; struct kernfs_node *kn = NULL;
bool suspend = false; bool suspend = false;
ssize_t rv; ssize_t rv;
struct mddev *mddev = rdev->mddev; struct mddev *mddev = READ_ONCE(rdev->mddev);
if (!entry->store) if (!entry->store)
return -EIO; return -EIO;
@ -4017,8 +4018,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
*/ */
rv = -EBUSY; rv = -EBUSY;
if (mddev->sync_thread || if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector || mddev->reshape_position != MaxSector ||
mddev->sysfs_active) mddev->sysfs_active)
goto out_unlock; goto out_unlock;
@ -4168,7 +4168,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1; mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer); del_timer_sync(&mddev->safemode_timer);
} }
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev); pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (!mddev->thread) if (!mddev->thread)
@ -4475,8 +4474,8 @@ array_state_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", array_states[st]); return sprintf(page, "%s\n", array_states[st]);
} }
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); static int do_md_stop(struct mddev *mddev, int ro);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); static int md_set_readonly(struct mddev *mddev);
static int restart_array(struct mddev *mddev); static int restart_array(struct mddev *mddev);
static ssize_t static ssize_t
@ -4493,6 +4492,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case broken: /* cannot be set */ case broken: /* cannot be set */
case bad_word: case bad_word:
return -EINVAL; return -EINVAL;
case clear:
case readonly:
case inactive:
case read_auto:
if (!mddev->pers || !md_is_rdwr(mddev))
break;
/* write sysfs will not open mddev and opener should be 0 */
err = mddev_set_closing_and_sync_blockdev(mddev, 0);
if (err)
return err;
break;
default: default:
break; break;
} }
@ -4526,14 +4536,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case inactive: case inactive:
/* stop an active array, return 0 otherwise */ /* stop an active array, return 0 otherwise */
if (mddev->pers) if (mddev->pers)
err = do_md_stop(mddev, 2, NULL); err = do_md_stop(mddev, 2);
break; break;
case clear: case clear:
err = do_md_stop(mddev, 0, NULL); err = do_md_stop(mddev, 0);
break; break;
case readonly: case readonly:
if (mddev->pers) if (mddev->pers)
err = md_set_readonly(mddev, NULL); err = md_set_readonly(mddev);
else { else {
mddev->ro = MD_RDONLY; mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1); set_disk_ro(mddev->gendisk, 1);
@ -4543,7 +4553,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case read_auto: case read_auto:
if (mddev->pers) { if (mddev->pers) {
if (md_is_rdwr(mddev)) if (md_is_rdwr(mddev))
err = md_set_readonly(mddev, NULL); err = md_set_readonly(mddev);
else if (mddev->ro == MD_RDONLY) else if (mddev->ro == MD_RDONLY)
err = restart_array(mddev); err = restart_array(mddev);
if (err == 0) { if (err == 0) {
@ -4592,6 +4602,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
} }
mddev_unlock(mddev); mddev_unlock(mddev);
if (st == readonly || st == read_auto || st == inactive ||
(err && st == clear))
clear_bit(MD_CLOSING, &mddev->flags);
return err ?: len; return err ?: len;
} }
static struct md_sysfs_entry md_array_state = static struct md_sysfs_entry md_array_state =
@ -4919,6 +4934,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
mddev_lock_nointr(mddev); mddev_lock_nointr(mddev);
} }
/*
 * md_idle_sync_thread() - unfreeze recovery and stop the sync thread.
 * @mddev: array to operate on.
 *
 * The caller must hold mddev->reconfig_mutex (checked by lockdep).
 * MD_RECOVERY_FROZEN is cleared first, then stop_sync_thread() is called
 * with locked=true and check_seq=true.
 * NOTE(review): presumably stop_sync_thread() waits for the running sync
 * thread to finish -- confirm against its definition.
 */
void md_idle_sync_thread(struct mddev *mddev)
{
lockdep_assert_held(&mddev->reconfig_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev, true, true);
}
EXPORT_SYMBOL_GPL(md_idle_sync_thread);
/*
 * md_frozen_sync_thread() - freeze recovery and stop the sync thread.
 * @mddev: array to operate on.
 *
 * The caller must hold mddev->reconfig_mutex (checked by lockdep).
 * MD_RECOVERY_FROZEN is set before stop_sync_thread() is called with
 * locked=true and check_seq=false; the flag is left set on return.
 */
void md_frozen_sync_thread(struct mddev *mddev)
{
lockdep_assert_held(&mddev->reconfig_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev, true, false);
}
EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
/*
 * md_unfrozen_sync_thread() - lift a recovery freeze and request recovery.
 * @mddev: array to operate on.
 *
 * The caller must hold mddev->reconfig_mutex (checked by lockdep).
 * Clears MD_RECOVERY_FROZEN, sets MD_RECOVERY_NEEDED, wakes mddev->thread
 * so the new flag state is noticed, and notifies the sysfs_action dirent.
 */
void md_unfrozen_sync_thread(struct mddev *mddev)
{
lockdep_assert_held(&mddev->reconfig_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
}
EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
static void idle_sync_thread(struct mddev *mddev) static void idle_sync_thread(struct mddev *mddev)
{ {
mutex_lock(&mddev->sync_mutex); mutex_lock(&mddev->sync_mutex);
@ -5710,6 +5754,51 @@ static const struct kobj_type md_ktype = {
int mdp_major = 0; int mdp_major = 0;
/* stack the limit for all rdevs into lim */
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev) {
queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
}
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
/*
 * Stack the limits of a newly added @rdev on top of the array's current
 * queue limits.
 *
 * Returns 0 without touching anything when the array is driven by DM
 * (no gendisk), otherwise the result of queue_limits_commit_update().
 */
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	struct request_queue *q;
	struct queue_limits lim;

	if (mddev_is_dm(mddev))
		return 0;

	q = mddev->gendisk->queue;
	lim = queue_limits_start_update(q);
	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
				mddev->gendisk->disk_name);
	return queue_limits_commit_update(q, &lim);
}
EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
/*
 * Recompute the optimal I/O size after a reshape as io_min * @nr_stripes.
 *
 * Nothing is done for DM-driven arrays (no gendisk).  The array is
 * suspended around the limits update; if it cannot be suspended the
 * update is skipped.  The result of the commit is not reported.
 */
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
{
	struct request_queue *q;
	struct queue_limits lim;

	if (mddev_is_dm(mddev))
		return;

	/* don't bother updating io_opt if we can't suspend the array */
	if (mddev_suspend(mddev, false) < 0)
		return;

	q = mddev->gendisk->queue;
	lim = queue_limits_start_update(q);
	lim.io_opt = lim.io_min * nr_stripes;
	queue_limits_commit_update(q, &lim);

	mddev_resume(mddev);
}
EXPORT_SYMBOL_GPL(mddev_update_io_opt);
static void mddev_delayed_delete(struct work_struct *ws) static void mddev_delayed_delete(struct work_struct *ws)
{ {
struct mddev *mddev = container_of(ws, struct mddev, del_work); struct mddev *mddev = container_of(ws, struct mddev, del_work);
@ -5774,10 +5863,11 @@ struct mddev *md_alloc(dev_t dev, char *name)
*/ */
mddev->hold_active = UNTIL_STOP; mddev->hold_active = UNTIL_STOP;
error = -ENOMEM; disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
disk = blk_alloc_disk(NUMA_NO_NODE); if (IS_ERR(disk)) {
if (!disk) error = PTR_ERR(disk);
goto out_free_mddev; goto out_free_mddev;
}
disk->major = MAJOR(mddev->unit); disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift; disk->first_minor = unit << shift;
@ -5791,9 +5881,7 @@ struct mddev *md_alloc(dev_t dev, char *name)
disk->fops = &md_fops; disk->fops = &md_fops;
disk->private_data = mddev; disk->private_data = mddev;
mddev->queue = disk->queue; blk_queue_write_cache(disk->queue, true, true);
blk_set_stacking_limits(&mddev->queue->limits);
blk_queue_write_cache(mddev->queue, true, true);
disk->events |= DISK_EVENT_MEDIA_CHANGE; disk->events |= DISK_EVENT_MEDIA_CHANGE;
mddev->gendisk = disk; mddev->gendisk = disk;
error = add_disk(disk); error = add_disk(disk);
@ -5935,7 +6023,7 @@ int md_run(struct mddev *mddev)
invalidate_bdev(rdev->bdev); invalidate_bdev(rdev->bdev);
if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
mddev->ro = MD_RDONLY; mddev->ro = MD_RDONLY;
if (mddev->gendisk) if (!mddev_is_dm(mddev))
set_disk_ro(mddev->gendisk, 1); set_disk_ro(mddev->gendisk, 1);
} }
@ -6038,7 +6126,10 @@ int md_run(struct mddev *mddev)
pr_warn("True protection against single-disk failure might be compromised.\n"); pr_warn("True protection against single-disk failure might be compromised.\n");
} }
/* dm-raid expect sync_thread to be frozen until resume */
if (mddev->gendisk)
mddev->recovery = 0; mddev->recovery = 0;
/* may be over-ridden by personality */ /* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors; mddev->resync_max_sectors = mddev->dev_sectors;
@ -6094,7 +6185,8 @@ int md_run(struct mddev *mddev)
} }
} }
if (mddev->queue) { if (!mddev_is_dm(mddev)) {
struct request_queue *q = mddev->gendisk->queue;
bool nonrot = true; bool nonrot = true;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
@ -6106,14 +6198,14 @@ int md_run(struct mddev *mddev)
if (mddev->degraded) if (mddev->degraded)
nonrot = false; nonrot = false;
if (nonrot) if (nonrot)
blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
else else
blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
/* Set the NOWAIT flags if all underlying devices support it */ /* Set the NOWAIT flags if all underlying devices support it */
if (nowait) if (nowait)
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
} }
if (pers->sync_request) { if (pers->sync_request) {
if (mddev->kobj.sd && if (mddev->kobj.sd &&
@ -6192,7 +6284,6 @@ int do_md_run(struct mddev *mddev)
/* run start up tasks that require md_thread */ /* run start up tasks that require md_thread */
md_start(mddev); md_start(mddev);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
@ -6213,7 +6304,6 @@ int md_start(struct mddev *mddev)
if (mddev->pers->start) { if (mddev->pers->start) {
set_bit(MD_RECOVERY_WAIT, &mddev->recovery); set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
md_wakeup_thread(mddev->thread);
ret = mddev->pers->start(mddev); ret = mddev->pers->start(mddev);
clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
@ -6258,7 +6348,6 @@ static int restart_array(struct mddev *mddev)
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
/* Kick recovery or resync if necessary */ /* Kick recovery or resync if necessary */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
return 0; return 0;
@ -6278,7 +6367,15 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0; mddev->persistent = 0;
mddev->level = LEVEL_NONE; mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0; mddev->clevel[0] = 0;
/*
* Don't clear MD_CLOSING, or mddev can be opened again.
* 'hold_active != 0' means mddev is still in the creation
* process and will be used later.
*/
if (mddev->hold_active)
mddev->flags = 0; mddev->flags = 0;
else
mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
mddev->sb_flags = 0; mddev->sb_flags = 0;
mddev->ro = MD_RDWR; mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0; mddev->metadata_type[0] = 0;
@ -6315,7 +6412,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev)
{ {
stop_sync_thread(mddev, true, false);
del_timer_sync(&mddev->safemode_timer); del_timer_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) { if (mddev->pers && mddev->pers->quiesce) {
@ -6340,6 +6436,8 @@ static void __md_stop_writes(struct mddev *mddev)
void md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev)
{ {
mddev_lock_nointr(mddev); mddev_lock_nointr(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev, true, false);
__md_stop_writes(mddev); __md_stop_writes(mddev);
mddev_unlock(mddev); mddev_unlock(mddev);
} }
@ -6353,8 +6451,10 @@ static void mddev_detach(struct mddev *mddev)
mddev->pers->quiesce(mddev, 0); mddev->pers->quiesce(mddev, 0);
} }
md_unregister_thread(mddev, &mddev->thread); md_unregister_thread(mddev, &mddev->thread);
if (mddev->queue)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ /* the unplug fn references 'conf' */
if (!mddev_is_dm(mddev))
blk_sync_queue(mddev->gendisk->queue);
} }
static void __md_stop(struct mddev *mddev) static void __md_stop(struct mddev *mddev)
@ -6391,7 +6491,8 @@ void md_stop(struct mddev *mddev)
EXPORT_SYMBOL_GPL(md_stop); EXPORT_SYMBOL_GPL(md_stop);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) /* ensure 'mddev->pers' exist before calling md_set_readonly() */
static int md_set_readonly(struct mddev *mddev)
{ {
int err = 0; int err = 0;
int did_freeze = 0; int did_freeze = 0;
@ -6402,7 +6503,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1; did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} }
stop_sync_thread(mddev, false, false); stop_sync_thread(mddev, false, false);
@ -6410,16 +6510,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev); mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev)); pr_warn("md: %s still in use.\n",mdname(mddev));
err = -EBUSY; err = -EBUSY;
goto out; goto out;
} }
if (mddev->pers) {
__md_stop_writes(mddev); __md_stop_writes(mddev);
if (mddev->ro == MD_RDONLY) { if (mddev->ro == MD_RDONLY) {
@ -6429,17 +6525,14 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
mddev->ro = MD_RDONLY; mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1); set_disk_ro(mddev->gendisk, 1);
}
out: out:
if ((mddev->pers && !err) || did_freeze) { if (!err || did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
} }
mutex_unlock(&mddev->open_mutex);
return err; return err;
} }
@ -6447,8 +6540,7 @@ out:
* 0 - completely stop and dis-assemble array * 0 - completely stop and dis-assemble array
* 2 - stop but do not disassemble array * 2 - stop but do not disassemble array
*/ */
static int do_md_stop(struct mddev *mddev, int mode, static int do_md_stop(struct mddev *mddev, int mode)
struct block_device *bdev)
{ {
struct gendisk *disk = mddev->gendisk; struct gendisk *disk = mddev->gendisk;
struct md_rdev *rdev; struct md_rdev *rdev;
@ -6457,22 +6549,16 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1; did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} }
stop_sync_thread(mddev, true, false); stop_sync_thread(mddev, true, false);
mutex_lock(&mddev->open_mutex); if (mddev->sysfs_active ||
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
mddev->sysfs_active ||
mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev)); pr_warn("md: %s still in use.\n",mdname(mddev));
mutex_unlock(&mddev->open_mutex);
if (did_freeze) { if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} }
return -EBUSY; return -EBUSY;
} }
@ -6491,13 +6577,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
set_capacity_and_notify(disk, 0); set_capacity_and_notify(disk, 0);
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1; mddev->changed = 1;
if (!md_is_rdwr(mddev)) if (!md_is_rdwr(mddev))
mddev->ro = MD_RDWR; mddev->ro = MD_RDWR;
} else }
mutex_unlock(&mddev->open_mutex);
/* /*
* Free resources if final stop * Free resources if final stop
*/ */
@ -6543,7 +6627,7 @@ static void autorun_array(struct mddev *mddev)
err = do_md_run(mddev); err = do_md_run(mddev);
if (err) { if (err) {
pr_warn("md: do_md_run() returned %d\n", err); pr_warn("md: do_md_run() returned %d\n", err);
do_md_stop(mddev, 0, NULL); do_md_stop(mddev, 0);
} }
} }
@ -7013,9 +7097,7 @@ kick_rdev:
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (mddev->thread) if (!mddev->thread)
md_wakeup_thread(mddev->thread);
else
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
md_new_event(); md_new_event();
@ -7090,14 +7172,13 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
if (!bdev_nowait(rdev->bdev)) { if (!bdev_nowait(rdev->bdev)) {
pr_info("%s: Disabling nowait because %pg does not support nowait\n", pr_info("%s: Disabling nowait because %pg does not support nowait\n",
mdname(mddev), rdev->bdev); mdname(mddev), rdev->bdev);
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
} }
/* /*
* Kick recovery, maybe this spare has to be added to the * Kick recovery, maybe this spare has to be added to the
* array immediately. * array immediately.
*/ */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_new_event(); md_new_event();
return 0; return 0;
@ -7311,8 +7392,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* of each device. If num_sectors is zero, we find the largest size * of each device. If num_sectors is zero, we find the largest size
* that fits. * that fits.
*/ */
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
mddev->sync_thread)
return -EBUSY; return -EBUSY;
if (!md_is_rdwr(mddev)) if (!md_is_rdwr(mddev))
return -EROFS; return -EROFS;
@ -7329,11 +7409,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
if (!rv) { if (!rv) {
if (mddev_is_clustered(mddev)) if (mddev_is_clustered(mddev))
md_cluster_ops->update_size(mddev, old_dev_sectors); md_cluster_ops->update_size(mddev, old_dev_sectors);
else if (mddev->queue) { else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk, set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors); mddev->array_sectors);
} }
}
return rv; return rv;
} }
@ -7349,8 +7428,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
if (raid_disks <= 0 || if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks)) (mddev->max_disks && raid_disks >= mddev->max_disks))
return -EINVAL; return -EINVAL;
if (mddev->sync_thread || if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
mddev->reshape_position != MaxSector) mddev->reshape_position != MaxSector)
return -EBUSY; return -EBUSY;
@ -7546,16 +7624,17 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0; return 0;
} }
static inline bool md_ioctl_valid(unsigned int cmd) static inline int md_ioctl_valid(unsigned int cmd)
{ {
switch (cmd) { switch (cmd) {
case ADD_NEW_DISK:
case GET_ARRAY_INFO: case GET_ARRAY_INFO:
case GET_BITMAP_FILE:
case GET_DISK_INFO: case GET_DISK_INFO:
case RAID_VERSION:
return 0;
case ADD_NEW_DISK:
case GET_BITMAP_FILE:
case HOT_ADD_DISK: case HOT_ADD_DISK:
case HOT_REMOVE_DISK: case HOT_REMOVE_DISK:
case RAID_VERSION:
case RESTART_ARRAY_RW: case RESTART_ARRAY_RW:
case RUN_ARRAY: case RUN_ARRAY:
case SET_ARRAY_INFO: case SET_ARRAY_INFO:
@ -7564,9 +7643,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
case STOP_ARRAY: case STOP_ARRAY:
case STOP_ARRAY_RO: case STOP_ARRAY_RO:
case CLUSTERED_DISK_NACK: case CLUSTERED_DISK_NACK:
return true; if (!capable(CAP_SYS_ADMIN))
return -EACCES;
return 0;
default: default:
return false; return -ENOTTY;
} }
} }
@ -7624,31 +7705,17 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
int err = 0; int err = 0;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL; struct mddev *mddev = NULL;
bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd)) err = md_ioctl_valid(cmd);
return -ENOTTY; if (err)
return err;
switch (cmd) {
case RAID_VERSION:
case GET_ARRAY_INFO:
case GET_DISK_INFO:
break;
default:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
}
/* /*
* Commands dealing with the RAID driver but not any * Commands dealing with the RAID driver but not any
* particular array: * particular array:
*/ */
switch (cmd) { if (cmd == RAID_VERSION)
case RAID_VERSION: return get_version(argp);
err = get_version(argp);
goto out;
default:;
}
/* /*
* Commands creating/starting a new array: * Commands creating/starting a new array:
@ -7656,35 +7723,23 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mddev = bdev->bd_disk->private_data; mddev = bdev->bd_disk->private_data;
if (!mddev) {
BUG();
goto out;
}
/* Some actions do not requires the mutex */ /* Some actions do not requires the mutex */
switch (cmd) { switch (cmd) {
case GET_ARRAY_INFO: case GET_ARRAY_INFO:
if (!mddev->raid_disks && !mddev->external) if (!mddev->raid_disks && !mddev->external)
err = -ENODEV; return -ENODEV;
else return get_array_info(mddev, argp);
err = get_array_info(mddev, argp);
goto out;
case GET_DISK_INFO: case GET_DISK_INFO:
if (!mddev->raid_disks && !mddev->external) if (!mddev->raid_disks && !mddev->external)
err = -ENODEV; return -ENODEV;
else return get_disk_info(mddev, argp);
err = get_disk_info(mddev, argp);
goto out;
case SET_DISK_FAULTY: case SET_DISK_FAULTY:
err = set_disk_faulty(mddev, new_decode_dev(arg)); return set_disk_faulty(mddev, new_decode_dev(arg));
goto out;
case GET_BITMAP_FILE: case GET_BITMAP_FILE:
err = get_bitmap_file(mddev, argp); return get_bitmap_file(mddev, argp);
goto out;
} }
if (cmd == HOT_REMOVE_DISK) if (cmd == HOT_REMOVE_DISK)
@ -7697,20 +7752,9 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
/* Need to flush page cache, and ensure no-one else opens /* Need to flush page cache, and ensure no-one else opens
* and writes * and writes
*/ */
mutex_lock(&mddev->open_mutex); err = mddev_set_closing_and_sync_blockdev(mddev, 1);
if (mddev->pers && atomic_read(&mddev->openers) > 1) { if (err)
mutex_unlock(&mddev->open_mutex); return err;
err = -EBUSY;
goto out;
}
if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
mutex_unlock(&mddev->open_mutex);
err = -EBUSY;
goto out;
}
did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
} }
if (!md_is_rdwr(mddev)) if (!md_is_rdwr(mddev))
@ -7751,11 +7795,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
goto unlock; goto unlock;
case STOP_ARRAY: case STOP_ARRAY:
err = do_md_stop(mddev, 0, bdev); err = do_md_stop(mddev, 0);
goto unlock; goto unlock;
case STOP_ARRAY_RO: case STOP_ARRAY_RO:
err = md_set_readonly(mddev, bdev); if (mddev->pers)
err = md_set_readonly(mddev);
goto unlock; goto unlock;
case HOT_REMOVE_DISK: case HOT_REMOVE_DISK:
@ -7850,7 +7895,7 @@ unlock:
mddev_unlock(mddev); mddev_unlock(mddev);
out: out:
if(did_set_md_closing) if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
clear_bit(MD_CLOSING, &mddev->flags); clear_bit(MD_CLOSING, &mddev->flags);
return err; return err;
} }
@ -8687,10 +8732,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
bio_chain(discard_bio, bio); bio_chain(discard_bio, bio);
bio_clone_blkg_association(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk) mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
trace_block_bio_remap(discard_bio,
disk_devt(mddev->gendisk),
bio->bi_iter.bi_sector);
submit_bio_noacct(discard_bio); submit_bio_noacct(discard_bio);
} }
EXPORT_SYMBOL_GPL(md_submit_discard_bio); EXPORT_SYMBOL_GPL(md_submit_discard_bio);
@ -8737,6 +8779,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
} }
EXPORT_SYMBOL_GPL(md_account_bio); EXPORT_SYMBOL_GPL(md_account_bio);
/*
 * Release a clone whose bi_private points at its struct md_io_clone.
 *
 * The clone's status is copied to the original bio only if the original
 * has no status yet, I/O accounting is ended when a start time was
 * recorded, and the references on the clone and on mddev->active_io are
 * dropped.  The original bio itself is not completed here.
 */
void md_free_cloned_bio(struct bio *bio)
{
	struct md_io_clone *clone = bio->bi_private;
	/* fetch everything needed from the clone before bio_put() below */
	struct mddev *mddev = clone->mddev;
	struct bio *parent = clone->orig_bio;

	if (!parent->bi_status && bio->bi_status)
		parent->bi_status = bio->bi_status;

	if (clone->start_time)
		bio_end_io_acct(parent, clone->start_time);

	bio_put(bio);
	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL_GPL(md_free_cloned_bio);
/* md_allow_write(mddev) /* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes * Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before * may proceed without blocking. It is important to call this before
@ -9170,7 +9229,7 @@ void md_do_sync(struct md_thread *thread)
mddev->delta_disks > 0 && mddev->delta_disks > 0 &&
mddev->pers->finish_reshape && mddev->pers->finish_reshape &&
mddev->pers->size && mddev->pers->size &&
mddev->queue) { !mddev_is_dm(mddev)) {
mddev_lock_nointr(mddev); mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev); mddev_unlock(mddev);
@ -9270,9 +9329,14 @@ static bool md_spares_need_change(struct mddev *mddev)
{ {
struct md_rdev *rdev; struct md_rdev *rdev;
rdev_for_each(rdev, mddev) rcu_read_lock();
if (rdev_removeable(rdev) || rdev_addable(rdev)) rdev_for_each_rcu(rdev, mddev) {
if (rdev_removeable(rdev) || rdev_addable(rdev)) {
rcu_read_unlock();
return true; return true;
}
}
rcu_read_unlock();
return false; return false;
} }

View File

@ -18,6 +18,7 @@
#include <linux/timer.h> #include <linux/timer.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <trace/events/block.h>
#include "md-cluster.h" #include "md-cluster.h"
#define MaxSector (~(sector_t)0) #define MaxSector (~(sector_t)0)
@ -207,6 +208,7 @@ enum flag_bits {
* check if there is collision between raid1 * check if there is collision between raid1
* serial bios. * serial bios.
*/ */
Nonrot, /* non-rotational device (SSD) */
}; };
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@ -222,6 +224,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
} }
return 0; return 0;
} }
/*
 * Convenience wrapper around is_badblock() for callers that only need its
 * return value for the range [@s, @s + @sectors) on @rdev and have no use
 * for the two values it reports through its pointer arguments.
 */
static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
				    int sectors)
{
	/* scratch targets for is_badblock()'s pointer arguments; discarded */
	sector_t ignored_start;
	int ignored_len;

	return is_badblock(rdev, s, sectors, &ignored_start, &ignored_len);
}
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new); int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
@ -468,7 +480,6 @@ struct mddev {
struct timer_list safemode_timer; struct timer_list safemode_timer;
struct percpu_ref writes_pending; struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */ int sync_checkers; /* # of threads checking writes_pending */
struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */ struct bitmap *bitmap; /* the bitmap for the device */
struct { struct {
@ -558,6 +569,37 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, /* remote node is running resync thread */
}; };
/* Values stored in mddev->ro. */
enum md_ro_state {
	MD_RDWR = 0,		/* read-write; what md_is_rdwr() tests for */
	MD_RDONLY,		/* read-only */
	MD_AUTO_READ,		/* NOTE(review): presumably the "read_auto" array state -- confirm */
	MD_MAX_STATE		/* number of states above */
};
/* True when @mddev is in the MD_RDWR state, i.e. mddev->ro == MD_RDWR. */
static inline bool md_is_rdwr(struct mddev *mddev)
{
	return mddev->ro == MD_RDWR;
}
/*
 * Report whether a reshape of @mddev has been, or is about to be,
 * interrupted.
 *
 * Returns false when no reshape position is recorded, true when a reshape
 * position exists but recovery is not running, and otherwise true only if
 * one of MD_RECOVERY_WAIT, MD_RECOVERY_INTR or MD_RECOVERY_FROZEN is set.
 */
static inline bool reshape_interrupted(struct mddev *mddev)
{
	/* no reshape has been started */
	if (mddev->reshape_position == MaxSector)
		return false;

	/* started, but nothing is running any more: interrupted */
	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return true;

	/* still running, but any of these flags means it will stop soon */
	return test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
	       test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
	       test_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
static inline int __must_check mddev_lock(struct mddev *mddev) static inline int __must_check mddev_lock(struct mddev *mddev)
{ {
return mutex_lock_interruptible(&mddev->reconfig_mutex); return mutex_lock_interruptible(&mddev->reconfig_mutex);
@ -617,6 +659,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev);
void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing. /* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete * 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour * 0 - return to normal behaviour
@ -750,6 +793,7 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size); struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio); void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@ -778,9 +822,12 @@ extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev); extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev);
extern void md_idle_sync_thread(struct mddev *mddev);
extern void md_frozen_sync_thread(struct mddev *mddev);
extern void md_unfrozen_sync_thread(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force); extern void md_update_sb(struct mddev *mddev, int force);
@ -821,7 +868,7 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
{ {
if (bio_op(bio) == REQ_OP_WRITE_ZEROES && if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0; mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
} }
static inline int mddev_suspend_and_lock(struct mddev *mddev) static inline int mddev_suspend_and_lock(struct mddev *mddev)
@ -860,7 +907,31 @@ void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev); int do_md_run(struct mddev *mddev);
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim);
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
extern const struct block_device_operations md_fops; extern const struct block_device_operations md_fops;
/*
* MD devices can be used undeneath by DM, in which case ->gendisk is NULL.
*/
static inline bool mddev_is_dm(struct mddev *mddev)
{
return !mddev->gendisk;
}
/*
 * Emit a block_bio_remap trace event for @bio, using the device number of
 * the array's gendisk and @sector as the original sector. Nothing is traced
 * when mddev_is_dm() reports that the array has no gendisk.
 */
static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
				     sector_t sector)
{
	if (mddev_is_dm(mddev))
		return;

	trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
}
/*
 * Add a blktrace message on the queue of the array's gendisk. The message is
 * dropped when mddev_is_dm() reports that the array has no gendisk, so the
 * ->gendisk dereference below is only reached when it is non-NULL.
 */
#define mddev_add_trace_msg(mddev, fmt, args...) \
do { \
if (!mddev_is_dm(mddev)) \
blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
} while (0)
#endif /* _MD_MD_H */ #endif /* _MD_MD_H */

View File

@ -379,6 +379,19 @@ static void raid0_free(struct mddev *mddev, void *priv)
free_conf(mddev, conf); free_conf(mddev, conf);
} }
/*
 * Compute the queue limits of a raid0 array and apply them to its queue.
 *
 * Starts from the generic stacking defaults, applies the chunk-size derived
 * values, lets mddev_stack_rdev_limits() fold in the member devices, and
 * returns the result of queue_limits_set().
 */
static int raid0_set_limits(struct mddev *mddev)
{
	struct queue_limits lim;

	blk_set_stacking_limits(&lim);

	/* I/O size hints: one chunk in bytes, and that times the disk count. */
	lim.io_min = mddev->chunk_sectors << 9;
	lim.io_opt = lim.io_min * mddev->raid_disks;

	/* Request size caps, both limited to one chunk. */
	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
	lim.max_hw_sectors = mddev->chunk_sectors;

	mddev_stack_rdev_limits(mddev, &lim);

	return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid0_run(struct mddev *mddev) static int raid0_run(struct mddev *mddev)
{ {
struct r0conf *conf; struct r0conf *conf;
@ -399,20 +412,10 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf; mddev->private = conf;
} }
conf = mddev->private; conf = mddev->private;
if (mddev->queue) { if (!mddev_is_dm(mddev)) {
struct md_rdev *rdev; ret = raid0_set_limits(mddev);
if (ret)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); goto out_free_conf;
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
} }
/* calculate array device size */ /* calculate array device size */
@ -426,8 +429,10 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev); ret = md_integrity_register(mddev);
if (ret) if (ret)
goto out_free_conf;
return 0;
out_free_conf:
free_conf(mddev, conf); free_conf(mddev, conf);
return ret; return ret;
} }
@ -578,10 +583,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
bio_set_dev(bio, tmp_dev->bdev); bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start + bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset; tmp_dev->data_offset;
mddev_trace_remap(mddev, bio, bio_sector);
if (mddev->gendisk)
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_write_zeroes(mddev, bio); mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio); submit_bio_noacct(bio);
} }

View File

@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
return false; return false;
} }
/**
 * raid1_check_read_range() - check a read range against the bad block list
 *			      and report how much of it can be read.
 * @rdev: the rdev to read from;
 * @this_sector: first sector of the read;
 * @len: in: requested length in sectors; out: see case 3b below.
 *
 * Helper for read_balance().
 *
 * Return:
 * 1) @len if the range does not overlap any bad blocks;
 * 2) 0 if the whole range is covered by bad blocks;
 * 3) if the range is only partially bad:
 *    a) the bad blocks start after @this_sector: the length of the leading
 *       good region;
 *    b) the bad blocks start at or before @this_sector: 0, and @len is
 *       updated to the number of sectors to skip before the good blocks
 *       begin.
 */
static inline int raid1_check_read_range(struct md_rdev *rdev,
					 sector_t this_sector, int *len)
{
	sector_t first_bad;
	int bad_sectors;
	sector_t read_end;
	sector_t bad_end;

	if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) {
		/* Case 1: nothing bad in the range. */
		return *len;
	}

	if (first_bad > this_sector) {
		/* Case 3a: good sectors lead the range, report how many. */
		return first_bad - this_sector;
	}

	read_end = this_sector + *len;
	bad_end = first_bad + bad_sectors;

	/*
	 * Case 3b: the bad region begins at or before our start but ends
	 * inside the range. Nothing is readable at @this_sector, so tell the
	 * caller how far away the good sectors are. When the bad region
	 * reaches the end of the range (case 2), @len is left untouched.
	 */
	if (read_end > bad_end)
		*len = bad_end - this_sector;

	return 0;
}
/*
 * Decide whether a read must simply take the first usable rdev.
 *
 * Read balancing is allowed across the whole device when no resync is
 * running (recovery is fine), or for ranges below the resync window. A
 * range that reaches past ->recovery_cp, or that a clustered array reports
 * as being resynced, must use the first readable disk instead.
 */
static inline bool raid1_should_read_first(struct mddev *mddev,
					   sector_t this_sector, int len)
{
	const sector_t end_sector = this_sector + len;

	return mddev->recovery_cp < end_sector ||
	       (mddev_is_clustered(mddev) &&
		md_cluster_ops->area_resyncing(mddev, READ, this_sector,
					       end_sector));
}

View File

@ -46,9 +46,6 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
#define RAID_1_10_NAME "raid1" #define RAID_1_10_NAME "raid1"
#include "raid1-10.c" #include "raid1-10.c"
@ -498,9 +495,6 @@ static void raid1_end_write_request(struct bio *bio)
* to user-side. So if something waits for IO, then it * to user-side. So if something waits for IO, then it
* will wait for the 'master' bio. * will wait for the 'master' bio.
*/ */
sector_t first_bad;
int bad_sectors;
r1_bio->bios[mirror] = NULL; r1_bio->bios[mirror] = NULL;
to_put = bio; to_put = bio;
/* /*
@ -516,8 +510,8 @@ static void raid1_end_write_request(struct bio *bio)
set_bit(R1BIO_Uptodate, &r1_bio->state); set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */ /* Maybe we can clear some bad blocks. */
if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
&first_bad, &bad_sectors) && !discard_error) { !discard_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD; r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state); set_bit(R1BIO_MadeGood, &r1_bio->state);
} }
@ -582,211 +576,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
return len; return len;
} }
/* static void update_read_sectors(struct r1conf *conf, int disk,
* This routine returns the disk from which the requested read should sector_t this_sector, int len)
* be done. There is a per-array 'next expected sequential IO' sector
* number - if this matches on the next IO then we use the last disk.
* There is also a per-disk 'last know head position' sector that is
* maintained from IRQ contexts, both the normal and the resync IO
* completion handlers update this position correctly. If there is no
* perfect sequential match then we pick the disk whose head is closest.
*
* If there are 2 mirrors in the same 2 devices, performance degrades
* because position is mirror, not device based.
*
* The rdev for the device selected will have nr_pending incremented.
*/
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
{ {
const sector_t this_sector = r1_bio->sector; struct raid1_info *info = &conf->mirrors[disk];
int sectors;
int best_good_sectors; atomic_inc(&info->rdev->nr_pending);
int best_disk, best_dist_disk, best_pending_disk; if (info->next_seq_sect != this_sector)
int has_nonrot_disk; info->seq_start = this_sector;
info->next_seq_sect = this_sector + len;
}
static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int len = r1_bio->sectors;
int disk; int disk;
sector_t best_dist;
unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
int choose_next_idle;
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
retry:
sectors = r1_bio->sectors;
best_disk = -1;
best_dist_disk = -1;
best_dist = MaxSector;
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
has_nonrot_disk = 0;
choose_next_idle = 0;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
choose_first = 1;
else
choose_first = 0;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist; struct md_rdev *rdev;
sector_t first_bad; int read_len;
int bad_sectors;
unsigned int pending; if (r1_bio->bios[disk] == IO_BLOCKED)
bool nonrot; continue;
rdev = conf->mirrors[disk].rdev; rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED if (!rdev || test_bit(Faulty, &rdev->flags))
|| rdev == NULL
|| test_bit(Faulty, &rdev->flags))
continue; continue;
if (!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < this_sector + sectors)
continue;
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
if (best_dist_disk < 0) {
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (first_bad <= this_sector)
/* Cannot use this */
continue;
best_good_sectors = first_bad - this_sector;
} else
best_good_sectors = sectors;
best_dist_disk = disk;
best_pending_disk = disk;
}
continue;
}
/* This is a reasonable device to use. It might
* even be best.
*/
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (best_dist < MaxSector)
/* already have a better device */
continue;
if (first_bad <= this_sector) {
/* cannot read here. If this is the 'primary'
* device, then we must not read beyond
* bad_sectors from another device..
*/
bad_sectors -= (this_sector - first_bad);
if (choose_first && sectors > bad_sectors)
sectors = bad_sectors;
if (best_good_sectors > sectors)
best_good_sectors = sectors;
} else { /* choose the first disk even if it has some bad blocks. */
sector_t good_sectors = first_bad - this_sector; read_len = raid1_check_read_range(rdev, this_sector, &len);
if (good_sectors > best_good_sectors) { if (read_len > 0) {
best_good_sectors = good_sectors; update_read_sectors(conf, disk, this_sector, read_len);
*max_sectors = read_len;
return disk;
}
}
return -1;
}
static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int best_disk = -1;
int best_len = 0;
int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
int len;
int read_len;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
test_bit(WriteMostly, &rdev->flags))
continue;
/* keep track of the disk with the most readable sectors. */
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len > best_len) {
best_disk = disk; best_disk = disk;
best_len = read_len;
} }
if (choose_first)
break;
}
continue;
} else {
if ((sectors > best_good_sectors) && (best_disk >= 0))
best_disk = -1;
best_good_sectors = sectors;
} }
if (best_disk >= 0) if (best_disk != -1) {
*max_sectors = best_len;
update_read_sectors(conf, best_disk, this_sector, best_len);
}
return best_disk;
}
/*
 * Last resort for read_balance(): pick a write-mostly ("slow") rdev.
 *
 * Only slots that are not IO_BLOCKED for @r1_bio and whose rdev is present,
 * not Faulty and flagged WriteMostly are considered. The first such rdev
 * that can serve the whole request is taken immediately; otherwise the rdev
 * with the longest readable prefix wins.
 *
 * Returns the chosen mirror index, or -1 if no slow rdev can serve even one
 * sector. On success *@max_sectors holds the number of sectors that can be
 * read from the chosen disk and update_read_sectors() has been called for
 * it. On failure *@max_sectors is left untouched.
 */
static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
			    int *max_sectors)
{
	sector_t this_sector = r1_bio->sector;
	int bb_disk = -1;
	int bb_read_len = 0;
	int disk;

	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
		struct md_rdev *rdev;
		int len;
		int read_len;

		if (r1_bio->bios[disk] == IO_BLOCKED)
			continue;

		rdev = conf->mirrors[disk].rdev;
		if (!rdev || test_bit(Faulty, &rdev->flags) ||
		    !test_bit(WriteMostly, &rdev->flags))
			continue;

		/* there are no bad blocks, we can use this disk */
		len = r1_bio->sectors;
		read_len = raid1_check_read_range(rdev, this_sector, &len);
		if (read_len == r1_bio->sectors) {
			/*
			 * Report the readable length on this path too, as
			 * every other selection path does; otherwise the
			 * caller would act on an unset *max_sectors.
			 */
			*max_sectors = read_len;
			update_read_sectors(conf, disk, this_sector, read_len);
			return disk;
		}

		/*
		 * there are partial bad blocks, choose the rdev with largest
		 * read length.
		 */
		if (read_len > bb_read_len) {
			bb_disk = disk;
			bb_read_len = read_len;
		}
	}

	if (bb_disk != -1) {
		*max_sectors = bb_read_len;
		update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
	}

	return bb_disk;
}
static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
{
/* TODO: address issues with this check and concurrency. */
return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
conf->mirrors[disk].head_position == r1_bio->sector;
}
/*
 * For a sequential stream on a Nonrot disk: once the stream has grown past
 * the device's optimal I/O size, the caller should look for an idle disk
 * and prefer it. Returns true when that point has been reached.
 */
static bool should_choose_next(struct r1conf *conf, int disk)
{
	struct raid1_info *mirror = &conf->mirrors[disk];
	int opt_iosize;

	/* Only disks flagged Nonrot are considered for switching. */
	if (!test_bit(Nonrot, &mirror->rdev->flags))
		return false;

	/* Optimal I/O size in sectors; no hint means no switching. */
	opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
	if (opt_iosize <= 0)
		return false;

	/* No sequential stream has been recorded on this mirror yet. */
	if (mirror->seq_start == MaxSector)
		return false;

	/* Guard the subtraction below against wrapping. */
	if (mirror->next_seq_sect <= opt_iosize)
		return false;

	return mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
}
/*
 * Report whether @rdev can serve the whole of @r1_bio as a first-choice read
 * target. All of the following must hold, checked in this order:
 *  - the rdev exists and is not Faulty;
 *  - it is In_sync, or its recovery has already passed the end of the read;
 *  - it is not WriteMostly (slow disks are only used when unavoidable);
 *  - the range has no bad blocks (the IO is only split when unavoidable).
 */
static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
{
	return rdev &&
	       !test_bit(Faulty, &rdev->flags) &&
	       (test_bit(In_sync, &rdev->flags) ||
		rdev->recovery_offset >= r1_bio->sector + r1_bio->sectors) &&
	       !test_bit(WriteMostly, &rdev->flags) &&
	       !rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors);
}
/*
 * Running state while choose_best_rdev() scans the mirrors.
 *
 * The *_disk members hold a mirror index, or -1 while no candidate has been
 * recorded.
 */
struct read_balance_ctl {
	/* smallest distance seen between the read and a head position */
	sector_t closest_dist;
	int closest_dist_disk;
	/*
	 * Lowest pending count seen. Unsigned so that it matches the
	 * UINT_MAX initialiser and the unsigned counts it is compared
	 * against.
	 */
	unsigned int min_pending;
	int min_pending_disk;
	/* sequential disk kept as a fallback if no idle disk is found */
	int sequential_disk;
	/* number of readable disks seen so far */
	int readable_disks;
};
static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
{
int disk;
struct read_balance_ctl ctl = {
.closest_dist_disk = -1,
.closest_dist = MaxSector,
.min_pending_disk = -1,
.min_pending = UINT_MAX,
.sequential_disk = -1,
};
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
sector_t dist;
unsigned int pending;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev;
if (!rdev_readable(rdev, r1_bio))
continue;
/* At least two disks to choose from so failfast is OK */ /* At least two disks to choose from so failfast is OK */
if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state); set_bit(R1BIO_FailFast, &r1_bio->state);
nonrot = bdev_nonrot(rdev->bdev);
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending); pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position); dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
if (choose_first) {
best_disk = disk;
break;
}
/* Don't change to another disk for sequential reads */ /* Don't change to another disk for sequential reads */
if (conf->mirrors[disk].next_seq_sect == this_sector if (is_sequential(conf, disk, r1_bio)) {
|| dist == 0) { if (!should_choose_next(conf, disk))
int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; return disk;
struct raid1_info *mirror = &conf->mirrors[disk];
best_disk = disk;
/* /*
* If buffered sequential IO size exceeds optimal * Add 'pending' to avoid choosing this disk if
* iosize, check if there is idle disk. If yes, choose * there is other idle disk.
* the idle disk. read_balance could already choose an
* idle disk before noticing it's a sequential IO in
* this disk. This doesn't matter because this disk
* will idle, next time it will be utilized after the
* first disk has IO size exceeds optimal iosize. In
* this way, iosize of the first disk will be optimal
* iosize at least. iosize of the second disk might be
* small, but not a big deal since when the second disk
* starts IO, the first disk is likely still busy.
*/ */
if (nonrot && opt_iosize > 0 && pending++;
mirror->seq_start != MaxSector && /*
mirror->next_seq_sect > opt_iosize && * If there is no other idle disk, this disk
mirror->next_seq_sect - opt_iosize >= * will be chosen.
mirror->seq_start) { */
choose_next_idle = 1; ctl.sequential_disk = disk;
continue;
}
break;
} }
if (choose_next_idle) if (ctl.min_pending > pending) {
continue; ctl.min_pending = pending;
ctl.min_pending_disk = disk;
if (min_pending > pending) {
min_pending = pending;
best_pending_disk = disk;
} }
if (dist < best_dist) { if (ctl.closest_dist > dist) {
best_dist = dist; ctl.closest_dist = dist;
best_dist_disk = disk; ctl.closest_dist_disk = disk;
} }
} }
/*
* sequential IO size exceeds optimal iosize, however, there is no other
* idle disk, so choose the sequential disk.
*/
if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
return ctl.sequential_disk;
/* /*
* If all disks are rotational, choose the closest disk. If any disk is * If all disks are rotational, choose the closest disk. If any disk is
* non-rotational, choose the disk with less pending request even the * non-rotational, choose the disk with less pending request even the
* disk is rotational, which might/might not be optimal for raids with * disk is rotational, which might/might not be optimal for raids with
* mixed ratation/non-rotational disks depending on workload. * mixed ratation/non-rotational disks depending on workload.
*/ */
if (best_disk == -1) { if (ctl.min_pending_disk != -1 &&
if (has_nonrot_disk || min_pending == 0) (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
best_disk = best_pending_disk; return ctl.min_pending_disk;
else else
best_disk = best_dist_disk; return ctl.closest_dist_disk;
} }
if (best_disk >= 0) { /*
rdev = conf->mirrors[best_disk].rdev; * This routine returns the disk from which the requested read should be done.
if (!rdev) *
goto retry; * 1) If resync is in progress, find the first usable disk and use it even if it
atomic_inc(&rdev->nr_pending); * has some bad blocks.
sectors = best_good_sectors; *
* 2) Now that there is no resync, loop through all disks and skipping slow
* disks and disks with bad blocks for now. Only pay attention to key disk
* choice.
*
* 3) If we've made it this far, now look for disks with bad blocks and choose
* the one with most number of sectors.
*
* 4) If we are all the way at the end, we have no choice but to use a disk even
* if it is write mostly.
*
* The rdev for the device selected will have nr_pending incremented.
*/
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
int disk;
if (conf->mirrors[best_disk].next_seq_sect != this_sector) clear_bit(R1BIO_FailFast, &r1_bio->state);
conf->mirrors[best_disk].seq_start = this_sector;
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; if (raid1_should_read_first(conf->mddev, r1_bio->sector,
r1_bio->sectors))
return choose_first_rdev(conf, r1_bio, max_sectors);
disk = choose_best_rdev(conf, r1_bio);
if (disk >= 0) {
*max_sectors = r1_bio->sectors;
update_read_sectors(conf, disk, r1_bio->sector,
r1_bio->sectors);
return disk;
} }
*max_sectors = sectors;
return best_disk; /*
* If we are here it means we didn't find a perfectly good disk so
* now spend a bit more time trying to find one with the most good
* sectors.
*/
disk = choose_bb_rdev(conf, r1_bio, max_sectors);
if (disk >= 0)
return disk;
return choose_slow_rdev(conf, r1_bio, max_sectors);
} }
static void wake_up_barrier(struct r1conf *conf) static void wake_up_barrier(struct r1conf *conf)
@ -1098,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra)
*/ */
spin_lock_irq(&conf->resync_lock); spin_lock_irq(&conf->resync_lock);
conf->array_frozen = 1; conf->array_frozen = 1;
raid1_log(conf->mddev, "wait freeze"); mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
wait_event_lock_irq_cmd( wait_event_lock_irq_cmd(
conf->wait_barrier, conf->wait_barrier,
get_unqueued_pending(conf) == extra, get_unqueued_pending(conf) == extra,
@ -1287,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* Reading from a write-mostly device must take care not to * Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind' * over-take any writes that are 'behind'
*/ */
raid1_log(mddev, "wait behind writes"); mddev_add_trace_msg(mddev, "raid1 wait behind writes");
wait_event(bitmap->behind_wait, wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0); atomic_read(&bitmap->behind_writes) == 0);
} }
@ -1320,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R1BIO_FailFast, &r1_bio->state)) test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r1_bio; read_bio->bi_private = r1_bio;
mddev_trace_remap(mddev, read_bio, r1_bio->sector);
if (mddev->gendisk)
trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);
submit_bio_noacct(read_bio); submit_bio_noacct(read_bio);
} }
@ -1474,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio); bio_wouldblock_error(bio);
return; return;
} }
raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev); md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, bio->bi_iter.bi_sector, false); wait_barrier(conf, bio->bi_iter.bi_sector, false);
goto retry_write; goto retry_write;
@ -1557,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_private = r1_bio; mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining); atomic_inc(&r1_bio->remaining);
mddev_trace_remap(mddev, mbio, r1_bio->sector);
if (mddev->gendisk)
trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/ /* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev; mbio->bi_bdev = (void *)rdev;
if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
@ -1760,6 +1849,52 @@ static int raid1_spare_active(struct mddev *mddev)
return count; return count;
} }
/*
 * Install @rdev into mirror slot @disk of @conf; with @replacement set, the
 * matching slot in the second half of the mirrors array is used instead.
 *
 * Returns false, changing nothing, if the slot is already occupied. On
 * success the rdev is tagged Nonrot when its bdev is non-rotational (and
 * conf->nonrot_disks is bumped), ->raid_disk is set to @disk, the slot's
 * head position and sequential-start tracking are reset, and the rdev
 * pointer is stored last.
 */
static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
			   bool replacement)
{
	int slot = replacement ? disk + conf->raid_disks : disk;
	struct raid1_info *info = &conf->mirrors[slot];

	if (info->rdev != NULL)
		return false;

	if (bdev_nonrot(rdev->bdev)) {
		set_bit(Nonrot, &rdev->flags);
		WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
	}

	rdev->raid_disk = disk;

	info->seq_start = MaxSector;
	info->head_position = 0;
	WRITE_ONCE(info->rdev, rdev);

	return true;
}
static bool raid1_remove_conf(struct r1conf *conf, int disk)
{
struct raid1_info *info = conf->mirrors + disk;
struct md_rdev *rdev = info->rdev;
if (!rdev || test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending))
return false;
/* Only remove non-faulty devices if recovery is not possible. */
if (!test_bit(Faulty, &rdev->flags) &&
rdev->mddev->recovery_disabled != conf->recovery_disabled &&
rdev->mddev->degraded < conf->raid_disks)
return false;
if (test_and_clear_bit(Nonrot, &rdev->flags))
WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
WRITE_ONCE(info->rdev, NULL);
return true;
}
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
@ -1791,19 +1926,16 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
for (mirror = first; mirror <= last; mirror++) { for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors + mirror; p = conf->mirrors + mirror;
if (!p->rdev) { if (!p->rdev) {
if (mddev->gendisk) err = mddev_stack_new_rdev(mddev, rdev);
disk_stack_limits(mddev->gendisk, rdev->bdev, if (err)
rdev->data_offset << 9); return err;
p->head_position = 0; raid1_add_conf(conf, rdev, mirror, false);
rdev->raid_disk = mirror;
err = 0;
/* As all devices are equivalent, we don't need a full recovery /* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array * if this was recently any drive of the array
*/ */
if (rdev->saved_raid_disk < 0) if (rdev->saved_raid_disk < 0)
conf->fullsync = 1; conf->fullsync = 1;
WRITE_ONCE(p->rdev, rdev);
break; break;
} }
if (test_bit(WantReplacement, &p->rdev->flags) && if (test_bit(WantReplacement, &p->rdev->flags) &&
@ -1813,13 +1945,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) { if (err && repl_slot >= 0) {
/* Add this device as a replacement */ /* Add this device as a replacement */
p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags); set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot; raid1_add_conf(conf, rdev, repl_slot, true);
err = 0; err = 0;
conf->fullsync = 1; conf->fullsync = 1;
WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
} }
print_conf(conf); print_conf(conf);
@ -1836,27 +1966,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks)) if (unlikely(number >= conf->raid_disks))
goto abort; goto abort;
if (rdev != p->rdev) if (rdev != p->rdev) {
p = conf->mirrors + conf->raid_disks + number; number += conf->raid_disks;
p = conf->mirrors + number;
}
print_conf(conf); print_conf(conf);
if (rdev == p->rdev) { if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) || if (!raid1_remove_conf(conf, number)) {
atomic_read(&rdev->nr_pending)) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
/* Only remove non-faulty devices if recovery
* is not possible. if (number < conf->raid_disks &&
*/ conf->mirrors[conf->raid_disks + number].rdev) {
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
mddev->degraded < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
WRITE_ONCE(p->rdev, NULL);
if (conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced. /* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before * Move down the replacement. We drain all IO before
* doing this to avoid confusion. * doing this to avoid confusion.
@ -1944,8 +2067,6 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = get_resync_r1bio(bio); struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev; struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
sector_t first_bad;
int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) { if (!uptodate) {
@ -1955,14 +2076,11 @@ static void end_sync_write(struct bio *bio)
set_bit(MD_RECOVERY_NEEDED, & set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery); mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state); set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
&first_bad, &bad_sectors) && !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
!is_badblock(conf->mirrors[r1_bio->read_disk].rdev, r1_bio->sector, r1_bio->sectors)) {
r1_bio->sector,
r1_bio->sectors,
&first_bad, &bad_sectors)
)
set_bit(R1BIO_MadeGood, &r1_bio->state); set_bit(R1BIO_MadeGood, &r1_bio->state);
}
put_sync_write_buf(r1_bio, uptodate); put_sync_write_buf(r1_bio, uptodate);
} }
@ -2279,16 +2397,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
s = PAGE_SIZE >> 9; s = PAGE_SIZE >> 9;
do { do {
sector_t first_bad;
int bad_sectors;
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
(test_bit(In_sync, &rdev->flags) || (test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) && (!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) && rdev->recovery_offset >= sect + s)) &&
is_badblock(rdev, sect, s, rdev_has_badblock(rdev, sect, s) == 0) {
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (sync_page_io(rdev, sect, s<<9, if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false)) conf->tmppage, REQ_OP_READ, false))
@ -3006,23 +3120,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL; err = -EINVAL;
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk; int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
if (test_bit(Replacement, &rdev->flags))
disk = conf->mirrors + mddev->raid_disks + disk_idx;
else
disk = conf->mirrors + disk_idx;
if (disk->rdev) if (disk_idx >= conf->raid_disks || disk_idx < 0)
continue;
if (!raid1_add_conf(conf, rdev, disk_idx,
test_bit(Replacement, &rdev->flags)))
goto abort; goto abort;
disk->rdev = rdev;
disk->head_position = 0;
disk->seq_start = MaxSector;
} }
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev; conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list); INIT_LIST_HEAD(&conf->bio_end_io_list);
@ -3086,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err); return ERR_PTR(err);
} }
/*
 * Compute the queue limits of a raid1 array and apply them to its queue.
 * Returns the result of queue_limits_set().
 */
static int raid1_set_limits(struct mddev *mddev)
{
struct queue_limits lim;
/* Start from the generic stacking defaults. */
blk_set_stacking_limits(&lim);
/* max_write_zeroes_sectors is forced to 0 before stacking the members. */
lim.max_write_zeroes_sectors = 0;
/* Fold the member devices' limits into lim. */
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static void raid1_free(struct mddev *mddev, void *priv); static void raid1_free(struct mddev *mddev, void *priv);
static int raid1_run(struct mddev *mddev) static int raid1_run(struct mddev *mddev)
{ {
struct r1conf *conf; struct r1conf *conf;
int i; int i;
struct md_rdev *rdev;
int ret; int ret;
if (mddev->level != 1) { if (mddev->level != 1) {
@ -3118,14 +3235,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf)) if (IS_ERR(conf))
return PTR_ERR(conf); return PTR_ERR(conf);
if (mddev->queue) if (!mddev_is_dm(mddev)) {
blk_queue_max_write_zeroes_sectors(mddev->queue, 0); ret = raid1_set_limits(mddev);
if (ret)
rdev_for_each(rdev, mddev) { goto abort;
if (!mddev->gendisk)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
} }
mddev->degraded = 0; mddev->degraded = 0;

View File

@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements. * allow for replacements.
*/ */
int raid_disks; int raid_disks;
int nonrot_disks;
spinlock_t device_lock; spinlock_t device_lock;

View File

@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio); static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf); static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
#include "raid1-10.c" #include "raid1-10.c"
#define NULL_CMD #define NULL_CMD
@ -518,11 +515,7 @@ static void raid10_end_write_request(struct bio *bio)
* The 'master' represents the composite IO operation to * The 'master' represents the composite IO operation to
* user-side. So if something waits for IO, then it will * user-side. So if something waits for IO, then it will
* wait for the 'master' bio. * wait for the 'master' bio.
*/ *
sector_t first_bad;
int bad_sectors;
/*
* Do not set R10BIO_Uptodate if the current device is * Do not set R10BIO_Uptodate if the current device is
* rebuilding or Faulty. This is because we cannot use * rebuilding or Faulty. This is because we cannot use
* such device for properly reading the data back (we could * such device for properly reading the data back (we could
@ -535,10 +528,9 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state); set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */ /* Maybe we can clear some bad blocks. */
if (is_badblock(rdev, if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->devs[slot].addr, r10_bio->sectors) &&
r10_bio->sectors, !discard_error) {
&first_bad, &bad_sectors) && !discard_error) {
bio_put(bio); bio_put(bio);
if (repl) if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@ -753,17 +745,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
best_good_sectors = 0; best_good_sectors = 0;
do_balance = 1; do_balance = 1;
clear_bit(R10BIO_FailFast, &r10_bio->state); clear_bit(R10BIO_FailFast, &r10_bio->state);
/*
* Check if we can balance. We can balance on the whole if (raid1_should_read_first(conf->mddev, this_sector, sectors))
* device if no resync is going on (recovery is ok), or below
* the resync window. We take the first readable disk when
* above the resync window.
*/
if ((conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
do_balance = 0; do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) { for (slot = 0; slot < conf->copies ; slot++) {
@ -1033,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
ret = false; ret = false;
} else { } else {
conf->nr_waiting++; conf->nr_waiting++;
raid10_log(conf->mddev, "wait barrier"); mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
wait_event_barrier(conf, stop_waiting_barrier(conf)); wait_event_barrier(conf, stop_waiting_barrier(conf));
conf->nr_waiting--; conf->nr_waiting--;
} }
@ -1152,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
bio_wouldblock_error(bio); bio_wouldblock_error(bio);
return false; return false;
} }
raid10_log(conf->mddev, "wait reshape"); mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier, wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector || conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector + conf->reshape_progress >= bio->bi_iter.bi_sector +
@ -1249,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R10BIO_FailFast, &r10_bio->state)) test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r10_bio; read_bio->bi_private = r10_bio;
mddev_trace_remap(mddev, read_bio, r10_bio->sector);
if (mddev->gendisk)
trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
submit_bio_noacct(read_bio); submit_bio_noacct(read_bio);
return; return;
} }
@ -1288,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
&& enough(conf, devnum)) && enough(conf, devnum))
mbio->bi_opf |= MD_FAILFAST; mbio->bi_opf |= MD_FAILFAST;
mbio->bi_private = r10_bio; mbio->bi_private = r10_bio;
mddev_trace_remap(mddev, mbio, r10_bio->sector);
if (conf->mddev->gendisk)
trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
r10_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/ /* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev; mbio->bi_bdev = (void *)rdev;
@ -1330,10 +1307,7 @@ retry_wait:
} }
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr; sector_t dev_sector = r10_bio->devs[i].addr;
int bad_sectors;
int is_bad;
/* /*
* Discard request doesn't care the write result * Discard request doesn't care the write result
@ -1342,9 +1316,8 @@ retry_wait:
if (!r10_bio->sectors) if (!r10_bio->sectors)
continue; continue;
is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, if (rdev_has_badblock(rdev, dev_sector,
&first_bad, &bad_sectors); r10_bio->sectors) < 0) {
if (is_bad < 0) {
/* /*
* Mustn't write here until the bad block * Mustn't write here until the bad block
* is acknowledged * is acknowledged
@ -1360,7 +1333,8 @@ retry_wait:
if (unlikely(blocked_rdev)) { if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */ /* Have to wait for this device to get unblocked, then retry */
allow_barrier(conf); allow_barrier(conf);
raid10_log(conf->mddev, "%s wait rdev %d blocked", mddev_add_trace_msg(conf->mddev,
"raid10 %s wait rdev %d blocked",
__func__, blocked_rdev->raid_disk); __func__, blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev); md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, false); wait_barrier(conf, false);
@ -1416,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio); bio_wouldblock_error(bio);
return; return;
} }
raid10_log(conf->mddev, "wait reshape metadata"); mddev_add_trace_msg(conf->mddev,
"raid10 wait reshape metadata");
wait_event(mddev->sb_wait, wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
@ -2131,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
continue; continue;
} }
if (mddev->gendisk) err = mddev_stack_new_rdev(mddev, rdev);
disk_stack_limits(mddev->gendisk, rdev->bdev, if (err)
rdev->data_offset << 9); return err;
p->head_position = 0; p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1; p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror; rdev->raid_disk = mirror;
@ -2150,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags); set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot; rdev->raid_disk = repl_slot;
err = 0; err = mddev_stack_new_rdev(mddev, rdev);
if (mddev->gendisk) if (err)
disk_stack_limits(mddev->gendisk, rdev->bdev, return err;
rdev->data_offset << 9);
conf->fullsync = 1; conf->fullsync = 1;
WRITE_ONCE(p->replacement, rdev); WRITE_ONCE(p->replacement, rdev);
} }
@ -2290,8 +2263,6 @@ static void end_sync_write(struct bio *bio)
struct mddev *mddev = r10_bio->mddev; struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
int d; int d;
sector_t first_bad;
int bad_sectors;
int slot; int slot;
int repl; int repl;
struct md_rdev *rdev = NULL; struct md_rdev *rdev = NULL;
@ -2312,11 +2283,10 @@ static void end_sync_write(struct bio *bio)
&rdev->mddev->recovery); &rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state); set_bit(R10BIO_WriteError, &r10_bio->state);
} }
} else if (is_badblock(rdev, } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->devs[slot].addr, r10_bio->sectors)) {
r10_bio->sectors,
&first_bad, &bad_sectors))
set_bit(R10BIO_MadeGood, &r10_bio->state); set_bit(R10BIO_MadeGood, &r10_bio->state);
}
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
@ -2597,11 +2567,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op) int sectors, struct page *page, enum req_op op)
{ {
sector_t first_bad; if (rdev_has_badblock(rdev, sector, sectors) &&
int bad_sectors; (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
&& (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1; return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
/* success */ /* success */
@ -2658,16 +2625,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9; s = PAGE_SIZE >> 9;
do { do {
sector_t first_bad;
int bad_sectors;
d = r10_bio->devs[sl].devnum; d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
test_bit(In_sync, &rdev->flags) && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, rdev_has_badblock(rdev,
&first_bad, &bad_sectors) == 0) { r10_bio->devs[sl].addr + sect,
s) == 0) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev, success = sync_page_io(rdev,
r10_bio->devs[sl].addr + r10_bio->devs[sl].addr +
@ -4002,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err); return ERR_PTR(err);
} }
static void raid10_set_io_opt(struct r10conf *conf) static unsigned int raid10_nr_stripes(struct r10conf *conf)
{ {
int raid_disks = conf->geo.raid_disks; unsigned int raid_disks = conf->geo.raid_disks;
if (!(conf->geo.raid_disks % conf->geo.near_copies)) if (conf->geo.raid_disks % conf->geo.near_copies)
raid_disks /= conf->geo.near_copies; return raid_disks;
blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * return raid_disks / conf->geo.near_copies;
raid_disks); }
static int raid10_set_queue_limits(struct mddev *mddev)
{
struct r10conf *conf = mddev->private;
struct queue_limits lim;
blk_set_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
} }
static int raid10_run(struct mddev *mddev) static int raid10_run(struct mddev *mddev)
@ -4021,6 +3998,7 @@ static int raid10_run(struct mddev *mddev)
sector_t size; sector_t size;
sector_t min_offset_diff = 0; sector_t min_offset_diff = 0;
int first = 1; int first = 1;
int ret = -EIO;
if (mddev->private == NULL) { if (mddev->private == NULL) {
conf = setup_conf(mddev); conf = setup_conf(mddev);
@ -4047,12 +4025,6 @@ static int raid10_run(struct mddev *mddev)
} }
} }
if (mddev->queue) {
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
raid10_set_io_opt(conf);
}
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
long long diff; long long diff;
@ -4081,14 +4053,16 @@ static int raid10_run(struct mddev *mddev)
if (first || diff < min_offset_diff) if (first || diff < min_offset_diff)
min_offset_diff = diff; min_offset_diff = diff;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk->head_position = 0; disk->head_position = 0;
first = 0; first = 0;
} }
if (!mddev_is_dm(conf->mddev)) {
ret = raid10_set_queue_limits(mddev);
if (ret)
goto out_free_conf;
}
/* need to check that every block has at least one working mirror */ /* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) { if (!enough(conf, -1)) {
pr_err("md/raid10:%s: not enough operational mirrors.\n", pr_err("md/raid10:%s: not enough operational mirrors.\n",
@ -4185,7 +4159,7 @@ out_free_conf:
raid10_free_conf(conf); raid10_free_conf(conf);
mddev->private = NULL; mddev->private = NULL;
out: out:
return -EIO; return ret;
} }
static void raid10_free(struct mddev *mddev, void *priv) static void raid10_free(struct mddev *mddev, void *priv)
@ -4954,8 +4928,7 @@ static void end_reshape(struct r10conf *conf)
conf->reshape_safe = MaxSector; conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
if (conf->mddev->queue) mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf));
raid10_set_io_opt(conf);
conf->fullsync = 0; conf->fullsync = 0;
} }

View File

@ -1393,7 +1393,8 @@ int ppl_init_log(struct r5conf *conf)
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
ppl_conf->block_size = 512; ppl_conf->block_size = 512;
} else { } else {
ppl_conf->block_size = queue_logical_block_size(mddev->queue); ppl_conf->block_size =
queue_logical_block_size(mddev->gendisk->queue);
} }
for (i = 0; i < ppl_conf->count; i++) { for (i = 0; i < ppl_conf->count; i++) {

View File

@ -36,6 +36,7 @@
*/ */
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/raid/pq.h> #include <linux/raid/pq.h>
#include <linux/async_tx.h> #include <linux/async_tx.h>
@ -760,6 +761,7 @@ enum stripe_result {
STRIPE_RETRY, STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY, STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL, STRIPE_FAIL,
STRIPE_WAIT_RESHAPE,
}; };
struct stripe_request_ctx { struct stripe_request_ctx {
@ -1210,10 +1212,8 @@ again:
*/ */
while (op_is_write(op) && rdev && while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) { test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad; int bad = rdev_has_badblock(rdev, sh->sector,
int bad_sectors; RAID5_STRIPE_SECTORS(conf));
int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (!bad) if (!bad)
break; break;
@ -1295,10 +1295,7 @@ again:
if (rrdev) if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
if (conf->mddev->gendisk) mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector);
trace_block_bio_remap(bi,
disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
if (should_defer && op_is_write(op)) if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi); bio_list_add(&pending_bios, bi);
else else
@ -1342,10 +1339,7 @@ again:
*/ */
if (op == REQ_OP_DISCARD) if (op == REQ_OP_DISCARD)
rbi->bi_vcnt = 0; rbi->bi_vcnt = 0;
if (conf->mddev->gendisk) mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector);
trace_block_bio_remap(rbi,
disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
if (should_defer && op_is_write(op)) if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi); bio_list_add(&pending_bios, rbi);
else else
@ -2412,7 +2406,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
atomic_inc(&conf->active_stripes); atomic_inc(&conf->active_stripes);
raid5_release_stripe(sh); raid5_release_stripe(sh);
conf->max_nr_stripes++; WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
return 1; return 1;
} }
@ -2422,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num)
size_t namelen = sizeof(conf->cache_name[0]); size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks); int devs = max(conf->raid_disks, conf->previous_raid_disks);
if (conf->mddev->gendisk) if (mddev_is_dm(conf->mddev))
snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev));
else
snprintf(conf->cache_name[0], namelen, snprintf(conf->cache_name[0], namelen,
"raid%d-%p", conf->level, conf->mddev); "raid%d-%p", conf->level, conf->mddev);
else
snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev));
snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0; conf->active_name = 0;
@ -2707,7 +2701,7 @@ static int drop_one_stripe(struct r5conf *conf)
shrink_buffers(sh); shrink_buffers(sh);
free_stripe(conf->slab_cache, sh); free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes); atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--; WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1);
return 1; return 1;
} }
@ -2855,8 +2849,6 @@ static void raid5_end_write_request(struct bio *bi)
struct r5conf *conf = sh->raid_conf; struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i; int disks = sh->disks, i;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0; int replacement = 0;
for (i = 0 ; i < disks; i++) { for (i = 0 ; i < disks; i++) {
@ -2888,9 +2880,8 @@ static void raid5_end_write_request(struct bio *bi)
if (replacement) { if (replacement) {
if (bi->bi_status) if (bi->bi_status)
md_error(conf->mddev, rdev); md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector, else if (rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), RAID5_STRIPE_SECTORS(conf)))
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else { } else {
if (bi->bi_status) { if (bi->bi_status) {
@ -2900,9 +2891,8 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_set_bit(WantReplacement, &rdev->flags)) if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery); &rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector, } else if (rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), RAID5_STRIPE_SECTORS(conf))) {
&first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags); set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* That was a successful write so make /* That was a successful write so make
@ -4205,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */ /* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue) mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
blk_add_trace_msg(conf->mddev->queue, sh->sector, rmw);
"raid5 rmw %llu %d",
(unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_InJournal, &dev->flags) && if (test_bit(R5_InJournal, &dev->flags) &&
@ -4285,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_DELAYED, &sh->state);
} }
} }
if (rcw && conf->mddev->queue) if (rcw && !mddev_is_dm(conf->mddev))
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", blk_add_trace_msg(conf->mddev->gendisk->queue,
(unsigned long long)sh->sector, "raid5 rcw %llu %d %d %d",
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); (unsigned long long)sh->sector, rcw, qread,
test_bit(STRIPE_DELAYED, &sh->state));
} }
if (rcw > disks && rmw > disks && if (rcw > disks && rmw > disks &&
@ -4674,8 +4664,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
for (i=disks; i--; ) { for (i=disks; i--; ) {
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int is_bad = 0; int is_bad = 0;
dev = &sh->dev[i]; dev = &sh->dev[i];
@ -4719,8 +4707,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rdev = conf->disks[i].replacement; rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) && if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), !rdev_has_badblock(rdev, sh->sector,
&first_bad, &bad_sectors)) RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_ReadRepl, &dev->flags); set_bit(R5_ReadRepl, &dev->flags);
else { else {
if (rdev && !test_bit(Faulty, &rdev->flags)) if (rdev && !test_bit(Faulty, &rdev->flags))
@ -4733,8 +4721,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL; rdev = NULL;
if (rdev) { if (rdev) {
is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), is_bad = rdev_has_badblock(rdev, sh->sector,
&first_bad, &bad_sectors); RAID5_STRIPE_SECTORS(conf));
if (s->blocked_rdev == NULL if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags) && (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) { || is_bad < 0)) {
@ -5463,8 +5451,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
struct bio *align_bio; struct bio *align_bio;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t sector, end_sector, first_bad; sector_t sector, end_sector;
int bad_sectors, dd_idx; int dd_idx;
bool did_inc; bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) { if (!in_chunk_boundary(mddev, raid_bio)) {
@ -5493,8 +5481,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
&bad_sectors)) {
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
return 0; return 0;
} }
@ -5530,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
} }
if (mddev->gendisk) mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector);
trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio); submit_bio_noacct(align_bio);
return 1; return 1;
} }
@ -5701,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
} }
release_inactive_stripe_list(conf, cb->temp_inactive_list, release_inactive_stripe_list(conf, cb->temp_inactive_list,
NR_STRIPE_HASH_LOCKS); NR_STRIPE_HASH_LOCKS);
if (mddev->queue) if (!mddev_is_dm(mddev))
trace_block_unplug(mddev->queue, cnt, !from_schedule); trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule);
kfree(cb); kfree(cb);
} }
@ -5946,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector, if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) { conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
return STRIPE_SCHEDULE_AND_RETRY; ret = STRIPE_SCHEDULE_AND_RETRY;
goto out;
} }
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
@ -6025,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release: out_release:
raid5_release_stripe(sh); raid5_release_stripe(sh);
out:
if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
bi->bi_status = BLK_STS_RESOURCE;
ret = STRIPE_WAIT_RESHAPE;
pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
}
return ret; return ret;
} }
@ -6146,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
while (1) { while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector, res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi); bi);
if (res == STRIPE_FAIL) if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
break; break;
if (res == STRIPE_RETRY) if (res == STRIPE_RETRY)
@ -6184,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
if (rw == WRITE) if (rw == WRITE)
md_write_end(mddev); md_write_end(mddev);
if (res == STRIPE_WAIT_RESHAPE) {
md_free_cloned_bio(bi);
return false;
}
bio_endio(bi); bio_endio(bi);
return true; return true;
} }
@ -6773,7 +6770,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev); md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
/*
* Waiting on MD_SB_CHANGE_PENDING below may deadlock
* seeing md_check_recovery() is needed to clear
* the flag when using mdmon.
*/
continue;
} }
wait_event_lock_irq(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
conf->device_lock);
} }
pr_debug("%d stripes handled\n", handled); pr_debug("%d stripes handled\n", handled);
@ -6820,7 +6828,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
if (size <= 16 || size > 32768) if (size <= 16 || size > 32768)
return -EINVAL; return -EINVAL;
conf->min_nr_stripes = size; WRITE_ONCE(conf->min_nr_stripes, size);
mutex_lock(&conf->cache_size_mutex); mutex_lock(&conf->cache_size_mutex);
while (size < conf->max_nr_stripes && while (size < conf->max_nr_stripes &&
drop_one_stripe(conf)) drop_one_stripe(conf))
@ -6832,7 +6840,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
mutex_lock(&conf->cache_size_mutex); mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes) while (size > conf->max_nr_stripes)
if (!grow_one_stripe(conf, GFP_KERNEL)) { if (!grow_one_stripe(conf, GFP_KERNEL)) {
conf->min_nr_stripes = conf->max_nr_stripes; WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
result = -ENOMEM; result = -ENOMEM;
break; break;
} }
@ -6967,10 +6975,8 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
pr_debug("md/raid: change stripe_size from %lu to %lu\n", pr_debug("md/raid: change stripe_size from %lu to %lu\n",
conf->stripe_size, new); conf->stripe_size, new);
if (mddev->sync_thread || if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) {
mddev->reshape_position != MaxSector ||
mddev->sysfs_active) {
err = -EBUSY; err = -EBUSY;
goto out_unlock; goto out_unlock;
} }
@ -7084,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf) if (!conf)
err = -ENODEV; err = -ENODEV;
else if (new != conf->skip_copy) { else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue; struct request_queue *q = mddev->gendisk->queue;
conf->skip_copy = new; conf->skip_copy = new;
if (new) if (new)
@ -7390,11 +7396,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
struct shrink_control *sc) struct shrink_control *sc)
{ {
struct r5conf *conf = shrink->private_data; struct r5conf *conf = shrink->private_data;
int max_stripes = READ_ONCE(conf->max_nr_stripes);
int min_stripes = READ_ONCE(conf->min_nr_stripes);
if (conf->max_nr_stripes < conf->min_nr_stripes) if (max_stripes < min_stripes)
/* unlikely, but not impossible */ /* unlikely, but not impossible */
return 0; return 0;
return conf->max_nr_stripes - conf->min_nr_stripes; return max_stripes - min_stripes;
} }
static struct r5conf *setup_conf(struct mddev *mddev) static struct r5conf *setup_conf(struct mddev *mddev)
@ -7684,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0; return 0;
} }
static void raid5_set_io_opt(struct r5conf *conf) static int raid5_set_limits(struct mddev *mddev)
{ {
blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * struct r5conf *conf = mddev->private;
(conf->raid_disks - conf->max_degraded)); struct queue_limits lim;
int data_disks, stripe;
struct md_rdev *rdev;
/*
* The read-ahead size must cover two whole stripes, which is
* 2 * data_disks * chunksize, where data_disks (computed below) is the
* previous number of raid devices minus max_degraded.
*/
data_disks = conf->previous_raid_disks - conf->max_degraded;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
*/
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
blk_set_stacking_limits(&lim);
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.raid_partial_stripes_expensive = 1;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
mddev->gendisk->disk_name);
/*
* Zeroing is required for discard, otherwise data could be lost.
*
* Consider a scenario: discard a stripe (the stripe could be
* inconsistent if discard_zeroes_data is 0); write one disk of the
* stripe (the stripe could be inconsistent again depending on which
* disks are used to calculate parity); the disk is broken; The stripe
* data of this disk is lost.
*
* We only allow DISCARD if the sysadmin has confirmed that only safe
* devices are in use by setting a module parameter. A better idea
* might be to turn DISCARD into WRITE_ZEROES requests, as that is
* required to be safe.
*/
if (!devices_handle_discard_safely ||
lim.max_discard_sectors < (stripe >> 9) ||
lim.discard_granularity < stripe)
lim.max_hw_discard_sectors = 0;
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
/* No restrictions on the number of segments in the request */
lim.max_segments = USHRT_MAX;
return queue_limits_set(mddev->gendisk->queue, &lim);
} }
static int raid5_run(struct mddev *mddev) static int raid5_run(struct mddev *mddev)
@ -7700,6 +7763,7 @@ static int raid5_run(struct mddev *mddev)
int i; int i;
long long min_offset_diff = 0; long long min_offset_diff = 0;
int first = 1; int first = 1;
int ret = -EIO;
if (mddev->recovery_cp != MaxSector) if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@ -7948,66 +8012,10 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev)); mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
if (mddev->queue) { if (!mddev_is_dm(mddev)) {
int chunk_size; ret = raid5_set_limits(mddev);
/* read-ahead size must cover two whole stripes, which if (ret)
* is 2 * (datadisks) * chunksize where 'n' is the goto abort;
* number of raid devices
*/
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
*/
stripe = stripe * PAGE_SIZE;
stripe = roundup_pow_of_two(stripe);
mddev->queue->limits.discard_granularity = stripe;
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
}
/*
* zeroing is required, otherwise data
* could be lost. Consider a scenario: discard a stripe
* (the stripe could be inconsistent if
* discard_zeroes_data is 0); write one disk of the
* stripe (the stripe could be inconsistent again
* depending on which disks are used to calculate
* parity); the disk is broken; The stripe data of this
* disk is lost.
*
* We only allow DISCARD if the sysadmin has confirmed that
* only safe devices are in use by setting a module parameter.
* A better idea might be to turn DISCARD into WRITE_ZEROES
* requests, as that is required to be safe.
*/
if (!devices_handle_discard_safely ||
mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
mddev->queue->limits.discard_granularity < stripe)
blk_queue_max_discard_sectors(mddev->queue, 0);
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
blk_queue_max_hw_sectors(mddev->queue,
RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
/* No restrictions on the number of segments in the request */
blk_queue_max_segments(mddev->queue, USHRT_MAX);
} }
if (log_init(conf, journal_dev, raid5_has_ppl(conf))) if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@ -8020,7 +8028,7 @@ abort:
free_conf(conf); free_conf(conf);
mddev->private = NULL; mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
return -EIO; return ret;
} }
static void raid5_free(struct mddev *mddev, void *priv) static void raid5_free(struct mddev *mddev, void *priv)
@ -8531,8 +8539,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap); wake_up(&conf->wait_for_overlap);
if (conf->mddev->queue) mddev_update_io_opt(conf->mddev,
raid5_set_io_opt(conf); conf->raid_disks - conf->max_degraded);
} }
} }
@ -8909,6 +8917,18 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log); return r5l_start(conf->log);
} }
/*
* Installed as the .prepare_suspend callback of the raid4/5/6 personalities.
*
* This is only used for dm-raid456. The caller has already frozen
* sync_thread, hence if a reshape is still in progress, I/O that is waiting
* for the reshape can never complete now; wake up the waiters so that this
* I/O gets handled.
*/
static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
/* Kick anything sleeping on the wait_for_overlap wait queue. */
wake_up(&conf->wait_for_overlap);
}
static struct md_personality raid6_personality = static struct md_personality raid6_personality =
{ {
.name = "raid6", .name = "raid6",
@ -8932,6 +8952,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce, .quiesce = raid5_quiesce,
.takeover = raid6_takeover, .takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy, .change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
}; };
static struct md_personality raid5_personality = static struct md_personality raid5_personality =
{ {
@ -8956,6 +8977,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce, .quiesce = raid5_quiesce,
.takeover = raid5_takeover, .takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy, .change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
}; };
static struct md_personality raid4_personality = static struct md_personality raid4_personality =
@ -8981,6 +9003,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce, .quiesce = raid5_quiesce,
.takeover = raid4_takeover, .takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy, .change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
}; };
static int __init raid5_init(void) static int __init raid5_init(void)

View File

@ -2078,6 +2078,12 @@ static const struct blk_mq_ops msb_mq_ops = {
static int msb_init_disk(struct memstick_dev *card) static int msb_init_disk(struct memstick_dev *card)
{ {
struct msb_data *msb = memstick_get_drvdata(card); struct msb_data *msb = memstick_get_drvdata(card);
struct queue_limits lim = {
.logical_block_size = msb->page_size,
.max_hw_sectors = MS_BLOCK_MAX_PAGES,
.max_segments = MS_BLOCK_MAX_SEGS,
.max_segment_size = MS_BLOCK_MAX_PAGES * msb->page_size,
};
int rc; int rc;
unsigned long capacity; unsigned long capacity;
@ -2093,19 +2099,13 @@ static int msb_init_disk(struct memstick_dev *card)
if (rc) if (rc)
goto out_release_id; goto out_release_id;
msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card);
if (IS_ERR(msb->disk)) { if (IS_ERR(msb->disk)) {
rc = PTR_ERR(msb->disk); rc = PTR_ERR(msb->disk);
goto out_free_tag_set; goto out_free_tag_set;
} }
msb->queue = msb->disk->queue; msb->queue = msb->disk->queue;
blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
blk_queue_max_segment_size(msb->queue,
MS_BLOCK_MAX_PAGES * msb->page_size);
blk_queue_logical_block_size(msb->queue, msb->page_size);
sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id);
msb->disk->fops = &msb_bdops; msb->disk->fops = &msb_bdops;
msb->disk->private_data = msb; msb->disk->private_data = msb;

View File

@ -1103,6 +1103,12 @@ static const struct blk_mq_ops mspro_mq_ops = {
static int mspro_block_init_disk(struct memstick_dev *card) static int mspro_block_init_disk(struct memstick_dev *card)
{ {
struct mspro_block_data *msb = memstick_get_drvdata(card); struct mspro_block_data *msb = memstick_get_drvdata(card);
struct queue_limits lim = {
.logical_block_size = msb->page_size,
.max_hw_sectors = MSPRO_BLOCK_MAX_PAGES,
.max_segments = MSPRO_BLOCK_MAX_SEGS,
.max_segment_size = MSPRO_BLOCK_MAX_PAGES * msb->page_size,
};
struct mspro_devinfo *dev_info = NULL; struct mspro_devinfo *dev_info = NULL;
struct mspro_sys_info *sys_info = NULL; struct mspro_sys_info *sys_info = NULL;
struct mspro_sys_attr *s_attr = NULL; struct mspro_sys_attr *s_attr = NULL;
@ -1138,18 +1144,13 @@ static int mspro_block_init_disk(struct memstick_dev *card)
if (rc) if (rc)
goto out_release_id; goto out_release_id;
msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card);
if (IS_ERR(msb->disk)) { if (IS_ERR(msb->disk)) {
rc = PTR_ERR(msb->disk); rc = PTR_ERR(msb->disk);
goto out_free_tag_set; goto out_free_tag_set;
} }
msb->queue = msb->disk->queue; msb->queue = msb->disk->queue;
blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
blk_queue_max_segment_size(msb->queue,
MSPRO_BLOCK_MAX_PAGES * msb->page_size);
msb->disk->major = major; msb->disk->major = major;
msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT; msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT;
msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT; msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT;
@ -1158,8 +1159,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
sprintf(msb->disk->disk_name, "mspblk%d", disk_id); sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
blk_queue_logical_block_size(msb->queue, msb->page_size);
capacity = be16_to_cpu(sys_info->user_block_count); capacity = be16_to_cpu(sys_info->user_block_count);
capacity *= be16_to_cpu(sys_info->block_size); capacity *= be16_to_cpu(sys_info->block_size);
capacity *= msb->page_size >> 9; capacity *= msb->page_size >> 9;

View File

@ -174,8 +174,8 @@ static struct scatterlist *mmc_alloc_sg(unsigned short sg_len, gfp_t gfp)
return sg; return sg;
} }
static void mmc_queue_setup_discard(struct request_queue *q, static void mmc_queue_setup_discard(struct mmc_card *card,
struct mmc_card *card) struct queue_limits *lim)
{ {
unsigned max_discard; unsigned max_discard;
@ -183,15 +183,17 @@ static void mmc_queue_setup_discard(struct request_queue *q,
if (!max_discard) if (!max_discard)
return; return;
blk_queue_max_discard_sectors(q, max_discard); lim->max_hw_discard_sectors = max_discard;
q->limits.discard_granularity = card->pref_erase << 9; if (mmc_can_secure_erase_trim(card))
lim->max_secure_erase_sectors = max_discard;
if (mmc_can_trim(card) && card->erased_byte == 0)
lim->max_write_zeroes_sectors = max_discard;
/* granularity must not be greater than max. discard */ /* granularity must not be greater than max. discard */
if (card->pref_erase > max_discard) if (card->pref_erase > max_discard)
q->limits.discard_granularity = SECTOR_SIZE; lim->discard_granularity = SECTOR_SIZE;
if (mmc_can_secure_erase_trim(card)) else
blk_queue_max_secure_erase_sectors(q, max_discard); lim->discard_granularity = card->pref_erase << 9;
if (mmc_can_trim(card) && card->erased_byte == 0)
blk_queue_max_write_zeroes_sectors(q, max_discard);
} }
static unsigned short mmc_get_max_segments(struct mmc_host *host) static unsigned short mmc_get_max_segments(struct mmc_host *host)
@ -341,40 +343,53 @@ static const struct blk_mq_ops mmc_mq_ops = {
.timeout = mmc_mq_timed_out, .timeout = mmc_mq_timed_out,
}; };
static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq,
struct mmc_card *card)
{ {
struct mmc_host *host = card->host; struct mmc_host *host = card->host;
unsigned block_size = 512; struct queue_limits lim = { };
struct gendisk *disk;
if (mmc_can_erase(card))
mmc_queue_setup_discard(card, &lim);
if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask)
lim.bounce = BLK_BOUNCE_HIGH;
lim.max_hw_sectors = min(host->max_blk_count, host->max_req_size / 512);
if (mmc_card_mmc(card) && card->ext_csd.data_sector_size)
lim.logical_block_size = card->ext_csd.data_sector_size;
else
lim.logical_block_size = 512;
WARN_ON_ONCE(lim.logical_block_size != 512 &&
lim.logical_block_size != 4096);
/*
* Setting a virt_boundary implicitly sets a max_segment_size, so try
* to set the hardware one here.
*/
if (host->can_dma_map_merge) {
lim.virt_boundary_mask = dma_get_merge_boundary(mmc_dev(host));
lim.max_segments = MMC_DMA_MAP_MERGE_SEGMENTS;
} else {
lim.max_segment_size =
round_down(host->max_seg_size, lim.logical_block_size);
lim.max_segments = host->max_segs;
}
disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq);
if (IS_ERR(disk))
return disk;
mq->queue = disk->queue;
if (mmc_host_is_spi(host) && host->use_spi_crc)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
blk_queue_rq_timeout(mq->queue, 60 * HZ);
blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue);
if (mmc_can_erase(card))
mmc_queue_setup_discard(mq->queue, card);
if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask)
blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH);
blk_queue_max_hw_sectors(mq->queue,
min(host->max_blk_count, host->max_req_size / 512));
if (host->can_dma_map_merge)
WARN(!blk_queue_can_use_dma_map_merging(mq->queue,
mmc_dev(host)),
"merging was advertised but not possible");
blk_queue_max_segments(mq->queue, mmc_get_max_segments(host));
if (mmc_card_mmc(card) && card->ext_csd.data_sector_size) {
block_size = card->ext_csd.data_sector_size;
WARN_ON(block_size != 512 && block_size != 4096);
}
blk_queue_logical_block_size(mq->queue, block_size);
/*
* After blk_queue_can_use_dma_map_merging() was called with succeed,
* since it calls blk_queue_virt_boundary(), the mmc should not call
* both blk_queue_max_segment_size().
*/
if (!host->can_dma_map_merge)
blk_queue_max_segment_size(mq->queue,
round_down(host->max_seg_size, block_size));
dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue));
@ -386,6 +401,7 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
init_waitqueue_head(&mq->wait); init_waitqueue_head(&mq->wait);
mmc_crypto_setup_queue(mq->queue, host); mmc_crypto_setup_queue(mq->queue, host);
return disk;
} }
static inline bool mmc_merge_capable(struct mmc_host *host) static inline bool mmc_merge_capable(struct mmc_host *host)
@ -447,20 +463,11 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
return ERR_PTR(ret); return ERR_PTR(ret);
disk = blk_mq_alloc_disk(&mq->tag_set, mq); disk = mmc_alloc_disk(mq, card);
if (IS_ERR(disk)) { if (IS_ERR(disk))
blk_mq_free_tag_set(&mq->tag_set); blk_mq_free_tag_set(&mq->tag_set);
return disk; return disk;
} }
mq->queue = disk->queue;
if (mmc_host_is_spi(host) && host->use_spi_crc)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
blk_queue_rq_timeout(mq->queue, 60 * HZ);
mmc_setup_queue(mq, card);
return disk;
}
void mmc_queue_suspend(struct mmc_queue *mq) void mmc_queue_suspend(struct mmc_queue *mq)
{ {

View File

@ -277,6 +277,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
{ {
struct mtd_blktrans_ops *tr = new->tr; struct mtd_blktrans_ops *tr = new->tr;
struct mtd_blktrans_dev *d; struct mtd_blktrans_dev *d;
struct queue_limits lim = { };
int last_devnum = -1; int last_devnum = -1;
struct gendisk *gd; struct gendisk *gd;
int ret; int ret;
@ -332,8 +333,12 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
if (ret) if (ret)
goto out_kfree_tag_set; goto out_kfree_tag_set;
lim.logical_block_size = tr->blksize;
if (tr->discard)
lim.max_hw_discard_sectors = UINT_MAX;
/* Create gendisk */ /* Create gendisk */
gd = blk_mq_alloc_disk(new->tag_set, new); gd = blk_mq_alloc_disk(new->tag_set, &lim, new);
if (IS_ERR(gd)) { if (IS_ERR(gd)) {
ret = PTR_ERR(gd); ret = PTR_ERR(gd);
goto out_free_tag_set; goto out_free_tag_set;
@ -371,14 +376,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
if (tr->flush) if (tr->flush)
blk_queue_write_cache(new->rq, true, false); blk_queue_write_cache(new->rq, true, false);
blk_queue_logical_block_size(new->rq, tr->blksize);
blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
if (tr->discard)
blk_queue_max_discard_sectors(new->rq, UINT_MAX);
gd->queue = new->rq; gd->queue = new->rq;
if (new->readonly) if (new->readonly)

View File

@ -348,6 +348,9 @@ static int calc_disk_capacity(struct ubi_volume_info *vi, u64 *disk_capacity)
int ubiblock_create(struct ubi_volume_info *vi) int ubiblock_create(struct ubi_volume_info *vi)
{ {
struct queue_limits lim = {
.max_segments = UBI_MAX_SG_COUNT,
};
struct ubiblock *dev; struct ubiblock *dev;
struct gendisk *gd; struct gendisk *gd;
u64 disk_capacity; u64 disk_capacity;
@ -393,7 +396,7 @@ int ubiblock_create(struct ubi_volume_info *vi)
/* Initialize the gendisk of this ubiblock device */ /* Initialize the gendisk of this ubiblock device */
gd = blk_mq_alloc_disk(&dev->tag_set, dev); gd = blk_mq_alloc_disk(&dev->tag_set, &lim, dev);
if (IS_ERR(gd)) { if (IS_ERR(gd)) {
ret = PTR_ERR(gd); ret = PTR_ERR(gd);
goto out_free_tags; goto out_free_tags;
@ -416,7 +419,6 @@ int ubiblock_create(struct ubi_volume_info *vi)
dev->gd = gd; dev->gd = gd;
dev->rq = gd->queue; dev->rq = gd->queue;
blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT);
list_add_tail(&dev->list, &ubiblock_devices); list_add_tail(&dev->list, &ubiblock_devices);

View File

@ -1496,19 +1496,21 @@ static int btt_blk_init(struct btt *btt)
{ {
struct nd_btt *nd_btt = btt->nd_btt; struct nd_btt *nd_btt = btt->nd_btt;
struct nd_namespace_common *ndns = nd_btt->ndns; struct nd_namespace_common *ndns = nd_btt->ndns;
int rc = -ENOMEM; struct queue_limits lim = {
.logical_block_size = btt->sector_size,
.max_hw_sectors = UINT_MAX,
};
int rc;
btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (!btt->btt_disk) if (IS_ERR(btt->btt_disk))
return -ENOMEM; return PTR_ERR(btt->btt_disk);
nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
btt->btt_disk->first_minor = 0; btt->btt_disk->first_minor = 0;
btt->btt_disk->fops = &btt_fops; btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt; btt->btt_disk->private_data = btt;
blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size);
blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue);

View File

@ -451,6 +451,11 @@ static int pmem_attach_disk(struct device *dev,
{ {
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(dev->parent); struct nd_region *nd_region = to_nd_region(dev->parent);
struct queue_limits lim = {
.logical_block_size = pmem_sector_size(ndns),
.physical_block_size = PAGE_SIZE,
.max_hw_sectors = UINT_MAX,
};
int nid = dev_to_node(dev), fua; int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res; struct resource *res = &nsio->res;
struct range bb_range; struct range bb_range;
@ -497,9 +502,9 @@ static int pmem_attach_disk(struct device *dev,
return -EBUSY; return -EBUSY;
} }
disk = blk_alloc_disk(nid); disk = blk_alloc_disk(&lim, nid);
if (!disk) if (IS_ERR(disk))
return -ENOMEM; return PTR_ERR(disk);
q = disk->queue; q = disk->queue;
pmem->disk = disk; pmem->disk = disk;
@ -539,9 +544,6 @@ static int pmem_attach_disk(struct device *dev,
pmem->virt_addr = addr; pmem->virt_addr = addr;
blk_queue_write_cache(q, true, fua); blk_queue_write_cache(q, true, fua);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
if (pmem->pfn_flags & PFN_MAP) if (pmem->pfn_flags & PFN_MAP)

View File

@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev; goto put_dev;
} }
anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset); anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL);
if (IS_ERR(anv->ctrl.admin_q)) { if (IS_ERR(anv->ctrl.admin_q)) {
ret = -ENOMEM; ret = -ENOMEM;
goto put_dev; goto put_dev;

View File

@ -114,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida); static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt; static dev_t nvme_ctrl_base_chr_devt;
static struct class *nvme_class; static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
static struct class *nvme_subsys_class; static const struct class nvme_class = {
.name = "nvme",
.dev_uevent = nvme_class_uevent,
};
static const struct class nvme_subsys_class = {
.name = "nvme-subsystem",
};
static DEFINE_IDA(nvme_ns_chr_minor_ida); static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt; static dev_t nvme_ns_chr_devt;
static struct class *nvme_ns_chr_class; static const struct class nvme_ns_chr_class = {
.name = "nvme-generic",
};
static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@ -1398,8 +1407,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl)); sizeof(struct nvme_id_ctrl));
if (error) if (error) {
kfree(*id); kfree(*id);
*id = NULL;
}
return error; return error;
} }
@ -1528,6 +1539,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (error) { if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(*id); kfree(*id);
*id = NULL;
} }
return error; return error;
} }
@ -1727,12 +1739,23 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0; return 0;
} }
#ifdef CONFIG_BLK_DEV_INTEGRITY static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
static void nvme_init_integrity(struct gendisk *disk,
struct nvme_ns_head *head, u32 max_integrity_segments)
{ {
struct blk_integrity integrity = { }; struct blk_integrity integrity = { };
blk_integrity_unregister(disk);
if (!head->ms)
return true;
/*
* PI can always be supported as we can ask the controller to simply
* insert/strip it, which is not possible for other kinds of metadata.
*/
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
!(head->features & NVME_NS_METADATA_SUPPORTED))
return nvme_ns_has_pi(head);
switch (head->pi_type) { switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3: case NVME_NS_DPS_PI_TYPE3:
switch (head->guard_type) { switch (head->guard_type) {
@ -1775,53 +1798,32 @@ static void nvme_init_integrity(struct gendisk *disk,
} }
integrity.tuple_size = head->ms; integrity.tuple_size = head->ms;
integrity.pi_offset = head->pi_offset;
blk_integrity_register(disk, &integrity); blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); return true;
} }
#else
static void nvme_init_integrity(struct gendisk *disk,
struct nvme_ns_head *head, u32 max_integrity_segments)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
struct nvme_ns_head *head)
{ {
struct request_queue *queue = disk->queue; struct nvme_ctrl *ctrl = ns->ctrl;
u32 max_discard_sectors;
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
} else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
max_discard_sectors = UINT_MAX;
} else {
blk_queue_max_discard_sectors(queue, 0);
return;
}
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES); NVME_DSM_MAX_RANGES);
/* if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
* If discard is already enabled, don't reset queue limits. lim->max_hw_discard_sectors =
* nvme_lba_to_sect(ns->head, ctrl->dmrsl);
* This works around the fact that the block layer can't cope well with else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
* updating the hardware limits when overridden through sysfs. This is lim->max_hw_discard_sectors = UINT_MAX;
* harmless because discard limits in NVMe are purely advisory.
*/
if (queue->limits.max_discard_sectors)
return;
blk_queue_max_discard_sectors(queue, max_discard_sectors);
if (ctrl->dmrl)
blk_queue_max_discard_segments(queue, ctrl->dmrl);
else else
blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); lim->max_hw_discard_sectors = 0;
queue->limits.discard_granularity = queue_logical_block_size(queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) lim->discard_granularity = lim->logical_block_size;
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
if (ctrl->dmrl)
lim->max_discard_segments = ctrl->dmrl;
else
lim->max_discard_segments = NVME_DSM_MAX_RANGES;
} }
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
@ -1832,42 +1834,38 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi; a->csi == b->csi;
} }
static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id) struct nvme_id_ns_nvm **nvmp)
{ {
bool first = id->dps & NVME_NS_DPS_PI_FIRST; struct nvme_command c = {
unsigned lbaf = nvme_lbaf_index(id->flbas); .identify.opcode = nvme_admin_identify,
struct nvme_command c = { }; .identify.nsid = cpu_to_le32(nsid),
.identify.cns = NVME_ID_CNS_CS_NS,
.identify.csi = NVME_CSI_NVM,
};
struct nvme_id_ns_nvm *nvm; struct nvme_id_ns_nvm *nvm;
int ret = 0; int ret;
u32 elbaf;
head->pi_size = 0;
head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
head->pi_size = sizeof(struct t10_pi_tuple);
head->guard_type = NVME_NVM_NS_16B_GUARD;
goto set_pi;
}
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL); nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
if (!nvm) if (!nvm)
return -ENOMEM; return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(head->ns_id);
c.identify.cns = NVME_ID_CNS_CS_NS;
c.identify.csi = NVME_CSI_NVM;
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm)); ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret) if (ret)
goto free_data; kfree(nvm);
else
*nvmp = nvm;
return ret;
}
elbaf = le32_to_cpu(nvm->elbaf[lbaf]); static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
{
u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
/* no support for storage tag formats right now */ /* no support for storage tag formats right now */
if (nvme_elbaf_sts(elbaf)) if (nvme_elbaf_sts(elbaf))
goto free_data; return;
head->guard_type = nvme_elbaf_guard_type(elbaf); head->guard_type = nvme_elbaf_guard_type(elbaf);
switch (head->guard_type) { switch (head->guard_type) {
@ -1880,30 +1878,31 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
default: default:
break; break;
} }
free_data:
kfree(nvm);
set_pi:
if (head->pi_size && (first || head->ms == head->pi_size))
head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
head->pi_type = 0;
return ret;
} }
static int nvme_configure_metadata(struct nvme_ctrl *ctrl, static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
struct nvme_ns_head *head, struct nvme_id_ns *id) struct nvme_ns_head *head, struct nvme_id_ns *id,
struct nvme_id_ns_nvm *nvm)
{ {
int ret;
ret = nvme_init_ms(ctrl, head, id);
if (ret)
return ret;
head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
head->pi_type = 0;
head->pi_size = 0;
head->pi_offset = 0;
head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0; return;
if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
nvme_configure_pi_elbas(head, id, nvm);
} else {
head->pi_size = sizeof(struct t10_pi_tuple);
head->guard_type = NVME_NVM_NS_16B_GUARD;
}
if (head->pi_size && head->ms >= head->pi_size)
head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
if (!(id->dps & NVME_NS_DPS_PI_FIRST))
head->pi_offset = head->ms - head->pi_size;
if (ctrl->ops->flags & NVME_F_FABRICS) { if (ctrl->ops->flags & NVME_F_FABRICS) {
/* /*
@ -1912,7 +1911,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
* remap the separate metadata buffer from the block layer. * remap the separate metadata buffer from the block layer.
*/ */
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return 0; return;
head->features |= NVME_NS_EXT_LBAS; head->features |= NVME_NS_EXT_LBAS;
@ -1939,33 +1938,32 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
else else
head->features |= NVME_NS_METADATA_SUPPORTED; head->features |= NVME_NS_METADATA_SUPPORTED;
} }
return 0;
} }
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
struct request_queue *q)
{ {
bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
if (ctrl->max_hw_sectors) {
u32 max_segments =
(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
max_segments = min_not_zero(max_segments, ctrl->max_segments);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
blk_queue_dma_alignment(q, 3);
blk_queue_write_cache(q, vwc, vwc);
} }
static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
struct nvme_ns_head *head, struct nvme_id_ns *id) struct queue_limits *lim)
{ {
sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze)); lim->max_hw_sectors = ctrl->max_hw_sectors;
lim->max_segments = min_t(u32, USHRT_MAX,
min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
lim->max_integrity_segments = ctrl->max_integrity_segments;
lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
lim->max_segment_size = UINT_MAX;
lim->dma_alignment = 3;
}
static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
struct queue_limits *lim)
{
struct nvme_ns_head *head = ns->head;
u32 bs = 1U << head->lba_shift; u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0; u32 atomic_bs, phys_bs, io_opt = 0;
bool valid = true;
/* /*
* The block layer can't support LBA sizes larger than the page size * The block layer can't support LBA sizes larger than the page size
@ -1973,12 +1971,10 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
* allow block I/O. * allow block I/O.
*/ */
if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
capacity = 0;
bs = (1 << 9); bs = (1 << 9);
valid = false;
} }
blk_integrity_unregister(disk);
atomic_bs = phys_bs = bs; atomic_bs = phys_bs = bs;
if (id->nabo == 0) { if (id->nabo == 0) {
/* /*
@ -1989,7 +1985,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else else
atomic_bs = (1 + ctrl->subsys->awupf) * bs; atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
} }
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@ -1999,36 +1995,20 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
io_opt = bs * (1 + le16_to_cpu(id->nows)); io_opt = bs * (1 + le16_to_cpu(id->nows));
} }
blk_queue_logical_block_size(disk->queue, bs);
/* /*
* Linux filesystems assume writing a single physical block is * Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the * an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter. * value of the Atomic Write Unit Power Fail parameter.
*/ */
blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); lim->logical_block_size = bs;
blk_queue_io_min(disk->queue, phys_bs); lim->physical_block_size = min(phys_bs, atomic_bs);
blk_queue_io_opt(disk->queue, io_opt); lim->io_min = phys_bs;
lim->io_opt = io_opt;
/* if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
* Register a metadata profile for PI, or the plain non-integrity NVMe lim->max_write_zeroes_sectors = UINT_MAX;
* metadata masquerading as Type 0 if supported, otherwise reject block else
* I/O to namespaces with metadata except when the namespace supports lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
* PI, as it can strip/insert in that case. return valid;
*/
if (head->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
(head->features & NVME_NS_METADATA_SUPPORTED))
nvme_init_integrity(disk, head,
ctrl->max_integrity_segments);
else if (!nvme_ns_has_pi(head))
capacity = 0;
}
set_capacity_and_notify(disk, capacity);
nvme_config_discard(ctrl, disk, head);
blk_queue_max_write_zeroes_sectors(disk->queue,
ctrl->max_zeroes_sectors);
} }
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info) static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@ -2042,7 +2022,8 @@ static inline bool nvme_first_scan(struct gendisk *disk)
return !disk_live(disk); return !disk_live(disk);
} }
static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
struct queue_limits *lim)
{ {
struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_ctrl *ctrl = ns->ctrl;
u32 iob; u32 iob;
@ -2070,38 +2051,36 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
return; return;
} }
blk_queue_chunk_sectors(ns->queue, iob); lim->chunk_sectors = iob;
} }
static int nvme_update_ns_info_generic(struct nvme_ns *ns, static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info) struct nvme_ns_info *info)
{ {
struct queue_limits lim;
int ret;
blk_mq_freeze_queue(ns->disk->queue); blk_mq_freeze_queue(ns->disk->queue);
nvme_set_queue_limits(ns->ctrl, ns->queue); lim = queue_limits_start_update(ns->disk->queue);
nvme_set_ctrl_limits(ns->ctrl, &lim);
ret = queue_limits_commit_update(ns->disk->queue, &lim);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_mq_unfreeze_queue(ns->disk->queue); blk_mq_unfreeze_queue(ns->disk->queue);
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
ns->head->disk->flags |= GENHD_FL_HIDDEN;
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
/* Hide the block-interface for these devices */ /* Hide the block-interface for these devices */
ns->disk->flags |= GENHD_FL_HIDDEN; if (!ret)
set_bit(NVME_NS_READY, &ns->flags); ret = -ENODEV;
return ret;
return 0;
} }
static int nvme_update_ns_info_block(struct nvme_ns *ns, static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info) struct nvme_ns_info *info)
{ {
bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
struct queue_limits lim;
struct nvme_id_ns_nvm *nvm = NULL;
struct nvme_id_ns *id; struct nvme_id_ns *id;
sector_t capacity;
unsigned lbaf; unsigned lbaf;
int ret; int ret;
@ -2113,30 +2092,52 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
/* namespace not allocated or attached */ /* namespace not allocated or attached */
info->is_removed = true; info->is_removed = true;
ret = -ENODEV; ret = -ENODEV;
goto error; goto out;
}
if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
if (ret < 0)
goto out;
} }
blk_mq_freeze_queue(ns->disk->queue); blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas); lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse); ns->head->nuse = le64_to_cpu(id->nuse);
nvme_set_queue_limits(ns->ctrl, ns->queue); capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
ret = nvme_configure_metadata(ns->ctrl, ns->head, id); lim = queue_limits_start_update(ns->disk->queue);
if (ret < 0) { nvme_set_ctrl_limits(ns->ctrl, &lim);
blk_mq_unfreeze_queue(ns->disk->queue); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm);
goto out; nvme_set_chunk_sectors(ns, id, &lim);
} if (!nvme_update_disk_info(ns, id, &lim))
nvme_set_chunk_sectors(ns, id); capacity = 0;
nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id); nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
if (ns->head->ids.csi == NVME_CSI_ZNS) { ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf); ret = nvme_update_zone_info(ns, lbaf, &lim);
if (ret) { if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue); blk_mq_unfreeze_queue(ns->disk->queue);
goto out; goto out;
} }
} }
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
/*
* Register a metadata profile for PI, or the plain non-integrity NVMe
* metadata masquerading as Type 0 if supported, otherwise reject block
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
if (!nvme_init_integrity(ns->disk, ns->head))
capacity = 0;
set_capacity_and_notify(ns->disk, capacity);
/* /*
* Only set the DEAC bit if the device guarantees that reads from * Only set the DEAC bit if the device guarantees that reads from
@ -2147,28 +2148,50 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->head->features |= NVME_NS_DEAC; ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_queue_write_cache(ns->disk->queue, vwc, vwc);
set_bit(NVME_NS_READY, &ns->flags); set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue); blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) { if (blk_queue_is_zoned(ns->queue)) {
ret = nvme_revalidate_zones(ns); ret = blk_revalidate_disk_zones(ns->disk, NULL);
if (ret && !nvme_first_scan(ns->disk)) if (ret && !nvme_first_scan(ns->disk))
goto out; goto out;
} }
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
disk_update_readahead(ns->head->disk);
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
ret = 0; ret = 0;
out: out:
kfree(nvm);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
bool unsupported = false;
int ret;
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
ret = nvme_update_ns_info_generic(ns, info);
break;
}
ret = nvme_update_ns_info_block(ns, info);
break;
case NVME_CSI_NVM:
ret = nvme_update_ns_info_block(ns, info);
break;
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
ret = nvme_update_ns_info_generic(ns, info);
break;
}
/* /*
* If probing fails due an unsupported feature, hide the block device, * If probing fails due an unsupported feature, hide the block device,
* but still allow other access. * but still allow other access.
@ -2176,33 +2199,30 @@ out:
if (ret == -ENODEV) { if (ret == -ENODEV) {
ns->disk->flags |= GENHD_FL_HIDDEN; ns->disk->flags |= GENHD_FL_HIDDEN;
set_bit(NVME_NS_READY, &ns->flags); set_bit(NVME_NS_READY, &ns->flags);
unsupported = true;
ret = 0; ret = 0;
} }
error: if (!ret && nvme_ns_head_multipath(ns->head)) {
kfree(id); struct queue_limits lim;
return ret;
blk_mq_freeze_queue(ns->head->disk->queue);
if (unsupported)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head->disk, ns->head);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
lim = queue_limits_start_update(ns->head->disk->queue);
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name);
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
blk_mq_unfreeze_queue(ns->head->disk->queue);
} }
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) return ret;
{
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
return nvme_update_ns_info_generic(ns, info);
}
return nvme_update_ns_info_block(ns, info);
case NVME_CSI_NVM:
return nvme_update_ns_info_block(ns, info);
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
return nvme_update_ns_info_generic(ns, info);
}
} }
#ifdef CONFIG_BLK_SED_OPAL #ifdef CONFIG_BLK_SED_OPAL
@ -2877,7 +2897,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->awupf = le16_to_cpu(id->awupf); subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys); nvme_mpath_default_iopolicy(subsys);
subsys->dev.class = nvme_subsys_class; subsys->dev.class = &nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem; subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups; subsys->dev.groups = nvme_subsys_attrs_groups;
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
@ -3117,11 +3137,17 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
return -EINVAL; return -EINVAL;
} }
if (!ctrl->maxcmd) {
dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
return -EINVAL;
}
return 0; return 0;
} }
static int nvme_init_identify(struct nvme_ctrl *ctrl) static int nvme_init_identify(struct nvme_ctrl *ctrl)
{ {
struct queue_limits lim;
struct nvme_id_ctrl *id; struct nvme_id_ctrl *id;
u32 max_hw_sectors; u32 max_hw_sectors;
bool prev_apst_enabled; bool prev_apst_enabled;
@ -3188,7 +3214,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->max_hw_sectors = ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
nvme_set_queue_limits(ctrl, ctrl->admin_q); lim = queue_limits_start_update(ctrl->admin_q);
nvme_set_ctrl_limits(ctrl, &lim);
ret = queue_limits_commit_update(ctrl->admin_q, &lim);
if (ret)
goto out_free;
ctrl->sgls = le32_to_cpu(id->sgls); ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas); ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan); ctrl->max_namespaces = le32_to_cpu(id->mnan);
@ -3420,7 +3451,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
if (minor < 0) if (minor < 0)
return minor; return minor;
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
cdev_device->class = nvme_ns_chr_class; cdev_device->class = &nvme_ns_chr_class;
cdev_device->release = nvme_cdev_rel; cdev_device->release = nvme_cdev_rel;
device_initialize(cdev_device); device_initialize(cdev_device);
cdev_init(cdev, fops); cdev_init(cdev, fops);
@ -3692,7 +3723,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns) if (!ns)
return; return;
disk = blk_mq_alloc_disk(ctrl->tagset, ns); disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns);
if (IS_ERR(disk)) if (IS_ERR(disk))
goto out_free_ns; goto out_free_ns;
disk->fops = &nvme_bdev_ops; disk->fops = &nvme_bdev_ops;
@ -4353,6 +4384,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size) const struct blk_mq_ops *ops, unsigned int cmd_size)
{ {
struct queue_limits lim = {};
int ret; int ret;
memset(set, 0, sizeof(*set)); memset(set, 0, sizeof(*set));
@ -4372,14 +4404,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret) if (ret)
return ret; return ret;
ctrl->admin_q = blk_mq_init_queue(set); ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->admin_q)) { if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q); ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset; goto out_free_tagset;
} }
if (ctrl->ops->flags & NVME_F_FABRICS) { if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->fabrics_q = blk_mq_init_queue(set); ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->fabrics_q)) { if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q); ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q; goto out_cleanup_admin_q;
@ -4443,7 +4475,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret; return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) { if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->connect_q = blk_mq_init_queue(set); ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->connect_q)) { if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q); ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set; goto out_free_tag_set;
@ -4613,7 +4645,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device = &ctrl->ctrl_device; ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance); ctrl->instance);
ctrl->device->class = nvme_class; ctrl->device->class = &nvme_class;
ctrl->device->parent = ctrl->dev; ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups) if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups; ctrl->device->groups = ops->dev_attr_groups;
@ -4846,42 +4878,36 @@ static int __init nvme_core_init(void)
if (result < 0) if (result < 0)
goto destroy_delete_wq; goto destroy_delete_wq;
nvme_class = class_create("nvme"); result = class_register(&nvme_class);
if (IS_ERR(nvme_class)) { if (result)
result = PTR_ERR(nvme_class);
goto unregister_chrdev; goto unregister_chrdev;
}
nvme_class->dev_uevent = nvme_class_uevent;
nvme_subsys_class = class_create("nvme-subsystem"); result = class_register(&nvme_subsys_class);
if (IS_ERR(nvme_subsys_class)) { if (result)
result = PTR_ERR(nvme_subsys_class);
goto destroy_class; goto destroy_class;
}
result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
"nvme-generic"); "nvme-generic");
if (result < 0) if (result < 0)
goto destroy_subsys_class; goto destroy_subsys_class;
nvme_ns_chr_class = class_create("nvme-generic"); result = class_register(&nvme_ns_chr_class);
if (IS_ERR(nvme_ns_chr_class)) { if (result)
result = PTR_ERR(nvme_ns_chr_class);
goto unregister_generic_ns; goto unregister_generic_ns;
}
result = nvme_init_auth(); result = nvme_init_auth();
if (result) if (result)
goto destroy_ns_chr; goto destroy_ns_chr;
return 0; return 0;
destroy_ns_chr: destroy_ns_chr:
class_destroy(nvme_ns_chr_class); class_unregister(&nvme_ns_chr_class);
unregister_generic_ns: unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class: destroy_subsys_class:
class_destroy(nvme_subsys_class); class_unregister(&nvme_subsys_class);
destroy_class: destroy_class:
class_destroy(nvme_class); class_unregister(&nvme_class);
unregister_chrdev: unregister_chrdev:
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq: destroy_delete_wq:
@ -4897,9 +4923,9 @@ out:
static void __exit nvme_core_exit(void) static void __exit nvme_core_exit(void)
{ {
nvme_exit_auth(); nvme_exit_auth();
class_destroy(nvme_ns_chr_class); class_unregister(&nvme_ns_chr_class);
class_destroy(nvme_subsys_class); class_unregister(&nvme_subsys_class);
class_destroy(nvme_class); class_unregister(&nvme_class);
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_delete_wq);

View File

@ -638,7 +638,7 @@ static struct key *nvmf_parse_key(int key_id)
} }
key = key_lookup(key_id); key = key_lookup(key_id);
if (!IS_ERR(key)) if (IS_ERR(key))
pr_err("key id %08x not found\n", key_id); pr_err("key id %08x not found\n", key_id);
else else
pr_debug("Using key id %08x\n", key_id); pr_debug("Using key id %08x\n", key_id);
@ -1319,7 +1319,10 @@ out_free_opts:
return ERR_PTR(ret); return ERR_PTR(ret);
} }
static struct class *nvmf_class; static const struct class nvmf_class = {
.name = "nvme-fabrics",
};
static struct device *nvmf_device; static struct device *nvmf_device;
static DEFINE_MUTEX(nvmf_dev_mutex); static DEFINE_MUTEX(nvmf_dev_mutex);
@ -1439,15 +1442,14 @@ static int __init nvmf_init(void)
if (!nvmf_default_host) if (!nvmf_default_host)
return -ENOMEM; return -ENOMEM;
nvmf_class = class_create("nvme-fabrics"); ret = class_register(&nvmf_class);
if (IS_ERR(nvmf_class)) { if (ret) {
pr_err("couldn't register class nvme-fabrics\n"); pr_err("couldn't register class nvme-fabrics\n");
ret = PTR_ERR(nvmf_class);
goto out_free_host; goto out_free_host;
} }
nvmf_device = nvmf_device =
device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
if (IS_ERR(nvmf_device)) { if (IS_ERR(nvmf_device)) {
pr_err("couldn't create nvme-fabrics device!\n"); pr_err("couldn't create nvme-fabrics device!\n");
ret = PTR_ERR(nvmf_device); ret = PTR_ERR(nvmf_device);
@ -1463,9 +1465,9 @@ static int __init nvmf_init(void)
return 0; return 0;
out_destroy_device: out_destroy_device:
device_destroy(nvmf_class, MKDEV(0, 0)); device_destroy(&nvmf_class, MKDEV(0, 0));
out_destroy_class: out_destroy_class:
class_destroy(nvmf_class); class_unregister(&nvmf_class);
out_free_host: out_free_host:
nvmf_host_put(nvmf_default_host); nvmf_host_put(nvmf_default_host);
return ret; return ret;
@ -1474,8 +1476,8 @@ out_free_host:
static void __exit nvmf_exit(void) static void __exit nvmf_exit(void)
{ {
misc_deregister(&nvmf_misc); misc_deregister(&nvmf_misc);
device_destroy(nvmf_class, MKDEV(0, 0)); device_destroy(&nvmf_class, MKDEV(0, 0));
class_destroy(nvmf_class); class_unregister(&nvmf_class);
nvmf_host_put(nvmf_default_host); nvmf_host_put(nvmf_default_host);
BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64); BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);

View File

@ -516,6 +516,7 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{ {
struct queue_limits lim;
bool vwc = false; bool vwc = false;
mutex_init(&head->lock); mutex_init(&head->lock);
@ -532,9 +533,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
!nvme_is_unique_nsid(ctrl, head) || !multipath) !nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0; return 0;
head->disk = blk_alloc_disk(ctrl->numa_node); blk_set_stacking_limits(&lim);
if (!head->disk) lim.dma_alignment = 3;
return -ENOMEM; if (head->ids.csi != NVME_CSI_ZNS)
lim.max_zone_append_sectors = 0;
head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
if (IS_ERR(head->disk))
return PTR_ERR(head->disk);
head->disk->fops = &nvme_ns_head_ops; head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head; head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d", sprintf(head->disk->disk_name, "nvme%dn%d",
@ -553,11 +559,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
/* set to a default value of 512 until the disk is validated */
blk_queue_logical_block_size(head->disk->queue, 512);
blk_set_stacking_limits(&head->disk->queue->limits);
blk_queue_dma_alignment(head->disk->queue, 3);
/* we need to propagate up the VMC settings */ /* we need to propagate up the VMC settings */
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true; vwc = true;

View File

@ -464,6 +464,7 @@ struct nvme_ns_head {
u16 ms; u16 ms;
u16 pi_size; u16 pi_size;
u8 pi_type; u8 pi_type;
u8 pi_offset;
u8 guard_type; u8 guard_type;
u16 sgs; u16 sgs;
u32 sws; u32 sws;
@ -1035,11 +1036,11 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
} }
#endif /* CONFIG_NVME_MULTIPATH */ #endif /* CONFIG_NVME_MULTIPATH */
int nvme_revalidate_zones(struct nvme_ns *ns);
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data); unsigned int nr_zones, report_zones_cb cb, void *data);
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf,
struct queue_limits *lim);
#ifdef CONFIG_BLK_DEV_ZONED #ifdef CONFIG_BLK_DEV_ZONED
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd, struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action); enum nvme_zone_mgmt_action action);
@ -1050,13 +1051,6 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
{ {
return BLK_STS_NOTSUPP; return BLK_STS_NOTSUPP;
} }
static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
{
dev_warn(ns->ctrl->device,
"Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
return -EPROTONOSUPPORT;
}
#endif #endif
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)

View File

@ -1006,6 +1006,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{ {
int ret; int ret;
bool changed; bool changed;
u16 max_queue_size;
ret = nvme_rdma_configure_admin_queue(ctrl, new); ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret) if (ret)
@ -1030,11 +1031,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
} }
if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) { if (ctrl->ctrl.max_integrity_segments)
max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE;
else
max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
if (ctrl->ctrl.sqsize + 1 > max_queue_size) {
dev_warn(ctrl->ctrl.device, dev_warn(ctrl->ctrl.device,
"ctrl sqsize %u > max queue size %u, clamping down\n", "ctrl sqsize %u > max queue size %u, clamping down\n",
ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE); ctrl->ctrl.sqsize + 1, max_queue_size);
ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1; ctrl->ctrl.sqsize = max_queue_size - 1;
} }
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {

View File

@ -221,14 +221,11 @@ static int ns_update_nuse(struct nvme_ns *ns)
ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id); ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
if (ret) if (ret)
goto out_free_id; return ret;
ns->head->nuse = le64_to_cpu(id->nuse); ns->head->nuse = le64_to_cpu(id->nuse);
out_free_id:
kfree(id); kfree(id);
return 0;
return ret;
} }
static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,

View File

@ -7,16 +7,6 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include "nvme.h" #include "nvme.h"
int nvme_revalidate_zones(struct nvme_ns *ns)
{
struct request_queue *q = ns->queue;
blk_queue_chunk_sectors(q, ns->head->zsze);
blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
return blk_revalidate_disk_zones(ns->disk, NULL);
}
static int nvme_set_max_append(struct nvme_ctrl *ctrl) static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{ {
struct nvme_command c = { }; struct nvme_command c = { };
@ -45,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
return 0; return 0;
} }
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf,
struct queue_limits *lim)
{ {
struct nvme_effects_log *log = ns->head->effects; struct nvme_effects_log *log = ns->head->effects;
struct request_queue *q = ns->queue;
struct nvme_command c = { }; struct nvme_command c = { };
struct nvme_id_ns_zns *id; struct nvme_id_ns_zns *id;
int status; int status;
@ -109,10 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data; goto free_data;
} }
disk_set_zoned(ns->disk); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); lim->zoned = 1;
disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1); lim->max_open_zones = le32_to_cpu(id->mor) + 1;
disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1); lim->max_active_zones = le32_to_cpu(id->mar) + 1;
lim->chunk_sectors = ns->head->zsze;
lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
free_data: free_data:
kfree(id); kfree(id);
return status; return status;

View File

@ -428,7 +428,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->cqes = (0x4 << 4) | 0x4; id->cqes = (0x4 << 4) | 0x4;
/* no enforcement soft-limit for maxcmd - pick arbitrary high value */ /* no enforcement soft-limit for maxcmd - pick arbitrary high value */
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl));
id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);

View File

@ -273,6 +273,32 @@ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
CONFIGFS_ATTR(nvmet_, param_inline_data_size); CONFIGFS_ATTR(nvmet_, param_inline_data_size);
static ssize_t nvmet_param_max_queue_size_show(struct config_item *item,
char *page)
{
struct nvmet_port *port = to_nvmet_port(item);
return snprintf(page, PAGE_SIZE, "%d\n", port->max_queue_size);
}
static ssize_t nvmet_param_max_queue_size_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
int ret;
if (nvmet_is_port_enabled(port, __func__))
return -EACCES;
ret = kstrtoint(page, 0, &port->max_queue_size);
if (ret) {
pr_err("Invalid value '%s' for max_queue_size\n", page);
return -EINVAL;
}
return count;
}
CONFIGFS_ATTR(nvmet_, param_max_queue_size);
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
static ssize_t nvmet_param_pi_enable_show(struct config_item *item, static ssize_t nvmet_param_pi_enable_show(struct config_item *item,
char *page) char *page)
@ -1859,6 +1885,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
&nvmet_attr_addr_trtype, &nvmet_attr_addr_trtype,
&nvmet_attr_addr_tsas, &nvmet_attr_addr_tsas,
&nvmet_attr_param_inline_data_size, &nvmet_attr_param_inline_data_size,
&nvmet_attr_param_max_queue_size,
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
&nvmet_attr_param_pi_enable, &nvmet_attr_param_pi_enable,
#endif #endif
@ -1917,6 +1944,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals); INIT_LIST_HEAD(&port->referrals);
port->inline_data_size = -1; /* < 0 == let the transport choose */ port->inline_data_size = -1; /* < 0 == let the transport choose */
port->max_queue_size = -1; /* < 0 == let the transport choose */
port->disc_addr.portid = cpu_to_le16(portid); port->disc_addr.portid = cpu_to_le16(portid);
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX; port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX;

Some files were not shown because too many files have changed in this diff Show More