ublk: enable zoned storage support

Add zoned storage support to ublk: report_zones and operations:
 - REQ_OP_ZONE_OPEN
 - REQ_OP_ZONE_CLOSE
 - REQ_OP_ZONE_FINISH
 - REQ_OP_ZONE_RESET
 - REQ_OP_ZONE_APPEND

The zone append feature uses the `addr` field of `struct ublksrv_io_cmd` to
communicate ALBA back to the kernel. Therefore ublk must be used with the
user copy feature (UBLK_F_USER_COPY) for zoned storage support to be
available. Without this feature, ublk will not allow zoned storage support.

Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230804114610.179530-4-nmi@metaspace.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Andreas Hindborg 2023-08-04 13:46:10 +02:00 committed by Jens Axboe
parent 1a6e88b959
commit 29802d7ca3
2 changed files with 370 additions and 24 deletions

View File

@ -56,16 +56,21 @@
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_UNPRIVILEGED_DEV \
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY)
| UBLK_F_USER_COPY \
| UBLK_F_ZONED)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
#define UBLK_PARAM_TYPE_ALL \
(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
struct ublk_rq_data {
struct llist_node node;
struct kref ref;
__u64 sector;
__u32 operation;
__u32 nr_zones;
};
struct ublk_uring_cmd_pdu {
@ -185,11 +190,263 @@ struct ublk_params_header {
__u32 types;
};
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_USER_COPY;
}
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
}
static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_ZONED;
}
#ifdef CONFIG_BLK_DEV_ZONED
static int ublk_get_nr_zones(const struct ublk_device *ub)
{
const struct ublk_param_basic *p = &ub->params.basic;
/* Zone size is a power of 2 */
return p->dev_sectors >> ilog2(p->chunk_sectors);
}
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
return blk_revalidate_disk_zones(ub->ub_disk, NULL);
}
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
const struct ublk_param_zoned *p = &ub->params.zoned;
int nr_zones;
if (!ublk_dev_is_zoned(ub))
return -EINVAL;
if (!p->max_zone_append_sectors)
return -EINVAL;
nr_zones = ublk_get_nr_zones(ub);
if (p->max_active_zones > nr_zones)
return -EINVAL;
if (p->max_open_zones > nr_zones)
return -EINVAL;
return 0;
}
static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
const struct ublk_param_zoned *p = &ub->params.zoned;
disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
blk_queue_required_elevator_features(ub->ub_disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE);
disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
return 0;
}
/* Based on virtblk_alloc_report_buffer */
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
unsigned int nr_zones, size_t *buflen)
{
struct request_queue *q = ublk->ub_disk->queue;
size_t bufsize;
void *buf;
nr_zones = min_t(unsigned int, nr_zones,
ublk->ub_disk->nr_zones);
bufsize = nr_zones * sizeof(struct blk_zone);
bufsize =
min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
while (bufsize >= sizeof(struct blk_zone)) {
buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
if (buf) {
*buflen = bufsize;
return buf;
}
bufsize >>= 1;
}
*buflen = 0;
return NULL;
}
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
{
struct ublk_device *ub = disk->private_data;
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
unsigned int first_zone = sector >> ilog2(zone_size_sectors);
unsigned int done_zones = 0;
unsigned int max_zones_per_request;
int ret;
struct blk_zone *buffer;
size_t buffer_length;
nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
nr_zones);
buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
if (!buffer)
return -ENOMEM;
max_zones_per_request = buffer_length / sizeof(struct blk_zone);
while (done_zones < nr_zones) {
unsigned int remaining_zones = nr_zones - done_zones;
unsigned int zones_in_request =
min_t(unsigned int, remaining_zones, max_zones_per_request);
struct request *req;
struct ublk_rq_data *pdu;
blk_status_t status;
memset(buffer, 0, buffer_length);
req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
goto out;
}
pdu = blk_mq_rq_to_pdu(req);
pdu->operation = UBLK_IO_OP_REPORT_ZONES;
pdu->sector = sector;
pdu->nr_zones = zones_in_request;
ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
GFP_KERNEL);
if (ret) {
blk_mq_free_request(req);
goto out;
}
status = blk_execute_rq(req, 0);
ret = blk_status_to_errno(status);
blk_mq_free_request(req);
if (ret)
goto out;
for (unsigned int i = 0; i < zones_in_request; i++) {
struct blk_zone *zone = buffer + i;
/* A zero length zone means no more zones in this response */
if (!zone->len)
break;
ret = cb(zone, i, data);
if (ret)
goto out;
done_zones++;
sector += zone_size_sectors;
}
}
ret = done_zones;
out:
kvfree(buffer);
return ret;
}
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
struct request *req)
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
u32 ublk_op;
switch (req_op(req)) {
case REQ_OP_ZONE_OPEN:
ublk_op = UBLK_IO_OP_ZONE_OPEN;
break;
case REQ_OP_ZONE_CLOSE:
ublk_op = UBLK_IO_OP_ZONE_CLOSE;
break;
case REQ_OP_ZONE_FINISH:
ublk_op = UBLK_IO_OP_ZONE_FINISH;
break;
case REQ_OP_ZONE_RESET:
ublk_op = UBLK_IO_OP_ZONE_RESET;
break;
case REQ_OP_ZONE_APPEND:
ublk_op = UBLK_IO_OP_ZONE_APPEND;
break;
case REQ_OP_DRV_IN:
ublk_op = pdu->operation;
switch (ublk_op) {
case UBLK_IO_OP_REPORT_ZONES:
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_zones = pdu->nr_zones;
iod->start_sector = pdu->sector;
return BLK_STS_OK;
default:
return BLK_STS_IOERR;
}
case REQ_OP_ZONE_RESET_ALL:
case REQ_OP_DRV_OUT:
/* We do not support reset_all and drv_out */
return BLK_STS_NOTSUPP;
default:
return BLK_STS_IOERR;
}
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
iod->addr = io->addr;
return BLK_STS_OK;
}
#else
#define ublk_report_zones (NULL)
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
return -EOPNOTSUPP;
}
static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
return -EOPNOTSUPP;
}
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
return 0;
}
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
struct request *req)
{
return -EOPNOTSUPP;
}
#endif
static inline void __ublk_complete_rq(struct request *req);
static void ublk_complete_rq(struct kref *ref);
@ -286,6 +543,9 @@ static int ublk_validate_params(const struct ublk_device *ub)
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
return -EINVAL;
if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
return -EINVAL;
} else
return -EINVAL;
@ -304,6 +564,11 @@ static int ublk_validate_params(const struct ublk_device *ub)
if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
return -EINVAL;
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
return ublk_dev_param_zoned_validate(ub);
else if (ublk_dev_is_zoned(ub))
return -EINVAL;
return 0;
}
@ -317,6 +582,9 @@ static int ublk_apply_params(struct ublk_device *ub)
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
ublk_dev_param_discard_apply(ub);
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
return ublk_dev_param_zoned_apply(ub);
return 0;
}
@ -487,6 +755,7 @@ static const struct block_device_operations ub_fops = {
.owner = THIS_MODULE,
.open = ublk_open,
.free_disk = ublk_free_disk,
.report_zones = ublk_report_zones,
};
#define UBLK_MAX_PIN_PAGES 32
@ -601,7 +870,8 @@ static inline bool ublk_need_map_req(const struct request *req)
static inline bool ublk_need_unmap_req(const struct request *req)
{
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
return ublk_rq_has_data(req) &&
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
@ -685,8 +955,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
enum req_op op = req_op(req);
u32 ublk_op;
if (!ublk_queue_is_zoned(ubq) &&
(op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
return -EIO;
switch (req_op(req)) {
case REQ_OP_READ:
ublk_op = UBLK_IO_OP_READ;
@ -704,6 +979,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
ublk_op = UBLK_IO_OP_WRITE_ZEROES;
break;
default:
if (ublk_queue_is_zoned(ubq))
return ublk_setup_iod_zoned(ubq, req);
return BLK_STS_IOERR;
}
@ -756,7 +1033,8 @@ static inline void __ublk_complete_rq(struct request *req)
*
* Both the two needn't unmap.
*/
if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
req_op(req) != REQ_OP_DRV_IN)
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
@ -1120,6 +1398,9 @@ static void ublk_commit_completion(struct ublk_device *ub,
/* find the io request and complete */
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ub_cmd->zone_append_lba;
if (req && likely(!blk_should_fake_timeout(req->q)))
ublk_put_req_ref(ubq, req);
}
@ -1468,8 +1749,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
req_op(req) == REQ_OP_READ))
goto out;
} else if (ub_cmd->addr) {
/* User copy requires addr to be unset */
} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
/*
* User copy requires addr to be unset when command is
* not zone append
*/
ret = -EINVAL;
goto out;
}
@ -1546,11 +1830,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
int ubuf_dir)
{
/* copy ubuf to request pages */
if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
ubuf_dir == ITER_SOURCE)
return true;
/* copy request pages to ubuf */
if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
if ((req_op(req) == REQ_OP_WRITE ||
req_op(req) == REQ_OP_ZONE_APPEND) &&
ubuf_dir == ITER_DEST)
return true;
return false;
@ -1889,17 +2176,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
get_device(&ub->cdev_dev);
ub->dev_info.state = UBLK_S_DEV_LIVE;
if (ublk_dev_is_zoned(ub)) {
ret = ublk_revalidate_disk_zones(ub);
if (ret)
goto out_put_cdev;
}
ret = add_disk(disk);
if (ret)
goto out_put_cdev;
set_bit(UB_STATE_USED, &ub->state);
out_put_cdev:
if (ret) {
/*
* Has to drop the reference since ->free_disk won't be
* called in case of add_disk failure.
*/
ub->dev_info.state = UBLK_S_DEV_DEAD;
ublk_put_device(ub);
goto out_put_disk;
}
set_bit(UB_STATE_USED, &ub->state);
out_put_disk:
if (ret)
put_disk(disk);
@ -2049,6 +2343,13 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
if (ublk_dev_is_user_copy(ub))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/* Zoned storage support requires user copy feature */
if (ublk_dev_is_zoned(ub) &&
(!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
ret = -EINVAL;
goto out_free_dev_number;
}
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;

View File

@ -176,6 +176,12 @@
/* Copy between request and user buffer by pread()/pwrite() */
#define UBLK_F_USER_COPY (1UL << 7)
/*
* User space sets this flag when setting up the device to request zoned storage support. Kernel may
* deny the request by returning an error.
*/
#define UBLK_F_ZONED (1ULL << 8)
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@ -232,9 +238,26 @@ struct ublksrv_ctrl_dev_info {
#define UBLK_IO_OP_READ 0
#define UBLK_IO_OP_WRITE 1
#define UBLK_IO_OP_FLUSH 2
#define UBLK_IO_OP_DISCARD 3
#define UBLK_IO_OP_WRITE_SAME 4
#define UBLK_IO_OP_WRITE_ZEROES 5
#define UBLK_IO_OP_DISCARD 3
#define UBLK_IO_OP_WRITE_SAME 4
#define UBLK_IO_OP_WRITE_ZEROES 5
#define UBLK_IO_OP_ZONE_OPEN 10
#define UBLK_IO_OP_ZONE_CLOSE 11
#define UBLK_IO_OP_ZONE_FINISH 12
#define UBLK_IO_OP_ZONE_APPEND 13
#define UBLK_IO_OP_ZONE_RESET 15
/*
* Construct a zone report. The report request is carried in `struct
* ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
* and shall indicate the first zone of the report. The `nr_zones` shall
* indicate how many zones should be reported at most. The report shall be
* delivered as a `struct blk_zone` array. To report fewer zones than requested,
* zero the last entry of the returned array.
*
* Related definitions(blk_zone, blk_zone_cond, blk_zone_type, ...) in
* include/uapi/linux/blkzoned.h are part of ublk UAPI.
*/
#define UBLK_IO_OP_REPORT_ZONES 18
#define UBLK_IO_F_FAILFAST_DEV (1U << 8)
#define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9)
@ -255,7 +278,10 @@ struct ublksrv_io_desc {
/* op: bit 0-7, flags: bit 8-31 */
__u32 op_flags;
__u32 nr_sectors;
union {
__u32 nr_sectors;
__u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */
};
/* start sector for this io */
__u64 start_sector;
@ -284,11 +310,21 @@ struct ublksrv_io_cmd {
/* io result, it is valid for COMMIT* command only */
__s32 result;
/*
* userspace buffer address in ublksrv daemon process, valid for
* FETCH* command only
*/
__u64 addr;
union {
/*
* userspace buffer address in ublksrv daemon process, valid for
* FETCH* command only
*
* `addr` should not be used when UBLK_F_USER_COPY is enabled,
* because userspace handles data copy by pread()/pwrite() over
* /dev/ublkcN. But in case of UBLK_F_ZONED, this union is
* re-used to pass back the allocated LBA for
* UBLK_IO_OP_ZONE_APPEND which actually depends on
* UBLK_F_USER_COPY
*/
__u64 addr;
__u64 zone_append_lba;
};
};
struct ublk_param_basic {
@ -331,6 +367,13 @@ struct ublk_param_devt {
__u32 disk_minor;
};
struct ublk_param_zoned {
__u32 max_open_zones;
__u32 max_active_zones;
__u32 max_zone_append_sectors;
__u8 reserved[20];
};
struct ublk_params {
/*
* Total length of parameters, userspace has to set 'len' for both
@ -342,11 +385,13 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_BASIC (1 << 0)
#define UBLK_PARAM_TYPE_DISCARD (1 << 1)
#define UBLK_PARAM_TYPE_DEVT (1 << 2)
#define UBLK_PARAM_TYPE_ZONED (1 << 3)
__u32 types; /* types of parameter included */
struct ublk_param_basic basic;
struct ublk_param_discard discard;
struct ublk_param_devt devt;
struct ublk_param_zoned zoned;
};
#endif