for-5.15/drivers-2021-08-30

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmEs6LsQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpqnLD/9c8v7WTLjrDR6FLD8fHUmkwk9ss6OeyYJC
 Z62QOyk6BqNOu6FAwBax9wFaboXdUqOdpJU0PVQ7WJ5wBiCQ9DAZY6T+iwW0jE79
 +iOSqdXHVLAIyIM9GplzLH5AH3tx4445bX7fRWwWX1OgmSidkAhb25FusCvpcpHx
 1k+9dSLClLeHPR6jVT3k6tHv2RzPSw+/vYOggeWYA0YYPfoCx/Ft0uwO+PjKpvLQ
 Je5jASlLGYCXazswJBZgfjbroA97EuaLOmHHIHrwhkkFsbV6ewv6mlmanbMEs4fX
 Wh+axTt8so27g6gbw31EOcGsxTi0B37Jx9MOrSla6NdJoZkFE2sn6K+D5k4oeSrg
 QgYXL00U62eSgWmgSB0f0X081cQfI+FUMe5u5S368WdrgCPfaXl11zHw8nXw8gEW
 UvqR4zr3hQd4piXsIWl2bwZrmpPBCeB8iStLq3C92RLPFT6hJO3GM/ZmwTn+0HT0
 lMXzoEdkPywkKWi8aBbSgzXiGknNl8HAYnwMhcQjiHbYQOycGkI9pigJDNY9Ox1l
 fYHFSompmJ/XK8cIiU7QIglXEXJky5jQ89Ni0ryCstOaP20tPxWtkpOCgidXfNGz
 4lmQV8D5aBTUFs6ifPjXfiXUmDiU3SaxiFhAqaEkGII9BbkrNhlibB4LBAU+toi1
 Q0yGhGR/mg==
 =4uWF
 -----END PGP SIGNATURE-----

Merge tag 'for-5.15/drivers-2021-08-30' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:
 "Sitting on top of the core block changes, here are the driver changes
  for the 5.15 merge window:

   - NVMe updates via Christoph:
       - suspend improvements for devices with an HMB (Keith Busch)
       - handle double completions more gacefull (Sagi Grimberg)
       - cleanup the selects for the nvme core code a bit (Sagi Grimberg)
       - don't update queue count when failing to set io queues (Ruozhu Li)
       - various nvmet connect fixes (Amit Engel)
       - cleanup lightnvm leftovers (Keith Busch, me)
       - small cleanups (Colin Ian King, Hou Pu)
       - add tracing for the Set Features command (Hou Pu)
       - CMB sysfs cleanups (Keith Busch)
       - add a mutex_destroy call (Keith Busch)

   - remove lightnvm subsystem. It's served its purpose and ultimately
     led to zoned nvme support, we no longer need it (Christoph)

   - revert floppy O_NDELAY fix (Denis)

   - nbd fixes (Hou, Pavel, Baokun)

   - nbd locking fixes (Tetsuo)

   - nbd device removal fixes (Christoph)

   - raid10 rcu warning fix (Xiao)

   - raid1 write behind fix (Guoqing)

   - rnbd fixes (Gioh, Md Haris)

   - misc fixes (Colin)"

* tag 'for-5.15/drivers-2021-08-30' of git://git.kernel.dk/linux-block: (42 commits)
  Revert "floppy: reintroduce O_NDELAY fix"
  raid1: ensure write behind bio has less than BIO_MAX_VECS sectors
  md/raid10: Remove unnecessary rcu_dereference in raid10_handle_discard
  nbd: remove nbd->destroy_complete
  nbd: only return usable devices from nbd_find_unused
  nbd: set nbd->index before releasing nbd_index_mutex
  nbd: prevent IDR lookups from finding partially initialized devices
  nbd: reset NBD to NULL when restarting in nbd_genl_connect
  nbd: add missing locking to the nbd_dev_add error path
  nvme: remove the unused NVME_NS_* enum
  nvme: remove nvm_ndev from ns
  nvme: Have NVME_FABRICS select NVME_CORE instead of transport drivers
  block: nbd: add sanity check for first_minor
  nvmet: check that host sqsize does not exceed ctrl MQES
  nvmet: avoid duplicate qid in connect cmd
  nvmet: pass back cntlid on successful completion
  nvme-rdma: don't update queue count when failing to set io queues
  nvme-tcp: don't update queue count when failing to set io queues
  nvme-tcp: pair send_mutex init with destroy
  nvme: allow user toggling hmb usage
  ...
This commit is contained in:
Linus Torvalds 2021-08-30 19:01:46 -07:00
commit 9a1d6c9e3f
52 changed files with 463 additions and 13927 deletions

View File

@ -85,7 +85,6 @@ available subsections can be seen below.
io-mapping
io_ordering
generic-counter
lightnvm-pblk
memory-devices/index
men-chameleon-bus
ntb

View File

@ -1,21 +0,0 @@
pblk: Physical Block Device Target
==================================
pblk implements a fully associative, host-based FTL that exposes a traditional
block I/O interface. Its primary responsibilities are:
- Map logical addresses onto physical addresses (4KB granularity) in a
logical-to-physical (L2P) table.
- Maintain the integrity and consistency of the L2P table as well as its
recovery from normal tear down and power outage.
- Deal with controller- and media-specific constrains.
- Handle I/O errors.
- Implement garbage collection.
- Maintain consistency across the I/O stack during synchronization points.
For more information please refer to:
http://lightnvm.io
which maintains updated FAQs, manual pages, technical documentation, tools,
contacts, etc.

View File

@ -160,7 +160,6 @@ Code Seq# Include File Comments
'K' all linux/kd.h
'L' 00-1F linux/loop.h conflict!
'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict!
'L' 20-2F linux/lightnvm.h
'L' E0-FF linux/ppdd.h encrypted disk device driver
<http://linux01.gwdg.de/~alatham/ppdd.html>
'M' all linux/soundcard.h conflict!

View File

@ -10619,15 +10619,6 @@ F: LICENSES/
F: scripts/spdxcheck-test.sh
F: scripts/spdxcheck.py
LIGHTNVM PLATFORM SUPPORT
M: Matias Bjorling <mb@lightnvm.io>
L: linux-block@vger.kernel.org
S: Maintained
W: http://github/OpenChannelSSD
F: drivers/lightnvm/
F: include/linux/lightnvm.h
F: include/uapi/linux/lightnvm.h
LINEAR RANGES HELPERS
M: Mark Brown <broonie@kernel.org>
R: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>

View File

@ -51,8 +51,6 @@ source "drivers/net/Kconfig"
source "drivers/isdn/Kconfig"
source "drivers/lightnvm/Kconfig"
# input before char - char/joystick depends on it. As does USB.
source "drivers/input/Kconfig"

View File

@ -70,7 +70,6 @@ obj-$(CONFIG_FB_I810) += video/fbdev/i810/
obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
obj-$(CONFIG_PARPORT) += parport/
obj-$(CONFIG_NVM) += lightnvm/
obj-y += base/ block/ misc/ mfd/ nfc/
obj-$(CONFIG_LIBNVDIMM) += nvdimm/
obj-$(CONFIG_DAX) += dax/

View File

@ -4029,23 +4029,23 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
if (fdc_state[FDC(drive)].rawcmd == 1)
fdc_state[FDC(drive)].rawcmd = 2;
if (mode & (FMODE_READ|FMODE_WRITE)) {
drive_state[drive].last_checked = 0;
clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags);
if (bdev_check_media_change(bdev))
floppy_revalidate(bdev->bd_disk);
if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags))
goto out;
if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags))
if (!(mode & FMODE_NDELAY)) {
if (mode & (FMODE_READ|FMODE_WRITE)) {
drive_state[drive].last_checked = 0;
clear_bit(FD_OPEN_SHOULD_FAIL_BIT,
&drive_state[drive].flags);
if (bdev_check_media_change(bdev))
floppy_revalidate(bdev->bd_disk);
if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags))
goto out;
if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags))
goto out;
}
res = -EROFS;
if ((mode & FMODE_WRITE) &&
!test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags))
goto out;
}
res = -EROFS;
if ((mode & FMODE_WRITE) &&
!test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags))
goto out;
mutex_unlock(&open_lock);
mutex_unlock(&floppy_mutex);
return 0;

View File

@ -49,6 +49,7 @@
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
static struct workqueue_struct *nbd_del_wq;
static int nbd_total_devices = 0;
struct nbd_sock {
@ -113,12 +114,12 @@ struct nbd_device {
struct mutex config_lock;
struct gendisk *disk;
struct workqueue_struct *recv_workq;
struct work_struct remove_work;
struct list_head list;
struct task_struct *task_recv;
struct task_struct *task_setup;
struct completion *destroy_complete;
unsigned long flags;
char *backend;
@ -237,32 +238,36 @@ static void nbd_dev_remove(struct nbd_device *nbd)
{
struct gendisk *disk = nbd->disk;
if (disk) {
del_gendisk(disk);
blk_cleanup_disk(disk);
blk_mq_free_tag_set(&nbd->tag_set);
}
del_gendisk(disk);
blk_cleanup_disk(disk);
blk_mq_free_tag_set(&nbd->tag_set);
/*
* Place this in the last just before the nbd is freed to
* make sure that the disk and the related kobject are also
* totally removed to avoid duplicate creation of the same
* one.
* Remove from idr after del_gendisk() completes, so if the same ID is
* reused, the following add_disk() will succeed.
*/
if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
complete(nbd->destroy_complete);
mutex_lock(&nbd_index_mutex);
idr_remove(&nbd_index_idr, nbd->index);
mutex_unlock(&nbd_index_mutex);
kfree(nbd);
}
static void nbd_dev_remove_work(struct work_struct *work)
{
nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
}
static void nbd_put(struct nbd_device *nbd)
{
if (refcount_dec_and_mutex_lock(&nbd->refs,
&nbd_index_mutex)) {
idr_remove(&nbd_index_idr, nbd->index);
if (!refcount_dec_and_test(&nbd->refs))
return;
/* Call del_gendisk() asynchrounously to prevent deadlock */
if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
queue_work(nbd_del_wq, &nbd->remove_work);
else
nbd_dev_remove(nbd);
mutex_unlock(&nbd_index_mutex);
}
}
static int nbd_disconnected(struct nbd_config *config)
@ -1388,6 +1393,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
struct nbd_config *config = nbd->config;
loff_t bytesize;
switch (cmd) {
case NBD_DISCONNECT:
@ -1402,8 +1408,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
case NBD_SET_SIZE:
return nbd_set_size(nbd, arg, config->blksize);
case NBD_SET_SIZE_BLOCKS:
return nbd_set_size(nbd, arg * config->blksize,
config->blksize);
if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize))
return -EINVAL;
return nbd_set_size(nbd, bytesize, config->blksize);
case NBD_SET_TIMEOUT:
nbd_set_cmd_timeout(nbd, arg);
return 0;
@ -1665,7 +1672,7 @@ static const struct blk_mq_ops nbd_mq_ops = {
.timeout = nbd_xmit_timeout,
};
static int nbd_dev_add(int index)
static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{
struct nbd_device *nbd;
struct gendisk *disk;
@ -1683,13 +1690,14 @@ static int nbd_dev_add(int index)
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd;
nbd->destroy_complete = NULL;
INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
nbd->backend = NULL;
err = blk_mq_alloc_tag_set(&nbd->tag_set);
if (err)
goto out_free_nbd;
mutex_lock(&nbd_index_mutex);
if (index >= 0) {
err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
GFP_KERNEL);
@ -1700,9 +1708,10 @@ static int nbd_dev_add(int index)
if (err >= 0)
index = err;
}
nbd->index = index;
mutex_unlock(&nbd_index_mutex);
if (err < 0)
goto out_free_tags;
nbd->index = index;
disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
if (IS_ERR(disk)) {
@ -1726,38 +1735,65 @@ static int nbd_dev_add(int index)
mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0);
refcount_set(&nbd->refs, 1);
/*
* Start out with a zero references to keep other threads from using
* this device until it is fully initialized.
*/
refcount_set(&nbd->refs, 0);
INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
/* Too big first_minor can cause duplicate creation of
* sysfs files/links, since first_minor will be truncated to
* byte in __device_add_disk().
*/
disk->first_minor = index << part_shift;
if (disk->first_minor > 0xff) {
err = -EINVAL;
goto out_free_idr;
}
disk->minors = 1 << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
add_disk(disk);
/*
* Now publish the device.
*/
refcount_set(&nbd->refs, refs);
nbd_total_devices++;
return index;
return nbd;
out_free_idr:
mutex_lock(&nbd_index_mutex);
idr_remove(&nbd_index_idr, index);
mutex_unlock(&nbd_index_mutex);
out_free_tags:
blk_mq_free_tag_set(&nbd->tag_set);
out_free_nbd:
kfree(nbd);
out:
return err;
return ERR_PTR(err);
}
static int find_free_cb(int id, void *ptr, void *data)
static struct nbd_device *nbd_find_get_unused(void)
{
struct nbd_device *nbd = ptr;
struct nbd_device **found = data;
struct nbd_device *nbd;
int id;
if (!refcount_read(&nbd->config_refs)) {
*found = nbd;
return 1;
lockdep_assert_held(&nbd_index_mutex);
idr_for_each_entry(&nbd_index_idr, nbd, id) {
if (refcount_read(&nbd->config_refs) ||
test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
continue;
if (refcount_inc_not_zero(&nbd->refs))
return nbd;
}
return 0;
return NULL;
}
/* Netlink interface. */
@ -1806,8 +1842,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
{
DECLARE_COMPLETION_ONSTACK(destroy_complete);
struct nbd_device *nbd = NULL;
struct nbd_device *nbd;
struct nbd_config *config;
int index = -1;
int ret;
@ -1829,56 +1864,30 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
again:
mutex_lock(&nbd_index_mutex);
if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
if (ret == 0) {
int new_index;
new_index = nbd_dev_add(-1);
if (new_index < 0) {
mutex_unlock(&nbd_index_mutex);
printk(KERN_ERR "nbd: failed to add new device\n");
return new_index;
}
nbd = idr_find(&nbd_index_idr, new_index);
}
nbd = nbd_find_get_unused();
} else {
nbd = idr_find(&nbd_index_idr, index);
if (!nbd) {
ret = nbd_dev_add(index);
if (ret < 0) {
if (nbd) {
if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
printk(KERN_ERR "nbd: failed to add new device\n");
return ret;
pr_err("nbd: device at index %d is going down\n",
index);
return -EINVAL;
}
nbd = idr_find(&nbd_index_idr, index);
}
}
if (!nbd) {
printk(KERN_ERR "nbd: couldn't find device at index %d\n",
index);
mutex_unlock(&nbd_index_mutex);
return -EINVAL;
}
if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
nbd->destroy_complete = &destroy_complete;
mutex_unlock(&nbd_index_mutex);
/* Wait untill the the nbd stuff is totally destroyed */
wait_for_completion(&destroy_complete);
goto again;
}
if (!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
if (index == -1)
goto again;
printk(KERN_ERR "nbd: device at index %d is going down\n",
index);
return -EINVAL;
}
mutex_unlock(&nbd_index_mutex);
if (!nbd) {
nbd = nbd_dev_add(index, 2);
if (IS_ERR(nbd)) {
pr_err("nbd: failed to add new device\n");
return PTR_ERR(nbd);
}
}
mutex_lock(&nbd->config_lock);
if (refcount_read(&nbd->config_refs)) {
mutex_unlock(&nbd->config_lock);
@ -2424,16 +2433,21 @@ static int __init nbd_init(void)
if (register_blkdev(NBD_MAJOR, "nbd"))
return -EIO;
nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0);
if (!nbd_del_wq) {
unregister_blkdev(NBD_MAJOR, "nbd");
return -ENOMEM;
}
if (genl_register_family(&nbd_genl_family)) {
destroy_workqueue(nbd_del_wq);
unregister_blkdev(NBD_MAJOR, "nbd");
return -EINVAL;
}
nbd_dbg_init();
mutex_lock(&nbd_index_mutex);
for (i = 0; i < nbds_max; i++)
nbd_dev_add(i);
mutex_unlock(&nbd_index_mutex);
nbd_dev_add(i, 1);
return 0;
}
@ -2442,7 +2456,10 @@ static int nbd_exit_cb(int id, void *ptr, void *data)
struct list_head *list = (struct list_head *)data;
struct nbd_device *nbd = ptr;
list_add_tail(&nbd->list, list);
/* Skip nbd that is being removed asynchronously */
if (refcount_read(&nbd->refs))
list_add_tail(&nbd->list, list);
return 0;
}
@ -2465,6 +2482,9 @@ static void __exit nbd_cleanup(void)
nbd_put(nbd);
}
/* Also wait for nbd_dev_remove_work() completes */
destroy_workqueue(nbd_del_wq);
idr_destroy(&nbd_index_idr);
genl_unregister_family(&nbd_genl_family);
unregister_blkdev(NBD_MAJOR, "nbd");

View File

@ -227,17 +227,17 @@ static ssize_t state_show(struct kobject *kobj,
switch (dev->dev_state) {
case DEV_STATE_INIT:
return snprintf(page, PAGE_SIZE, "init\n");
return sysfs_emit(page, "init\n");
case DEV_STATE_MAPPED:
/* TODO fix cli tool before changing to proper state */
return snprintf(page, PAGE_SIZE, "open\n");
return sysfs_emit(page, "open\n");
case DEV_STATE_MAPPED_DISCONNECTED:
/* TODO fix cli tool before changing to proper state */
return snprintf(page, PAGE_SIZE, "closed\n");
return sysfs_emit(page, "closed\n");
case DEV_STATE_UNMAPPED:
return snprintf(page, PAGE_SIZE, "unmapped\n");
return sysfs_emit(page, "unmapped\n");
default:
return snprintf(page, PAGE_SIZE, "unknown\n");
return sysfs_emit(page, "unknown\n");
}
}
@ -263,7 +263,7 @@ static ssize_t mapping_path_show(struct kobject *kobj,
dev = container_of(kobj, struct rnbd_clt_dev, kobj);
return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname);
return sysfs_emit(page, "%s\n", dev->pathname);
}
static struct kobj_attribute rnbd_clt_mapping_path_attr =
@ -276,8 +276,7 @@ static ssize_t access_mode_show(struct kobject *kobj,
dev = container_of(kobj, struct rnbd_clt_dev, kobj);
return snprintf(page, PAGE_SIZE, "%s\n",
rnbd_access_mode_str(dev->access_mode));
return sysfs_emit(page, "%s\n", rnbd_access_mode_str(dev->access_mode));
}
static struct kobj_attribute rnbd_clt_access_mode =
@ -286,8 +285,8 @@ static struct kobj_attribute rnbd_clt_access_mode =
static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
return scnprintf(page, PAGE_SIZE, "Usage: echo <normal|force> > %s\n",
attr->attr.name);
return sysfs_emit(page, "Usage: echo <normal|force> > %s\n",
attr->attr.name);
}
static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj,
@ -357,9 +356,8 @@ static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *page)
{
return scnprintf(page, PAGE_SIZE,
"Usage: echo <new size in sectors> > %s\n",
attr->attr.name);
return sysfs_emit(page, "Usage: echo <new size in sectors> > %s\n",
attr->attr.name);
}
static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj,
@ -390,8 +388,7 @@ static struct kobj_attribute rnbd_clt_resize_dev_attr =
static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n",
attr->attr.name);
return sysfs_emit(page, "Usage: echo <1> > %s\n", attr->attr.name);
}
static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj,
@ -436,7 +433,7 @@ static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr,
dev = container_of(kobj, struct rnbd_clt_dev, kobj);
return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname);
return sysfs_emit(page, "%s\n", dev->sess->sessname);
}
static struct kobj_attribute rnbd_clt_session_attr =
@ -499,8 +496,8 @@ static ssize_t rnbd_clt_map_device_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *page)
{
return scnprintf(page, PAGE_SIZE,
"Usage: echo \"[dest_port=server port number] sessname=<name of the rtrs session> path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path=<full path on remote side> [access_mode=<ro|rw|migration>] [nr_poll_queues=<number of queues>]\" > %s\n\naddr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n",
return sysfs_emit(page,
"Usage: echo \"[dest_port=server port number] sessname=<name of the rtrs session> path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path=<full path on remote side> [access_mode=<ro|rw|migration>] [nr_poll_queues=<number of queues>]\" > %s\n\naddr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n",
attr->attr.name);
}

View File

@ -271,7 +271,7 @@ unlock:
*/
if (cpu_q)
*cpup = cpu_q->cpu;
put_cpu_var(sess->cpu_rr);
put_cpu_ptr(sess->cpu_rr);
if (q)
rnbd_clt_dev_requeue(q);

View File

@ -90,8 +90,8 @@ static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr,
sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
return scnprintf(page, PAGE_SIZE, "%d\n",
!(sess_dev->open_flags & FMODE_WRITE));
return sysfs_emit(page, "%d\n",
!(sess_dev->open_flags & FMODE_WRITE));
}
static struct kobj_attribute rnbd_srv_dev_session_ro_attr =
@ -105,8 +105,8 @@ static ssize_t access_mode_show(struct kobject *kobj,
sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
return scnprintf(page, PAGE_SIZE, "%s\n",
rnbd_access_mode_str(sess_dev->access_mode));
return sysfs_emit(page, "%s\n",
rnbd_access_mode_str(sess_dev->access_mode));
}
static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr =
@ -119,7 +119,7 @@ static ssize_t mapping_path_show(struct kobject *kobj,
sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname);
return sysfs_emit(page, "%s\n", sess_dev->pathname);
}
static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr =
@ -128,8 +128,8 @@ static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr =
static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n",
attr->attr.name);
return sysfs_emit(page, "Usage: echo 1 > %s\n",
attr->attr.name);
}
static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj,

View File

@ -1092,7 +1092,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
err = xlbd_reserve_minors(minor, nr_minors);
if (err)
return err;
err = -ENODEV;
memset(&info->tag_set, 0, sizeof(info->tag_set));
info->tag_set.ops = &blkfront_mq_ops;

View File

@ -1,44 +0,0 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# Open-Channel SSD NVM configuration
#
menuconfig NVM
bool "Open-Channel SSD target support (DEPRECATED)"
depends on BLOCK
help
Say Y here to get to enable Open-channel SSDs.
Open-Channel SSDs implement a set of extension to SSDs, that
exposes direct access to the underlying non-volatile memory.
If you say N, all options in this submenu will be skipped and disabled
only do this if you know what you are doing.
This code is deprecated and will be removed in Linux 5.15.
if NVM
config NVM_PBLK
tristate "Physical Block Device Open-Channel SSD target"
select CRC32
help
Allows an open-channel SSD to be exposed as a block device to the
host. The target assumes the device exposes raw flash and must be
explicitly managed by the host.
Please note the disk format is considered EXPERIMENTAL for now.
if NVM_PBLK
config NVM_PBLK_DEBUG
bool "PBlk Debug Support"
default n
help
Enables debug support for pblk. This includes extra checks, more
vocal error messages, and extra tracking fields in the pblk sysfs
entries.
endif # NVM_PBLK_DEBUG
endif # NVM

View File

@ -1,11 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for Open-Channel SSDs.
#
obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_PBLK) += pblk.o
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
pblk-write.o pblk-cache.o pblk-read.o \
pblk-gc.o pblk-recovery.o pblk-map.o \
pblk-rl.o pblk-sysfs.o

File diff suppressed because it is too large Load Diff

View File

@ -1,137 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-cache.c - pblk's write cache
*/
#include "pblk.h"
void pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
unsigned long flags)
{
struct pblk_w_ctx w_ctx;
sector_t lba = pblk_get_lba(bio);
unsigned long start_time;
unsigned int bpos, pos;
int nr_entries = pblk_get_secs(bio);
int i, ret;
start_time = bio_start_io_acct(bio);
/* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to
* rollback from here on.
*/
retry:
ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
switch (ret) {
case NVM_IO_REQUEUE:
io_schedule();
goto retry;
case NVM_IO_ERR:
pblk_pipeline_stop(pblk);
bio_io_error(bio);
goto out;
}
pblk_ppa_set_empty(&w_ctx.ppa);
w_ctx.flags = flags;
if (bio->bi_opf & REQ_PREFLUSH) {
w_ctx.flags |= PBLK_FLUSH_ENTRY;
pblk_write_kick(pblk);
}
if (unlikely(!bio_has_data(bio)))
goto out;
for (i = 0; i < nr_entries; i++) {
void *data = bio_data(bio);
w_ctx.lba = lba + i;
pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);
bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
}
atomic64_add(nr_entries, &pblk->user_wa);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(nr_entries, &pblk->inflight_writes);
atomic_long_add(nr_entries, &pblk->req_writes);
#endif
pblk_rl_inserted(&pblk->rl, nr_entries);
out:
bio_end_io_acct(bio, start_time);
pblk_write_should_kick(pblk);
if (ret == NVM_IO_DONE)
bio_endio(bio);
}
/*
* On GC the incoming lbas are not necessarily sequential. Also, some of the
* lbas might not be valid entries, which are marked as empty by the GC thread
*/
int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
{
struct pblk_w_ctx w_ctx;
unsigned int bpos, pos;
void *data = gc_rq->data;
int i, valid_entries;
/* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to
* rollback from here on.
*/
retry:
if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) {
io_schedule();
goto retry;
}
w_ctx.flags = PBLK_IOTYPE_GC;
pblk_ppa_set_empty(&w_ctx.ppa);
for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) {
if (gc_rq->lba_list[i] == ADDR_EMPTY)
continue;
w_ctx.lba = gc_rq->lba_list[i];
pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line,
gc_rq->paddr_list[i], pos);
data += PBLK_EXPOSED_PAGE_SIZE;
valid_entries++;
}
WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
"pblk: inconsistent GC write\n");
atomic64_add(valid_entries, &pblk->gc_wa);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(valid_entries, &pblk->inflight_writes);
atomic_long_add(valid_entries, &pblk->recov_gc_writes);
#endif
pblk_write_should_kick(pblk);
return NVM_IO_OK;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,726 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-gc.c - pblk's garbage collector
*/
#include "pblk.h"
#include "pblk-trace.h"
#include <linux/delay.h>
static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
{
vfree(gc_rq->data);
kfree(gc_rq);
}
static int pblk_gc_write(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
struct pblk_gc_rq *gc_rq, *tgc_rq;
LIST_HEAD(w_list);
spin_lock(&gc->w_lock);
if (list_empty(&gc->w_list)) {
spin_unlock(&gc->w_lock);
return 1;
}
list_cut_position(&w_list, &gc->w_list, gc->w_list.prev);
gc->w_entries = 0;
spin_unlock(&gc->w_lock);
list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
pblk_write_gc_to_cache(pblk, gc_rq);
list_del(&gc_rq->list);
kref_put(&gc_rq->line->ref, pblk_line_put);
pblk_gc_free_gc_rq(gc_rq);
}
return 0;
}
static void pblk_gc_writer_kick(struct pblk_gc *gc)
{
wake_up_process(gc->gc_writer_ts);
}
void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct list_head *move_list;
spin_lock(&l_mg->gc_lock);
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_GC);
line->state = PBLK_LINESTATE_CLOSED;
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
/* We need to reset gc_group in order to ensure that
* pblk_line_gc_list will return proper move_list
* since right now current line is not on any of the
* gc lists.
*/
line->gc_group = PBLK_LINEGC_NONE;
move_list = pblk_line_gc_list(pblk, line);
spin_unlock(&line->lock);
list_add_tail(&line->list, move_list);
spin_unlock(&l_mg->gc_lock);
}
static void pblk_gc_line_ws(struct work_struct *work)
{
struct pblk_line_ws *gc_rq_ws = container_of(work,
struct pblk_line_ws, ws);
struct pblk *pblk = gc_rq_ws->pblk;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line = gc_rq_ws->line;
struct pblk_gc_rq *gc_rq = gc_rq_ws->priv;
int ret;
up(&gc->gc_sem);
/* Read from GC victim block */
ret = pblk_submit_read_gc(pblk, gc_rq);
if (ret) {
line->w_err_gc->has_gc_err = 1;
goto out;
}
if (!gc_rq->secs_to_gc)
goto out;
retry:
spin_lock(&gc->w_lock);
if (gc->w_entries >= PBLK_GC_RQ_QD) {
spin_unlock(&gc->w_lock);
pblk_gc_writer_kick(&pblk->gc);
usleep_range(128, 256);
goto retry;
}
gc->w_entries++;
list_add_tail(&gc_rq->list, &gc->w_list);
spin_unlock(&gc->w_lock);
pblk_gc_writer_kick(&pblk->gc);
kfree(gc_rq_ws);
return;
out:
pblk_gc_free_gc_rq(gc_rq);
kref_put(&line->ref, pblk_line_put);
kfree(gc_rq_ws);
}
static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
struct pblk_line *line)
{
struct line_emeta *emeta_buf;
struct pblk_line_meta *lm = &pblk->lm;
unsigned int lba_list_size = lm->emeta_len[2];
__le64 *lba_list;
int ret;
emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
if (!emeta_buf)
return NULL;
ret = pblk_line_emeta_read(pblk, line, emeta_buf);
if (ret) {
pblk_err(pblk, "line %d read emeta failed (%d)\n",
line->id, ret);
kvfree(emeta_buf);
return NULL;
}
/* If this read fails, it means that emeta is corrupted.
* For now, leave the line untouched.
* TODO: Implement a recovery routine that scans and moves
* all sectors on the line.
*/
ret = pblk_recov_check_emeta(pblk, emeta_buf);
if (ret) {
pblk_err(pblk, "inconsistent emeta (line %d)\n",
line->id);
kvfree(emeta_buf);
return NULL;
}
lba_list = kvmalloc(lba_list_size, GFP_KERNEL);
if (lba_list)
memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
kvfree(emeta_buf);
return lba_list;
}
static void pblk_gc_line_prepare_ws(struct work_struct *work)
{
struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
ws);
struct pblk *pblk = line_ws->pblk;
struct pblk_line *line = line_ws->line;
struct pblk_line_meta *lm = &pblk->lm;
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line_ws *gc_rq_ws;
struct pblk_gc_rq *gc_rq;
__le64 *lba_list;
unsigned long *invalid_bitmap;
int sec_left, nr_secs, bit;
invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
if (!invalid_bitmap)
goto fail_free_ws;
if (line->w_err_gc->has_write_err) {
lba_list = line->w_err_gc->lba_list;
line->w_err_gc->lba_list = NULL;
} else {
lba_list = get_lba_list_from_emeta(pblk, line);
if (!lba_list) {
pblk_err(pblk, "could not interpret emeta (line %d)\n",
line->id);
goto fail_free_invalid_bitmap;
}
}
spin_lock(&line->lock);
bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line);
sec_left = pblk_line_vsc(line);
spin_unlock(&line->lock);
if (sec_left < 0) {
pblk_err(pblk, "corrupted GC line (%d)\n", line->id);
goto fail_free_lba_list;
}
bit = -1;
next_rq:
gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
if (!gc_rq)
goto fail_free_lba_list;
nr_secs = 0;
do {
bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line,
bit + 1);
if (bit > line->emeta_ssec)
break;
gc_rq->paddr_list[nr_secs] = bit;
gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]);
} while (nr_secs < pblk->max_write_pgs);
if (unlikely(!nr_secs)) {
kfree(gc_rq);
goto out;
}
gc_rq->nr_secs = nr_secs;
gc_rq->line = line;
gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
if (!gc_rq->data)
goto fail_free_gc_rq;
gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
if (!gc_rq_ws)
goto fail_free_gc_data;
gc_rq_ws->pblk = pblk;
gc_rq_ws->line = line;
gc_rq_ws->priv = gc_rq;
/* The write GC path can be much slower than the read GC one due to
* the budget imposed by the rate-limiter. Balance in case that we get
* back pressure from the write GC path.
*/
while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000)))
io_schedule();
kref_get(&line->ref);
INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws);
queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws);
sec_left -= nr_secs;
if (sec_left > 0)
goto next_rq;
out:
kvfree(lba_list);
kfree(line_ws);
kfree(invalid_bitmap);
kref_put(&line->ref, pblk_line_put);
atomic_dec(&gc->read_inflight_gc);
return;
fail_free_gc_data:
vfree(gc_rq->data);
fail_free_gc_rq:
kfree(gc_rq);
fail_free_lba_list:
kvfree(lba_list);
fail_free_invalid_bitmap:
kfree(invalid_bitmap);
fail_free_ws:
kfree(line_ws);
/* Line goes back to closed state, so we cannot release additional
* reference for line, since we do that only when we want to do
* gc to free line state transition.
*/
pblk_put_line_back(pblk, line);
atomic_dec(&gc->read_inflight_gc);
pblk_err(pblk, "failed to GC line %d\n", line->id);
}
static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_gc *gc = &pblk->gc;
struct pblk_line_ws *line_ws;
pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id);
line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
if (!line_ws)
return -ENOMEM;
line_ws->pblk = pblk;
line_ws->line = line;
atomic_inc(&gc->pipeline_gc);
INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws);
queue_work(gc->gc_reader_wq, &line_ws->ws);
return 0;
}
static void pblk_gc_reader_kick(struct pblk_gc *gc)
{
wake_up_process(gc->gc_reader_ts);
}
static void pblk_gc_kick(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
pblk_gc_writer_kick(gc);
pblk_gc_reader_kick(gc);
/* If we're shutting down GC, let's not start it up again */
if (gc->gc_enabled) {
wake_up_process(gc->gc_ts);
mod_timer(&gc->gc_timer,
jiffies + msecs_to_jiffies(GC_TIME_MSECS));
}
}
static int pblk_gc_read(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line;
spin_lock(&gc->r_lock);
if (list_empty(&gc->r_list)) {
spin_unlock(&gc->r_lock);
return 1;
}
line = list_first_entry(&gc->r_list, struct pblk_line, list);
list_del(&line->list);
spin_unlock(&gc->r_lock);
pblk_gc_kick(pblk);
if (pblk_gc_line(pblk, line)) {
pblk_err(pblk, "failed to GC line %d\n", line->id);
/* rollback */
spin_lock(&gc->r_lock);
list_add_tail(&line->list, &gc->r_list);
spin_unlock(&gc->r_lock);
}
return 0;
}
static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
struct list_head *group_list)
{
struct pblk_line *line, *victim;
unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L;
victim = list_first_entry(group_list, struct pblk_line, list);
list_for_each_entry(line, group_list, list) {
if (!atomic_read(&line->sec_to_update))
line_vsc = le32_to_cpu(*line->vsc);
if (line_vsc < victim_vsc) {
victim = line;
victim_vsc = le32_to_cpu(*victim->vsc);
}
}
if (victim_vsc == ~0x0)
return NULL;
return victim;
}
static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
{
unsigned int nr_blocks_free, nr_blocks_need;
unsigned int werr_lines = atomic_read(&rl->werr_lines);
nr_blocks_need = pblk_rl_high_thrs(rl);
nr_blocks_free = pblk_rl_nr_free_blks(rl);
/* This is not critical, no need to take lock here */
return ((werr_lines > 0) ||
((gc->gc_active) && (nr_blocks_need > nr_blocks_free)));
}
void pblk_gc_free_full_lines(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line;
do {
spin_lock(&l_mg->gc_lock);
if (list_empty(&l_mg->gc_full_list)) {
spin_unlock(&l_mg->gc_lock);
return;
}
line = list_first_entry(&l_mg->gc_full_list,
struct pblk_line, list);
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
line->state = PBLK_LINESTATE_GC;
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
spin_unlock(&line->lock);
list_del(&line->list);
spin_unlock(&l_mg->gc_lock);
atomic_inc(&gc->pipeline_gc);
kref_put(&line->ref, pblk_line_put);
} while (1);
}
/*
* Lines with no valid sectors will be returned to the free list immediately. If
* GC is activated - either because the free block count is under the determined
* threshold, or because it is being forced from user space - only lines with a
* high count of invalid sectors will be recycled.
*/
static void pblk_gc_run(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_gc *gc = &pblk->gc;
struct pblk_line *line;
struct list_head *group_list;
bool run_gc;
int read_inflight_gc, gc_group = 0, prev_group = 0;
pblk_gc_free_full_lines(pblk);
run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl);
if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD))
return;
next_gc_group:
group_list = l_mg->gc_lists[gc_group++];
do {
spin_lock(&l_mg->gc_lock);
line = pblk_gc_get_victim_line(pblk, group_list);
if (!line) {
spin_unlock(&l_mg->gc_lock);
break;
}
spin_lock(&line->lock);
WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
line->state = PBLK_LINESTATE_GC;
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
spin_unlock(&line->lock);
list_del(&line->list);
spin_unlock(&l_mg->gc_lock);
spin_lock(&gc->r_lock);
list_add_tail(&line->list, &gc->r_list);
spin_unlock(&gc->r_lock);
read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc);
pblk_gc_reader_kick(gc);
prev_group = 1;
/* No need to queue up more GC lines than we can handle */
run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl);
if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD)
break;
} while (1);
if (!prev_group && pblk->rl.rb_state > gc_group &&
gc_group < PBLK_GC_NR_LISTS)
goto next_gc_group;
}
static void pblk_gc_timer(struct timer_list *t)
{
struct pblk *pblk = from_timer(pblk, t, gc.gc_timer);
pblk_gc_kick(pblk);
}
static int pblk_gc_ts(void *data)
{
struct pblk *pblk = data;
while (!kthread_should_stop()) {
pblk_gc_run(pblk);
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
return 0;
}
static int pblk_gc_writer_ts(void *data)
{
struct pblk *pblk = data;
while (!kthread_should_stop()) {
if (!pblk_gc_write(pblk))
continue;
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
return 0;
}
static int pblk_gc_reader_ts(void *data)
{
struct pblk *pblk = data;
struct pblk_gc *gc = &pblk->gc;
while (!kthread_should_stop()) {
if (!pblk_gc_read(pblk))
continue;
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
#ifdef CONFIG_NVM_PBLK_DEBUG
pblk_info(pblk, "flushing gc pipeline, %d lines left\n",
atomic_read(&gc->pipeline_gc));
#endif
do {
if (!atomic_read(&gc->pipeline_gc))
break;
schedule();
} while (1);
return 0;
}
static void pblk_gc_start(struct pblk *pblk)
{
pblk->gc.gc_active = 1;
pblk_debug(pblk, "gc start\n");
}
void pblk_gc_should_start(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
if (gc->gc_enabled && !gc->gc_active) {
pblk_gc_start(pblk);
pblk_gc_kick(pblk);
}
}
void pblk_gc_should_stop(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
if (gc->gc_active && !gc->gc_forced)
gc->gc_active = 0;
}
void pblk_gc_should_kick(struct pblk *pblk)
{
pblk_rl_update_rates(&pblk->rl);
}
void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
int *gc_active)
{
struct pblk_gc *gc = &pblk->gc;
spin_lock(&gc->lock);
*gc_enabled = gc->gc_enabled;
*gc_active = gc->gc_active;
spin_unlock(&gc->lock);
}
int pblk_gc_sysfs_force(struct pblk *pblk, int force)
{
struct pblk_gc *gc = &pblk->gc;
if (force < 0 || force > 1)
return -EINVAL;
spin_lock(&gc->lock);
gc->gc_forced = force;
if (force)
gc->gc_enabled = 1;
else
gc->gc_enabled = 0;
spin_unlock(&gc->lock);
pblk_gc_should_start(pblk);
return 0;
}
int pblk_gc_init(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
int ret;
gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
if (IS_ERR(gc->gc_ts)) {
pblk_err(pblk, "could not allocate GC main kthread\n");
return PTR_ERR(gc->gc_ts);
}
gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
"pblk-gc-writer-ts");
if (IS_ERR(gc->gc_writer_ts)) {
pblk_err(pblk, "could not allocate GC writer kthread\n");
ret = PTR_ERR(gc->gc_writer_ts);
goto fail_free_main_kthread;
}
gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
"pblk-gc-reader-ts");
if (IS_ERR(gc->gc_reader_ts)) {
pblk_err(pblk, "could not allocate GC reader kthread\n");
ret = PTR_ERR(gc->gc_reader_ts);
goto fail_free_writer_kthread;
}
timer_setup(&gc->gc_timer, pblk_gc_timer, 0);
mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
gc->gc_active = 0;
gc->gc_forced = 0;
gc->gc_enabled = 1;
gc->w_entries = 0;
atomic_set(&gc->read_inflight_gc, 0);
atomic_set(&gc->pipeline_gc, 0);
/* Workqueue that reads valid sectors from a line and submit them to the
* GC writer to be recycled.
*/
gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
if (!gc->gc_line_reader_wq) {
pblk_err(pblk, "could not allocate GC line reader workqueue\n");
ret = -ENOMEM;
goto fail_free_reader_kthread;
}
/* Workqueue that prepare lines for GC */
gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!gc->gc_reader_wq) {
pblk_err(pblk, "could not allocate GC reader workqueue\n");
ret = -ENOMEM;
goto fail_free_reader_line_wq;
}
spin_lock_init(&gc->lock);
spin_lock_init(&gc->w_lock);
spin_lock_init(&gc->r_lock);
sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);
INIT_LIST_HEAD(&gc->w_list);
INIT_LIST_HEAD(&gc->r_list);
return 0;
fail_free_reader_line_wq:
destroy_workqueue(gc->gc_line_reader_wq);
fail_free_reader_kthread:
kthread_stop(gc->gc_reader_ts);
fail_free_writer_kthread:
kthread_stop(gc->gc_writer_ts);
fail_free_main_kthread:
kthread_stop(gc->gc_ts);
return ret;
}
void pblk_gc_exit(struct pblk *pblk, bool graceful)
{
struct pblk_gc *gc = &pblk->gc;
gc->gc_enabled = 0;
del_timer_sync(&gc->gc_timer);
gc->gc_active = 0;
if (gc->gc_ts)
kthread_stop(gc->gc_ts);
if (gc->gc_reader_ts)
kthread_stop(gc->gc_reader_ts);
if (graceful) {
flush_workqueue(gc->gc_reader_wq);
flush_workqueue(gc->gc_line_reader_wq);
}
destroy_workqueue(gc->gc_reader_wq);
destroy_workqueue(gc->gc_line_reader_wq);
if (gc->gc_writer_ts)
kthread_stop(gc->gc_writer_ts);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,210 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-map.c - pblk's lba-ppa mapping strategy
*
*/
#include "pblk.h"
static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
struct ppa_addr *ppa_list,
unsigned long *lun_bitmap,
void *meta_list,
unsigned int valid_secs)
{
struct pblk_line *line = pblk_line_get_data(pblk);
struct pblk_emeta *emeta;
struct pblk_w_ctx *w_ctx;
__le64 *lba_list;
u64 paddr;
int nr_secs = pblk->min_write_pgs;
int i;
if (!line)
return -ENOSPC;
if (pblk_line_is_full(line)) {
struct pblk_line *prev_line = line;
/* If we cannot allocate a new line, make sure to store metadata
* on current line and then fail
*/
line = pblk_line_replace_data(pblk);
pblk_line_close_meta(pblk, prev_line);
if (!line) {
pblk_pipeline_stop(pblk);
return -ENOSPC;
}
}
emeta = line->emeta;
lba_list = emeta_to_lbas(pblk, emeta->buf);
paddr = pblk_alloc_page(pblk, line, nr_secs);
for (i = 0; i < nr_secs; i++, paddr++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
/* ppa to be sent to the device */
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
/* Write context for target bio completion on write buffer. Note
* that the write buffer is protected by the sync backpointer,
* and a single writer thread have access to each specific entry
* at a time. Thus, it is safe to modify the context for the
* entry we are setting up for submission without taking any
* lock or memory barrier.
*/
if (i < valid_secs) {
kref_get(&line->ref);
atomic_inc(&line->sec_to_update);
w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
w_ctx->ppa = ppa_list[i];
meta->lba = cpu_to_le64(w_ctx->lba);
lba_list[paddr] = cpu_to_le64(w_ctx->lba);
if (lba_list[paddr] != addr_empty)
line->nr_valid_lbas++;
else
atomic64_inc(&pblk->pad_wa);
} else {
lba_list[paddr] = addr_empty;
meta->lba = addr_empty;
__pblk_map_invalidate(pblk, line, paddr);
}
}
pblk_down_rq(pblk, ppa_list[0], lun_bitmap);
return 0;
}
int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
unsigned long *lun_bitmap, unsigned int valid_secs,
unsigned int off)
{
void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
void *meta_buffer;
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
unsigned int map_secs;
int min = pblk->min_write_pgs;
int i;
int ret;
for (i = off; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
meta_buffer = pblk_get_meta(pblk, meta_list, i);
ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
lun_bitmap, meta_buffer, map_secs);
if (ret)
return ret;
}
return 0;
}
/* only if erase_ppa is set, acquire erase semaphore */
int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int sentry, unsigned long *lun_bitmap,
unsigned int valid_secs, struct ppa_addr *erase_ppa)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
void *meta_buffer;
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
struct pblk_line *e_line, *d_line;
unsigned int map_secs;
int min = pblk->min_write_pgs;
int i, erase_lun;
int ret;
for (i = 0; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
meta_buffer = pblk_get_meta(pblk, meta_list, i);
ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
lun_bitmap, meta_buffer, map_secs);
if (ret)
return ret;
erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]);
/* line can change after page map. We might also be writing the
* last line.
*/
e_line = pblk_line_get_erase(pblk);
if (!e_line)
return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
valid_secs, i + min);
spin_lock(&e_line->lock);
if (!test_bit(erase_lun, e_line->erase_bitmap)) {
set_bit(erase_lun, e_line->erase_bitmap);
atomic_dec(&e_line->left_eblks);
*erase_ppa = ppa_list[i];
erase_ppa->a.blk = e_line->id;
erase_ppa->a.reserved = 0;
spin_unlock(&e_line->lock);
/* Avoid evaluating e_line->left_eblks */
return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
valid_secs, i + min);
}
spin_unlock(&e_line->lock);
}
d_line = pblk_line_get_data(pblk);
/* line can change after page map. We might also be writing the
* last line.
*/
e_line = pblk_line_get_erase(pblk);
if (!e_line)
return -ENOSPC;
/* Erase blocks that are bad in this line but might not be in next */
if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
int bit = -1;
retry:
bit = find_next_bit(d_line->blk_bitmap,
lm->blk_per_line, bit + 1);
if (bit >= lm->blk_per_line)
return 0;
spin_lock(&e_line->lock);
if (test_bit(bit, e_line->erase_bitmap)) {
spin_unlock(&e_line->lock);
goto retry;
}
spin_unlock(&e_line->lock);
set_bit(bit, e_line->erase_bitmap);
atomic_dec(&e_line->left_eblks);
*erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
erase_ppa->a.blk = e_line->id;
}
return 0;
}

View File

@ -1,858 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
*
* Based upon the circular ringbuffer.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-rb.c - pblk's write buffer
*/
#include <linux/circ_buf.h>
#include "pblk.h"
static DECLARE_RWSEM(pblk_rb_lock);
static void pblk_rb_data_free(struct pblk_rb *rb)
{
struct pblk_rb_pages *p, *t;
down_write(&pblk_rb_lock);
list_for_each_entry_safe(p, t, &rb->pages, list) {
free_pages((unsigned long)page_address(p->pages), p->order);
list_del(&p->list);
kfree(p);
}
up_write(&pblk_rb_lock);
}
void pblk_rb_free(struct pblk_rb *rb)
{
pblk_rb_data_free(rb);
vfree(rb->entries);
}
/*
* pblk_rb_calculate_size -- calculate the size of the write buffer
*/
static unsigned int pblk_rb_calculate_size(unsigned int nr_entries,
unsigned int threshold)
{
unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA));
unsigned int max_sz = max(thr_sz, nr_entries);
unsigned int max_io;
/* Alloc a write buffer that can (i) fit at least two split bios
* (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the
* threshold will be respected
*/
max_io = (1 << max((int)(get_count_order(max_sz)),
(int)(get_count_order(NVM_MAX_VLBA << 1))));
if ((threshold + NVM_MAX_VLBA) >= max_io)
max_io <<= 1;
return max_io;
}
/*
* Initialize ring buffer. The data and metadata buffers must be previously
* allocated and their size must be a power of two
* (Documentation/core-api/circular-buffers.rst)
*/
int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
unsigned int seg_size)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_rb_entry *entries;
unsigned int init_entry = 0;
unsigned int max_order = MAX_ORDER - 1;
unsigned int power_size, power_seg_sz;
unsigned int alloc_order, order, iter;
unsigned int nr_entries;
nr_entries = pblk_rb_calculate_size(size, threshold);
entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry)));
if (!entries)
return -ENOMEM;
power_size = get_count_order(nr_entries);
power_seg_sz = get_count_order(seg_size);
down_write(&pblk_rb_lock);
rb->entries = entries;
rb->seg_size = (1 << power_seg_sz);
rb->nr_entries = (1 << power_size);
rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
rb->back_thres = threshold;
rb->flush_point = EMPTY_ENTRY;
spin_lock_init(&rb->w_lock);
spin_lock_init(&rb->s_lock);
INIT_LIST_HEAD(&rb->pages);
alloc_order = power_size;
if (alloc_order >= max_order) {
order = max_order;
iter = (1 << (alloc_order - max_order));
} else {
order = alloc_order;
iter = 1;
}
do {
struct pblk_rb_entry *entry;
struct pblk_rb_pages *page_set;
void *kaddr;
unsigned long set_size;
int i;
page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
if (!page_set) {
up_write(&pblk_rb_lock);
vfree(entries);
return -ENOMEM;
}
page_set->order = order;
page_set->pages = alloc_pages(GFP_KERNEL, order);
if (!page_set->pages) {
kfree(page_set);
pblk_rb_data_free(rb);
up_write(&pblk_rb_lock);
vfree(entries);
return -ENOMEM;
}
kaddr = page_address(page_set->pages);
entry = &rb->entries[init_entry];
entry->data = kaddr;
entry->cacheline = pblk_cacheline_to_addr(init_entry++);
entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
set_size = (1 << order);
for (i = 1; i < set_size; i++) {
entry = &rb->entries[init_entry];
entry->cacheline = pblk_cacheline_to_addr(init_entry++);
entry->data = kaddr + (i * rb->seg_size);
entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
bio_list_init(&entry->w_ctx.bios);
}
list_add_tail(&page_set->list, &rb->pages);
iter--;
} while (iter > 0);
up_write(&pblk_rb_lock);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_set(&rb->inflight_flush_point, 0);
#endif
/*
* Initialize rate-limiter, which controls access to the write buffer
* by user and GC I/O
*/
pblk_rl_init(&pblk->rl, rb->nr_entries, threshold);
return 0;
}
static void clean_wctx(struct pblk_w_ctx *w_ctx)
{
int flags;
flags = READ_ONCE(w_ctx->flags);
WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY),
"pblk: overwriting unsubmitted data\n");
/* Release flags on context. Protect from writes and reads */
smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
pblk_ppa_set_empty(&w_ctx->ppa);
w_ctx->lba = ADDR_EMPTY;
}
#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
#define pblk_rb_ring_space(rb, head, tail, size) \
(CIRC_SPACE(head, tail, size))
/*
* Buffer space is calculated with respect to the back pointer signaling
* synchronized entries to the media.
*/
static unsigned int pblk_rb_space(struct pblk_rb *rb)
{
unsigned int mem = READ_ONCE(rb->mem);
unsigned int sync = READ_ONCE(rb->sync);
return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries);
}
unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p,
unsigned int nr_entries)
{
return (p + nr_entries) & (rb->nr_entries - 1);
}
/*
* Buffer count is calculated with respect to the submission entry signaling the
* entries that are available to send to the media
*/
unsigned int pblk_rb_read_count(struct pblk_rb *rb)
{
unsigned int mem = READ_ONCE(rb->mem);
unsigned int subm = READ_ONCE(rb->subm);
return pblk_rb_ring_count(mem, subm, rb->nr_entries);
}
unsigned int pblk_rb_sync_count(struct pblk_rb *rb)
{
unsigned int mem = READ_ONCE(rb->mem);
unsigned int sync = READ_ONCE(rb->sync);
return pblk_rb_ring_count(mem, sync, rb->nr_entries);
}
unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
{
unsigned int subm;
subm = READ_ONCE(rb->subm);
/* Commit read means updating submission pointer */
smp_store_release(&rb->subm, pblk_rb_ptr_wrap(rb, subm, nr_entries));
return subm;
}
static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_line *line;
struct pblk_rb_entry *entry;
struct pblk_w_ctx *w_ctx;
unsigned int user_io = 0, gc_io = 0;
unsigned int i;
int flags;
for (i = 0; i < to_update; i++) {
entry = &rb->entries[rb->l2p_update];
w_ctx = &entry->w_ctx;
flags = READ_ONCE(entry->w_ctx.flags);
if (flags & PBLK_IOTYPE_USER)
user_io++;
else if (flags & PBLK_IOTYPE_GC)
gc_io++;
else
WARN(1, "pblk: unknown IO type\n");
pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
entry->cacheline);
line = pblk_ppa_to_line(pblk, w_ctx->ppa);
atomic_dec(&line->sec_to_update);
kref_put(&line->ref, pblk_line_put);
clean_wctx(w_ctx);
rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1);
}
pblk_rl_out(&pblk->rl, user_io, gc_io);
return 0;
}
/*
* When we move the l2p_update pointer, we update the l2p table - lookups will
* point to the physical address instead of to the cacheline in the write buffer
* from this moment on.
*/
static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
unsigned int mem, unsigned int sync)
{
unsigned int space, count;
int ret = 0;
lockdep_assert_held(&rb->w_lock);
/* Update l2p only as buffer entries are being overwritten */
space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries);
if (space > nr_entries)
goto out;
count = nr_entries - space;
/* l2p_update used exclusively under rb->w_lock */
ret = __pblk_rb_update_l2p(rb, count);
out:
return ret;
}
/*
* Update the l2p entry for all sectors stored on the write buffer. This means
* that all future lookups to the l2p table will point to a device address, not
* to the cacheline in the write buffer.
*/
void pblk_rb_sync_l2p(struct pblk_rb *rb)
{
unsigned int sync;
unsigned int to_update;
spin_lock(&rb->w_lock);
/* Protect from reads and writes */
sync = smp_load_acquire(&rb->sync);
to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
__pblk_rb_update_l2p(rb, to_update);
spin_unlock(&rb->w_lock);
}
/*
* Write @nr_entries to ring buffer from @data buffer if there is enough space.
* Typically, 4KB data chunks coming from a bio will be copied to the ring
* buffer, thus the write will fail if not all incoming data can be copied.
*
*/
static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
struct pblk_w_ctx w_ctx,
struct pblk_rb_entry *entry)
{
memcpy(entry->data, data, rb->seg_size);
entry->w_ctx.lba = w_ctx.lba;
entry->w_ctx.ppa = w_ctx.ppa;
}
void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
struct pblk_w_ctx w_ctx, unsigned int ring_pos)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_rb_entry *entry;
int flags;
entry = &rb->entries[ring_pos];
flags = READ_ONCE(entry->w_ctx.flags);
#ifdef CONFIG_NVM_PBLK_DEBUG
/* Caller must guarantee that the entry is free */
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
#endif
__pblk_rb_write_entry(rb, data, w_ctx, entry);
pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
flags = w_ctx.flags | PBLK_WRITTEN_DATA;
/* Release flags on write context. Protect from writes */
smp_store_release(&entry->w_ctx.flags, flags);
}
void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
struct pblk_w_ctx w_ctx, struct pblk_line *line,
u64 paddr, unsigned int ring_pos)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_rb_entry *entry;
int flags;
entry = &rb->entries[ring_pos];
flags = READ_ONCE(entry->w_ctx.flags);
#ifdef CONFIG_NVM_PBLK_DEBUG
/* Caller must guarantee that the entry is free */
BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
#endif
__pblk_rb_write_entry(rb, data, w_ctx, entry);
if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr))
entry->w_ctx.lba = ADDR_EMPTY;
flags = w_ctx.flags | PBLK_WRITTEN_DATA;
/* Release flags on write context. Protect from writes */
smp_store_release(&entry->w_ctx.flags, flags);
}
static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
unsigned int pos)
{
struct pblk_rb_entry *entry;
unsigned int sync, flush_point;
pblk_rb_sync_init(rb, NULL);
sync = READ_ONCE(rb->sync);
if (pos == sync) {
pblk_rb_sync_end(rb, NULL);
return 0;
}
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_inc(&rb->inflight_flush_point);
#endif
flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
entry = &rb->entries[flush_point];
/* Protect flush points */
smp_store_release(&rb->flush_point, flush_point);
if (bio)
bio_list_add(&entry->w_ctx.bios, bio);
pblk_rb_sync_end(rb, NULL);
return bio ? 1 : 0;
}
static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
unsigned int *pos)
{
unsigned int mem;
unsigned int sync;
unsigned int threshold;
sync = READ_ONCE(rb->sync);
mem = READ_ONCE(rb->mem);
threshold = nr_entries + rb->back_thres;
if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < threshold)
return 0;
if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
return 0;
*pos = mem;
return 1;
}
static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
unsigned int *pos)
{
if (!__pblk_rb_may_write(rb, nr_entries, pos))
return 0;
/* Protect from read count */
smp_store_release(&rb->mem, pblk_rb_ptr_wrap(rb, *pos, nr_entries));
return 1;
}
void pblk_rb_flush(struct pblk_rb *rb)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
unsigned int mem = READ_ONCE(rb->mem);
if (pblk_rb_flush_point_set(rb, NULL, mem))
return;
pblk_write_kick(pblk);
}
static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
unsigned int *pos, struct bio *bio,
int *io_ret)
{
unsigned int mem;
if (!__pblk_rb_may_write(rb, nr_entries, pos))
return 0;
mem = pblk_rb_ptr_wrap(rb, *pos, nr_entries);
*io_ret = NVM_IO_DONE;
if (bio->bi_opf & REQ_PREFLUSH) {
struct pblk *pblk = container_of(rb, struct pblk, rwb);
atomic64_inc(&pblk->nr_flush);
if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
*io_ret = NVM_IO_OK;
}
/* Protect from read count */
smp_store_release(&rb->mem, mem);
return 1;
}
/*
* Atomically check that (i) there is space on the write buffer for the
* incoming I/O, and (ii) the current I/O type has enough budget in the write
* buffer (rate-limiter).
*/
int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
unsigned int nr_entries, unsigned int *pos)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
int io_ret;
spin_lock(&rb->w_lock);
io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries);
if (io_ret) {
spin_unlock(&rb->w_lock);
return io_ret;
}
if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) {
spin_unlock(&rb->w_lock);
return NVM_IO_REQUEUE;
}
pblk_rl_user_in(&pblk->rl, nr_entries);
spin_unlock(&rb->w_lock);
return io_ret;
}
/*
* Look at pblk_rb_may_write_user comment
*/
int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
unsigned int *pos)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
spin_lock(&rb->w_lock);
if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) {
spin_unlock(&rb->w_lock);
return 0;
}
if (!pblk_rb_may_write(rb, nr_entries, pos)) {
spin_unlock(&rb->w_lock);
return 0;
}
pblk_rl_gc_in(&pblk->rl, nr_entries);
spin_unlock(&rb->w_lock);
return 1;
}
/*
* Read available entries on rb and add them to the given bio. To avoid a memory
* copy, a page reference to the write buffer is used to be added to the bio.
*
* This function is used by the write thread to form the write bio that will
* persist data on the write buffer to the media.
*/
unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
unsigned int pos, unsigned int nr_entries,
unsigned int count)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct request_queue *q = pblk->dev->q;
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = rqd->bio;
struct pblk_rb_entry *entry;
struct page *page;
unsigned int pad = 0, to_read = nr_entries;
unsigned int i;
int flags;
if (count < nr_entries) {
pad = nr_entries - count;
to_read = count;
}
/* Add space for packed metadata if in use*/
pad += (pblk->min_write_pgs - pblk->min_write_pgs_data);
c_ctx->sentry = pos;
c_ctx->nr_valid = to_read;
c_ctx->nr_padded = pad;
for (i = 0; i < to_read; i++) {
entry = &rb->entries[pos];
/* A write has been allowed into the buffer, but data is still
* being copied to it. It is ok to busy wait.
*/
try:
flags = READ_ONCE(entry->w_ctx.flags);
if (!(flags & PBLK_WRITTEN_DATA)) {
io_schedule();
goto try;
}
page = virt_to_page(entry->data);
if (!page) {
pblk_err(pblk, "could not allocate write bio page\n");
flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY;
/* Release flags on context. Protect from writes */
smp_store_release(&entry->w_ctx.flags, flags);
return NVM_IO_ERR;
}
if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) !=
rb->seg_size) {
pblk_err(pblk, "could not add page to write bio\n");
flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY;
/* Release flags on context. Protect from writes */
smp_store_release(&entry->w_ctx.flags, flags);
return NVM_IO_ERR;
}
flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY;
/* Release flags on context. Protect from writes */
smp_store_release(&entry->w_ctx.flags, flags);
pos = pblk_rb_ptr_wrap(rb, pos, 1);
}
if (pad) {
if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) {
pblk_err(pblk, "could not pad page in write bio\n");
return NVM_IO_ERR;
}
if (pad < pblk->min_write_pgs)
atomic64_inc(&pblk->pad_dist[pad - 1]);
else
pblk_warn(pblk, "padding more than min. sectors\n");
atomic64_add(pad, &pblk->pad_wa);
}
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(pad, &pblk->padded_writes);
#endif
return NVM_IO_OK;
}
/*
* Copy to bio only if the lba matches the one on the given cache entry.
* Otherwise, it means that the entry has been overwritten, and the bio should
* be directed to disk.
*/
int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
struct ppa_addr ppa)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_rb_entry *entry;
struct pblk_w_ctx *w_ctx;
struct ppa_addr l2p_ppa;
u64 pos = pblk_addr_to_cacheline(ppa);
void *data;
int flags;
int ret = 1;
#ifdef CONFIG_NVM_PBLK_DEBUG
/* Caller must ensure that the access will not cause an overflow */
BUG_ON(pos >= rb->nr_entries);
#endif
entry = &rb->entries[pos];
w_ctx = &entry->w_ctx;
flags = READ_ONCE(w_ctx->flags);
spin_lock(&rb->w_lock);
spin_lock(&pblk->trans_lock);
l2p_ppa = pblk_trans_map_get(pblk, lba);
spin_unlock(&pblk->trans_lock);
/* Check if the entry has been overwritten or is scheduled to be */
if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba ||
flags & PBLK_WRITABLE_ENTRY) {
ret = 0;
goto out;
}
data = bio_data(bio);
memcpy(data, entry->data, rb->seg_size);
out:
spin_unlock(&rb->w_lock);
return ret;
}
struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
{
unsigned int entry = pblk_rb_ptr_wrap(rb, pos, 0);
return &rb->entries[entry].w_ctx;
}
unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
__acquires(&rb->s_lock)
{
if (flags)
spin_lock_irqsave(&rb->s_lock, *flags);
else
spin_lock_irq(&rb->s_lock);
return rb->sync;
}
void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
__releases(&rb->s_lock)
{
lockdep_assert_held(&rb->s_lock);
if (flags)
spin_unlock_irqrestore(&rb->s_lock, *flags);
else
spin_unlock_irq(&rb->s_lock);
}
unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
{
unsigned int sync, flush_point;
lockdep_assert_held(&rb->s_lock);
sync = READ_ONCE(rb->sync);
flush_point = READ_ONCE(rb->flush_point);
if (flush_point != EMPTY_ENTRY) {
unsigned int secs_to_flush;
secs_to_flush = pblk_rb_ring_count(flush_point, sync,
rb->nr_entries);
if (secs_to_flush < nr_entries) {
/* Protect flush points */
smp_store_release(&rb->flush_point, EMPTY_ENTRY);
}
}
sync = pblk_rb_ptr_wrap(rb, sync, nr_entries);
/* Protect from counts */
smp_store_release(&rb->sync, sync);
return sync;
}
/* Calculate how many sectors to submit up to the current flush point. */
unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
{
unsigned int subm, sync, flush_point;
unsigned int submitted, to_flush;
/* Protect flush points */
flush_point = smp_load_acquire(&rb->flush_point);
if (flush_point == EMPTY_ENTRY)
return 0;
/* Protect syncs */
sync = smp_load_acquire(&rb->sync);
subm = READ_ONCE(rb->subm);
submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
/* The sync point itself counts as a sector to sync */
to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
return (submitted < to_flush) ? (to_flush - submitted) : 0;
}
int pblk_rb_tear_down_check(struct pblk_rb *rb)
{
struct pblk_rb_entry *entry;
int i;
int ret = 0;
spin_lock(&rb->w_lock);
spin_lock_irq(&rb->s_lock);
if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
(rb->sync == rb->l2p_update) &&
(rb->flush_point == EMPTY_ENTRY)) {
goto out;
}
if (!rb->entries) {
ret = 1;
goto out;
}
for (i = 0; i < rb->nr_entries; i++) {
entry = &rb->entries[i];
if (!entry->data) {
ret = 1;
goto out;
}
}
out:
spin_unlock_irq(&rb->s_lock);
spin_unlock(&rb->w_lock);
return ret;
}
unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
{
return (pos & (rb->nr_entries - 1));
}
int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
{
return (pos >= rb->nr_entries);
}
ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
{
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct pblk_c_ctx *c;
ssize_t offset;
int queued_entries = 0;
spin_lock_irq(&rb->s_lock);
list_for_each_entry(c, &pblk->compl_list, list)
queued_entries++;
spin_unlock_irq(&rb->s_lock);
if (rb->flush_point != EMPTY_ENTRY)
offset = scnprintf(buf, PAGE_SIZE,
"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
rb->nr_entries,
rb->mem,
rb->subm,
rb->sync,
rb->l2p_update,
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_read(&rb->inflight_flush_point),
#else
0,
#endif
rb->flush_point,
pblk_rb_read_count(rb),
pblk_rb_space(rb),
pblk_rb_flush_point_count(rb),
queued_entries);
else
offset = scnprintf(buf, PAGE_SIZE,
"%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
rb->nr_entries,
rb->mem,
rb->subm,
rb->sync,
rb->l2p_update,
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_read(&rb->inflight_flush_point),
#else
0,
#endif
pblk_rb_read_count(rb),
pblk_rb_space(rb),
pblk_rb_flush_point_count(rb),
queued_entries);
return offset;
}

View File

@ -1,474 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-read.c - pblk's read path
*/
#include "pblk.h"
/*
* There is no guarantee that the value read from cache has not been updated and
* resides at another location in the cache. We guarantee though that if the
* value is read from the cache, it belongs to the mapped lba. In order to
* guarantee and order between writes and reads are ordered, a flush must be
* issued.
*/
static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
sector_t lba, struct ppa_addr ppa)
{
#ifdef CONFIG_NVM_PBLK_DEBUG
/* Callers must ensure that the ppa points to a cache address */
BUG_ON(pblk_ppa_empty(ppa));
BUG_ON(!pblk_addr_in_cache(ppa));
#endif
return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa);
}
static int pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct bio *bio, sector_t blba,
bool *from_cache)
{
void *meta_list = rqd->meta_list;
int nr_secs, i;
retry:
nr_secs = pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, rqd->nr_ppas,
from_cache);
if (!*from_cache)
goto end;
for (i = 0; i < nr_secs; i++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
sector_t lba = blba + i;
if (pblk_ppa_empty(rqd->ppa_list[i])) {
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
meta->lba = addr_empty;
} else if (pblk_addr_in_cache(rqd->ppa_list[i])) {
/*
* Try to read from write buffer. The address is later
* checked on the write buffer to prevent retrieving
* overwritten data.
*/
if (!pblk_read_from_cache(pblk, bio, lba,
rqd->ppa_list[i])) {
if (i == 0) {
/*
* We didn't call with bio_advance()
* yet, so we can just retry.
*/
goto retry;
} else {
/*
* We already call bio_advance()
* so we cannot retry and we need
* to quit that function in order
* to allow caller to handle the bio
* splitting in the current sector
* position.
*/
nr_secs = i;
goto end;
}
}
meta->lba = cpu_to_le64(lba);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
#endif
}
bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
}
end:
if (pblk_io_aligned(pblk, nr_secs))
rqd->is_seq = 1;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(nr_secs, &pblk->inflight_reads);
#endif
return nr_secs;
}
static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
sector_t blba)
{
void *meta_list = rqd->meta_list;
int nr_lbas = rqd->nr_ppas;
int i;
if (!pblk_is_oob_meta_supported(pblk))
return;
for (i = 0; i < nr_lbas; i++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
u64 lba = le64_to_cpu(meta->lba);
if (lba == ADDR_EMPTY)
continue;
if (lba != blba + i) {
#ifdef CONFIG_NVM_PBLK_DEBUG
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
print_ppa(pblk, &ppa_list[i], "seq", i);
#endif
pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
lba, (u64)blba + i);
WARN_ON(1);
}
}
}
/*
* There can be holes in the lba list.
*/
static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
u64 *lba_list, int nr_lbas)
{
void *meta_lba_list = rqd->meta_list;
int i, j;
if (!pblk_is_oob_meta_supported(pblk))
return;
for (i = 0, j = 0; i < nr_lbas; i++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk,
meta_lba_list, j);
u64 lba = lba_list[i];
u64 meta_lba;
if (lba == ADDR_EMPTY)
continue;
meta_lba = le64_to_cpu(meta->lba);
if (lba != meta_lba) {
#ifdef CONFIG_NVM_PBLK_DEBUG
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
print_ppa(pblk, &ppa_list[j], "rnd", j);
#endif
pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
meta_lba, lba);
WARN_ON(1);
}
j++;
}
WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n");
}
static void pblk_end_user_read(struct bio *bio, int error)
{
if (error && error != NVM_RSP_WARN_HIGHECC)
bio_io_error(bio);
else
bio_endio(bio);
}
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
bool put_line)
{
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *int_bio = rqd->bio;
unsigned long start_time = r_ctx->start_time;
bio_end_io_acct(int_bio, start_time);
if (rqd->error)
pblk_log_read_err(pblk, rqd);
pblk_read_check_seq(pblk, rqd, r_ctx->lba);
bio_put(int_bio);
if (put_line)
pblk_rq_to_line_put(pblk, rqd);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
#endif
pblk_free_rqd(pblk, rqd, PBLK_READ);
atomic_dec(&pblk->inflight_io);
}
static void pblk_end_io_read(struct nvm_rq *rqd)
{
struct pblk *pblk = rqd->private;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = (struct bio *)r_ctx->private;
pblk_end_user_read(bio, rqd->error);
__pblk_end_io_read(pblk, rqd, true);
}
static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
sector_t lba, bool *from_cache)
{
struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
struct ppa_addr ppa;
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->inflight_reads);
#endif
retry:
if (pblk_ppa_empty(ppa)) {
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
meta->lba = addr_empty;
return;
}
/* Try to read from write buffer. The address is later checked on the
* write buffer to prevent retrieving overwritten data.
*/
if (pblk_addr_in_cache(ppa)) {
if (!pblk_read_from_cache(pblk, bio, lba, ppa)) {
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache);
goto retry;
}
meta->lba = cpu_to_le64(lba);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
#endif
} else {
rqd->ppa_addr = ppa;
}
}
void pblk_submit_read(struct pblk *pblk, struct bio *bio)
{
sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio);
bool from_cache;
struct pblk_g_ctx *r_ctx;
struct nvm_rq *rqd;
struct bio *int_bio, *split_bio;
unsigned long start_time;
start_time = bio_start_io_acct(bio);
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
rqd->opcode = NVM_OP_PREAD;
rqd->nr_ppas = nr_secs;
rqd->private = pblk;
rqd->end_io = pblk_end_io_read;
r_ctx = nvm_rq_to_pdu(rqd);
r_ctx->start_time = start_time;
r_ctx->lba = blba;
if (pblk_alloc_rqd_meta(pblk, rqd)) {
bio_io_error(bio);
pblk_free_rqd(pblk, rqd, PBLK_READ);
return;
}
/* Clone read bio to deal internally with:
* -read errors when reading from drive
* -bio_advance() calls during cache reads
*/
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
if (nr_secs > 1)
nr_secs = pblk_read_ppalist_rq(pblk, rqd, int_bio, blba,
&from_cache);
else
pblk_read_rq(pblk, rqd, int_bio, blba, &from_cache);
split_retry:
r_ctx->private = bio; /* original bio */
rqd->bio = int_bio; /* internal bio */
if (from_cache && nr_secs == rqd->nr_ppas) {
/* All data was read from cache, we can complete the IO. */
pblk_end_user_read(bio, 0);
atomic_inc(&pblk->inflight_io);
__pblk_end_io_read(pblk, rqd, false);
} else if (nr_secs != rqd->nr_ppas) {
/* The read bio request could be partially filled by the write
* buffer, but there are some holes that need to be read from
* the drive. In order to handle this, we will use block layer
* mechanism to split this request in to smaller ones and make
* a chain of it.
*/
split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL,
&pblk_bio_set);
bio_chain(split_bio, bio);
submit_bio_noacct(bio);
/* New bio contains first N sectors of the previous one, so
* we can continue to use existing rqd, but we need to shrink
* the number of PPAs in it. New bio is also guaranteed that
* it contains only either data from cache or from drive, newer
* mix of them.
*/
bio = split_bio;
rqd->nr_ppas = nr_secs;
if (rqd->nr_ppas == 1)
rqd->ppa_addr = rqd->ppa_list[0];
/* Recreate int_bio - existing might have some needed internal
* fields modified already.
*/
bio_put(int_bio);
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
goto split_retry;
} else if (pblk_submit_io(pblk, rqd, NULL)) {
/* Submitting IO to drive failed, let's report an error */
rqd->error = -ENODEV;
pblk_end_io_read(rqd);
}
}
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_line *line, u64 *lba_list,
u64 *paddr_list_gc, unsigned int nr_secs)
{
struct ppa_addr ppa_list_l2p[NVM_MAX_VLBA];
struct ppa_addr ppa_gc;
int valid_secs = 0;
int i;
pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs);
for (i = 0; i < nr_secs; i++) {
if (lba_list[i] == ADDR_EMPTY)
continue;
ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id);
if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) {
paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY;
continue;
}
rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
}
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(valid_secs, &pblk->inflight_reads);
#endif
return valid_secs;
}
static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_line *line, sector_t lba,
u64 paddr_gc)
{
struct ppa_addr ppa_l2p, ppa_gc;
int valid_secs = 0;
if (lba == ADDR_EMPTY)
goto out;
/* logic error: lba out-of-bounds */
if (lba >= pblk->capacity) {
WARN(1, "pblk: read lba out of bounds\n");
goto out;
}
spin_lock(&pblk->trans_lock);
ppa_l2p = pblk_trans_map_get(pblk, lba);
spin_unlock(&pblk->trans_lock);
ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id);
if (!pblk_ppa_comp(ppa_l2p, ppa_gc))
goto out;
rqd->ppa_addr = ppa_l2p;
valid_secs = 1;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->inflight_reads);
#endif
out:
return valid_secs;
}
int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
{
struct nvm_rq rqd;
int ret = NVM_IO_OK;
memset(&rqd, 0, sizeof(struct nvm_rq));
ret = pblk_alloc_rqd_meta(pblk, &rqd);
if (ret)
return ret;
if (gc_rq->nr_secs > 1) {
gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line,
gc_rq->lba_list,
gc_rq->paddr_list,
gc_rq->nr_secs);
if (gc_rq->secs_to_gc == 1)
rqd.ppa_addr = rqd.ppa_list[0];
} else {
gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line,
gc_rq->lba_list[0],
gc_rq->paddr_list[0]);
}
if (!(gc_rq->secs_to_gc))
goto out;
rqd.opcode = NVM_OP_PREAD;
rqd.nr_ppas = gc_rq->secs_to_gc;
if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) {
ret = -EIO;
goto err_free_dma;
}
pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
atomic_dec(&pblk->inflight_io);
if (rqd.error) {
atomic_long_inc(&pblk->read_failed_gc);
#ifdef CONFIG_NVM_PBLK_DEBUG
pblk_print_failed_rqd(pblk, &rqd, rqd.error);
#endif
}
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
#endif
out:
pblk_free_rqd_meta(pblk, &rqd);
return ret;
err_free_dma:
pblk_free_rqd_meta(pblk, &rqd);
return ret;
}

View File

@ -1,874 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial: Javier Gonzalez <javier@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-recovery.c - pblk's recovery path
*
* The L2P recovery path is single threaded as the L2P table is updated in order
* following the line sequence ID.
*/
#include "pblk.h"
#include "pblk-trace.h"
int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
{
u32 crc;
crc = pblk_calc_emeta_crc(pblk, emeta_buf);
if (le32_to_cpu(emeta_buf->crc) != crc)
return 1;
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC)
return 1;
return 0;
}
static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_emeta *emeta = line->emeta;
struct line_emeta *emeta_buf = emeta->buf;
__le64 *lba_list;
u64 data_start, data_end;
u64 nr_valid_lbas, nr_lbas = 0;
u64 i;
lba_list = emeta_to_lbas(pblk, emeta_buf);
if (!lba_list)
return 1;
data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
data_end = line->emeta_ssec;
nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas);
for (i = data_start; i < data_end; i++) {
struct ppa_addr ppa;
int pos;
ppa = addr_to_gen_ppa(pblk, i, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
/* Do not update bad blocks */
if (test_bit(pos, line->blk_bitmap))
continue;
if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
spin_lock(&line->lock);
if (test_and_set_bit(i, line->invalid_bitmap))
WARN_ONCE(1, "pblk: rec. double invalidate:\n");
else
le32_add_cpu(line->vsc, -1);
spin_unlock(&line->lock);
continue;
}
pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa);
nr_lbas++;
}
if (nr_valid_lbas != nr_lbas)
pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n",
line->id, nr_valid_lbas, nr_lbas);
line->left_msecs = 0;
return 0;
}
static void pblk_update_line_wp(struct pblk *pblk, struct pblk_line *line,
u64 written_secs)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
int i;
for (i = 0; i < written_secs; i += pblk->min_write_pgs)
__pblk_alloc_page(pblk, line, pblk->min_write_pgs);
spin_lock(&l_mg->free_lock);
if (written_secs > line->left_msecs) {
/*
* We have all data sectors written
* and some emeta sectors written too.
*/
line->left_msecs = 0;
} else {
/* We have only some data sectors written. */
line->left_msecs -= written_secs;
}
spin_unlock(&l_mg->free_lock);
}
static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_meta *lm = &pblk->lm;
int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
u64 written_secs = 0;
int valid_chunks = 0;
int i;
for (i = 0; i < lm->blk_per_line; i++) {
struct nvm_chk_meta *chunk = &line->chks[i];
if (chunk->state & NVM_CHK_ST_OFFLINE)
continue;
written_secs += chunk->wp;
valid_chunks++;
}
if (lm->blk_per_line - nr_bb != valid_chunks)
pblk_err(pblk, "recovery line %d is bad\n", line->id);
pblk_update_line_wp(pblk, line, written_secs - lm->smeta_sec);
return written_secs;
}
struct pblk_recov_alloc {
struct ppa_addr *ppa_list;
void *meta_list;
struct nvm_rq *rqd;
void *data;
dma_addr_t dma_ppa_list;
dma_addr_t dma_meta_list;
};
static void pblk_recov_complete(struct kref *ref)
{
struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref);
complete(&pad_rq->wait);
}
static void pblk_end_io_recov(struct nvm_rq *rqd)
{
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
struct pblk_pad_rq *pad_rq = rqd->private;
struct pblk *pblk = pad_rq->pblk;
pblk_up_chunk(pblk, ppa_list[0]);
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
atomic_dec(&pblk->inflight_io);
kref_put(&pad_rq->ref, pblk_recov_complete);
}
/* pad line using line bitmap. */
static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
int left_ppas)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
void *meta_list;
struct pblk_pad_rq *pad_rq;
struct nvm_rq *rqd;
struct ppa_addr *ppa_list;
void *data;
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
u64 w_ptr = line->cur_sec;
int left_line_ppas, rq_ppas;
int i, j;
int ret = 0;
spin_lock(&line->lock);
left_line_ppas = line->left_msecs;
spin_unlock(&line->lock);
pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL);
if (!pad_rq)
return -ENOMEM;
data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs));
if (!data) {
ret = -ENOMEM;
goto free_rq;
}
pad_rq->pblk = pblk;
init_completion(&pad_rq->wait);
kref_init(&pad_rq->ref);
next_pad_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
if (rq_ppas < pblk->min_write_pgs) {
pblk_err(pblk, "corrupted pad line %d\n", line->id);
goto fail_complete;
}
rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
ret = pblk_alloc_rqd_meta(pblk, rqd);
if (ret) {
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
goto fail_complete;
}
rqd->bio = NULL;
rqd->opcode = NVM_OP_PWRITE;
rqd->is_seq = 1;
rqd->nr_ppas = rq_ppas;
rqd->end_io = pblk_end_io_recov;
rqd->private = pad_rq;
ppa_list = nvm_rq_to_ppa_list(rqd);
meta_list = rqd->meta_list;
for (i = 0; i < rqd->nr_ppas; ) {
struct ppa_addr ppa;
int pos;
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
w_ptr += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
struct ppa_addr dev_ppa;
struct pblk_sec_meta *meta;
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pblk_map_invalidate(pblk, dev_ppa);
lba_list[w_ptr] = addr_empty;
meta = pblk_get_meta(pblk, meta_list, i);
meta->lba = addr_empty;
ppa_list[i] = dev_ppa;
}
}
kref_get(&pad_rq->ref);
pblk_down_chunk(pblk, ppa_list[0]);
ret = pblk_submit_io(pblk, rqd, data);
if (ret) {
pblk_err(pblk, "I/O submission failed: %d\n", ret);
pblk_up_chunk(pblk, ppa_list[0]);
kref_put(&pad_rq->ref, pblk_recov_complete);
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
goto fail_complete;
}
left_line_ppas -= rq_ppas;
left_ppas -= rq_ppas;
if (left_ppas && left_line_ppas)
goto next_pad_rq;
fail_complete:
kref_put(&pad_rq->ref, pblk_recov_complete);
wait_for_completion(&pad_rq->wait);
if (!pblk_line_is_full(line))
pblk_err(pblk, "corrupted padded line: %d\n", line->id);
vfree(data);
free_rq:
kfree(pad_rq);
return ret;
}
static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int distance = geo->mw_cunits * geo->all_luns * geo->ws_opt;
return (distance > line->left_msecs) ? line->left_msecs : distance;
}
/* Return a chunk belonging to a line by stripe(write order) index */
static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk,
struct pblk_line *line,
int index)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
struct ppa_addr ppa;
int pos;
rlun = &pblk->luns[index];
ppa = rlun->bppa;
pos = pblk_ppa_to_pos(geo, ppa);
return &line->chks[pos];
}
static int pblk_line_wps_are_unbalanced(struct pblk *pblk,
struct pblk_line *line)
{
struct pblk_line_meta *lm = &pblk->lm;
int blk_in_line = lm->blk_per_line;
struct nvm_chk_meta *chunk;
u64 max_wp, min_wp;
int i;
i = find_first_zero_bit(line->blk_bitmap, blk_in_line);
/* If there is one or zero good chunks in the line,
* the write pointers can't be unbalanced.
*/
if (i >= (blk_in_line - 1))
return 0;
chunk = pblk_get_stripe_chunk(pblk, line, i);
max_wp = chunk->wp;
if (max_wp > pblk->max_write_pgs)
min_wp = max_wp - pblk->max_write_pgs;
else
min_wp = 0;
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
while (i < blk_in_line) {
chunk = pblk_get_stripe_chunk(pblk, line, i);
if (chunk->wp > max_wp || chunk->wp < min_wp)
return 1;
i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1);
}
return 0;
}
static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
struct pblk_recov_alloc p)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_meta *lm = &pblk->lm;
struct nvm_geo *geo = &dev->geo;
struct ppa_addr *ppa_list;
void *meta_list;
struct nvm_rq *rqd;
void *data;
dma_addr_t dma_ppa_list, dma_meta_list;
__le64 *lba_list;
u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
bool padded = false;
int rq_ppas;
int i, j;
int ret;
u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
if (pblk_line_wps_are_unbalanced(pblk, line))
pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);
ppa_list = p.ppa_list;
meta_list = p.meta_list;
rqd = p.rqd;
data = p.data;
dma_ppa_list = p.dma_ppa_list;
dma_meta_list = p.dma_meta_list;
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
next_rq:
memset(rqd, 0, pblk_g_rq_size);
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
if (!rq_ppas)
rq_ppas = pblk->min_write_pgs;
retry_rq:
rqd->bio = NULL;
rqd->opcode = NVM_OP_PREAD;
rqd->meta_list = meta_list;
rqd->nr_ppas = rq_ppas;
rqd->ppa_list = ppa_list;
rqd->dma_ppa_list = dma_ppa_list;
rqd->dma_meta_list = dma_meta_list;
ppa_list = nvm_rq_to_ppa_list(rqd);
if (pblk_io_aligned(pblk, rq_ppas))
rqd->is_seq = 1;
for (i = 0; i < rqd->nr_ppas; ) {
struct ppa_addr ppa;
int pos;
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
paddr += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++)
ppa_list[i] =
addr_to_gen_ppa(pblk, paddr + j, line->id);
}
ret = pblk_submit_io_sync(pblk, rqd, data);
if (ret) {
pblk_err(pblk, "I/O submission failed: %d\n", ret);
return ret;
}
atomic_dec(&pblk->inflight_io);
/* If a read fails, do a best effort by padding the line and retrying */
if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
int pad_distance, ret;
if (padded) {
pblk_log_read_err(pblk, rqd);
return -EINTR;
}
pad_distance = pblk_pad_distance(pblk, line);
ret = pblk_recov_pad_line(pblk, line, pad_distance);
if (ret) {
return ret;
}
padded = true;
goto retry_rq;
}
pblk_get_packed_meta(pblk, rqd);
for (i = 0; i < rqd->nr_ppas; i++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
u64 lba = le64_to_cpu(meta->lba);
lba_list[paddr++] = cpu_to_le64(lba);
if (lba == ADDR_EMPTY || lba >= pblk->capacity)
continue;
line->nr_valid_lbas++;
pblk_update_map(pblk, lba, ppa_list[i]);
}
left_ppas -= rq_ppas;
if (left_ppas > 0)
goto next_rq;
#ifdef CONFIG_NVM_PBLK_DEBUG
WARN_ON(padded && !pblk_line_is_full(line));
#endif
return 0;
}
/* Scan line for lbas on out of bound area */
static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct nvm_rq *rqd;
struct ppa_addr *ppa_list;
void *meta_list;
struct pblk_recov_alloc p;
void *data;
dma_addr_t dma_ppa_list, dma_meta_list;
int ret = 0;
meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
if (!meta_list)
return -ENOMEM;
ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk);
dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL);
if (!data) {
ret = -ENOMEM;
goto free_meta_list;
}
rqd = mempool_alloc(&pblk->r_rq_pool, GFP_KERNEL);
memset(rqd, 0, pblk_g_rq_size);
p.ppa_list = ppa_list;
p.meta_list = meta_list;
p.rqd = rqd;
p.data = data;
p.dma_ppa_list = dma_ppa_list;
p.dma_meta_list = dma_meta_list;
ret = pblk_recov_scan_oob(pblk, line, p);
if (ret) {
pblk_err(pblk, "could not recover L2P form OOB\n");
goto out;
}
if (pblk_line_is_full(line))
pblk_line_recov_close(pblk, line);
out:
mempool_free(rqd, &pblk->r_rq_pool);
kfree(data);
free_meta_list:
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
return ret;
}
/* Insert lines ordered by sequence number (seq_num) on list */
static void pblk_recov_line_add_ordered(struct list_head *head,
struct pblk_line *line)
{
struct pblk_line *t = NULL;
list_for_each_entry(t, head, list)
if (t->seq_nr > line->seq_nr)
break;
__list_add(&line->list, t->list.prev, &t->list);
}
static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
unsigned int emeta_secs;
u64 emeta_start;
struct ppa_addr ppa;
int pos;
emeta_secs = lm->emeta_sec[0];
emeta_start = lm->sec_per_line;
while (emeta_secs) {
emeta_start--;
ppa = addr_to_gen_ppa(pblk, emeta_start, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
if (!test_bit(pos, line->blk_bitmap))
emeta_secs--;
}
return emeta_start;
}
static int pblk_recov_check_line_version(struct pblk *pblk,
struct line_emeta *emeta)
{
struct line_header *header = &emeta->header;
if (header->version_major != EMETA_VERSION_MAJOR) {
pblk_err(pblk, "line major version mismatch: %d, expected: %d\n",
header->version_major, EMETA_VERSION_MAJOR);
return 1;
}
#ifdef CONFIG_NVM_PBLK_DEBUG
if (header->version_minor > EMETA_VERSION_MINOR)
pblk_info(pblk, "newer line minor version found: %d\n",
header->version_minor);
#endif
return 0;
}
static void pblk_recov_wa_counters(struct pblk *pblk,
struct line_emeta *emeta)
{
struct pblk_line_meta *lm = &pblk->lm;
struct line_header *header = &emeta->header;
struct wa_counters *wa = emeta_to_wa(lm, emeta);
/* WA counters were introduced in emeta version 0.2 */
if (header->version_major > 0 || header->version_minor >= 2) {
u64 user = le64_to_cpu(wa->user);
u64 pad = le64_to_cpu(wa->pad);
u64 gc = le64_to_cpu(wa->gc);
atomic64_set(&pblk->user_wa, user);
atomic64_set(&pblk->pad_wa, pad);
atomic64_set(&pblk->gc_wa, gc);
pblk->user_rst_wa = user;
pblk->pad_rst_wa = pad;
pblk->gc_rst_wa = gc;
}
}
static int pblk_line_was_written(struct pblk_line *line,
struct pblk *pblk)
{
struct pblk_line_meta *lm = &pblk->lm;
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct nvm_chk_meta *chunk;
struct ppa_addr bppa;
int smeta_blk;
if (line->state == PBLK_LINESTATE_BAD)
return 0;
smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
if (smeta_blk >= lm->blk_per_line)
return 0;
bppa = pblk->luns[smeta_blk].bppa;
chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)];
if (chunk->state & NVM_CHK_ST_CLOSED ||
(chunk->state & NVM_CHK_ST_OPEN
&& chunk->wp >= lm->smeta_sec))
return 1;
return 0;
}
static bool pblk_line_is_open(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_meta *lm = &pblk->lm;
int i;
for (i = 0; i < lm->blk_per_line; i++)
if (line->chks[i].state & NVM_CHK_ST_OPEN)
return true;
return false;
}
struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
{
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line, *tline, *data_line = NULL;
struct pblk_smeta *smeta;
struct pblk_emeta *emeta;
struct line_smeta *smeta_buf;
int found_lines = 0, recovered_lines = 0, open_lines = 0;
int is_next = 0;
int meta_line;
int i, valid_uuid = 0;
LIST_HEAD(recov_list);
/* TODO: Implement FTL snapshot */
/* Scan recovery - takes place when FTL snapshot fails */
spin_lock(&l_mg->free_lock);
meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
set_bit(meta_line, &l_mg->meta_bitmap);
smeta = l_mg->sline_meta[meta_line];
emeta = l_mg->eline_meta[meta_line];
smeta_buf = (struct line_smeta *)smeta;
spin_unlock(&l_mg->free_lock);
/* Order data lines using their sequence number */
for (i = 0; i < l_mg->nr_lines; i++) {
u32 crc;
line = &pblk->lines[i];
memset(smeta, 0, lm->smeta_len);
line->smeta = smeta;
line->lun_bitmap = ((void *)(smeta_buf)) +
sizeof(struct line_smeta);
if (!pblk_line_was_written(line, pblk))
continue;
/* Lines that cannot be read are assumed as not written here */
if (pblk_line_smeta_read(pblk, line))
continue;
crc = pblk_calc_smeta_crc(pblk, smeta_buf);
if (le32_to_cpu(smeta_buf->crc) != crc)
continue;
if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC)
continue;
if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
pblk_err(pblk, "found incompatible line version %u\n",
smeta_buf->header.version_major);
return ERR_PTR(-EINVAL);
}
/* The first valid instance uuid is used for initialization */
if (!valid_uuid) {
import_guid(&pblk->instance_uuid, smeta_buf->header.uuid);
valid_uuid = 1;
}
if (!guid_equal(&pblk->instance_uuid,
(guid_t *)&smeta_buf->header.uuid)) {
pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
i);
continue;
}
/* Update line metadata */
spin_lock(&line->lock);
line->id = le32_to_cpu(smeta_buf->header.id);
line->type = le16_to_cpu(smeta_buf->header.type);
line->seq_nr = le64_to_cpu(smeta_buf->seq_nr);
spin_unlock(&line->lock);
/* Update general metadata */
spin_lock(&l_mg->free_lock);
if (line->seq_nr >= l_mg->d_seq_nr)
l_mg->d_seq_nr = line->seq_nr + 1;
l_mg->nr_free_lines--;
spin_unlock(&l_mg->free_lock);
if (pblk_line_recov_alloc(pblk, line))
goto out;
pblk_recov_line_add_ordered(&recov_list, line);
found_lines++;
pblk_debug(pblk, "recovering data line %d, seq:%llu\n",
line->id, smeta_buf->seq_nr);
}
if (!found_lines) {
guid_gen(&pblk->instance_uuid);
spin_lock(&l_mg->free_lock);
WARN_ON_ONCE(!test_and_clear_bit(meta_line,
&l_mg->meta_bitmap));
spin_unlock(&l_mg->free_lock);
goto out;
}
/* Verify closed blocks and recover this portion of L2P table*/
list_for_each_entry_safe(line, tline, &recov_list, list) {
recovered_lines++;
line->emeta_ssec = pblk_line_emeta_start(pblk, line);
line->emeta = emeta;
memset(line->emeta->buf, 0, lm->emeta_len[0]);
if (pblk_line_is_open(pblk, line)) {
pblk_recov_l2p_from_oob(pblk, line);
goto next;
}
if (pblk_line_emeta_read(pblk, line, line->emeta->buf)) {
pblk_recov_l2p_from_oob(pblk, line);
goto next;
}
if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
pblk_recov_l2p_from_oob(pblk, line);
goto next;
}
if (pblk_recov_check_line_version(pblk, line->emeta->buf))
return ERR_PTR(-EINVAL);
pblk_recov_wa_counters(pblk, line->emeta->buf);
if (pblk_recov_l2p_from_emeta(pblk, line))
pblk_recov_l2p_from_oob(pblk, line);
next:
if (pblk_line_is_full(line)) {
struct list_head *move_list;
spin_lock(&line->lock);
line->state = PBLK_LINESTATE_CLOSED;
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
move_list = pblk_line_gc_list(pblk, line);
spin_unlock(&line->lock);
spin_lock(&l_mg->gc_lock);
list_move_tail(&line->list, move_list);
spin_unlock(&l_mg->gc_lock);
mempool_free(line->map_bitmap, l_mg->bitmap_pool);
line->map_bitmap = NULL;
line->smeta = NULL;
line->emeta = NULL;
} else {
spin_lock(&line->lock);
line->state = PBLK_LINESTATE_OPEN;
spin_unlock(&line->lock);
line->emeta->mem = 0;
atomic_set(&line->emeta->sync, 0);
trace_pblk_line_state(pblk_disk_name(pblk), line->id,
line->state);
data_line = line;
line->meta_line = meta_line;
open_lines++;
}
}
if (!open_lines) {
spin_lock(&l_mg->free_lock);
WARN_ON_ONCE(!test_and_clear_bit(meta_line,
&l_mg->meta_bitmap));
spin_unlock(&l_mg->free_lock);
} else {
spin_lock(&l_mg->free_lock);
l_mg->data_line = data_line;
/* Allocate next line for preparation */
l_mg->data_next = pblk_line_get(pblk);
if (l_mg->data_next) {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA;
is_next = 1;
}
spin_unlock(&l_mg->free_lock);
}
if (is_next)
pblk_line_erase(pblk, l_mg->data_next);
out:
if (found_lines != recovered_lines)
pblk_err(pblk, "failed to recover all found lines %d/%d\n",
found_lines, recovered_lines);
return data_line;
}
/*
* Pad current line
*/
int pblk_recov_pad(struct pblk *pblk)
{
struct pblk_line *line;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
int left_msecs;
int ret = 0;
spin_lock(&l_mg->free_lock);
line = l_mg->data_line;
left_msecs = line->left_msecs;
spin_unlock(&l_mg->free_lock);
ret = pblk_recov_pad_line(pblk, line, left_msecs);
if (ret) {
pblk_err(pblk, "tear down padding failed (%d)\n", ret);
return ret;
}
pblk_line_close_meta(pblk, line);
return ret;
}

View File

@ -1,254 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-rl.c - pblk's rate limiter for user I/O
*
*/
#include "pblk.h"
static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
{
mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000));
}
int pblk_rl_is_limit(struct pblk_rl *rl)
{
int rb_space;
rb_space = atomic_read(&rl->rb_space);
return (rb_space == 0);
}
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
{
int rb_user_cnt = atomic_read(&rl->rb_user_cnt);
int rb_space = atomic_read(&rl->rb_space);
if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0))
return NVM_IO_ERR;
if (rb_user_cnt >= rl->rb_user_max)
return NVM_IO_REQUEUE;
return NVM_IO_OK;
}
void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries)
{
int rb_space = atomic_read(&rl->rb_space);
if (unlikely(rb_space >= 0))
atomic_sub(nr_entries, &rl->rb_space);
}
int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
{
int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt);
int rb_user_active;
/* If there is no user I/O let GC take over space on the write buffer */
rb_user_active = READ_ONCE(rl->rb_user_active);
return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active));
}
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
{
atomic_add(nr_entries, &rl->rb_user_cnt);
/* Release user I/O state. Protect from GC */
smp_store_release(&rl->rb_user_active, 1);
pblk_rl_kick_u_timer(rl);
}
void pblk_rl_werr_line_in(struct pblk_rl *rl)
{
atomic_inc(&rl->werr_lines);
}
void pblk_rl_werr_line_out(struct pblk_rl *rl)
{
atomic_dec(&rl->werr_lines);
}
void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
{
atomic_add(nr_entries, &rl->rb_gc_cnt);
}
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
{
atomic_sub(nr_user, &rl->rb_user_cnt);
atomic_sub(nr_gc, &rl->rb_gc_cnt);
}
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
{
return atomic_read(&rl->free_blocks);
}
unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
{
return atomic_read(&rl->free_user_blocks);
}
static void __pblk_rl_update_rates(struct pblk_rl *rl,
unsigned long free_blocks)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
int max = rl->rb_budget;
int werr_gc_needed = atomic_read(&rl->werr_lines);
if (free_blocks >= rl->high) {
if (werr_gc_needed) {
/* Allocate a small budget for recovering
* lines with write errors
*/
rl->rb_gc_max = 1 << rl->rb_windows_pw;
rl->rb_user_max = max - rl->rb_gc_max;
rl->rb_state = PBLK_RL_WERR;
} else {
rl->rb_user_max = max;
rl->rb_gc_max = 0;
rl->rb_state = PBLK_RL_OFF;
}
} else if (free_blocks < rl->high) {
int shift = rl->high_pw - rl->rb_windows_pw;
int user_windows = free_blocks >> shift;
int user_max = user_windows << ilog2(NVM_MAX_VLBA);
rl->rb_user_max = user_max;
rl->rb_gc_max = max - user_max;
if (free_blocks <= rl->rsv_blocks) {
rl->rb_user_max = 0;
rl->rb_gc_max = max;
}
/* In the worst case, we will need to GC lines in the low list
* (high valid sector count). If there are lines to GC on high
* or mid lists, these will be prioritized
*/
rl->rb_state = PBLK_RL_LOW;
}
if (rl->rb_state != PBLK_RL_OFF)
pblk_gc_should_start(pblk);
else
pblk_gc_should_stop(pblk);
}
void pblk_rl_update_rates(struct pblk_rl *rl)
{
__pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl));
}
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
int blk_in_line = atomic_read(&line->blk_in_line);
int free_blocks;
atomic_add(blk_in_line, &rl->free_blocks);
free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks);
__pblk_rl_update_rates(rl, free_blocks);
}
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
bool used)
{
int blk_in_line = atomic_read(&line->blk_in_line);
int free_blocks;
atomic_sub(blk_in_line, &rl->free_blocks);
if (used)
free_blocks = atomic_sub_return(blk_in_line,
&rl->free_user_blocks);
else
free_blocks = atomic_read(&rl->free_user_blocks);
__pblk_rl_update_rates(rl, free_blocks);
}
int pblk_rl_high_thrs(struct pblk_rl *rl)
{
return rl->high;
}
int pblk_rl_max_io(struct pblk_rl *rl)
{
return rl->rb_max_io;
}
static void pblk_rl_u_timer(struct timer_list *t)
{
struct pblk_rl *rl = from_timer(rl, t, u_timer);
/* Release user I/O state. Protect from GC */
smp_store_release(&rl->rb_user_active, 0);
}
void pblk_rl_free(struct pblk_rl *rl)
{
del_timer(&rl->u_timer);
}
void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
int sec_meta, blk_meta;
unsigned int rb_windows;
/* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
rl->high_pw = get_count_order(rl->high);
rl->rsv_blocks = pblk_get_min_chks(pblk);
/* This will always be a power-of-2 */
rb_windows = budget / NVM_MAX_VLBA;
rl->rb_windows_pw = get_count_order(rb_windows);
/* To start with, all buffer is available to user I/O writers */
rl->rb_budget = budget;
rl->rb_user_max = budget;
rl->rb_gc_max = 0;
rl->rb_state = PBLK_RL_HIGH;
/* Maximize I/O size and ansure that back threshold is respected */
if (threshold)
rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold;
else
rl->rb_max_io = budget - pblk->min_write_pgs_data - 1;
atomic_set(&rl->rb_user_cnt, 0);
atomic_set(&rl->rb_gc_cnt, 0);
atomic_set(&rl->rb_space, -1);
atomic_set(&rl->werr_lines, 0);
timer_setup(&rl->u_timer, pblk_rl_u_timer, 0);
rl->rb_user_active = 0;
rl->rb_gc_active = 0;
}

View File

@ -1,728 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Implementation of a physical block-device target for Open-channel SSDs.
*
* pblk-sysfs.c - pblk's sysfs
*
*/
#include "pblk.h"
static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
ssize_t sz = 0;
int i;
for (i = 0; i < geo->all_luns; i++) {
int active = 1;
rlun = &pblk->luns[i];
if (!down_trylock(&rlun->wr_sem)) {
active = 0;
up(&rlun->wr_sem);
}
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"pblk: pos:%d, ch:%d, lun:%d - %d\n",
i,
rlun->bppa.a.ch,
rlun->bppa.a.lun,
active);
}
return sz;
}
static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
{
int free_blocks, free_user_blocks, total_blocks;
int rb_user_max, rb_user_cnt;
int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
free_blocks = pblk_rl_nr_free_blks(&pblk->rl);
free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl);
rb_user_max = pblk->rl.rb_user_max;
rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
rb_gc_max = pblk->rl.rb_gc_max;
rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt);
rb_budget = pblk->rl.rb_budget;
rb_state = pblk->rl.rb_state;
total_blocks = pblk->rl.total_blocks;
return snprintf(page, PAGE_SIZE,
"u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
rb_user_cnt,
rb_user_max,
rb_gc_cnt,
rb_gc_max,
rb_state,
rb_budget,
pblk->rl.high,
free_blocks,
free_user_blocks,
total_blocks,
READ_ONCE(pblk->rl.rb_user_active));
}
static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
{
int gc_enabled, gc_active;
pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active);
return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
gc_enabled, gc_active);
}
static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
{
ssize_t sz;
sz = snprintf(page, PAGE_SIZE,
"read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
atomic_long_read(&pblk->read_failed),
atomic_long_read(&pblk->read_high_ecc),
atomic_long_read(&pblk->read_empty),
atomic_long_read(&pblk->read_failed_gc),
atomic_long_read(&pblk->write_failed),
atomic_long_read(&pblk->erase_failed));
return sz;
}
static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
{
return pblk_rb_sysfs(&pblk->rwb, page);
}
static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
ssize_t sz = 0;
if (geo->version == NVM_OCSSD_SPEC_12) {
struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf;
struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf;
sz = scnprintf(page, PAGE_SIZE,
"g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
pblk->addrf_len,
ppaf->blk_offset, ppaf->blk_len,
ppaf->pg_offset, ppaf->pg_len,
ppaf->lun_offset, ppaf->lun_len,
ppaf->ch_offset, ppaf->ch_len,
ppaf->pln_offset, ppaf->pln_len,
ppaf->sec_offset, ppaf->sec_len);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
gppaf->blk_offset, gppaf->blk_len,
gppaf->pg_offset, gppaf->pg_len,
gppaf->lun_offset, gppaf->lun_len,
gppaf->ch_offset, gppaf->ch_len,
gppaf->pln_offset, gppaf->pln_len,
gppaf->sec_offset, gppaf->sec_len);
} else {
struct nvm_addrf *ppaf = &pblk->addrf;
struct nvm_addrf *gppaf = &geo->addrf;
sz = scnprintf(page, PAGE_SIZE,
"pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n",
pblk->addrf_len,
ppaf->ch_offset, ppaf->ch_len,
ppaf->lun_offset, ppaf->lun_len,
ppaf->chk_offset, ppaf->chk_len,
ppaf->sec_offset, ppaf->sec_len);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n",
gppaf->ch_offset, gppaf->ch_len,
gppaf->lun_offset, gppaf->lun_len,
gppaf->chk_offset, gppaf->chk_len,
gppaf->sec_offset, gppaf->sec_len);
}
return sz;
}
static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line;
ssize_t sz = 0;
int nr_free_lines;
int cur_data, cur_log;
int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0;
int d_line_cnt = 0, l_line_cnt = 0;
int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
int gc_werr = 0;
int bad = 0, cor = 0;
int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
int map_weight = 0, meta_weight = 0;
spin_lock(&l_mg->free_lock);
cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
nr_free_lines = l_mg->nr_free_lines;
list_for_each_entry(line, &l_mg->free_list, list)
free_line_cnt++;
spin_unlock(&l_mg->free_lock);
spin_lock(&l_mg->close_lock);
list_for_each_entry(line, &l_mg->emeta_list, list)
emeta_line_cnt++;
spin_unlock(&l_mg->close_lock);
spin_lock(&l_mg->gc_lock);
list_for_each_entry(line, &l_mg->gc_full_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_full++;
}
list_for_each_entry(line, &l_mg->gc_high_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_high++;
}
list_for_each_entry(line, &l_mg->gc_mid_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_mid++;
}
list_for_each_entry(line, &l_mg->gc_low_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_low++;
}
list_for_each_entry(line, &l_mg->gc_empty_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_empty++;
}
list_for_each_entry(line, &l_mg->gc_werr_list, list) {
if (line->type == PBLK_LINETYPE_DATA)
d_line_cnt++;
else if (line->type == PBLK_LINETYPE_LOG)
l_line_cnt++;
closed_line_cnt++;
gc_werr++;
}
list_for_each_entry(line, &l_mg->bad_list, list)
bad++;
list_for_each_entry(line, &l_mg->corrupt_list, list)
cor++;
spin_unlock(&l_mg->gc_lock);
spin_lock(&l_mg->free_lock);
if (l_mg->data_line) {
cur_sec = l_mg->data_line->cur_sec;
msecs = l_mg->data_line->left_msecs;
vsc = le32_to_cpu(*l_mg->data_line->vsc);
sec_in_line = l_mg->data_line->sec_in_line;
meta_weight = bitmap_weight(&l_mg->meta_bitmap,
PBLK_DATA_LINES);
spin_lock(&l_mg->data_line->lock);
if (l_mg->data_line->map_bitmap)
map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
lm->sec_per_line);
else
map_weight = 0;
spin_unlock(&l_mg->data_line->lock);
}
spin_unlock(&l_mg->free_lock);
if (nr_free_lines != free_line_cnt)
pblk_err(pblk, "corrupted free line list:%d/%d\n",
nr_free_lines, free_line_cnt);
sz = scnprintf(page, PAGE_SIZE - sz,
"line: nluns:%d, nblks:%d, nsecs:%d\n",
geo->all_luns, lm->blk_per_line, lm->sec_per_line);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
cur_data, cur_log,
nr_free_lines,
emeta_line_cnt, meta_weight,
closed_line_cnt,
bad, cor,
d_line_cnt, l_line_cnt,
l_mg->nr_lines);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n",
gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr,
atomic_read(&pblk->gc.read_inflight_gc));
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
cur_data, cur_sec, msecs, vsc, sec_in_line,
map_weight, lm->sec_per_line,
atomic_read(&pblk->inflight_io));
return sz;
}
static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
ssize_t sz = 0;
sz = scnprintf(page, PAGE_SIZE - sz,
"smeta - len:%d, secs:%d\n",
lm->smeta_len, lm->smeta_sec);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"emeta - len:%d, sec:%d, bb_start:%d\n",
lm->emeta_len[0], lm->emeta_sec[0],
lm->emeta_bb);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"bitmap lengths: sec:%d, blk:%d, lun:%d\n",
lm->sec_bitmap_len,
lm->blk_bitmap_len,
lm->lun_bitmap_len);
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"blk_line:%d, sec_line:%d, sec_blk:%d\n",
lm->blk_per_line,
lm->sec_per_line,
geo->clba);
return sz;
}
static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page)
{
return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write);
}
static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad,
char *page)
{
int sz;
sz = scnprintf(page, PAGE_SIZE,
"user:%lld gc:%lld pad:%lld WA:",
user, gc, pad);
if (!user) {
sz += scnprintf(page + sz, PAGE_SIZE - sz, "NaN\n");
} else {
u64 wa_int;
u32 wa_frac;
wa_int = (user + gc + pad) * 100000;
wa_int = div64_u64(wa_int, user);
wa_int = div_u64_rem(wa_int, 100000, &wa_frac);
sz += scnprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n",
wa_int, wa_frac);
}
return sz;
}
static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page)
{
return pblk_get_write_amp(atomic64_read(&pblk->user_wa),
atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa),
page);
}
static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page)
{
return pblk_get_write_amp(
atomic64_read(&pblk->user_wa) - pblk->user_rst_wa,
atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa,
atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page);
}
static long long bucket_percentage(unsigned long long bucket,
unsigned long long total)
{
int p = bucket * 100;
p = div_u64(p, total);
return p;
}
static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
{
int sz = 0;
unsigned long long total;
unsigned long long total_buckets = 0;
int buckets = pblk->min_write_pgs - 1;
int i;
total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst;
if (!total) {
for (i = 0; i < (buckets + 1); i++)
sz += scnprintf(page + sz, PAGE_SIZE - sz,
"%d:0 ", i);
sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n");
return sz;
}
for (i = 0; i < buckets; i++)
total_buckets += atomic64_read(&pblk->pad_dist[i]);
sz += scnprintf(page + sz, PAGE_SIZE - sz, "0:%lld%% ",
bucket_percentage(total - total_buckets, total));
for (i = 0; i < buckets; i++) {
unsigned long long p;
p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]),
total);
sz += scnprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ",
i + 1, p);
}
sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n");
return sz;
}
#ifdef CONFIG_NVM_PBLK_DEBUG
static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
{
return snprintf(page, PAGE_SIZE,
"%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
atomic_long_read(&pblk->inflight_writes),
atomic_long_read(&pblk->inflight_reads),
atomic_long_read(&pblk->req_writes),
(u64)atomic64_read(&pblk->nr_flush),
atomic_long_read(&pblk->padded_writes),
atomic_long_read(&pblk->padded_wb),
atomic_long_read(&pblk->sub_writes),
atomic_long_read(&pblk->sync_writes),
atomic_long_read(&pblk->recov_writes),
atomic_long_read(&pblk->recov_gc_writes),
atomic_long_read(&pblk->recov_gc_reads),
atomic_long_read(&pblk->cache_reads),
atomic_long_read(&pblk->sync_reads));
}
#endif
static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
size_t len)
{
size_t c_len;
int force;
c_len = strcspn(page, "\n");
if (c_len >= len)
return -EINVAL;
if (kstrtouint(page, 0, &force))
return -EINVAL;
pblk_gc_sysfs_force(pblk, force);
return len;
}
static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
const char *page, size_t len)
{
size_t c_len;
int sec_per_write;
c_len = strcspn(page, "\n");
if (c_len >= len)
return -EINVAL;
if (kstrtouint(page, 0, &sec_per_write))
return -EINVAL;
if (!pblk_is_oob_meta_supported(pblk)) {
/* For packed metadata case it is
* not allowed to change sec_per_write.
*/
return -EINVAL;
}
if (sec_per_write < pblk->min_write_pgs
|| sec_per_write > pblk->max_write_pgs
|| sec_per_write % pblk->min_write_pgs != 0)
return -EINVAL;
pblk_set_sec_per_write(pblk, sec_per_write);
return len;
}
static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk,
const char *page, size_t len)
{
size_t c_len;
int reset_value;
c_len = strcspn(page, "\n");
if (c_len >= len)
return -EINVAL;
if (kstrtouint(page, 0, &reset_value))
return -EINVAL;
if (reset_value != 0)
return -EINVAL;
pblk->user_rst_wa = atomic64_read(&pblk->user_wa);
pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa);
pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa);
return len;
}
static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk,
const char *page, size_t len)
{
size_t c_len;
int reset_value;
int buckets = pblk->min_write_pgs - 1;
int i;
c_len = strcspn(page, "\n");
if (c_len >= len)
return -EINVAL;
if (kstrtouint(page, 0, &reset_value))
return -EINVAL;
if (reset_value != 0)
return -EINVAL;
for (i = 0; i < buckets; i++)
atomic64_set(&pblk->pad_dist[i], 0);
pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush);
return len;
}
static struct attribute sys_write_luns = {
.name = "write_luns",
.mode = 0444,
};
static struct attribute sys_rate_limiter_attr = {
.name = "rate_limiter",
.mode = 0444,
};
static struct attribute sys_gc_state = {
.name = "gc_state",
.mode = 0444,
};
static struct attribute sys_errors_attr = {
.name = "errors",
.mode = 0444,
};
static struct attribute sys_rb_attr = {
.name = "write_buffer",
.mode = 0444,
};
static struct attribute sys_stats_ppaf_attr = {
.name = "ppa_format",
.mode = 0444,
};
static struct attribute sys_lines_attr = {
.name = "lines",
.mode = 0444,
};
static struct attribute sys_lines_info_attr = {
.name = "lines_info",
.mode = 0444,
};
static struct attribute sys_gc_force = {
.name = "gc_force",
.mode = 0200,
};
static struct attribute sys_max_sec_per_write = {
.name = "max_sec_per_write",
.mode = 0644,
};
static struct attribute sys_write_amp_mileage = {
.name = "write_amp_mileage",
.mode = 0444,
};
static struct attribute sys_write_amp_trip = {
.name = "write_amp_trip",
.mode = 0644,
};
static struct attribute sys_padding_dist = {
.name = "padding_dist",
.mode = 0644,
};
#ifdef CONFIG_NVM_PBLK_DEBUG
static struct attribute sys_stats_debug_attr = {
.name = "stats",
.mode = 0444,
};
#endif
static struct attribute *pblk_attrs[] = {
&sys_write_luns,
&sys_rate_limiter_attr,
&sys_errors_attr,
&sys_gc_state,
&sys_gc_force,
&sys_max_sec_per_write,
&sys_rb_attr,
&sys_stats_ppaf_attr,
&sys_lines_attr,
&sys_lines_info_attr,
&sys_write_amp_mileage,
&sys_write_amp_trip,
&sys_padding_dist,
#ifdef CONFIG_NVM_PBLK_DEBUG
&sys_stats_debug_attr,
#endif
NULL,
};
static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct pblk *pblk = container_of(kobj, struct pblk, kobj);
if (strcmp(attr->name, "rate_limiter") == 0)
return pblk_sysfs_rate_limiter(pblk, buf);
else if (strcmp(attr->name, "write_luns") == 0)
return pblk_sysfs_luns_show(pblk, buf);
else if (strcmp(attr->name, "gc_state") == 0)
return pblk_sysfs_gc_state_show(pblk, buf);
else if (strcmp(attr->name, "errors") == 0)
return pblk_sysfs_stats(pblk, buf);
else if (strcmp(attr->name, "write_buffer") == 0)
return pblk_sysfs_write_buffer(pblk, buf);
else if (strcmp(attr->name, "ppa_format") == 0)
return pblk_sysfs_ppaf(pblk, buf);
else if (strcmp(attr->name, "lines") == 0)
return pblk_sysfs_lines(pblk, buf);
else if (strcmp(attr->name, "lines_info") == 0)
return pblk_sysfs_lines_info(pblk, buf);
else if (strcmp(attr->name, "max_sec_per_write") == 0)
return pblk_sysfs_get_sec_per_write(pblk, buf);
else if (strcmp(attr->name, "write_amp_mileage") == 0)
return pblk_sysfs_get_write_amp_mileage(pblk, buf);
else if (strcmp(attr->name, "write_amp_trip") == 0)
return pblk_sysfs_get_write_amp_trip(pblk, buf);
else if (strcmp(attr->name, "padding_dist") == 0)
return pblk_sysfs_get_padding_dist(pblk, buf);
#ifdef CONFIG_NVM_PBLK_DEBUG
else if (strcmp(attr->name, "stats") == 0)
return pblk_sysfs_stats_debug(pblk, buf);
#endif
return 0;
}
static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t len)
{
struct pblk *pblk = container_of(kobj, struct pblk, kobj);
if (strcmp(attr->name, "gc_force") == 0)
return pblk_sysfs_gc_force(pblk, buf, len);
else if (strcmp(attr->name, "max_sec_per_write") == 0)
return pblk_sysfs_set_sec_per_write(pblk, buf, len);
else if (strcmp(attr->name, "write_amp_trip") == 0)
return pblk_sysfs_set_write_amp_trip(pblk, buf, len);
else if (strcmp(attr->name, "padding_dist") == 0)
return pblk_sysfs_set_padding_dist(pblk, buf, len);
return 0;
}
static const struct sysfs_ops pblk_sysfs_ops = {
.show = pblk_sysfs_show,
.store = pblk_sysfs_store,
};
static struct kobj_type pblk_ktype = {
.sysfs_ops = &pblk_sysfs_ops,
.default_attrs = pblk_attrs,
};
int pblk_sysfs_init(struct gendisk *tdisk)
{
struct pblk *pblk = tdisk->private_data;
struct device *parent_dev = disk_to_dev(pblk->disk);
int ret;
ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
kobject_get(&parent_dev->kobj),
"%s", "pblk");
if (ret) {
pblk_err(pblk, "could not register\n");
return ret;
}
kobject_uevent(&pblk->kobj, KOBJ_ADD);
return 0;
}
void pblk_sysfs_exit(struct gendisk *tdisk)
{
struct pblk *pblk = tdisk->private_data;
kobject_uevent(&pblk->kobj, KOBJ_REMOVE);
kobject_del(&pblk->kobj);
kobject_put(&pblk->kobj);
}

View File

@ -1,145 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pblk
#if !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PBLK_H
#include <linux/tracepoint.h>
struct ppa_addr;
#define show_chunk_flags(state) __print_flags(state, "", \
{ NVM_CHK_ST_FREE, "FREE", }, \
{ NVM_CHK_ST_CLOSED, "CLOSED", }, \
{ NVM_CHK_ST_OPEN, "OPEN", }, \
{ NVM_CHK_ST_OFFLINE, "OFFLINE", })
#define show_line_state(state) __print_symbolic(state, \
{ PBLK_LINESTATE_NEW, "NEW", }, \
{ PBLK_LINESTATE_FREE, "FREE", }, \
{ PBLK_LINESTATE_OPEN, "OPEN", }, \
{ PBLK_LINESTATE_CLOSED, "CLOSED", }, \
{ PBLK_LINESTATE_GC, "GC", }, \
{ PBLK_LINESTATE_BAD, "BAD", }, \
{ PBLK_LINESTATE_CORRUPT, "CORRUPT" })
#define show_pblk_state(state) __print_symbolic(state, \
{ PBLK_STATE_RUNNING, "RUNNING", }, \
{ PBLK_STATE_STOPPING, "STOPPING", }, \
{ PBLK_STATE_RECOVERING, "RECOVERING", }, \
{ PBLK_STATE_STOPPED, "STOPPED" })
#define show_chunk_erase_state(state) __print_symbolic(state, \
{ PBLK_CHUNK_RESET_START, "START", }, \
{ PBLK_CHUNK_RESET_DONE, "OK", }, \
{ PBLK_CHUNK_RESET_FAILED, "FAILED" })
TRACE_EVENT(pblk_chunk_reset,
TP_PROTO(const char *name, struct ppa_addr *ppa, int state),
TP_ARGS(name, ppa, state),
TP_STRUCT__entry(
__string(name, name)
__field(u64, ppa)
__field(int, state)
),
TP_fast_assign(
__assign_str(name, name);
__entry->ppa = ppa->ppa;
__entry->state = state;
),
TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name),
(u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp),
(u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu),
(u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk),
show_chunk_erase_state((int)__entry->state))
);
TRACE_EVENT(pblk_chunk_state,
TP_PROTO(const char *name, struct ppa_addr *ppa, int state),
TP_ARGS(name, ppa, state),
TP_STRUCT__entry(
__string(name, name)
__field(u64, ppa)
__field(int, state)
),
TP_fast_assign(
__assign_str(name, name);
__entry->ppa = ppa->ppa;
__entry->state = state;
),
TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name),
(u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp),
(u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu),
(u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk),
show_chunk_flags((int)__entry->state))
);
TRACE_EVENT(pblk_line_state,
TP_PROTO(const char *name, int line, int state),
TP_ARGS(name, line, state),
TP_STRUCT__entry(
__string(name, name)
__field(int, line)
__field(int, state)
),
TP_fast_assign(
__assign_str(name, name);
__entry->line = line;
__entry->state = state;
),
TP_printk("dev=%s line=%d state=%s", __get_str(name),
(int)__entry->line,
show_line_state((int)__entry->state))
);
TRACE_EVENT(pblk_state,
TP_PROTO(const char *name, int state),
TP_ARGS(name, state),
TP_STRUCT__entry(
__string(name, name)
__field(int, state)
),
TP_fast_assign(
__assign_str(name, name);
__entry->state = state;
),
TP_printk("dev=%s state=%s", __get_str(name),
show_pblk_state((int)__entry->state))
);
#endif /* !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) */
/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../drivers/lightnvm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE pblk-trace
#include <trace/define_trace.h>

View File

@ -1,665 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 CNEX Labs
* Initial release: Javier Gonzalez <javier@cnexlabs.com>
* Matias Bjorling <matias@cnexlabs.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* pblk-write.c - pblk's write path from write buffer to media
*/
#include "pblk.h"
#include "pblk-trace.h"
static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct bio *original_bio;
struct pblk_rb *rwb = &pblk->rwb;
unsigned long ret;
int i;
for (i = 0; i < c_ctx->nr_valid; i++) {
struct pblk_w_ctx *w_ctx;
int pos = c_ctx->sentry + i;
int flags;
w_ctx = pblk_rb_w_ctx(rwb, pos);
flags = READ_ONCE(w_ctx->flags);
if (flags & PBLK_FLUSH_ENTRY) {
flags &= ~PBLK_FLUSH_ENTRY;
/* Release flags on context. Protect from writes */
smp_store_release(&w_ctx->flags, flags);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_dec(&rwb->inflight_flush_point);
#endif
}
while ((original_bio = bio_list_pop(&w_ctx->bios)))
bio_endio(original_bio);
}
if (c_ctx->nr_padded)
pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
c_ctx->nr_padded);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
#endif
ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, PBLK_WRITE);
return ret;
}
static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
list_del(&c_ctx->list);
return pblk_end_w_bio(pblk, rqd, c_ctx);
}
static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct pblk_c_ctx *c, *r;
unsigned long flags;
unsigned long pos;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
#endif
pblk_up_rq(pblk, c_ctx->lun_bitmap);
pos = pblk_rb_sync_init(&pblk->rwb, &flags);
if (pos == c_ctx->sentry) {
pos = pblk_end_w_bio(pblk, rqd, c_ctx);
retry:
list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
rqd = nvm_rq_from_c_ctx(c);
if (c->sentry == pos) {
pos = pblk_end_queued_w_bio(pblk, rqd, c);
goto retry;
}
}
} else {
WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
list_add_tail(&c_ctx->list, &pblk->compl_list);
}
pblk_rb_sync_end(&pblk->rwb, &flags);
}
/* Map remaining sectors in chunk, starting from ppa */
static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa,
int rqd_ppas)
{
struct pblk_line *line;
struct ppa_addr map_ppa = *ppa;
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
__le64 *lba_list;
u64 paddr;
int done = 0;
int n = 0;
line = pblk_ppa_to_line(pblk, *ppa);
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
spin_lock(&line->lock);
while (!done) {
paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
if (!test_and_set_bit(paddr, line->map_bitmap))
line->left_msecs--;
if (n < rqd_ppas && lba_list[paddr] != addr_empty)
line->nr_valid_lbas--;
lba_list[paddr] = addr_empty;
if (!test_and_set_bit(paddr, line->invalid_bitmap))
le32_add_cpu(line->vsc, -1);
done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa);
n++;
}
line->w_err_gc->has_write_err = 1;
spin_unlock(&line->lock);
}
static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
unsigned int nr_entries)
{
struct pblk_rb *rb = &pblk->rwb;
struct pblk_rb_entry *entry;
struct pblk_line *line;
struct pblk_w_ctx *w_ctx;
struct ppa_addr ppa_l2p;
int flags;
unsigned int i;
spin_lock(&pblk->trans_lock);
for (i = 0; i < nr_entries; i++) {
entry = &rb->entries[pblk_rb_ptr_wrap(rb, sentry, i)];
w_ctx = &entry->w_ctx;
/* Check if the lba has been overwritten */
if (w_ctx->lba != ADDR_EMPTY) {
ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
w_ctx->lba = ADDR_EMPTY;
}
/* Mark up the entry as submittable again */
flags = READ_ONCE(w_ctx->flags);
flags |= PBLK_WRITTEN_DATA;
/* Release flags on write context. Protect from writes */
smp_store_release(&w_ctx->flags, flags);
/* Decrease the reference count to the line as we will
* re-map these entries
*/
line = pblk_ppa_to_line(pblk, w_ctx->ppa);
atomic_dec(&line->sec_to_update);
kref_put(&line->ref, pblk_line_put);
}
spin_unlock(&pblk->trans_lock);
}
static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
{
struct pblk_c_ctx *r_ctx;
r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
if (!r_ctx)
return;
r_ctx->lun_bitmap = NULL;
r_ctx->sentry = c_ctx->sentry;
r_ctx->nr_valid = c_ctx->nr_valid;
r_ctx->nr_padded = c_ctx->nr_padded;
spin_lock(&pblk->resubmit_lock);
list_add_tail(&r_ctx->list, &pblk->resubmit_list);
spin_unlock(&pblk->resubmit_lock);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
#endif
}
static void pblk_submit_rec(struct work_struct *work)
{
struct pblk_rec_ctx *recovery =
container_of(work, struct pblk_rec_ctx, ws_rec);
struct pblk *pblk = recovery->pblk;
struct nvm_rq *rqd = recovery->rqd;
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
pblk_log_write_err(pblk, rqd);
pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas);
pblk_queue_resubmit(pblk, c_ctx);
pblk_up_rq(pblk, c_ctx->lun_bitmap);
if (c_ctx->nr_padded)
pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
c_ctx->nr_padded);
bio_put(rqd->bio);
pblk_free_rqd(pblk, rqd, PBLK_WRITE);
mempool_free(recovery, &pblk->rec_pool);
atomic_dec(&pblk->inflight_io);
pblk_write_kick(pblk);
}
static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
{
struct pblk_rec_ctx *recovery;
recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
if (!recovery) {
pblk_err(pblk, "could not allocate recovery work\n");
return;
}
recovery->pblk = pblk;
recovery->rqd = rqd;
INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
queue_work(pblk->close_wq, &recovery->ws_rec);
}
static void pblk_end_io_write(struct nvm_rq *rqd)
{
struct pblk *pblk = rqd->private;
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
if (rqd->error) {
pblk_end_w_fail(pblk, rqd);
return;
} else {
if (trace_pblk_chunk_state_enabled())
pblk_check_chunk_state_update(pblk, rqd);
#ifdef CONFIG_NVM_PBLK_DEBUG
WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
#endif
}
pblk_complete_write(pblk, rqd, c_ctx);
atomic_dec(&pblk->inflight_io);
}
static void pblk_end_io_write_meta(struct nvm_rq *rqd)
{
struct pblk *pblk = rqd->private;
struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd);
struct pblk_line *line = m_ctx->private;
struct pblk_emeta *emeta = line->emeta;
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
int sync;
pblk_up_chunk(pblk, ppa_list[0]);
if (rqd->error) {
pblk_log_write_err(pblk, rqd);
pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id);
line->w_err_gc->has_write_err = 1;
} else {
if (trace_pblk_chunk_state_enabled())
pblk_check_chunk_state_update(pblk, rqd);
}
sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
if (sync == emeta->nr_entries)
pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws,
GFP_ATOMIC, pblk->close_wq);
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
atomic_dec(&pblk->inflight_io);
}
static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int nr_secs, nvm_end_io_fn(*end_io))
{
/* Setup write request */
rqd->opcode = NVM_OP_PWRITE;
rqd->nr_ppas = nr_secs;
rqd->is_seq = 1;
rqd->private = pblk;
rqd->end_io = end_io;
return pblk_alloc_rqd_meta(pblk, rqd);
}
static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct ppa_addr *erase_ppa)
{
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line *e_line = pblk_line_get_erase(pblk);
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
unsigned int valid = c_ctx->nr_valid;
unsigned int padded = c_ctx->nr_padded;
unsigned int nr_secs = valid + padded;
unsigned long *lun_bitmap;
int ret;
lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
if (!lun_bitmap)
return -ENOMEM;
c_ctx->lun_bitmap = lun_bitmap;
ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write);
if (ret) {
kfree(lun_bitmap);
return ret;
}
if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
valid, 0);
else
ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
valid, erase_ppa);
return ret;
}
static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
unsigned int secs_to_flush)
{
int secs_to_sync;
secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true);
#ifdef CONFIG_NVM_PBLK_DEBUG
if ((!secs_to_sync && secs_to_flush)
|| (secs_to_sync < 0)
|| (secs_to_sync > secs_avail && !secs_to_flush)) {
pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n",
secs_avail, secs_to_sync, secs_to_flush);
}
#endif
return secs_to_sync;
}
int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_emeta *emeta = meta_line->emeta;
struct ppa_addr *ppa_list;
struct pblk_g_ctx *m_ctx;
struct nvm_rq *rqd;
void *data;
u64 paddr;
int rq_ppas = pblk->min_write_pgs;
int id = meta_line->id;
int rq_len;
int i, j;
int ret;
rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
m_ctx = nvm_rq_to_pdu(rqd);
m_ctx->private = meta_line;
rq_len = rq_ppas * geo->csecs;
data = ((void *)emeta->buf) + emeta->mem;
ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta);
if (ret)
goto fail_free_rqd;
ppa_list = nvm_rq_to_ppa_list(rqd);
for (i = 0; i < rqd->nr_ppas; ) {
spin_lock(&meta_line->lock);
paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas);
spin_unlock(&meta_line->lock);
for (j = 0; j < rq_ppas; j++, i++, paddr++)
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id);
}
spin_lock(&l_mg->close_lock);
emeta->mem += rq_len;
if (emeta->mem >= lm->emeta_len[0])
list_del(&meta_line->list);
spin_unlock(&l_mg->close_lock);
pblk_down_chunk(pblk, ppa_list[0]);
ret = pblk_submit_io(pblk, rqd, data);
if (ret) {
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
goto fail_rollback;
}
return NVM_IO_OK;
fail_rollback:
pblk_up_chunk(pblk, ppa_list[0]);
spin_lock(&l_mg->close_lock);
pblk_dealloc_page(pblk, meta_line, rq_ppas);
list_add(&meta_line->list, &meta_line->list);
spin_unlock(&l_mg->close_lock);
fail_free_rqd:
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
return ret;
}
static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
struct pblk_line *meta_line,
struct nvm_rq *data_rqd)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
struct pblk_line *data_line = pblk_line_get_data(pblk);
struct ppa_addr ppa, ppa_opt;
u64 paddr;
int pos_opt;
/* Schedule a metadata I/O that is half the distance from the data I/O
* with regards to the number of LUNs forming the pblk instance. This
* balances LUN conflicts across every I/O.
*
* When the LUN configuration changes (e.g., due to GC), this distance
* can align, which would result on metadata and data I/Os colliding. In
* this case, modify the distance to not be optimal, but move the
* optimal in the right direction.
*/
paddr = pblk_lookup_page(pblk, meta_line);
ppa = addr_to_gen_ppa(pblk, paddr, 0);
ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
pos_opt = pblk_ppa_to_pos(geo, ppa_opt);
if (test_bit(pos_opt, data_c_ctx->lun_bitmap) ||
test_bit(pos_opt, data_line->blk_bitmap))
return true;
if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
data_line->meta_distance--;
return false;
}
static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
struct nvm_rq *data_rqd)
{
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *meta_line;
spin_lock(&l_mg->close_lock);
if (list_empty(&l_mg->emeta_list)) {
spin_unlock(&l_mg->close_lock);
return NULL;
}
meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list);
if (meta_line->emeta->mem >= lm->emeta_len[0]) {
spin_unlock(&l_mg->close_lock);
return NULL;
}
spin_unlock(&l_mg->close_lock);
if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd))
return NULL;
return meta_line;
}
static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
{
struct ppa_addr erase_ppa;
struct pblk_line *meta_line;
int err;
pblk_ppa_set_empty(&erase_ppa);
/* Assign lbas to ppas and populate request structure */
err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
if (err) {
pblk_err(pblk, "could not setup write request: %d\n", err);
return NVM_IO_ERR;
}
meta_line = pblk_should_submit_meta_io(pblk, rqd);
/* Submit data write for current data line */
err = pblk_submit_io(pblk, rqd, NULL);
if (err) {
pblk_err(pblk, "data I/O submission failed: %d\n", err);
return NVM_IO_ERR;
}
if (!pblk_ppa_empty(erase_ppa)) {
/* Submit erase for next data line */
if (pblk_blk_erase_async(pblk, erase_ppa)) {
struct pblk_line *e_line = pblk_line_get_erase(pblk);
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int bit;
atomic_inc(&e_line->left_eblks);
bit = pblk_ppa_to_pos(geo, erase_ppa);
WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
}
}
if (meta_line) {
/* Submit metadata write for previous data line */
err = pblk_submit_meta_io(pblk, meta_line);
if (err) {
pblk_err(pblk, "metadata I/O submission failed: %d",
err);
return NVM_IO_ERR;
}
}
return NVM_IO_OK;
}
static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
{
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = rqd->bio;
if (c_ctx->nr_padded)
pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid,
c_ctx->nr_padded);
}
static int pblk_submit_write(struct pblk *pblk, int *secs_left)
{
struct bio *bio;
struct nvm_rq *rqd;
unsigned int secs_avail, secs_to_sync, secs_to_com;
unsigned int secs_to_flush, packed_meta_pgs;
unsigned long pos;
unsigned int resubmit;
*secs_left = 0;
spin_lock(&pblk->resubmit_lock);
resubmit = !list_empty(&pblk->resubmit_list);
spin_unlock(&pblk->resubmit_lock);
/* Resubmit failed writes first */
if (resubmit) {
struct pblk_c_ctx *r_ctx;
spin_lock(&pblk->resubmit_lock);
r_ctx = list_first_entry(&pblk->resubmit_list,
struct pblk_c_ctx, list);
list_del(&r_ctx->list);
spin_unlock(&pblk->resubmit_lock);
secs_avail = r_ctx->nr_valid;
pos = r_ctx->sentry;
pblk_prepare_resubmit(pblk, pos, secs_avail);
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
secs_avail);
kfree(r_ctx);
} else {
/* If there are no sectors in the cache,
* flushes (bios without data) will be cleared on
* the cache threads
*/
secs_avail = pblk_rb_read_count(&pblk->rwb);
if (!secs_avail)
return 0;
secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data)
return 0;
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
pblk_err(pblk, "bad buffer sync calculation\n");
return 0;
}
secs_to_com = (secs_to_sync > secs_avail) ?
secs_avail : secs_to_sync;
pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
}
packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data);
bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs);
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
rqd->bio = bio;
if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
secs_avail)) {
pblk_err(pblk, "corrupted write bio\n");
goto fail_put_bio;
}
if (pblk_submit_io_set(pblk, rqd))
goto fail_free_bio;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif
*secs_left = 1;
return 0;
fail_free_bio:
pblk_free_write_rqd(pblk, rqd);
fail_put_bio:
bio_put(bio);
pblk_free_rqd(pblk, rqd, PBLK_WRITE);
return -EINTR;
}
int pblk_write_ts(void *data)
{
struct pblk *pblk = data;
int secs_left;
int write_failure = 0;
while (!kthread_should_stop()) {
if (!write_failure) {
write_failure = pblk_submit_write(pblk, &secs_left);
if (secs_left)
continue;
}
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -1329,6 +1329,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct raid1_plug_cb *plug = NULL;
int first_clone;
int max_sectors;
bool write_behind = false;
if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@ -1381,6 +1382,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
/*
* The write-behind io is only attempted on drives marked as
* write-mostly, which means we could allocate write behind
* bio later.
*/
if (rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
@ -1454,6 +1464,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
goto retry_write;
}
/*
* When using a bitmap, we may call alloc_behind_master_bio below.
* alloc_behind_master_bio allocates a copy of the data payload a page
* at a time and thus needs a new bio that can fit the whole payload
* this bio in page sized chunks.
*/
if (write_behind && bitmap)
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, &conf->bio_split);

View File

@ -1712,6 +1712,11 @@ retry_discard:
} else
r10_bio->master_bio = (struct bio *)first_r10bio;
/*
* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
@ -1743,9 +1748,6 @@ retry_discard:
for (disk = 0; disk < geo->raid_disks; disk++) {
sector_t dev_start, dev_end;
struct bio *mbio, *rbio = NULL;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[disk].replacement);
/*
* Now start to calculate the start and end address for each disk.
@ -1775,9 +1777,12 @@ retry_discard:
/*
* It only handles discard bio which size is >= stripe size, so
* dev_end > dev_start all the time
* dev_end > dev_start all the time.
* It doesn't need to use rcu lock to get rdev here. We already
* add rdev->nr_pending in the first loop.
*/
if (r10_bio->devs[disk].bio) {
struct md_rdev *rdev = conf->mirrors[disk].rdev;
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
mbio->bi_end_io = raid10_end_discard_request;
mbio->bi_private = r10_bio;
@ -1790,6 +1795,7 @@ retry_discard:
bio_endio(mbio);
}
if (r10_bio->devs[disk].repl_bio) {
struct md_rdev *rrdev = conf->mirrors[disk].replacement;
rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
rbio->bi_end_io = raid10_end_discard_request;
rbio->bi_private = r10_bio;

View File

@ -33,12 +33,12 @@ config NVME_HWMON
in the system.
config NVME_FABRICS
select NVME_CORE
tristate
config NVME_RDMA
tristate "NVM Express over Fabrics RDMA host driver"
depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK
select NVME_CORE
select NVME_FABRICS
select SG_POOL
help
@ -55,7 +55,6 @@ config NVME_FC
tristate "NVM Express over Fabrics FC host driver"
depends on BLOCK
depends on HAS_DMA
select NVME_CORE
select NVME_FABRICS
select SG_POOL
help
@ -72,7 +71,6 @@ config NVME_TCP
tristate "NVM Express over Fabrics TCP host driver"
depends on INET
depends on BLOCK
select NVME_CORE
select NVME_FABRICS
select CRYPTO
select CRYPTO_CRC32C

View File

@ -12,7 +12,6 @@ obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
nvme-core-y := core.o ioctl.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o

View File

@ -587,9 +587,6 @@ static void nvme_free_ns(struct kref *kref)
{
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
if (ns->ndev)
nvme_nvm_unregister(ns);
put_disk(ns->disk);
nvme_put_ns_head(ns->head);
nvme_put_ctrl(ns->ctrl);
@ -1028,7 +1025,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
return BLK_STS_IOERR;
}
cmd->common.command_id = req->tag;
nvme_req(req)->genctr++;
cmd->common.command_id = nvme_cid(req);
trace_nvme_setup_cmd(req, cmd);
return ret;
}
@ -3217,9 +3215,6 @@ static const struct attribute_group nvme_ns_id_attr_group = {
const struct attribute_group *nvme_ns_id_attr_groups[] = {
&nvme_ns_id_attr_group,
#ifdef CONFIG_NVM
&nvme_nvm_attr_group,
#endif
NULL,
};
@ -3762,13 +3757,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (nvme_update_ns_info(ns, id))
goto out_unlink_ns;
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
if (nvme_nvm_register(ns, disk->disk_name, node)) {
dev_warn(ctrl->device, "LightNVM init failure\n");
goto out_unlink_ns;
}
}
down_write(&ctrl->namespaces_rwsem);
list_add_tail(&ns->list, &ctrl->namespaces);
up_write(&ctrl->namespaces_rwsem);

View File

@ -719,7 +719,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
ret = -EINVAL;
goto out;
}
nvmf_host_put(opts->host);
opts->host = nvmf_host_add(p);
kfree(p);
if (!opts->host) {

View File

@ -342,9 +342,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
case NVME_IOCTL_IO64_CMD:
return nvme_user_cmd64(ns->ctrl, ns, argp);
default:
if (!ns->ndev)
return -ENOTTY;
return nvme_nvm_ioctl(ns, cmd, argp);
return -ENOTTY;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -11,7 +11,6 @@
#include <linux/pci.h>
#include <linux/kref.h>
#include <linux/blk-mq.h>
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
@ -48,11 +47,6 @@ extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
enum {
NVME_NS_LBA = 0,
NVME_NS_LIGHTNVM = 1,
};
/*
* List of workarounds for devices that required behavior not specified in
* the standard.
@ -92,11 +86,6 @@ enum nvme_quirks {
*/
NVME_QUIRK_NO_DEEPEST_PS = (1 << 5),
/*
* Supports the LighNVM command set if indicated in vs[1].
*/
NVME_QUIRK_LIGHTNVM = (1 << 6),
/*
* Set MEDIUM priority on SQ creation
*/
@ -158,6 +147,7 @@ enum nvme_quirks {
struct nvme_request {
struct nvme_command *cmd;
union nvme_result result;
u8 genctr;
u8 retries;
u8 flags;
u16 status;
@ -449,7 +439,6 @@ struct nvme_ns {
u32 ana_grpid;
#endif
struct list_head siblings;
struct nvm_dev *ndev;
struct kref kref;
struct nvme_ns_head *head;
@ -497,6 +486,49 @@ struct nvme_ctrl_ops {
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
};
/*
* nvme command_id is constructed as such:
* | xxxx | xxxxxxxxxxxx |
* gen request tag
*/
#define nvme_genctr_mask(gen) (gen & 0xf)
#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12)
#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12)
#define nvme_tag_from_cid(cid) (cid & 0xfff)
static inline u16 nvme_cid(struct request *rq)
{
return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
}
static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
u16 command_id)
{
u8 genctr = nvme_genctr_from_cid(command_id);
u16 tag = nvme_tag_from_cid(command_id);
struct request *rq;
rq = blk_mq_tag_to_rq(tags, tag);
if (unlikely(!rq)) {
pr_err("could not locate request for tag %#x\n",
tag);
return NULL;
}
if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) {
dev_err(nvme_req(rq)->ctrl->device,
"request %#x genctr mismatch (got %#x expected %#x)\n",
tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
return NULL;
}
return rq;
}
static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags,
u16 command_id)
{
return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id));
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
const char *dev_name);
@ -594,7 +626,8 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
{
return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH;
return !qid &&
nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH;
}
void nvme_complete_rq(struct request *req);
@ -823,26 +856,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
}
#endif
#ifdef CONFIG_NVM
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
void nvme_nvm_unregister(struct nvme_ns *ns);
extern const struct attribute_group nvme_nvm_attr_group;
int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp);
#else
static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
int node)
{
return 0;
}
static inline void nvme_nvm_unregister(struct nvme_ns *ns) {};
static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp)
{
return -ENOTTY;
}
#endif /* CONFIG_NVM */
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
{
return dev_to_disk(dev)->private_data;

View File

@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold,
"Use SGLs when average request segment size is larger or equal to "
"this size. Use 0 to disable SGLs.");
#define NVME_PCI_MIN_QUEUE_SIZE 2
#define NVME_PCI_MAX_QUEUE_SIZE 4095
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
.set = io_queue_depth_set,
@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = {
static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096");
static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
@ -135,6 +137,7 @@ struct nvme_dev {
u32 cmbloc;
struct nvme_ctrl ctrl;
u32 last_ps;
bool hmb;
mempool_t *iod_mempool;
@ -153,18 +156,14 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
bool attrs_added;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
int ret;
u32 n;
ret = kstrtou32(val, 10, &n);
if (ret != 0 || n < 2)
return -EINVAL;
return param_set_uint(val, kp);
return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
NVME_PCI_MAX_QUEUE_SIZE);
}
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@ -1014,7 +1013,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
return;
}
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id);
req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
if (unlikely(!req)) {
dev_warn(nvmeq->dev->ctrl.device,
"invalid id %d completed on queue %d\n",
@ -1808,17 +1807,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
return ret >= 0 ? 0 : ret;
}
static ssize_t nvme_cmb_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n",
ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
@ -1887,20 +1875,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
pci_p2pmem_publish(pdev, true);
if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
&dev_attr_cmb.attr, NULL))
dev_warn(dev->ctrl.device,
"failed to add sysfs attribute for CMB\n");
}
static inline void nvme_release_cmb(struct nvme_dev *dev)
{
if (dev->cmb_size) {
sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
&dev_attr_cmb.attr, NULL);
dev->cmb_size = 0;
}
}
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
@ -1923,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
dev_warn(dev->ctrl.device,
"failed to set host mem (err %d, flags %#x).\n",
ret, bits);
}
} else
dev->hmb = bits & NVME_HOST_MEM_ENABLE;
return ret;
}
@ -2080,6 +2056,102 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
return ret;
}
static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n",
ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmb);
static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
return sysfs_emit(buf, "%u\n", ndev->cmbloc);
}
static DEVICE_ATTR_RO(cmbloc);
static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
return sysfs_emit(buf, "%u\n", ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmbsz);
static ssize_t hmb_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
return sysfs_emit(buf, "%d\n", ndev->hmb);
}
static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
bool new;
int ret;
if (strtobool(buf, &new) < 0)
return -EINVAL;
if (new == ndev->hmb)
return count;
if (new) {
ret = nvme_setup_host_mem(ndev);
} else {
ret = nvme_set_host_mem(ndev, 0);
if (!ret)
nvme_free_host_mem(ndev);
}
if (ret < 0)
return ret;
return count;
}
static DEVICE_ATTR_RW(hmb);
static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct nvme_ctrl *ctrl =
dev_get_drvdata(container_of(kobj, struct device, kobj));
struct nvme_dev *dev = to_nvme_dev(ctrl);
if (a == &dev_attr_cmb.attr ||
a == &dev_attr_cmbloc.attr ||
a == &dev_attr_cmbsz.attr) {
if (!dev->cmbsz)
return 0;
}
if (a == &dev_attr_hmb.attr && !ctrl->hmpre)
return 0;
return a->mode;
}
static struct attribute *nvme_pci_attrs[] = {
&dev_attr_cmb.attr,
&dev_attr_cmbloc.attr,
&dev_attr_cmbsz.attr,
&dev_attr_hmb.attr,
NULL,
};
static const struct attribute_group nvme_pci_attr_group = {
.attrs = nvme_pci_attrs,
.is_visible = nvme_pci_attrs_are_visible,
};
/*
* nirqs is the number of interrupts available for write and read
* queues. The core already reserved an interrupt for the admin queue.
@ -2751,6 +2823,10 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
&nvme_pci_attr_group))
dev->attrs_added = true;
nvme_start_ctrl(&dev->ctrl);
return;
@ -2999,6 +3075,13 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_disable_prepare_reset(dev, true);
}
static void nvme_remove_attrs(struct nvme_dev *dev)
{
if (dev->attrs_added)
sysfs_remove_group(&dev->ctrl.device->kobj,
&nvme_pci_attr_group);
}
/*
* The driver's remove may be called on a device in a partially initialized
* state. This function must not have any dependencies on the device state in
@ -3020,7 +3103,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
nvme_release_cmb(dev);
nvme_remove_attrs(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
@ -3047,8 +3130,13 @@ static int nvme_resume(struct device *dev)
if (ndev->last_ps == U32_MAX ||
nvme_set_power_state(ctrl, ndev->last_ps) != 0)
return nvme_try_sched_reset(&ndev->ctrl);
goto reset;
if (ctrl->hmpre && nvme_setup_host_mem(ndev))
goto reset;
return 0;
reset:
return nvme_try_sched_reset(ctrl);
}
static int nvme_suspend(struct device *dev)
@ -3072,15 +3160,9 @@ static int nvme_suspend(struct device *dev)
* the PCI bus layer to put it into D3 in order to take the PCIe link
* down, so as to allow the platform to achieve its minimum low-power
* state (which may not be possible if the link is up).
*
* If a host memory buffer is enabled, shut down the device as the NVMe
* specification allows the device to access the host memory buffer in
* host DRAM from all power states, but hosts will fail access to DRAM
* during S3.
*/
if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
ndev->nr_host_mem_descs ||
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
return nvme_disable_prepare_reset(ndev, true);
@ -3091,6 +3173,17 @@ static int nvme_suspend(struct device *dev)
if (ctrl->state != NVME_CTRL_LIVE)
goto unfreeze;
/*
* Host memory access may not be successful in a system suspend state,
* but the specification allows the controller to access memory in a
* non-operational power state.
*/
if (ndev->hmb) {
ret = nvme_set_host_mem(ndev, 0);
if (ret < 0)
goto unfreeze;
}
ret = nvme_get_power_state(ctrl, &ndev->last_ps);
if (ret < 0)
goto unfreeze;
@ -3243,12 +3336,6 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */

View File

@ -735,13 +735,13 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
if (ret)
return ret;
ctrl->ctrl.queue_count = nr_io_queues + 1;
if (ctrl->ctrl.queue_count < 2) {
if (nr_io_queues == 0) {
dev_err(ctrl->ctrl.device,
"unable to set any I/O queues\n");
return -ENOMEM;
}
ctrl->ctrl.queue_count = nr_io_queues + 1;
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
@ -1730,10 +1730,10 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
struct request *rq;
struct nvme_rdma_request *req;
rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"tag 0x%x on QP %#x not found\n",
"got bad command_id %#x on QP %#x\n",
cqe->command_id, queue->qp->qp_num);
nvme_rdma_error_recovery(queue->ctrl);
return;

View File

@ -487,11 +487,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
{
struct request *rq;
rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag 0x%x not found\n",
nvme_tcp_queue_id(queue), cqe->command_id);
"got bad cqe.command_id %#x on queue %d\n",
cqe->command_id, nvme_tcp_queue_id(queue));
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EINVAL;
}
@ -508,11 +508,11 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
{
struct request *rq;
rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x not found\n",
nvme_tcp_queue_id(queue), pdu->command_id);
"got bad c2hdata.command_id %#x on queue %d\n",
pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
@ -606,7 +606,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
data->hdr.plen =
cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
data->ttag = pdu->ttag;
data->command_id = rq->tag;
data->command_id = nvme_cid(rq);
data->data_offset = cpu_to_le32(req->data_sent);
data->data_length = cpu_to_le32(req->pdu_len);
return 0;
@ -619,11 +619,11 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
struct request *rq;
int ret;
rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x not found\n",
nvme_tcp_queue_id(queue), pdu->command_id);
"got bad r2t.command_id %#x on queue %d\n",
pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
req = blk_mq_rq_to_pdu(rq);
@ -702,17 +702,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
struct nvme_tcp_request *req;
struct request *rq;
rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"queue %d tag %#x not found\n",
nvme_tcp_queue_id(queue), pdu->command_id);
return -ENOENT;
}
req = blk_mq_rq_to_pdu(rq);
struct request *rq =
nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
while (true) {
int recv_len, ret;
@ -804,8 +796,8 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
}
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
pdu->command_id);
struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
pdu->command_id);
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
queue->nr_cqe++;
@ -1228,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
sock_release(queue->sock);
kfree(queue->pdu);
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
}
@ -1533,6 +1526,7 @@ err_sock:
sock_release(queue->sock);
queue->sock = NULL;
err_destroy_mutex:
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
return ret;
}
@ -1769,13 +1763,13 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
if (ret)
return ret;
ctrl->queue_count = nr_io_queues + 1;
if (ctrl->queue_count < 2) {
if (nr_io_queues == 0) {
dev_err(ctrl->device,
"unable to set any I/O queues\n");
return -ENOMEM;
}
ctrl->queue_count = nr_io_queues + 1;
dev_info(ctrl->device,
"creating %d I/O queues.\n", nr_io_queues);

View File

@ -72,6 +72,20 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
return ret;
}
static const char *nvme_trace_admin_set_features(struct trace_seq *p,
u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
u8 fid = cdw10[0];
u8 sv = cdw10[3] & 0x8;
u32 cdw11 = get_unaligned_le32(cdw10 + 4);
trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11);
trace_seq_putc(p, 0);
return ret;
}
static const char *nvme_trace_admin_get_features(struct trace_seq *p,
u8 *cdw10)
{
@ -80,7 +94,7 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p,
u8 sel = cdw10[1] & 0x7;
u32 cdw11 = get_unaligned_le32(cdw10 + 4);
trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11);
trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11);
trace_seq_putc(p, 0);
return ret;
@ -201,6 +215,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
return nvme_trace_create_cq(p, cdw10);
case nvme_admin_identify:
return nvme_trace_admin_identify(p, cdw10);
case nvme_admin_set_features:
return nvme_trace_admin_set_features(p, cdw10);
case nvme_admin_get_features:
return nvme_trace_admin_get_features(p, cdw10);
case nvme_admin_get_lba_status:

View File

@ -31,7 +31,6 @@ config NVME_TARGET_PASSTHRU
config NVME_TARGET_LOOP
tristate "NVMe loopback device support"
depends on NVME_TARGET
select NVME_CORE
select NVME_FABRICS
select SG_POOL
help
@ -65,7 +64,6 @@ config NVME_TARGET_FC
config NVME_TARGET_FCLOOP
tristate "NVMe over Fabrics FC Transport Loopback Test driver"
depends on NVME_TARGET
select NVME_CORE
select NVME_FABRICS
select SG_POOL
depends on NVME_FC

View File

@ -802,6 +802,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
* controller teardown as a result of a keep-alive expiration.
*/
ctrl->reset_tbkas = true;
sq->ctrl->sqs[sq->qid] = NULL;
nvmet_ctrl_put(ctrl);
sq->ctrl = NULL; /* allows reusing the queue later */
}

View File

@ -109,20 +109,37 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
u16 qid = le16_to_cpu(c->qid);
u16 sqsize = le16_to_cpu(c->sqsize);
struct nvmet_ctrl *old;
u16 mqes = NVME_CAP_MQES(ctrl->cap);
u16 ret;
if (!sqsize) {
pr_warn("queue size zero!\n");
req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize);
ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
goto err;
}
if (ctrl->sqs[qid] != NULL) {
pr_warn("qid %u has already been created\n", qid);
req->error_loc = offsetof(struct nvmf_connect_command, qid);
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
if (sqsize > mqes) {
pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n",
sqsize, mqes, ctrl->cntlid);
req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize);
return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
}
old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
if (old) {
pr_warn("queue already connected!\n");
req->error_loc = offsetof(struct nvmf_connect_command, opcode);
return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
}
if (!sqsize) {
pr_warn("queue size zero!\n");
req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
goto err;
}
/* note: convert queue size from 0's-based value to 1's-based value */
nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
@ -138,6 +155,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
if (ret) {
pr_err("failed to install queue %d cntlid %d ret %x\n",
qid, ctrl->cntlid, ret);
ctrl->sqs[qid] = NULL;
goto err;
}
}
@ -260,11 +278,11 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
}
status = nvmet_install_queue(ctrl, req);
if (status) {
/* pass back cntlid that had the issue of installing queue */
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
if (status)
goto out_ctrl_put;
}
/* pass back cntlid for successful completion */
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);

View File

@ -107,10 +107,10 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
} else {
struct request *rq;
rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id);
rq = nvme_find_rq(nvme_loop_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
"tag 0x%x on queue %d not found\n",
"got bad command_id %#x on queue %d\n",
cqe->command_id, nvme_loop_queue_idx(queue));
return;
}

View File

@ -27,7 +27,7 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p,
u8 sel = cdw10[1] & 0x7;
u32 cdw11 = get_unaligned_le32(cdw10 + 4);
trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11);
trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11);
trace_seq_putc(p, 0);
return ret;
@ -49,6 +49,20 @@ static const char *nvmet_trace_get_lba_status(struct trace_seq *p,
return ret;
}
static const char *nvmet_trace_admin_set_features(struct trace_seq *p,
u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
u8 fid = cdw10[0];
u8 sv = cdw10[3] & 0x8;
u32 cdw11 = get_unaligned_le32(cdw10 + 4);
trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11);
trace_seq_putc(p, 0);
return ret;
}
static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@ -94,6 +108,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p,
switch (opcode) {
case nvme_admin_identify:
return nvmet_trace_admin_identify(p, cdw10);
case nvme_admin_set_features:
return nvmet_trace_admin_set_features(p, cdw10);
case nvme_admin_get_features:
return nvmet_trace_admin_get_features(p, cdw10);
case nvme_admin_get_lba_status:

View File

@ -115,14 +115,11 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
}
status = nvmet_req_find_ns(req);
if (status) {
status = NVME_SC_INTERNAL;
if (status)
goto done;
}
if (!bdev_is_zoned(req->ns->bdev)) {
req->error_loc = offsetof(struct nvme_identify, nsid);
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
goto done;
}

View File

@ -1,697 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef NVM_H
#define NVM_H
#include <linux/blkdev.h>
#include <linux/types.h>
#include <uapi/linux/lightnvm.h>
enum {
NVM_IO_OK = 0,
NVM_IO_REQUEUE = 1,
NVM_IO_DONE = 2,
NVM_IO_ERR = 3,
NVM_IOTYPE_NONE = 0,
NVM_IOTYPE_GC = 1,
};
/* common format */
#define NVM_GEN_CH_BITS (8)
#define NVM_GEN_LUN_BITS (8)
#define NVM_GEN_BLK_BITS (16)
#define NVM_GEN_RESERVED (32)
/* 1.2 format */
#define NVM_12_PG_BITS (16)
#define NVM_12_PL_BITS (4)
#define NVM_12_SEC_BITS (4)
#define NVM_12_RESERVED (8)
/* 2.0 format */
#define NVM_20_SEC_BITS (24)
#define NVM_20_RESERVED (8)
enum {
NVM_OCSSD_SPEC_12 = 12,
NVM_OCSSD_SPEC_20 = 20,
};
struct ppa_addr {
/* Generic structure for all addresses */
union {
/* generic device format */
struct {
u64 ch : NVM_GEN_CH_BITS;
u64 lun : NVM_GEN_LUN_BITS;
u64 blk : NVM_GEN_BLK_BITS;
u64 reserved : NVM_GEN_RESERVED;
} a;
/* 1.2 device format */
struct {
u64 ch : NVM_GEN_CH_BITS;
u64 lun : NVM_GEN_LUN_BITS;
u64 blk : NVM_GEN_BLK_BITS;
u64 pg : NVM_12_PG_BITS;
u64 pl : NVM_12_PL_BITS;
u64 sec : NVM_12_SEC_BITS;
u64 reserved : NVM_12_RESERVED;
} g;
/* 2.0 device format */
struct {
u64 grp : NVM_GEN_CH_BITS;
u64 pu : NVM_GEN_LUN_BITS;
u64 chk : NVM_GEN_BLK_BITS;
u64 sec : NVM_20_SEC_BITS;
u64 reserved : NVM_20_RESERVED;
} m;
struct {
u64 line : 63;
u64 is_cached : 1;
} c;
u64 ppa;
};
};
struct nvm_rq;
struct nvm_id;
struct nvm_dev;
struct nvm_tgt_dev;
struct nvm_chk_meta;
typedef int (nvm_id_fn)(struct nvm_dev *);
typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int,
struct nvm_chk_meta *);
typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *);
typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int);
typedef void (nvm_destroy_dma_pool_fn)(void *);
typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
dma_addr_t *);
typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t);
struct nvm_dev_ops {
nvm_id_fn *identity;
nvm_op_bb_tbl_fn *get_bb_tbl;
nvm_op_set_bb_fn *set_bb_tbl;
nvm_get_chk_meta_fn *get_chk_meta;
nvm_submit_io_fn *submit_io;
nvm_create_dma_pool_fn *create_dma_pool;
nvm_destroy_dma_pool_fn *destroy_dma_pool;
nvm_dev_dma_alloc_fn *dev_dma_alloc;
nvm_dev_dma_free_fn *dev_dma_free;
};
#ifdef CONFIG_NVM
#include <linux/file.h>
#include <linux/dmapool.h>
enum {
/* HW Responsibilities */
NVM_RSP_L2P = 1 << 0,
NVM_RSP_ECC = 1 << 1,
/* Physical Adressing Mode */
NVM_ADDRMODE_LINEAR = 0,
NVM_ADDRMODE_CHANNEL = 1,
/* Plane programming mode for LUN */
NVM_PLANE_SINGLE = 1,
NVM_PLANE_DOUBLE = 2,
NVM_PLANE_QUAD = 4,
/* Status codes */
NVM_RSP_SUCCESS = 0x0,
NVM_RSP_NOT_CHANGEABLE = 0x1,
NVM_RSP_ERR_FAILWRITE = 0x40ff,
NVM_RSP_ERR_EMPTYPAGE = 0x42ff,
NVM_RSP_ERR_FAILECC = 0x4281,
NVM_RSP_ERR_FAILCRC = 0x4004,
NVM_RSP_WARN_HIGHECC = 0x4700,
/* Device opcodes */
NVM_OP_PWRITE = 0x91,
NVM_OP_PREAD = 0x92,
NVM_OP_ERASE = 0x90,
/* PPA Command Flags */
NVM_IO_SNGL_ACCESS = 0x0,
NVM_IO_DUAL_ACCESS = 0x1,
NVM_IO_QUAD_ACCESS = 0x2,
/* NAND Access Modes */
NVM_IO_SUSPEND = 0x80,
NVM_IO_SLC_MODE = 0x100,
NVM_IO_SCRAMBLE_ENABLE = 0x200,
/* Block Types */
NVM_BLK_T_FREE = 0x0,
NVM_BLK_T_BAD = 0x1,
NVM_BLK_T_GRWN_BAD = 0x2,
NVM_BLK_T_DEV = 0x4,
NVM_BLK_T_HOST = 0x8,
/* Memory capabilities */
NVM_ID_CAP_SLC = 0x1,
NVM_ID_CAP_CMD_SUSPEND = 0x2,
NVM_ID_CAP_SCRAMBLE = 0x4,
NVM_ID_CAP_ENCRYPT = 0x8,
/* Memory types */
NVM_ID_FMTYPE_SLC = 0,
NVM_ID_FMTYPE_MLC = 1,
/* Device capabilities */
NVM_ID_DCAP_BBLKMGMT = 0x1,
NVM_UD_DCAP_ECC = 0x2,
};
struct nvm_id_lp_mlc {
u16 num_pairs;
u8 pairs[886];
};
struct nvm_id_lp_tbl {
__u8 id[8];
struct nvm_id_lp_mlc mlc;
};
struct nvm_addrf_12 {
u8 ch_len;
u8 lun_len;
u8 blk_len;
u8 pg_len;
u8 pln_len;
u8 sec_len;
u8 ch_offset;
u8 lun_offset;
u8 blk_offset;
u8 pg_offset;
u8 pln_offset;
u8 sec_offset;
u64 ch_mask;
u64 lun_mask;
u64 blk_mask;
u64 pg_mask;
u64 pln_mask;
u64 sec_mask;
};
struct nvm_addrf {
u8 ch_len;
u8 lun_len;
u8 chk_len;
u8 sec_len;
u8 rsv_len[2];
u8 ch_offset;
u8 lun_offset;
u8 chk_offset;
u8 sec_offset;
u8 rsv_off[2];
u64 ch_mask;
u64 lun_mask;
u64 chk_mask;
u64 sec_mask;
u64 rsv_mask[2];
};
enum {
/* Chunk states */
NVM_CHK_ST_FREE = 1 << 0,
NVM_CHK_ST_CLOSED = 1 << 1,
NVM_CHK_ST_OPEN = 1 << 2,
NVM_CHK_ST_OFFLINE = 1 << 3,
/* Chunk types */
NVM_CHK_TP_W_SEQ = 1 << 0,
NVM_CHK_TP_W_RAN = 1 << 1,
NVM_CHK_TP_SZ_SPEC = 1 << 4,
};
/*
* Note: The structure size is linked to nvme_nvm_chk_meta such that the same
* buffer can be used when converting from little endian to cpu addressing.
*/
struct nvm_chk_meta {
u8 state;
u8 type;
u8 wi;
u8 rsvd[5];
u64 slba;
u64 cnlb;
u64 wp;
};
struct nvm_target {
struct list_head list;
struct nvm_tgt_dev *dev;
struct nvm_tgt_type *type;
struct gendisk *disk;
};
#define ADDR_EMPTY (~0ULL)
#define NVM_TARGET_DEFAULT_OP (101)
#define NVM_TARGET_MIN_OP (3)
#define NVM_TARGET_MAX_OP (80)
#define NVM_VERSION_MAJOR 1
#define NVM_VERSION_MINOR 0
#define NVM_VERSION_PATCH 0
#define NVM_MAX_VLBA (64) /* max logical blocks in a vector command */
struct nvm_rq;
typedef void (nvm_end_io_fn)(struct nvm_rq *);
struct nvm_rq {
struct nvm_tgt_dev *dev;
struct bio *bio;
union {
struct ppa_addr ppa_addr;
dma_addr_t dma_ppa_list;
};
struct ppa_addr *ppa_list;
void *meta_list;
dma_addr_t dma_meta_list;
nvm_end_io_fn *end_io;
uint8_t opcode;
uint16_t nr_ppas;
uint16_t flags;
u64 ppa_status; /* ppa media status */
int error;
int is_seq; /* Sequential hint flag. 1.2 only */
void *private;
};
static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu)
{
return pdu - sizeof(struct nvm_rq);
}
static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata)
{
return rqdata + 1;
}
static inline struct ppa_addr *nvm_rq_to_ppa_list(struct nvm_rq *rqd)
{
return (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
}
enum {
NVM_BLK_ST_FREE = 0x1, /* Free block */
NVM_BLK_ST_TGT = 0x2, /* Block in use by target */
NVM_BLK_ST_BAD = 0x8, /* Bad block */
};
/* Instance geometry */
struct nvm_geo {
/* device reported version */
u8 major_ver_id;
u8 minor_ver_id;
/* kernel short version */
u8 version;
/* instance specific geometry */
int num_ch;
int num_lun; /* per channel */
/* calculated values */
int all_luns; /* across channels */
int all_chunks; /* across channels */
int op; /* over-provision in instance */
sector_t total_secs; /* across channels */
/* chunk geometry */
u32 num_chk; /* chunks per lun */
u32 clba; /* sectors per chunk */
u16 csecs; /* sector size */
u16 sos; /* out-of-band area size */
bool ext; /* metadata in extended data buffer */
u32 mdts; /* Max data transfer size*/
/* device write constrains */
u32 ws_min; /* minimum write size */
u32 ws_opt; /* optimal write size */
u32 mw_cunits; /* distance required for successful read */
u32 maxoc; /* maximum open chunks */
u32 maxocpu; /* maximum open chunks per parallel unit */
/* device capabilities */
u32 mccap;
/* device timings */
u32 trdt; /* Avg. Tread (ns) */
u32 trdm; /* Max Tread (ns) */
u32 tprt; /* Avg. Tprog (ns) */
u32 tprm; /* Max Tprog (ns) */
u32 tbet; /* Avg. Terase (ns) */
u32 tbem; /* Max Terase (ns) */
/* generic address format */
struct nvm_addrf addrf;
/* 1.2 compatibility */
u8 vmnt;
u32 cap;
u32 dom;
u8 mtype;
u8 fmtype;
u16 cpar;
u32 mpos;
u8 num_pln;
u8 pln_mode;
u16 num_pg;
u16 fpg_sz;
};
/* sub-device structure */
struct nvm_tgt_dev {
/* Device information */
struct nvm_geo geo;
/* Base ppas for target LUNs */
struct ppa_addr *luns;
struct request_queue *q;
struct nvm_dev *parent;
void *map;
};
struct nvm_dev {
struct nvm_dev_ops *ops;
struct list_head devices;
/* Device information */
struct nvm_geo geo;
unsigned long *lun_map;
void *dma_pool;
/* Backend device */
struct request_queue *q;
char name[DISK_NAME_LEN];
void *private_data;
struct kref ref;
void *rmap;
struct mutex mlock;
spinlock_t lock;
/* target management */
struct list_head area_list;
struct list_head targets;
};
static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
struct ppa_addr r)
{
struct nvm_geo *geo = &dev->geo;
struct ppa_addr l;
if (geo->version == NVM_OCSSD_SPEC_12) {
struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf;
l.ppa = ((u64)r.g.ch) << ppaf->ch_offset;
l.ppa |= ((u64)r.g.lun) << ppaf->lun_offset;
l.ppa |= ((u64)r.g.blk) << ppaf->blk_offset;
l.ppa |= ((u64)r.g.pg) << ppaf->pg_offset;
l.ppa |= ((u64)r.g.pl) << ppaf->pln_offset;
l.ppa |= ((u64)r.g.sec) << ppaf->sec_offset;
} else {
struct nvm_addrf *lbaf = &geo->addrf;
l.ppa = ((u64)r.m.grp) << lbaf->ch_offset;
l.ppa |= ((u64)r.m.pu) << lbaf->lun_offset;
l.ppa |= ((u64)r.m.chk) << lbaf->chk_offset;
l.ppa |= ((u64)r.m.sec) << lbaf->sec_offset;
}
return l;
}
static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev,
struct ppa_addr r)
{
struct nvm_geo *geo = &dev->geo;
struct ppa_addr l;
l.ppa = 0;
if (geo->version == NVM_OCSSD_SPEC_12) {
struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf;
l.g.ch = (r.ppa & ppaf->ch_mask) >> ppaf->ch_offset;
l.g.lun = (r.ppa & ppaf->lun_mask) >> ppaf->lun_offset;
l.g.blk = (r.ppa & ppaf->blk_mask) >> ppaf->blk_offset;
l.g.pg = (r.ppa & ppaf->pg_mask) >> ppaf->pg_offset;
l.g.pl = (r.ppa & ppaf->pln_mask) >> ppaf->pln_offset;
l.g.sec = (r.ppa & ppaf->sec_mask) >> ppaf->sec_offset;
} else {
struct nvm_addrf *lbaf = &geo->addrf;
l.m.grp = (r.ppa & lbaf->ch_mask) >> lbaf->ch_offset;
l.m.pu = (r.ppa & lbaf->lun_mask) >> lbaf->lun_offset;
l.m.chk = (r.ppa & lbaf->chk_mask) >> lbaf->chk_offset;
l.m.sec = (r.ppa & lbaf->sec_mask) >> lbaf->sec_offset;
}
return l;
}
static inline u64 dev_to_chunk_addr(struct nvm_dev *dev, void *addrf,
struct ppa_addr p)
{
struct nvm_geo *geo = &dev->geo;
u64 caddr;
if (geo->version == NVM_OCSSD_SPEC_12) {
struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)addrf;
caddr = (u64)p.g.pg << ppaf->pg_offset;
caddr |= (u64)p.g.pl << ppaf->pln_offset;
caddr |= (u64)p.g.sec << ppaf->sec_offset;
} else {
caddr = p.m.sec;
}
return caddr;
}
static inline struct ppa_addr nvm_ppa32_to_ppa64(struct nvm_dev *dev,
void *addrf, u32 ppa32)
{
struct ppa_addr ppa64;
ppa64.ppa = 0;
if (ppa32 == -1) {
ppa64.ppa = ADDR_EMPTY;
} else if (ppa32 & (1U << 31)) {
ppa64.c.line = ppa32 & ((~0U) >> 1);
ppa64.c.is_cached = 1;
} else {
struct nvm_geo *geo = &dev->geo;
if (geo->version == NVM_OCSSD_SPEC_12) {
struct nvm_addrf_12 *ppaf = addrf;
ppa64.g.ch = (ppa32 & ppaf->ch_mask) >>
ppaf->ch_offset;
ppa64.g.lun = (ppa32 & ppaf->lun_mask) >>
ppaf->lun_offset;
ppa64.g.blk = (ppa32 & ppaf->blk_mask) >>
ppaf->blk_offset;
ppa64.g.pg = (ppa32 & ppaf->pg_mask) >>
ppaf->pg_offset;
ppa64.g.pl = (ppa32 & ppaf->pln_mask) >>
ppaf->pln_offset;
ppa64.g.sec = (ppa32 & ppaf->sec_mask) >>
ppaf->sec_offset;
} else {
struct nvm_addrf *lbaf = addrf;
ppa64.m.grp = (ppa32 & lbaf->ch_mask) >>
lbaf->ch_offset;
ppa64.m.pu = (ppa32 & lbaf->lun_mask) >>
lbaf->lun_offset;
ppa64.m.chk = (ppa32 & lbaf->chk_mask) >>
lbaf->chk_offset;
ppa64.m.sec = (ppa32 & lbaf->sec_mask) >>
lbaf->sec_offset;
}
}
return ppa64;
}
static inline u32 nvm_ppa64_to_ppa32(struct nvm_dev *dev,
void *addrf, struct ppa_addr ppa64)
{
u32 ppa32 = 0;
if (ppa64.ppa == ADDR_EMPTY) {
ppa32 = ~0U;
} else if (ppa64.c.is_cached) {
ppa32 |= ppa64.c.line;
ppa32 |= 1U << 31;
} else {
struct nvm_geo *geo = &dev->geo;
if (geo->version == NVM_OCSSD_SPEC_12) {
struct nvm_addrf_12 *ppaf = addrf;
ppa32 |= ppa64.g.ch << ppaf->ch_offset;
ppa32 |= ppa64.g.lun << ppaf->lun_offset;
ppa32 |= ppa64.g.blk << ppaf->blk_offset;
ppa32 |= ppa64.g.pg << ppaf->pg_offset;
ppa32 |= ppa64.g.pl << ppaf->pln_offset;
ppa32 |= ppa64.g.sec << ppaf->sec_offset;
} else {
struct nvm_addrf *lbaf = addrf;
ppa32 |= ppa64.m.grp << lbaf->ch_offset;
ppa32 |= ppa64.m.pu << lbaf->lun_offset;
ppa32 |= ppa64.m.chk << lbaf->chk_offset;
ppa32 |= ppa64.m.sec << lbaf->sec_offset;
}
}
return ppa32;
}
static inline int nvm_next_ppa_in_chk(struct nvm_tgt_dev *dev,
struct ppa_addr *ppa)
{
struct nvm_geo *geo = &dev->geo;
int last = 0;
if (geo->version == NVM_OCSSD_SPEC_12) {
int sec = ppa->g.sec;
sec++;
if (sec == geo->ws_min) {
int pg = ppa->g.pg;
sec = 0;
pg++;
if (pg == geo->num_pg) {
int pl = ppa->g.pl;
pg = 0;
pl++;
if (pl == geo->num_pln)
last = 1;
ppa->g.pl = pl;
}
ppa->g.pg = pg;
}
ppa->g.sec = sec;
} else {
ppa->m.sec++;
if (ppa->m.sec == geo->clba)
last = 1;
}
return last;
}
typedef sector_t (nvm_tgt_capacity_fn)(void *);
typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
int flags);
typedef void (nvm_tgt_exit_fn)(void *, bool);
typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
enum {
NVM_TGT_F_DEV_L2P = 0,
NVM_TGT_F_HOST_L2P = 1 << 0,
};
struct nvm_tgt_type {
const char *name;
unsigned int version[3];
int flags;
/* target entry points */
const struct block_device_operations *bops;
nvm_tgt_capacity_fn *capacity;
/* module-specific init/teardown */
nvm_tgt_init_fn *init;
nvm_tgt_exit_fn *exit;
/* sysfs */
nvm_tgt_sysfs_init_fn *sysfs_init;
nvm_tgt_sysfs_exit_fn *sysfs_exit;
/* For internal use */
struct list_head list;
struct module *owner;
};
extern int nvm_register_tgt_type(struct nvm_tgt_type *);
extern void nvm_unregister_tgt_type(struct nvm_tgt_type *);
extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *);
extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t);
extern struct nvm_dev *nvm_alloc_dev(int);
extern int nvm_register(struct nvm_dev *);
extern void nvm_unregister(struct nvm_dev *);
extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr,
int, struct nvm_chk_meta *);
extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *,
int, int);
extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *);
extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *);
extern void nvm_end_io(struct nvm_rq *);
#else /* CONFIG_NVM */
struct nvm_dev_ops;
static inline struct nvm_dev *nvm_alloc_dev(int node)
{
return ERR_PTR(-EINVAL);
}
static inline int nvm_register(struct nvm_dev *dev)
{
return -EINVAL;
}
static inline void nvm_unregister(struct nvm_dev *dev) {}
#endif /* CONFIG_NVM */
#endif /* LIGHTNVM.H */

View File

@ -431,6 +431,8 @@ extern int param_get_int(char *buffer, const struct kernel_param *kp);
extern const struct kernel_param_ops param_ops_uint;
extern int param_set_uint(const char *val, const struct kernel_param *kp);
extern int param_get_uint(char *buffer, const struct kernel_param *kp);
int param_set_uint_minmax(const char *val, const struct kernel_param *kp,
unsigned int min, unsigned int max);
#define param_check_uint(name, p) __param_check(name, p, unsigned int)
extern const struct kernel_param_ops param_ops_long;

View File

@ -1,224 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Copyright (C) 2015 CNEX Labs. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*/
#ifndef _UAPI_LINUX_LIGHTNVM_H
#define _UAPI_LINUX_LIGHTNVM_H
#ifdef __KERNEL__
#include <linux/const.h>
#else /* __KERNEL__ */
#include <stdio.h>
#include <sys/ioctl.h>
#define DISK_NAME_LEN 32
#endif /* __KERNEL__ */
#include <linux/types.h>
#include <linux/ioctl.h>
#define NVM_TTYPE_NAME_MAX 48
#define NVM_TTYPE_MAX 63
#define NVM_MMTYPE_LEN 8
#define NVM_CTRL_FILE "/dev/lightnvm/control"
struct nvm_ioctl_info_tgt {
__u32 version[3];
__u32 reserved;
char tgtname[NVM_TTYPE_NAME_MAX];
};
struct nvm_ioctl_info {
__u32 version[3]; /* in/out - major, minor, patch */
__u16 tgtsize; /* number of targets */
__u16 reserved16; /* pad to 4K page */
__u32 reserved[12];
struct nvm_ioctl_info_tgt tgts[NVM_TTYPE_MAX];
};
enum {
NVM_DEVICE_ACTIVE = 1 << 0,
};
struct nvm_ioctl_device_info {
char devname[DISK_NAME_LEN];
char bmname[NVM_TTYPE_NAME_MAX];
__u32 bmversion[3];
__u32 flags;
__u32 reserved[8];
};
struct nvm_ioctl_get_devices {
__u32 nr_devices;
__u32 reserved[31];
struct nvm_ioctl_device_info info[31];
};
struct nvm_ioctl_create_simple {
__u32 lun_begin;
__u32 lun_end;
};
struct nvm_ioctl_create_extended {
__u16 lun_begin;
__u16 lun_end;
__u16 op;
__u16 rsv;
};
enum {
NVM_CONFIG_TYPE_SIMPLE = 0,
NVM_CONFIG_TYPE_EXTENDED = 1,
};
struct nvm_ioctl_create_conf {
__u32 type;
union {
struct nvm_ioctl_create_simple s;
struct nvm_ioctl_create_extended e;
};
};
enum {
NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */
};
struct nvm_ioctl_create {
char dev[DISK_NAME_LEN]; /* open-channel SSD device */
char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */
char tgtname[DISK_NAME_LEN]; /* dev to expose target as */
__u32 flags;
struct nvm_ioctl_create_conf conf;
};
struct nvm_ioctl_remove {
char tgtname[DISK_NAME_LEN];
__u32 flags;
};
struct nvm_ioctl_dev_init {
char dev[DISK_NAME_LEN]; /* open-channel SSD device */
char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */
__u32 flags;
};
enum {
NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as
* host blks or grown blks */
NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */
NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */
NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */
};
struct nvm_ioctl_dev_factory {
char dev[DISK_NAME_LEN];
__u32 flags;
};
struct nvm_user_vio {
__u8 opcode;
__u8 flags;
__u16 control;
__u16 nppas;
__u16 rsvd;
__u64 metadata;
__u64 addr;
__u64 ppa_list;
__u32 metadata_len;
__u32 data_len;
__u64 status;
__u32 result;
__u32 rsvd3[3];
};
struct nvm_passthru_vio {
__u8 opcode;
__u8 flags;
__u8 rsvd[2];
__u32 nsid;
__u32 cdw2;
__u32 cdw3;
__u64 metadata;
__u64 addr;
__u32 metadata_len;
__u32 data_len;
__u64 ppa_list;
__u16 nppas;
__u16 control;
__u32 cdw13;
__u32 cdw14;
__u32 cdw15;
__u64 status;
__u32 result;
__u32 timeout_ms;
};
/* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */
enum {
/* top level cmds */
NVM_INFO_CMD = 0x20,
NVM_GET_DEVICES_CMD,
/* device level cmds */
NVM_DEV_CREATE_CMD,
NVM_DEV_REMOVE_CMD,
/* Init a device to support LightNVM media managers */
NVM_DEV_INIT_CMD,
/* Factory reset device */
NVM_DEV_FACTORY_CMD,
/* Vector user I/O */
NVM_DEV_VIO_ADMIN_CMD = 0x41,
NVM_DEV_VIO_CMD = 0x42,
NVM_DEV_VIO_USER_CMD = 0x43,
};
#define NVM_IOCTL 'L' /* 0x4c */
#define NVM_INFO _IOWR(NVM_IOCTL, NVM_INFO_CMD, \
struct nvm_ioctl_info)
#define NVM_GET_DEVICES _IOR(NVM_IOCTL, NVM_GET_DEVICES_CMD, \
struct nvm_ioctl_get_devices)
#define NVM_DEV_CREATE _IOW(NVM_IOCTL, NVM_DEV_CREATE_CMD, \
struct nvm_ioctl_create)
#define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \
struct nvm_ioctl_remove)
#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \
struct nvm_ioctl_dev_init)
#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \
struct nvm_ioctl_dev_factory)
#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \
struct nvm_passthru_vio)
#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\
struct nvm_passthru_vio)
#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\
struct nvm_user_vio)
#define NVM_VERSION_MAJOR 1
#define NVM_VERSION_MINOR 0
#define NVM_VERSION_PATCHLEVEL 0
#endif

View File

@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint);
int param_set_uint_minmax(const char *val, const struct kernel_param *kp,
unsigned int min, unsigned int max)
{
unsigned int num;
int ret;
if (!val)
return -EINVAL;
ret = kstrtouint(val, 0, &num);
if (ret)
return ret;
if (num < min || num > max)
return -EINVAL;
*((unsigned int *)kp->arg) = num;
return 0;
}
EXPORT_SYMBOL_GPL(param_set_uint_minmax);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
if (strlen(val) > 1024) {

View File

@ -3149,24 +3149,6 @@ void cleanup_socket_xprt(void)
xprt_unregister_transport(&xs_bc_tcp_transport);
}
static int param_set_uint_minmax(const char *val,
const struct kernel_param *kp,
unsigned int min, unsigned int max)
{
unsigned int num;
int ret;
if (!val)
return -EINVAL;
ret = kstrtouint(val, 0, &num);
if (ret)
return ret;
if (num < min || num > max)
return -EINVAL;
*((unsigned int *)kp->arg) = num;
return 0;
}
static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
return param_set_uint_minmax(val, kp,