linux-stable/kernel/dma/map_benchmark.c
Barry Song 9f5f8ec501 dma-mapping: benchmark: use u8 for reserved field in uAPI structure
The original code put five u32 fields before a u64 expansion[10] array.
Because five is odd, the u64 array does not start on a naturally aligned
offset, which will cause trouble when the structure is extended with new
features. This patch switches the reserved field to u8 to avoid future
alignment risk.
Meanwhile, it also zeroes struct map_benchmark in the userspace tool;
otherwise, when users run an old version of the tool on a newer kernel,
the random values left in the expansion field would cause side effects
on the newer kernel.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
2021-02-05 12:48:46 +01:00

365 lines
8.9 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020 Hisilicon Limited.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
/*
 * The only ioctl command handled by this driver: userspace passes a
 * struct map_benchmark in, and the same structure is copied back with
 * the result fields filled in.
 */
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
/* Largest value accepted for map_benchmark.threads (0 is rejected too). */
#define DMA_MAP_MAX_THREADS 1024
/* Largest value accepted for map_benchmark.seconds (0 is rejected too). */
#define DMA_MAP_MAX_SECONDS 300
/*
 * Values accepted in map_benchmark.dma_dir; the ioctl handler translates
 * them to the matching enum dma_data_direction constant.
 */
#define DMA_MAP_BIDIRECTIONAL 0
#define DMA_MAP_TO_DEVICE 1
#define DMA_MAP_FROM_DEVICE 2
/*
 * Argument of the DMA_MAP_BENCHMARK ioctl, shared with userspace.
 *
 * The first four members are results written by the kernel after a run;
 * the remaining members are inputs supplied by the caller. The layout is
 * ABI: four 64-bit fields (32 bytes), five 32-bit fields (20 bytes) and
 * 84 reserved bytes.
 */
struct map_benchmark {
__u64 avg_map_100ns; /* result: average map latency, in units of 100ns */
__u64 map_stddev; /* result: standard deviation of map latency, same unit */
__u64 avg_unmap_100ns; /* result: average unmap latency, in units of 100ns */
__u64 unmap_stddev; /* result: standard deviation of unmap latency */
__u32 threads; /* how many threads will do map/unmap in parallel */
__u32 seconds; /* how long the test will last */
__s32 node; /* which numa node this benchmark will run on */
__u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction, one of the DMA_MAP_* values */
__u8 expansion[84]; /* For future use */
};
/*
 * Per-device benchmark state, allocated at probe time and handed to the
 * debugfs file as its private data.
 *
 * The atomic64 counters are accumulated concurrently by every benchmark
 * thread and reset before each run.
 */
struct map_benchmark_data {
struct map_benchmark bparam; /* parameters from / results for userspace */
struct device *dev; /* device whose DMA mapping path is measured */
struct dentry *debugfs; /* the "dma_map_benchmark" debugfs file */
enum dma_data_direction dir; /* direction derived from bparam.dma_dir */
atomic64_t sum_map_100ns; /* sum of map latencies, in 100ns units */
atomic64_t sum_unmap_100ns; /* sum of unmap latencies, in 100ns units */
atomic64_t sum_sq_map; /* sum of squared map latencies */
atomic64_t sum_sq_unmap; /* sum of squared unmap latencies */
atomic64_t loops; /* number of completed map/unmap iterations */
};
/*
 * Body of one benchmark kthread.
 *
 * Repeatedly maps and unmaps a single page for map->dev until the thread
 * is asked to stop, timing each operation and folding the latency (in
 * 100ns units) and its square into the shared counters of @data.
 *
 * Returns 0 when stopped normally, -ENOMEM if the page cannot be
 * allocated or if a mapping attempt fails.
 */
static int map_benchmark_thread(void *data)
{
	struct map_benchmark_data *map = data;
	void *page;
	int err = 0;

	page = (void *)__get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	for (;;) {
		ktime_t t_begin, t_end;
		ktime_t map_time, unmap_time;
		u64 map_ticks, unmap_ticks;
		dma_addr_t handle;

		if (kthread_should_stop())
			break;

		/*
		 * Dirty the page first unless the device is the only writer:
		 * on a non-coherent device, skipping this would underestimate
		 * the real-world cost of BIDIRECTIONAL or TO_DEVICE mappings.
		 * The fill byte 0x66 is arbitrary.
		 */
		if (map->dir != DMA_FROM_DEVICE)
			memset(page, 0x66, PAGE_SIZE);

		/* timed region 1: map */
		t_begin = ktime_get();
		handle = dma_map_single(map->dev, page, PAGE_SIZE, map->dir);
		if (unlikely(dma_mapping_error(map->dev, handle))) {
			pr_err("dma_map_single failed on %s\n",
			       dev_name(map->dev));
			err = -ENOMEM;
			break;
		}
		t_end = ktime_get();
		map_time = ktime_sub(t_end, t_begin);

		/* timed region 2: unmap */
		t_begin = ktime_get();
		dma_unmap_single(map->dev, handle, PAGE_SIZE, map->dir);
		t_end = ktime_get();
		unmap_time = ktime_sub(t_end, t_begin);

		/* convert to 100ns units, then accumulate sum and sum of squares */
		map_ticks = div64_ul(map_time, 100);
		unmap_ticks = div64_ul(unmap_time, 100);

		atomic64_add(map_ticks, &map->sum_map_100ns);
		atomic64_add(unmap_ticks, &map->sum_unmap_100ns);
		atomic64_add(map_ticks * map_ticks, &map->sum_sq_map);
		atomic64_add(unmap_ticks * unmap_ticks, &map->sum_sq_unmap);
		atomic64_inc(&map->loops);
	}

	free_page((unsigned long)page);
	return err;
}
/*
 * Run one benchmark described by map->bparam.
 *
 * Creates bparam.threads kthreads running map_benchmark_thread(),
 * optionally bound to the CPUs of bparam.node, lets them run for
 * bparam.seconds seconds, stops them, and then derives the average
 * latency and standard deviation from the shared counters.
 *
 * Returns 0 on success, -ENOMEM if the task array cannot be allocated,
 * the kthread creation error if a thread cannot be created, or the
 * non-zero exit code of a benchmark thread if one of them failed.
 */
static int do_map_benchmark(struct map_benchmark_data *map)
{
	struct task_struct **tsk;
	int threads = map->bparam.threads;
	int node = map->bparam.node;
	u64 loops;
	int ret = 0;
	int i;

	tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
	if (!tsk)
		return -ENOMEM;

	get_device(map->dev);

	for (i = 0; i < threads; i++) {
		tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
				map->bparam.node, "dma-map-benchmark/%d", i);
		if (IS_ERR(tsk[i])) {
			pr_err("create dma_map thread failed\n");
			ret = PTR_ERR(tsk[i]);
			/*
			 * Reap the threads created so far. They were never
			 * woken and no extra task reference has been taken
			 * yet, so kthread_stop() alone is the right cleanup.
			 */
			while (--i >= 0)
				kthread_stop(tsk[i]);
			goto out;
		}

		/* only look up the node's cpumask for a real node id */
		if (node != NUMA_NO_NODE)
			kthread_bind_mask(tsk[i], cpumask_of_node(node));
	}

	/* clear the old value in the previous benchmark */
	atomic64_set(&map->sum_map_100ns, 0);
	atomic64_set(&map->sum_unmap_100ns, 0);
	atomic64_set(&map->sum_sq_map, 0);
	atomic64_set(&map->sum_sq_unmap, 0);
	atomic64_set(&map->loops, 0);

	for (i = 0; i < threads; i++) {
		/*
		 * Hold a reference so the task_struct stays valid for
		 * kthread_stop() even if the thread exits early on error.
		 */
		get_task_struct(tsk[i]);
		wake_up_process(tsk[i]);
	}

	msleep_interruptible(map->bparam.seconds * 1000);

	/*
	 * Wait for the completion of benchmark threads. Every thread must
	 * be stopped and its reference dropped even when one of them
	 * reports a failure; otherwise the rest would keep running against
	 * state that is about to be released.
	 */
	for (i = 0; i < threads; i++) {
		int thread_ret = kthread_stop(tsk[i]);

		put_task_struct(tsk[i]);
		if (thread_ret)
			ret = thread_ret;
	}
	if (ret)
		goto out;

	loops = atomic64_read(&map->loops);
	if (likely(loops > 0)) {
		u64 map_variance, unmap_variance;
		u64 sum_map = atomic64_read(&map->sum_map_100ns);
		u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
		u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
		u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);

		/* average latency */
		map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
		map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);

		/* standard deviation of latency: sqrt(E[x^2] - E[x]^2) */
		map_variance = div64_u64(sum_sq_map, loops) -
				map->bparam.avg_map_100ns *
				map->bparam.avg_map_100ns;
		unmap_variance = div64_u64(sum_sq_unmap, loops) -
				map->bparam.avg_unmap_100ns *
				map->bparam.avg_unmap_100ns;
		map->bparam.map_stddev = int_sqrt64(map_variance);
		map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
	}

out:
	put_device(map->dev);
	kfree(tsk);
	return ret;
}
static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct map_benchmark_data *map = file->private_data;
void __user *argp = (void __user *)arg;
u64 old_dma_mask;
int ret;
if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
return -EFAULT;
switch (cmd) {
case DMA_MAP_BENCHMARK:
if (map->bparam.threads == 0 ||
map->bparam.threads > DMA_MAP_MAX_THREADS) {
pr_err("invalid thread number\n");
return -EINVAL;
}
if (map->bparam.seconds == 0 ||
map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
pr_err("invalid duration seconds\n");
return -EINVAL;
}
if (map->bparam.node != NUMA_NO_NODE &&
!node_possible(map->bparam.node)) {
pr_err("invalid numa node\n");
return -EINVAL;
}
switch (map->bparam.dma_dir) {
case DMA_MAP_BIDIRECTIONAL:
map->dir = DMA_BIDIRECTIONAL;
break;
case DMA_MAP_FROM_DEVICE:
map->dir = DMA_FROM_DEVICE;
break;
case DMA_MAP_TO_DEVICE:
map->dir = DMA_TO_DEVICE;
break;
default:
pr_err("invalid DMA direction\n");
return -EINVAL;
}
old_dma_mask = dma_get_mask(map->dev);
ret = dma_set_mask(map->dev,
DMA_BIT_MASK(map->bparam.dma_bits));
if (ret) {
pr_err("failed to set dma_mask on device %s\n",
dev_name(map->dev));
return -EINVAL;
}
ret = do_map_benchmark(map);
/*
* restore the original dma_mask as many devices' dma_mask are
* set by architectures, acpi, busses. When we bind them back
* to their original drivers, those drivers shouldn't see
* dma_mask changed by benchmark
*/
dma_set_mask(map->dev, old_dma_mask);
break;
default:
return -EINVAL;
}
if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
return -EFAULT;
return ret;
}
/*
 * File operations of the "dma_map_benchmark" debugfs file: the file is
 * driven exclusively through map_benchmark_ioctl(); no read or write
 * handlers are provided.
 */
static const struct file_operations map_benchmark_fops = {
.open = simple_open,
.unlocked_ioctl = map_benchmark_ioctl,
};
/*
 * Device-managed cleanup action registered by __map_benchmark_probe():
 * removes the debugfs file recorded in the benchmark state passed as
 * @data.
 */
static void map_benchmark_remove_debugfs(void *data)
{
	struct map_benchmark_data *bench = data;

	debugfs_remove(bench->debugfs);
}
/*
 * Bus-independent probe helper shared by the platform and PCI drivers.
 *
 * Allocates the device-managed benchmark state for @dev, registers the
 * action that removes the debugfs file, and creates the
 * "dma_map_benchmark" file at the debugfs root.
 *
 * Returns 0 on success, -ENOMEM if the state cannot be allocated, or
 * the error from devm_add_action() / debugfs_create_file().
 */
static int __map_benchmark_probe(struct device *dev)
{
	struct map_benchmark_data *bench;
	struct dentry *file;
	int err;

	bench = devm_kzalloc(dev, sizeof(*bench), GFP_KERNEL);
	if (!bench)
		return -ENOMEM;
	bench->dev = dev;

	err = devm_add_action(dev, map_benchmark_remove_debugfs, bench);
	if (err) {
		pr_err("Can't add debugfs remove action\n");
		return err;
	}

	/*
	 * The file name is fixed, so only one device may be bound to this
	 * driver at a time; probing a second device is expected to fail
	 * here.
	 */
	file = debugfs_create_file("dma_map_benchmark", 0600, NULL, bench,
				   &map_benchmark_fops);
	if (IS_ERR(file))
		return PTR_ERR(file);

	bench->debugfs = file;
	return 0;
}
/* Platform-bus probe: hand the embedded struct device to the common helper. */
static int map_benchmark_platform_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	return __map_benchmark_probe(dev);
}
/*
 * Platform driver registration. Only .probe is provided; teardown of the
 * debugfs file relies on the device-managed action added during probe.
 */
static struct platform_driver map_benchmark_platform_driver = {
.driver = {
.name = "dma_map_benchmark",
},
.probe = map_benchmark_platform_probe,
};
/*
 * PCI probe: hand the embedded struct device to the common helper.
 * The matched @id is not used.
 */
static int
map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct device *dev = &pdev->dev;

	return __map_benchmark_probe(dev);
}
/*
 * PCI driver registration. No id_table is set here.
 * NOTE(review): presumably devices are bound manually from userspace
 * (e.g. via sysfs) — confirm against the tool's documentation.
 */
static struct pci_driver map_benchmark_pci_driver = {
.name = "dma_map_benchmark",
.probe = map_benchmark_pci_probe,
};
/*
 * Module entry point: register the PCI driver, then the platform driver.
 * If the platform registration fails, the PCI driver is unregistered
 * again so the module leaves nothing behind. Returns 0 on success or the
 * error of whichever registration failed.
 */
static int __init map_benchmark_init(void)
{
	int err = pci_register_driver(&map_benchmark_pci_driver);

	if (err)
		return err;

	err = platform_driver_register(&map_benchmark_platform_driver);
	if (err)
		pci_unregister_driver(&map_benchmark_pci_driver);

	return err;
}
/*
 * Module exit point: unregister both drivers in the reverse order of
 * their registration in map_benchmark_init().
 */
static void __exit map_benchmark_cleanup(void)
{
platform_driver_unregister(&map_benchmark_platform_driver);
pci_unregister_driver(&map_benchmark_pci_driver);
}
/* Module entry/exit hooks and metadata. */
module_init(map_benchmark_init);
module_exit(map_benchmark_cleanup);
MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
MODULE_DESCRIPTION("dma_map benchmark driver");
MODULE_LICENSE("GPL");