From 48338daaa00e6137a43fa5d0e54b763aa34f450b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 29 Apr 2020 12:30:03 -0400 Subject: [PATCH] dm writecache: improve performance on DDR persistent memory (Optane) When testing the dm-writecache target on a real DDR persistent memory (Intel Optane), it turned out that explicit cache flushing using the clflushopt instruction performs better than non-temporal stores for block sizes 1k, 2k and 4k. The dm-writecache target is singlethreaded (all the copying is done while holding the writecache lock), so it benefits from clwb, see: http://lore.kernel.org/r/alpine.LRH.2.02.2004160411460.7833@file01.intranet.prod.int.rdu2.redhat.com Add a new function memcpy_flushcache_optimized() that tests if clflushopt is present - and if it is, we use it instead of memcpy_flushcache. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d29d3e234e01..74f3c506f084 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1139,6 +1139,42 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, return r; } +static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) +{ + /* + * clflushopt performs better with block size 1024, 2048, 4096 + * non-temporal stores perform better with block size 512 + * + * block size 512 1024 2048 4096 + * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s + * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s + * + * We see that movnti performs better for 512-byte blocks, and + * clflushopt performs better for 1024-byte and larger blocks. So, we + * prefer clflushopt for sizes >= 768. + * + * NOTE: this happens to be the case now (with dm-writecache's single + * threaded model) but re-evaluate this once memcpy_flushcache() is + * enabled to use movdir64b which might invalidate this performance + * advantage seen with cache-allocating-writes plus flushing. + */ +#ifdef CONFIG_X86 + if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && + likely(boot_cpu_data.x86_clflush_size == 64) && + likely(size >= 768)) { + do { + memcpy((void *)dest, (void *)source, 64); + clflushopt((void *)dest); + dest += 64; + source += 64; + size -= 64; + } while (size >= 64); + return; + } +#endif + memcpy_flushcache(dest, source, size); +} + static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) { void *buf; @@ -1164,7 +1200,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data } } else { flush_dcache_page(bio_page(bio)); - memcpy_flushcache(data, buf, size); + memcpy_flushcache_optimized(data, buf, size); } bvec_kunmap_irq(buf, &flags);