linux-stable/tools/testing/selftests/resctrl/fill_buf.c
Ilpo Järvinen 205de6ddd7 selftests/resctrl: Rewrite Cache Allocation Technology (CAT) test
CAT test spawns two processes into two different control groups with
exclusive schemata. Both the processes alloc a buffer from memory
matching their allocated LLC block size and flush the entire buffer out
of caches. Since the processes are reading through the buffer only once
during the measurement and initially all the buffer was flushed, the
test isn't testing CAT.

Rewrite the CAT test to allocate a buffer sized to half of LLC. Then
perform a sequence of tests with different LLC alloc sizes starting
from half of the CBM bits down to 1-bit CBM. Flush the buffer before
each test and read the buffer twice. Observe the LLC misses on the
second read through the buffer. As the allocated LLC block gets smaller
and smaller, the LLC misses will become larger and larger giving a
strong signal on CAT working properly.

The new CAT test is using only a single process because it relies on
measured effect against another run of itself rather than another
process adding noise. The rest of the system is set to use the CBM bits
not used by the CAT test to keep the test isolated.

Replace count_bits() with count_contiguous_bits() to get the first bit
position in order to be able to calculate masks based on it.

This change has been tested with a number of systems from different
generations.

Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
2024-02-13 13:56:45 -07:00

169 lines
3.3 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* fill_buf benchmark
*
* Copyright (C) 2018 Intel Corporation
*
* Authors:
* Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
* Fenghua Yu <fenghua.yu@intel.com>
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <inttypes.h>
#include <string.h>
#include "resctrl.h"
#define CL_SIZE (64)
#define PAGE_SIZE (4 * 1024)
#define MB (1024 * 1024)
static void sb(void)
{
#if defined(__i386) || defined(__x86_64)
asm volatile("sfence\n\t"
: : : "memory");
#endif
}
static void cl_flush(void *p)
{
#if defined(__i386) || defined(__x86_64)
asm volatile("clflush (%0)\n\t"
: : "r"(p) : "memory");
#endif
}
void mem_flush(unsigned char *buf, size_t buf_size)
{
unsigned char *cp = buf;
size_t i = 0;
buf_size = buf_size / CL_SIZE; /* mem size in cache lines */
for (i = 0; i < buf_size; i++)
cl_flush(&cp[i * CL_SIZE]);
sb();
}
/*
* Buffer index step advance to workaround HW prefetching interfering with
* the measurements.
*
* Must be a prime to step through all indexes of the buffer.
*
* Some primes work better than others on some architectures (from MBA/MBM
* result stability point of view).
*/
#define FILL_IDX_MULT 23
static int fill_one_span_read(unsigned char *buf, size_t buf_size)
{
unsigned int size = buf_size / (CL_SIZE / 2);
unsigned int i, idx = 0;
unsigned char sum = 0;
/*
* Read the buffer in an order that is unexpected by HW prefetching
* optimizations to prevent them interfering with the caching pattern.
*
* The read order is (in terms of halves of cachelines):
* i * FILL_IDX_MULT % size
* The formula is open-coded below to avoiding modulo inside the loop
* as it improves MBA/MBM result stability on some architectures.
*/
for (i = 0; i < size; i++) {
sum += buf[idx * (CL_SIZE / 2)];
idx += FILL_IDX_MULT;
while (idx >= size)
idx -= size;
}
return sum;
}
static void fill_one_span_write(unsigned char *buf, size_t buf_size)
{
unsigned char *end_ptr = buf + buf_size;
unsigned char *p;
p = buf;
while (p < end_ptr) {
*p = '1';
p += (CL_SIZE / 2);
}
}
void fill_cache_read(unsigned char *buf, size_t buf_size, bool once)
{
int ret = 0;
while (1) {
ret = fill_one_span_read(buf, buf_size);
if (once)
break;
}
/* Consume read result so that reading memory is not optimized out. */
*value_sink = ret;
}
static void fill_cache_write(unsigned char *buf, size_t buf_size, bool once)
{
while (1) {
fill_one_span_write(buf, buf_size);
if (once)
break;
}
}
unsigned char *alloc_buffer(size_t buf_size, int memflush)
{
void *buf = NULL;
uint64_t *p64;
size_t s64;
int ret;
ret = posix_memalign(&buf, PAGE_SIZE, buf_size);
if (ret < 0)
return NULL;
/* Initialize the buffer */
p64 = buf;
s64 = buf_size / sizeof(uint64_t);
while (s64 > 0) {
*p64 = (uint64_t)rand();
p64 += (CL_SIZE / sizeof(uint64_t));
s64 -= (CL_SIZE / sizeof(uint64_t));
}
/* Flush the memory before using to avoid "cache hot pages" effect */
if (memflush)
mem_flush(buf, buf_size);
return buf;
}
int run_fill_buf(size_t buf_size, int memflush, int op, bool once)
{
unsigned char *buf;
buf = alloc_buffer(buf_size, memflush);
if (!buf)
return -1;
if (op == 0)
fill_cache_read(buf, buf_size, once);
else
fill_cache_write(buf, buf_size, once);
free(buf);
return 0;
}