// cosmopolitan/third_party/dlmalloc/mspaces.inc

#include "third_party/dlmalloc/dlmalloc.h"
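/* Carve a malloc_state out of the front of the region at tbase and
   initialize its lock, bins, and top chunk. */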
static mstate init_user_mstate(char* tbase, size_t tsize) {
size_t msize = pad_request(sizeof(struct malloc_state));
mchunkptr mn;
mchunkptr msp = align_as_chunk(tbase);
mstate m = (mstate)(chunk2mem(msp));
// bzero(m, msize); // [jart] it is not needed
(void)INITIAL_LOCK(&m->mutex);
msp->head = (msize|INUSE_BITS);
m->seg.base = m->least_addr = tbase;
m->seg.size = m->footprint = m->max_footprint = tsize;
m->magic = mparams.magic;
m->release_checks = MAX_RELEASE_CHECK_RATE;
m->mflags = mparams.default_mflags;
m->extp = 0;
m->exts = 0;
disable_contiguous(m);
init_bins(m);
mn = next_chunk(mem2chunk(m));
init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE);
check_top_chunk(m, m->top);
return m;
}
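/*
  create_mspace creates and returns a new independent space with the
  given initial capacity, or, if 0, the default granularity size. It
  returns null if no system memory is available to create the space.
  If locked is nonzero, the space uses a separate lock to control
  access. A minimal usage sketch:

    mspace msp = create_mspace(0, 1);
    void *p = mspace_malloc(msp, 64);
    mspace_free(msp, p);
    destroy_mspace(msp);
*/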
mspace create_mspace(size_t capacity, int locked) {
mstate m = 0;
size_t msize;
ensure_initialization();
msize = pad_request(sizeof(struct malloc_state));
if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
size_t rs = ((capacity == 0)? mparams.granularity :
(capacity + TOP_FOOT_SIZE + msize));
size_t tsize = granularity_align(rs);
char* tbase = (char*)dlmalloc_requires_more_vespene_gas(tsize);
if (tbase != CMFAIL) {
m = init_user_mstate(tbase, tsize);
m->seg.sflags = USE_MMAP_BIT;
set_lock(m, locked);
}
}
return (mspace)m;
}
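/*
  create_mspace_with_base is like create_mspace, but uses the caller
  supplied memory as the initial base of the space. Part of that
  memory is consumed by bookkeeping, so capacity must exceed the
  malloc_state plus top-foot overhead, otherwise null is returned.
*/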
mspace create_mspace_with_base(void* base, size_t capacity, int locked) {
mstate m = 0;
size_t msize;
ensure_initialization();
msize = pad_request(sizeof(struct malloc_state));
if (capacity > msize + TOP_FOOT_SIZE &&
capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) {
m = init_user_mstate((char*)base, capacity);
m->seg.sflags = EXTERN_BIT;
set_lock(m, locked);
}
return (mspace)m;
}
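/*
  mspace_track_large_chunks controls whether large requests are
  satisfied from their own mmapped regions (untracked) or kept inside
  the space so destroy_mspace can reclaim them. Enabling tracking
  disables mmap for this space; the previous setting is returned.
*/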
int mspace_track_large_chunks(mspace msp, int enable) {
int ret = 0;
mstate ms = (mstate)msp;
if (!PREACTION(ms)) {
if (!use_mmap(ms)) {
ret = 1;
}
if (!enable) {
enable_mmap(ms);
} else {
disable_mmap(ms);
}
POSTACTION(ms);
}
return ret;
}
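/*
  destroy_mspace destroys the given space and attempts to return all
  of its memory to the system, returning the total number of bytes
  freed. After destruction, the results of accessing memory that was
  used by the space are undefined.
*/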
size_t destroy_mspace(mspace msp) {
size_t freed = 0;
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
msegmentptr sp = &ms->seg;
(void)DESTROY_LOCK(&ms->mutex); /* destroy before unmapped */
while (sp != 0) {
char* base = sp->base;
size_t size = sp->size;
flag_t flag = sp->sflags;
(void)base; /* placate people compiling -Wunused-variable */
sp = sp->next;
if ((flag & USE_MMAP_BIT) && !(flag & EXTERN_BIT) &&
CALL_MUNMAP(base, size) == 0)
freed += size;
}
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
return freed;
}
/*
mspace versions of routines are near-clones of the global
versions. This is not so nice but better than the alternatives.
*/
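/*
  mspace_malloc behaves as malloc, but operates within the given
  space: small requests are served from the smallbins or the
  designated victim, larger ones from the treebins or top chunk,
  and sys_alloc is called when the space must grow.
*/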
void* mspace_malloc(mspace msp, size_t bytes) {
mstate ms = (mstate)msp;
if (!ok_magic(ms)) {
USAGE_ERROR_ACTION(ms,ms);
return 0;
}
if (!PREACTION(ms)) {
void* mem;
size_t nb;
if (bytes <= MAX_SMALL_REQUEST) {
bindex_t idx;
binmap_t smallbits;
nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
idx = small_index(nb);
smallbits = ms->smallmap >> idx;
if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
mchunkptr b, p;
idx += ~smallbits & 1; /* Uses next bin if idx empty */
b = smallbin_at(ms, idx);
p = b->fd;
assert(chunksize(p) == small_index2size(idx));
unlink_first_small_chunk(ms, b, p, idx);
set_inuse_and_pinuse(ms, p, small_index2size(idx));
mem = chunk2mem(p);
check_malloced_chunk(ms, mem, nb);
goto postaction;
}
else if (nb > ms->dvsize) {
if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
mchunkptr b, p, r;
size_t rsize;
bindex_t i;
binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
binmap_t leastbit = least_bit(leftbits);
compute_bit2idx(leastbit, i);
b = smallbin_at(ms, i);
p = b->fd;
assert(chunksize(p) == small_index2size(i));
unlink_first_small_chunk(ms, b, p, i);
rsize = small_index2size(i) - nb;
/* Fit here cannot be remainderless if 4byte sizes */
if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
set_inuse_and_pinuse(ms, p, small_index2size(i));
else {
set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
r = chunk_plus_offset(p, nb);
set_size_and_pinuse_of_free_chunk(r, rsize);
replace_dv(ms, r, rsize);
}
mem = chunk2mem(p);
check_malloced_chunk(ms, mem, nb);
goto postaction;
}
else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) {
check_malloced_chunk(ms, mem, nb);
goto postaction;
}
}
}
else if (bytes >= MAX_REQUEST)
nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
else {
nb = pad_request(bytes);
if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
check_malloced_chunk(ms, mem, nb);
goto postaction;
}
}
if (nb <= ms->dvsize) {
size_t rsize = ms->dvsize - nb;
mchunkptr p = ms->dv;
if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
ms->dvsize = rsize;
set_size_and_pinuse_of_free_chunk(r, rsize);
set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
}
else { /* exhaust dv */
size_t dvs = ms->dvsize;
ms->dvsize = 0;
ms->dv = 0;
set_inuse_and_pinuse(ms, p, dvs);
}
mem = chunk2mem(p);
check_malloced_chunk(ms, mem, nb);
goto postaction;
}
else if (nb < ms->topsize) { /* Split top */
size_t rsize = ms->topsize -= nb;
mchunkptr p = ms->top;
mchunkptr r = ms->top = chunk_plus_offset(p, nb);
r->head = rsize | PINUSE_BIT;
set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
mem = chunk2mem(p);
check_top_chunk(ms, ms->top);
check_malloced_chunk(ms, mem, nb);
goto postaction;
}
mem = sys_alloc(ms, nb);
POSTACTION(ms);
if (mem == MAP_FAILED && _weaken(__oom_hook)) {
_weaken(__oom_hook)(bytes);
}
return mem;
postaction:
POSTACTION(ms);
return mem;
}
return 0;
}
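/*
  mspace_free behaves as free, but operates within the given space.
  When FOOTERS is enabled the msp argument is ignored and the owning
  mspace is recovered from the chunk footer. Freed chunks are
  coalesced with free neighbors and returned to the bins, dv, or top.
*/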
void mspace_free(mspace msp, void* mem) {
if (mem != 0) {
mchunkptr p = mem2chunk(mem);
#if FOOTERS
mstate fm = get_mstate_for(p);
(void)msp; /* placate people compiling -Wunused */
#else /* FOOTERS */
mstate fm = (mstate)msp;
#endif /* FOOTERS */
if (!ok_magic(fm)) {
USAGE_ERROR_ACTION(fm, p);
return;
}
if (!PREACTION(fm)) {
check_inuse_chunk(fm, p);
if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {
size_t psize = chunksize(p);
mchunkptr next = chunk_plus_offset(p, psize);
if (!pinuse(p)) {
size_t prevsize = p->prev_foot;
if (is_mmapped(p)) {
psize += prevsize + MMAP_FOOT_PAD;
if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
fm->footprint -= psize;
goto postaction;
}
else {
mchunkptr prev = chunk_minus_offset(p, prevsize);
psize += prevsize;
p = prev;
if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
if (p != fm->dv) {
unlink_chunk(fm, p, prevsize);
}
else if ((next->head & INUSE_BITS) == INUSE_BITS) {
fm->dvsize = psize;
set_free_with_pinuse(p, psize, next);
goto postaction;
}
}
else
goto erroraction;
}
}
if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
if (!cinuse(next)) { /* consolidate forward */
if (next == fm->top) {
size_t tsize = fm->topsize += psize;
fm->top = p;
p->head = tsize | PINUSE_BIT;
if (p == fm->dv) {
fm->dv = 0;
fm->dvsize = 0;
}
if (should_trim(fm, tsize))
sys_trim(fm, 0);
goto postaction;
}
else if (next == fm->dv) {
size_t dsize = fm->dvsize += psize;
fm->dv = p;
set_size_and_pinuse_of_free_chunk(p, dsize);
goto postaction;
}
else {
size_t nsize = chunksize(next);
psize += nsize;
unlink_chunk(fm, next, nsize);
set_size_and_pinuse_of_free_chunk(p, psize);
if (p == fm->dv) {
fm->dvsize = psize;
goto postaction;
}
}
}
else
set_free_with_pinuse(p, psize, next);
if (is_small(psize)) {
insert_small_chunk(fm, p, psize);
check_free_chunk(fm, p);
}
else {
tchunkptr tp = (tchunkptr)p;
insert_large_chunk(fm, tp, psize);
check_free_chunk(fm, p);
if (--fm->release_checks == 0)
release_unused_segments(fm);
}
goto postaction;
}
}
erroraction:
USAGE_ERROR_ACTION(fm, p);
postaction:
POSTACTION(fm);
}
}
}
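/* mspace_calloc behaves as calloc, but operates within the given
   space, forcing a failure when n_elements * elem_size overflows. */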
void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) {
void* mem;
size_t req = 0;
mstate ms = (mstate)msp;
if (!ok_magic(ms)) {
USAGE_ERROR_ACTION(ms,ms);
return 0;
}
if (n_elements != 0) {
req = n_elements * elem_size;
if (((n_elements | elem_size) & ~(size_t)0xffff) &&
(req / n_elements != elem_size))
req = MAX_SIZE_T; /* force downstream failure on overflow */
}
mem = internal_malloc(ms, req);
if (mem != 0 && calloc_must_clear(mem2chunk(mem)))
bzero(mem, req);
return mem;
}
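/*
  mspace_realloc behaves as realloc, but operates within the given
  space. It first tries to resize the chunk in place; otherwise it
  allocates a new chunk, copies the old contents, and frees the old
  allocation.
*/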
void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
void* mem = 0;
if (oldmem == 0) {
mem = mspace_malloc(msp, bytes);
}
else if (bytes >= MAX_REQUEST) {
MALLOC_FAILURE_ACTION;
}
#ifdef REALLOC_ZERO_BYTES_FREES
else if (bytes == 0) {
mspace_free(msp, oldmem);
}
#endif /* REALLOC_ZERO_BYTES_FREES */
else {
size_t nb = request2size(bytes);
mchunkptr oldp = mem2chunk(oldmem);
#if ! FOOTERS
mstate m = (mstate)msp;
#else /* FOOTERS */
mstate m = get_mstate_for(oldp);
if (!ok_magic(m)) {
USAGE_ERROR_ACTION(m, oldmem);
return 0;
}
#endif /* FOOTERS */
if (!PREACTION(m)) {
mchunkptr newp = try_realloc_chunk(m, oldp, nb, 1);
if (newp != 0) {
/* [jart] fix realloc MT bug in DEBUG mode
https://github.com/intel/linux-sgx/issues/534 */
check_inuse_chunk(m, newp);
POSTACTION(m);
mem = chunk2mem(newp);
}
else {
POSTACTION(m);
mem = mspace_malloc(m, bytes);
if (mem != 0) {
size_t oc = chunksize(oldp) - overhead_for(oldp);
memcpy(mem, oldmem, (oc < bytes)? oc : bytes);
mspace_free(m, oldmem);
}
}
}
}
return mem;
}
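/* mspace_realloc_in_place resizes the allocation only if that can be
   done without moving it, returning null otherwise. */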
void* mspace_realloc_in_place(mspace msp, void* oldmem, size_t bytes) {
void* mem = 0;
if (oldmem != 0) {
if (bytes >= MAX_REQUEST) {
MALLOC_FAILURE_ACTION;
}
else {
size_t nb = request2size(bytes);
mchunkptr oldp = mem2chunk(oldmem);
#if ! FOOTERS
mstate m = (mstate)msp;
#else /* FOOTERS */
mstate m = get_mstate_for(oldp);
(void)msp; /* placate people compiling -Wunused */
if (!ok_magic(m)) {
USAGE_ERROR_ACTION(m, oldmem);
return 0;
}
#endif /* FOOTERS */
if (!PREACTION(m)) {
mchunkptr newp = try_realloc_chunk(m, oldp, nb, 0);
if (newp == oldp) {
/* [jart] fix realloc_in_place MT bug in DEBUG mode
https://github.com/intel/linux-sgx/issues/534 */
check_inuse_chunk(m, newp);
mem = oldmem;
}
POSTACTION(m);
}
}
}
return mem;
}
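/* mspace_memalign behaves as memalign within the given space;
   alignments no greater than MALLOC_ALIGNMENT fall through to
   mspace_malloc. */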
void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) {
mstate ms = (mstate)msp;
if (!ok_magic(ms)) {
USAGE_ERROR_ACTION(ms,ms);
return 0;
}
if (alignment <= MALLOC_ALIGNMENT)
return mspace_malloc(msp, bytes);
return internal_memalign(ms, alignment, bytes);
}
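/* The next two routines behave as independent_calloc and
   independent_comalloc, but operate within the given space. */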
void** mspace_independent_calloc(mspace msp, size_t n_elements,
size_t elem_size, void* chunks[]) {
size_t sz = elem_size; /* serves as 1-element array */
mstate ms = (mstate)msp;
if (!ok_magic(ms)) {
USAGE_ERROR_ACTION(ms,ms);
return 0;
}
return ialloc(ms, n_elements, &sz, 3, chunks);
}
void** mspace_independent_comalloc(mspace msp, size_t n_elements,
size_t sizes[], void* chunks[]) {
mstate ms = (mstate)msp;
if (!ok_magic(ms)) {
USAGE_ERROR_ACTION(ms,ms);
return 0;
}
return ialloc(ms, n_elements, sizes, 0, chunks);
}
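/* mspace_bulk_free behaves as bulk_free, handing the array of
   pointers to the shared internal_bulk_free helper. */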
size_t mspace_bulk_free(mspace msp, void* array[], size_t nelem) {
return internal_bulk_free((mstate)msp, array, nelem);
}
#if MALLOC_INSPECT_ALL
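/* mspace_inspect_all behaves as malloc_inspect_all, but operates
   within the given space, invoking the handler while the lock is
   held. */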
void mspace_inspect_all(mspace msp,
void(*handler)(void *start,
void *end,
size_t used_bytes,
void* callback_arg),
void* arg) {
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
if (!PREACTION(ms)) {
internal_inspect_all(ms, handler, arg);
POSTACTION(ms);
}
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
}
#endif /* MALLOC_INSPECT_ALL */
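/* mspace_trim behaves as malloc_trim, but operates within the given
   space. */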
int mspace_trim(mspace msp, size_t pad) {
int result = 0;
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
if (!PREACTION(ms)) {
result = sys_trim(ms, pad);
POSTACTION(ms);
}
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
return result;
}
#if !NO_MALLOC_STATS
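/* mspace_malloc_stats behaves as malloc_stats, but reports
   properties of the given space. */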
void mspace_malloc_stats(mspace msp) {
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
internal_malloc_stats(ms);
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
}
#endif /* NO_MALLOC_STATS */
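/* mspace_footprint and mspace_max_footprint report the current and
   peak number of bytes obtained from the system for this space. */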
size_t mspace_footprint(mspace msp) {
size_t result = 0;
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
result = ms->footprint;
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
return result;
}
size_t mspace_max_footprint(mspace msp) {
size_t result = 0;
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
result = ms->max_footprint;
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
return result;
}
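/* mspace_footprint_limit and mspace_set_footprint_limit query and
   set the maximum number of bytes this space may obtain from the
   system; a stored limit of zero means no limit is enforced. */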
size_t mspace_footprint_limit(mspace msp) {
size_t result = 0;
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
size_t maf = ms->footprint_limit;
result = (maf == 0) ? MAX_SIZE_T : maf;
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
return result;
}
size_t mspace_set_footprint_limit(mspace msp, size_t bytes) {
size_t result = 0;
mstate ms = (mstate)msp;
if (ok_magic(ms)) {
if (bytes == 0)
result = granularity_align(1); /* Use minimal size */
if (bytes == MAX_SIZE_T)
result = 0; /* disable */
else
result = granularity_align(bytes);
ms->footprint_limit = result;
}
else {
USAGE_ERROR_ACTION(ms,ms);
}
return result;
}
#if !NO_MALLINFO
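/* mspace_mallinfo behaves as mallinfo, but reports properties of
   the given space. */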
struct mallinfo mspace_mallinfo(mspace msp) {
mstate ms = (mstate)msp;
if (!ok_magic(ms)) {
USAGE_ERROR_ACTION(ms,ms);
}
return internal_mallinfo(ms);
}
#endif /* NO_MALLINFO */
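/* mspace_usable_size reports the number of usable bytes in the chunk
   backing an allocation, or 0 for a null or free pointer. */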
size_t mspace_usable_size(const void* mem) {
if (mem != 0) {
mchunkptr p = mem2chunk(mem);
if (is_inuse(p))
return chunksize(p) - overhead_for(p);
}
return 0;
}
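/* mspace_mallopt behaves as mallopt; the parameters it changes are
   shared by all spaces. */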
int mspace_mallopt(int param_number, int value) {
return change_mparam(param_number, value);
}