Description
tldr: I have a stress test where mimalloc consumes ~30x as much working set memory as a toy allocator I wrote.
I'm using mimalloc 2.1.7-1 (I believe; I'm not certain of the exact version) as packaged by MSYS2.
Background: I'm designing my own allocator and was stress-testing how it handles extremely GC-like workloads and ended up with a stress test with the following design:
- 32 worker threads, each allocating blocks of various sizes, writing to them, checking the value it just wrote, then handing them off to a free-loop thread to be freed
- The single free-loop thread frees the allocations
Running this test, my toy allocator finishes in ~30 seconds and uses ~70MB of RAM. Mimalloc finishes in ~5 seconds but uses ~6GB of RAM (peak value). If I enable AddressSanitizer, the numbers change to 30s/200MB and 18s/2GB respectively. I also threw this same test at my system's malloc implementation (I'm on Windows 10), and it takes so long to run that I'm treating it as "doesn't terminate" (it had been running for over 50 minutes and still hadn't finished). The system allocator's working set was roughly the same as my toy allocator's.
NOTE: I didn't bother deleting commented-out test/debug code before uploading. I'm still busy debugging issues with my allocator and GC and can't dedicate too much time to making this report clean.
If this is a false positive somehow (e.g. something is wrong with my environment), the stress test might still be useful for https://github.com/daanx/mimalloc-bench since it breaks my system's allocator too.
Attached zip file contains a .cpp file containing my stress test.
cpp file contents follow:
#include <thread>
#include <vector>
#include <atomic>
#include <mutex>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
using namespace std;
// ---------------------------------------------------------------------------
// Allocator selection.
// The test code below only ever calls plain `malloc` / `free`; the macros in
// this section decide which implementation those names resolve to. Three
// alternatives are sketched here and only the mimalloc one is currently
// enabled. To test another allocator, comment the active section out and
// uncomment the one you want.
// ---------------------------------------------------------------------------
// Option 1 (disabled): custom allocator from "wmalloc.hpp". The WALLOC_* macros
// are its configuration knobs and have to be defined before the header is
// included; the lines with four slashes are knobs that were switched off even
// when this option was in use.
////#define WALLOC_SYS_MALLOC
////#define WALLOC_GLOBAL_FREELIST
//#define WALLOC_PULL_OVERRIDE (256*128)
//#define WALLOC_FLUSH_OVERRIDE (4096*64)
//#define WALLOC_FLUSH_KEEP_OVERRIDE 0
//#define WALLOC_MAXIMUM_FAST
////#define WALLOC_CACHEHINT 64
//#include "wmalloc.hpp"
//#define malloc _walloc_raw_malloc
//#define calloc _walloc_raw_calloc
//#define realloc _walloc_raw_realloc
//#define free _walloc_raw_free
// Option 2 (ACTIVE): mimalloc. Every malloc/free call below is redirected to
// mi_malloc/mi_free.
#include <mimalloc.h>
#define malloc mi_malloc
#define free mi_free
// Option 3 (disabled): the C library's own malloc (labelled "glibc" by the
// author), reached through a wrapper marked optnone.
// NOTE(review): the wrapper presumably exists to stop the compiler from
// optimizing the allocation calls away - confirm before relying on it.
//__attribute__((optnone)) void * _malloc(size_t n) { return malloc(n); }
//#define malloc _malloc
// Hand-off queue between the allocating threads (looper) and the freeing
// thread (freeloop). Every access to global_tofree_list in this file is made
// with global_tofree_list_mtx locked.
std::mutex global_tofree_list_mtx;
// Pointers that have been allocated and verified and are waiting to be freed.
std::vector<void *> global_tofree_list;
// Stop flag polled by freeloop(); main() stores 1 into it after joining the
// workers. It has no explicit initializer: as a namespace-scope object it is
// zero-initialized before any thread starts.
std::atomic_int mustexit;
// Body of the single freeing thread.
//
// Repeatedly frees every pointer the worker threads have queued in
// global_tofree_list, until mustexit becomes non-zero.
//
// Each drain runs with global_tofree_list_mtx held, so a worker that wants to
// hand over a batch waits until the drain in progress has finished. This is
// deliberate: it keeps the contention pattern of the stress test unchanged.
//
// Once the stop flag has been observed the queue is drained one last time.
// Without that pass, a batch queued after the last drain but before the flag
// was set would never be freed.
void freeloop()
{
    // Frees and forgets everything currently queued. The lock_guard releases
    // the mutex on every exit path, so the lock cannot be left held.
    auto drain = []()
    {
        std::lock_guard<std::mutex> guard(global_tofree_list_mtx);
        for (void * queued : global_tofree_list)
            free(queued);
        global_tofree_list.clear();
    };

    while (!mustexit)
    {
        drain();
        // Hook used when testing the custom allocator (see the allocator
        // selection section at the top of the file):
        //_walloc_flush_freelists();
    }

    // Final pass: pick up whatever was queued between the last drain above
    // and the moment the stop flag became visible.
    drain();
}
// Counter from which every looper() invocation draws its own index with
// fetch_add(1), so no two workers get the same value.
std::atomic_int tc = 0;
// Scratch table of in-flight allocations: row ptrs[index] belongs to the worker
// that drew `index` from tc and holds the 8 blocks it is currently writing and
// checking. There are 512 rows, so only indices 0..511 have a valid row.
int * ptrs[512][8];
// Body of one allocating worker thread.
//
// Draws a unique index from tc, then runs 1,000,000 rounds. In each round it
// allocates 8 blocks of 2^(round % 20) ints, stamps the first int of each
// block with a value derived from the slot number and the thread index,
// verifies the stamps in reverse order and queues the blocks for freeing, and
// then repeats the same allocate/stamp/verify cycle in forward order.
//
// Blocks are never freed here. They are collected in a thread-local batch that
// is appended to global_tofree_list (under global_tofree_list_mtx) whenever it
// grows past 100 entries, and once more when the worker finishes so the final
// partial batch is not lost.
//
// The process is aborted if the thread index does not fit the ptrs table or if
// an allocation returns a null pointer. A stamp mismatch trips an assert.
void looper()
{
    const int unique = tc.fetch_add(1);

    // ptrs has a fixed number of rows; refuse to run rather than write outside it.
    const int slot_count = static_cast<int>(sizeof(ptrs) / sizeof(ptrs[0]));
    if (unique < 0 || unique >= slot_count)
    {
        fprintf(stderr, "looper: thread index %d does not fit in ptrs[%d]\n", unique, slot_count);
        abort();
    }
    int ** const slots = ptrs[unique];
    const int stamp_base = unique * 1523;

    std::vector<void *> tofree_list;

    // Appends the local batch to the shared queue and empties it. The
    // lock_guard releases the mutex even if the insert throws.
    auto flush = [&]()
    {
        if (tofree_list.empty())
            return;
        {
            std::lock_guard<std::mutex> guard(global_tofree_list_mtx);
            global_tofree_list.insert(global_tofree_list.end(), tofree_list.begin(), tofree_list.end());
        }
        tofree_list.clear();
    };

    // Queues one block for the freeing thread, handing batches over in bulk.
    auto do_free = [&](void * p)
    {
        tofree_list.push_back(p);
        if (tofree_list.size() > 100)
            flush();
    };

    // Allocates room for `count` ints; a null result is fatal because the
    // caller writes through the returned pointer immediately.
    auto alloc_ints = [](size_t count) -> int *
    {
        int * block = static_cast<int *>(malloc(count * sizeof(int)));
        if (!block)
        {
            fprintf(stderr, "looper: allocation of %zu bytes failed\n", count * sizeof(int));
            abort();
        }
        return block;
    };

    for (int i = 0; i < 1000000; ++i)
    {
        // Block size cycles through 1, 2, 4, ... 2^19 ints.
        const size_t s = 1ULL << (i%20);

        // Pass 1: allocate in order, release in reverse order (FILO).
        for (int j = 0; j < 8; j++)
        {
            slots[j] = alloc_ints(s);
            *slots[j] = j + stamp_base;
        }
        for (int j = 8; j > 0; j--)
        {
            if (*slots[j-1] != j-1 + stamp_base)
                assert(((void)"memory corruption! (FILO)", 0));
            do_free(slots[j-1]);
        }

        // Pass 2: allocate in order, release in the same order (FIFO).
        for (int j = 0; j < 8; j++)
        {
            slots[j] = alloc_ints(s);
            *slots[j] = j + stamp_base;
        }
        for (int j = 0; j < 8; j++)
        {
            if (*slots[j] != j + stamp_base)
                assert(((void)"memory corruption! (FIFO)", 0));
            do_free(slots[j]);
        }
    }

    // Hand over the final partial batch; otherwise these blocks would never
    // reach the freeing thread.
    flush();

    fflush(stdout);
}
// Entry point of the stress test.
//
// Starts 32 worker threads running looper(), then one thread running
// freeloop(). Waits for all workers, raises the stop flag for the freeing
// thread, waits for it too, and prints "Done!".
int main()
{
    const int worker_total = 32;

    std::vector<std::thread> workers;
    for (int remaining = worker_total; remaining > 0; --remaining)
        workers.emplace_back(looper);

    // The freeing thread is launched only after every worker has been created.
    std::thread consumer(freeloop);

    for (std::size_t k = 0; k < workers.size(); ++k)
        workers[k].join();

    // All producers are done; let the freeing thread leave its loop.
    mustexit.store(1);
    consumer.join();

    puts("Done!");
    return 0;
}