/*
 *   Copyright (C) 2009, 2010, 2011 Lockless Inc., Steven Von Fuerst.
 *
 * This library is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Implement a lockfree allocator based upon lockless queues
 * that communicate between processors, and btrees to hold the
 * unallocated memory.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "compiler.h"
#include "ll_asm.h"
#include "ll_list.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdarg.h>

#ifndef WINDOWS
#include <sys/mman.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>
#endif /* !WINDOWS */

#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <assert.h>
// #include <malloc.h>
#define USE_PREFIX 1

#ifdef USE_DLL
#define PREFIX(X)   llalloc##X
#else
#ifdef USE_PREFIX
#define PREFIX(X)   llalloc##X
#else
#define PREFIX(X)   X
#endif
#endif


void *PREFIX(memalign)(size_t align, size_t size);
void *PREFIX(malloc)(size_t size);
void *PREFIX(realloc)(void *p, size_t size);
int PREFIX(posix_memalign)(void **p, size_t align, size_t size);
void *PREFIX(calloc)(size_t n, size_t size);
void PREFIX(free)(void *p);

static size_t malloc_usable_size(void *p);

/* Debugging */
//#define DEBUG_ALLOC

/* Extra checking */
//#define DEBUG_ALLOC_SLOW

/* Memory leak debugging */
//#define DEBUG_LEAK
//#define DEBUG_LEAK_DISP		0

/* For windows and valgrind */
//#define EMU_SBRK
//#define EMU_SBRK_VG

/* Turn off slab usage - useful for debugging btree and small alloc code */
//#define DEBUG_NO_SLAB

/* Turn on home-made profiler */
//#define DEBUG_PROFILE

#ifdef DEBUG_PROFILE
#include "prof.h"
#else
#define DECL_PROF_FUNC int __attribute((unused)) profile_dummy
#endif


#ifndef WINDOWS
#define PAGESIZE		4096UL
#else
#define PAGESIZE		((size_t) 65536)
#endif

/* Separator between allocations */
#define SEPSIZE			16
#define PTRSIZE			8

#define HEADERSIZE		32

#define ADDR_SIZE		27

#define SLABSIZE		((uintptr_t) (1 << 17))
//#define SLABBMAX		((SLABSIZE / 8) - 2)
#define SLABBMAX		64	/* About 4M per thread */

/* Slab sizes 0 to 512 bytes in steps of 16 */
#define NUM_SB			33
#define SB_MAX			((NUM_SB - 1) * 16)

/* Maximum size of medium allocations */
#define BTMALLOC		((1L << ADDR_SIZE) - HEADERSIZE)

#define TOP_SIZE		(-(PAGESIZE * 2))

/* Minimum size to allocate at a time */
#define MINALLOC		(1L << 21)

/* 64 queues */
#define NUM_QS			64
#define QS_MAX			(NUM_QS * 16 - SEPSIZE)

/* Only check four fast bins */
#define FAST_MASK		0x0fULL

/* Clear the fast lists at least this often on free() */
#define FREE_FAST		((1 << 16) - 1)

/* The biggest size that can reasonably be stored in the fast lists */
#ifdef __x86_64__
#define FAST_64_BIN		67108863
#else
#define FAST_64_BIN		3669975
#endif
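
/*
 * Note on MYSIZE_TO_PTR() below (as I read it): it turns an allocation size
 * N (a multiple of 16) into a pointer to the matching qs[] bucket head by
 * doing the array indexing by hand.  On x86_64 each dlist is 16 bytes, so
 * the expression lands on &tl->qs[N/16 - 1]; on 32 bit a dlist is 8 bytes,
 * hence the N/2 - PTRSIZE scaling.
 */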


#ifdef __x86_64__
#define MYSIZE_TO_PTR(T, N) ((dlist *) (((char *) T) + offsetof(atls, qs) + N - SEPSIZE))
#else
#define MYSIZE_TO_PTR(T, N) ((dlist *) (((char *) T) + offsetof(atls, qs) + N/2 - PTRSIZE))
#endif

/* Fast-lists */
#define NUM_FL			64

/* btree size */
#define BT_MAX			16

/* 64bit mask type */
typedef unsigned long long u64b;

/* Pre declare */
typedef struct btree btree;

typedef struct sep sep;
struct sep
{
	btree *left;

#ifndef __x86_64__
	int pad;
#endif

	__extension__ union
	{
		__extension__ struct
		{
			unsigned bs_offset;
			unsigned size;
		};
		uintptr_t prev;
	};
};

struct btree
{
	/* Separator */
	sep s;

	__extension__ union
	{
		slist list;
		dlist list2;
		void *data;

		__extension__ struct
		{
			btree *parent;
			unsigned bsize[BT_MAX + 1];
			char prev[BT_MAX + 1];
			btree *ptr[BT_MAX];
		};
	};
#ifndef __x86_64__
	unsigned pad;
#endif
};

#ifdef WINDOWS
/* For documentation purposes only */
struct mallinfo
{
	/* Total space allocated with sbrk in all threads */
	int arena;

	/* Number of ordinary (non-slab and non-mmap) allocations */
	int ordblks;

	/* Number of blocks in this thread's slab */
	int smblks;

	/* Number of mmaped chunks in our thread */
	int hblks;

	/* Number of btree nodes for our thread */
	int hblkhd;

	/* Total (possibly partially) used slab blocks */
	int usmblks;

	/* Total number of free slab blocks */
	int fsmblks;

	/* Total allocated space for this thread including overhead */
	int uordblks;

	/* Total free space for this thread in ordinary mmap region */
	int fordblks;

	/* zero */
	int keepcost;
};
#endif


/* Separator bitflags */
#define FLG_UNUSED	0x01
#define FLG_LUNUSED	0x02
#define FLG_LSIZE8	0x04
#define FLG_SIZE8	0x08

static int b_leaf(btree *b);

#define SEP_INDEX(b, loc, v) (((unsigned char *) &((b)->bsize[loc]))[v])

/* Index into the zeroth int in bsize[] */
#define b_start(b) SEP_INDEX(b, 0, 0)
#define b_pindex(b) SEP_INDEX(b, 0, 1)
#define b_mask(b) (*(unsigned short*) &SEP_INDEX(b, 0, 2))

#define b_next(b, loc) SEP_INDEX(b, loc, 0)
#define b_prev(b, loc) (b->prev[loc])
#define b_last(b) b_prev(b, 0)
#define b_ptr(b, loc) (b->ptr[(loc) - 1])
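
/*
 * The macros above pack the btree node header into bsize[0] and prev[0],
 * which never hold a real child entry: byte 0 of bsize[0] is the index of
 * the first child (b_start), byte 1 is this node's slot in its parent
 * (b_pindex), and bytes 2-3 form a 16 bit mask of free child slots (b_mask).
 * prev[0] doubles as the index of the last child (b_last).  For loc >= 1,
 * the low byte of bsize[loc] is the next-sibling index and the upper 24 bits
 * hold the subtree size, which is why btree_ssize() masks with ~0xff.
 */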

typedef union mealloc mealloc;
union mealloc
{
	__extension__ struct
	{
		slist **tail;
		dlist m_list;
	};

	__extension__ struct
	{
		char pad[16];
		btree b;
	};

	/* Prevent compiler warning "no named members" */
	void *dummy;
};

typedef struct sbheader sbheader;
struct sbheader
{
	__extension__ union
	{
		__extension__ struct
		{
			slist **tail;

			dlist list;

			uintptr_t max;

			unsigned size;

		};

		/* First cache line is mostly read-only */
		char pad[64];
	};

	/* Second cache line is read-write */
	uintptr_t first;

	unsigned used;

#ifndef __x86_64__
	u64b dummy;	/* padding to get right alignment */
#endif

	/* This needs to be 16 byte aligned */
	void *data;
};

/* 64k block of pointers to free blocks */
typedef struct freesb freesb;
struct freesb
{
	freesb *next;
	unsigned count;

	sbheader *blocks[SLABBMAX];
};

typedef struct atls atls;
struct atls
{
	slist fl[NUM_FL];
	u64b f_mask;

#ifndef __x86_64__
	unsigned dummy;	/* padding to get right q8 alignment */
#endif

	/* Note that qs[0] is a misaligned btree pointer! */
	dlist qs[NUM_QS];

	__extension__ union
	{
		__extension__ struct
		{
			/* Overlap with the separator in bheap */
			slist btree_freenode;
			unsigned b_hgt;
			unsigned b_cnt;
		};

		btree bheap;
	};

	u64b q_mask;

	/* Partially full slabs */
	dlist slab[NUM_SB];

	dlist slab_full;

	freesb *slab_chunk;

	size_t percpu_hash;

	size_t a_alloced;
	size_t s_wanted;

	slist *head;

	dlist bl;

#ifdef DEBUG_LEAK
	int leak_fd;
#endif

	/* Hazard list */
	dlist h_list;
	void *hazard;

	/* Deleted list */
	atls *d_list;

	int fcount;

	char callocable;

	char dummy3[59];

	/* Off by itself to reduce false sharing */
	slist *tail;
};

#ifdef USE_DLL
#define PREFIX(X)	llalloc##X
#else
#ifdef USE_PREFIX
#define PREFIX(X)	llalloc##X
#else
#define PREFIX(X)	X
#endif
#endif

#ifndef DEBUG_ALLOC
#define always_inline inline __attribute__((always_inline))
#else
#define always_inline
#endif

#ifdef WINDOWS

void llmutex_lock(void *l);
void llmutex_unlock(void *l);
int llmutex_trylock(void *l);

typedef void * mutex_t;
#define mutex_lock llmutex_lock
#define mutex_unlock llmutex_unlock
#define mutex_trylock llmutex_trylock
#define MUTEX_INITIALIZER {0}

#ifndef EMU_SBRK
#define EMU_SBRK
#endif

#define set_enomem() _set_errno(ENOMEM)

/* Other functions that need prototypes */
//#if defined( USE_DLL ) || defined( USE_PREFIX )
// #ifdef USE_PREFIX
// void PREFIX(free)(void *p);
// void *PREFIX(malloc)(size_t size);
// void *PREFIX(calloc)(size_t size, size_t n);
// void *PREFIX(realloc)(void *p, size_t size);
// size_t PREFIX(_msize)(void *p);
// void *PREFIX(_expand)(void *p, size_t size);
// 
// /* Hack - indirect calls to crt functions */
// int (* __callnewh)(size_t);
// int (* __newmode)(void);
// #endif /* USE_DLL */


void PREFIX(free)(void *p);
void *PREFIX(malloc)(size_t size);
void *PREFIX(calloc)(size_t size, size_t n);
void *PREFIX(realloc)(void *p, size_t size);
size_t PREFIX(_msize)(void *p);
void *PREFIX(_expand)(void *p, size_t size);



void cfree(void *p);
void *memalign(size_t align, size_t size);
int posix_memalign(void **p, size_t align, size_t size);
void *valloc(size_t size);
void *pvalloc(size_t size);
struct mallinfo mallinfo(void);
int malloc_trim(size_t pad);
int mallopt(int param, int val);
void *PREFIX(_calloc_impl)(size_t n, size_t size, int *errno_tmp);
void PREFIX(_free_nolock)(void *p);
void *PREFIX(_realloc_nolock)(void *p, size_t size);
void *PREFIX(_calloc_nolock)(size_t n, size_t size);
size_t PREFIX(_msize_nolock)(void *p);
static size_t malloc_usable_size(void *p);

void __tlregdtor(void (*)(void *));

#else /* WINDOWS */

static int sys_futex(void *addr1, int op, int val1, struct timespec *timeout, void *addr2, int val3)
{
	return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
}

#define cmpxchg(P, O, N) __sync_val_compare_and_swap((P), (O), (N))

typedef union mutex_t mutex_t;

union mutex_t
{
	unsigned u;
	struct
	{
		unsigned char locked;
		unsigned char contended;
	} b;
};
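
/*
 * m->u overlays the two bytes: on x86 (little endian) b.locked is the low
 * byte and b.contended the byte above it, so xchg_32(&m->u, 257) sets
 * locked *and* contended in one shot (257 == 0x0101), while the "& 1" in
 * mutex_lock() tests the previous locked byte.
 */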

static void mutex_init(mutex_t *m)
{
	m->u = 0;
}

static void mutex_lock(mutex_t *m)
{
	int i;

	/* Try to grab lock */
	for (i = 0; i < 100; i++)
	{
		if (!xchg_8(&m->b.locked, 1)) return;

		cpu_relax();
	}

	/* Have to sleep */
	while (xchg_32(&m->u, 257) & 1)
	{
		sys_futex(m, FUTEX_WAIT_PRIVATE, 257, NULL, NULL, 0);
	}
}

static void mutex_unlock(mutex_t *m)
{
	DECL_PROF_FUNC;

	int i;

	/* Locked and not contended */
	if ((m->u == 1) && (cmpxchg(&m->u, 1, 0) == 1)) return;

	/* Unlock */
	m->b.locked = 0;

	barrier();

	/* Spin and hope someone takes the lock */
	for (i = 0; i < 200; i++)
	{
		if (m->b.locked) return;

		cpu_relax();
	}

	/* We need to wake someone up */
	m->b.contended = 0;

	sys_futex(m, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
}

static int mutex_trylock(mutex_t *m)
{
	unsigned c;

	if (m->b.locked) return EBUSY;
	c = xchg_8(&m->b.locked, 1);
	if (!c) return 0;
	return EBUSY;
}

#define MUTEX_INITIALIZER {0}


// static void malloc_stats_aux(int show_nodes);

static gcc_used char dummy1[64];
static pthread_once_t init_once = PTHREAD_ONCE_INIT;
static pthread_key_t death_key;

/*
 * If pthread isn't linked in,
 * have weak replacements for the single-threaded case
 */
#pragma weak pthread_atfork
#pragma weak pthread_key_create
#pragma weak pthread_setspecific
#pragma weak pthread_once

#define set_enomem() (errno = ENOMEM)

#endif /* WINDOWS */

typedef union percpu_list percpu_list;
union percpu_list
{
	__extension__ struct
	{
		mutex_t m;
		freesb *list;
	};
	char pad[64];
};

/* Global thread hazard list */
static cache_align mutex_t h_lock = MUTEX_INITIALIZER;
static dlist h_list = DLIST_INIT(h_list);

static cache_align mutex_t d_lock = MUTEX_INITIALIZER;
static atls *d_list = NULL;

/* List of freed slab blocks */
static cache_align percpu_list *pc_slab;
static size_t cpu_total;

/* sbrk information */
static cache_align mutex_t sb_lock = MUTEX_INITIALIZER;
static cache_align uintptr_t sbrk_start = 0;
static uintptr_t sbrk_size = 0;
static int sbrk_oom = 0;
static unsigned sltotal[NUM_SB];

#ifdef DEBUG_LEAK
#define LEAK_MAX		1024
typedef struct bigallocs bigallocs;
struct bigallocs
{
	void *p;
	size_t size;
};

static bigallocs big_leak[LEAK_MAX] = {{NULL, 0}};
static mutex_t l_lock = MUTEX_INITIALIZER;

#endif

/* Hack */
#define BUILD_ASSERT(C) do {switch (0){case 0:; case (C):;}} while (0)
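
/*
 * The switch trick above is a compile-time assert: if C evaluates to 0,
 * "case 0:" and "case (C):" become duplicate labels and the build fails;
 * if C is non-zero the whole statement compiles away to nothing.
 */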

/* Pre-declares */
static always_inline void *split_node(atls *tl, btree *b, size_t t_size, size_t size);
static void merge_node(atls *tl, void *p);
static int init_sldata(void);
static void slab_free(atls *tl, void *p);
static void local_free(atls *tl, void *p);
static void *local_alloc(atls *tl, size_t size);
static void *slab_alloc_safe(atls *tl, size_t size);
static always_inline void *fast_alloc(atls *tl, size_t size);
static void *slow_alloc(atls *tl, size_t size);
static void atls_merge(atls *tl1, atls *tl2);
static void test_all(atls *tl);
static void *zalloc(atls *tl, size_t size);
void **independent_calloc(size_t n, size_t size, void **chunks);
void **independent_comalloc(size_t n, size_t *sizes, void **chunks);

static inline btree *small_next(btree *b)
{
	return b->data;
}

static inline void set_small_next(btree *b, btree *next)
{
	b->data = next;
}

#ifdef __x86_64__
static inline btree *small_prev(btree *b)
{
	return (btree *) (b->s.prev & ~15);
}

static inline void set_small_prev(btree *b, btree *prev)
{
	uintptr_t p = b->s.prev & 15;
	b->s.prev = p + (uintptr_t) prev;
}
#else
static inline btree *small_prev(btree *b)
{
	return (btree *) b->s.size;
}

static inline void set_small_prev(btree *b, btree *prev)
{
	b->s.size = (unsigned) prev;
}

#endif

static inline void *shift(void *p, size_t s)
{
	return &(((char *)p)[s]);
}

/* Unfortunately, TLS support with mingw is totally broken... so we need to emulate it */
#ifdef WINDOWS
static DWORD tls_index = TLS_OUT_OF_INDEXES;
static atls *get_tls(void)
{
	if (tls_index == TLS_OUT_OF_INDEXES) return NULL;

	return TlsGetValue(tls_index);
}

static void set_tls(atls *tls)
{
	TlsSetValue(tls_index, tls);
}
#else
static __thread__ atls *tls = NULL;
#define get_tls() tls
#define set_tls(T) (tls = (T))
#endif

static size_t cpu_num(void)
{
#ifdef WINDOWS                  
	SYSTEM_INFO info;

	GetSystemInfo(&info);
	return info.dwNumberOfProcessors;
#else
#ifdef SYS_MACOSX
	int num;
	size_t len = sizeof(num);
	if (sysctlbyname("hw.ncpu", &num, &len, NULL, 0)) num = 1;
	return num;
#else
	return sysconf(_SC_NPROCESSORS_ONLN);
#endif	/* SYS_MACOSX */
#endif  /* WINDOWS */
}

/*
 * Emulate sbrk()
 * Assumes we are called under a lock
 */
#ifdef EMU_SBRK

#ifdef EMU_SBRK_VG
#define SBRK_SIZE (1ULL << 30)
#else /* EMU_SBRK_VG */

#ifdef __x86_64__

/* Default to 32GiB of sbrk space */
#define SBRK_SIZE (1ULL << 37)
#else /* __x86_64__ */

/* Default to 1GiB of sbrk space */
#define SBRK_SIZE (1ULL << 30)
#endif /* __x86_64__ */
#endif /* EMU_SBRK_VG */


static void *sbrk_mmap_base = NULL;
static void *sbrk_mmap_end = 0;
static void init_sbrk(void)
{
	DECL_PROF_FUNC;

	size_t size = SBRK_SIZE;

	while (1)
	{
#ifndef WINDOWS
		sbrk_mmap_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
#else
		/* Allocate address space - but no memory */
		sbrk_mmap_base = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_READWRITE);
#endif
		if (sbrk_mmap_base == MAP_FAILED)
		{
			sbrk_mmap_base = NULL;
			size = size / 2;
			if (size < 65536) return;
		}
		else
		{
			sbrk_mmap_end = shift(sbrk_mmap_base, size);

			return;
		}
	}
}

static void *emu_sbrk(size_t size)
{
	DECL_PROF_FUNC;

	void *out;

	/* Hack - initialize if required */
	if (!size) init_sbrk();
	if (!sbrk_mmap_base) return MAP_FAILED;

	out = sbrk_mmap_base;
	sbrk_mmap_base = shift(sbrk_mmap_base, size);
	if (sbrk_mmap_base >= sbrk_mmap_end)
	{
		sbrk_mmap_base = out;

		return MAP_FAILED;
	}

#ifdef WINDOWS
	/* Enable memory */
	VirtualAlloc(out, size, MEM_COMMIT, PAGE_READWRITE);
#endif

	return out;
}

#define sbrk(S) emu_sbrk(S)
#endif

static inline int is_slab(void *p)
{
	return ((uintptr_t) p - sbrk_start < sbrk_size);
}

static inline void *page_start(void *p)
{
	return (void *) (-PAGESIZE & (uintptr_t) p);
}

static inline size_t page_align(size_t s)
{
	return -PAGESIZE & (s + PAGESIZE - 1);
}

static inline size_t sep_align(size_t s)
{
	/*
	 * We want to align on 16-byte boundaries
	 *
	 * 16 -> 16
	 * 24 -> 16
	 * 25 -> 32
	 * 32 -> 32
	 */
	/*
	 * Then we want to include the extra 8 bytes of last ptr that are free.
	 * Finally, we want to include the following sep data.
	 */
	/*
	 *	0 -> 16
	 *  8 -> 16
	 *  9 -> 32
	 * 16 -> 32
	 * 24 -> 32
	 * 25 -> 48
	 * 32 -> 48
	 * 40 -> 48
	 */

	return (s + 7 + 16) & ~15;
}

static inline int un_used(btree *b)
{
	return (b->s.bs_offset & FLG_UNUSED);
}

static inline int left_unused(btree *b)
{
	return b->s.bs_offset & FLG_LUNUSED;
}

static inline void set_unused(btree *b, btree *br)
{
	br->s.bs_offset |= FLG_LUNUSED;
	br->s.left = b;

	b->s.bs_offset |= FLG_UNUSED;
}

static inline void set_used(btree *b, size_t size)
{
	btree *br = shift(b, size);
	br->s.bs_offset &= ~FLG_LUNUSED;

#ifdef DEBUG_ALLOC_SLOW
	if (size != b->s.size) errx(1, "size mismatch\n");
#endif

	b->s.bs_offset &= ~FLG_UNUSED;
}
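
/*
 * Size-8 allocations get special treatment: set_size8()/unset_size8() below
 * flag a 16 byte chunk with FLG_SIZE8 (and FLG_LSIZE8 in its right
 * neighbour) instead of storing a size, and unset_size8() reconstructs the
 * size and block offset from that neighbour when the flag is cleared.
 */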

static inline void set_size8(btree *b)
{
	btree *br = shift(b, 16);
	br->s.bs_offset |= FLG_LSIZE8;
	b->s.bs_offset |= FLG_SIZE8;
}

static inline void unset_size8(btree *b)
{
	btree *br = shift(b, 16);
	br->s.bs_offset &= ~FLG_LSIZE8;
	b->s.bs_offset &= ~FLG_SIZE8;
	b->s.size = 16;
	b->s.bs_offset &= 15;
	b->s.bs_offset += (br->s.bs_offset & ~15) - 16;
}

#ifdef __x86_64__
static inline btree *get_q8(atls *tl)
{
	/* Mega hack - align so that we return a btree object pointer to the correct memory locations */
	return shift(&tl->qs[0], -(uintptr_t)8);
}
#else
static inline btree *get_q8(atls *tl)
{
	/* Mega hack - align so that we return a btree object pointer to the correct memory locations */
	return shift(&tl->qs[0], -(uintptr_t)12);
}
#endif
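
/*
 * What the hack above relies on (as far as I can tell): qs[0] is never used
 * as a normal queue, and shifting its address back by a fixed 8 bytes (12 on
 * 32 bit) makes the only btree fields the small list touches (s.prev/s.size
 * and data) land exactly on the two pointers of the qs[0] slot.  get_q8()
 * therefore returns a fake btree that serves as the size-8 list head without
 * needing any extra storage.
 */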

static inline btree *read_left(btree *b)
{
	if (!left_unused(b)) return NULL;
	if (b->s.bs_offset & FLG_LSIZE8) return shift(b, -(uintptr_t)16);
	return b->s.left;
}

static inline mealloc *read_bs(btree *b)
{
	uintptr_t s = b->s.bs_offset & ~15;

#ifdef DEBUG_ALLOC_SLOW
	void *v = shift(b, -s);
	if ((PAGESIZE - 1) & (uintptr_t) v) errx(1, "mealloc misaligned\n");
#endif

	return (mealloc *) shift(b, -s);
}

static void btree_init(btree *b)
{
	/* Init header */
	//b_start(b) = 0;
	b_mask(b) = -1;
	//b_pindex(b) = 0;
	//b_last(b) = 0;
}

static inline void set_sep(btree *b, int size, btree *bo)
{
	unsigned offset = bo->s.bs_offset & ~15;

	/* Store split block offset + size + used indicators */
	b->s.bs_offset = offset + ((uintptr_t) b - (uintptr_t) bo);
	b->s.size = size;
}

#ifdef DEBUG_ALLOC
static void check_sep(btree *b)
{
	btree *br = shift(b, b->s.size);

	if (((uintptr_t) b) & 15) errx(1, "btree misaligned\n");

	if (is_slab(&b->data)) errx(1, "btree slab overlap\n");

	/* Test unused bit */
	if (un_used(b))
	{
		if (b->s.bs_offset & FLG_SIZE8)
		{
			br = shift(b, 16);

			if (!(br->s.bs_offset & FLG_LSIZE8)) errx(1, "size8 bit mismatch\n");
		}
		else
		{
			if (b->s.size & 15) errx(1, "mysize misaligned\n");

			if ((b->s.size == 16) || (br->s.bs_offset & FLG_LSIZE8)) errx(1, "size8 bit mismatch\n");
			if (read_left(br) != b) errx(1, "left pointer wrong\n");
		}

		if (!left_unused(br)) errx(1, "Unused flag conflict\n");
	}
	else
	{
		if (b->s.size & 15) errx(1, "mysize misaligned\n");
		if (left_unused(br)) errx(1, "Unused flag conflict\n");
	}
}
#else
#define check_sep(B) ((void) sizeof(B))
#endif

#ifndef WINDOWS
static __attribute__((format (gnu_printf, 2, 3))) void leak_print(atls *tl, const char *format, ...)
{
	char buf[1024];

	va_list ap;

	va_start(ap, format);
	vsnprintf(buf, 1024, format, ap);
	va_end(ap);

#ifdef DEBUG_LEAK
	/* Need tls and leak_fd initialized */
	if (tl)
	{
		if (tl->leak_fd == -1)
		{
			char buf[1024];
			int pid = getpid();
			int tid = syscall(SYS_gettid);

			snprintf(buf, 1024, "/tmp/leak-%d:%d.txt", pid, tid);

			tl->leak_fd = open(buf, O_WRONLY | O_CREAT | O_APPEND,  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
		}

		if (tl->leak_fd != -1)
		{
			int len = strlen(buf);
			char *c = buf;

			while (len)
			{
				int out = write(tl->leak_fd, c, len);

				/* Interrupted - try again */
				if (out == -1) continue;

				/* Device is full - stop writing */
				if (!out) return;

				len -= out;
				c += out;
			}
		}
	}
#else
	/* Shut up compiler warning */
	(void) tl;

	/* Otherwise output to stderr */
	fprintf(stderr, "%s", buf);

#endif
}
#else
#define leak_print(...)
#define malloc_stats_aux(...)
#endif

#ifdef DEBUG_LEAK
static void big_alloced(void *p, size_t size)
{
	int i;

	mutex_lock(&l_lock);
	for (i = 0; i < LEAK_MAX; i++)
	{
		if (big_leak[i].p) continue;

		big_leak[i].p = p;
		big_leak[i].size = size;
		mutex_unlock(&l_lock);
		leak_print(get_tls(), "Big alloc %p %llu\n", p, (unsigned long long) size);
		return;
	}
	mutex_unlock(&l_lock);

	errx(1, "debug leak oom, increase LEAK_MAX\n");
}

static void big_freed(void *p, size_t size)
{
	int i;

	mutex_lock(&l_lock);
	for (i = 0; i < LEAK_MAX; i++)
	{
		if (big_leak[i].p != p) continue;

		if (big_leak[i].size != size) errx(1, "big alloc size mismatch\n");
		big_leak[i].p = NULL;
		mutex_unlock(&l_lock);
		leak_print(get_tls(), "Big free %p %llu\n", p, (unsigned long long) size);
		return;
	}
	mutex_unlock(&l_lock);

	errx(1, "freeing unknown large block %p\n", p);
}

static size_t big_block_size(void *p)
{
	int i;

	mutex_lock(&l_lock);
	for (i = 0; i < LEAK_MAX; i++)
	{
		if (big_leak[i].p != p) continue;
		mutex_unlock(&l_lock);
		return big_leak[i].size;
	}

	mutex_unlock(&l_lock);

	errx(1, "freeing unknown large block %p\n", p);
}

static void test_leak_aux(void)
{
	atls *tl = get_tls();
	if (!tl) return;
	malloc_stats_aux(3);
	leak_print(tl, "Done\n");
	close(tl->leak_fd);
}

static void test_leak(void)
{
	static int count = 0;

	/* Display turned off? */
	if (!DEBUG_LEAK_DISP) return;

	/* Don't bother to be thread safe - it doesn't matter much */
	if (count++ == DEBUG_LEAK_DISP)
	{
		count = 0;
		malloc_stats_aux(3);
	}
}


#else
#define big_alloced(p, size) ((void) (sizeof(p) + sizeof(size)))
#define big_freed(p, size) ((void)(sizeof(p) + sizeof(size)))
#define test_leak_aux()
#define test_leak()
#define big_block_size(p) (sizeof(p) * 0)
#endif


/* Big allocations */
static void *big_alloc_aux(size_t size)
{
	DECL_PROF_FUNC;

	/* Get memory */
#ifndef WINDOWS
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
#else
	void *p = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
#endif

	/* Out of memory */
	if (p == MAP_FAILED) return NULL;

	big_alloced(p, size);

	/* Done */
	return p;
}

#ifdef WINDOWS
int handle_oom(int size);
#else
#define handle_oom(S)	((errno = ENOMEM), 0)
#endif


static noinline void *big_alloc(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	size_t psize;

	size_t *p;

	/* This argument prevents register problems in the fast path */
	(void) tl;

	if (size > TOP_SIZE) goto nomem;

	/* Get real size to allocate */
	psize = page_align(size + SEPSIZE);

	p = big_alloc_aux(psize);

	if (p)
	{
		*p = psize;
		return shift(p, SEPSIZE);
	}

nomem:

	if (handle_oom(size)) return big_alloc(tl, size);

	return NULL;
}

#ifdef WINDOWS
static noinline void big_free_aux(size_t *p)
{
	DECL_PROF_FUNC;

	big_freed(p, *p);

	VirtualFree(p, 0, MEM_RELEASE);
}
#else
static inline void big_free_aux(size_t *p)
{
	big_freed(p, *p);

	munmap(p, *p);
}
#endif


#ifdef DEBUG_ALLOC_SLOW
static void test_queue(atls *tl)
{
	slist *q;

	btree *b;

	/* Scan incoming queue, looking for corruption */
	for (q = tl->head; q; q = q->next)
	{
		/* Ignore slab nodes */
		if (is_slab(q)) continue;

		if (((uintptr_t) q) & 15) errx(1, "incoming queue corrupted\n");

		b = CONTAINER(btree, data, q);

		if (un_used(b)) errx(1, "queue element marked as unused\n");
	}
}
#else
#define test_queue(T) ((void) sizeof(T))
#endif

#ifdef __x86_64__
/* Magic code that converts size to entry in fast-list array */
static always_inline size_t size2fl(ssize_t size)
{
	size_t n = (size / 32);

	/* Make sure we don't overflow */
	if (size == 16) return 0;
	if (size > FAST_64_BIN) return NUM_FL - 1;

	n = n * n * n;

	return flsq(n);
}
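
/*
 * The cube-then-flsq trick above gives roughly three bins per power of two:
 * flsq(n^3) ~= 3*log2(size/32).  The explicit checks handle the two ends:
 * size 16 (n would be 0, so flsq is undefined) maps to bin 0, and anything
 * above FAST_64_BIN, which would index past the last bin (and eventually
 * overflow n^3), is pinned to NUM_FL - 1.
 */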
#else
/* Magic code that converts size to entry in fast-list array */
static inline size_t size2fl(ssize_t size)
{
	size_t n;

	/* 32 bit version uses old floating point instructions */
	union di
	{
		double d;
		unsigned u2[2];
	} di;

	di.d = size + 40;

	n = (di.u2[1] >> 18) - 4115;

	/* Make sure we don't overflow */
	if (n >= NUM_FL) n = NUM_FL - 1;

	return n;
}
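
/*
 * A sketch of how I read the 32 bit variant above: di.u2[1] is the high
 * word of the IEEE-754 double (little-endian layout assumed), so the >> 18
 * keeps the 11 bit exponent plus the top two mantissa bits - roughly four
 * bins per power of two - while the 4115 folds in the exponent bias.  The
 * "+ 40" lines the smallest size (16) up with bin 0.
 */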
#endif


/* Add to previous list - but don't set flag */
static always_inline void fast_add(atls *tl, btree *b, size_t n)
{
	slist_add(&tl->fl[n], &b->list);
	tl->f_mask |= 1ULL << n;
}

/* Add to free lists */
static always_inline void fast_free(atls *tl, btree *b, size_t ms)
{
	size_t n = size2fl(ms);

#ifdef DEBUG_ALLOC_SLOW
	if (un_used(b)) errx(1, "fast_free() needs used node\n");
	if (b->s.size != ms) errx(1, "fast_free size wrong\n");
#endif

	fast_add(tl, b, n);
}
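
/*
 * scan_queue() below drains this thread's incoming remote-free queue.
 * Every element except the last one is freed; the final element stays put
 * because it is the current tail that other threads append to in
 * prepend_queue() (they may still be about to write its next pointer), and
 * it also serves as the queue head for the next scan.  If that kept node is
 * large it is trimmed with split_node() so it doesn't pin much memory.
 */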

static int scan_queue(atls *tl, slist **qh, size_t wanted)
{
	DECL_PROF_FUNC;

	slist *q, *qn, *qp = NULL;

	btree *b;

	size_t msize;
	int flag = 0;

	/* Size wanted */
	tl->s_wanted = wanted;

	/* Scan incoming queue, freeing as we go */
	for (q = *qh; q; q = qn)
	{
#ifdef DEBUG_ALLOC_SLOW
		if (!is_slab(q))
		{
			if (((uintptr_t) q) & 15) errx(1, "incoming queue corrupted\n");
		}
#endif

		qn = q->next;
		qp = q;
		if (qn)
		{
			if (is_slab(q))
			{
				slab_free(tl, q);
			}
			else
			{
				merge_node(tl, q);
			}

			flag = 1;
		}
	}

	*qh = qp;

	/* Reset size wanted */
	tl->s_wanted = 0;

	/*
	 * Make sure the last node isn't taking up too much room.
	 * Note that a slab node can only take up a max of SB_MAX bytes.
	 * (They aren't splittable anyway)
	 */
	if (is_slab(qp)) return flag;

	b = CONTAINER(btree, data, qp);

	msize = b->s.size;

	/* Don't split if too small */
	if (msize <= (1 << 16)) return flag;

	/* Make the head node take up less room.  Also, size 32 is faster than 16. */
	split_node(tl, b, msize, 32);

	return 1;
}


#ifdef DEBUG_ALLOC_SLOW

static void test_node(atls *tl, btree *b)
{
	mealloc *m = read_bs(b);

	if (tl != get_tls()) errx(1, "tls incorrect\n");
	if (m->tail != &tl->tail) errx(1, "node owner wrong\n");
}

/* Test fast list constraints */
static void test_fast_lists(atls *tl)
{
	int i, j;

	//if (tl->fl[63].next) errx(1, "fast list overflow\n");

	for (i = 0; i < NUM_FL; i++)
	{
		slist *p = &tl->fl[i];
		slist *f;

		scan_slist(p, f)
		{
			btree *b = CONTAINER(btree, list, f);

			/* Are we a slab node? */
			if (is_slab(&b->data))
			{
				errx(1, "Slab node on fast list\n");
			}

			test_node(tl, b);

			if (un_used(b)) errx(1, "Unused element in fast list\n");
			check_sep(b);

			j = size2fl(b->s.size);
			if ((i != j) && (i != j - 1)) errx(1, "Fast element in wrong bin\n");


			if (!(((uintptr_t) b ^ (uintptr_t) tl) & ~(PAGESIZE - 1))) errx(1, "tls on fast list!\n");
			//if (f == tl->head) errx(1, "queue head in fast list\n");

			if (f->next == f) errx(1, "fast list loop\n");
		}
	}
}
#else
#define test_fast_lists(T) ((void) sizeof(T))
#endif

/* Clear fast-lists */
static void clear_fast(atls *tl)
{
	DECL_PROF_FUNC;

	u64b mask = tl->f_mask;

	/* Anything to do? */
	while (mask)
	{
		size_t n = ffsq(mask);

		slist *p = &tl->fl[n];

		/* Get mask bit */
		mask &= -mask;

		/* Convert to a mask */
		mask = ~mask;

		/* Properly free everything in the list */
		while (p->next)
		{
			merge_node(tl, slist_rem(p));
		}

		/* Clear bottom bit */
		tl->f_mask &= mask;
		mask = tl->f_mask;
	}
}

/* Hack - same as clear_fast() but free nodes from tl2 into tl1 */
static void fast_merge(atls *tl1, atls *tl2)
{
	size_t n;
	slist *p = tl2->fl;

	/* Anything to do? */
	while (tl2->f_mask)
	{
		n = ffsq(tl2->f_mask);
		p = &tl2->fl[n];

		/* Turn off bit in f_mask, as nothing will be left there */
		tl2->f_mask &= tl2->f_mask - 1;

		/* Properly free everything in the list */
		while (p->next)
		{
			void *l = slist_rem(p);

			merge_node(tl1, l);
		}
	}
}

static noinline int reap_dead(atls *tl)
{
	DECL_PROF_FUNC;

	dlist *d;

	atls *tl2, *tl3;

	/* Check without taking mutex */
	if (!d_list) return 0;

	/* Try to get dead thread */
	if (mutex_trylock(&d_lock)) return 0;

	if (!d_list)
	{
		/* Nothing there */
		mutex_unlock(&d_lock);
		return 0;
	}

	/* Grab dead thread */
	tl2 = d_list;
	d_list = tl2->d_list;
	mutex_unlock(&d_lock);

	mutex_lock(&h_lock);

	/* Remove from hazard list */
	dlist_del(&tl2->h_list);

	/* Set flag so that memless free works */
	tl2->h_list.next = NULL;

	/* Merge data + update tail pointers */
	atls_merge(tl, tl2);

	/* Wait for all threads to not point to dead thread */
	scan_list(&h_list, d)
	{
		tl3 = list_entry(atls, h_list, d);

		while (tl3->hazard == &tl2->tail) cpu_relax();
	}

	mutex_unlock(&h_lock);

	/* Scan all final pending */
	scan_queue(tl, &tl2->head, 0);

	/* Free head */
	local_free(tl, tl2->head);

	/* Finally free tls data for dead thread */
#ifdef WINDOWS		
	VirtualFree(page_start(tl2), 0, MEM_RELEASE);
#else
	munmap(page_start(tl2), PAGESIZE);
#endif

	test_all(tl);

	/* Try to free up memory */
	return 1;
}
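
/*
 * prepend_queue() below pushes a freed block onto the owning thread's
 * remote-free queue.  The xchg on tl->hazard publishes which tail pointer we
 * are about to dereference; re-reading *bs afterwards and looping until it
 * is stable is the hazard-pointer handshake that reap_dead() relies on
 * before it unmaps a dead thread's atls.
 */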

static void prepend_queue(slist *p, atls *tl, slist ***bs)
{
	DECL_PROF_FUNC;

	slist *tail;

	slist **btail = *bs;
	slist **btold;

	do
	{
		btold = btail;

		/* Make sure we write to the hazard pointer */
		xchg_ptr(&tl->hazard, btail);

		/* Has it changed while we were writing to our hazard pointer? */
		btail = *bs;
	}
	while (btold != btail);

	p->next = NULL;
	tail = xchg_ptr(btail, p);
	tail->next = p;

	barrier();

	tl->hazard = NULL;
}

static void destroy_tls(void *dummy)
{
	DECL_PROF_FUNC;

	atls *tl = get_tls();

	(void) dummy;

	test_all(tl);

	test_leak_aux();

	/*
	 * Make sure that any recursion via signals or other
	 * pthread_key destructors will reset this handler.
	 */
#ifdef WINDOWS
	set_tls((atls *) 1);
#else
	set_tls(NULL);
#endif

	/* The above line isn't allowed to be moved inside the lock due to possible signals */
	barrier();

	/* Add to dead list */
	mutex_lock(&d_lock);
	tl->d_list = d_list;
	d_list = tl;
	mutex_unlock(&d_lock);
}


/* Convert a pointer into a 32bit random number */
static unsigned rnd_ptr(void *p)
{
	u64b rnd_seed = (uintptr_t) p;
	rnd_seed *= 7319936632422683443ULL;
	rnd_seed ^= rnd_seed >> 32;
	rnd_seed *= 7319936632422683443ULL;
	rnd_seed ^= rnd_seed >> 32;

	/* Truncate to 32 bits */
	return rnd_seed;
}
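
/*
 * The mix above appears to be a simple multiply/xor-fold hash (multiply by a
 * large odd constant, fold the high half down, twice) - enough to spread the
 * pointer bits for cache colouring, not meant to be cryptographic.
 */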

/*
 * Pick a random offset from p into a region of size total
 * to fit an object of size size.
 *
 * Return a pointer to the object
 */
static void *rnd_offset(void *p, size_t total, size_t size)
{
	u64b slack_space = total - size;

	unsigned rng = rnd_ptr(p);

	unsigned offset = (slack_space * rng) >> 32;

	/* Keep 16-byte alignment */
	offset &= ~15;

	return shift(p, offset);
}

static atls *init_atls(atls *tl)
{
	int i;

	mealloc *m;
	btree *b, *br;

	btree *q8;

	/* Init lists */
	dlist_init(&tl->bl);

	/* queue 0 is taken by size 8 small-list */
	for (i = 1; i < NUM_QS; i++)
	{
		dlist_init(&tl->qs[i]);
	}

	/* Init small list */
	q8 = get_q8(tl);

#ifdef DEBUG_ALLOC_SLOW
	/* Btree needs to be correctly aligned */
	if (((uintptr_t) q8) & 15) errx(1, "q8 misaligned\n");
#endif

	set_small_next(q8, q8);
	set_small_prev(q8, q8);

	/* Init slabs */
	for (i = 0; i < NUM_SB; i++)
	{
		dlist_init(&tl->slab[i]);
	}
	dlist_init(&tl->slab_full);

	tl->percpu_hash = rnd_ptr(tl);

	/* Init btree */
	btree_init(&tl->bheap);

	/* Need a maximum of 2 nodes at this point */
	tl->b_hgt = 2;

#ifdef DEBUG_LEAK
	tl->leak_fd = -1;
#endif

	/* Grab initial allocation */
	m = big_alloc_aux(PAGESIZE);
	if (!m)
	{
		set_tls(NULL);
#ifdef WINDOWS
		VirtualFree(page_start(tl), 0, MEM_RELEASE);
#else
		munmap(page_start(tl), PAGESIZE);
#endif
		return NULL;
	}

	/* Keep track of total allocations */
	tl->a_alloced = PAGESIZE;

	/* Fill in header */
	dlist_add(&tl->bl, &m->m_list);

	m->tail = &tl->tail;

	b = &m->b;

	/* Create left separator */
	b->s.size = PAGESIZE - HEADERSIZE;
	b->s.bs_offset = 16;

	/* Position of right separator */
	br = shift(b, b->s.size);

	/* Create right separator */
	br->s.bs_offset = PAGESIZE - SEPSIZE;
	split_node(tl, b, b->s.size, SEPSIZE);

	/* Make queue */
	tl->head = (void *) &b->data;
	tl->tail = tl->head;
	tl->head->next = NULL;

	/* Add to hazard list */
	mutex_lock(&h_lock);
	dlist_add(&h_list, &tl->h_list);
	mutex_unlock(&h_lock);

	return tl;
}

#ifndef WINDOWS

static void prepare_fork(void)
{
	size_t i;

	/* Stabilize slab */
	for (i = 0; i < cpu_total; i++)
	{
		mutex_lock(&pc_slab[i].m);
	}

	/* Stabilize hazard list */
	mutex_lock(&h_lock);

	/* Stabilize dead list */
	mutex_lock(&d_lock);

	/* Stabilize sbrk */
	mutex_lock(&sb_lock);
}

static void parent_fork(void)
{
	size_t i;

	/* Done with sbrk */
	mutex_unlock(&sb_lock);

	/* Done with dead list */
	mutex_unlock(&d_lock);

	/* Done with hazard list */
	mutex_unlock(&h_lock);

	/* Done with slab */
	for (i = 0; i < cpu_total; i++)
	{
		mutex_unlock(&pc_slab[i].m);
	}
}

static void child_fork(void)
{
	size_t i;

	/* Clean up sb_lock in child */
	mutex_init(&sb_lock);

	/* Clean up d_lock in child */
	mutex_init(&d_lock);

	/* Clean up h_lock in child */
	mutex_init(&h_lock);

	/* Clean up slab locks in child */
	for (i = 0; i < cpu_total; i++)
	{
		mutex_unlock(&pc_slab[i].m);
	}

	/*
	 * Wipe hazard list as the other threads no longer exist
	 * This leaks memory, but we can't help it,
	 * as the other threads may be concurrently modifying internal
	 * data structures now.
	 */
	dlist_init(&h_list);

	/* We are the only member */
	dlist_add(&h_list, &tls->h_list);
}

/*
 * Initialize things.
 * Unfortunately, we don't have a failure return value so we must die instead.
 */
static void init_handler(void)
{
	int res;

	void *v;

	/* Init sbrk information */
	v = sbrk(0);
	sbrk_start = (uintptr_t) v;

	/* Add a fork handler */
	if (pthread_atfork)
	{
		res = pthread_atfork(prepare_fork, parent_fork, child_fork);
		if (res) errx(1, "pthread_atfork failed\n");
	}

	/* Create thread death key */
	if (pthread_key_create)
	{
		res = pthread_key_create(&death_key, destroy_tls);
		if (res) errx(1, "pthread_key_create() failed\n");
	}

	if (init_sldata())
	{
		errx(1, "Failed to allocate enough memory to initialize slab\n");
	}

#ifdef DEBUG_LEAK
	atexit(test_leak_aux);
#endif
}

static atls *init_tls(void)
{
	DECL_PROF_FUNC;

	atls *tl;

	/* Can we use a dead thread's tls data? */
	if (d_list)
	{
		mutex_lock(&d_lock);

		if (d_list)
		{
			/* Grab from death list */
			tl = d_list;
			d_list = tl->d_list;
			mutex_unlock(&d_lock);

			set_tls(tl);

			/* Init my thread destructor */
			if (pthread_setspecific) pthread_setspecific(death_key, tl);

			test_all(tl);

			/* Done */
			return tl;
		}
		mutex_unlock(&d_lock);
	}

	/* Hack - use a full page for it */
	tl = mmap(NULL, PAGESIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

	/* Out of memory */
	if (tl == MAP_FAILED) goto nomem;

	/* Randomly cache colour the tls data */
	tl = rnd_offset(tl, PAGESIZE, sizeof(atls));

	/* Save pointer for later memory calls */
	set_tls(tl);

	/* Make sure that we can always allocate two btree nodes from within the atls itself */
	BUILD_ASSERT(NUM_QS * 8 + 16 >= sizeof(btree) * 2);

	/* Make sure atls isn't too big */
	BUILD_ASSERT(sizeof(atls) <= PAGESIZE);

	/* Make sure btree nodes fit in the slab */
	BUILD_ASSERT(sizeof(btree) <= SB_MAX);

	/* Hack - we should use the rest of the space to init the heap... */
	tl = init_atls(tl);
	if (!tl) goto nomem;

	/*
	 * Init handler.
	 * Note that this can allocate memory, so needs to be done last
	 */
	if (pthread_once)
	{
		pthread_once(&init_once, init_handler);
	}
	else
	{
		/* Since there are no threads... */
		if (!sbrk_start) init_handler();
	}

	/* Init my thread destructor */
	if (pthread_setspecific) pthread_setspecific(death_key, tl);

	test_all(tl);

	return tl;

nomem:
	set_enomem();
	return NULL;
}
#else /* WINDOWS */


#ifdef USE_DLL

typedef struct patch patch;
struct patch
{
	const char *name;
	void *func;
};


#define PATCH_FUNC(X)\
	{#X, (void *) ((uintptr_t) llalloc##X)}

static patch patch_list[] =
{
	PATCH_FUNC(free),
	PATCH_FUNC(malloc),
	PATCH_FUNC(calloc),
	PATCH_FUNC(realloc),
	PATCH_FUNC(_msize),
	PATCH_FUNC(_expand),
	PATCH_FUNC(_free_nolock),
	PATCH_FUNC(_realloc_nolock),
	PATCH_FUNC(_calloc_nolock),
	PATCH_FUNC(_msize_nolock),
	{NULL, NULL}
};

#define JMP_OP	0xE9
#define CALL_OP	0xE8
#define JMP_OFFSET(P1, P2)	(((uintptr_t)(P1)) - ((uintptr_t)(P2)) - 5)

/* Follow a call to its target */
static void *follow_call(void *p)
{
	int target;

	/* Are we pointing to a jump? */
	if ((*(unsigned char *)p) != CALL_OP) return NULL;

	target = *(int *) shift(p, 1);

	return (void *) shift(p, (uintptr_t) target + 5);
}

/* Find a jump in a dumb manner */
static void *find_call(void *p)
{
	while ((*(unsigned char *) p) != CALL_OP)
	{
		p = shift(p, 1);
	}

	return p;
}

static void patch_function(void *func, void *my_func)
{
	MEMORY_BASIC_INFORMATION mbi;

	/* Make code read/write */
	VirtualQuery(func, &mbi, sizeof(mbi));
	VirtualProtect(mbi.BaseAddress, mbi.RegionSize,
						PAGE_EXECUTE_READWRITE, &mbi.Protect);

	/* Patch in a jmp to our routine */
	*(unsigned char *) func = JMP_OP;
	*(unsigned *) shift(func, 1) = JMP_OFFSET(my_func, func);

	/* Reset code permissions */
	VirtualProtect(mbi.BaseAddress, mbi.RegionSize, mbi.Protect, &mbi.Protect);
}

static void *init_crt_funcs(void)
{
	FARPROC func_f;
	patch *p;
	void *f;

	HMODULE library = GetModuleHandle("MSVCR90.DLL");
	if (!library) return NULL;

	func_f = GetProcAddress(library, "_callnewh");
	if (!func_f) return NULL;
	__callnewh = (typeof(__callnewh)) func_f;

	func_f = GetProcAddress(library, "?_query_new_mode@@YAHXZ");
	if (!func_f) return NULL;
	__newmode = (typeof(__newmode)) func_f;

	for (p = patch_list; p->name; p++)
	{
		func_f = GetProcAddress(library, p->name);
		if (!func_f) continue;

		patch_function((void *) (uintptr_t) func_f, p->func);
	}

	func_f = GetProcAddress(library, "calloc");
	f = (void *) (uintptr_t) func_f;

	/* Not here... don't crash */
	if (!f) goto out;

	/* Get pointer to _calloc_impl() */
	f = find_call(f);
	f = follow_call(f);

	/* Finally patch _calloc_impl */
	patch_function(f, (void *) (uintptr_t) llalloc_calloc_impl);

out:

	/* Success */
	return (void*) 1;
}


#endif

void lldebug_hook(void);
static atls *init_tls(void)
{
	DECL_PROF_FUNC;

	atls *tl;

	static void *init = (void *) 1;
	void *first = xchg_ptr(&init, NULL);

	if (!first)
	{
		/* We've already died - use free_nomem() */
		if (get_tls() == (atls *) 1) return NULL;

		/* Can we use a dead thread's tls data? */
		if (d_list)
		{
			mutex_lock(&d_lock);

			if (d_list)
			{
				/* Grab from death list */
				tl = d_list;
				d_list = tl->d_list;
				mutex_unlock(&d_lock);

				set_tls(tl);

				test_all(tl);

				/* Undocumented crt function */
				__tlregdtor(destroy_tls);

				/* Done */
				return tl;
			}
			mutex_unlock(&d_lock);
		}
	}
	else
	{
		void *v = sbrk(0);

		/* Init slab */
		if (init_sldata())
		{
			init = (void *) 1;
			return NULL;
		}

		tls_index = TlsAlloc();
		if (tls_index == TLS_OUT_OF_INDEXES)
		{
			/* We can't handle this */
			init = (void *) 1;
			return NULL;
		}

#ifdef USE_DLL
		/* Initialize function pointers */
		if (!init_crt_funcs())
		{
			/* Doesn't work... fail and bail out of dll init */
			init = (void *) 1;
			return NULL;
		}
#endif
		/* Init sbrk information */
		sbrk_start = (uintptr_t) v;
	}

	/* Hack - use a full page for it */
	tl = VirtualAlloc(NULL, PAGESIZE, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);

	/* Out of memory */
	if (!tl) goto nomem;

	/* Randomly cache colour the tls data */
	tl = rnd_offset(tl, PAGESIZE, sizeof(atls));

	/* Save pointer for later memory calls */
	set_tls(tl);

	/* Make sure that we can always allocate two btree nodes from within itself */
	//BUILD_BUG(NUM_QS * 8 + 16 >= sizeof(btree) * 2);

	/* Hack - we should use the rest of the space to init the heap... */
	tl = init_atls(tl);
	if (!tl) goto nomem;

	/* Undocumented crt function */
	__tlregdtor(destroy_tls);

	test_all(tl);

	return tl;

nomem:
	/* Try again if possible */
	if (handle_oom(PAGESIZE * 2)) return init_tls();

	return NULL;
}

#ifdef USE_DLL
BOOL DllMain(HINSTANCE h, DWORD reason, LPVOID reserved);
BOOL DllMain(HINSTANCE h, DWORD reason, LPVOID reserved)
{
	/* Silence compiler warnings */
	(void) h;
	(void) reserved;

	/* Init the memory allocator */
	if ((reason == DLL_PROCESS_ATTACH) || (reason == DLL_THREAD_ATTACH))
	{
		if (!init_tls()) return 0;
	}
#ifdef DEBUG_PROFILE
	else if(reason == DLL_PROCESS_DETACH)
	{
		ll_print_prof();
	}
#endif

	return 1;
}
#endif /* USE_DLL */

#endif /* WINDOWS */

#ifdef DEBUG_ALLOC_SLOW

/* Get node previous to loc in b */
static void test_btree_linked(btree *b, int loc)
{
	int i, j = 0;

	if (b_leaf(b)) errx(1, "No previous!\n");

	for (i = b_start(b); i != loc; i = b_next(b, i))
	{
		j++;

		if (j > BT_MAX) errx(1, "Btree node loop!\n");
	}
}

static void test_in_btree(atls *tl, btree *b)
{
	btree *bp;

	if (!b_leaf(b)) errx(1, "Unused btree object that is not a leaf\n");

	while (b->parent)
	{
		bp = b->parent;

		if (b_ptr(bp, b_pindex(b)) != b) errx(1, "Parent doesn't own %p\n", (void *) b);
		if (!bp->bsize[b_pindex(b)]) errx(1, "Parent link broken\n");

		test_btree_linked(bp, b_pindex(b));

		b = bp;
	}

	if (&tl->bheap != b) errx(1, "Heap doesn't own %p\n", (void *) b);
}

#ifdef UNUSED_FUNC
static int is_fast_node(atls *tl, btree *b)
{
	size_t bin = size2fl(b->s.size);
	slist *f;

	scan_slist(&tl->fl[bin], f)
	{
		/* Found it? */
		if (f == &b->list) return 1;
	}

	/* Didn't find it */
	return 0;
}
#endif /* UNUSED_FUNC */

static void test_blocks(atls *tl)
{
	mealloc *m;
	dlist *d;

	btree *b;

	size_t size;

	if (tl->bl.next->prev != &tl->bl) errx(1, "Block list corrupt\n");

	/* Scan blocks */
	scan_list(&tl->bl, d)
	{
		m = list_entry(mealloc, m_list, d);

		if (d->next->prev != d) errx(1, "Block list corrupt\n");

		/* Scan seps for this block */
		for (b = &m->b;; b = shift(b, size))
		{
			if (b < &m->b) errx(1, "Node before block start!\n");

			if (b->s.bs_offset & FLG_SIZE8)
			{
				size = 16;
			}
			else
			{
				size = b->s.size;

				if (!size) break;

				if (shift(b, -(uintptr_t)(b->s.bs_offset & ~15)) != m) errx(1, "Block back link broken\n");

				if (read_bs(b) != m) errx(1, "Block start corrupted\n");
			}

			check_sep(b);

			if ((size > QS_MAX) && un_used(b)) test_in_btree(tl, b);
		}
	}
}
#else
#define test_blocks(T) ((void) sizeof(T))
#endif

/* Medium allocations */

#ifdef DEBUG_ALLOC_SLOW

static unsigned test_btree_aux(atls *tl, btree *b, unsigned lsize)
{
	btree *bn;
	int n = b_start(b);
	int i = 0;
	unsigned ssize = lsize;

	unsigned short msk = -1;

	/* Size of node can be incorrect if splitting not possible */
	if (n && b->parent && !is_slab(b) && (b->s.size < sizeof(btree)))
	{
		errx(1, "Btree nodesize wrong\n");
	}

	while (n)
	{
		bn = b_ptr(b, n);

		i++;

		if (bn->parent != b) errx(1, "Btree parent incorrect\n");
		if (b_pindex(bn) != n) errx(1, "Btree p_index incorrect\n");

		if (b_mask(b) & (1 << (n - 1))) errx(1, "Used btree node marked free\n");
		msk -= (1 << (n - 1));

		/* Scan lower */
		ssize = test_btree_aux(tl, bn, ssize);

		if (b->bsize[n] < ssize) errx(1, "Btree size misordered\n");
		ssize = b->bsize[n] & ~0xff;

		if (b_leaf(bn))
		{
			if (bn->s.size*16 != ssize) errx(1, "Btree leaf size wrong\n");

			if (!un_used(bn)) errx(1, "Btree leaf marked used!\n");
		}
		else if (!is_slab(bn) && un_used(bn)) errx(1, "Btree node marked unused!\n");

		if (b_prev(b, b_next(b, n)) != n) errx(1, "prev link broken\n");

		n = b_next(b, n);

		if (i > BT_MAX) errx(1, "Btree node loop!\n");
	}

	/* Leaf node? */
	if (!i) return ssize;

	if (msk != b_mask(b)) errx(1, "Btree free mask mismatch\n");

	if (b->parent && (i <= 3)) errx(1, "Btree has too few children\n");

	return ssize & ~0xff;
}

static void test_btree(atls *tl)
{
	if ((tl->b_hgt > 100) || (tl->b_cnt > 100)) errx(1, "btree height corrupt\n");

	test_btree_aux(tl, &tl->bheap, 0);
}
#else
#define test_btree(T) ((void) sizeof(T))
#endif
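
/*
 * btree_count() below returns the number of occupied child slots: b_mask()
 * keeps one bit per *free* slot, so the popcount of the mask is subtracted
 * from the 16 available slots.
 */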

static char btree_count(btree *b)
{
	int x = b_mask(b);

	/* See Wikipedia for this algorithm for popcount */
	int m1 = 0x5555;
	int m2 = 0x3333;
	int m4 = 0x0f0f;

	/* Put counts into pairs of bits */
	x -= (x >> 1) & m1;

	/* 4 bit counts */
	x = (x & m2) + ((x >> 2) & m2);

	/* Make 8bit counts */
	x = (x + (x >> 4)) & m4;

	return 16 - (x + (x >> 8));
}

static inline int btree_alloc(btree *b)
{
	int loc = ffsu(b_mask(b));

	b_mask(b) &= ~(1 << loc);

	return loc + 1;
}

static inline void btree_free(btree *b, int loc)
{
	b_mask(b) |= 1 << (loc - 1);
}

static int b_leaf(btree *b)
{
	/* Check to see if there are no children */
	return !b_start(b);
}

static inline unsigned btree_ssize(btree *b, int loc)
{
	return b->bsize[loc] & ~0xff;
}

static inline void btree_update_daughter(btree *bp, btree *b, int loc)
{
	b_ptr(bp, loc) = b;
	b->parent = bp;
	b_pindex(b) = loc;
}


/* Update my parent with my new size */
static void btree_update_psize(btree *b, unsigned ssize)
{
	int bpi = b_pindex(b);
	btree *bp;

	/* Update parent size value */
	for (bp = b->parent; bp; bp = bp->parent)
	{
		bp->bsize[bpi] &= 0xff;
		bp->bsize[bpi] += ssize;

		/* Are we done with the chain of updates? */
		if (b_next(bp, bpi)) break;

		bpi = b_pindex(bp);
	}
}

/* Forward declare */
static void btree_node_del(atls *tl, btree *b, int loc);

static void btree_merge_aux(atls *tl, btree *bl, btree *br)
{
	int i, j, k;

	int ip, pi;

	int next;

	unsigned ssize;

	btree *bp;

	int bcl, bcr;

#ifdef DEBUG_ALLOC_SLOW	
	if (!bl->parent || !br->parent) errx(1, "Trying to merge heap top\n");
	if (bl->parent != br->parent) errx(1, "Trying to merge two nodes with different parents\n");
#endif

	bcl = btree_count(bl);
	bcr = btree_count(br);

	/* Move some from neighbour to me */
	if (bcr + bcl > BT_MAX)
	{
		if (bcr > bcl)
		{
			/* Silence compiler warning */
			next = 0;

			/* Move some nodes from br to bl */
			ip = b_last(bl);

			for (j = bcr / 3, k = b_start(br); j; k = next, j--)
			{
				next = b_next(br, k);
				i = btree_alloc(bl);

				/* Add in new node */
				bl->bsize[i] = br->bsize[k];
				b_next(bl, ip) = i;
				b_prev(bl, i) = ip;
				btree_update_daughter(bl, b_ptr(br, k), i);
				ip = i;

				/* Remove old node */
				btree_free(br, k);
			}
			b_next(bl, ip) = 0;
			ssize = bl->bsize[ip];
			b_last(bl) = ip;

			b_start(br) = next;
			b_prev(br, next) = 0;

			/* Notify parent of my new size */
			btree_update_psize(bl, ssize);

			return;
		}

		/* Scan 2/3rds of the way through bl */
		for (j = bcl / 3, ip = b_last(bl); j; ip = b_prev(bl, ip), j--);

		k = b_start(br);
		ssize = btree_ssize(bl, ip);
		j = b_next(bl, ip);
		b_next(bl, ip) = 0;
		b_last(bl) = ip;

		/* Copy remainder to br, deleting as we go */
		for (ip = 0; j; j = next)
		{
			next = b_next(bl, j);
			i = btree_alloc(br);

			/* Add in new node */
			br->bsize[i] = bl->bsize[j];
			b_next(br, ip) = i;
			b_prev(br, i) = ip;
			ip = i;
			btree_update_daughter(br, b_ptr(bl, j), i);

			/* Remove old node */
			btree_free(bl, j);
		}

		/* link to remainder of nodes in br */
		b_next(br, ip) = k;
		b_prev(br, k) = ip;

		/* Notify parent of my new size */
		btree_update_psize(bl, ssize);

		return;
	}

	/* merge bl into br and delete bl */
	ip = 0;
	k = b_start(br);
	for (j = b_start(bl); j; j = b_next(bl, j))
	{
		i = btree_alloc(br);

		/* Add in new node */
		br->bsize[i] = bl->bsize[j];
		b_next(br, ip) = i;
		b_prev(br, i) = ip;
		ip = i;

		btree_update_daughter(br, b_ptr(bl, j), i);
	}

#ifdef DEBUG_ALLOC_SLOW
	if (!ip) errx(1, "Empty left node?\n");
#endif

	b_next(br, ip) = k;
	b_prev(br, k) = ip;

	/* Save these so we can delete */
	bp = bl->parent;
	pi = b_pindex(bl);

	/* Delete this node when done */
	local_free(tl, &bl->data);

	/* Delete bl */
	btree_node_del(tl, bp, pi);

	/* Tail recursion */
}

static noinline void btree_node_del_aux(atls *tl, btree *b, btree *bp)
{
	size_t prev, next;

	int pi = b_pindex(b);

	int i;

#ifdef DEBUG_ALLOC_SLOW
	if (!pi) errx(1, "Corrupted leaf\n");
#endif

	/* Rebalance if possible */
	next = b_next(bp, pi);
	if (next)
	{
		/* Merge with next node */
		btree_merge_aux(tl, b, b_ptr(bp, next));

		return;
	}

	prev = b_prev(bp, pi);
	if (prev)
	{
		/* Merge with previous node */
		btree_merge_aux(tl, b_ptr(bp, prev), b);

		return;
	}

	/* Just me here? */
#ifdef DEBUG_ALLOC_SLOW
	if (bp != &tl->bheap) errx(1, "Invalid node count\n");
#endif

	/* Move my data to the top of the btree */
	b_start(bp) = b_start(b);
	b_last(bp) = b_last(b);
	b_mask(bp) = b_mask(b);

	/* Init alloced list */
	for (i = b_start(b); i; i = b_next(b, i))
	{
		bp->bsize[i] = b->bsize[i];
		bp->prev[i] = b->prev[i];
		btree_update_daughter(bp, b_ptr(b, i), i);
	}

	/* Btree is shorter */
	tl->b_hgt--;

	/* Delete this node when done */
	local_free(tl, &b->data);

	/* Prevent having too many spare nodes which can cause fragmentation */
	if (tl->b_hgt < tl->b_cnt)
	{
		/* Pop off the extra node */
		void *st = slist_rem(&tl->btree_freenode);
		b = list_entry(btree, data, st);

		/* Delete it */
		local_free(tl, &b->data);
		tl->b_cnt--;
	}
}

/* Delete node at location loc */
static void btree_node_del(atls *tl, btree *b, int loc)
{
	size_t prev = b_prev(b, loc);
	size_t next = b_next(b, loc);

	btree *bp = b->parent;

	b_next(b, prev) = next;
	b_prev(b, next) = prev;

	/* Add to free list */
	btree_free(b, loc);

	/* If top - am done */
	if (!bp) return;

	/* Was last? */
	if (!next)
	{
		/* Update parent size (we know there must be at least one other node) */
		btree_update_psize(b, btree_ssize(b, prev));
	}

	/* Still not empty enough? Equivalent to btree_count(b) > 3, but faster than a popcount */
	if (b_next(b, b_next(b, b_next(b, b_start(b))))) return;

	btree_node_del_aux(tl, b, bp);
}
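
/*
 * btree_search() below does a best-fit descent.  Children within a node are
 * kept sorted by size, and the upper bits of bsize[i] cache the largest size
 * reachable through child i, so skipping entries smaller than ssize and
 * descending into the first large-enough child ends on the leftmost leaf of
 * at least the requested size (or back at the heap top if none fits).
 */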

static inline btree *btree_search(atls *tl, unsigned ssize)
{
	btree *b = &tl->bheap;
	size_t i = b_start(b);

	while (i)
	{
		/* Scan level below? */
		if (b->bsize[i] < ssize)
		{
			i = b_next(b, i);
		}
		else
		{
			b = b_ptr(b, i);
			i = b_start(b);
		}
	}

	return b;
}

/* Return node of size ssize if possible */
static btree *btree_remove(atls *tl, unsigned ssize)
{
	btree *b = btree_search(tl, ssize);

	/* Nothing? */
	if (b == &tl->bheap) return NULL;

	/* Disconnect it */
	btree_node_del(tl, b->parent, b_pindex(b));

	return b;
}

/* Find space for node of size ssize */
static btree *btree_find(atls *tl, unsigned ssize, int *ipv)
{
	btree *b = btree_search(tl, ssize);
	btree *bp = &tl->bheap;

	if (b != bp)
	{
		bp = b->parent;
		*ipv = b_prev(bp, b_pindex(b));
		return bp;
	}

	/* Nothing in btree? */
	if (b_leaf(b)) return bp;

	/* We are larger than anything */
	do
	{
		/* Scan level below */
		b = b_ptr(b, (int) b_last(b));
	}
	while (!b_leaf(b));

	*ipv = b_pindex(b);

	return b->parent;
}

/* Cleanup - make sure we have enough temp nodes */
static noinline void btree_cleanup(atls *tl)
{
	/* First try to use slab allocations to prevent fragmentation */
	while (tl->b_hgt > tl->b_cnt)
	{
		slist *s = slab_alloc_safe(tl, sizeof(btree) - SEPSIZE);

		/* Fall back to in-btree allocations */
		if (!s) goto use_btree;

		slist_add(&tl->btree_freenode, s);
		tl->b_cnt++;
	}

	return;

use_btree:

	/* In-btree allocation by manual memory manipulation */
	while (tl->b_hgt > tl->b_cnt)
	{
		size_t num, msize;

		unsigned i;

		btree *br;

		unsigned offset;

		/* Get smallest allocation in btree */
		btree *b = btree_remove(tl, 0);

		msize = b->s.size;

		/* How many nodes can fit? */
		num = msize / sizeof(btree);

		/* As many as required */
		if (num > tl->b_hgt - tl->b_cnt) num = tl->b_hgt - tl->b_cnt;

		/* Prevent recursion by always adding at least one node */
		if (num < 1) num = 1;

		/* We are using this */
		set_used(b, msize);
		offset = b->s.bs_offset & ~15;

		for (i = 0; i < num; i++)
		{
			br = shift(b, sizeof(btree));
			b->s.size = sizeof(btree);
			offset += sizeof(btree);
			if (i != num - 1) br->s.bs_offset = offset;
			msize -= sizeof(btree);

			slist_add(&tl->btree_freenode, &b->list);
			b = br;
		}

		tl->b_cnt += num;

		/* Any room left? */
		if (!msize) continue;

		/* Free remaining fragment */
		b->s.size = msize;
		b->s.bs_offset = offset;
		fast_free(tl, b, msize);
	}
}

static void btree_node_insert(atls *tl, btree *b, int loc, unsigned ssize, btree *node);

/* Split myself */
static noinline void btree_node_insert_aux(atls *tl, btree *b, int loc, unsigned ssize, btree *node)
{
	btree *tmp, *tmp2, *bp;

	unsigned bsize = 0, tsize;

	int i, j, bn;

	void *st;

	size_t new;
	int inserted = 0;
	size_t next;

	/* New node */
	st = slist_rem(&tl->btree_freenode);
	tmp = list_entry(btree, data, st);
	tl->b_cnt--;

	/* Clear it */
	memset(&tmp->data, 0, offsetof(btree, prev) - SEPSIZE);

#if 0
	/* Hack - get daughter testing to work */
	tmp->parent = (btree *) 1;
#endif

	/* Special case - insert at start? */
	if (!loc)
	{
		/* Insert at the beginning */
		tmp->bsize[1] = ssize + 2;
		tmp->prev[1] = 0;
		btree_update_daughter(tmp, node, 1);
		inserted = 1;

		/* Copy things below median here */
		for (i = 2, j = b_start(b); i <= BT_MAX/2; i++, j = bn)
		{
			bn = b_next(b, j);

			tmp->bsize[i] = btree_ssize(b, j) + i + 1;
			tmp->prev[i] = i - 1;
			btree_update_daughter(tmp, b_ptr(b, j), i);

			btree_free(b, j);
		}
	}
	else
	{
		/* Copy things below median here */
		for (i = 1, j = b_start(b); i <= BT_MAX/2; i++, j = bn)
		{
			bn = b_next(b, j);

			tmp->bsize[i] = btree_ssize(b, j) + i + 1;
			tmp->prev[i] = i - 1;
			btree_update_daughter(tmp, b_ptr(b, j), i);

			btree_free(b, j);

			/* Need to insert new node? */
			if (j == loc)
			{
				i++;
				tmp->bsize[i] = ssize + i + 1;
				tmp->prev[i] = i - 1;
				btree_update_daughter(tmp, node, i);
				inserted = 1;
			}
		}
	}
	b_start(b) = j;
	b_prev(b, j) = 0;

	/* Finish initialization of new node */
	b_start(tmp) = 1;
	b_last(tmp) = i - 1;
	b_next(tmp, i - 1) = 0;
	tsize = tmp->bsize[i - 1];
	b_mask(tmp) = -(1 << (i - 1));

	/* Need to insert in remainder? */
	if (!inserted)
	{
		next = b_next(b, loc);

		/* We have space - add it */
		new = btree_alloc(b);

		b->bsize[new] = ssize + next;
		b_prev(b, next) = new;
		b_next(b, loc) = new;
		b_prev(b, new) = loc;

		/* Am I last?  Need to update parents */
		if (!next) btree_update_psize(b, ssize);

		btree_update_daughter(b, node, new);
	}

	bp = b->parent;
	if (bp)
	{
		/* Get node previous to myself above */
		size_t ip = b_prev(bp, b_pindex(b));

		/* Easy - just insert into the parent, tail recurse */
		btree_node_insert(tl, bp, ip, tsize, tmp);

		return;
	}

	/* I'm the top node */

	/* New node */
	st = slist_rem(&tl->btree_freenode);
	tmp2 = list_entry(btree, data, st);
	tl->b_cnt--;

	/* btree is taller */
	tl->b_hgt++;

	/* Copy b into this - shouldn't need this, use allocated root instead */
	memcpy(&tmp2->data, &b->data, sizeof(btree) - SEPSIZE);

	for (i = b_start(b); i; i = b_next(b, i))
	{
		b_ptr(b, i)->parent = tmp2;
		bsize = b->bsize[i];
	}

	/* Init b */
	b->bsize[1] = tsize + 2;
	b->bsize[2] = bsize & ~0xff;
	b_ptr(b, 1) = tmp;
	b_ptr(b, 2) = tmp2;
	b_prev(b, 0) = 2;
	b_prev(b, 1) = 0;
	b_prev(b, 2) = 1;
	b_start(b) = 1;
	b_mask(b) = -4;
	b->parent = NULL;

	/* Make links */
	tmp->parent = b;
	tmp2->parent = b;
	b_pindex(tmp) = 1;
	b_pindex(tmp2) = 2;
}

static void btree_node_insert(atls *tl, btree *b, int loc, unsigned ssize, btree *node)
{
	size_t new;
	size_t next;

#ifdef DEBUG_ALLOC_SLOW
	if (ssize & 0xff) errx(1, "ssize not clean\n");
#endif

	if (!b_mask(b))
	{
		btree_node_insert_aux(tl, b, loc, ssize, node);

		return;
	}

	/* We have space - add it */
	new = btree_alloc(b);

	next = b_next(b, loc);
	b->bsize[new] = ssize + next;
	b_prev(b, next) = new;
	b_next(b, loc) = new;
	b_prev(b, new) = loc;

	/* Am I last?  Need to update parents */
	if (!next) btree_update_psize(b, ssize);

	btree_update_daughter(b, node, new);
}

static void btree_insert(atls *tl, btree *n, size_t size)
{
	int ip = 0;

	/* Convert to internal size (upper 24bits of 32bit bsize) */
	unsigned ssize = size * 16;

	/* First find where to put it, splitting to make room */
	btree *b = btree_find(tl, ssize, &ip);

#ifdef DEBUG_ALLOC_SLOW
	if (!un_used(n)) errx(1, "inserting a used node\n");
	if (size != n->s.size) errx(1, "size mismatch\n");
#endif

	/* Make a leaf node */
	//b_start(n) = 0;

	/* Hack - do the above more efficiently */
	n->bsize[0] = 0;

	/* Insert it */
	btree_node_insert(tl, b, ip, ssize, n);

	btree_cleanup(tl);
}

static noinline btree *btree_get(atls *tl, unsigned size)
{
	DECL_PROF_FUNC;

	unsigned ssize = size * 16;
	btree *b;

	b = btree_remove(tl, ssize);

	if (b)
	{
		/* Do not try to merge with me - I'm already taken! */
		set_used(b, b->s.size);
	}

	return b;
}

/* Simple O(n log n) merge - avoids recursion */
static void btree_merge(atls *tl1, atls *tl2)
{
	btree *b;

	slist *s, *sn;

	while ((b = btree_remove(tl2, 0)))
	{
		btree_insert(tl1, b, b->s.size);
	}

	/* Update allocated size */
	tl1->a_alloced += tl2->a_alloced;

	/* Free the old extra btree nodes */
	scan_slist_safe(&tl2->btree_freenode, s, sn)
	{
		b = list_entry(btree, data, s);

		/* Delete it */
		local_free(tl1, &b->data);
	}

	tl2->b_cnt = 0;
}

/* Count number of nodes + leaves in the btree recursively */
static int count_btree(btree *b)
{
	int i, count = 1;

	for (i = b_start(b); i; i = b_next(b, i))
	{
		count += count_btree(b_ptr(b, i));
	}

	return count;
}

static int count_btree_space(btree *b)
{
	int i, count = 0;

	if (b_leaf(b)) return b->s.size - PTRSIZE;

	for (i = b_start(b); i; i = b_next(b, i))
	{
		count += count_btree_space(b_ptr(b, i));
	}

	return count;
}


#ifdef DEBUG_ALLOC_SLOW

/* Check double list constraints */
static void test_double_lists(atls *tl)
{
	btree *b;
	dlist *d, *dn;

	unsigned i;

	for (i = 1; i < NUM_QS; i++)
	{
		if (tl->qs[i].next->prev != &tl->qs[i]) errx(1, "First double link broken\n");

		scan_list_safe(&tl->qs[i], d, dn)
		{
			b = list_entry(btree, list, d);
			check_sep(b);

			if (!un_used(b)) errx(1, "False used\n");
			if (b->s.size != (i+1)*16) errx(1, "Wrong sized double link\n");
			if (b->s.bs_offset & FLG_SIZE8) errx(1, "flag size wrong\n");

			if (dn->prev != d) errx(1, "Back-link broken\n");
		}
	}

	if (tl->q_mask & (1ULL << 63)) errx(1, "double list last bit set\n");
}

#else
#define test_double_lists(T) ((void) sizeof(T))
#endif


#ifdef DEBUG_ALLOC_SLOW
/* Test small list constraints */
static void test_small_list(atls *tl)
{
	btree *b, *bn;

	btree *q8 = get_q8(tl);

	if (small_prev(small_next(q8)) != q8) errx(1, "First link broken\n");

	for (b = small_next(q8); b != q8; b = bn)
	{
		check_sep(b);
		bn = small_next(b);

		if (!(b->s.bs_offset & FLG_SIZE8)) errx(1, "Wrong sized small link\n");
		if (!un_used(b)) errx(1, "False used\n");
		if (small_prev(bn) != b) errx(1, "Back-link broken\n");
	}
}
#else
#define test_small_list(T) ((void) sizeof(T))
#endif

/* Add to end of small list */
static void small_insert(atls *tl, btree *b)
{
	btree *q8 = get_q8(tl);
	btree *qp;

	/* Set flag */
	set_size8(b);

	qp = small_prev(q8);

	set_small_prev(b, qp);
	set_small_next(qp, b);

	set_small_next(b, q8);
	set_small_prev(q8, b);
}

static void small_remove(btree *b)
{
	btree *qn = small_next(b);
	btree *qp = small_prev(b);

	set_small_next(qp, qn);
	set_small_prev(qn, qp);

	/* Clear flag */
	unset_size8(b);
}

static btree *small_del_first(atls *tl)
{
	btree *q8 = get_q8(tl);
	btree *b = small_next(q8);
	btree *qn = small_next(b);

	/* List is empty */
	if (b == q8) return NULL;

	/* Dequeue b */
	set_small_next(q8, qn);
	set_small_prev(qn, q8);

	/* Clear flag */
	unset_size8(b);

	return b;
}

static void small_merge(atls *tl1, atls *tl2)
{
	btree *q81 = get_q8(tl1);
	btree *q82 = get_q8(tl2);

	btree *q1p = small_prev(q81);
	btree *q2n = small_next(q82);
	btree *q2p = small_prev(q82);

	/* Don't need to do anything if adding an empty list */
	if (q2n == q82) return;

	set_small_next(q1p, q2n);
	set_small_prev(q2n, q1p);

	set_small_prev(q81, q2p);
	set_small_next(q2p, q81);
}

/* Slab implementation */

static sbheader *slab_start(void *p)
{
	return (sbheader *) (-SLABSIZE & (uintptr_t) p);
}
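
/*
 * Editor's note (sketch): slab_start() depends on SLABSIZE being a power of
 * two, so that -SLABSIZE == ~(SLABSIZE - 1) and the mask simply clears the
 * low bits of any pointer into the slab.  A minimal stand-alone
 * illustration with a made-up interior pointer:
 */
#if 0
static void slab_start_example(void)
{
	/* Hypothetical object at offset 0x1230 inside the slab at 0x40000 */
	void *p = (void *) (uintptr_t) 0x41230;

	/* Recover the slab header by masking off the low 17 bits */
	sbheader *sb = slab_start(p);		/* == (sbheader *) 0x40000 */

	(void) sb;
}
#endif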

#ifdef DEBUG_ALLOC_SLOW

static void test_slab(atls *tl)
{
	int i;

	dlist *d, *dn;

	for (i = 0; i < NUM_SB; i++)
	{
		scan_list_safe(&tl->slab[i], d, dn)
		{
			if (dn->prev != d) errx(1, "Back-link broken\n");
		}
	}

	scan_list_safe(&tl->slab_full, d, dn)
	{
		if (dn->prev != d) errx(1, "Back-link broken\n");
	}
}
#else
#define test_slab(T) ((void) sizeof(T))
#endif


static freesb *slab_alloc_chunk(atls *tl)
{
	DECL_PROF_FUNC;

	freesb *fsb;

	size_t alloc_amount = SLABSIZE * (SLABBMAX + 1);
	size_t sbrk_end;

	unsigned i;
	unsigned alloced = SLABBMAX;

	/* Handle oom more efficiently */
	if (sbrk_oom) return NULL;

	/* Make sure percpu value isn't too big */
	if (tl->percpu_hash > cpu_total)
	{
		tl->percpu_hash %= cpu_total;
	}

	/* Find an unlocked list with something in it */
	for (i = tl->percpu_hash; i < cpu_total; i++)
	{
		if (pc_slab[i].list && !mutex_trylock(&pc_slab[i].m))
		{
			if (pc_slab[i].list)
			{
				fsb = pc_slab[i].list;
				pc_slab[i].list = fsb->next;

				mutex_unlock(&pc_slab[i].m);
#ifdef WINDOWS
				/* Reallow use of pages */
				for (i = 0; i < fsb->count; i++)
				{
					VirtualAlloc(fsb->blocks[i], SLABSIZE, MEM_COMMIT, PAGE_READWRITE);
				}
#endif

				return fsb;
			}

			mutex_unlock(&pc_slab[i].m);
		}
	}

	for (i = 0; i < tl->percpu_hash; i++)
	{
		if (pc_slab[i].list && !mutex_trylock(&pc_slab[i].m))
		{
			if (pc_slab[i].list)
			{
				fsb = pc_slab[i].list;
				pc_slab[i].list = fsb->next;

				mutex_unlock(&pc_slab[i].m);
#ifdef WINDOWS
				/* Reallow use of pages */
				for (i = 0; i < fsb->count; i++)
				{
					VirtualAlloc(fsb->blocks[i], SLABSIZE, MEM_COMMIT, PAGE_READWRITE);
				}
#endif

				return fsb;
			}

			mutex_unlock(&pc_slab[i].m);
		}
	}

	mutex_lock(&sb_lock);

	sbrk_end = sbrk_start + sbrk_size;

	/* Try to realign with SLABSIZE boundaries */
	if (sbrk_end & (SLABSIZE - 1))
	{
		alloc_amount += SLABSIZE - (sbrk_end & (SLABSIZE - 1));
	}

	fsb = sbrk(alloc_amount);

	if (fsb == MAP_FAILED)
	{
		/* Too much to allocate - fall back on mmap */
		sbrk_oom = 1;
		mutex_unlock(&sb_lock);

		return NULL;
	}

	/* Update sbrk information */
	sbrk_size = alloc_amount + (uintptr_t) fsb - sbrk_start;

	mutex_unlock(&sb_lock);

	/* Are we improperly aligned? */
	if ((SLABSIZE - 1) & (uintptr_t) fsb)
	{
		/* Realign myself (wastes memory) */
		freesb *fsb_new = (freesb *) slab_start(shift(fsb, SLABSIZE - 1));

		/* Did we shift too much? */
		if ((uintptr_t) fsb_new - (uintptr_t) fsb > alloc_amount - SLABSIZE * (SLABBMAX + 1))
		{
			alloced--;
		}

		fsb = fsb_new;
	}

	/* Fill in details */
	for (i = 0; i < alloced; i++)
	{
		fsb->blocks[i] = shift(fsb, SLABSIZE * (i + 1));
	}
	fsb->count = alloced;

	return fsb;
}
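
/*
 * Editor's note (sketch): the realignment above rounds an arbitrary sbrk()
 * result up to the next SLABSIZE boundary by adding SLABSIZE - 1 and
 * masking (via shift() + slab_start()), dropping one block if the shift ate
 * too much of the allocation.  A hypothetical stand-alone form of the same
 * round-up, not used by this file:
 */
#if 0
static uintptr_t round_up_to_slab(uintptr_t addr)
{
	/* e.g. 0x23456 rounds up to 0x40000 when SLABSIZE == 0x20000 */
	return (addr + SLABSIZE - 1) & -SLABSIZE;
}
#endif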

static noinline void slab_free_chunk(atls *tl, freesb *fsb)
{
	DECL_PROF_FUNC;

	unsigned i;

	/* Mark memory as unused */
	for (i = 0; i < fsb->count; i++)
	{
#ifdef WINDOWS
		VirtualFree(fsb->blocks[i], SLABSIZE, MEM_DECOMMIT);
#else
		madvise(fsb->blocks[i], SLABSIZE, MADV_DONTNEED);
#endif
	}

	/* Make sure percpu value isn't too big */
	if (tl->percpu_hash > cpu_total)
	{
		tl->percpu_hash %= cpu_total;
	}

	/* First trylock everything, to find something that works */
	for (i = tl->percpu_hash; i < cpu_total; i++)
	{
		if (!mutex_trylock(&pc_slab[i].m))
		{
			tl->percpu_hash = i;
			goto success;
		}
	}

	for (i = 0; i < tl->percpu_hash; i++)
	{
		if (!mutex_trylock(&pc_slab[i].m))
		{
			tl->percpu_hash = i;
			goto success;
		}
	}

	/* Failure - too much contention, just use our current value */
	i = tl->percpu_hash;
	mutex_lock(&pc_slab[i].m);

success:
	tl->percpu_hash = i;
	fsb->next = pc_slab[i].list;
	pc_slab[i].list = fsb;
	mutex_unlock(&pc_slab[i].m);
}


static sbheader *slab_alloc_block(atls *tl)
{
	freesb *fsb;
	sbheader *sb;

	/* Make sure we have empty blocks */
	if (!tl->slab_chunk)
	{
		tl->slab_chunk = slab_alloc_chunk(tl);
		if (!tl->slab_chunk) return NULL;
	}

	fsb = tl->slab_chunk;

	if (!fsb->count)
	{
		sb = (sbheader *) fsb;

		tl->slab_chunk = NULL;

		/* Prevent reuse of this overwritten block */
		sb->size = 0;

		return sb;
	}

	fsb->count--;
	sb = fsb->blocks[fsb->count];

	return sb;
}

static void slab_free_block(atls *tl, sbheader *sb)
{
	freesb *ofsb = tl->slab_chunk;
	freesb *fsb = (freesb *) sb;

	if (ofsb)
	{
		if (ofsb->count < SLABBMAX - 1)
		{
			ofsb->blocks[ofsb->count] = sb;
			ofsb->count++;
			return;
		}

		/* Current chunk is full - make this block the new empty chunk and release the full one */
		fsb->count = 0;
		tl->slab_chunk = fsb;

		slab_free_chunk(tl, ofsb);

		return;
	}

	/* Simplest case - no chunk yet */
	fsb->count = 0;
	tl->slab_chunk = fsb;
}

static int init_sldata(void)
{
	unsigned i;

	/* Init total number of cpus */
	cpu_total = cpu_num();

	/* Init per-cpu free slab lists */
	pc_slab = big_alloc_aux(page_align(cpu_total * 64));
	if (!pc_slab) return 1;

	/*
	 * A zeroed mutex is unlocked and a NULL pointer is an empty list,
	 * so the zero-filled pc_slab needs no further initialization.
	 */

	/* Calculate slab total sizes so we avoid a division later */
	for (i = 1; i < NUM_SB; i++)
	{
		unsigned size = i * 16;

		/* total size taken by all blocks */
		sltotal[i] = ((SLABSIZE - offsetof(sbheader, data))/size) * size;
	}

	return 0;
}
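
/*
 * Editor's note: sltotal[i] caches the number of bytes in a slab that can
 * actually hold objects of size i * 16, so the per-allocation path never
 * has to divide.  For example, for the 48-byte class (i == 3) it is the
 * largest multiple of 48 that fits in SLABSIZE minus the sbheader header.
 */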

static sbheader *slab_create(atls *tl, dlist *slab, unsigned size)
{
	DECL_PROF_FUNC;

	unsigned index = size/16;
	unsigned total = sltotal[index];

	uintptr_t offset;

	sbheader *sb = slab_alloc_block(tl);
	if (!sb) return NULL;

	/* Normalize size */
	size = index * 16;

	/* Fill in details */
	sb->tail = &tl->tail;

	dlist_add(slab, &sb->list);

	/* Already initialized? */
	if (sb->size == size) return sb;

	sb->used = 0;
	sb->size = size;

	/* Calculate offset */
	if ((size == 64) || (size == 256))
	{
		/* Make cacheline-aligned */
		offset = (uintptr_t) sb + 128 + 1;
	}
	else
	{
		void *tmp;

		/* Start of region */
		offset = (-16 & (uintptr_t) &sb->data);

		/* Randomize offset */
		tmp = rnd_offset((void *) offset, (uintptr_t) sb + SLABSIZE - (uintptr_t) offset, total);

		offset = (uintptr_t) tmp + 1;
	}

#ifdef DEBUG_ALLOC
	if (offset - 1 + total - (uintptr_t) sb > SLABSIZE) errx(1, "slab overflow\n");
#endif

	sb->first = offset;
	sb->max = (uintptr_t) sb + SLABSIZE - sb->size;

	return sb;
}

static void slab_free(atls *tl, void *p)
{
	DECL_PROF_FUNC;

	sbheader *sb = slab_start(p);

	/* Do I own this? */
	if (unlikely(sb->tail != &tl->tail))
	{
		/* Hack wrt mealloc */
		prepend_queue(p, tl, &sb->tail);

		return;
	}

	/* Add to this slab's free list */
	*(uintptr_t *)p = sb->first;
	sb->first = (uintptr_t) p;

	sb->used--;
	if (!sb->used)
	{
		/* If I am the only one in the partial list, don't bother to delete */
		if (sb->list.next == sb->list.prev) return;

		dlist_del(&sb->list);

		/* Free it */
		slab_free_block(tl, sb);
	}
}

static void *slab_alloc(atls *tl, size_t size);
static noinline void *slab_alloc_nolist(size_t size, dlist *slab, atls *tl)
{
	DECL_PROF_FUNC;

	void *res;

	/* Out of line zero-check */
	if (!size)
	{
		size++;
	}
	else
	{
		/* Scan queue if we have nothing left */
		if (!tl->slab_chunk)
		{
			scan_queue(tl, &tl->head, 0);
		}

		/* Still nothing? */
		if (dlist_empty(slab))
		{
			if (!slab_create(tl, slab, size + 15))
			{
				goto again;
			}
		}
	}

	/* We have something to use */
	res = slab_alloc(tl, size);
	if (res) return res;

again:

	size = sep_align(size);
	return local_alloc(tl, size);
}

static void *slab_alloc_aux(atls *tl, dlist *slab)
{
	DECL_PROF_FUNC;

	/* Get sbheader */
	sbheader *sb = list_entry(sbheader, list, slab->next);

	uintptr_t p = sb->first;

	sb->used++;

	if (!(p & 1))
	{
		sb->first = *(uintptr_t *) (void *) p;

		if (!sb->first) goto done;
	}
	else
	{
		p--;
		sb->first += sb->size;
		if (sb->first > sb->max)
		{
			sb->first = 0;

			goto done;
		}
	}

	return (void *) p;

done:
	/* Move to full list */
	dlist_del(&sb->list);
	dlist_add(&tl->slab_full, &sb->list);

	return (void *) p;
}
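
/*
 * Editor's note: sb->first does double duty in the code above.  An even
 * value is the head of the slab's free list (freed objects are 16-byte
 * aligned and store the next link in their first word), while an odd value
 * is a bump pointer into the never-used tail of the slab, tagged with +1;
 * the tag is stripped and the pointer advanced by sb->size on each
 * allocation.  Zero means the slab is exhausted and it moves to slab_full.
 */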

static void *slab_alloc(atls *tl, size_t size)
{
	dlist *slab;

	size_t nsize = size + 15;

#ifdef DEBUG_NO_SLAB
	size = sep_align(size);
	return local_alloc(tl, size);
#endif

	/* Get slab */
#ifdef __x86_64__
	slab = shift(&tl->slab[0], nsize & ~15);
#else
	slab = shift(&tl->slab[0], (nsize & ~15) / 2);
#endif

	if (dlist_empty(slab)) return slab_alloc_nolist(size, slab, tl);

	return slab_alloc_aux(tl, slab);
}

/* Same as above, but fail if we can't quickly allocate */
static void *slab_alloc_safe(atls *tl, size_t size)
{
	dlist *slab;

#ifdef DEBUG_NO_SLAB
	return NULL;
#endif

	size += 15;

	/* Get slab */
#ifdef __x86_64__
	slab = shift(&tl->slab[0], size & ~15);
#else
	slab = shift(&tl->slab[0], (size & ~15) / 2);
#endif

	/* Fail if we can't quickly allocate (don't call scan_queue) */
	if (dlist_empty(slab) && !slab_create(tl, slab, size)) return NULL;

	return slab_alloc_aux(tl, slab);
}

static noinline void *slab_zalloc(atls *tl, size_t size)
{
	void *p = slab_alloc(tl, size);
	if (p) return memset(p, 0, size);

	size = sep_align(size);

	p = fast_alloc(tl, size);
	if (!p)
	{
		tl->callocable = 0;
		p = slow_alloc(tl, size);

		/* No need to memset? */
		if (!p || tl->callocable)
		{
			return p;
		}
	}

	/* Success */
	return memset(p, 0, size - 8);
}

static void slab_merge(atls *tl1, atls *tl2)
{
	int i;
	dlist *d, *dn;

	/* Merge partial slabs */
	for (i = 0; i < NUM_SB; i++)
	{
		/* Update all the tail pointers */
		scan_list_safe(&tl2->slab[i], d, dn)
		{
			sbheader *sb = list_entry(sbheader, list, d);
			sb->tail = &tl1->tail;

			/* There may be one empty slab in this slot */
			if (!sb->used)
			{
				/* Move to full list */
				dlist_del(&sb->list);
				dlist_add(&tl1->slab_full, &sb->list);
			}
		}

		dlist_merge(&tl1->slab[i], &tl2->slab[i]);
	}

	/* Merge full and empty slabs */

	/* Update all the tail pointers */
	scan_list(&tl2->slab_full, d)
	{
		sbheader *sb = list_entry(sbheader, list, d);
		sb->tail = &tl1->tail;
	}

	dlist_merge(&tl1->slab_full, &tl2->slab_full);

	/* Get rid of empty pages */
	if (tl2->slab_chunk) slab_free_chunk(tl1, tl2->slab_chunk);
}

static void local_free(atls *tl, void *p)
{
	if (is_slab(p))
	{
		slab_free(tl, p);
	}
	else
	{
		btree *b = CONTAINER(btree, data, p);
		fast_free(tl, b, b->s.size);
	}
}

static void *local_alloc(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	void *p = fast_alloc(tl, size);
	if (p) return p;

	return slow_alloc(tl, size);
}

static void test_all(atls *tl)
{
	test_fast_lists(tl);
	test_double_lists(tl);
	test_small_list(tl);
	test_btree(tl);
	test_queue(tl);
	test_slab(tl);
	test_blocks(tl);
}

static void block_list_merge(atls *tl1, atls *tl2)
{
	mealloc *m;
	dlist *d;

	/* Scan block list, and update all tail pointers */
	scan_list(&tl2->bl, d)
	{
		m = list_entry(mealloc, m_list, d);
		m->tail = &tl1->tail;
	}

	dlist_merge(&tl1->bl, &tl2->bl);
}

static void atls_merge(atls *tl1, atls *tl2)
{
	int i;

	/* Merge block lists so others know about us */
	block_list_merge(tl1, tl2);

	/* Then merge the btrees */
	btree_merge(tl1, tl2);

	/* Merge the fast lists */
	fast_merge(tl1, tl2);

	/* Merge the slabs */
	slab_merge(tl1, tl2);

	/* Then the double links */
	for (i = 1; i < NUM_QS; i++)
	{
		dlist_merge(&tl1->qs[i], &tl2->qs[i]);
	}

	/* Finally the small list */
	small_merge(tl1, tl2);

	test_all(tl1);
}

static btree *split_node_rhs(atls *tl, btree *b, size_t t_size, size_t msize)
{
	size_t r_size = t_size - msize;

	btree *bm = shift(b, msize);

#ifdef DEBUG_ALLOC_SLOW	
	if (t_size != b->s.size) errx(1, "size mismatch\n");
	if (msize > t_size) errx(1, "too big to fit in split\n");
	check_sep(b);
#endif

	/* Nothing left over to split off? */
	if (!r_size) return b;

	/* Update local size */
	b->s.size = msize;

	/* Create middle separator */
	set_sep(bm, r_size, b);

	check_sep(bm);

	/* Free the left-hand portion so it can be reused */
	fast_free(tl, b, msize);

	/* Paranoia */
	check_sep(b);
	check_sep(bm);

	/* Return the right-hand node for the caller to use */
	return bm;
}

static always_inline void *split_node(atls *tl, btree *b, size_t t_size, size_t msize)
{
	size_t r_size = t_size - msize;

	btree *bm = shift(b, msize);

#ifdef DEBUG_ALLOC_SLOW	
	if (t_size != b->s.size) errx(1, "size mismatch\n");
	if (msize > t_size) errx(1, "too big to fit in split\n");
	check_sep(b);
#endif

	/* Remainder too small to be worth splitting off? */
	if (r_size * 4 < msize)
	{
		/* Use the whole node */
		return &b->data;
	}

	/* Update local size */
	b->s.size = msize;

	/* Create middle separator */
	set_sep(bm, r_size, b);

	check_sep(bm);

	/* Make sure to try to use remainder */
	fast_free(tl, bm, r_size);

	/* Paranoia */
	check_sep(b);
	check_sep(bm);

	return &b->data;
}

static void node_insert(atls *tl, btree *b)
{
	size_t size = b->s.size;

	if (size > QS_MAX)
	{
		/* Insert new segment into btree */
		btree_insert(tl, b, size);

		return;
	}

	if (size == 16)
	{
		small_insert(tl, b);

		return;
	}

	dlist_add(MYSIZE_TO_PTR(tl, size), &b->list2);
	tl->q_mask |= 1ULL << ((size - 8) / 16);
}
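
/*
 * Editor's note (worked example): free nodes are routed by size.  16-byte
 * nodes go onto the small list, anything larger than QS_MAX goes into the
 * btree, and the rest land in one of the NUM_QS doubly linked queues with
 * the matching bit set in q_mask so the allocation path can skip empty
 * queues.  A 48-byte node, for instance, maps to queue (48 - 8) / 16 == 2.
 */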

/* Complete merge */
static void merge_node_aux(atls *tl, btree *bl)
{
	DECL_PROF_FUNC;

	size_t msize = bl->s.size;

	/* Are we the only element in the allocation? */
	btree *br = shift(bl, msize);

	mealloc *m;

	/* Is block empty? */
	if ((bl->s.bs_offset >= HEADERSIZE) || br->s.size || (msize * 4 > tl->a_alloced)) goto save;

	/* Save a block, if it is big enough to use for a pending allocation */
	if (tl->s_wanted && (tl->s_wanted <= bl->s.size))
	{
		/* No longer wanted */
		tl->s_wanted = 0;

		goto save;
	}

	/* Get header */
	m = page_start(bl);

	/* Remove from the list */
	dlist_del(&m->m_list);

	/* Size of block */
	msize = m->b.s.size + HEADERSIZE;

#ifdef DEBUG_ALLOC_SLOW
	if (msize & (PAGESIZE - 1)) errx(1, "big block size incorrect\n");
#endif

	tl->a_alloced -= msize;

	big_freed(m, msize);

#ifndef WINDOWS
	munmap(m, msize);
#else
	VirtualFree(m, 0, MEM_RELEASE);
#endif
	return;

save:
	/* element is unallocated */
	set_unused(bl, br);

	/* Insert into correct data structure */
	node_insert(tl, bl);
}

/* Merge a node with unallocated neighbours + insert into free list */
static void merge_node(atls *tl, void *p)
{
	DECL_PROF_FUNC;

	btree *b = CONTAINER(btree, data, p);
	btree *bl = b, *br = shift(b, b->s.size);

	size_t tsize;

#ifdef DEBUG_ALLOC_SLOW
	if (un_used(b)) errx(1, "merging unused node");
#endif

	/* Test right */
	if (un_used(br))
	{
		if (br->s.bs_offset & FLG_SIZE8)
		{
			small_remove(br);
			tsize = 16;
		}
		else
		{
			tsize = br->s.size;

			if (tsize > QS_MAX)
			{
				btree_node_del(tl, br->parent, b_pindex(br));
			}
			else
			{
				dlist_del(&br->list2);
			}
		}

		/* Fixup sizes */
		b->s.size += tsize;
	}

	/* Test left */
	if (left_unused(b))
	{
		if (b->s.bs_offset & FLG_LSIZE8)
		{
			bl = shift(b, -(uintptr_t)16);

			small_remove(bl);
		}
		else
		{
			bl = b->s.left;

			if (bl->s.size > QS_MAX)
			{
				btree_node_del(tl, bl->parent, b_pindex(bl));
			}
			else
			{
				dlist_del(&bl->list2);
			}
		}

		/* Fixup sizes */
		bl->s.size += b->s.size;
	}

	merge_node_aux(tl, bl);
}

#ifdef __x86_64__

#define SAVE_REG(N, V)\
	asm volatile ("movq %0, %%xmm" #N "\n\t" :: "mr" (V))

#define RESTORE_REG(N, V)\
	asm volatile ("movq %%xmm" #N ", %0\n\t" : "=r" (V))

static always_inline void *fast_alloc(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	size_t n;
	u64b mask, tmask;
	slist *p;

	btree *b;
	size_t rsize;

	if (unlikely(size > FAST_64_BIN)) return NULL;

	n = size2fl(size);
	mask = FAST_MASK << n;
	tmask = tl->f_mask & mask;

	/* Anything to do? */
	while (tmask)
	{
		n = ffsq(tmask);
		p = &tl->fl[n];

		SAVE_REG(0, tmask);

		while (p->next)
		{
			slist *s = slist_rem(p);
			b = CONTAINER(btree, list, s);

			rsize = b->s.size;

			check_sep(b);

			/* Found a match? */
			if (likely(rsize >= size)) return &b->data;

			RESTORE_REG(0, tmask);

			/* Move to lower bin */
			//fast_add(tl, b, n - 1);

			/* Inlined version of the above so %rbx isn't used */
			slist_add(&p[-1], &b->list);
			tmask = tmask / 2;
			tl->f_mask |= tmask & (-tmask);
		}

		RESTORE_REG(0, tmask);

		/*
		 * Turn off least significant bit in tmask, as nothing is left there
		 */
		mask = (tmask - 1) | ~tmask;
		tmask &= mask;
		tl->f_mask &= mask;
	}

	return NULL;
}

static void *slow_alloc_aux(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	size_t n;
	u64b mask, tmask;

	btree *b;
	dlist *d;
	size_t rsize;

	/* Special case the smallest (16-byte) allocations */
	if (size == 16)
	{
		b = small_del_first(tl);
		if (b)
		{
			set_used(b, 16);

			return &b->data;
		}

		n = 1;
	}
	else
	{
		n = (size / 16) - 1;
	}

	mask = (~0ULL) << n;
	tmask = tl->q_mask & mask;

	/* Are there nodes big enough in the queues? */
	while (tmask)
	{
		/* Ignore if bit unset */
		n = ffsq(tmask);
		d = &tl->qs[n];

		/* Found something? */
		if (d->next != d)
		{
			b = list_entry(btree, list2, d->next);

			/* Paranoia */
			check_sep(b);

			dlist_del(&b->list2);

			rsize = (n + 1) * 16;
			set_used(b, rsize);
			return split_node(tl, b, rsize, size);
		}

		/*
		 * Turn off least significant bit in tmask, as nothing is left there
		 */
		mask = (tmask - 1) | ~tmask;
		tmask &= mask;
		tl->q_mask &= mask;
	}

	return NULL;
}
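
/*
 * Editor's note (worked example): the mask used by the two loops above,
 * (tmask - 1) | ~tmask, has every bit set except the lowest set bit of
 * tmask.  ANDing it into tmask is the classic tmask &= tmask - 1 "clear
 * lowest set bit" step, and ANDing it into f_mask/q_mask clears the same
 * now-known-empty bin in one operation.  With tmask == 0b10100, for
 * example, the mask has only bit 2 clear, so exactly that bin is dropped.
 */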

#else

/* Versions optimized for 32bit */
static always_inline void *fast_alloc(atls *tl, size_t size)
{
	size_t n = size2fl(size);

	unsigned tmask;

	slist *p;

	btree *b;
	size_t rsize;

	if (n < 32)
	{
		tmask = tl->f_mask & (FAST_MASK << n);

		/* Anything to do? */
		while (tmask)
		{
			n = ffsu(tmask);
			p = &tl->fl[n];

			while (p->next)
			{
				slist *s = slist_rem(p);
				b = CONTAINER(btree, list, s);

				rsize = b->s.size;

				check_sep(b);

				/* Found a match? */
				if (likely(rsize >= size)) return &b->data;

				/* Move to lower bin */
				fast_add(tl, b, n - 1);
			}

			/*
			 * Turn off least significant bit in tmask, as nothing is left there
			 */
			tmask &= tmask - 1;
			tl->f_mask &= ~(1ULL << n);
		}

		tmask = (tl->f_mask >> 32) & (FAST_MASK >> (32 - n));
	}
	else
	{
		tmask = (tl->f_mask >> 32) & (FAST_MASK << (n - 32));
	}

	if (unlikely(size >= FAST_64_BIN)) return NULL;

	/* Anything to do? */
	while (tmask)
	{
		n = ffsu(tmask) + 32;
		p = &tl->fl[n];

		while (p->next)
		{
			slist *s = slist_rem(p);
			b = CONTAINER(btree, list, s);

			rsize = b->s.size;

			check_sep(b);

			/* Found a match */
			if (likely(rsize >= size)) return &b->data;

			/* Move to lower bin */
			fast_add(tl, b, n - 1);
		}

		/*
		 * Turn off least significant bit in tmask, as nothing is left there
		 */
		tmask &= tmask - 1;
		tl->f_mask &= ~(1ULL << n);
	}

	return NULL;
}

static void *slow_alloc_aux(atls *tl, size_t size)
{
	size_t n;
	unsigned tmask;

	btree *b;
	dlist *d;
	size_t rsize;

	/* Special case the smallest (16-byte) allocations */
	if (size == 16)
	{
		b = small_del_first(tl);
		if (b)
		{
			set_used(b, 16);
			return &b->data;
		}

		n = 1;
	}
	else
	{
		n = (size / 16) - 1;
	}

	if (n < 32)
	{
		tmask = tl->q_mask & (~0 << n);

		/* Are there nodes big enough in the queues? */
		while (tmask)
		{
			/* Ignore if bit unset */
			n = ffsu(tmask);
			d = &tl->qs[n];

			/* Found something? */
			if (d->next != d)
			{
				b = list_entry(btree, list, d->next);

				/* Paranoia */
				check_sep(b);

				dlist_del(&b->list2);

				rsize = (n + 1) * 16;
				set_used(b, rsize);

				return split_node(tl, b, rsize, size);
			}

			/*
			 * Turn off least significant bit in tmask, as nothing is left there
			 */
			tmask &= tmask - 1;
			tl->q_mask &= ~(1ULL << n);
		}

		tmask = tl->q_mask >> 32;
	}
	else
	{
		tmask = (tl->q_mask >> 32) & (~0 << (n - 32));
	}

	/* Are there nodes big enough in the queues? */
	while (tmask)
	{
		/* Ignore if bit unset */
		n = ffsu(tmask) + 32;
		d = &tl->qs[n];

		/* Found something? */
		if (d->next != d)
		{
			b = list_entry(btree, list, d->next);

			/* Paranoia */
			check_sep(b);

			dlist_del(&b->list2);

			rsize = (n + 1) * 16;
			set_used(b, rsize);

			return split_node(tl, b, rsize, size);
		}

		/*
		 * Turn off least significant bit in tmask, as nothing is left there
		 */
		tmask &= tmask - 1;
		tl->q_mask &= ~(1ULL << n);
	}

	/* Failed */
	return NULL;
}

#endif

static void *block_alloc_aux(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	btree *b, *br;
	mealloc *ma;
	size_t rsize, tasize;

	/* Make overhead 1/4th of total allocated after this allocation */
	tasize = size + (size + tl->a_alloced) / 3;
	tasize = page_align(tasize);

	/* Clip to BTMALLOC */
	if (tasize > BTMALLOC) tasize = BTMALLOC;

	/* Must be more than MINALLOC */
	if (tasize < MINALLOC) tasize = MINALLOC;

	ma = big_alloc_aux(tasize);

	if (!ma)
	{
		/* Try with smaller alloc */
		tasize = page_align(size + HEADERSIZE);

		ma = big_alloc_aux(tasize);

		if (!ma)
		{
			/* Try again if new handler works */
			if (handle_oom(size)) return slow_alloc(tl, size);

			return NULL;
		}
	}

	rsize = tasize - HEADERSIZE;

	/* Keep track of total allocations */
	tl->a_alloced += tasize;

	/* Fill in header */
	dlist_add(&tl->bl, &ma->m_list);

	ma->tail = &tl->tail;

	b = &ma->b;

	/* Create left seperator */
	b->s.size = rsize;
	b->s.bs_offset = 16;

	/* Position of right seperator */
	br = shift(b, rsize);

	/* Create right seperator */
	br->s.bs_offset = tasize - SEPSIZE;

	tl->callocable = 1;

	return split_node(tl, b, rsize, size);
}
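
/*
 * Editor's note (worked example): with S = size + a_alloced the request is
 * padded by S / 3, so once the new block is counted the total becomes
 * S + S / 3 = 4S / 3 and the padding is one quarter of it, which is what
 * the "overhead 1/4th" comment above refers to (before page alignment and
 * the MINALLOC/BTMALLOC clamps).  E.g. a 1MB request with 2MB already
 * allocated is padded to 2MB, giving a new total of 4MB.
 */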

static void *block_alloc(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	btree *b;
	void *p;

	if (size >= BTMALLOC)
	{
		tl->callocable = 1;
		return big_alloc(tl, size);
	}

	if (size <= QS_MAX)
	{
		p = slow_alloc_aux(tl, size);
		if (p) return p;

		/* Clear fast lists */
		clear_fast(tl);

		p = slow_alloc_aux(tl, size);
		if (p) return p;
	}
	else
	{
		/* Clear fast lists */
		clear_fast(tl);
	}

	/* Try to alloc on the btree */
	b = btree_get(tl, size);
	if (b) return split_node(tl, b, b->s.size, size);

	/* Try to grab space from a dead thread */
	if (reap_dead(tl)) return local_alloc(tl, size);

	/* We need more space, so try to free memory. */
	if (scan_queue(tl, &tl->head, size)) return local_alloc(tl, size);

#ifdef DEBUG_LEAK
	leak_print(tl, "Trying to allocate %llu (%p) but cannot\n", (unsigned long long) size, (void *) size);
	malloc_stats_aux(3);
#endif

	/* Failure - make a big alloc, and add to the btree */
	return block_alloc_aux(tl, size);
}


static void *slow_alloc(atls *tl, size_t size)
{
	DECL_PROF_FUNC;

	/* Fast allocation failed - try normal data structures */
	if (size <= QS_MAX)
	{
		void *res = slow_alloc_aux(tl, size);
		if (res) return res;
	}

	return block_alloc(tl, size);
}

/* Free with no memory usage */
static void free_nomem(void *p)
{
	DECL_PROF_FUNC;

	btree *b = CONTAINER(btree, data, p);

	mealloc *m;

	slist *v, *tail;

#ifdef DEBUG_ALLOC	
	/* Check for double-free errors */
	if (un_used(b)) errx(1, "Double free with %p\n", p);
#endif

	/* Get block start */
	m = read_bs(b);

	/* Treat node as a list now */
	v = &b->list;

	v->next = NULL;

	/*
	 * Prevent other threads from dying because we have no hazard pointer
	 * This protects the dereference of m->tail
	 */
	mutex_lock(&h_lock);

	/* Prepend the data */
	tail = xchg_ptr(m->tail, v);

	/* Done */
	mutex_unlock(&h_lock);

	tail->next = v;
}
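
/*
 * Editor's note: this is the producer side of the cross-thread free queue.
 * The freed node is turned into a list element, the owner's tail pointer is
 * swapped to it with a single atomic exchange, and the previous tail is
 * then linked forward; the owning thread later drains the queue in
 * scan_queue().  free() below takes the same path via prepend_queue() when
 * the block belongs to another thread.
 */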

static noinline void free_aux(void *p)
{
	DECL_PROF_FUNC;

	atls *tl = init_tls();
	if (!tl)
	{
		free_nomem(p);
		return;
	}

	PREFIX(free)(p);
}

static noinline void free_clear(atls *tl)
{
	clear_fast(tl);
}

void PREFIX(free)(void *p)
{
	DECL_PROF_FUNC;

	btree *b;
	size_t size;

	atls *tl;

	if (!p) return;

	tl = get_tls();

	if (!tl)
	{
		free_aux(p);
		return;
	}

	test_all(tl);

	if (likely(is_slab(p)))
	{
		slab_free(tl, p);

		test_all(tl);

		return;
	}

	b = CONTAINER(btree, data, p);
	size = b->s.size;

#ifdef DEBUG_ALLOC
	/* Check for double-free errors */
	if (un_used(b)) errx(1, "Double free with %p\n", p);
#endif

	if (size)
	{
		/* Get block start */
		mealloc *m = read_bs(b);

		/* Not our block - its tail points at the owning thread's queue */
		if (unlikely(m->tail != &tl->tail))
		{
			/* Add to their queue, and let them deal with it */
			prepend_queue(p, tl, &m->tail);

			return;
		}

		/* Inlined version of fast_free() */
		size = size2fl(size);
		tl->f_mask |= 1ULL << size;
		slist_add(&tl->fl[size], p);

		tl->fcount++;
		if (!(tl->fcount & FREE_FAST)) free_clear(tl);

		test_all(tl);
	}
	else
	{
		big_free_aux(page_start(b));
	}
}
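
/*
 * Editor's note: the inlined fast-free path above maps the block to its
 * size class with size2fl(), sets the matching bit in f_mask, and pushes
 * the node onto the singly linked fast list.  FREE_FAST is (1 << 16) - 1,
 * so the "!(tl->fcount & FREE_FAST)" test flushes the fast lists with
 * clear_fast() once every 65536 frees to bound how much memory they hold.
 */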
#if 0
void cfree(void *p)
{
	PREFIX(free)(p);
}
#endif
static noinline void *malloc_aux(size_t size)
{
	DECL_PROF_FUNC;

	atls *tl = init_tls();
	if (!tl) return NULL;
	return PREFIX(malloc)(size);
}

void *PREFIX(malloc)(size_t size)
{
	DECL_PROF_FUNC;

	void *res;
	atls *tl;

	test_leak();

	/* Init local data if required */
	tl = get_tls();

	if (!tl) return malloc_aux(size);

	test_all(tl);

	if (likely(size <= SB_MAX)) return slab_alloc(tl, size);

	/* Prevent overflow bug in sep_align() below */
	if (unlikely(size > BTMALLOC)) return big_alloc(tl, size);

	size = sep_align(size);
	res = fast_alloc(tl, size);
	if (res) return res;

	return slow_alloc(tl, size);
}

#ifdef DEBUG_ALLOC_SLOW
static void test_wiped(void *p, size_t len)
{
	char *endy = &(((char *)p)[len - 8]);
	char *y;

	if (!len) return;

	for (y = p; y < endy; y++)
	{
		if (*y) errx(1, "found non-zero\n");
	}
}
#else
#define test_wiped(P, L) ((void) (sizeof(P) + sizeof(L)))
#endif

static noinline void *zalloc_aux(size_t size)
{
	atls *tl = init_tls();
	if (!tl) return NULL;
	return zalloc(tl, size);
}

static void *zalloc(atls *tl, size_t size)
{
	void *p;

	test_leak();
	test_all(tl);

	if (likely(size <= SB_MAX)) return slab_zalloc(tl, size);

	/* Prevent overflow bug in sep_align() below */
	if (unlikely(size > BTMALLOC)) return big_alloc(tl, size);

	size = sep_align(size);

	p = fast_alloc(tl, size);

	if (!p)
	{
		tl->callocable = 0;
		p = slow_alloc(tl, size);

		/* No need to memset? */
		if (!p || tl->callocable)
		{
			test_wiped(p, size);

			return p;
		}
	}

	test_all(tl);

	return memset(p, 0, size - 8);
}

static size_t safemul(size_t n, size_t size)
{
#ifdef __x86_64__
	/* 64 bit */
	__uint128_t dn = n;
	__uint128_t dsize = size;
	__uint128_t drsize = dn*dsize;
	size_t rsize = drsize;
	if (drsize >> 64)
	{
		/* Overflow */
		return TOP_SIZE + 1;
	}

#else

	/* 32 bit */
	u64b dn = n;
	u64b dsize = size;
	u64b drsize = dn*dsize;
	size_t rsize = drsize;

	if (drsize >> 32)
	{
		/* Overflow */
		return TOP_SIZE + 1;
	}
#endif

	return rsize;
}
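
/*
 * Editor's note (sketch): safemul() widens the multiplication to catch
 * n * size overflow and returns TOP_SIZE + 1 so callers fail cleanly.  On
 * compilers that provide it, the same check could use the
 * __builtin_mul_overflow intrinsic (an assumption - this file does not rely
 * on it); a hypothetical equivalent:
 */
#if 0
static size_t safemul_alt(size_t n, size_t size)
{
	size_t r;

	/* Returns nonzero if the product did not fit in a size_t */
	if (__builtin_mul_overflow(n, size, &r)) return TOP_SIZE + 1;

	return r;
}
#endif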

void *PREFIX(calloc)(size_t n, size_t size)
{
	DECL_PROF_FUNC;

	/* Init local data if required */
	atls *tl = get_tls();

	size = safemul(n, size);

	test_leak();

	if (!tl) return zalloc_aux(size);

	return zalloc(tl, size);
}

#ifdef WINDOWS
void *PREFIX(_calloc_impl)(size_t n, size_t size, int *errno_tmp)
{
	DECL_PROF_FUNC;

	void *ret;
	atls *tl;

	int errno_orig;
	if (!errno_tmp) return PREFIX(calloc)(n, size);

	/* Init local data if required */
	tl = get_tls();

	size = safemul(n, size);

	test_leak();

	if (!tl) return zalloc_aux(size);

	_get_errno(&errno_orig);

	ret = zalloc(tl, size);
	_get_errno(errno_tmp);
	_set_errno(errno_orig);

	return ret;
}
#endif

static noinline void *realloc_aux(void *p, size_t size)
{
	atls *tl = init_tls();

	/* Cannot allocate anything */
	if (!tl) return NULL;

	return PREFIX(realloc)(p, size);
}

static noinline void *realloc_aux2(void *p, size_t size, atls *tl)
{
	btree *b = CONTAINER(btree, data, p);
	size_t msize = b->s.size;

	size_t old_size;

#ifdef DEBUG_ALLOC
	if (un_used(b)) errx(1, "Realloc of unmalloced pointer %p\n", p);
#endif

	/* Was a big block? */
	if (!msize)
	{
#ifndef WINDOWS
		size_t *np;
#endif
		size_t *ps = page_start(b);

		size_t offset = (char *) b - (char *) ps;

		/* Get old size */
		old_size = *ps;

		/* If the request still fits and shrinking would free no more than half the block, keep it as-is */
		if ((old_size - offset <= size * 2) && (old_size - offset >= size)) return p;

		/* Resize to new big block if possible */
		if (size >= BTMALLOC)
		{
			/* Align */
			size = page_align(size + offset + offsetof(btree, data));

#ifndef WINDOWS
			/* Use (nonportable) mremap */
			np = mremap(ps, old_size, size, MREMAP_MAYMOVE);

			/* Success? */
			if (np != MAP_FAILED)
			{
				/* Save new size */
				*np = size;

				/* Return new pointer */
				return shift(np, offset + offsetof(btree, data));
			}
#endif

			if (size < old_size)
			{
#ifndef WINDOWS
				if (!munmap(shift(ps, size), old_size - size))
				{
					/* Update size */
					*ps = size;
				}
#else
				/*
				 * Say we no longer want the memory....
				 * But it is still mapped into our address space taking up room!
				 */
				if (VirtualAlloc(shift(ps, size), old_size - size, MEM_RESET,
				PAGE_NOACCESS))
				{
					/* Update size */
					*ps = size;
				}
#endif

				return p;
			}
		}
	}
	else
	{
		mealloc *m;

		/* Get old size */
		old_size = msize;

		size = sep_align(size);

		/* If the request still fits and shrinking would free no more than half the block, keep it as-is */
		if ((old_size <= size * 2) && (old_size >= size)) return p;

		m = read_bs(b);

		/* Local node? */
		if (m->tail == &tl->tail)
		{
			btree *br;

			/* Easy case */
			if (size <= msize) return split_node(tl, b, msize, size);

			/* Make sure adjacent nodes are in the btree */
			clear_fast(tl);

			/* Medium or small size - try to merge */
			br = shift(b, msize);
			if (un_used(br))
			{
				if (br->s.bs_offset & FLG_SIZE8)
				{
					small_remove(br);

					/* Fixup sizes */
					b->s.size += 16;
					msize += 16;

					br = shift(br, 16);

					/* Set it as used */
					br->s.bs_offset &= ~FLG_LUNUSED;
				}
				else
				{
					size_t rsize = br->s.size;
					if (rsize)
					{
						if (rsize > QS_MAX)
						{
							btree_node_del(tl, br->parent, b_pindex(br));
						}
						else
						{
							dlist_del(&br->list2);
						}

						/* Fixup sizes */
						b->s.size += rsize;
						msize += rsize;

						br = shift(br, rsize);

						/* Set it as used */
						br->s.bs_offset &= ~FLG_LUNUSED;
					}
				}
			}

			/* Region fits? */
			if (size <= msize) return split_node(tl, b, msize, size);
		}
		else
		{
			/* We can only shrink a foreign node */
			if (size <= msize)
			{
				/* Open coded split node */
				btree *bm = shift(b, size);

				/* Update my size */
				b->s.size = size;

				/* Create middle separator */
				set_sep(bm, old_size - size, b);

				/* Free the foreign excess */
				prepend_queue((void *) &bm->data, tl, &m->tail);

				return p;
			}
		}
	}

	/* Failure */
	return NULL;
}

void *PREFIX(realloc)(void *p, size_t size)
{
	DECL_PROF_FUNC;

	void *np;

	size_t old_size;

	/* Init local data if required */
	atls *tl = get_tls();

	int old_errno;

	test_leak();

	if (!tl) return realloc_aux(p, size);

	test_all(tl);

	/* realloc(p, 0) is the same as free(p) */
	if (!size)
	{
		PREFIX(free)(p);

		return NULL;
	}

	/* realloc(NULL, size) is the same as malloc(size) */
	if (!p) return PREFIX(malloc)(size);

	/* Too large to allocate */
	if (size > TOP_SIZE) goto nomem;

	if (!is_slab(p))
	{
		/* See if merging will work */
		np = realloc_aux2(p, size, tl);

		if (np) return np;
	}

	old_size = malloc_usable_size(p);

#ifdef WINDOWS
	_get_errno(&old_errno);
#else
	/* Failure - have to do it manually */
	old_errno = errno;
#endif

	np = PREFIX(malloc)(size);
	if (!np)
	{
		/* Is original allocation still okay? */
		if (size <= old_size)
		{
			/* Don't set errno to be ENOMEM */
#ifdef WINDOWS
			_set_errno(old_errno);
#else
			errno = old_errno;
#endif

			/* Return old allocation */
			return p;
		}
		goto nomem;
	}

	/* Copy data */
	if (size > old_size) size = old_size;
	memcpy(np, p, size);

	PREFIX(free)(p);

	/* Done */
	return np;

nomem:
	set_enomem();
	return NULL;
}

#ifndef WINDOWS
static void unmap_range(void *x1, void *x2)
{
	if (x1 != x2)
	{
		munmap(x1, (char *) x2 - (char *) x1);
	}
}
#endif

#ifdef DEBUG_ALLOC_SLOW
static void test_align(size_t align, void *p)
{
	uintptr_t x = (uintptr_t) p;
	if (align && (x & (align - 1))) errx(1, "Incorrect alignment of pointer\n");
}
#else
#define test_align(a, p) ((void) (sizeof(a) + sizeof(p)))
#endif

static noinline void *memalign_aux(size_t align, size_t size)
{
	atls *tl = init_tls();

	/* Cannot allocate anything! */
	if (!tl) return NULL;

	return PREFIX(memalign)(align, size);
}

#ifdef WINDOWS
static void *memalign_aux2(size_t align, size_t size, size_t rsize)
{
	size_t psize = page_align(rsize + PAGESIZE);
	size_t *ps;

	(void) size;

	if (align > PAGESIZE) goto nomem;
	if (rsize > TOP_SIZE) goto nomem;

	ps = VirtualAlloc(NULL, rsize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);

	/* VirtualAlloc() returns NULL on failure */
	if (!ps) goto nomem;

	/* Small alignment */
	*ps = psize;
	ps = shift(ps, PAGESIZE);

	test_align(align, ps);

	return ps;

nomem:
	set_enomem();
	return NULL;
}
#else
static void *memalign_aux2(size_t align, size_t size, size_t rsize)
{
	size_t pssize = page_align(size + PAGESIZE);
	size_t psize = page_align(rsize + PAGESIZE);
	size_t lsize;
	int flags = MAP_PRIVATE | MAP_ANONYMOUS;

	size_t *lstart, *lend;

	size_t *ps;
	void *p;

	if (rsize > TOP_SIZE) goto nomem;

	/*
	 * Hack - large alignments require no reservation,
	 * otherwise we run out of memory
	 */
	if (align > size) flags |= MAP_NORESERVE;

	ps = mmap(NULL, psize, PROT_READ | PROT_WRITE, flags, -1, 0);

	/* Out of memory */
	if (ps == MAP_FAILED) goto nomem;

	/* Small alignment */
	if (align <= PAGESIZE)
	{
		*ps = psize;
		p = shift(ps, PAGESIZE);

		test_align(align, p);

		return p;
	}

	/* Large alignment */
	lstart = ps;
	lsize = (-(uintptr_t) ps) & (align - 1);

	/* Already aligned - need to shift to get sep+size at beginning */
	if (!lsize)
	{
		ps = shift(ps, align - PAGESIZE);

		/* Fragment at beginning to unmap */
		unmap_range(lstart, ps);

		*ps = pssize;
		p = shift(ps, PAGESIZE);

		test_align(align, p);
		return p;
	}

	lend = shift(ps, rsize);
	ps = shift(ps, lsize - PAGESIZE);

	/* Fragment at beginning to unmap */
	unmap_range(lstart, ps);
	*ps = pssize;
	p = shift(ps, PAGESIZE);

	lstart = shift(p, pssize);

	/* Fragment at end to unmap */
	unmap_range(lstart, lend);

	test_align(align, p);
	return p;

nomem:
	set_enomem();
	return NULL;
}
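
/*
 * Editor's note: for alignments above PAGESIZE the function over-allocates
 * by roughly the alignment, picks an aligned position inside the mapping
 * that leaves one page in front of it to record the block size, and hands
 * the unused head and tail fragments straight back to the kernel with
 * unmap_range().  Only the size page and the region returned to the caller
 * stay mapped.
 */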
#endif

void *PREFIX(memalign)(size_t align, size_t size)
{
	DECL_PROF_FUNC;

	size_t rsize, lsize;

	void *p;

	btree *b;

	atls *tl = get_tls();

	test_leak();

	/* Too large to allocate */
	if (size > TOP_SIZE) goto nomem;

	if (align <= SEPSIZE) return PREFIX(malloc)(size);

	if (!tl) return memalign_aux(align, size);

	/* Try to cache-line align via slab special case */
	if ((size <= 64) && (align <= 64))
	{
		p = slab_alloc(tl, 64);
		if (p)
		{
			/* Double-check the alignment, as slab_alloc() may fall back when memory is low */
			if (!(63 & (uintptr_t) p)) return p;
			local_free(tl, p);
		}
	}

	size = sep_align(size);
	rsize = sep_align(size + align);

	/* Check for overflow */
	if ((rsize <= size) || (rsize <= align)) goto nomem;

	/* Large allocations are special */
	if (rsize >= BTMALLOC)
	{
		return memalign_aux2(align, size, rsize);
	}

	while (1)
	{
		p = fast_alloc(tl, rsize);
		if (p) break;

		if (rsize < QS_MAX)
		{
			p = slow_alloc_aux(tl, rsize);
			if (p) break;

			/* Clear fast lists */
			clear_fast(tl);

			p = slow_alloc_aux(tl, rsize);
			if (p) break;
		}
		else
		{
			/* Clear fast lists */
			clear_fast(tl);
		}

		/* Try to alloc on the btree */
		b = btree_get(tl, rsize);
		if (b)
		{
			p = split_node(tl, b, b->s.size, rsize);
			break;
		}

		/* Try to grab space from a dead thread */
		if (reap_dead(tl)) continue;

		/* We need more space, so try to free memory. */
		if (scan_queue(tl, &tl->head, rsize)) continue;

		/* Everything failed - fall back to large allocation */
		return memalign_aux2(align, size, rsize);
	}

	lsize = (-(uintptr_t) p) & (align - 1);

	b = CONTAINER(btree, data, p);

#ifdef DEBUG_ALLOC_SLOW
	if (rsize > b->s.size) errx(1, "node received is too small\n");
#endif

	/* get real size allocated */
	rsize = b->s.size;

	/* Already aligned? */
	if (!lsize)
	{
		test_align(align, &b->data);

		/* Split off part we need */
		return split_node(tl, b, rsize, size);
	}

	b = split_node_rhs(tl, b, rsize, lsize);
	test_align(align, &b->data);

	return split_node(tl, b, rsize - lsize, size);

nomem:
	set_enomem();
	return NULL;
}

int PREFIX(posix_memalign)(void **p, size_t align, size_t size)
{
	/* Make sure align is a power of two and a multiple of sizeof(void *) */
#ifdef __x86_64__	
	if (align & ((align - 1) | 7))
	{
		*p = NULL;
		return EINVAL;
	}
#else
	if (align & ((align - 1) | 3))
	{
		*p = NULL;
		return EINVAL;
	}
#endif

	*p = PREFIX(memalign)(align, size);

	if (!*p) return ENOMEM;

	return 0;
}
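
/*
 * Editor's note (worked example): for a nonzero align, the expression
 * align & ((align - 1) | 7) is zero precisely when align is a power of two
 * that is a multiple of 8 (sizeof(void *) on 64-bit).  E.g. align == 16:
 * 16 & (15 | 7) == 0, accepted; align == 24: 24 & (23 | 7) == 16, rejected;
 * align == 4: 4 & (3 | 7) == 4, rejected.  The 32-bit variant uses 3 for
 * the same reason.
 */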
#if 0
void *valloc(size_t size)
{
	return PREFIX(memalign)(PAGESIZE, size);
}

void *pvalloc(size_t size)
{
	return PREFIX(memalign)(PAGESIZE, page_align(size));
}
#endif
//#ifdef WINDOWS
#if 1
static
#endif
size_t malloc_usable_size(void *p)
{
	size_t offset;
	size_t *ps;
	size_t size;

	DECL_PROF_FUNC;

	btree *b = CONTAINER(btree, data, p);

	/* Don't crash on a NULL pointer */
	if (!p) return 0;

	/* Handle slab allocations */
	if (is_slab(p))
	{
		sbheader *sb = slab_start(p);
		return sb->size;
	}

	size = b->s.size;

	/* Small allocation */
	if (size) return size - PTRSIZE;

	/* Large allocation */
	ps = page_start(b);
	offset = (uintptr_t) &b->data - (uintptr_t) ps;

	return *ps - offset;
}

#ifdef WINDOWS
#ifdef PREFIX
size_t PREFIX(_msize)(void *p)
#else /* !PREFIX */
__attribute__((dllimport)) size_t _msize(void *p)
#endif /* PREFIX */
{
	return malloc_usable_size(p);
}
#endif


#if 0
struct mallinfo mallinfo(void)
{
	atls *tl = get_tls();
	struct mallinfo mi = {0,0,0,0,0,0,0,0,0,0};

	dlist *d;
	slist *s;

	int i;

	btree *b;

	size_t size;

	mi.arena = sbrk_size;

	if (!tl)
	{
		tl = init_tls();

		/* Cannot allocate anything, just return arena count */
		if (!tl) return mi;
	}

	/* Scan slab */
	for (i = 0; i < NUM_SB; i++)
	{
		scan_list(&tl->slab[i], d)
		{
			mi.smblks++;
			mi.usmblks++;
		}
	}

	scan_list(&tl->slab_full, d)
	{
		mi.smblks++;
		mi.usmblks++;
	}

	if (tl->slab_chunk)
	{
		mi.fsmblks = 1 + tl->slab_chunk->count;
		mi.smblks += mi.fsmblks;
	}

	/* Scan dlists */
	for (i = 1; i < NUM_QS; i++)
	{
		scan_list(&tl->qs[i], d)
		{
			mi.ordblks++;

			b = CONTAINER(btree, list, d);
			size = b->s.size - PTRSIZE;
			mi.fordblks += size;
		}
	}

	/* Add in results from small list */
	for (b = small_next((btree *) &tl->qs[0]); b != (btree *) &tl->qs[0]; b = small_next(b))
	{
		mi.ordblks++;
		mi.fordblks += 8;
	}

	/* Scan fastlists */
	for (i = 0; i < NUM_FL; i++)
	{
		scan_slist(&tl->fl[i], s)
		{
			mi.ordblks++;

			b = CONTAINER(btree, list, s);
			size = b->s.size - PTRSIZE;
			mi.fordblks += size;
		}
	}

	/* Count memory blocks */
	scan_list(&tl->bl, d)
	{
		mi.hblks++;
	}

	/* Count btree nodes */
	mi.hblkhd = count_btree(&tl->bheap);

	/* Count btree space */
	mi.fordblks += count_btree_space(&tl->bheap);

	/* Total allocated space (including overhead of separators and atls) */
	mi.uordblks = tl->a_alloced - mi.fordblks + PAGESIZE;

	/* Total easily callocable region */
	mi.keepcost = 0;

	/* Done */
	return mi;
}

int malloc_trim(size_t pad)
{
	atls *tl = get_tls();

	/* Nothing allocated - do nothing */
	if (!tl) return 1;

	/* Clear incoming frees */
	scan_queue(tl, &tl->head, 0);

	/* Hack - ignore pad - and just free as much as possible */
	clear_fast(tl);

	(void) pad;

	/* Always return success */
	return 1;
}



int mallopt(int param, int val)
{
	/* Ignore parameters */
	(void) param;
	(void) val;

	/* Just return success - we don't have any parameters to modify */
	return 1;
}

#endif

#ifdef DEBUG_LEAK
static int btree_print(atls *tl, btree *b)
{
	int i;
	btree *bc;

	int count = 1;

	if (b_leaf(b))
	{
		leak_print(tl, "%u\n", b->s.size);
		return 0;
	}

	leak_print(tl, "Btree: %p\n", (void *) b);

	for (i = b_start(b); i; i = b_next(b, i))
	{
		bc = b_ptr(b, i);
		leak_print(tl, "link %p\n", (void *) bc);
		count += btree_print(tl, bc);
	}

	return count;
}
#endif

#if 0
// #ifndef WINDOWS

static void mem_slab(void)
{
	int i;
	int count;
	dlist *d;

	atls *tl = get_tls();
	if (!tl) return;

	leak_print(tl, "Total Slab Virtual: %llu\n", (unsigned long long) sbrk_size);

	for (i = 0; i < NUM_SB; i++)
	{
		if (dlist_empty(&tl->slab[i])) continue;

		count = 0;
		scan_list(&tl->slab[i], d)
		{
			count++;
		}
		leak_print(tl, "Partial slab %d used: %lld\n", i * 16, count * 65536ULL);
	}

	if (!dlist_empty(&tl->slab_full))
	{
		count = 0;
		scan_list(&tl->slab_full, d)
		{
			count++;
		}
		leak_print(tl, "Full slab used: %lld\n", count * 65536ULL);
	}

	if (tl->slab_chunk)
	{
		leak_print(tl, "Local free slabs: %lld\n", (tl->slab_chunk->count + 1) * 65536LL);
	}
	else
	{
		leak_print(tl, "Local free slabs: 0\n");
	}
}

#ifdef DEBUG_LEAK
static void mem_big(void)
{
	int i;

	/* If vsnprintf allocates, we may have a problem... */
	mutex_lock(&l_lock);

	for (i = 0; i < LEAK_MAX; i++)
	{
		if (big_leak[i].p)
		{
			leak_print(get_tls(), "big block %p: %llu\n", big_leak[i].p, (unsigned long long) big_leak[i].size);
		}
	}

	mutex_unlock(&l_lock);
}
#endif

static void malloc_stats_aux(int show_nodes)
{
	atls *tl = get_tls();

	dlist *d;
	btree *b;

	size_t size;
	size_t tsize = 0;
	size_t asize = 0;

	/* Nothing allocated - print nothing */
	if (!tl) return;

	clear_fast(tl);

	scan_list(&tl->bl, d)
	{
		mealloc *m = list_entry(mealloc, m_list, d);

		size = big_block_size(m);

		if (size)
		{
			leak_print(tl, "Block: %p %llu\n", (void *) m, (unsigned long long) size);
		}

		/* Scan seps for this block */
		for (b = &m->b;; b = shift(b, size))
		{
			if (b->s.bs_offset & FLG_SIZE8)
			{
				size = 16;
			}
			else
			{
				size = b->s.size;
			}

			if (!size) break;

			tsize += size;

			if (un_used(b))
			{
				if (show_nodes) leak_print(tl, "  %p\n", (void *) size);
			}
			else
			{
				if (show_nodes) leak_print(tl, "* %p\n", (void *) size);
				asize += size;
			}
		}
	}

	leak_print(tl, "Total in btree %llu, total alloced %llu\n", (unsigned long long) tsize, (unsigned long long) asize);

#ifdef DEBUG_LEAK
	if (show_nodes & 2)
	{
		int count = btree_print(tl, &tl->bheap);

		leak_print(tl, "b_cnt = %d, b_hgt = %d, total = %d\n", tl->b_cnt, tl->b_hgt, count);
	}

	mutex_lock(&h_lock);
	size = 0;
	scan_list(&h_list, d)
	{
		size++;
	}
	mutex_unlock(&h_lock);
	leak_print(tl, "%d threads\n", (int) size);

	mem_big();
#endif
	mem_slab();
}

void malloc_stats(void)
{
	malloc_stats_aux(0);
}
#endif

static void **ialloc_fallback(atls *tl, size_t n, size_t *sizes, void **chunks, int clear)
{
	size_t i;
	void **out;

	/* Get storage for pointers */
	if (!chunks)
	{
		out = local_alloc(tl, sep_align(sizeof(void *) * n));
		if (!out) return NULL;
	}
	else
	{
		out = chunks;
	}

	/* Do it manually */
	if (clear)
	{
		for (i = 0; i < n; i++)
		{
			out[i] = zalloc(tl, sizes[0]);
			if (!out[i]) goto fail;
		}
	}
	else
	{
		for (i = 0; i < n; i++)
		{
			out[i] = local_alloc(tl, sizes[i]);
			if (!out[i]) goto fail;
		}
	}

	return out;

fail:
	for (n = 0; n < i; n++)
	{
		PREFIX(free)(out[n]);
	}

	if (!chunks) PREFIX(free)(out);

	return NULL;
}

static void **ialloc(atls *tl, size_t n, size_t *sizes, void **chunks, int clear)
{
	size_t i;

	size_t nsize;
	size_t total_size = 0;

	void *p;
	btree *b, *br;
	unsigned offset;

	void **out;

	test_all(tl);

	test_leak();

	/* Zero sized array? */
	if (!n)
	{
		if (chunks) return chunks;

		return PREFIX(malloc)(0);
	}

	/* Get total size to allocate */
	if (clear)
	{
		total_size = safemul(sep_align(sizes[0]), n);

		/* Overflow */
		if (total_size >= TOP_SIZE)
		{
			set_enomem();
			return NULL;
		}
	}
	else
	{
		for (i = 0; i < n; i++)
		{
			nsize = sep_align(sizes[i]);
			total_size += nsize;

			/* Overflow */
			if (total_size < nsize)
			{
				set_enomem();
				return NULL;
			}
		}
	}

	if (clear) tl->callocable = 0;

	while (1)
	{
		p = fast_alloc(tl, total_size);
		if (p) break;

		if (total_size < QS_MAX) p = slow_alloc(tl, total_size);
		if (p) break;

		/* Too large to allocate normally */
		if (total_size >= BTMALLOC) return ialloc_fallback(tl, n, sizes, chunks, clear);

		/* Clear fast lists */
		clear_fast(tl);

		/* Try to alloc on the btree */
		b = btree_get(tl, total_size);
		if (b)
		{
			p = split_node(tl, b, b->s.size, total_size);
			break;
		}

		/* Try to grab space from a dead thread */
		if (reap_dead(tl)) continue;

		/* We need more space, so try to free memory. */
		if (scan_queue(tl, &tl->head, total_size)) continue;

		/* Try to allocate a new block */
		p = block_alloc_aux(tl, total_size);
		if (p) break;

		/* Everything failed - fall back to individual allocations */
		return ialloc_fallback(tl, n, sizes, chunks, clear);
	}

	b = CONTAINER(btree, data, p);

	/* Get real total size */
	total_size = b->s.size;
	offset = b->s.bs_offset & ~15;

	/* Do we need to clear it? */
	if (clear && !tl->callocable) memset(p, 0, total_size - 8);

	/* Get storage for pointers */
	if (!chunks)
	{
		out = local_alloc(tl, sep_align(sizeof(void *) * n));

		if (!out)
		{
			PREFIX(free)(p);
			return NULL;
		}
	}
	else
	{
		out = chunks;
	}

	for (i = 0; i < n; i++)
	{
		out[i] = p;

		if (clear)
		{
			nsize = sep_align(sizes[0]);
		}
		else
		{
			nsize = sep_align(sizes[i]);
		}
		total_size -= nsize;

		/* Update local size */
		b->s.size = nsize;

		p = shift(p, nsize);
		br = CONTAINER(btree, data, p);

		/* Create offset part of right seperator */
		offset += nsize;
		if (i != n - 1) br->s.bs_offset = offset;

		b = br;
	}

	/* Nothing left - then we are done */
	if (!total_size)
	{
		test_all(tl);
		return out;
	}

	/* Resize last element to have the slack */
	p = out[n - 1];
	b = CONTAINER(btree, data, p);

	b->s.size += total_size;
	check_sep(b);

	/* How big is last allocation? */
	if (clear)
	{
		nsize = sep_align(sizes[0]);
	}
	else
	{
		nsize = sep_align(sizes[n - 1]);
	}

	/* Split off excess if too much */
	split_node(tl, b, b->s.size, nsize);

	test_all(tl);
	return out;
}

#if 0
static noinline void **ialloc_aux(size_t n, size_t *sizes, void **chunks, int clear)
{
	atls *tl = init_tls();
	if (!tl) return NULL;

	return ialloc(tl, n, sizes, chunks, clear);
}

void **independent_calloc(size_t n, size_t size, void **chunks)
{
	atls *tl = get_tls();

	if (!tl) return ialloc_aux(n, &size, chunks, 1);

	return ialloc(tl, n, &size, chunks, 1);
}

void **independent_comalloc(size_t n, size_t *sizes, void **chunks)
{
	atls *tl = get_tls();

	if (!tl) return ialloc_aux(n, sizes, chunks, 0);

	return ialloc(tl, n, sizes, chunks, 0);
}
#endif
//#ifndef WINDOWS
#if 0
void *malloc_get_state(void)
{
	abort();

	return NULL;
}

int malloc_set_state(void *p)
{
	(void) p;
	abort();

	return 0;
}
#endif


#ifdef WINDOWS
#ifdef PREFIX
void *PREFIX(_expand)(void *p, size_t size)
#else /* PREFIX */
__attribute__((dllimport)) void *_expand(void *p, size_t size)
#endif /* PREFIX */
{
	DECL_PROF_FUNC;

	atls *tl = get_tls();

	/* paranoia */
	if (!p) return NULL;

	/* Does the request already fit in the usable size? */
	if (malloc_usable_size(p) >= size) return p;

	/* Don't handle slab allocations */
	if (is_slab(p)) return NULL;

	/* Cannot expand a block created by someone else */
	if (!tl) goto nomem;

	p = realloc_aux2(p, size, tl);

	/* Did it work? */
	if (malloc_usable_size(p) >= size) return p;

nomem:
	set_enomem();
	return NULL;
}

/* Nolock functions call their normal functions */
void PREFIX(_free_nolock)(void *p)
{
	PREFIX(free)(p);
}

void *PREFIX(_realloc_nolock)(void *p, size_t size)
{
	return PREFIX(realloc)(p, size);
}

void *PREFIX(_calloc_nolock)(size_t n, size_t size)
{
	return PREFIX(calloc)(n, size);
}

size_t PREFIX(_msize_nolock)(void *p)
{
	return malloc_usable_size(p);
}

#endif