libmalloc源码分析之nanozone_s的处理

摘要

根据申请内存的大小不同,libmalloc会从不同的堆上空间将内存分配给申请者,而最小的一个一档,是从nano_zone_s中,通过nano_*函数获取堆上已分配的内存。

nano_zone_s

nano_zone_s的数据结构如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
typedef struct nanozone_s {				// vm_allocate()'d, so page-aligned to begin with.
malloc_zone_t basic_zone; // first page will be given read-only protection
uint8_t pad[PAGE_MAX_SIZE - sizeof(malloc_zone_t)];

// remainder of structure is R/W (contains no function pointers)
// page-aligned
struct nano_meta_s meta_data[NANO_MAG_SIZE][NANO_SLOT_SIZE]; // max: NANO_MAG_SIZE cores x NANO_SLOT_SIZE slots for nano blocks {16 .. 256}
_malloc_lock_s band_resupply_lock[NANO_MAG_SIZE];
uintptr_t band_max_mapped_baseaddr[NANO_MAG_SIZE];
size_t core_mapped_size[NANO_MAG_SIZE];

unsigned debug_flags;
unsigned our_signature;
unsigned phys_ncpus;
unsigned logical_ncpus;
unsigned hyper_shift;

/* security cookie */
uintptr_t cookie;

/*
* The nano zone constructed by create_nano_zone() would like to hand off tiny, small, and large
* allocations to the default scalable zone. Record the latter as the "helper" zone here.
*/

malloc_zone_t *helper_zone;
} nanozone_t;

libmalloc源码分析之初始化中做过简单的介绍,这个数据结构中,主要需要理解的是几个宏和一个数据结构。

一些需要理解的宏定义

NANO_MAX_SIZE

1
2
#define NANO_MAX_SIZE			256 
/* Buckets sized {16, 32, 48, 64, 80, 96, 112, ...} */

定义了nano_zone中可以申请的内存块的最大值为256

SLOT_IN_BAND_SIZE

1
2
#define NANO_OFFSET_BITS		17	
#define SLOT_IN_BAND_SIZE (1 << NANO_OFFSET_BITS)

SLOT_IN_BAND_SIZE的值为2^17==128*1024,所以SLOT_IN_BAND_SIZE的值等于128KB。用来表示,在每一个nano内存分配的BAND之中,一个每一个SLOT的大小是128KB

BAND_SIZE

1
2
3
4
#define NANO_SLOT_BITS			4
#define NANO_OFFSET_BITS 17
#define BAND_SIZE (1 << (NANO_SLOT_BITS + NANO_OFFSET_BITS))
/* == Number of bytes covered by a page table entry */

BAND_SIZE的值为2^21==2*1024*1024,所以BAND_SIZE的值等于2MB。用来表示,在每一个nano_zone_t的分配中,每一个BAND的大小是2MB。

NANO_MAG_SIZE

1
2
#define NANO_MAG_BITS            5
#define NANO_MAG_SIZE (1 << NANO_MAG_BITS)

NANO_MAG_SIZE的值为2^5==32,用来最多支持多少个核。

NANO_SLOT_SIZE

1
2
#define NANO_SLOT_BITS			4
#define NANO_SLOT_SIZE (1 << NANO_SLOT_BITS)

表示一共有2^4,也就是16种SLOT的大小。分别是16,32,48,64,…,256。

nano_meta_s

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
typedef struct nano_meta_s {
OSQueueHead slot_LIFO CACHE_ALIGN;
//用于重复利用释放内存的队列
unsigned int slot_madvised_log_page_count;
volatile uintptr_t slot_current_base_addr;
//slot的基址
volatile uintptr_t slot_limit_addr;
//slot可以分配的地址最大值
volatile size_t slot_objects_mapped;
//slot中已经映射了多少个objects
volatile size_t slot_objects_skipped;
//slot开始的时候会跳过多少个object开始使用
bitarray_t slot_madvised_pages;
volatile uintptr_t slot_bump_addr CACHE_ALIGN;
//slot当前开始分配的地址
// position on cache line distinct from that of slot_LIFO
volatile boolean_t slot_exhausted;
// slot是否已经用尽
unsigned int slot_bytes;
// 该slot中存储的bytes是多少(16、32、48...)
unsigned int slot_objects;
// 该slot中存储了多少个object(slot总内存/slot_bytes)
} *nano_meta_admin_t;

这个数据结构就是nano_zone_s中的meta_data的数据结构,具体用来描述每一个slot的状态和属性。

nano_malloc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
void *
malloc(size_t size) {

void *retval;
retval = malloc_zone_malloc(inline_malloc_default_zone(), size);
if (retval == NULL) {
errno = ENOMEM;
}
return retval;
}

void *
malloc_zone_malloc(malloc_zone_t *zone, size_t size) {

void *ptr;
/*...*/
ptr = zone->malloc(zone, size);
/*...*/
return ptr;
}

static void *
nano_malloc(nanozone_t *nanozone, size_t size)
{

if (size <= NANO_MAX_SIZE) {
void *p = _nano_malloc_check_clear(nanozone, size, 0);
if (p) {
return p;
} else {
/* FALLTHROUGH to helper zone */
}
}

malloc_zone_t *zone = (malloc_zone_t *)(nanozone->helper_zone);
return zone->malloc(zone, size);
}

因为默认的zone是使用nano_zone_t,在malloc_zone_malloc中的zone->malloc()函数被调用之后,在

nano_malloc会依据申请的size进行判断,当只有当size小于NANO_MAX_SIZE的时候才会调用nano_malloc的内部实现_nano_malloc_check_clear,否则就通过nanozone->helper_zonemalloc函数,进一步判断使用tinysmall、还是large

第一次调用_nano_malloc_check_clear

所以需要重点分析的函数是_nano_malloc_check_clear。而第一次调用malloc的话,主要流程在segregated_next_block中实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
static void 
_nano_malloc_check_clear(nanozone_t *nanozone, size_t size, boolean_t cleared_requested)
{
void *ptr;
unsigned int slot_key;
unsigned int slot_bytes = segregated_size_to_fit(nanozone, size, &slot_key); // Note slot_key is set here
//SLOT_BYTES是固定的16、32、64、128、、、、
//slot_key 1~16
unsigned int mag_index = NANO_MAG_INDEX(nanozone);
//获取cpu对应的index

//选择cpu以及对应的内存大小块
nano_meta_admin_t pMeta = &(nanozone->meta_data[mag_index][slot_key]);

//检测是否存在已经释放过,可以直接拿来用的内存
ptr = OSAtomicDequeue( &(pMeta->slot_LIFO), offsetof(struct chained_block_s,next));
if (ptr) {
/*...*/
//如果队列中存在函数,则直接使用队列中的数据
//第一次调用malloc时,不会执行这一块代码,所以后面再另做分析
} else {
//没有释放过的内存,所以调用函数 获取内存
ptr = segregated_next_block(nanozone, pMeta, slot_bytes, mag_index);
}

if (cleared_requested && ptr)
memset(ptr, 0, slot_bytes); // TODO: Needs a memory barrier after memset to ensure zeroes land first?

return ptr;
}

segregated_size_to_fit

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
input : size = 0
size = NANO_REGIME_QUANTA_SIZE==>(1<<4)==>2^4==>16
k = (16 + 16 - 1) >> 4 ==> 31>>4 ==> 1
slot_bytes = 1 << 4 ==> 16
*pkey = 0

input : size = 16
size = 16
k = (16+16-1) >> 4 ==>1
slot_bytes = 16
*pkey = 0

input : size = 32
size : 32
k = (32+16-1)>>4 ==>2
slot_bytes = 2<<4 = 32
*pkey = 2-1 ==>1

0~16 :
pkey ==> 0
slot_bytes ==> 16B

17~32
pkey ==> 1
slot_bytes ==> 32B

256
pkey ==> 16
slot_bytes ==> 256B
*/

static INLINE unsigned int
segregated_size_to_fit(nanozone_t *nanozone, size_t size, unsigned int *pKey)
{

unsigned int k, slot_bytes;

if (0 == size)
size = NANO_REGIME_QUANTA_SIZE; // Historical behavior

k = (size + NANO_REGIME_QUANTA_SIZE - 1) >> SHIFT_NANO_QUANTUM; // round up and shift for number of quanta
slot_bytes = k << SHIFT_NANO_QUANTUM; // multiply by power of two quanta size
*pKey = k - 1; // Zero-based!

return slot_bytes;
}

该函数实现的功能是根据申请的size大小,计算到相应的需要申请的相应的slot_bytes。例如申请的大小是20的话,slot_bytes就应当是32。

初见segregated_next_block

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

static INLINE void *
segregated_next_block(nanozone_t *nanozone, nano_meta_admin_t pMeta, unsigned int slot_bytes, unsigned int mag_index)
{

while (1) {
uintptr_t theLimit = pMeta->slot_limit_addr; // Capture the slot limit that bounds slot_bump_addr right now
//pMeta->slot_limit_addr
//当前这块pMeta可用内存的结束地址。
uintptr_t b = OSAtomicAdd64Barrier(slot_bytes, (volatile int64_t *)&(pMeta->slot_bump_addr));
//原子的为pMeta->slot_bump_addr添加slot_bytes的长度,偏移到下一个地址
b -= slot_bytes; // Atomic op returned addr of *next* free block. Subtract to get addr for *this* allocation.
//减去添加的偏移量,获取当前可以获取的地址
if (b < theLimit) { // Did we stay within the bound of the present slot allocation?
//如果地址还在范围之内,则返回地址
//todo:b不仅小于thelimit且远远小于thelimit,就可以获取一个就指向其他内存块的地址了。
return (void *)b; // Yep, so the slot_bump_addr this thread incremented is good to go
} else {
if (pMeta->slot_exhausted) { // exhausted all the bands availble for this slot?
return 0; // We're toast
} else {
// One thread will grow the heap, others will see its been grown and retry allocation
_malloc_lock_lock(&nanozone->band_resupply_lock[mag_index]);
// re-check state now that we've taken the lock
if (pMeta->slot_exhausted) {
_malloc_lock_unlock(&nanozone->band_resupply_lock[mag_index]);
return 0; // Toast
} else if (b < pMeta->slot_limit_addr) {
_malloc_lock_unlock(&nanozone->band_resupply_lock[mag_index]);
continue; // ... the slot was successfully grown by first-taker (not us). Now try again.
} else if (segregated_band_grow(nanozone, pMeta, slot_bytes, mag_index)) {
_malloc_lock_unlock(&nanozone->band_resupply_lock[mag_index]);
continue; // ... the slot has been successfully grown by us. Now try again.
} else {
pMeta->slot_exhausted = TRUE;
_malloc_lock_unlock(&nanozone->band_resupply_lock[mag_index]);
return 0;
}
}
}
}
}

因为是第一次调用segregated_next_block函数,theLimit的值为0,b的值也为0。

又因为pMeta->slot_exhausted的值也是0,且pMeta->slot_limit_addr的值也为0,所以会调用

segregated_band_grow(nanozone, pMeta, slot_bytes, mag_index)

该函数的实现非常的有技巧性,所以不只有初始化nano_zoneband的功能,在后面的分析还会遇到,遇到之后再做详细分析。

获得第一个BAND

segregated_band_grow函数实现了2个功能。

  • BAND用尽后的增长
  • 以及一个特殊情况,创建第一个BAND
什么是BAND

每一个BAND的大小为2MB,是nano_zone_t向堆申请内存的单位,每一个独立的cpu每次申请一个BAND,当一个BAND用完之后,会再申请一个BAND。每一个cpu的BAND是分开使用的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

static boolean_t
segregated_band_grow(nanozone_t *nanozone, nano_meta_admin_t pMeta, unsigned int slot_bytes, unsigned int mag_index)
{

nano_blk_addr_t u; // the compiler holds this in a register
uintptr_t p, s;
size_t watermark, hiwater;

if (0 == pMeta->slot_current_base_addr) { // First encounter?

u.fields.nano_signature = NANOZONE_SIGNATURE;
u.fields.nano_mag_index = mag_index;
u.fields.nano_band = 0;
u.fields.nano_slot = (slot_bytes >> SHIFT_NANO_QUANTUM) - 1;
u.fields.nano_offset = 0;

p = u.addr;
//使用union计算地址是多少
pMeta->slot_bytes = slot_bytes;
pMeta->slot_objects = SLOT_IN_BAND_SIZE / slot_bytes; //128KB/slot_bytes==>有多少个分片。
} else {
p = pMeta->slot_current_base_addr + BAND_SIZE; // Growing, so stride ahead by BAND_SIZE 1<<21 ==> 2^21
//再slot_current_base_addr再增加2
u.addr = (uint64_t)p;
if (0 == u.fields.nano_band) // Did the band index wrap?
return FALSE;

assert(slot_bytes == pMeta->slot_bytes);
}
pMeta->slot_current_base_addr = p;

//band_size大小是2MB
//根据当前slot_current_base_addr,计算得到band的开始地址,用于内存申请。
mach_vm_address_t vm_addr = p & ~((uintptr_t)(BAND_SIZE - 1)); // Address of the (2MB) band covering this (128KB) slot

if (nanozone->band_max_mapped_baseaddr[mag_index] < vm_addr) {
// Obtain the next band to cover this slot
// 申请2MB的空间用于这个band
kern_return_t kr = mach_vm_map(mach_task_self(), &vm_addr, BAND_SIZE,
0, VM_MAKE_TAG(VM_MEMORY_MALLOC_NANO), MEMORY_OBJECT_NULL, 0, FALSE,
VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
/* ...*/
nanozone->band_max_mapped_baseaddr[mag_index] = vm_addr;
}

// Randomize the starting allocation from this slot (introduces 11 to 14 bits of entropy)
if (0 == pMeta->slot_objects_mapped) { // First encounter?
//SLOT_IN_BAND_SIZE / slot_bytes 这个slot会被分成多少份。
pMeta->slot_objects_skipped = (malloc_entropy[1] % (SLOT_IN_BAND_SIZE / slot_bytes));
//通过取余在内存开始处空出几个slot_bytes不用。
pMeta->slot_bump_addr = p + (pMeta->slot_objects_skipped * slot_bytes);
} else {
pMeta->slot_bump_addr = p;
}

pMeta->slot_limit_addr = p + (SLOT_IN_BAND_SIZE / slot_bytes) * slot_bytes; //p+128KB
pMeta->slot_objects_mapped += (SLOT_IN_BAND_SIZE / slot_bytes); //128KB/slot_bytes

u.fields.nano_signature = NANOZONE_SIGNATURE;
u.fields.nano_mag_index = mag_index;
u.fields.nano_band = 0;
u.fields.nano_slot = 0;
u.fields.nano_offset = 0;
s = u.addr; // Base for this core.

// Set the high water mark for this CPU's entire magazine, if this resupply raised it.
watermark = nanozone->core_mapped_size[mag_index];
hiwater = MAX( watermark, p - s + SLOT_IN_BAND_SIZE );
nanozone->core_mapped_size[mag_index] = hiwater;

return TRUE;
}
获取pMeta->slot_current_base_addr

第一次调用segregated_band_grow,因为0 == pMeta->slot_current_base_addr,所以进行初始化,利用

nano_blk_addr_t这个结构,计算当前的slot_current_base_addr的地址是多少。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
struct nano_blk_addr_s {
uint64_t
nano_offset:NANO_OFFSET_BITS, // locates the block
nano_slot:NANO_SLOT_BITS, // bucket of homogenous quanta-multiple blocks
nano_band:NANO_BAND_BITS,
nano_mag_index:NANO_MAG_BITS, // the core that allocated this block
nano_signature:NANO_SIGNATURE_BITS; // 0x00006nnnnnnnnnnn the address range devoted to us.
};
#endif

typedef union {
uint64_t addr;
struct nano_blk_addr_s fields;
} nano_blk_addr_t;

/*...*/
u.fields.nano_signature = NANOZONE_SIGNATURE;
u.fields.nano_mag_index = mag_index;
u.fields.nano_band = 0;
u.fields.nano_slot = (slot_bytes >> SHIFT_NANO_QUANTUM) - 1;
u.fields.nano_offset = 0;

p = u.addr;
/*...*/

先根据不同的mag_indexslot_bytes填写union中的fileds。因为union使用不同的数据类型范围同一块内存。所以用u.addr来获取前面填入的数据,就得到了当前的slot_current_base_addr

通过一段简单的代码来验证一下。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
nano_blk_addr_t u;
uintptr_t p;
printf("mag,band,slot,addr");
int imag=0;
int iband=0 ;
int islot_bytes=0;
for (imag = 0;imag!=32;imag++) {
for (iband = 0 ; iband!=2 ; iband++) {
for (islot_bytes=16;islot_bytes!=256+16;islot_bytes = islot_bytes + 16) {
u.fields.nano_signature = NANOZONE_SIGNATURE;
u.fields.nano_mag_index = imag;
u.fields.nano_band = iband;
u.fields.nano_slot = (islot_bytes >> SHIFT_NANO_QUANTUM) - 1;
u.fields.nano_offset = 0;

p = u.addr;
//mach_vm_address_t vm_addr = p & ~((uintptr_t)(BAND_SIZE - 1));
printf("\n%d,%d,%d,%llx",imag,iband,islot_bytes,u.addr);

}

}
}

打印的结果太多了,这里就整理出比较有特征的几个结果,用作分析。

mag band slot addr
0 0 16 0x600000000000
0 0 32 0x600000020000
0 0 256 0x6000001e0000
0 1 16 0x600000200000
0 1 256 0x6000003e0000
1 0 16 0x608000000000
1 1 16 0x608000200000

表-1

可以明显看出有一下几个特性

  • 所有的band基址是从0x600000000000开始的。
  • 每一个cpu单独使用一个bandmag不同的时候,band号为0,slot_bytes为16时,两个addr不同。
  • 同一个cpu的一个band用完之后,线性的申请下一个band
  • 相同的band,即band号相同时,0x200000(2MB)的空间,被划分为0x10(16)个,大小为0x2000(128KB)的slot。且为线性分布。

而不是第一次调用的情况则十分简单,直接在slot_current_base_addr的基础上添加BAND_SIZE,也就是2MB(0x200000)获取新的slot_current_base_addr

例如:

band为0,slot_bytes为256时,新的slot_current_base_addr

0x6000001e0000+0x200000=0x6000003e0000

与表-1中结果一致。

余下的代码比较好理解,就是根据计算出来的地址,想内核申请内存,且根据相关数据填写pMeta的的数据,在下一次调用malloc时,非常方便的从已经处理好的pMeta中获取内存空间。再根据pMeta中相关的数据,计算得到需要返回给调用者的指针。

nano_free

free的操作,相对简单,就是将不用的内存,添加到每一个pMeta的队列之中,而每一个队列,在malloc时,如果发现需要使用的队列中存在元素,则优先使用队列中已经释放的内存。

逻辑相对简单,就不对源码做出分析了,有兴趣的读者可以自行阅读源码。相信在了解了上面流程之后,不是一件难的事情。

小结

nano的内存分配逻辑相对简单,和之前分析过的linux堆内存的分配策略是不同的,在所申请的内存范围很小的时候,可以加快申请的速度,并且提高内存的复用。

文章目录
  1. 1. 摘要
  2. 2. nano_zone_s
    1. 2.1. 一些需要理解的宏定义
      1. 2.1.1. NANO_MAX_SIZE
      2. 2.1.2. SLOT_IN_BAND_SIZE
      3. 2.1.3. BAND_SIZE
      4. 2.1.4. NANO_MAG_SIZE
      5. 2.1.5. NANO_SLOT_SIZE
    2. 2.2. nano_meta_s
  3. 3. nano_malloc
    1. 3.0.1. 第一次调用_nano_malloc_check_clear
      1. 3.0.1.1. segregated_size_to_fit
      2. 3.0.1.2. 初见segregated_next_block
      3. 3.0.1.3. 获得第一个BAND
        1. 3.0.1.3.1. 什么是BAND
        2. 3.0.1.3.2. 获取pMeta->slot_current_base_addr
  • 4. nano_free
  • 5. 小结
  • ,