2016-03-30

OSX内核加载mach-o流程分析

0x00 摘要

研究OS X安全方面的知识，需要对mach-o加载的流程有一个比较完整的理解。断断续续一个月的时间里，通过对源码的阅读，对mach-o的加载有了一个比较基本的认识，这样在遇到各个具体的问题时才能更好地理解和操作。

其他相关文章可以看这里，基本涵盖了从内核态到应用层的相关源码的简单分析。还有不足之处在遇到相关的问题时也会加到这一系列文章中。

1.mach-o加载流程学习-dyld对主image的处理流程

2.mach-o加载流程学习-dyld对依赖库的加载流程

3.mach-o加载流程学习-内核对mach-o文件的加载流程(本文)

通过一张图片，可以比较清楚的理解整个流程。

整体流程

0x01 源码分析

1.1 __mac_execve

/*
 * __mac_execve
 *
 * Description:	Excerpt of the execve-with-MAC-label system call entry point.
 *		Packs the user-supplied path, argv and envp into an
 *		image_params block and hands it to exec_activate_image(),
 *		which contains the main logic.
 *
 * Parameters:	p			Process issuing the call
 *		uap			User arguments (fname, argp, envp, mac_p)
 *		retval			Not referenced in the portion shown
 *
 * Returns:	0			Success
 *		ENOMEM			The scratch buffer could not be allocated
 *		ENOEXEC			No image activator claimed the file
 *		!0			Error from mac_execve_enter() or
 *					exec_activate_image()
 *
 * Note:	Part of the original body is elided (see the ellipsis marker
 *		near the end). The exit_with_error label used below is not
 *		visible in this excerpt; presumably it lives in the elided
 *		portion.
 */
int
__mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
{
	char *bufp = NULL;
	struct image_params *imgp;
	struct vnode_attr *vap;
	struct vnode_attr *origvap;
	int error;
	int is_64 = IS_64BIT_PROCESS(p);
	struct vfs_context context;
	struct uthread	*uthread;

	/* Initialize the VFS context: current thread plus a referenced credential */
	context.vc_thread = current_thread();
	context.vc_ucred = kauth_cred_proc_ref(p);	/* XXX must NOT be kauth_cred_get() */

	/*
	 * Allocate a big chunk for locals instead of using the stack, since
	 * these structures are pretty big.  A single contiguous allocation
	 * holds imgp, vap and origvap back to back.
	 */
	MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO);
	imgp = (struct image_params *) bufp;
	if (bufp == NULL) {
		/*
		 * Drop the credential reference taken above before bailing
		 * out, exactly as the MACF error path below does before it
		 * jumps to the same label.
		 */
		kauth_cred_unref(&context.vc_ucred);
		error = ENOMEM;
		goto exit_with_error;
	}

	/*
	 * Carve the two vnode_attr structures out of the same buffer by
	 * offsetting past the preceding structure(s).
	 */
	vap = (struct vnode_attr *) (bufp + sizeof(*imgp));
	origvap = (struct vnode_attr *) (bufp + sizeof(*imgp) + sizeof(*vap));

	/* Initialize the common data in the image_params structure */
	imgp->ip_user_fname = uap->fname;
	imgp->ip_user_argv = uap->argp;
	imgp->ip_user_envv = uap->envp;
	imgp->ip_vattr = vap;
	imgp->ip_origvattr = origvap;
	imgp->ip_vfs_context = &context;
	imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE);
	imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
	imgp->ip_mac_return = 0;

	/* Record whether the calling thread is executing on behalf of a vfork() child */
	uthread = get_bsdthread_info(current_thread());
	if (uthread->uu_flag & UT_VFORK) {
		imgp->ip_flags |= IMGPF_VFORK_EXEC;
	}

	/*
	 * MAC (Mandatory Access Control) handling, related to the process'
	 * privileges.  Background: https://www.freebsd.org/doc/handbook/mac.html
	 */
#if CONFIG_MACF
	if (uap->mac_p != USER_ADDR_NULL) {
		error = mac_execve_enter(uap->mac_p, imgp);
		if (error) {
			kauth_cred_unref(&context.vc_ucred);
			goto exit_with_error;
		}
	}
#endif

	/* Activate (execute) the image */
	error = exec_activate_image(imgp);

	/* Release resources and handle errors */
	kauth_cred_unref(&context.vc_ucred);

	/* Image not claimed by any activator? */
	if (error == -1)
		error = ENOEXEC;
	/*...*/
	return(error);
}

主要就是进行了一些数据结构的初始化以及权限的判断，资源的获取与释放，主要逻辑在exec_activate_image中。

1.2 exec_activate_image

/*
 * exec_activate_image
 *
 * Description:	Iterate through the available image activators, and activate
 *		the image associated with the imgp structure.  We start with
 *		the
 *
 * Parameters:	struct image_params *	Image parameter block
 *
 * Returns:	0			Success
 *		EBADEXEC		The executable is corrupt/unknown
 *	execargs_alloc:EINVAL		Invalid argument
 *	execargs_alloc:EACCES		Permission denied
 *	execargs_alloc:EINTR		Interrupted function
 *	execargs_alloc:ENOMEM		Not enough space
 *	exec_save_path:EFAULT		Bad address
 *	exec_save_path:ENAMETOOLONG	Filename too long
 *	exec_check_permissions:EACCES	Permission denied
 *	exec_check_permissions:ENOEXEC	Executable file format error
 *	exec_check_permissions:ETXTBSY	Text file busy [misuse of error code]
 *	exec_check_permissions:???
 *	namei:???
 *	vn_rdwr:???			[anything vn_rdwr can return]
 *	<ex_imgact>:???			[anything an imgact can return]
 */
static int
exec_activate_image(struct image_params *imgp)
{
	struct nameidata *ndp = NULL;
	const char *excpath;
	int error;
	int resid;
	int once = 1;	/* save SGUID-ness for interpreted files */
	int i;
	int itercount = 0;
	proc_t p = vfs_context_proc(imgp->ip_vfs_context);

	error = execargs_alloc(imgp);
	if (error)
		goto bad_notrans;
	
	error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg, &excpath);
	if (error) {
		goto bad_notrans;
	}

	/* Use excpath, which contains the copyin-ed exec path */
	DTRACE_PROC1(exec, uintptr_t, excpath);

	MALLOC(ndp, struct nameidata *, sizeof(*ndp), M_TEMP, M_WAITOK | M_ZERO);
	if (ndp == NULL) {
		error = ENOMEM;
		goto bad_notrans;
	}

	NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1,
		   UIO_SYSSPACE, CAST_USER_ADDR_T(excpath), imgp->ip_vfs_context);

again:
	error = namei(ndp); //todo:	详细流程先不看，研究下来感觉是路径的搜索
	if (error)
		goto bad_notrans;
	imgp->ip_ndp = ndp;	/* successful namei(); call nameidone() later */
	imgp->ip_vp = ndp->ni_vp;	/* if set, need to vnode_put() at some point */

	/*
	 * Before we start the transition from binary A to binary B, make
	 * sure another thread hasn't started exiting the process.  We grab
	 * the proc lock to check p_lflag initially, and the transition
	 * mechanism ensures that the value doesn't change after we release
	 * the lock.
	 */
	proc_lock(p);
	if (p->p_lflag & P_LEXIT) {
		proc_unlock(p);
		goto bad_notrans;
	}
	error = proc_transstart(p, 1, 0);
	proc_unlock(p);
	if (error)
		goto bad_notrans;

	error = exec_check_permissions(imgp);
	if (error)
		goto bad;

	/* Copy; avoid invocation of an interpreter overwriting the original */
	if (once) {
		once = 0;
		*imgp->ip_origvattr = *imgp->ip_vattr;
	}
	//读取数据到内存中
	error = vn_rdwr(UIO_READ, imgp->ip_vp, imgp->ip_vdata, PAGE_SIZE, 0,
			UIO_SYSSPACE, IO_NODELOCKED,
			vfs_context_ucred(imgp->ip_vfs_context),
			&resid, vfs_context_proc(imgp->ip_vfs_context));
	if (error)
		goto bad;

	if (resid) {
		memset(imgp->ip_vdata + (PAGE_SIZE - resid), 0x0, resid);
	}

  	//到这里之前的代码主要做了两件事情
  	//1.根据路径查找文件
  	//2.将文件拷贝到内存中
encapsulated_binary:
	/* Limit the number of iterations we will attempt on each binary */
	if (++itercount > EAI_ITERLIMIT) {
		error = EBADEXEC;
		goto bad;
	}
	error = -1;
	for(i = 0; error == -1 && execsw[i].ex_imgact != NULL; i++) {
		//这里对macho文件进行了解析
		error = (*execsw[i].ex_imgact)(imgp);	//todo:调用了一个指针函数，exec_mach_imgact
      	//总共有三种函数
      	/*
        struct execsw {
		int (*ex_imgact)(struct image_params *);
		const char *ex_name;
		} execsw[] = {
		{ exec_mach_imgact,		"Mach-o Binary" },
		{ exec_fat_imgact,		"Fat Binary" },
		{ exec_shell_imgact,		"Interpreter Script" },
		{ NULL, NULL}
};
*/
      	//分别是osx支持的三种不同的可执行文件

		switch (error) {
            /*出错处理*/
        }
	}

	/*
	 * Call out to allow 3rd party notification of exec. 
	 * Ignore result of kauth_authorize_fileop call.
	 */
	if (error == 0 && kauth_authorize_fileop_has_listeners()) {
		kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context),
					KAUTH_FILEOP_EXEC,
					(uintptr_t)ndp->ni_vp, 0);
	}

bad:
	proc_transend(p, 0);

bad_notrans:
	if (imgp->ip_strings)
		execargs_free(imgp);
	if (imgp->ip_ndp)
		nameidone(imgp->ip_ndp);
	if (ndp)
		FREE(ndp, M_TEMP);

	return (error);
}

这个函数主要做的事情就是寻找并拷贝可执行文件到内存中，并且根据可执行文件的类型调用不同的解析函数。osx总共支持三种可执行文件。他们各自有对应的处理函数。

mach-o：exec_mach_imgact
Fat Binary：exec_fat_imgact
Interpreter Script：exec_shell_imgact

1.3 exec_mach_imgact

/*
 * exec_mach_imgact
 *
 * Image activator for mach-o 1.0 binaries.
 *
 * Summary of what the code below does: validates the Mach-O header (magic,
 * file type, CPU type), copies in the argument/environment strings, creates
 * the child thread for the vfork case, maps the image via load_machfile(),
 * applies code-signing and set[ug]id handling, builds the user stack, and
 * finally resets per-process state (signals, AIO, names, DTrace) before
 * marking the process as exec'ed.
 *
 * Parameters;	struct image_params *	image parameter block
 *
 * Returns:	0			Success; also returned when a failure
 *					past the point of no return was turned
 *					into a SIGKILL (non-spawn case)
 *		-1			not a Mach-O executable (keep looking)
 *		-2			Success: encapsulated binary: reread
 *					[listed by the original header; no path
 *					in the code shown here sets it]
 *		>0			Failure: error number
 *		EBADARCH		Mach-o binary, but with an unrecognized
 *					architecture
 *		ENOMEM			No memory for child process after -
 *					can only happen after vfork()
 *
 * Important:	This image activator is NOT byte order neutral.
 *
 * Note:	A return value other than -1 indicates subsequent image
 *		activators should not be given the opportunity to attempt
 *		to activate the image.
 *
 * TODO:	More gracefully handle failures after vfork
 */
static int
exec_mach_imgact(struct image_params *imgp)
{
	struct mach_header *mach_header = (struct mach_header *)imgp->ip_vdata;
	proc_t			p = vfs_context_proc(imgp->ip_vfs_context);
	int			error = 0;
	task_t			task;
	task_t			new_task = NULL; /* protected by vfexec */
	thread_t		thread;
	struct uthread		*uthread;
	vm_map_t old_map = VM_MAP_NULL;
	vm_map_t map;
	load_return_t		lret;
	load_result_t		load_result;
	struct _posix_spawnattr *psa = NULL;
	int			spawn = (imgp->ip_flags & IMGPF_SPAWN);
	int			vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC);
	int			p_name_len;

	/*
	 * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference
	 * is a reserved field on the end, so for the most part, we can
	 * treat them as if they were identical. Reverse-endian Mach-O
	 * binaries are recognized but not compatible.
 	 */
	// Check the magic in the header: byte-swapped Mach-O images are
	// recognized but rejected with EBADARCH.
	// (author's note: NXSwapInt - binaries from platforms such as PowerPC)
	// MH_CIGAM    = 0xCEFAEDFE
	// MH_CIGAM_64 = 0xCFFAEDFE
	if ((mach_header->magic == MH_CIGAM) ||
	    (mach_header->magic == MH_CIGAM_64)) {
		error = EBADARCH;
		goto bad;
	}

	// Check the magic in the header: anything that is not a native
	// Mach-O image is left to the next activator (-1).
	// #define MH_MAGIC    0xfeedface
	// #define MH_MAGIC_64 0xfeedfacf
	// This is the ordinary Mach-O binary, the case usually encountered.
	if ((mach_header->magic != MH_MAGIC) &&
	    (mach_header->magic != MH_MAGIC_64)) {
		error = -1;
		goto bad;
	}
	
	// Check the Mach-O file type: it must be an executable.
	// Some other common file types:
	// #define MH_OBJECT 0x1   object file produced during compilation
	// #define MH_CORE   0x4   dump file written on a crash
	if (mach_header->filetype != MH_EXECUTE) {
		error = -1;
		goto bad;
	}

	// Record the CPU type/subtype the image targets, or verify it against
	// the values already stored in imgp.
	if (imgp->ip_origcputype != 0) {
		/* Fat header previously had an idea about this thin file */
		if (imgp->ip_origcputype != mach_header->cputype ||
			imgp->ip_origcpusubtype != mach_header->cpusubtype) {
			error = EBADARCH;
			goto bad;
		}
	} else {
		imgp->ip_origcputype = mach_header->cputype;
		imgp->ip_origcpusubtype = mach_header->cpusubtype;
	}

	task = current_task();
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64)
		imgp->ip_flags |= IMGPF_IS_64BIT;

	/* If posix_spawn binprefs exist, respect those prefs. */
	psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
	if (psa != NULL && psa->psa_binprefs[0] != 0) {
		int pr = 0;
		for (pr = 0; pr < NBINPREFS; pr++) {
			cpu_type_t pref = psa->psa_binprefs[pr];
			if (pref == 0) {
				/* No suitable arch in the pref list */
				error = EBADARCH;
				goto bad;
			}

			if (pref == CPU_TYPE_ANY) {
				/* Jump to regular grading */
				goto grade;
			}

			if (pref == imgp->ip_origcputype) {
				/* We have a match! */
				goto grade;
			}
		}
		error = EBADARCH;
		goto bad;
	}
grade:
	// Check that this CPU type/subtype is acceptable
	if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK)) 	  {
		error = EBADARCH;
		goto bad;
	}

	/* Copy in arguments/environment from the old process */
	// Fetch the argument and environment strings; this prepares for
	// running the Mach-O in the vfork case below.
	error = exec_extract_strings(imgp);
	if (error)
		goto bad;

	error = exec_add_apple_strings(imgp);
	if (error)
		goto bad;

	AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc, 
	    imgp->ip_endargv - imgp->ip_startargv);
	AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc,
	    imgp->ip_endenvv - imgp->ip_endargv);

	/*
	 * We are being called to activate an image subsequent to a vfork()
	 * operation; in this case, we know that our task, thread, and
	 * uthread are actually those of our parent, and our proc, which we
	 * obtained indirectly from the image_params vfs_context_t, is the
	 * new child process.
	 */
	// For vfork+exec a new thread is created for the Mach-O through
	// fork_create_child(); for spawn, imgp->ip_new_thread is used as-is.
	if (vfexec || spawn) {
		if (vfexec) {
			imgp->ip_new_thread = fork_create_child(task, NULL, p, FALSE, (imgp->ip_flags & IMGPF_IS_64BIT));
			if (imgp->ip_new_thread == NULL) {
				error = ENOMEM;
				goto bad;
			}
		}

		/* reset local idea of thread, uthread, task */
		thread = imgp->ip_new_thread;
		uthread = get_bsdthread_info(thread);
		task = new_task = get_threadtask(thread);
		map = get_task_map(task);
	} else {
		map = VM_MAP_NULL;
	}

	/*
	 * We set these flags here; this is OK, since if we fail after
	 * this point, we have already destroyed the parent process anyway.
	 */
	// Set some parameters that dyld will need, plus the task/process
	// 64-bit state.
	task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0);
	if (imgp->ip_flags & IMGPF_IS_64BIT) {
		task_set_64bit(task, TRUE);
		OSBitOrAtomic(P_LP64, &p->p_flag);
	} else {
		task_set_64bit(task, FALSE);
		OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
	}

	/*
	 *	Load the Mach-O file.
	 *
	 * NOTE: An error after this point  indicates we have potentially
	 * destroyed or overwritten some process state while attempting an
	 * execve() following a vfork(), which is an unrecoverable condition.
	 * We send the new process an immediate SIGKILL to avoid it executing
	 * any instructions in the mutated address space. For true spawns,
	 * this is not the case, and "too late" is still not too late to
	 * return an error code to the parent process.
	 */

	/*
	 * Actually load the image file we previously decided to load.
	 */
	// Load the Mach-O file and map it into memory
	lret = load_machfile(imgp, mach_header, thread, map, &load_result);

	if (lret != LOAD_SUCCESS) {
		error = load_return_to_errno(lret);
		goto badtoolate;
	}

	proc_lock(p);
	p->p_cputype = imgp->ip_origcputype;
	p->p_cpusubtype = imgp->ip_origcpusubtype;
	proc_unlock(p);

	vm_map_set_user_wire_limit(get_task_map(task), p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	/* 
	 * Set code-signing flags if this binary is signed, or if parent has
	 * requested them on exec.
	 */
	// A series of flag bits is set here; worth noting because it is
	// related to code signing.
	if (load_result.csflags & CS_VALID) {
		imgp->ip_csflags |= load_result.csflags & 
			(CS_VALID|
			 CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV|CS_DYLD_PLATFORM|
			 CS_EXEC_SET_HARD|CS_EXEC_SET_KILL|CS_EXEC_SET_ENFORCEMENT);
	} else {
		imgp->ip_csflags &= ~CS_VALID;
	}

	if (p->p_csflags & CS_EXEC_SET_HARD)
		imgp->ip_csflags |= CS_HARD;
	if (p->p_csflags & CS_EXEC_SET_KILL)
		imgp->ip_csflags |= CS_KILL;
	if (p->p_csflags & CS_EXEC_SET_ENFORCEMENT)
		imgp->ip_csflags |= CS_ENFORCEMENT;
	if (p->p_csflags & CS_EXEC_SET_INSTALLER)
		imgp->ip_csflags |= CS_INSTALLER;


	/*
	 * Set up the system reserved areas in the new address space.
	 */
	// (author's note) sets up a suitable execution environment according
	// to the platform of the executable
	vm_map_exec(get_task_map(task),
		    task,
		    (void *) p->p_fd->fd_rdir,
		    cpu_type());
	
	/*
	 * Close file descriptors which specify close-on-exec.
	 */
	// Closes every file marked close-on-exec
	fdexec(p, psa != NULL ? psa->psa_flags : 0);

	/*
	 * deal with set[ug]id.
	 */
	// setuid-related logic, i.e. privilege handling
	error = exec_handle_sugid(imgp);
	if (error) {
		goto badtoolate;
	}	

	/*
	 * deal with voucher on exec-calling thread.
	 */
	if (imgp->ip_new_thread == NULL)
		thread_set_mach_voucher(current_thread(), IPC_VOUCHER_NULL);

	/* Make sure we won't interrupt ourself signalling a partial process */
	if (!vfexec && !spawn && (p->p_lflag & P_LTRACED))
		psignal(p, SIGTRAP);
	
	// Set up the user-space stack for the process
	if (load_result.unixproc &&
		create_unix_stack(get_task_map(task),
				  &load_result,
				  p) != KERN_SUCCESS) {
		error = load_return_to_errno(LOAD_NOSPACE);
		goto badtoolate;
	}

	/*
	 * For vfork/spawn, switch to the new task's map for the copyouts
	 * below; old_map is switched back afterwards and on each error path.
	 */
	if (vfexec || spawn) {
		old_map = vm_map_switch(get_task_map(task));
	}

	if (load_result.unixproc) {
		user_addr_t	ap;

		/*
		 * Copy the strings area out into the new process address
		 * space.
		 */
		ap = p->user_stack;
		error = exec_copyout_strings(imgp, &ap);
		if (error) {
			if (vfexec || spawn)
				vm_map_switch(old_map);
			goto badtoolate;
		}
		/* Set the stack */
		thread_setuserstack(thread, ap);
	}
	
	if (load_result.dynlinker) {
		uint64_t	ap;
		int			new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;

		/* Adjust the stack */
		/* Make room for one pointer and store load_result.mach_header there */
		ap = thread_adjuserstack(thread, -new_ptr_size);
		error = copyoutptr(load_result.mach_header, ap, new_ptr_size);

		if (error) {
			if (vfexec || spawn)
				vm_map_switch(old_map);
			goto badtoolate;
		}
		task_set_dyld_info(task, load_result.all_image_info_addr,
		    load_result.all_image_info_size);
	}

	/* Avoid immediate VM faults back into kernel */
	// (author's note) prevents faults caused by instructions executing
	// right away; a lot of dyld-related work is done in here
	exec_prefault_data(p, imgp, &load_result);

	if (vfexec || spawn) {
		vm_map_switch(old_map);
	}
	/* Set the entry point */
	thread_setentrypoint(thread, load_result.entry_point);

	/* Stop profiling */
	stopprofclock(p);

	/*
	 * Reset signal state.
	 */
	execsigs(p, thread);

	/*
	 * need to cancel async IO requests that can be cancelled and wait for those
	 * already active.  MAY BLOCK!
	 */
	_aio_exec( p );

#if SYSV_SHM
	/* FIXME: Till vmspace inherit is fixed: */
	if (!vfexec && p->vm_shm)
		shmexec(p);
#endif
#if SYSV_SEM
	/* Clean up the semaphores */
	semexit(p);
#endif

	/*
	 * Remember file name for accounting.
	 */
	p->p_acflag &= ~AFORK;

	/*
	 * Set p->p_comm and p->p_name to the name passed to exec
	 */
	/*
	 * Note: cn_namelen in the nameidata is clamped in place, first to
	 * fit p_name and then to MAXCOMLEN for p_comm.
	 */
	p_name_len = sizeof(p->p_name) - 1;
	if(imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len)
		imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len;
	bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name,
		(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
	p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';

	if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN)
		imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
	bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm,
		(unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
	p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0';

	pal_dbg_set_task_name( p->task );

#if DEVELOPMENT || DEBUG
	/* 
	 * Update the pid an proc name for importance base if any
	 */
	task_importance_update_owner_info(p->task);
#endif

	memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid));

// <rdar://6598155> dtrace code cleanup needed
#if CONFIG_DTRACE
	/*
	 * Invalidate any predicate evaluation already cached for this thread by DTrace.
	 * That's because we've just stored to p_comm and DTrace refers to that when it
	 * evaluates the "execname" special variable. uid and gid may have changed as well.
	 */
	dtrace_set_thread_predcache(current_thread(), 0);

	/*
	 * Free any outstanding lazy dof entries. It is imperative we
	 * always call dtrace_lazy_dofs_destroy, rather than null check
	 * and call if !NULL. If we NULL test, during lazy dof faulting
	 * we can race with the faulting code and proceed from here to
	 * beyond the helpers cleanup. The lazy dof faulting will then
	 * install new helpers which no longer belong to this process!
	 */
	dtrace_lazy_dofs_destroy(p);


	/*
    	 * Clean up any DTrace helpers for the process.
    	 */
    	if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
    		(*dtrace_helpers_cleanup)(p);
    	}
	
    	/*
    	 * Cleanup the DTrace provider associated with this process.
    	 */
	proc_lock(p);
	if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
		(*dtrace_fasttrap_exec_ptr)(p);
	}
	proc_unlock(p);
#endif

	if (kdebug_enable) {
		long dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4;

		/*
		 * Collect the pathname for tracing
		 */
		kdbg_trace_string(p, &dbg_arg1, &dbg_arg2, &dbg_arg3, &dbg_arg4);

		if (vfexec || spawn) {
			KERNEL_DEBUG_CONSTANT1(TRACE_DATA_EXEC | DBG_FUNC_NONE,
					p->p_pid ,0,0,0, (uintptr_t)thread_tid(thread));
			KERNEL_DEBUG_CONSTANT1(TRACE_STRING_EXEC | DBG_FUNC_NONE,
					dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, (uintptr_t)thread_tid(thread));
		} else {
			KERNEL_DEBUG_CONSTANT(TRACE_DATA_EXEC | DBG_FUNC_NONE,
					p->p_pid ,0,0,0,0);
			KERNEL_DEBUG_CONSTANT(TRACE_STRING_EXEC | DBG_FUNC_NONE,
					dbg_arg1, dbg_arg2, dbg_arg3, dbg_arg4, 0);
		}
	}

	/*
	 * If posix_spawned with the START_SUSPENDED flag, stop the
	 * process before it runs.
	 */
	if (imgp->ip_px_sa != NULL) {
		psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
		if (psa->psa_flags & POSIX_SPAWN_START_SUSPENDED) {
			proc_lock(p);
			p->p_stat = SSTOP;
			proc_unlock(p);
			(void) task_suspend_internal(p->task);
		}
	}

	/*
	 * mark as execed, wakeup the process that vforked (if any) and tell
	 * it that it now has its own resources back
	 */
	OSBitOrAtomic(P_EXEC, &p->p_flag);
	proc_resetregister(p);
	if (p->p_pptr && (p->p_lflag & P_LPPWAIT)) {
		proc_lock(p);
		p->p_lflag &= ~P_LPPWAIT;
		proc_unlock(p);
		wakeup((caddr_t)p->p_pptr);
	}

	/*
	 * Pay for our earlier safety; deliver the delayed signals from
	 * the incomplete vfexec process now that it's complete.
	 */
	if (vfexec && (p->p_lflag & P_LTRACED)) {
		psignal_vfork(p, new_task, thread, SIGTRAP);
	}

	goto done;

badtoolate:
	/* Don't allow child process to execute any instructions */
	if (!spawn) {
		if (vfexec) {
			psignal_vfork(p, new_task, thread, SIGKILL);
		} else {
			psignal(p, SIGKILL);
		}

		/* We can't stop this system call at this point, so just pretend we succeeded */
		error = 0;
	}
	
done:
	if (!spawn) {
		/* notify only if it has not failed due to FP Key error */
		if ((p->p_lflag & P_LTERM_DECRYPTFAIL) == 0)
			proc_knote(p, NOTE_EXEC);
	}

	/* Drop extra references for cases where we don't expect the caller to clean up */
	if (vfexec || (spawn && error == 0)) {
		task_deallocate(new_task);
		thread_deallocate(thread);
	}

bad:
	return(error);
}

该函数主要做这几件事情：

对macho文件做最基本的检测
fork新的线程运行macho
映射macho文件到内存中
对setuid，code-sign等权限相关的事情有处理
为dyld接手macho文件的处理做了大量的准备工作
dyld处理完之后，对资源的释放

1.4 load_machfile

/*
 * load_machfile
 *
 * Description:	Does the loading work that surrounds the actual Mach-O parse:
 *		picks or creates the target VM map, applies the executable
 *		page / data-execution settings, computes the ASLR slides,
 *		calls parse_machfile(), and on success commits the task to
 *		the new map.
 *
 * Parameters:	imgp		Image parameter block (vnode, offsets, flags)
 *		header		Mach-O header of the image being loaded
 *		thread		Thread the image is being loaded for
 *		new_map		Map to load into, or VM_MAP_NULL to have a
 *				new one created here
 *		result		Receives the load result; may be NULL, in
 *				which case a local scratch result is used
 *
 * Returns:	LOAD_SUCCESS	Image parsed and, if a map was created,
 *				swapped into the task
 *		LOAD_BADMACHO	macho_size exceeds the file size, or the
 *				hard page-zero check failed
 *		LOAD_FAILURE	task_start_halt() failed
 *		!LOAD_SUCCESS	Anything parse_machfile() returns
 */
load_return_t
load_machfile(
	struct image_params	*imgp,
	struct mach_header	*header,
	thread_t 		thread,
	vm_map_t 		new_map,
	load_result_t		*result
)
{
	struct vnode		*vp = imgp->ip_vp;
	off_t			file_offset = imgp->ip_arch_offset;
	off_t			macho_size = imgp->ip_arch_size;
	off_t			file_size = imgp->ip_vattr->va_data_size;
	
	pmap_t			pmap = 0;	/* protected by create_map */
	vm_map_t		map;
	vm_map_t		old_map;
	task_t			old_task = TASK_NULL; /* protected by create_map */
	load_result_t		myresult;
	load_return_t		lret;
	boolean_t create_map = FALSE;
	boolean_t enforce_hard_pagezero = TRUE;
	int spawn = (imgp->ip_flags & IMGPF_SPAWN);
	task_t task = current_task();
	proc_t p = current_proc();
	mach_vm_offset_t	aslr_offset = 0;
	mach_vm_offset_t	dyld_aslr_offset = 0;
	kern_return_t 		kret;

	/* The image slice cannot be larger than the file that contains it */
	if (macho_size > file_size) {
		return(LOAD_BADMACHO);
	}

	/* No map supplied by the caller: one is created below */
	if (new_map == VM_MAP_NULL) {
		create_map = TRUE;
		old_task = current_task();
	}

	/*
	 * If we are spawning, we have created backing objects for the process
	 * already, which include non-lazily creating the task map.  So we
	 * are going to switch out the task map with one appropriate for the
	 * bitness of the image being loaded.
	 */
	if (spawn) {
		create_map = TRUE;
		old_task = get_threadtask(thread);
	}
	
	/*
	 * When create_map is set (no map was passed in, or this is a spawn),
	 * build a fresh address space with pmap_create() and vm_map_create();
	 * otherwise use the new_map handed in by the caller.
	 */
	if (create_map) {
		task_t ledger_task;
		if (imgp->ip_new_thread) {
			ledger_task = get_threadtask(imgp->ip_new_thread);
		} else {
			ledger_task = task;
		}
		pmap = pmap_create(get_task_ledger(ledger_task),
				   (vm_map_size_t) 0,
				   ((imgp->ip_flags & IMGPF_IS_64BIT) != 0));
		pal_switch_pmap(thread, pmap, imgp->ip_flags & IMGPF_IS_64BIT);
		map = vm_map_create(pmap,
				0,
				vm_compute_max_offset(((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT)),
				TRUE);
	} else
		map = new_map;

#if   (__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS)
	/* enforce 16KB alignment for watch targets with new ABI */
	vm_map_set_page_shift(map, SIXTEENK_PAGE_SHIFT);
#endif /* (__ARM_ARCH_7K__ >= 2) && defined(PLATFORM_WatchOS) */

#ifndef	CONFIG_ENFORCE_SIGNED_CODE
	/* This turns off faulting for executable pages, which allows
	 * to circumvent Code Signing Enforcement. The per process
	 * flag (CS_ENFORCEMENT) is not set yet, but we can use the
	 * global flag.
	 */
	if ( !cs_enforcement(NULL) && (header->flags & MH_ALLOW_STACK_EXECUTION) )
	        vm_map_disable_NX(map);
#endif

	/* Forcibly disallow execution from data pages on even if the arch
	 * normally permits it. */
	/*
	 * (author's note) marks memory non-executable, to hinder the
	 * exploitation of overflow vulnerabilities
	 */
	if ((header->flags & MH_NO_HEAP_EXECUTION) && !(imgp->ip_flags & IMGPF_ALLOW_DATA_EXEC))
		vm_map_disallow_data_exec(map);
	
	/*
	 * Compute a random offset for ASLR, and an independent random offset for dyld.
	 */
	/*
	 * Address randomization: each slide is a random page count below
	 * max_slide_pages, shifted by the map's page shift.
	 */
	if (!(imgp->ip_flags & IMGPF_DISABLE_ASLR)) {
		uint64_t max_slide_pages;

		max_slide_pages = vm_map_get_max_aslr_slide_pages(map);

		aslr_offset = random();
		aslr_offset %= max_slide_pages;
		aslr_offset <<= vm_map_page_shift(map);

		dyld_aslr_offset = random();
		dyld_aslr_offset %= max_slide_pages;
		dyld_aslr_offset <<= vm_map_page_shift(map);
	}
	
	/* The caller may pass NULL; fall back to a local scratch result */
	if (!result)
		result = &myresult;

	*result = load_result_null;

	/* Parse the Mach-O file format */
	lret = parse_machfile(vp, map, thread, header, file_offset, macho_size,
	                      0, (int64_t)aslr_offset, (int64_t)dyld_aslr_offset, result);

	if (lret != LOAD_SUCCESS) {
		if (create_map) {
			vm_map_deallocate(map);	/* will lose pmap reference too */
		}
		return(lret);
	}

#if __x86_64__
	/*
	 * On x86, for compatibility, don't enforce the hard page-zero restriction for 32-bit binaries.
	 */
	if ((imgp->ip_flags & IMGPF_IS_64BIT) == 0) {
		enforce_hard_pagezero = FALSE;
	}
#endif
	/*
	 * Check to see if the page zero is enforced by the map->min_offset.
	 */ 
	if (enforce_hard_pagezero &&
	    (vm_map_has_hard_pagezero(map, 0x1000) == FALSE)) {
		{
			if (create_map) {
				vm_map_deallocate(map);	/* will lose pmap reference too */
			}
			return (LOAD_BADMACHO);
		}
	}

	/*
	 *	Commit to new map.
	 *
	 *	Swap the new map for the old, which  consumes our new map
	 *	reference but each leaves us responsible for the old_map reference.
	 *	That lets us get off the pmap associated with it, and
	 *	then we can release it.
	 */
	 /* Replace the original map with the newly created one */
	 if (create_map) {
		/*
		 * If this is an exec, then we are going to destroy the old
		 * task, and it's correct to halt it; if it's spawn, the
		 * task is not yet running, and it makes no sense.
		 */
	 	if (!spawn) {
			/*
			 * Mark the task as halting and start the other
			 * threads towards terminating themselves.  Then
			 * make sure any threads waiting for a process
			 * transition get informed that we are committed to
			 * this transition, and then finally complete the
			 * task halting (wait for threads and then cleanup
			 * task resources).
			 *
			 * NOTE: task_start_halt() makes sure that no new
			 * threads are created in the task during the transition.
			 * We need to mark the workqueue as exiting before we
			 * wait for threads to terminate (at the end of which
			 * we no longer have a prohibition on thread creation).
			 * 
			 * Finally, clean up any lingering workqueue data structures
			 * that may have been left behind by the workqueue threads
			 * as they exited (and then clean up the work queue itself).
			 */
			kret = task_start_halt(task);
			if (kret != KERN_SUCCESS) {
				vm_map_deallocate(map);	/* will lose pmap reference too */
				return (LOAD_FAILURE);
			}
			proc_transcommit(p, 0);
			workqueue_mark_exiting(p);
			task_complete_halt(task);
			workqueue_exit(p);
			kqueue_dealloc(p->p_wqkqueue);
			p->p_wqkqueue = NULL;
		}
		old_map = swap_task_map(old_task, thread, map, !spawn);
		vm_map_deallocate(old_map);
	}
	return(LOAD_SUCCESS);
}

这个函数主要做了macho文件解析之外其他所有和加载相关的工作。

对新的task做了内存的分配
加强安全方面的设置，主要是DEP和ASLR
调用函数解析macho文件
解析成功之后，用新申请的内存替换旧的内存。

1.5 parse_machfile

这个函数做的事情就非常的简单清楚了，就是将macho文件解析，并且映射到内存中。

在我的macho文件格式分析中已经分析过这一块代码了。这里就不复述了。

0x02 小结

通过对整个流程源码的一次简单梳理，大致明白了整个流程在源码中是怎么样实现的，在研究这方面的漏洞的时候可以更快地明白问题出在哪里，也能更深刻地理解漏洞的成因以及重现的方法。

本文标题:OSX内核加载mach-o流程分析

文章作者:mrh

发布时间:2016年03月30日 - 19时58分

最后更新:2016年03月31日 - 11时39分

原始链接:http://turingh.github.io/2016/03/30/OSX内核加载mach-o流程分析/

许可协议: "署名-非商用-相同方式共享 3.0" 转载请保留原文链接及作者。