现在的位置: 首页 > 自动控制 > 工业·编程 > 正文

从glibc源码看系统调用原理–以fork库函数为例

2019-10-20 18:17 工业·编程 ⁄ 共 23277字 ⁄ 字号 暂无评论

  本文试图解答系统调用与库函数之间的关系、glibc库函数的实现原理、系统调用的实现原理等知识,本文讲述的CPU架构限定为ARM核心,为什么是ARM呢,原因很简单,X86我不熟。

系统调用概念

  从某种程度上来说,操作系统就是一个计算机的资源管理器,也可以理解为一个虚拟的计算机。那么这台虚拟计算机要向外提供功能,就必须提供一些接口,这些接口就是系统调用。

  有很多开发人员经常搞不清楚库函数和系统调用之间的关系,以为库函数(比如fork、read、write等)就是系统调用,从内核层次来划分,这其实是不对的,为此可以声明以下几点:

· 系统调用是操作系统内核为了向外提供服务而规定的一些调用接口(调用规范),它处于内核空间而非用户空间。

· 库函数(比如glibc)处于用户空间,它不是系统调用,但是它通常是进行系统调用的入口,它内部封装了进行系统调用的细节(和平台有关,比如arm和mips等),大多数库函数暴露的接口和系统调用接口都是一一对应的。

· 进行系统调用不一定非得经过库函数,应用代码依然可以使用嵌入式汇编/内联汇编/syscall等方式发起系统调用

wps1

                                           (注:此图来源于网络,具体来源未知)

   由于库函数是我们日常开发中进行系统调用的惯用手段,因此本文就以glibc-2.25源码为例,分析一下ARM平台下的系统调用原理。

glibc系统调用源码

   下面以fork库函数为例。首先找到fork库函数的定义位置,在/sysdeps/nptl/fork.c中,有如下定义:

weak_alias (__libc_fork, fork)

   看样子就是定义了一个别名fork,也就是真正的实现是__libc_fork,__libc_fork的实现如下(去掉一些细节):

pid_t __libc_fork(void)

{

pid_t pid;

#ifdef ARCH_FORK

  pid = ARCH_FORK();

#else

# error "ARCH_FORK must be defined so that the CLONE_SETTID flag is used"

  pid = INLINE_SYSCALL (fork, 0);

#endif

return pid;

}

   __libc_fork本质就是在调用ARCH_FORK,其定义在/sysdeps/unix/sysv/linux/arm/arch-fork.h文件中:

#define ARCH_FORK()                                                     \

INLINE_SYSCALL (clone, 5, \

CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, \

NULL, NULL, NULL, &THREAD_SELF->tid)

  ARCH_FORK宏中调用了INLINE_SYSCALL,继续跟踪INLINE_SYSCALL,其定义在/sysdeps/unix/sysv/linux/arm/sysdep.h:

/* Define a macro which expands into the inline wrapper code for a system

   call. 

*/

#undef INLINE_SYSCALL

#define INLINE_SYSCALL(name, nr, args...) \

  ({ unsigned int _sys_result = INTERNAL_SYSCALL (name, , nr, args); \

if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (_sys_result, ), 0)) \

       { \

__set_errno (INTERNAL_SYSCALL_ERRNO (_sys_result, )); \

_sys_result = (unsigned int) -1; \

       } \

     (int) _sys_result; })

      INLINE_SYSCALL宏一共需要三个参数,分别是:系统调用名称、参数个数、系统调用需要的参数(变长)。INLINE_SYSCALL内部调用了INTERNAL_SYSCALL,而INTERNAL_SYSCALL只是使用SYS_ify对name字段修改了一下,就直接调用INTERNAL_SYSCALL_RAW。

#undef INTERNAL_SYSCALL

#define INTERNAL_SYSCALL(name, err, nr, args...) \

INTERNAL_SYSCALL_RAW(SYS_ify(name), err, nr, args)

  SYS_ify只是在name前面加上了__NR_,因此这里为__NR_clone。

/* For Linux we can use the system call table in the header file

/usr/include/asm/unistd.h

   of the kernel.  But these symbols do not follow the SYS_* syntax

   so we have to redefine the `SYS_ify' macro here.  */

#undef SYS_ify

#define SYS_ify(syscall_name) (__NR_##syscall_name)

     __NR_clone是什么意思呢?在linux/arch/arm/include/uapi/asm/unistd.h中,可以看到,__NR_clone就是clone系统调用对应的编号,内核空间不是以字符串来区分系统调用,而是为每个系统调用分配了一个独一无二的编号

#define __NR_SYSCALL_BASE 0

/*

* This file contains the system call numbers.

*/

#define __NR_restart_syscall (__NR_SYSCALL_BASE+  0)

#define __NR_exit (__NR_SYSCALL_BASE+  1)

#define __NR_fork (__NR_SYSCALL_BASE+  2)

#define __NR_read (__NR_SYSCALL_BASE+  3)

#define __NR_write (__NR_SYSCALL_BASE+  4)

#define __NR_open (__NR_SYSCALL_BASE+  5)

#define __NR_close (__NR_SYSCALL_BASE+  6)

/*中间省略很多*/

#define __NR_clone (__NR_SYSCALL_BASE+120)

       INTERNAL_SYSCALL_RAW就是我们要看的重点,它封装了进行系统调用的细节。     

/* We can not expose the use of r7 to the compiler.  GCC (as

   of 4.5) uses r7 as the hard frame pointer for Thumb - although

   for Thumb-2 it isn't obviously a better choice than r11.

   And GCC does not support asms that conflict with the frame

   pointer.

   This would be easier if syscall numbers never exceeded 255,

   but they do.  For the moment the LOAD_ARGS_7 is sacrificed.

   We can't use push/pop inside the asm because that breaks

   unwinding (i.e. thread cancellation) for this frame.  We can't

   locally save and restore r7, because we do not know if this

   function uses r7 or if it is our caller's r7; if it is our caller's,

   then unwinding will fail higher up the stack.  So we move the

   syscall out of line and provide its own unwind information.  */

/* ARM */

# undef INTERNAL_SYSCALL_RAW

# define INTERNAL_SYSCALL_RAW(name, err, nr, args...) \

  ({ \

register int _a1 asm ("r0"), _nr asm ("r7"); \

       LOAD_ARGS_##nr (args) \

       _nr = name; \

asm volatile ("swi 0x0 @ syscall " #name \

    : "=r" (_a1) \

    : "r" (_nr) ASM_ARGS_##nr \

    : "memory"); \

       _a1; })

        INTERNAL_SYSCALL_RAW中使用了内联汇编的方法产生系统调用,具体为:

· 把调用参数(args...)保存到r0-r6的寄存器中

· 把系统调用编号保存在r7寄存器中

· 执行swi 0x0软中断指令,陷入linux内核中

     其中,LOAD_ARGS_##nr和ASM_ARGS_##nr分别根据参数个数不同,具有如下系列的宏定义:

#define LOAD_ARGS_0()

#define ASM_ARGS_0

#define LOAD_ARGS_1(a1) \

int _a1tmp = (int) (a1); \

  LOAD_ARGS_0 () \

  _a1 = _a1tmp;

#define ASM_ARGS_1 ASM_ARGS_0, "r" (_a1)

#define LOAD_ARGS_2(a1, a2) \

int _a2tmp = (int) (a2); \

  LOAD_ARGS_1 (a1) \

  register int _a2 asm ("a2") = _a2tmp;

#define ASM_ARGS_2 ASM_ARGS_1, "r" (_a2)

#define LOAD_ARGS_3(a1, a2, a3) \

int _a3tmp = (int) (a3); \

  LOAD_ARGS_2 (a1, a2) \

  register int _a3 asm ("a3") = _a3tmp;

#define ASM_ARGS_3 ASM_ARGS_2, "r" (_a3)

#define LOAD_ARGS_4(a1, a2, a3, a4) \

int _a4tmp = (int) (a4); \

  LOAD_ARGS_3 (a1, a2, a3) \

  register int _a4 asm ("a4") = _a4tmp;

#define ASM_ARGS_4 ASM_ARGS_3, "r" (_a4)

#define LOAD_ARGS_5(a1, a2, a3, a4, a5) \

int _v1tmp = (int) (a5); \

  LOAD_ARGS_4 (a1, a2, a3, a4) \

  register int _v1 asm ("v1") = _v1tmp;

#define ASM_ARGS_5 ASM_ARGS_4, "r" (_v1)

#define LOAD_ARGS_6(a1, a2, a3, a4, a5, a6) \

int _v2tmp = (int) (a6); \

  LOAD_ARGS_5 (a1, a2, a3, a4, a5) \

  register int _v2 asm ("v2") = _v2tmp;

#define ASM_ARGS_6 ASM_ARGS_5, "r" (_v2)

#ifndef __thumb__

# define LOAD_ARGS_7(a1, a2, a3, a4, a5, a6, a7) \

int _v3tmp = (int) (a7); \

  LOAD_ARGS_6 (a1, a2, a3, a4, a5, a6) \

  register int _v3 asm ("v3") = _v3tmp;

# define ASM_ARGS_7 ASM_ARGS_6, "r" (_v3)

#endif

   关于参数传递与寄存器的对应关系,可以参考如下对应关系(读者可以思考当参数大于7个时如何处理):

/* Linux takes system call args in registers:

arg 1 r0

arg 2 r1

arg 3 r2

arg 4 r3

arg 5 r4 (this is different from the APCS convention)

arg 6 r5

arg 7 r6

The compiler is going to form a call by coming here, through PSEUDO, with

arguments

syscall number in the DO_CALL macro

arg 1 r0

arg 2 r1

arg 3 r2

arg 4 r3

arg 5 [sp]

arg 6 [sp+4]

arg 7 [sp+8]

We need to shuffle values between R4..R6 and the stack so that the

caller's v1..v3 and stack frame are not corrupted, and the kernel

sees the right arguments.

*/

       同时,ARM对系统调用编号的传递模式也稍有不同,见如下英文:

  The ARM EABI user interface passes the syscall number in r7, instead
   of in the swi.  This is more efficient, because the kernel does not need
   to fetch the swi from memory to find out the number; which can be painful
   with separate I-cache and D-cache.  Make sure to use 0 for the SWI
   argument; otherwise the (optional) compatibility code for APCS binaries
   may be invoked. 

   上面这段英文注释的意思大致是,ARM EABI用户接口使用r7传递系统调用编号,而传统的手段是把系统调用编号作为swi指令的立即数进行传递(SWI{cond} immed_24),这样在陷入内核状态后,需要先拿到swi指令的地址,再从内存中读出这条swi指令并取出其中的立即数,才能得到系统调用编号;而如果采用r7寄存器传递系统调用编号,内核无需再去读取swi指令本身,效率将大大提升。

    至此,glibc的工作就完成了,控制权已经从用户空间陷入内核空间了,那么下面就看一下linux内核空间的主要工作。

linux内核系统调用

    上文中swi指令已经让代码陷入内核空间,swi是一条软中断指令,那么它对应的中断处理程序是什么呢?先看一下内核中的中断向量表:

.section .vectors, "ax", %progbits

.L__vectors_start:

W(b) vector_rst                      // 复位

W(b) vector_und                      // 未定义

W(ldr) pc, .L__vectors_start + 0x1000 // swi中断向量

W(b) vector_pabt                     // 指令预取异常中断

W(b) vector_dabt                     // 数据中止

W(b) vector_addrexcptn               // 地址异常

W(b) vector_irq                      // IRQ(一般中断)

W(b) vector_fiq                      // FIQ(快速中断)

  可以看到swi对应的向量项是一条ldr指令,它以相对于pc的寻址方式,把.L__vectors_start + 0x1000处存放的内容加载到pc中,也就是说软中断处理函数的入口地址存放在.L__vectors_start + 0x1000偏移处。那么在.L__vectors_start + 0x1000偏移处有什么呢?0x1000又有何来源?我们可以从/arch/arm/kernel/vmlinux.lds.S链接脚本中找到答案:.vectors段放在0xffff0000起始位置,.stubs段放在0xffff0000 + 0x1000起始位置(其中.vectors存放了中断向量表,.stubs存放了对应的中断处理程序)。

SECTIONS

{

. = PAGE_OFFSET + TEXT_OFFSET;

.head.text : {

_text = .;

HEAD_TEXT

}

.text : { /* Real text segment */

}

/*

* The vectors and stubs are relocatable code, and the

* only thing that matters is their relative offsets

*/

__vectors_start = .;

.vectors 0xffff0000 : AT(__vectors_start) {

*(.vectors)

}

. = __vectors_start + SIZEOF(.vectors);

__vectors_end = .;

__stubs_start = .;

.stubs ADDR(.vectors) + 0x1000 : AT(__stubs_start) {

*(.stubs)

}

. = __stubs_start + SIZEOF(.stubs);

__stubs_end = .;

}

   .vectors段的内容上文已经看到了,下面来看一下.stubs段的内容:

.section .stubs, "ax", %progbits

@ This must be the first word 注意这里

.word vector_swi

vector_rst:

ARM( swi SYS_ERROR0 )

THUMB( svc #0 )

THUMB( nop )

b vector_und

/*

* Interrupt dispatcher

*/

vector_stub irq, IRQ_MODE, 4

.long __irq_usr @  0  (USR_26 / USR_32)

.long __irq_invalid @  1  (FIQ_26 / FIQ_32)

.long __irq_invalid @  2  (IRQ_26 / IRQ_32)

.long __irq_svc @  3  (SVC_26 / SVC_32)

.long __irq_invalid @  4

.long __irq_invalid @  5

.long __irq_invalid @  6

.long __irq_invalid @  7

.long __irq_invalid @  8

.long __irq_invalid @  9

.long __irq_invalid @  a

.long __irq_invalid @  b

.long __irq_invalid @  c

.long __irq_invalid @  d

.long __irq_invalid @  e

.long __irq_invalid @  f

  可以清晰看到,.stubs段的起始位置存放的就是swi的中断处理程序入口地址(vector_swi),下面是其代码:

/*=============================================================================

* SWI handler

*-----------------------------------------------------------------------------

*/

.align 5

ENTRY(vector_swi)

#ifdef CONFIG_CPU_V7M

v7m_exception_entry

#else

    @这个栈帧大小正好是struct pt_regs的大小. struct pt_regs中保存的是

@线程用户态的寄存器上下文(模式上下文).

sub sp, sp, #PT_REGS_SIZE

    @ 将r0-r12寄存器压入内核栈中

stmia sp, {r0 - r12} @ Calling r0 - r12

    @ 将用户态的sp、lr压入内核栈中

ARM( add r8, sp, #S_PC )

ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr

THUMB( mov r8, sp )

THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr

    @ spsr_svc中保存的是调用swi指令前的cpsr值,这里将它保存在寄存器r8中

mrs r8, spsr @ called from non-FIQ mode, so ok.

    @ lr中的值是swi指令的下一条指令的地址,也就是系统调用的用户态返回地址。将其压入内核栈中

str lr, [sp, #S_PC] @ Save calling PC

    @ 将调用swi指令前的cpsr值压入内核栈中

str r8, [sp, #S_PSR] @ Save CPSR

    @ 将r0压入内核栈中

str r0, [sp, #S_OLD_R0] @ Save OLD_R0

#endif

zero_fp

alignment_trap r10, ip, __cr_alignment

    @使能中断

enable_irq

ct_user_exit

get_thread_info tsk

/*

* Get the system call number.

*/

#if defined(CONFIG_OABI_COMPAT)

/*

* If we have CONFIG_OABI_COMPAT then we need to look at the swi

* value to determine if it is an EABI or an old ABI call.

*/

#ifdef CONFIG_ARM_THUMB

tst r8, #PSR_T_BIT

movne r10, #0 @ no thumb OABI emulation

USER( ldreq r10, [lr, #-4] ) @ get SWI instruction

#else

USER( ldr r10, [lr, #-4] ) @ get SWI instruction

#endif

ARM_BE8(rev r10, r10) @ little endian instruction

#elif defined(CONFIG_AEABI)

/*

* Pure EABI user space always put syscall number into scno (r7).

*/

#elif defined(CONFIG_ARM_THUMB)

/* Legacy ABI only, possibly thumb mode. */

tst r8, #PSR_T_BIT @ this is SPSR from save_user_regs

addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in

USER( ldreq scno, [lr, #-4] )

#else

/* Legacy ABI only. */

USER( ldr scno, [lr, #-4] ) @ get SWI instruction

#endif

uaccess_disable tbl

adr tbl, sys_call_table @ load syscall table pointer

#if defined(CONFIG_OABI_COMPAT)

/*

* If the swi argument is zero, this is an EABI call and we do nothing.

*

* If this is an old ABI call, get the syscall number into scno and

* get the old ABI syscall table address.

*/

bics r10, r10, #0xff000000

eorne scno, r10, #__NR_OABI_SYSCALL_BASE

ldrne tbl, =sys_oabi_call_table

#elif !defined(CONFIG_AEABI)

bic scno, scno, #0xff000000 @ mask off SWI op-code

eor scno, scno, #__NR_SYSCALL_BASE @ check OS number

#endif

local_restart:

ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing

stmdb sp!, {r4, r5} @ push fifth and sixth args

tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls?

bne __sys_trace

cmp scno, #NR_syscalls @ check upper syscall limit

    @ 这里先设置系统调用执行函数sys_xxx()的返回地址为ret_fast_syscall

    @ 这设置的是当前线程lr_svc寄存器,当下次通过__switch_to恢复

    @ 当前线程的上下文(cpu_context)时首先调用ret_fast_syscall来恢复其用户态

    @ 的线程上下文(struct pt_regs).

badr lr, ret_fast_syscall @ return address

    @ 以下就是根据系统调用号调用具体的执行函数。

ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine

add r1, sp, #S_OFF

2: cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)

eor r0, scno, #__NR_SYSCALL_BASE @ put OS number back

bcs arm_syscall

mov why, #0 @ no longer a real syscall

b sys_ni_syscall @ not private func

#if defined(CONFIG_OABI_COMPAT) || !defined(CONFIG_AEABI)

/*

* We failed to handle a fault trying to access the page

* containing the swi instruction, but we're not really in a

* position to return -EFAULT. Instead, return back to the

* instruction and re-enter the user fault handling path trying

* to page it in. This will likely result in sending SEGV to the

* current task.

*/

9001:

sub lr, lr, #4

str lr, [sp, #S_PC]

b ret_fast_syscall

#endif

ENDPROC(vector_swi)

    上面的代码中,最终会根据系统调用编号去sys_call_table中调用相应的函数,sys_call_table在/arch/arm/kernel/entry-common.S中定义:

.type sys_call_table, #object

ENTRY(sys_call_table)

#include "calls.S"

      其中重点是calls.S,路径为/arch/arm/kernel/calls.S,这里声明了所有的内核系统调用:

/* 0 */ CALL(sys_restart_syscall)

CALL(sys_exit)

CALL(sys_fork)

CALL(sys_read)

CALL(sys_write)/* 5 */ CALL(sys_open)

CALL(sys_close)

CALL(sys_ni_syscall) /* was sys_waitpid */

CALL(sys_creat)

CALL(sys_link)/* 10 */ CALL(sys_unlink)

CALL(sys_execve)

CALL(sys_chdir)

CALL(OBSOLETE(sys_time)) /* used by libc4 */

CALL(sys_mknod)/* 15 */ CALL(sys_chmod)

CALL(sys_lchown16)

CALL(sys_ni_syscall) /* was sys_break */

CALL(sys_ni_syscall) /* was sys_stat */

CALL(sys_lseek)/* 20 */ CALL(sys_getpid)

CALL(sys_mount)

CALL(OBSOLETE(sys_oldumount)) /* used by libc4 */

CALL(sys_setuid16)

CALL(sys_getuid16)/* 25 */ CALL(OBSOLETE(sys_stime))

CALL(sys_ptrace)

CALL(OBSOLETE(sys_alarm)) /* used by libc4 */

CALL(sys_ni_syscall) /* was sys_fstat */

CALL(sys_pause)/* 30 */ CALL(OBSOLETE(sys_utime)) /* used by libc4 */

CALL(sys_ni_syscall) /* was sys_stty */

CALL(sys_ni_syscall) /* was sys_getty */

CALL(sys_access)

CALL(sys_nice)/* 35 */ CALL(sys_ni_syscall) /* was sys_ftime */

CALL(sys_sync)

CALL(sys_kill)

CALL(sys_rename)

CALL(sys_mkdir)/* 40 */ CALL(sys_rmdir)

CALL(sys_dup)

CALL(sys_pipe)

CALL(sys_times)

CALL(sys_ni_syscall) /* was sys_prof *//* 45 */ CALL(sys_brk)

CALL(sys_setgid16)

CALL(sys_getgid16)

CALL(sys_ni_syscall) /* was sys_signal */

CALL(sys_geteuid16)/* 50 */ CALL(sys_getegid16)

CALL(sys_acct)

CALL(sys_umount)

CALL(sys_ni_syscall) /* was sys_lock */

CALL(sys_ioctl)/* 55 */ CALL(sys_fcntl)

CALL(sys_ni_syscall) /* was sys_mpx */

CALL(sys_setpgid)

CALL(sys_ni_syscall) /* was sys_ulimit */

CALL(sys_ni_syscall) /* was sys_olduname *//* 60 */ CALL(sys_umask)

CALL(sys_chroot)

CALL(sys_ustat)

CALL(sys_dup2)

CALL(sys_getppid)/* 65 */ CALL(sys_getpgrp)

CALL(sys_setsid)

CALL(sys_sigaction)

CALL(sys_ni_syscall) /* was sys_sgetmask */

CALL(sys_ni_syscall) /* was sys_ssetmask *//* 70 */ CALL(sys_setreuid16)

CALL(sys_setregid16)

CALL(sys_sigsuspend)

CALL(sys_sigpending)

CALL(sys_sethostname)/* 75 */ CALL(sys_setrlimit)

CALL(OBSOLETE(sys_old_getrlimit)) /* used by libc4 */

CALL(sys_getrusage)

CALL(sys_gettimeofday)

CALL(sys_settimeofday)/* 80 */ CALL(sys_getgroups16)

CALL(sys_setgroups16)

CALL(OBSOLETE(sys_old_select)) /* used by libc4 */

CALL(sys_symlink)

CALL(sys_ni_syscall) /* was sys_lstat *//* 85 */ CALL(sys_readlink)

CALL(sys_uselib)

CALL(sys_swapon)

CALL(sys_reboot)

CALL(OBSOLETE(sys_old_readdir)) /* used by libc4 *//* 90 */ CALL(OBSOLETE(sys_old_mmap)) /* used by libc4 */

CALL(sys_munmap)

CALL(sys_truncate)

CALL(sys_ftruncate)

CALL(sys_fchmod)/* 95 */ CALL(sys_fchown16)

CALL(sys_getpriority)

CALL(sys_setpriority)

CALL(sys_ni_syscall) /* was sys_profil */

CALL(sys_statfs)/* 100 */ CALL(sys_fstatfs)

CALL(sys_ni_syscall) /* sys_ioperm */

CALL(OBSOLETE(ABI(sys_socketcall, sys_oabi_socketcall)))

CALL(sys_syslog)

CALL(sys_setitimer)/* 105 */ CALL(sys_getitimer)

CALL(sys_newstat)

CALL(sys_newlstat)

CALL(sys_newfstat)

CALL(sys_ni_syscall) /* was sys_uname *//* 110 */ CALL(sys_ni_syscall) /* was sys_iopl */

CALL(sys_vhangup)

CALL(sys_ni_syscall)

CALL(OBSOLETE(sys_syscall)) /* call a syscall */

CALL(sys_wait4)/* 115 */ CALL(sys_swapoff)

CALL(sys_sysinfo)

CALL(OBSOLETE(ABI(sys_ipc, sys_oabi_ipc)))

CALL(sys_fsync)

CALL(sys_sigreturn_wrapper)/* 120 */ CALL(sys_clone)

CALL(sys_setdomainname)

CALL(sys_newuname)

CALL(sys_ni_syscall) /* modify_ldt */

CALL(sys_adjtimex)/* 125 */ CALL(sys_mprotect)

CALL(sys_sigprocmask)

CALL(sys_ni_syscall) /* was sys_create_module */

CALL(sys_init_module)

CALL(sys_delete_module)/* 130 */ CALL(sys_ni_syscall) /* was sys_get_kernel_syms */

CALL(sys_quotactl)

CALL(sys_getpgid)

CALL(sys_fchdir)

CALL(sys_bdflush)/* 135 */ CALL(sys_sysfs)

CALL(sys_personality)

CALL(sys_ni_syscall) /* reserved for afs_syscall */

CALL(sys_setfsuid16)

CALL(sys_setfsgid16)/* 140 */ CALL(sys_llseek)

CALL(sys_getdents)

CALL(sys_select)

CALL(sys_flock)

CALL(sys_msync)/* 145 */ CALL(sys_readv)

CALL(sys_writev)

CALL(sys_getsid)

CALL(sys_fdatasync)

CALL(sys_sysctl)/* 150 */ CALL(sys_mlock)

CALL(sys_munlock)

CALL(sys_mlockall)

CALL(sys_munlockall)

CALL(sys_sched_setparam)/* 155 */ CALL(sys_sched_getparam)

CALL(sys_sched_setscheduler)

CALL(sys_sched_getscheduler)

CALL(sys_sched_yield)

CALL(sys_sched_get_priority_max)/* 160 */ CALL(sys_sched_get_priority_min)

CALL(sys_sched_rr_get_interval)

CALL(sys_nanosleep)

CALL(sys_mremap)

CALL(sys_setresuid16)/* 165 */ CALL(sys_getresuid16)

CALL(sys_ni_syscall) /* vm86 */

CALL(sys_ni_syscall) /* was sys_query_module */

CALL(sys_poll)

CALL(sys_ni_syscall) /* was nfsservctl *//* 170 */ CALL(sys_setresgid16)

CALL(sys_getresgid16)

CALL(sys_prctl)

CALL(sys_rt_sigreturn_wrapper)

CALL(sys_rt_sigaction)/* 175 */ CALL(sys_rt_sigprocmask)

CALL(sys_rt_sigpending)

CALL(sys_rt_sigtimedwait)

CALL(sys_rt_sigqueueinfo)

CALL(sys_rt_sigsuspend)/* 180 */ CALL(ABI(sys_pread64, sys_oabi_pread64))

CALL(ABI(sys_pwrite64, sys_oabi_pwrite64))

CALL(sys_chown16)

CALL(sys_getcwd)

CALL(sys_capget)/* 185 */ CALL(sys_capset)

CALL(sys_sigaltstack)

CALL(sys_sendfile)

CALL(sys_ni_syscall) /* getpmsg */

CALL(sys_ni_syscall) /* putpmsg *//* 190 */ CALL(sys_vfork)

CALL(sys_getrlimit)

CALL(sys_mmap2)

CALL(ABI(sys_truncate64, sys_oabi_truncate64))

CALL(ABI(sys_ftruncate64, sys_oabi_ftruncate64))/* 195 */ CALL(ABI(sys_stat64, sys_oabi_stat64))

CALL(ABI(sys_lstat64, sys_oabi_lstat64))

CALL(ABI(sys_fstat64, sys_oabi_fstat64))

CALL(sys_lchown)

CALL(sys_getuid)/* 200 */ CALL(sys_getgid)

CALL(sys_geteuid)

CALL(sys_getegid)

CALL(sys_setreuid)

CALL(sys_setregid)/* 205 */ CALL(sys_getgroups)

CALL(sys_setgroups)

CALL(sys_fchown)

CALL(sys_setresuid)

CALL(sys_getresuid)/* 210 */ CALL(sys_setresgid)

CALL(sys_getresgid)

CALL(sys_chown)

CALL(sys_setuid)

CALL(sys_setgid)/* 215 */ CALL(sys_setfsuid)

CALL(sys_setfsgid)

CALL(sys_getdents64)

CALL(sys_pivot_root)

CALL(sys_mincore)/* 220 */ CALL(sys_madvise)

CALL(ABI(sys_fcntl64, sys_oabi_fcntl64))

CALL(sys_ni_syscall) /* TUX */

CALL(sys_ni_syscall)

CALL(sys_gettid)/* 225 */ CALL(ABI(sys_readahead, sys_oabi_readahead))

CALL(sys_setxattr)

CALL(sys_lsetxattr)

CALL(sys_fsetxattr)

CALL(sys_getxattr)/* 230 */ CALL(sys_lgetxattr)

CALL(sys_fgetxattr)

CALL(sys_listxattr)

CALL(sys_llistxattr)

CALL(sys_flistxattr)/* 235 */ CALL(sys_removexattr)

CALL(sys_lremovexattr)

CALL(sys_fremovexattr)

CALL(sys_tkill)

CALL(sys_sendfile64)/* 240 */ CALL(sys_futex)

CALL(sys_sched_setaffinity)

CALL(sys_sched_getaffinity)

CALL(sys_io_setup)

CALL(sys_io_destroy)/* 245 */ CALL(sys_io_getevents)

CALL(sys_io_submit)

CALL(sys_io_cancel)

CALL(sys_exit_group)

CALL(sys_lookup_dcookie)/* 250 */ CALL(sys_epoll_create)

CALL(ABI(sys_epoll_ctl, sys_oabi_epoll_ctl))

CALL(ABI(sys_epoll_wait, sys_oabi_epoll_wait))

CALL(sys_remap_file_pages)

CALL(sys_ni_syscall) /* sys_set_thread_area *//* 255 */ CALL(sys_ni_syscall) /* sys_get_thread_area */

CALL(sys_set_tid_address)

CALL(sys_timer_create)

CALL(sys_timer_settime)

CALL(sys_timer_gettime)/* 260 */ CALL(sys_timer_getoverrun)

CALL(sys_timer_delete)

CALL(sys_clock_settime)

CALL(sys_clock_gettime)

CALL(sys_clock_getres)/* 265 */ CALL(sys_clock_nanosleep)

CALL(sys_statfs64_wrapper)

CALL(sys_fstatfs64_wrapper)

CALL(sys_tgkill)

CALL(sys_utimes)/* 270 */ CALL(sys_arm_fadvise64_64)

CALL(sys_pciconfig_iobase)

CALL(sys_pciconfig_read)

CALL(sys_pciconfig_write)

CALL(sys_mq_open)/* 275 */ CALL(sys_mq_unlink)

CALL(sys_mq_timedsend)

CALL(sys_mq_timedreceive)

CALL(sys_mq_notify)

CALL(sys_mq_getsetattr)/* 280 */ CALL(sys_waitid)

CALL(sys_socket)

CALL(ABI(sys_bind, sys_oabi_bind))

CALL(ABI(sys_connect, sys_oabi_connect))

CALL(sys_listen)/* 285 */ CALL(sys_accept)

CALL(sys_getsockname)

CALL(sys_getpeername)

CALL(sys_socketpair)

CALL(sys_send)/* 290 */ CALL(ABI(sys_sendto, sys_oabi_sendto))

CALL(sys_recv)

CALL(sys_recvfrom)

CALL(sys_shutdown)

CALL(sys_setsockopt)/* 295 */ CALL(sys_getsockopt)

CALL(ABI(sys_sendmsg, sys_oabi_sendmsg))

CALL(sys_recvmsg)

CALL(ABI(sys_semop, sys_oabi_semop))

CALL(sys_semget)/* 300 */ CALL(sys_semctl)

CALL(sys_msgsnd)

CALL(sys_msgrcv)

CALL(sys_msgget)

CALL(sys_msgctl)/* 305 */ CALL(sys_shmat)

CALL(sys_shmdt)

CALL(sys_shmget)

CALL(sys_shmctl)

CALL(sys_add_key)/* 310 */ CALL(sys_request_key)

CALL(sys_keyctl)

CALL(ABI(sys_semtimedop, sys_oabi_semtimedop))/* vserver */ CALL(sys_ni_syscall)

CALL(sys_ioprio_set)/* 315 */ CALL(sys_ioprio_get)

CALL(sys_inotify_init)

CALL(sys_inotify_add_watch)

CALL(sys_inotify_rm_watch)

CALL(sys_mbind)/* 320 */ CALL(sys_get_mempolicy)

CALL(sys_set_mempolicy)

CALL(sys_openat)

CALL(sys_mkdirat)

CALL(sys_mknodat)/* 325 */ CALL(sys_fchownat)

CALL(sys_futimesat)

CALL(ABI(sys_fstatat64,  sys_oabi_fstatat64))

CALL(sys_unlinkat)

CALL(sys_renameat)/* 330 */ CALL(sys_linkat)

CALL(sys_symlinkat)

CALL(sys_readlinkat)

CALL(sys_fchmodat)

CALL(sys_faccessat)/* 335 */ CALL(sys_pselect6)

CALL(sys_ppoll)

CALL(sys_unshare)

CALL(sys_set_robust_list)

CALL(sys_get_robust_list)/* 340 */ CALL(sys_splice)

CALL(sys_sync_file_range2)

CALL(sys_tee)

CALL(sys_vmsplice)

CALL(sys_move_pages)/* 345 */ CALL(sys_getcpu)

CALL(sys_epoll_pwait)

CALL(sys_kexec_load)

CALL(sys_utimensat)

CALL(sys_signalfd)/* 350 */ CALL(sys_timerfd_create)

CALL(sys_eventfd)

CALL(sys_fallocate)

CALL(sys_timerfd_settime)

CALL(sys_timerfd_gettime)/* 355 */ CALL(sys_signalfd4)

CALL(sys_eventfd2)

CALL(sys_epoll_create1)

CALL(sys_dup3)

CALL(sys_pipe2)/* 360 */ CALL(sys_inotify_init1)

CALL(sys_preadv)

CALL(sys_pwritev)

CALL(sys_rt_tgsigqueueinfo)

CALL(sys_perf_event_open)/* 365 */ CALL(sys_recvmmsg)

CALL(sys_accept4)

CALL(sys_fanotify_init)

CALL(sys_fanotify_mark)

CALL(sys_prlimit64)/* 370 */ CALL(sys_name_to_handle_at)

CALL(sys_open_by_handle_at)

CALL(sys_clock_adjtime)

CALL(sys_syncfs)

CALL(sys_sendmmsg)/* 375 */ CALL(sys_setns)

CALL(sys_process_vm_readv)

CALL(sys_process_vm_writev)

CALL(sys_kcmp)

CALL(sys_finit_module)/* 380 */ CALL(sys_sched_setattr)

CALL(sys_sched_getattr)

CALL(sys_renameat2)

CALL(sys_seccomp)

CALL(sys_getrandom)/* 385 */ CALL(sys_memfd_create)

CALL(sys_bpf)

CALL(sys_execveat)

CALL(sys_userfaultfd)

CALL(sys_membarrier)/* 390 */ CALL(sys_mlock2)

CALL(sys_copy_file_range)

CALL(sys_preadv2)

CALL(sys_pwritev2)

#ifndef syscalls_counted

.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls

#define syscalls_counted

#endif

.rept syscalls_padding

CALL(sys_ni_syscall)

.endr

      其中,CALL是一个宏,位于/arch/arm/kernel/entry-common.S,主要用来将每一个系统调用的函数地址依次放在sys_call_table表中:

.equ NR_syscalls,0

#define CALL(x) .equ NR_syscalls,NR_syscalls+1

#include "calls.S"

/*

* Ensure that the system call table is equal to __NR_syscalls,

* which is the value the rest of the system sees

*/

.ifne NR_syscalls - __NR_syscalls

.error "__NR_syscalls is not equal to the size of the syscall table"

.endif

#undef CALL

#define CALL(x) .long x

      那么具体的系统调用函数的定义在哪里呢?以本文fork库函数最终发起的clone系统调用为例,它就位于/kernel/fork.c中:

#ifdef __ARCH_WANT_SYS_CLONE

#ifdef CONFIG_CLONE_BACKWARDS

SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,

int __user *, parent_tidptr,

unsigned long, tls,

int __user *, child_tidptr)

#elif defined(CONFIG_CLONE_BACKWARDS2)

SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,

int __user *, parent_tidptr,

int __user *, child_tidptr,

unsigned long, tls)

#elif defined(CONFIG_CLONE_BACKWARDS3)

SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,

int, stack_size,

int __user *, parent_tidptr,

int __user *, child_tidptr,

unsigned long, tls)

#else

SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,

int __user *, parent_tidptr,

int __user *, child_tidptr,

unsigned long, tls)

#endif

{

return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);

}

#endif

  可以看到它根据不同的配置宏(CONFIG_CLONE_BACKWARDS等)有几种参数顺序或个数不同的定义,这是条件编译的结果,最终只会有一种生效,并不是函数重载。着重关注SYSCALL_DEFINEx(x为参数个数),它只是一个简单的宏,方便把其中的参数转换为系统调用的声明格式:

#define SYSCALL_DEFINE0(sname) \

SYSCALL_METADATA(_##sname, 0); \

asmlinkage long sys_##sname(void)

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...) \

SYSCALL_METADATA(sname, x, __VA_ARGS__) \

__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)

#define __SYSCALL_DEFINEx(x, name, ...) \

asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \

__attribute__((alias(__stringify(SyS##name)))); \

static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \

asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \

asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \

{ \

long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \

__MAP(x,__SC_TEST,__VA_ARGS__); \

__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \

return ret; \

} \

static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

    比如:

SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,

int __user *, parent_tidptr,

int __user *, child_tidptr,

unsigned long, tls)

    进行宏展开之后:

asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,

int __user *, unsigned long);

    具体的fork内部实现不是本文的重点,此处不再赘述,至此,一个完整的系统调用过程就算剖析清楚了。

结束

      系统调用是操作和理解linux内核的关键切入点,系统调用只是入口,真正的工作都是内核完成。内核中会有很多机制来完成这些工作,比如内核的驱动模块、内核的中断系统等,这些都是我们做内核开发时会经常涉及的模块。在以后的文章中,我还会写一些关于linux内核中断系统、linux内核启动过程分析、linux内核select实现与字符设备驱动程序测试,敬请期待。

给我留言

留言无头像?