系统调用

在32位操作系统中,常常使用int 0x80指令来完成系统调用处理。但是在x64中,引入了syscall指令,称为快速系统调用,其不触发中断,而是有自己的一套控制流,在用户态使用syscall指令后,CPU会执行以下动作:

  • 从CPU特殊寄存器STAR MSR中加载cs、ss寄存器
  • 将当前的rflags存入r11寄存器,并按照CPU特殊寄存器SFMASK MSR中的掩码清除rflags中对应的标志位
  • 将当前rip存入rcx,从CPU特殊寄存器LSTAR MSR中加载rip

借助快速系统调用机制,我们只需在系统初始化时向相应的MSR寄存器写入期望的值即可:向LSTAR MSR写入系统调用处理入口的地址,向SFMASK MSR写入进入内核时需要屏蔽的rflags标志位。这样用户态使用syscall指令后,会跳转到syscall_entry代码片段执行

// Address the CPU loads into rip when user code executes `syscall`.
set_msr(LSTAR_MSR, syscall_entry as _);
// RFLAGS bits listed in SFMASK: TF | IF | DF | IOPL | NT | AC (together 0x47700).
set_msr(
    SFMASK_MSR,
    0x100 | 0x200 | 0x400 | 0x3000 | 0x4000 | 0x4_0000,
);

类似于Trapframe结构,我们使用SyscallFrame结构来保存系统调用发生前用户态的上下文,且它是系统调用处理函数的参数。

/// Argument passed to the system-call handler.
///
/// It doubles as the user-mode context that is saved on the kernel stack
/// before the system-call handler is invoked.
#[derive(Debug, Default, Clone, Copy)]
#[repr(C)]
pub struct SyscallFrame {
    // Declared first, so with `#[repr(C)]` this part sits at offset 0 of the frame.
    pub caller: CallerRegs,
    // Follows `caller` in memory; the entry/return assembly relies on this order.
    pub callee: CalleeRegs,
}

系统调用入口

.global syscall_entry
syscall_entry:
  # By the time we get here the `syscall` instruction has already:
  # - loaded cs, ss from the STAR MSR
  # - r11 <- rflags, then masked rflags using the SFMASK MSR
  # - rcx <- rip, then loaded rip from the LSTAR MSR

  # Swap rsp with TSS.sp0: rsp now holds the kernel stack pointer, while
  # TSS.sp0 temporarily holds the user rsp.
  xchg rsp, [TSS + rip + 4]
  # Push the callee-saved part of the frame, highest field first.
  push r15
  push r14
  push r13
  push r12
  push rbp
  push rbx
  push [TSS + rip + 4] # store user rsp into SyscallFrame.rsp
  # `save` is a macro defined elsewhere; presumably it pushes the
  # caller-saved registers (CallerRegs) below the values above. TODO confirm.
  save
  mov rdi, rsp # first argument: pointer to the SyscallFrame at the stack top
  call syscall_handler
  mov [rsp], rax # write the handler's return value; CallerRegs.rax is at offset 0
  jmp __syscall_return

首先从TSS中取出内核栈加载rsp,并暂时将用户栈rsp存在TSS中。接下来保存调用者保存寄存器和被调用者保存寄存器,这时内核栈顶构成一个SyscallFrame结构体,将其指针作为参数,调用syscall_handler函数。这就是内核中的系统调用总控函数,其根据系统调用号分发并处理系统调用。这个函数有一个返回值,存放在rax中,将其写入SyscallFrame中。

内核系统调用分发

/// Entry point called from the `syscall_entry` assembly with a pointer to the saved frame.
///
/// Reads the system-call number from the saved `rax` and the three arguments from the
/// saved `rdi`, `rsi` and `rdx`, forwards them to [`syscall::syscall`] and returns its
/// result unchanged.
#[no_mangle]
pub extern "C" fn syscall_handler(f: &'static mut SyscallFrame) -> isize {
    let regs = &f.caller;
    let (id, args) = (regs.rax, [regs.rdi, regs.rsi, regs.rdx]);
    syscall::syscall(id, args)
}

/// 系统调用总控函数
pub fn syscall(syscall_id: usize, args: [usize; 3]) -> isize {
    let ret = match syscall_id {
        SAYSCALL_DUP => sys_dup(args[0] as _),
        SYSCALL_OPEN => sys_open(args[0] as _, args[1] as _),
        SYSCALL_CLOSE => sys_close(args[0]),
        SYSCALL_PIPE => sys_pipe(args[0] as _),
        SYSCALL_READ => sys_read(args[0], args[1] as _, args[2]),
        SYSCALL_WRITE => sys_write(args[0], args[1] as _, args[2]),
        SYSCALL_EXIT => sys_exit(args[0] as i32),
        SYSCALL_SLEEP => sys_sleep(args[0]),
        SYSCALL_YIELD => sys_yield(),
        SYSCALL_GET_TIME => *pic::TICKS as _,
        SYSCALL_GETPID => sys_getpid(),
        SYSCALL_FORK => sys_fork(),
        SYSCALL_EXEC => sys_exec(args[0] as _, args[1] as _),
        SYSCALL_WAITPID => sys_waitpid(args[0] as _, args[1] as _),
        SYSCALL_THREAD_CREATE => sys_thread_create(args[0], args[1]),
        SYSCALL_GETTID => sys_gettid(),
        SYSCALL_WAITTID => sys_waittid(args[0]),
        SYSCALL_MUTEX_CREATE => sys_mutex_create(args[0] == 1),
        SYSCALL_MUTEX_LOCK => sys_mutex_lock(args[0]),
        SYSCALL_MUTEX_UNLOCK => sys_mutex_unlock(args[0]),
        SYSCALL_SEMAPHORE_CREATE => sys_semaphore_create(args[0]),
        SYSCALL_SEMAPHORE_UP => sys_semaphore_up(args[0]),
        SYSCALL_SEMAPHORE_DOWN => sys_semaphore_down(args[0]),
        _ => {
            println!("Unsupported syscall: {}", syscall_id);
            crate::task::current().exit(-1);
        }
    };
    ret
}

rax、rdi、rsi、rdx分别为用户系统调用传入的四个参数,其中rax为系统调用号。这也解释了为什么在系统调用时我们不是像中断一样只保存被调用者保存寄存器,因为我们需要调用者保存的rax、rdi作为参数。

系统调用返回

.global syscall_return
syscall_return: # (SyscallFrame *)
  # Entered with rdi = pointer to a SyscallFrame; use that frame as the stack.
  mov rsp, rdi
__syscall_return:
  # The `sysretq` instruction at the end performs:
  # - rip <- rcx
  # - rflags <- r11
  # - cs, ss <- STAR MSR
  
  lea rax, [rsp + 128] # prepare new TSS.sp0, 128 = sizeof(SyscallFrame)
  mov [TSS + rip + 4], rax
  # `restore` is a macro defined elsewhere; presumably it pops the caller-saved
  # registers so that rsp then points at the callee-saved part. TODO confirm.
  restore
  # Reload the callee-saved registers from their slots in the frame.
  mov rbx, [rsp + 8]
  mov rbp, [rsp + 16]
  mov r12, [rsp + 24]
  mov r13, [rsp + 32]
  mov r14, [rsp + 40]
  mov r15, [rsp + 48]
  # Offset 0 holds the saved user rsp; it is loaded last because every load
  # above is addressed relative to the kernel rsp.
  mov rsp, [rsp + 0]
  sysretq

系统调用返回时,恢复TSS中存放的内核栈为之前的地址,从SyscallFrame中恢复通用寄存器,最后使用sysretq指令返回用户态,这条指令相当于syscall指令的逆过程。

用户态syscall

/// Issues a system call from user mode and returns the kernel's result.
///
/// `id` is placed in `rax` as the system-call number; `arg0`, `arg1` and `arg2` are
/// placed in `rdi`, `rsi` and `rdx`. The value left in `rax` after the `syscall`
/// instruction is returned.
#[inline(always)]
fn syscall(id: usize, arg0: usize, arg1: usize, arg2: usize) -> isize {
    let result: isize;
    // SAFETY: the only instruction executed is `syscall`. `rcx` and `r11` are declared
    // as clobbered because the instruction overwrites them with the return rip and the
    // saved rflags. NOTE(review): all other registers are assumed to be preserved by
    // the kernel's entry/return path — confirm.
    unsafe {
        asm!(
            "syscall",
            inlateout("rax") id => result,
            in("rdi") arg0,
            in("rsi") arg1,
            in("rdx") arg2,
            out("rcx") _,
            out("r11") _,
        );
    }
    result
}

用户态使用内联汇编来实现系统调用,将rax、rdi、rsi、rdx作为参数,rax存放系统调用号,最终返回值存放在rax中。用户态使用不同系统调用的方法如下:

/// User-side wrapper for the `SYSCALL_DUP` system call.
///
/// Passes `fd` as the first argument and returns the kernel's raw result.
pub fn sys_dup(fd: usize) -> isize {
    // Only one argument is meaningful; the remaining slots are zero-filled.
    let unused: usize = 0;
    syscall(SYSCALL_DUP, fd, unused, unused)
}

/// User-side wrapper for the `SYSCALL_OPEN` system call.
///
/// Passes the address of `path`'s first byte and `flags` to the kernel and returns the
/// kernel's raw result.
///
/// NOTE(review): only the pointer is passed, not the length — presumably the kernel
/// expects the caller to supply a NUL-terminated path; confirm against the kernel side.
pub fn sys_open(path: &str, flags: u32) -> isize {
    let path_addr = path.as_ptr() as usize;
    let flag_bits = flags as usize;
    syscall(SYSCALL_OPEN, path_addr, flag_bits, 0)
}
...