//go:build linux // sandbox_linux.go v1.5 package sandbox import ( _ "embed" "encoding/binary" "encoding/json" "errors" "fmt" "io" "os" "os/exec" "os/user" "path/filepath" "strconv" "strings" "syscall" "time" "apigo.cc/gojs" "github.com/ssgo/log" "github.com/ssgo/u" ) //go:embed init.gz var initGz []byte var initExec []byte func init() { initExec = u.GunzipN(initGz) } var defaultVolumes = []Volume{ {Source: "/usr", Target: "/usr", ReadOnly: true}, {Source: "/lib", Target: "/lib", ReadOnly: true}, {Source: "/lib64", Target: "/lib64", ReadOnly: true}, {Source: "/bin", Target: "/bin", ReadOnly: true}, {Source: "/sbin", Target: "/sbin", ReadOnly: true}, {Source: "/etc/resolv.conf", Target: "/etc/resolv.conf", ReadOnly: true}, {Source: "/etc/hosts", Target: "/etc/hosts", ReadOnly: true}, {Source: "/etc/localtime", Target: "/etc/localtime", ReadOnly: true}, {Source: "/dev/null", Target: "/dev/null", ReadOnly: false}, {Source: "/dev/urandom", Target: "/dev/urandom", ReadOnly: true}, {Source: "/dev/zero", Target: "/dev/zero", ReadOnly: true}, } func Start(cfg *Config) (*Sandbox, error) { s := &Sandbox{ config: cfg, mountedList: []string{}, status: "created", extra: map[string]any{}, } s.lock.Lock() defer s.lock.Unlock() if s.status != "created" { return nil, gojs.Err("sandbox is already used") } if s.config.WorkDir == "" { s.config.WorkDir = "/app" } if os.Getuid() == 0 { if user, err := user.Lookup("nobody"); err == nil { s.uid = u.Int(user.Uid) s.gid = u.Int(user.Gid) } else { s.uid = 65534 s.gid = 65534 } } initEnv := map[string]string{ "LANG": os.Getenv("LANG"), "LC_TIME": os.Getenv("LC_TIME"), "PATH": filepath.Join(s.config.WorkDir, "bin") + ":/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "HOME": s.config.WorkDir, "LD_LIBRARY_PATH": os.Getenv("LD_LIBRARY_PATH"), } if initEnv["LC_TIME"] == "" { initEnv["LC_TIME"] = "C" } if initEnv["LANG"] == "" { initEnv["LANG"] = "C.UTF-8" } addEnv := func(k, v string) { initEnv[k] = os.Expand(v, func(varName string) string { return initEnv[varName] }) } s.status = "starting" s.startTime = time.Now().Unix() s.id = getId() s.root = filepath.Join(pluginConfig.Root, s.id) // 1. 构建物理环境 (对应原 mountAll 逻辑) initPaths := []string{s.config.WorkDir, "/var/log", "/etc", "/proc", "/sys"} os.MkdirAll(s.root, 0755) if s.uid != 0 { os.Chown(s.root, s.uid, s.gid) } for _, p := range initPaths { path := filepath.Join(s.root, p) os.MkdirAll(path, 0755) if s.uid != 0 { os.Chown(path, s.uid, s.gid) } } // 2. 执行挂载逻辑 (严格保留 v20 的 bind mount + remount ro 逻辑) vs := NewVolumes() vs.Add(defaultVolumes...) vs.Add(s.config.Volumes...) if s.config.Limits.Shm > 0 { vs.Add(Volume{Source: "tmpfs", Target: "/dev/shm"}) } if s.config.Limits.Tmp > 0 { vs.Add(Volume{Source: "tmpfs", Target: "/tmp"}) } // 处理 Gpu 透传 if len(s.config.Gpu.Devices) > 0 { gpuDevices := []string{} driver := strings.ToLower(s.config.Gpu.Driver) switch driver { case "nvidia": // 核心管理与工具设备 gpuDevices = append(gpuDevices, "/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools", "/dev/nvidia-modeset") // 具体显卡设备 if len(s.config.Gpu.Devices) > 0 && s.config.Gpu.Devices[0] == "all" { matches, _ := filepath.Glob("/dev/nvidia[0-9]*") gpuDevices = append(gpuDevices, matches...) } else { for _, devId := range s.config.Gpu.Devices { gpuDevices = append(gpuDevices, "/dev/nvidia"+devId) } } // 设置环境变量 addEnv("CUDA_HOME", os.Getenv("CUDA_HOME")) addEnv("NVIDIA_DRIVER_CAPABILITIES", strings.Join(s.config.Gpu.Devices, ",")) case "amd", "render": // AMD GPU 或通用的 DRM 渲染设备 (Intel 也可以走此逻辑) // DRI (Direct Rendering Infrastructure) 设备 gpuDevices = append(gpuDevices, "/dev/kfd") // AMD 核心调度器 if len(s.config.Gpu.Devices) > 0 && s.config.Gpu.Devices[0] == "all" { matches, _ := filepath.Glob("/dev/dri/renderD*") gpuDevices = append(gpuDevices, matches...) matchesCard, _ := filepath.Glob("/dev/dri/card*") gpuDevices = append(gpuDevices, matchesCard...) } else { for _, devId := range s.config.Gpu.Devices { // 常见的 ID 如 128 (renderD128) gpuDevices = append(gpuDevices, "/dev/dri/renderD"+devId) } } addEnv("ROCR_VISIBLE_DEVICES", strings.Join(s.config.Gpu.Devices, ",")) // 对于某些旧版 OpenCL addEnv("GPU_DEVICE_ORDINAL", strings.Join(s.config.Gpu.Devices, ",")) case "intel": // Intel 特有的设备节点通常就在 /dev/dri 目录下 if len(s.config.Gpu.Devices) > 0 && s.config.Gpu.Devices[0] == "all" { matches, _ := filepath.Glob("/dev/dri/*") gpuDevices = append(gpuDevices, matches...) } else { for _, devId := range s.config.Gpu.Devices { gpuDevices = append(gpuDevices, "/dev/dri/renderD"+devId) } } addEnv("ONEAPI_DEVICE_SELECTOR", "level_zero:"+strings.Join(s.config.Gpu.Devices, ",")) } // 统一执行设备挂载 for _, dev := range gpuDevices { if u.FileExists(dev) { vs.Add(Volume{Source: dev, Target: dev}) } } } // 处理 ProjectDir 自动挂载 if s.config.ProjectDir != "" { s.workDir = u.GetAbsFilename(s.config.ProjectDir) } else { // 使用临时目录 s.workDir = filepath.Join(s.root, ".workdir") } if !u.FileExists(s.workDir) { os.MkdirAll(s.workDir, 0755) if s.uid != 0 { os.Chown(s.workDir, s.uid, s.gid) } } vs.Add(Volume{Source: s.workDir, Target: s.config.WorkDir}) // 处理 runtime runtimeListLock.RLock() rt := runtimeList[s.config.Runtime.Language] runtimeListLock.RUnlock() if rt != nil { rtCfg := pluginConfig.Runtime[s.config.Runtime.Language] if rtCfg == nil { rtCfg = &RuntimeConfig{} } if rtCfg.Root == "" { rtCfg.Root = filepath.Join(pluginConfig.Root, s.config.Runtime.Language) } runtimePath := filepath.Join(rtCfg.Root, "runtime", s.config.Runtime.Version) venvPath := filepath.Join(rtCfg.Root, "venv", fmt.Sprintf("%s_%s", s.config.Runtime.Version, s.config.Runtime.Venv)) if rtsbCfg, err := rt.Check(runtimePath, venvPath, s.config.ProjectDir, s.uid, s.gid, rtCfg); err == nil { if s.config.StartCmd == "" && rtsbCfg.StartCmd != "" { s.config.StartCmd = rtsbCfg.StartCmd } for k, v := range rtsbCfg.Envs { addEnv(k, v) } // 挂载运行时环境 if u.FileExists(runtimePath) { vs.Add(Volume{Source: runtimePath, Target: runtimePath, ReadOnly: true}) } if u.FileExists(venvPath) { vs.Add(Volume{Source: venvPath, Target: venvPath, ReadOnly: false}) } } else { s._cleanup() return nil, err } } else { s._cleanup() return nil, gojs.Err("runtime not found for language " + s.config.Runtime.Language) } if s.config.StartCmd == "" { s._cleanup() return nil, gojs.Err("start cmd is empty") } for k, v := range s.config.Envs { addEnv(k, v) } for _, v := range vs.Get() { from, to, mode := v.Source, v.Target, "rw" if v.ReadOnly { mode = "ro" } toPath := filepath.Join(s.root, to) fi, err := os.Lstat(from) if err == nil && fi.Mode()&os.ModeSymlink != 0 { // 处理符号链接 linkTarget, _ := os.Readlink(from) os.Symlink(linkTarget, toPath) } else { // 创建挂载点 if err != nil || fi.IsDir() { // 创建文件夹(如果err!=nil表示这是特殊文件夹例如/proc所以也要创建文件夹) os.MkdirAll(toPath, 0755) if s.uid != 0 { os.Chown(toPath, s.uid, s.gid) } } else { // 写入空文件(u.WriteFile会自动创建文件夹) u.WriteFile(toPath, "") if s.uid != 0 { os.Chown(toPath, s.uid, s.gid) } } // 挂载操作 var flags uintptr = syscall.MS_BIND | syscall.MS_REC fsType, opt := "", "" if from == "tmpfs" { fsType = "tmpfs" flags = 0 // 根据配置处理 shm/tmp 大小 if to == "/dev/shm" && s.config.Limits.Shm > 0 { opt = "size=" + strconv.FormatUint(uint64(s.config.Limits.Shm), 10) + "m" } else if to == "/tmp" && s.config.Limits.Tmp > 0 { opt = "size=" + strconv.FormatUint(uint64(s.config.Limits.Tmp), 10) + "m" } } err = syscall.Mount(from, toPath, fsType, flags, opt) if mode == "ro" && err == nil && fsType == "" { // bind模式下ReadOnly需要二次挂载才能实现 syscall.Mount("", toPath, "", flags|syscall.MS_REMOUNT|syscall.MS_RDONLY, "") } else if s.uid != 0 && mode != "ro" && fsType == "" && !strings.HasPrefix(to, "/dev") { // 可写的普通挂载,设置为用户权限 os.Lchown(toPath, s.uid, s.gid) } s.assertLog(fmt.Sprintf("Mount %s to %s", from, toPath), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir, "readonly", v.ReadOnly) if err == nil { s.mountedList = append(s.mountedList, toPath) } } } // 容器内会自动挂载/proc和/sys,需要在结束时自动卸载 s.mountedList = append(s.mountedList, filepath.Join(s.root, "/proc"), filepath.Join(s.root, "/sys")) // 3. Cgroup v2 资源限制 cgPath := "/sys/fs/cgroup/" + s.id enableCgroup := false if s.config.Limits.Mem >= 0.0001 { err := u.WriteFile(filepath.Join(cgPath, "memory.max"), u.String(int64(s.config.Limits.Mem*1024*1024*1024))) s.assertLog(fmt.Sprintf("Write %s to %s", u.String(s.config.Limits.Mem*1024*1024*1024), filepath.Join(cgPath, "memory.max")), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) if err == nil { enableCgroup = true } } if s.config.Limits.Swap >= 0.0001 || s.config.Limits.Mem >= 0.0001 { err := u.WriteFile(filepath.Join(cgPath, "memory.swap.max"), u.String(int64(s.config.Limits.Swap*1024*1024*1024))) s.assertLog(fmt.Sprintf("Write %s to %s", u.String(s.config.Limits.Swap*1024*1024*1024), filepath.Join(cgPath, "memory.swap.max")), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) if err == nil { enableCgroup = true } } if s.config.Limits.Cpu >= 0.0001 { err := u.WriteFile(filepath.Join(cgPath, "cpu.max"), fmt.Sprintf("%d 100000", int(s.config.Limits.Cpu*100000))) s.assertLog(fmt.Sprintf("Write %s to %s", fmt.Sprintf("%d 100000", int(s.config.Limits.Cpu*100000)), filepath.Join(cgPath, "cpu.max")), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) if err == nil { enableCgroup = true } } if (s.config.Limits.Mem >= 0.0001 || s.config.Limits.Swap >= 0.0001 || s.config.Limits.Cpu >= 0.0001) && !enableCgroup { // 至少有一个资源限制才会启用 Cgroup v2,不支持非 root 或 docker 环境 log.DefaultLogger.Warning("[Sandbox] cgroup v2 resource limit not enabled, please check is running as root or in docker?", "id", s.id, "name", s.config.Name, "workDir", s.workDir, "mem", s.config.Limits.Mem, "swap", s.config.Limits.Swap, "cpu", s.config.Limits.Cpu) } // 4. 部署引导程序 initFile := filepath.Join(s.root, "/init") os.WriteFile(initFile, initExec, 0755) // 启动引导进程 (注意:此时不再在 Go 层设置 Chroot,交由 init 处理) netEnabled := uint8(0) cloneFlags := syscall.CLONE_NEWNS | syscall.CLONE_NEWPID | syscall.CLONE_NEWUTS if !s.config.Network.AllowInternet && !s.config.Network.AllowLocalNetwork && len(s.config.Network.AllowListen) == 0 && len(s.config.Network.AllowList) == 0 { cloneFlags |= syscall.CLONE_NEWNET } else { netEnabled = 1 } cmdSysProcAttr := &syscall.SysProcAttr{ Chroot: s.root, // Go 依然需要先将进程禁锢在 root 目录下 Cloneflags: uintptr(cloneFlags), } // 设置环境变量(支持变量引用) cmdEnv := make([]string, 0, len(initEnv)) for k, v := range initEnv { cmdEnv = append(cmdEnv, fmt.Sprintf("%s=%s", k, v)) } // 启动 init 进程 var cmdStdin io.WriteCloser for i := 0; i < 10; i++ { time.Sleep(time.Millisecond) s.cmd = exec.Command("/init") s.cmd.SysProcAttr = cmdSysProcAttr s.cmd.Env = cmdEnv stdin, err := s.cmd.StdinPipe() if err != nil { s._cleanup() return nil, err } s.cmd.Stdout = os.Stdout s.cmd.Stderr = os.Stderr if err := s.cmd.Start(); err != nil { stdin.Close() if !errors.Is(err, syscall.ETXTBSY) { s._cleanup() return nil, err } if i >= 3 { s.log("init ETXTBSY", "retry", i) } } else { cmdStdin = stdin break } } if cmdStdin == nil { s._cleanup() return nil, fmt.Errorf("init start failed") } err := os.Remove(initFile) s.assertLog("remove init file", err, "id", s.id, "name", s.config.Name, "initFile", initFile) s.pid = s.cmd.Process.Pid // 6. 完成后续状态记录 if enableCgroup { err := os.WriteFile(filepath.Join(cgPath, "cgroup.procs"), []byte(strconv.Itoa(s.pid)), 0644) s.assertLog(fmt.Sprintf("Write %s to %s", strconv.Itoa(s.pid), filepath.Join(cgPath, "cgroup.procs")), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) } // 检查Cgroup // fmt.Println(u.BMagenta(filepath.Join(cgPath, "cpu.max")), u.ReadFileN(filepath.Join(cgPath, "cpu.max"))) // fmt.Println(u.BMagenta(filepath.Join(cgPath, "memory.max")), u.ReadFileN(filepath.Join(cgPath, "memory.max"))) // fmt.Println(u.BMagenta(filepath.Join(cgPath, "memory.swap.max")), u.ReadFileN(filepath.Join(cgPath, "memory.swap.max"))) // fmt.Println(u.BMagenta(filepath.Join(cgPath, "cgroup.procs")), u.ReadFileN(filepath.Join(cgPath, "cgroup.procs"))) // 7. 按照协议发送配置信息 (LittleEndian, 匹配 C 端的 uint32_t) // 写入 UID & GID binary.Write(cmdStdin, binary.LittleEndian, uint32(s.uid)) binary.Write(cmdStdin, binary.LittleEndian, uint32(s.gid)) // 写入 WorkDir binary.Write(cmdStdin, binary.LittleEndian, uint32(len(s.config.WorkDir))) cmdStdin.Write([]byte(s.config.WorkDir)) // 写入 Args allArgs := append([]string{s.config.StartCmd}, s.config.StartArgs...) binary.Write(cmdStdin, binary.LittleEndian, uint32(len(allArgs))) for _, arg := range allArgs { binary.Write(cmdStdin, binary.LittleEndian, uint32(len(arg))) cmdStdin.Write([]byte(arg)) } // 写入网络配置总开关 binary.Write(cmdStdin, binary.LittleEndian, netEnabled) if netEnabled == 1 { // 1. 基础开关 (各1字节) binary.Write(cmdStdin, binary.LittleEndian, u.If(s.config.Network.AllowInternet, uint8(1), uint8(0))) binary.Write(cmdStdin, binary.LittleEndian, u.If(s.config.Network.AllowLocalNetwork, uint8(1), uint8(0))) // 2. AllowListen (端口列表) binary.Write(cmdStdin, binary.LittleEndian, uint32(len(s.config.Network.AllowListen))) for _, port := range s.config.Network.AllowListen { binary.Write(cmdStdin, binary.LittleEndian, uint32(port)) } // 3. 解析并发送 AllowList allowRules := []NetRule{} for _, s := range s.config.Network.AllowList { if r, err := ParseNetRule(s); err == nil { allowRules = append(allowRules, *r) } } binary.Write(cmdStdin, binary.LittleEndian, uint32(len(allowRules))) for _, r := range allowRules { binary.Write(cmdStdin, binary.LittleEndian, r) } // 4. 解析并发送 BlockList blockRules := []NetRule{} for _, s := range s.config.Network.BlockList { if r, err := ParseNetRule(s); err == nil { blockRules = append(blockRules, *r) } } binary.Write(cmdStdin, binary.LittleEndian, uint32(len(blockRules))) for _, r := range blockRules { binary.Write(cmdStdin, binary.LittleEndian, r) } } // 必须主动关闭,触发 EOF 并释放 fd cmdStdin.Close() s.status = "running" u.Save(filepath.Join(s.root, ".state.json"), State{ Id: s.id, Pid: s.pid, StartTime: s.startTime, WorkDir: s.workDir, MountedList: s.mountedList, }) RegisterSandbox(s) return s, nil } func (s *Sandbox) Kill() error { s.lock.Lock() defer s.lock.Unlock() if s.status != "running" && s.status != "starting" { return gojs.Err("sandbox not running or starting") } s.status = "stopping" return s._kill(syscall.SIGTERM) } func (s *Sandbox) Wait(timeout int64) (any, error) { s.lock.Lock() defer s.lock.Unlock() if s.status != "running" && s.status != "stopping" { return nil, gojs.Err("sandbox not running or stopping") } if s._alive() { ch := make(chan error, 1) go func() { state, err := s._waitProcess() if err == nil && state != nil { s.status = "exited" } ch <- err }() select { case _ = <-ch: break case <-time.After(time.Duration(timeout) * time.Millisecond): s._kill(syscall.SIGKILL) break } } return s._cleanup() } func (s *Sandbox) Cleanup() (any, error) { s.lock.Lock() defer s.lock.Unlock() return s._cleanup() } func (s *Sandbox) _cleanup() (any, error) { s.log("cleaning up sandbox", "id", s.id, "name", s.config.Name, "workDir", s.workDir) cgPath := "/sys/fs/cgroup/" + s.id if u.FileExists(cgPath) { killPath := filepath.Join(cgPath, "cgroup.kill") if _, err := os.Stat(killPath); err == nil { _ = os.WriteFile(killPath, []byte("1"), 0644) time.Sleep(10 * time.Millisecond) // 稍等片刻让内核处理完毕 } err := os.RemoveAll(cgPath) s.assertLog(fmt.Sprintf("Remove Cgroup %s", cgPath), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) } for i := len(s.mountedList) - 1; i >= 0; i-- { err := syscall.Unmount(s.mountedList[i], syscall.MNT_DETACH) // 忽略 /proc 和 /sys 挂载失败提示 if err == nil || (!strings.HasSuffix(s.mountedList[i], "/proc") && !strings.HasSuffix(s.mountedList[i], "/sys")) { s.assertLog(fmt.Sprintf("Unmount %s", s.mountedList[i]), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) } } _, err := copyLog(s.workDir, s.startTime) s.assertLog("Copy log to logs directory", err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) outLog := u.ReadFileN(filepath.Join(s.workDir, "stdout.log")) errLog := u.ReadFileN(filepath.Join(s.workDir, "stderr.log")) err = os.RemoveAll(s.root) s.assertLog(fmt.Sprintf("Remove Sandbox %s", s.root), err, "id", s.id, "name", s.config.Name, "workDir", s.workDir) releaseId(s.id) ReleaseSandbox(s.id) var outData any = outLog if strings.HasPrefix(outLog, "{") && strings.HasSuffix(outLog, "}") || strings.HasPrefix(outLog, "[") && strings.HasSuffix(outLog, "]") { var data map[string]any if err := json.Unmarshal([]byte(outLog), &data); err == nil { outData = data } } if errLog != "" { return outData, errors.New(errLog) } return outData, nil } // 建议在 struct 中增加字段记录上次采样,或者由调用方维护 // 这里提供一个逻辑更健壮的单次查询版本 func (s *Sandbox) Status() Status { st := Status{ Id: s.id, Pid: s.pid, Alive: s._alive(), Status: s.status, StartTime: s.startTime, } // 统一处理 Uptime,避免除零错误 now := time.Now().Unix() st.Uptime = now - s.startTime if st.Uptime <= 0 { st.Uptime = 1 } cgPath := "/sys/fs/cgroup/" + s.id hasCgroup := u.FileExists(cgPath) // 1. 内存统计 if hasCgroup { if data, err := os.ReadFile(filepath.Join(cgPath, "memory.current")); err == nil { usage, _ := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) st.MemoryUsage = uint(usage / 1024 / 1024) } } else { // 非 Cgroup 模式下,读取 VmRSS (物理内存占用),比 statm 更直观 if data, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", st.Pid)); err == nil { lines := strings.Split(string(data), "\n") for _, line := range lines { if strings.HasPrefix(line, "VmRSS:") { fields := strings.Fields(line) if len(fields) >= 2 { val, _ := strconv.ParseUint(fields[1], 10, 64) st.MemoryUsage = uint(val / 1024) // status 里通常是 KB } break } } } } // 2. CPU 统计 if hasCgroup { if data, err := os.ReadFile(filepath.Join(cgPath, "cpu.stat")); err == nil { for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "usage_usec") { parts := strings.Fields(line) if len(parts) >= 2 { usageUsec, _ := strconv.ParseFloat(parts[1], 64) // 计算全生命周期平均负载 // 如果要瞬时负载,需在外部存储上次的 usageUsec 和时间戳做 delta st.CpuUsage = (usageUsec / (float64(st.Uptime) * 1000000.0)) * 100 } break } } } } else { if data, err := os.ReadFile(fmt.Sprintf("/proc/%d/stat", st.Pid)); err == nil { fields := strings.Fields(string(data)) if len(fields) > 14 { // utime(14) + stime(15) utime, _ := strconv.ParseFloat(fields[13], 64) stime, _ := strconv.ParseFloat(fields[14], 64) // 这里的 100 是单核系数,多核环境下建议除以 runtime.NumCPU() totalSec := (utime + stime) / 100.0 st.CpuUsage = (totalSec / float64(st.Uptime)) * 100 } } } // 修正:如果 CPU 计算结果超过 100% (多核情况),保持原样展示或按需缩放 return st }