diff --git a/examples/dummy.nix b/examples/dummy.nix
index 4689af1..7d8c813 100644
--- a/examples/dummy.nix
+++ b/examples/dummy.nix
@@ -13,6 +13,8 @@
 
   system.stateVersion = "25.11";
 
+  networking.hostName = "dummy";
+
   vmapps.enable = true;
   _module.args.inputs = import ../npins;
 
diff --git a/pkgs/taps/main.c b/pkgs/taps/main.c
index 05edb64..3b276df 100644
--- a/pkgs/taps/main.c
+++ b/pkgs/taps/main.c
@@ -1,5 +1,6 @@
 #define _GNU_SOURCE
 
+#include
 #include /* secure_getenv */
 #include
 #include
@@ -15,10 +16,12 @@
 #include
 #include
 #include
+#include
 
 #define __UAPI_DEF_IF_IFNAMSIZ 1
 #include
 #include
+#include
 
 #include "sendfd.h"
 
@@ -108,6 +111,11 @@ int tuntap_alloc(char *dev, short openFlags, short ifrFlags, int *out_fd) {
   strncpy(dev, ifr.ifr_name, IFNAMSIZ);
 
   *out_fd = fd;
+  /* Set the vnet header size on the new fd to sizeof(struct virtio_net_hdr_v1). */
+  {
+    int sz = sizeof(struct virtio_net_hdr_v1);
+    DO_OR_DIE(ioctl(fd, TUNSETVNETHDRSZ, &sz));
+  }
   return 0;
 }
 
@@ -257,6 +265,13 @@ void cleanup(int signo, siginfo_t *info, void *_context) {
   errx(EXIT_FAILURE, "Exiting with signal %d", signo);
 }
 
+/* uncoe(): clear FD_CLOEXEC on fd; returns 0 if nothing to do, else fcntl()'s result (<0 on error). From skalibs, ISC. */
+int uncoe (int fd)
+{
+  int flags = fcntl(fd, F_GETFD, 0) ;
+  return flags < 0 ? flags : flags & FD_CLOEXEC ? fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC) : 0 ;
+}
+
 int main(int argc, char **argv) {
   struct sigaction act = { 0 };
   act.sa_flags = SA_SIGINFO;
@@ -266,7 +281,7 @@ int main(int argc, char **argv) {
   bool cmdServe = false;
   bool cmdPass = false;
 
-  char *ifname = "vt%d";
+  char *ifname = "vt-%d";
 
   char **rest = argv + 1;
   char **end = argv + argc;
@@ -303,7 +318,7 @@ int main(int argc, char **argv) {
 
   const char *servePath = secure_getenv("TAPS_SOCK");
   if (servePath == NULL) {
-    servePath = "taps.sock";
+    servePath = "/run/taps/taps.sock";
   }
 
   if (cmdServe) {
@@ -317,6 +332,7 @@ int main(int argc, char **argv) {
       close(fd);
       fd = 3;
     }
+    DO_OR_DIE(uncoe(fd)); /* fd must stay open across execvp(); do not exec without it */
     DO_OR_DIE(execvp(nextArgv[0], nextArgv));
   } else {
     error(EINVAL, EINVAL, "subcommand args");
diff --git a/profiles/ch-runner.nix b/profiles/ch-runner.nix
index fbb09a4..f156705 100644
--- a/profiles/ch-runner.nix
+++ b/profiles/ch-runner.nix
@@ -10,21 +10,112 @@
 
 let
   cfg = config.uvms.cloud-hypervisor;
+
+  inherit (config.networking) hostName;
   inherit (config.debug.closure.erofs) layers;
+  inherit (lib)
+    mkOption
+    types
+    concatMapStringsSep
+    getExe
+    getExe'
+    getBin
+    ;
+  # cloud-hypervisor rebuilt as an unstripped debug build; used by the runner and the debugger test.
+  package = pkgs.cloud-hypervisor.overrideAttrs (oldAttrs: {
+    patches = oldAttrs.patches or [ ] ++ [
+      # ../patches/ch.patch
+    ];
+    buildType = "debug";
+    dontStrip = true;
+  });
+  uvmsPkgs = pkgs.callPackage ../pkgs { };
+
+  chSettingsFile = (pkgs.formats.json { }).generate "vm.json" cfg.settings;
+
+  uvmPrefix = "\${HOME}/uvms/${hostName}";
+  vmmSock = "${uvmPrefix}/vmm.sock";
+  elbPrefix = "${lib.getBin pkgs.execline}/bin";
+  s6Prefix = "${lib.getBin pkgs.s6}/bin";
+  writeElb = name: text: writeElb' name "-W" text;
+  writeElb' =
+    name: elArgs: text: # elArgs: execlineb flags for the shebang line, or null for none
+    pkgs.writeTextFile {
+      inherit name;
+      destination = "/bin/${name}";
+      executable = true;
+      text = ''
+        #!${getExe' pkgs.execline "execlineb"}${lib.optionalString (elArgs != null) " ${elArgs}"}
+        importas OLDPATH PATH
+        export PATH "${elbPrefix}:${s6Prefix}:''${OLDPATH}"
+        ${text}
+      '';
+    };
 in
 {
   options = {
     uvms.cloud-hypervisor.enable = lib.mkEnableOption "Configure guest (e.g. fileSystems)";
-    uvms.cloud-hypervisor.runner = lib.mkOption {
-      type = lib.types.package;
+    uvms.cloud-hypervisor.runner = mkOption {
+      type = types.package;
       description = "A naive script for running this system in cloud-hypervisor";
     };
-    uvms.cloud-hypervisor.extraArgv = lib.mkOption {
-      type = lib.types.listOf lib.types.str;
-      default = [ ];
+    uvms.cloud-hypervisor.debugger = mkOption {
+      type = types.lazyAttrsOf types.anything;
+      description = "Same but you can debug the kernel";
     };
-    uvms.cloud-hypervisor.argv = lib.mkOption {
-      type = lib.types.listOf lib.types.str;
+    uvms.cloud-hypervisor.settingsFile = mkOption {
+      type = types.package;
+      default = chSettingsFile;
+      defaultText = "...";
+      readOnly = true;
+    };
+    uvms.cloud-hypervisor.settings = mkOption {
+      default = { };
+      type = types.submodule {
+        freeformType = (pkgs.formats.json { }).type;
+        options = {
+          payload = {
+            cmdline = mkOption { type = types.str; };
+            kernel = mkOption { type = types.str; };
+            initramfs = mkOption {
+              type = types.str;
+              default = "${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}";
+            };
+          };
+          vsock = {
+            cid = mkOption {
+              type = types.int;
+              default = 4;
+            };
+            socket = mkOption {
+              type = types.str;
+              default = "vsock.sock";
+            };
+          };
+          "api-socket" = mkOption {
+            type = types.str;
+            default = "vmm.sock";
+          };
+          "serial".mode = mkOption {
+            type = types.str;
+            default = "File";
+          };
+          "serial".file = mkOption {
+            type = types.nullOr types.str;
+            default = "serial";
+          };
+          "console".mode = mkOption {
+            type = types.str;
+            default = "Pty";
+          };
+          "console".file = mkOption {
+            type = types.nullOr types.str;
+            default = null;
+          };
+          # "watchdog" = true;
+          # "seccomp" = true;
+        };
+      };
     };
     uvms.cloud-hypervisor.extraCmdline = lib.mkOption {
       type = lib.types.listOf lib.types.str;
@@ -45,34 +136,539 @@ in
   };
   config = lib.mkMerge [
     {
-      uvms.cloud-hypervisor.argv = lib.mkBefore (
-        [
-          (lib.getExe pkgs.cloud-hypervisor)
-          "--cmdline=${lib.concatStringsSep " " cfg.cmdline}"
-          "--kernel=${config.boot.kernelPackages.kernel}/${pkgs.stdenv.hostPlatform.linux-kernel.target}"
-          "--initramfs=${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}"
-          "--vsock=cid=4,socket=vsock.sock"
-          "--api-socket=vmm.sock"
-          "--serial=tty"
-          "--console=null"
-          "--watchdog"
-          "--seccomp=true"
-        ]
-        ++ cfg.extraArgv
-      );
-      uvms.cloud-hypervisor.runner = pkgs.writeShellScriptBin "run-${config.networking.hostName}" ''
-        set -euo pipefail
-        GUESTNAME=${config.networking.hostName}
-        args=(
-        ${lib.concatMapStringsSep "\n" lib.escapeShellArg cfg.argv}
-        )
-        mkdir -p "$HOME/uvms/$GUESTNAME"
-        cd "$HOME/uvms/$GUESTNAME"
-        cleanup() {
-          rm "$HOME/uvms/$GUESTNAME"/{vmm,vsock}.sock
+      uvms.cloud-hypervisor.settings = {
+        payload = {
+          cmdline = lib.concatStringsSep " " cfg.cmdline;
+          kernel = "${config.boot.kernelPackages.kernel}/${pkgs.stdenv.hostPlatform.linux-kernel.target}";
+        };
+        disks = map (img: {
+          path = img;
+          readonly = true;
+          id = toString img.label;
+        }) layers;
+        memory = {
+          size = 1536 * 1048576;
+          shared = true;
+          mergeable = true;
+          # hotplugged_size = 512 * 1048576;
+          # hotplug_size = 1536 * 1048576;
+          # hotplug_method = "virtio-mem";
+        };
+        cpus = {
+          boot_vcpus = 4;
+          max_vcpus = 4;
+        };
+      };
+
+      uvms.cloud-hypervisor.debugger = pkgs.testers.runNixOSTest (
+        { config, ... }:
+        {
+          name = "test-run-${hostName}";
+          passthru = rec {
+            inherit (config.nodes.machine.system.build) gdbScript;
+            inherit (config.nodes.machine.boot.kernelPackages) kernel;
+            kernelSrc = pkgs.srcOnly kernel;
+          };
+          nodes.machine =
+            { config, ... }:
+            let
+              kernel = config.boot.kernelPackages.kernel;
+              kernelSrc = pkgs.srcOnly kernel;
+              gdbScript = writeElb "attach-gdb" ''
+                if { rm -rf /tmp/gdb }
+                if { mkdir -p /tmp/gdb/kos }
+                cd /tmp/gdb
+                if {
+                  elglob -0 files ${kernelSrc}/*
+                  forx -E f { $files }
+                  ln -s $f ./
+                }
+                if { mkdir -p build }
+                cd build
+                if {
+                  forx -E pattern {
+                    ${kernel.modules}/lib/modules/*/kernel/drivers/net/tun*
+                    ${kernel.modules}/lib/modules/*/kernel/drivers/net/tap*
+                  }
+                  elglob -0 files $pattern
+                  forx -E f { $files }
+                  if { cp $f . }
+                  backtick -E COMPRESSED { basename $f }
+                  xz -d $COMPRESSED
+                }
+                elglob -0 GDB_SCRIPT_DIR ${lib.getDev kernel}/lib/modules/*/build/scripts/gdb
+                if {
+                  if { cp -r --no-preserve=all $GDB_SCRIPT_DIR gdb_scripts }
+                  mv gdb_scripts/linux/constants.py.in gdb_scripts/linux/constants.py
+                }
+                ${getExe pkgs.gdb}
+                  -ex "python import sys; sys.path.insert(0, \"''${GDB_SCRIPT_DIR}\")"
+                  -ex "target remote :1234"
+                  -ex "source ''${GDB_SCRIPT_DIR}/vmlinux-gdb.py"
+                  -ex "lx-symbols"
+                  ${kernel.dev}/vmlinux
+              '';
+            in
+            {
+              boot.kernelPackages = pkgs.linuxPackagesFor (
+                (pkgs.linux.override (oldArgs: {
+                  # extraMakeFlags = oldArgs.extraMakeFlags or [ ] ++ [
+                  #   "scripts_gdb"
+                  # ];
+                  kernelPatches = oldArgs.kernelPatches or [ ] ++ [
+                    {
+                      name = "debug";
+                      patch = null;
+                      structuredExtraConfig = {
+                        GDB_SCRIPTS = lib.kernel.yes;
+                        DEBUG_INFO = lib.kernel.yes;
+                        DEBUG_INFO_REDUCED = lib.kernel.no;
+                        # FRAME_POINTER = lib.kernel.yes; # "unused option"???
+                        KALLSYMS = lib.kernel.yes;
+                        KGDB = lib.kernel.yes;
+                      };
+                    }
+                  ];
+                })).overrideAttrs
+                  (oldAttrs: {
+                    dontStrip = true;
+                    postInstall = oldAttrs.postInstall or "" + ''
+                      cp "$buildRoot/scripts/gdb/linux/constants.py" $dev/lib/modules/*/build/scripts/gdb/linux/ || echo "$buildRoot/scripts/gdb/linux/constants.py doesn't exist"
+                    '';
+                  })
+              );
+              boot.kernelParams = [ "nokaslr" ];
+              networking.useNetworkd = true;
+              virtualisation.qemu.options = [ "-s" ];
+              environment.systemPackages = [
+                pkgs.gdb
+                package # cloud-hypervisor (the debug build from the let-block)
+                cfg.runner
+                uvmsPkgs.taps
+              ];
+              system.build.gdbScript = gdbScript;
+              systemd.services.taps = {
+                wantedBy = [ "multi-user.target" ];
+                environment.TAPS_SOCK = "/run/taps/taps.sock";
+                serviceConfig = {
+                  UMask = "0007";
+                  ExecStart = "${getExe uvmsPkgs.taps} serve";
+                  RuntimeDirectory = "taps";
+                  DynamicUser = true;
+                  AmbientCapabilities = [
+                    "CAP_NET_BIND_SERVICE"
+                    "CAP_NET_ADMIN"
+                  ];
+                  NoNewPrivileges = true;
+                };
+              };
+            };
+          testScript = ''
+            machine.succeed("${getExe cfg.runner}")
+          '';
         }
-        exec -a "uuvm/$GUESTNAME" "''${args[@]}"
-      '';
+      );
+
+      # NOTE: Used to be an even uglier bash script, but, for now, execline makes for easier comparisons against spectrum
+      uvms.cloud-hypervisor.runner =
+        let
+          toolsClosure = pkgs.writeClosure [
+            (lib.getBin pkgs.execline)
+            (lib.getBin pkgs.s6)
+            (lib.getBin package)
+            (lib.getBin pkgs.virtiofsd)
+            (lib.getBin pkgs.bubblewrap)
+            uvmsPkgs.taps
+          ];
+
+          superviseVm = getExe superviseVm';
+          superviseVm' = pkgs.writers.writePython3Bin "supervise-vm" { } ''
+            import os
+            import subprocess
+            import socket
+            from argparse import ArgumentParser
+            from contextlib import contextmanager, closing, ExitStack
+
+
+            parser = ArgumentParser("supervise-vm")
+            parser.add_argument("--vm")
+            parser.add_argument("--prefix", default="$HOME/uvms/$VM")
+            parser.add_argument("--sock", default="$PREFIX/supervisor.sock")
+            parser.add_argument("--vm-config")
+
+            MSG_SIZE = 16
+            ELB_DIR = "${lib.getBin pkgs.execline}/bin"  # noqa: E501
+            S6_DIR = "${lib.getBin pkgs.s6}/bin"  # noqa: E501
+            CH_DIR = "${lib.getBin package}/bin"  # noqa: E501
+            UTIL_LINUX_DIR = "${lib.getBin pkgs.util-linux}/bin"  # noqa: E501
+            SOCKETBINDER_PATH = S6_DIR + "/s6-ipcserver-socketbinder"  # noqa: E501
+            CH_PATH = CH_DIR + "/cloud-hypervisor"
+            CHR_PATH = CH_DIR + "/ch-remote"
+            TAPS_PATH = "${lib.getExe uvmsPkgs.taps}"  # noqa: E501
+            VIRTIOFSD_PATH = "${lib.getExe pkgs.virtiofsd}"  # noqa: E501
+            BWRAP_PATH = "${lib.getExe pkgs.bubblewrap}"  # noqa: E501
+
+            with open("${toolsClosure}", mode="r") as f:  # noqa: E501
+                CLOSURE = [
+                    *(ln.rstrip() for ln in f.readlines()),
+                    "${placeholder "out"}",  # noqa: E501
+                ]
+
+            PASSTHRU_PATH = ":".join([ELB_DIR, S6_DIR, CH_DIR, UTIL_LINUX_DIR])
+            PASSTHRU_ENV = {
+                **{
+                    k: v
+                    for k, v in os.environ.items()
+                    if k.startswith("RUST")
+                    or k.startswith("WAYLAND")
+                    or k in [
+                        "TAPS_SOCK",
+                    ]
+                },
+                "HOME": os.environ.get("HOME", os.getcwd()),
+                "PATH": PASSTHRU_PATH,
+            }
+
+
+            def preprocess_args(args_mut):
+                keys = [
+                    k
+                    for k, v
+                    in args_mut._get_kwargs()
+                    if isinstance(v, str)]
+                for k in keys:
+                    v = getattr(args_mut, k)
+                    if "$HOME" in v:
+                        setattr(
+                            args_mut,
+                            k,
+                            v.replace("$HOME", PASSTHRU_ENV["HOME"]))
+                for k in keys:
+                    v = getattr(args_mut, k)
+                    if "$VM" in v:  # expand from the namespace being processed
+                        setattr(args_mut, k, v.replace("$VM", args_mut.vm))
+                for k in keys:
+                    v = getattr(args_mut, k)
+                    if "$PREFIX" in v:  # prefix was already expanded above
+                        setattr(args_mut, k, v.replace("$PREFIX", args_mut.prefix))
+                return args_mut
+
+
+            class Processes:
+                def __init__(self, prefix, vm, check=True, **defaults):
+                    self.prefix = prefix
+                    self.vm = vm
+                    self.check = check
+                    self.defaults = defaults
+
+                def make_env(self):
+                    return {
+                        **PASSTHRU_ENV,
+                        "PATH": PASSTHRU_PATH,
+                        "PREFIX": self.prefix,
+                        "VM": self.vm,
+                    }
+
+                def exec(self, *args, **kwargs):
+                    kwargs["cwd"] = kwargs.get("cwd", self.prefix)
+                    kwargs["check"] = kwargs.get("check", self.check)
+                    kwargs["env"] = kwargs.get("env", self.make_env())
+                    return subprocess.run(
+                        [*args],
+                        **self.defaults,
+                        **kwargs)
+
+                def execline(self, *args, **kwargs):
+                    return self.exec(
+                        "execlineb", "-c", "\n".join(args),
+                        # NB: self.exec() already merges self.defaults
+                        executable=ELB_DIR + "/execlineb",
+                        **{
+                            "env": self.make_env(),
+                            "check": self.check,
+                            "cwd": self.prefix,
+                            **kwargs,
+                        },
+                    )
+
+                def popen(self, *args, **kwargs):
+                    kwargs["pass_fds"] = kwargs.get("pass_fds", ())
+                    kwargs["env"] = kwargs.get("env", self.make_env())
+                    kwargs["cwd"] = kwargs.get("cwd", self.prefix)
+                    return subprocess.Popen(
+                        args,
+                        **kwargs,
+                    )
+
+                @contextmanager
+                def bwrap(
+                        self,
+                        *bwrap_args,
+
+                        die_with_parent=True,
+
+                        # Based on the args from
+                        # `host/rootfs/image/usr/bin/run-vmm`
+                        unshare_all=True,
+                        unshare_user=True,
+                        unshare_ipc=None,
+                        unshare_pid=None,
+                        unshare_net=None,
+                        unshare_uts=None,
+                        unshare_cgroup_try=True,
+                        bind=(),
+                        dev_bind=("/dev/kvm", "/dev/vfio"),
+                        dev="/dev",
+                        proc="/proc",
+                        ro_bind=(
+                            "/etc",
+                            "/sys",
+                            "/proc/sys",
+                            "/dev/null",
+                            "/proc/kallsyms",
+                            *CLOSURE),
+                        ro_bind_extra=(),
+                        remount_ro=("/proc/fs", "/proc/irq"),
+                        tmpfs=("/dev/shm", "/tmp", "/var/tmp", "/proc/fs", "/proc/irq"),
+                        tmpfs_extra=(),
+
+                        pass_fds=(2,),
+                        **popen_kwargs):
+
+                    bwrap_args_sock, remote = socket.socketpair()
+                    remote.set_inheritable(True)
+                    bwrap_args_f = bwrap_args_sock.makefile("w")
+                    with closing(bwrap_args_sock), closing(bwrap_args_f):
+                        def print_arg(*args):
+                            print(*args, file=bwrap_args_f, sep="\0", end="\0")
+
+                        if unshare_all:
+                            print_arg("--unshare-all")
+                        if unshare_user:
+                            print_arg("--unshare-user")
+                        if unshare_ipc:
+                            print_arg("--unshare-ipc")
+                        if unshare_pid:
+                            print_arg("--unshare-pid")
+                        if unshare_net:
+                            print_arg("--unshare-net")
+                        if unshare_uts:
+                            print_arg("--unshare-uts")
+                        if unshare_cgroup_try:
+                            print_arg("--unshare-cgroup-try")
+                        if die_with_parent:
+                            print_arg("--die-with-parent")
+
+                        for p in bind:
+                            p1, p2 = (p, p) if isinstance(p, str) else p
+                            print_arg("--bind", p1, p2)
+                        for p in (*ro_bind, *ro_bind_extra):
+                            p1, p2 = (p, p) if isinstance(p, str) else p
+                            print_arg("--ro-bind", p1, p2)
+                        for p in dev_bind:
+                            p1, p2 = (p, p) if isinstance(p, str) else p
+                            print_arg("--dev-bind", p1, p2)
+                        for p in (*tmpfs, *tmpfs_extra):
+                            print_arg("--tmpfs", p)
+                        # Hunch: order might matter...
+                        for p in remount_ro:
+                            print_arg("--remount-ro", p)
+
+                        bwrap_args_f.flush()
+
+                    with closing(remote):
+                        proc = self.popen(
+                            "bwrap", "--args", str(remote.fileno()), *bwrap_args,
+                            **popen_kwargs,
+                            executable=BWRAP_PATH,
+                            pass_fds=(*pass_fds, remote.fileno()),
+                        )
+
+                    with proc as p:
+                        try:
+                            yield p
+                        finally:
+                            try:
+                                p.poll()
+                            except:  # noqa: E722
+                                pass
+                            if p.returncode is None:
+                                p.terminate()
+                                p.wait()
+
+                @contextmanager
+                def run_ch(self):
+                    args = [
+                        SOCKETBINDER_PATH,
+                        "-B",
+                        self.prefix + "/vmm.sock",
+                        CH_PATH,
+                        "--api-socket",
+                        "fd=0",
+                    ]
+                    p = self.popen(
+                        *args,
+                        shell=False,
+                        stdin=subprocess.DEVNULL,
+                        stdout=subprocess.DEVNULL,
+                        pass_fds=(2,))
+                    try:
+                        p.wait(0.125)
+                        needs_cleanup = False
+                    except subprocess.TimeoutExpired:
+                        needs_cleanup = True
+                    try:  # readiness checks live inside so a failure still reaps CH
+                        if not os.path.exists(self.prefix + "/vmm.sock"):
+                            raise RuntimeError(f"{self.prefix}/vmm.sock should exist by now")  # noqa: E501
+                        if p.returncode is not None:
+                            raise RuntimeError("CH exited early")
+                        yield p
+                    finally:
+                        try:
+                            p.poll()
+                        except:  # noqa: E722
+                            pass
+                        if p.returncode is None:
+                            p.terminate()  # CH handles SIG{INT,TERM}?
+                            p.wait()
+                        unlink_paths = [
+                            self.prefix + "/vmm.sock",
+                            self.prefix + "/vmm.sock.lock",
+                            self.prefix + "/vsock.sock",
+                        ] if needs_cleanup else []
+                        for path in unlink_paths:
+                            if os.path.exists(path):
+                                os.remove(path)
+
+                @contextmanager
+                def add_virtiofsd(
+                        self,
+                        root_dir,
+                        tag,
+                        ro=False,
+                        subdirs=None,
+                        extra_flags=("--posix-acl",)):
+
+                    assert os.path.exists(root_dir)
+
+                    sock_path = self.prefix + f"/virtiofsd-{tag}.sock"
+                    # s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+                    # NOTE: Nope. Virtiofsd actually expects a blocking socket
+                    # s.setblocking(True)
+
+                    def rm_sock():
+                        if os.path.exists(sock_path):
+                            os.remove(sock_path)
+
+                    with ExitStack() as cleanup:  # noqa: F841
+                        # s.bind(sock_path.encode("utf8"))
+                        # cleanup.enter_context(closing(s))
+                        cleanup.enter_context(defer(rm_sock))
+
+                        args = [
+                            # If using bwrap():
+                            # "--argv0", "virtiofsd",
+                            # "--uid", "1000",
+                            # "--gid", "1000",
+                            # "--",
+                            "unshare", "-rUm",
+                            "unshare", "--map-user", "1000", "--map-group", "1000",
+                            VIRTIOFSD_PATH,
+                            "--shared-dir",
+                            root_dir,
+                            "--tag",
+                            tag,
+
+                            # "--fd",
+                            # str(s.fileno()),
+                            "--socket-path",
+                            sock_path,
+
+                            # If relying on bwrap():
+                            # "--sandbox",
+                            # "none",
+                        ]
+                        if ro:
+                            args.append("--readonly")
+                        kwargs = {
+                            # If bwrap():
+                            # "bind": [],
+                            # ("ro_bind_extra" if ro else "bind"):
+                            # [*subdirs]
+                            # if subdirs is not None
+                            # else [root_dir],
+
+                            # "pass_fds": (2, s.fileno()),
+                        }
+                        proc_ctx = self.popen(*args, **kwargs)
+                        with proc_ctx as p:
+                            try:
+                                try:
+                                    p.wait(0.125)
+                                except subprocess.TimeoutExpired:
+                                    pass
+                                if p.returncode is not None:
+                                    raise RuntimeError("virtiofsd exited too early")
+                                yield p, sock_path
+                            finally:
+                                if p.returncode is None:
+                                    p.kill()
+                                    p.wait()
+                                if os.path.exists(sock_path):
+                                    os.remove(sock_path)
+
+
+            @contextmanager
+            def defer(f):
+                try:
+                    yield
+                finally:
+                    f()
+
+
+            if __name__ == "__main__":
+                args, args_next = parser.parse_known_args()
+                preprocess_args(args)
+
+                os.makedirs(args.prefix, exist_ok=True)
+                ps = Processes(
+                    prefix=args.prefix,
+                    vm=args.vm,
+                )
+
+                ch_remote = [
+                    "ch-remote",
+                    "--api-socket",
+                    args.prefix + "/vmm.sock",
+                ]
+
+                with ExitStack() as cleanup:
+                    ch = cleanup.enter_context(ps.run_ch())
+                    ps.exec(*ch_remote, "create", args.vm_config)
+                    ps.exec(
+                        TAPS_PATH, "pass",
+                        *ch_remote, "add-net",
+                        "id=wan,fd=3,mac=00:00:00:00:00:01")
+
+                    send_dir = PASSTHRU_ENV["HOME"] + f"/send/{args.vm}"
+                    os.makedirs(send_dir, exist_ok=True)
+                    vfsd, vfsd_path = cleanup.enter_context(
+                        ps.add_virtiofsd(
+                            send_dir,
+                            tag="send",
+                        ))
+                    ps.exec(*ch_remote, "add-fs", f"tag=send,socket={vfsd_path},id=send")
+                    ps.exec(*ch_remote, "boot")
+                    ps.exec(*ch_remote, "info")
+                    try:
+                        ch.wait()
+                    except KeyboardInterrupt:
+                        pass
+          '';
+        in
+        writeElb "run-${hostName}" ''
+          ${superviseVm} --vm-config=${chSettingsFile} --vm=${hostName}
+        '';
     }
     (lib.mkIf cfg.enable {
       boot.initrd.availableKernelModules = [
@@ -103,12 +699,6 @@ in
         }
       ) layers
     );
-      uvms.cloud-hypervisor.argv = [
-        "--memory=size=1536M,hotplug_size=1536M,hotplugged_size=512M,hotplug_method=virtio-mem,mergeable=on,shared=on"
-        "--cpus=boot=4"
-        "--disk"
-      ]
-      ++ map (img: "path=${img},readonly=true,id=${toString img.label}") layers;
     })
   ];
 }