{
  config,
  lib,
  pkgs,
  ...
}:

# NixOS module that describes a guest meant to be run under cloud-hypervisor
# and builds a host-side runner (and a kernel-debugging NixOS test) for it.
#
# It is not the intent to stick to the microvm.nix-like static interface,
# but we shall begin by reproducing at least some of their work.

let
  cfg = config.uvms.cloud-hypervisor;

  inherit (config.networking) hostName;
  inherit (config.debug.closure.erofs) layers;
  inherit (lib)
    mkOption
    types
    concatMapStringsSep
    getExe
    getExe'
    getBin
    ;

  # cloud-hypervisor built as a debug, unstripped binary.
  package = pkgs.cloud-hypervisor.overrideAttrs (oldAttrs: {
    patches = oldAttrs.patches or [ ] ++ [
      # ../patches/ch.patch
    ];
    buildType = "debug";
    dontStrip = true;
  });
  uvmsPkgs = pkgs.callPackage ../pkgs { };

  # JSON rendering of `cfg.settings`, handed to `ch-remote create`.
  chSettingsFile = (pkgs.formats.json { }).generate "vm.json" cfg.settings;

  # NOTE: `uvmPrefix` and `vmmSock` are not referenced anywhere in this file.
  uvmPrefix = "\${HOME}/uvms/${hostName}";
  vmmSock = "${uvmPrefix}/vmm.sock";
  elbPrefix = "${lib.getBin pkgs.execline}/bin";
  s6Prefix = "${lib.getBin pkgs.s6}/bin";

  # Write an execline script to `$out/bin/<name>`, run with `execlineb -W`.
  writeElb = name: text: writeElb' name "-W" text;

  # Same as `writeElb`, but with explicit interpreter arguments.
  # `elArgs` may be `null`, in which case the shebang carries no arguments.
  # The generated script prepends the execline and s6 `bin` directories to
  # PATH before running `text`.
  writeElb' =
    name: elArgs: text:
    pkgs.writeTextFile {
      inherit name;
      destination = "/bin/${name}";
      executable = true;
      # The interpolation of `elArgs` lives inside the null guard:
      # interpolating `null` directly aborts evaluation.
      text = ''
        #!${getExe' pkgs.execline "execlineb"}${lib.optionalString (elArgs != null) " ${elArgs}"}
        importas OLDPATH PATH
        export PATH "${elbPrefix}:${s6Prefix}:''${OLDPATH}"
        ${text}
      '';
    };
in
{
  options = {
    uvms.cloud-hypervisor.enable = lib.mkEnableOption "Configure guest (e.g. fileSystems)";
    uvms.cloud-hypervisor.runner = mkOption {
      type = types.package;
      description = "A naive script for running this system in cloud-hypervisor";
    };
    uvms.cloud-hypervisor.debugger = mkOption {
      type = types.lazyAttrsOf types.anything;
      description = "Same but you can debug the kernel";
    };
    uvms.cloud-hypervisor.settingsFile = mkOption {
      type = types.package;
      default = chSettingsFile;
      defaultText = "...";
      readOnly = true;
    };
    # Free-form JSON settings; only the keys below have typed defaults.
    uvms.cloud-hypervisor.settings = mkOption {
      default = { };
      type = types.submodule {
        freeformType = (pkgs.formats.json { }).type;
        options = {
          payload = {
            cmdline = mkOption { type = types.str; };
            kernel = mkOption { type = types.str; };
            initramfs = mkOption {
              type = types.str;
              default = "${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}";
            };
          };
          vsock = {
            cid = mkOption {
              type = types.int;
              default = 4;
            };
            socket = mkOption {
              type = types.str;
              default = "vsock.sock";
            };
          };
          "api-socket" = mkOption {
            type = types.str;
            default = "vmm.sock";
          };
          "serial".mode = mkOption {
            type = types.str;
            default = "File";
          };
          "serial".file = mkOption {
            type = types.nullOr types.str;
            default = "serial";
          };
          "console".mode = mkOption {
            type = types.str;
            default = "Pty";
          };
          "console".file = mkOption {
            type = types.nullOr types.str;
            default = null;
          };
          # "watchdog" = true;
          # "seccomp" = true;
        };
      };
    };
    uvms.cloud-hypervisor.extraCmdline = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [ ];
    };
    uvms.cloud-hypervisor.cmdline = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [
        "earlyprintk=ttyS0"
        "console=ttyS0"
        "reboot=t"
        "panic=-1"
        "init=${config.system.build.toplevel}/init"
      ]
      ++ config.boot.kernelParams
      ++ config.uvms.cloud-hypervisor.extraCmdline;
    };
  };

  config = lib.mkMerge [
    {
      uvms.cloud-hypervisor.settings = {
        payload = {
          cmdline = lib.concatStringsSep " " cfg.cmdline;
          kernel = "${config.boot.kernelPackages.kernel}/${pkgs.stdenv.hostPlatform.linux-kernel.target}";
        };
        # One read-only disk per erofs layer, identified by its label.
        disks = map (img: {
          path = img;
          readonly = true;
          id = toString img.label;
        }) layers;
        memory = {
          size = 1536 * 1048576;
          shared = true;
          mergeable = true;
          # hotplugged_size = 512 * 1048576;
          # hotplugd_size = 1536 * 1048576;
          # hotplug_method = "virtio-mem"
        };
        cpus = {
          boot_vcpus = 4;
          max_vcpus = 4;
        };
      };

      # NixOS test whose machine carries a debug kernel, gdb, the debug
      # cloud-hypervisor build, the runner and the `taps` service; the test
      # script simply executes the runner inside that machine.
      uvms.cloud-hypervisor.debugger = pkgs.testers.runNixOSTest (
        { config, ... }:
        {
          name = "test-run-${hostName}";
          passthru = rec {
            inherit (config.nodes.machine.system.build) gdbScript;
            inherit (config.nodes.machine.boot.kernelPackages) kernel;
            kernelSrc = pkgs.srcOnly kernel;
          };
          nodes.machine =
            { config, ... }:
            let
              kernel = config.boot.kernelPackages.kernel;
              kernelSrc = pkgs.srcOnly kernel;
              # Prepares /tmp/gdb (kernel sources symlinked, tun/tap modules
              # decompressed into ./build) and attaches gdb to :1234.
              gdbScript = writeElb "attach-gdb" ''
                if { rm -rf /tmp/gdb }
                if { mkdir -p /tmp/gdb/kos }
                cd /tmp/gdb
                if {
                  elglob -0 files ${kernelSrc}/*
                  forx -E f { $files }
                  ln -s $f ./
                }
                if { mkdir -p build }
                cd build
                if {
                  forx -E pattern {
                    ${kernel.modules}/lib/modules/*/kernel/drivers/net/tun*
                    ${kernel.modules}/lib/modules/*/kernel/drivers/net/tap*
                  }
                  elglob -0 files $pattern
                  forx -E f { $files }
                  if { cp $f . }
                  backtick -E COMPRESSED { basename $f }
                  xz -d $COMPRESSED
                }
                elglob -0 GDB_SCRIPT_DIR ${lib.getDev kernel}/lib/modules/*/build/scripts/gdb
                if {
                  if { cp -r --no-preserve=all $GDB_SCRIPT_DIR gdb_scripts }
                  mv gdb_scripts/linux/constants.py.in gdb_scripts/linux/constants.py
                }
                ${getExe pkgs.gdb}
                  -ex "python import sys; sys.path.insert(0, \"''${GDB_SCRIPT_DIR}\")"
                  -ex "target remote :1234"
                  -ex "source ''${GDB_SCRIPT_DIR}/vmlinux-gdb.py"
                  -ex "lx-symbols"
                  ${kernel.dev}/vmlinux
              '';
            in
            {
              # Kernel with debug info, gdb scripts and KGDB, left unstripped.
              boot.kernelPackages = pkgs.linuxPackagesFor (
                (pkgs.linux.override (oldArgs: {
                  # extraMakeFlags = oldArgs.extraMakeFlags or [ ] ++ [
                  #   "scripts_gdb"
                  # ];
                  kernelPatches = oldArgs.kernelPatches or [ ] ++ [
                    {
                      name = "debug";
                      patch = null;
                      structuredExtraConfig = {
                        GDB_SCRIPTS = lib.kernel.yes;
                        DEBUG_INFO = lib.kernel.yes;
                        DEBUG_INFO_REDUCED = lib.kernel.no;
                        # FRAME_POINTER = lib.kernel.yes; # "unused option"???
                        KALLSYMS = lib.kernel.yes;
                        KGDB = lib.kernel.yes;
                      };
                    }
                  ];
                })).overrideAttrs
                  (oldAttrs: {
                    dontStrip = true;
                    postInstall = oldAttrs.postInstall or "" + ''
                      cp "$buildRoot/scripts/gdb/linux/constants.py" $dev/lib/modules/*/build/scripts/gdb/linux/ || echo "$buildRoot/scripts/gdb/linux/constants.py doesn't exist"
                    '';
                  })
              );
              boot.kernelParams = [ "nokaslr" ];
              networking.useNetworkd = true;
              virtualisation.qemu.options = [ "-s" ];
              environment.systemPackages = [
                pkgs.gdb
                package # CH
                cfg.runner
                uvmsPkgs.taps
              ];
              system.build.gdbScript = gdbScript;
              systemd.services.taps = {
                wantedBy = [ "multi-user.target" ];
                environment.TAPS_SOCK = "/run/taps/taps.sock";
                serviceConfig = {
                  UMask = "0007";
                  ExecStart = "${getExe uvmsPkgs.taps} serve";
                  RuntimeDirectory = "taps";
                  DynamicUser = true;
                  AmbientCapabilities = [
                    "CAP_NET_BIND_SERVICE"
                    "CAP_NET_ADMIN"
                  ];
                  NoNewPrivileges = true;
                };
              };
            };
          testScript = ''
            machine.succeed("${getExe cfg.runner}")
          '';
        }
      );

      # NOTE: Used to be an even uglier bash script, but, for now, execline makes for easier comparisons against spectrum
      uvms.cloud-hypervisor.runner =
        let
          # Store paths of the tools' closures; read by the supervisor to
          # build the default read-only bind list for bwrap().
          toolsClosure = pkgs.writeClosure [
            (lib.getBin pkgs.execline)
            (lib.getBin pkgs.s6)
            (lib.getBin package)
            (lib.getBin pkgs.virtiofsd)
            (lib.getBin pkgs.bubblewrap)
            uvmsPkgs.taps
          ];
          superviseVm = getExe superviseVm';
          # NOTE: the script below is checked by flake8 at build time, so
          # keep its lines within 79 columns and respect blank-line rules.
          superviseVm' = pkgs.writers.writePython3Bin "supervise-vm" { } ''
            # Supervisor for one cloud-hypervisor VM: starts the VMM on a
            # pre-bound API socket, creates the VM from a JSON config, adds a
            # network device and a virtiofs share, boots it and waits.
            import os
            import subprocess
            import socket
            from argparse import ArgumentParser
            from contextlib import contextmanager, closing, ExitStack

            parser = ArgumentParser("supervise-vm")
            # Both --vm and --vm-config are dereferenced unconditionally
            # below, so reject their absence up front with a clear message.
            parser.add_argument("--vm", required=True)
            parser.add_argument("--prefix", default="$HOME/uvms/$VM")
            parser.add_argument("--sock", default="$PREFIX/supervisor.sock")
            parser.add_argument("--vm-config", required=True)

            MSG_SIZE = 16  # NOTE: not referenced in this script
            ELB_DIR = "${lib.getBin pkgs.execline}/bin"  # noqa: E501
            S6_DIR = "${lib.getBin pkgs.s6}/bin"  # noqa: E501
            CH_DIR = "${lib.getBin package}/bin"  # noqa: E501
            UTIL_LINUX_DIR = "${lib.getBin pkgs.util-linux}/bin"  # noqa: E501
            SOCKETBINDER_PATH = S6_DIR + "/s6-ipcserver-socketbinder"  # noqa: E501
            CH_PATH = CH_DIR + "/cloud-hypervisor"
            CHR_PATH = CH_DIR + "/ch-remote"  # NOTE: not referenced
            TAPS_PATH = "${lib.getExe uvmsPkgs.taps}"  # noqa: E501
            VIRTIOFSD_PATH = "${lib.getExe pkgs.virtiofsd}"  # noqa: E501
            BWRAP_PATH = "${lib.getExe pkgs.bubblewrap}"  # noqa: E501

            # Store paths that bwrap() binds read-only by default.
            with open("${toolsClosure}", mode="r") as f:  # noqa: E501
                CLOSURE = [
                    *(ln.rstrip() for ln in f.readlines()),
                    "${placeholder "out"}",  # noqa: E501
                ]

            PASSTHRU_PATH = ":".join([ELB_DIR, S6_DIR, CH_DIR, UTIL_LINUX_DIR])
            # Environment handed to children: RUST*/WAYLAND* variables and
            # TAPS_SOCK from the caller, plus a fixed HOME and PATH.
            PASSTHRU_ENV = {
                **{
                    k: v
                    for k, v in os.environ.items()
                    if k.startswith(("RUST", "WAYLAND")) or k in ["TAPS_SOCK"]
                },
                "HOME": os.environ.get("HOME", os.getcwd()),
                "PATH": PASSTHRU_PATH,
            }


            def preprocess_args(args_mut):
                """Expand $HOME, $VM and $PREFIX in all string arguments.

                The namespace is modified in place and also returned.
                Variables are expanded in that order, so the prefix is
                already expanded when it is substituted for $PREFIX.
                Values come from args_mut itself, not from a global.
                """
                keys = [
                    k
                    for k, v in args_mut._get_kwargs()
                    if isinstance(v, str)
                ]

                def expand(variable, value):
                    for k in keys:
                        v = getattr(args_mut, k)
                        if variable in v:
                            setattr(args_mut, k, v.replace(variable, value))

                expand("$HOME", PASSTHRU_ENV["HOME"])
                expand("$VM", args_mut.vm)
                expand("$PREFIX", args_mut.prefix)
                return args_mut


            class Processes:
                """Spawns helper processes for one VM.

                prefix is the VM's working directory (used as cwd and as
                the location of its sockets), vm is its name, check is
                the default `check` for exec(), and defaults are extra
                keyword arguments applied to every exec() call.
                """

                def __init__(self, prefix, vm, check=True, **defaults):
                    self.prefix = prefix
                    self.vm = vm
                    self.check = check
                    self.defaults = defaults

                def make_env(self):
                    """Return the child environment, with PREFIX and VM."""
                    return {
                        **PASSTHRU_ENV,
                        "PATH": PASSTHRU_PATH,
                        "PREFIX": self.prefix,
                        "VM": self.vm,
                    }

                def exec(self, *args, **kwargs):
                    """Run a command to completion via subprocess.run().

                    Keyword precedence, lowest to highest: the cwd/check/
                    env fallbacks, self.defaults, then the call's kwargs.
                    Merging avoids a duplicate-keyword TypeError when the
                    same key appears in more than one of them.
                    """
                    merged = {
                        "cwd": self.prefix,
                        "check": self.check,
                        "env": self.make_env(),
                        **self.defaults,
                        **kwargs,
                    }
                    return subprocess.run([*args], **merged)

                def execline(self, *args, **kwargs):
                    """Run an execline script given as a list of lines.

                    Delegates to self.exec(); the builtin exec() must not
                    be used here, it does not spawn processes.
                    """
                    kwargs.setdefault("executable", ELB_DIR + "/execlineb")
                    return self.exec(
                        "execlineb", "-c", "\n".join(args),
                        **kwargs,
                    )

                def popen(self, *args, **kwargs):
                    """Start a command in the background; returns Popen.

                    pass_fds, env and cwd get defaults when not supplied.
                    self.defaults is not applied here.
                    """
                    kwargs["pass_fds"] = kwargs.get("pass_fds", ())
                    kwargs["env"] = kwargs.get("env", self.make_env())
                    kwargs["cwd"] = kwargs.get("cwd", self.prefix)
                    return subprocess.Popen(
                        args,
                        **kwargs,
                    )

                @contextmanager
                def bwrap(
                        self,
                        *bwrap_args,
                        die_with_parent=True,
                        # Based on the args from
                        # `host/rootfs/image/usr/bin/run-vmm`
                        unshare_all=True,
                        unshare_user=True,
                        unshare_ipc=None,
                        unshare_pid=None,
                        unshare_net=None,
                        unshare_uts=None,
                        unshare_cgroup_try=True,
                        bind=(),
                        dev_bind=("/dev/kvm", "/dev/vfio"),
                        dev="/dev",
                        proc="/proc",
                        ro_bind=(
                            "/etc",
                            "/sys",
                            "/proc/sys",
                            "/dev/null",
                            "/proc/kallsyms",
                            *CLOSURE),
                        ro_bind_extra=(),
                        remount_ro=("/proc/fs", "/proc/irq"),
                        tmpfs=(
                            "/dev/shm",
                            "/tmp",
                            "/var/tmp",
                            "/proc/fs",
                            "/proc/irq"),
                        tmpfs_extra=(),
                        pass_fds=(2,),
                        **popen_kwargs):
                    """Run bwrap with sandbox options; yields the Popen.

                    Sandbox options are written NUL-separated to one end
                    of a socket pair and bwrap is told to read them from
                    the other end via `--args FD`. bwrap_args are passed
                    on the command line after that. Entries of bind,
                    ro_bind and dev_bind are either a path or a
                    (source, destination) pair. On exit the process is
                    terminated if it is still running.

                    NOTE(review): `dev` and `proc` are accepted but no
                    option is emitted for them; confirm whether that is
                    intended.
                    """
                    bwrap_args_sock, remote = socket.socketpair()
                    remote.set_inheritable(True)
                    # Close the inheritable end on every path, including
                    # failures while writing the arguments.
                    with closing(remote):
                        bwrap_args_f = bwrap_args_sock.makefile("w")
                        with closing(bwrap_args_sock), closing(bwrap_args_f):
                            def print_arg(*args):
                                print(
                                    *args,
                                    file=bwrap_args_f,
                                    sep="\0",
                                    end="\0")

                            def print_pair(flag, p):
                                p1, p2 = (p, p) if isinstance(p, str) else p
                                print_arg(flag, p1, p2)

                            if unshare_all:
                                print_arg("--unshare-all")
                            if unshare_user:
                                print_arg("--unshare-user")
                            if unshare_ipc:
                                print_arg("--unshare-ipc")
                            if unshare_pid:
                                print_arg("--unshare-pid")
                            if unshare_net:
                                print_arg("--unshare-net")
                            if unshare_uts:
                                print_arg("--unshare-uts")
                            if unshare_cgroup_try:
                                print_arg("--unshare-cgroup-try")
                            if die_with_parent:
                                print_arg("--die-with-parent")

                            for p in bind:
                                print_pair("--bind", p)
                            for p in (*ro_bind, *ro_bind_extra):
                                print_pair("--ro-bind", p)
                            for p in dev_bind:
                                print_pair("--dev-bind", p)
                            for p in (*tmpfs, *tmpfs_extra):
                                print_arg("--tmpfs", p)
                            # Hunch: order might matter...
                            for p in remount_ro:
                                print_arg("--remount-ro", p)
                            bwrap_args_f.flush()

                        # The writing end is closed by now, so bwrap sees
                        # end-of-file after the last argument.
                        proc = self.popen(
                            "bwrap", "--args", str(remote.fileno()),
                            *bwrap_args,
                            **popen_kwargs,
                            executable=BWRAP_PATH,
                            pass_fds=(*pass_fds, remote.fileno()),
                        )
                    with proc as p:
                        try:
                            yield p
                        finally:
                            try:
                                p.poll()
                            except:  # noqa: E722
                                pass
                            if p.returncode is None:
                                p.terminate()
                                p.wait()

                @contextmanager
                def run_ch(self):
                    """Start cloud-hypervisor; yields its Popen object.

                    s6-ipcserver-socketbinder binds <prefix>/vmm.sock and
                    passes it to cloud-hypervisor as fd 0. Raises
                    RuntimeError if the socket is missing or the process
                    has exited after a short grace period. On exit, and
                    also when those startup checks fail, a still-running
                    process is terminated and the socket files removed.
                    """
                    args = [
                        SOCKETBINDER_PATH,
                        "-B",
                        self.prefix + "/vmm.sock",
                        CH_PATH,
                        "--api-socket",
                        "fd=0",
                    ]
                    p = self.popen(
                        *args,
                        shell=False,
                        stdin=subprocess.DEVNULL,
                        stdout=subprocess.DEVNULL,
                        pass_fds=(2,))
                    # Sockets are only removed when the process was seen
                    # alive after the grace period.
                    needs_cleanup = False
                    try:
                        try:
                            p.wait(0.125)
                        except subprocess.TimeoutExpired:
                            needs_cleanup = True
                        if not os.path.exists(self.prefix + "/vmm.sock"):
                            raise RuntimeError(
                                f"{self.prefix}/vmm.sock should exist by now")
                        if p.returncode is not None:
                            raise RuntimeError("CH exited early")
                        yield p
                    finally:
                        try:
                            p.poll()
                        except:  # noqa: E722
                            pass
                        if p.returncode is None:
                            p.terminate()  # CH handles SIG{INT,TERM}?
                            p.wait()
                        unlink_paths = [
                            self.prefix + "/vmm.sock",
                            self.prefix + "/vmm.sock.lock",
                            self.prefix + "/vsock.sock",
                        ] if needs_cleanup else []
                        for path in unlink_paths:
                            if os.path.exists(path):
                                os.remove(path)

                @contextmanager
                def add_virtiofsd(
                        self,
                        root_dir,
                        tag,
                        ro=False,
                        subdirs=None,
                        extra_flags=("--posix-acl",)):
                    """Run virtiofsd sharing root_dir under the given tag.

                    Yields (process, socket_path). virtiofsd is started
                    through two nested `unshare` invocations and listens
                    on <prefix>/virtiofsd-<tag>.sock. Raises RuntimeError
                    if it exits within a short grace period. On exit the
                    process is killed and the socket file removed.

                    NOTE(review): `subdirs` and `extra_flags` are accepted
                    but not used; confirm whether `--posix-acl` was meant
                    to be passed.
                    """
                    assert os.path.exists(root_dir)

                    sock_path = self.prefix + f"/virtiofsd-{tag}.sock"
                    # s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
                    # NOTE: Nope. Virtiofsd actually expects a blocking socket
                    # s.setblocking(True)

                    def rm_sock():
                        if os.path.exists(sock_path):
                            os.remove(sock_path)

                    with ExitStack() as cleanup:  # noqa: F841
                        # s.bind(sock_path.encode("utf8"))
                        # cleanup.enter_context(closing(s))
                        cleanup.enter_context(defer(rm_sock))

                        args = [
                            # If using bwrap():
                            # "--argv0", "virtiofsd",
                            # "--uid", "1000",
                            # "--gid", "1000",
                            # "--",
                            "unshare", "-rUm",
                            "unshare",
                            "--map-user", "1000",
                            "--map-group", "1000",
                            VIRTIOFSD_PATH,
                            "--shared-dir",
                            root_dir,
                            "--tag",
                            tag,
                            # "--fd",
                            # str(s.fileno()),
                            "--socket-path",
                            sock_path,
                            # If relying on bwrap():
                            # "--sandbox",
                            # "none",
                        ]
                        if ro:
                            args.append("--readonly")
                        kwargs = {
                            # If bwrap():
                            # "bind": [],
                            # ("ro_bind_extra" if ro else "bind"):
                            # [*subdirs]
                            # if subdirs is not None
                            # else [root_dir],

                            # "pass_fds": (2, s.fileno()),
                        }
                        proc_ctx = self.popen(*args, **kwargs)
                        with proc_ctx as p:
                            try:
                                try:
                                    p.wait(0.125)
                                except subprocess.TimeoutExpired:
                                    pass
                                if p.returncode is not None:
                                    raise RuntimeError(
                                        "virtiofsd exited too early")
                                yield p, sock_path
                            finally:
                                if p.returncode is None:
                                    p.kill()
                                    p.wait()
                                if os.path.exists(sock_path):
                                    os.remove(sock_path)


            @contextmanager
            def defer(f):
                """Context manager that calls f() when the block exits."""
                try:
                    yield
                finally:
                    f()


            if __name__ == "__main__":
                args, args_next = parser.parse_known_args()
                preprocess_args(args)

                os.makedirs(args.prefix, exist_ok=True)
                ps = Processes(
                    prefix=args.prefix,
                    vm=args.vm,
                )

                ch_remote = [
                    "ch-remote",
                    "--api-socket",
                    args.prefix + "/vmm.sock",
                ]

                with ExitStack() as cleanup:
                    ch = cleanup.enter_context(ps.run_ch())
                    ps.exec(*ch_remote, "create", args.vm_config)
                    # `taps pass` wraps ch-remote; the device refers to fd 3.
                    ps.exec(
                        TAPS_PATH, "pass",
                        *ch_remote, "add-net",
                        "id=wan,fd=3,mac=00:00:00:00:00:01")

                    send_dir = PASSTHRU_ENV["HOME"] + f"/send/{args.vm}"
                    os.makedirs(send_dir, exist_ok=True)
                    vfsd, vfsd_path = cleanup.enter_context(
                        ps.add_virtiofsd(
                            send_dir,
                            tag="send",
                        ))
                    ps.exec(
                        *ch_remote,
                        "add-fs",
                        f"tag=send,socket={vfsd_path},id=send")
                    ps.exec(*ch_remote, "boot")
                    ps.exec(*ch_remote, "info")

                    try:
                        ch.wait()
                    except KeyboardInterrupt:
                        pass
          '';
        in
        writeElb "run-${hostName}" ''
          ${superviseVm} --vm-config=${chSettingsFile} --vm=${hostName}
        '';
    }

    # Guest-side configuration: mount every erofs layer read-only by label
    # and overlay them as /nix/store.
    (lib.mkIf cfg.enable {
      boot.initrd.availableKernelModules = [
        "erofs"
        "overlay"
        "virtio_mmio"
        "virtio_pci"
        "virtio_blk"
        # "9pnet_virtio"
        # "9p"
        "virtiofs"
      ];
      boot.initrd.systemd.enable = lib.mkDefault true;
      fileSystems = {
        "/nix/store" = {
          fsType = "overlay";
          overlay.lowerdir = map (img: "/nix/.ro-stores/${toString img.seq}") layers;
          neededForBoot = true;
        };
      }
      // lib.listToAttrs (
        map (
          img:
          lib.nameValuePair "/nix/.ro-stores/${toString img.seq}" {
            device = "/dev/disk/by-label/${img.label}";
            neededForBoot = true;
            options = [ "x-systemd.device-timeout=5" ];
          }
        ) layers
      );
    })
  ];
}