uvms/profiles/ch-runner.nix
2026-02-25 17:37:39 +02:00

787 lines
29 KiB
Nix

{
config,
lib,
pkgs,
...
}:
# It is not the intent to stick to the microvm.nix-like static interface,
# but we shall begin by reproducing at least some of their work.
let
  # Library helpers used throughout this module.
  inherit (lib)
    mkOption
    types
    concatMapStringsSep
    getExe
    getExe'
    getBin
    ;
  # Values read from the surrounding NixOS configuration.
  inherit (config.networking) hostName;
  inherit (config.debug.closure.erofs) layers;
  cfg = config.uvms.cloud-hypervisor;

  # Project-local package set and the cloud-hypervisor build taken from it.
  uvmsPkgs = pkgs.callPackage ../pkgs { };
  package = uvmsPkgs.cloud-hypervisor-gpu;

  # The VM definition, rendered to JSON from `cfg.settings`.
  chSettingsFile = (pkgs.formats.json { }).generate "vm.json" cfg.settings;

  # Per-VM state directory; `''${HOME}` is deliberately left for the
  # consuming script to expand at run time.
  uvmPrefix = "\${HOME}/uvms/${hostName}";
  vmmSock = "${uvmPrefix}/vmm.sock";

  # Directories prepended to PATH by the execline wrappers.
  elbPrefix = "${getBin pkgs.execline}/bin";
  s6Prefix = "${getBin pkgs.s6}/bin";
# writeElb name text
#   Writes an execline script to $out/bin/<name>, run with `execlineb -W`.
writeElb = name: text: writeElb' name "-W" text;

# writeElb' name elArgs text
#   Same as writeElb, but with caller-chosen interpreter arguments.
#   `elArgs` is either a string appended to the shebang or `null` for none.
#   The script body runs with the execline and s6 bin directories
#   prepended to the inherited PATH.
writeElb' =
  name: elArgs: text:
  let
    # Interpolating `null` is an evaluation error in Nix, so the separator
    # and the arguments must sit behind the same null check.
    shebangArgs = lib.optionalString (elArgs != null) " ${elArgs}";
  in
  pkgs.writeTextFile {
    inherit name;
    destination = "/bin/${name}";
    executable = true;
    text = ''
      #!${getExe' pkgs.execline "execlineb"}${shebangArgs}
      importas OLDPATH PATH
      export PATH "${elbPrefix}:${s6Prefix}:''${OLDPATH}"
      ${text}
    '';
  };
in
{
# Option declarations for the cloud-hypervisor runner.
options.uvms.cloud-hypervisor =
  let
    jsonFormat = pkgs.formats.json { };
    # Shorthands for the option shapes repeated in `settings` below.
    requiredStr = mkOption { type = types.str; };
    strWithDefault =
      default:
      mkOption {
        type = types.str;
        inherit default;
      };
    nullableStr =
      default:
      mkOption {
        type = types.nullOr types.str;
        inherit default;
      };
  in
  {
    enable = lib.mkEnableOption "Configure guest (e.g. fileSystems)";

    runner = mkOption {
      type = types.package;
      description = "A naive script for running this system in cloud-hypervisor";
    };

    debugger = mkOption {
      type = types.lazyAttrsOf types.anything;
      description = "Same but you can debug the kernel";
    };

    # Read-only handle on the JSON file generated from `settings`.
    settingsFile = mkOption {
      type = types.package;
      default = chSettingsFile;
      defaultText = "...";
      readOnly = true;
    };

    # Free-form JSON VM definition; only the keys this module relies on
    # are declared explicitly.
    settings = mkOption {
      default = { };
      type = types.submodule {
        freeformType = jsonFormat.type;
        options = {
          payload = {
            cmdline = requiredStr;
            kernel = requiredStr;
            initramfs = strWithDefault "${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}";
          };
          vsock = {
            cid = mkOption {
              type = types.int;
              default = 4;
            };
            socket = strWithDefault "vsock.sock";
          };
          "api-socket" = strWithDefault "vmm.sock";
          serial = {
            mode = strWithDefault "File";
            file = nullableStr "serial";
          };
          console = {
            mode = strWithDefault "Pty";
            file = nullableStr null;
          };
          # "watchdog" = true;
          # "seccomp" = true;
        };
      };
    };

    # Extra kernel command-line words appended after everything else.
    extraCmdline = mkOption {
      type = types.listOf types.str;
      default = [ ];
    };

    # Full kernel command line as a list of words.
    cmdline = mkOption {
      type = types.listOf types.str;
      default =
        [
          "earlyprintk=ttyS0"
          "console=ttyS0"
          "reboot=t"
          "panic=-1"
          "init=${config.system.build.toplevel}/init"
        ]
        ++ config.boot.kernelParams
        ++ cfg.extraCmdline;
    };
  };
config = lib.mkMerge [
{
# Default VM definition: payload, one read-only disk per erofs layer,
# and fixed memory/CPU sizing.
uvms.cloud-hypervisor.settings = {
  payload.cmdline = lib.concatStringsSep " " cfg.cmdline;
  payload.kernel = "${config.boot.kernelPackages.kernel}/${pkgs.stdenv.hostPlatform.linux-kernel.target}";

  # Each layer image is attached read-only, identified by its label.
  disks = lib.forEach layers (layer: {
    id = toString layer.label;
    path = layer;
    readonly = true;
  });

  memory = {
    size = 1536 * 1048576;
    shared = true;
    mergeable = true;
    # hotplugged_size = 512 * 1048576;
    # hotplugd_size = 1536 * 1048576;
    # hotplug_method = "virtio-mem"
  };

  cpus.boot_vcpus = 4;
  cpus.max_vcpus = 4;
};
# A NixOS test whose single node ("machine") runs a debug-enabled kernel,
# has cloud-hypervisor, the runner and `taps` installed, and whose test
# script simply executes the runner. Kernel-debugging helpers are exposed
# through `passthru`.
uvms.cloud-hypervisor.debugger = pkgs.testers.runNixOSTest (
{ config, ... }:
{
name = "test-run-${hostName}";
# Re-export the node's gdb helper script, its kernel, and the unpacked
# kernel sources on the test itself.
passthru = rec {
inherit (config.nodes.machine.system.build) gdbScript;
inherit (config.nodes.machine.boot.kernelPackages) kernel;
kernelSrc = pkgs.srcOnly kernel;
};
nodes.machine =
{ config, ... }:
let
kernel = config.boot.kernelPackages.kernel;
kernelSrc = pkgs.srcOnly kernel;
# execline script that rebuilds a scratch tree under /tmp/gdb
# (symlinked kernel sources plus decompressed tun/tap modules in
# ./build), then starts gdb on vmlinux attached to `:1234`.
# NOTE(review): the gdb helper scripts are copied to ./gdb_scripts,
# but gdb is pointed at the original $GDB_SCRIPT_DIR store path;
# confirm which of the two is intended.
gdbScript = writeElb "attach-gdb" ''
if { rm -rf /tmp/gdb }
if { mkdir -p /tmp/gdb/kos }
cd /tmp/gdb
if {
elglob -0 files ${kernelSrc}/*
forx -E f { $files }
ln -s $f ./
}
if { mkdir -p build }
cd build
if {
forx -E pattern {
${kernel.modules}/lib/modules/*/kernel/drivers/net/tun*
${kernel.modules}/lib/modules/*/kernel/drivers/net/tap*
}
elglob -0 files $pattern
forx -E f { $files }
if { cp $f . }
backtick -E COMPRESSED { basename $f }
xz -d $COMPRESSED
}
elglob -0 GDB_SCRIPT_DIR ${lib.getDev kernel}/lib/modules/*/build/scripts/gdb
if {
if { cp -r --no-preserve=all $GDB_SCRIPT_DIR gdb_scripts }
mv gdb_scripts/linux/constants.py.in gdb_scripts/linux/constants.py
}
${getExe pkgs.gdb}
-ex "python import sys; sys.path.insert(0, \"''${GDB_SCRIPT_DIR}\")"
-ex "target remote :1234"
-ex "source ''${GDB_SCRIPT_DIR}/vmlinux-gdb.py"
-ex "lx-symbols"
${kernel.dev}/vmlinux
'';
in
{
# Rebuild `pkgs.linux` with debug info, KGDB and the gdb helper
# scripts enabled, and keep the result unstripped.
boot.kernelPackages = pkgs.linuxPackagesFor (
(pkgs.linux.override (oldArgs: {
# extraMakeFlags = oldArgs.extraMakeFlags or [ ] ++ [
# "scripts_gdb"
# ];
kernelPatches = oldArgs.kernelPatches or [ ] ++ [
{
# Config-only "patch": no diff, just extra Kconfig options.
name = "debug";
patch = null;
structuredExtraConfig = {
GDB_SCRIPTS = lib.kernel.yes;
DEBUG_INFO = lib.kernel.yes;
DEBUG_INFO_REDUCED = lib.kernel.no;
# FRAME_POINTER = lib.kernel.yes; # "unused option"???
KALLSYMS = lib.kernel.yes;
KGDB = lib.kernel.yes;
};
}
];
})).overrideAttrs
(oldAttrs: {
dontStrip = true;
# Best effort: ship the generated constants.py next to the gdb
# scripts in $dev; a missing file is only reported, not fatal.
postInstall = oldAttrs.postInstall or "" + ''
cp "$buildRoot/scripts/gdb/linux/constants.py" $dev/lib/modules/*/build/scripts/gdb/linux/ || echo "$buildRoot/scripts/gdb/linux/constants.py doesn't exist"
'';
})
);
# Presumably so that runtime addresses match the symbols in vmlinux.
boot.kernelParams = [ "nokaslr" ];
networking.useNetworkd = true;
# NOTE(review): `-s` is expected to open QEMU's gdb stub on the port
# that the script's `target remote :1234` connects to; confirm.
virtualisation.qemu.options = [ "-s" ];
environment.systemPackages = [
pkgs.gdb
package # CH
cfg.runner
uvmsPkgs.taps
];
system.build.gdbScript = gdbScript;
# `taps serve` as a DynamicUser service with network capabilities;
# its socket path is announced through TAPS_SOCK.
systemd.services.taps = {
wantedBy = [ "multi-user.target" ];
environment.TAPS_SOCK = "/run/taps/taps.sock";
serviceConfig = {
UMask = "0007";
ExecStart = "${getExe uvmsPkgs.taps} serve";
RuntimeDirectory = "taps";
DynamicUser = true;
AmbientCapabilities = [
"CAP_NET_BIND_SERVICE"
"CAP_NET_ADMIN"
];
NoNewPrivileges = true;
};
};
};
# The test passes when the runner script exits successfully.
testScript = ''
machine.succeed("${getExe cfg.runner}")
'';
}
);
# NOTE: Used to be an even uglier bash script, but, for now, execline makes for easier comparisons against spectrum
uvms.cloud-hypervisor.runner =
  let
    # Closure listing (one store path per line) of every tool the
    # supervisor may start; the script reads this file at start-up.
    toolsClosure = pkgs.writeClosure (
      map lib.getBin [
        pkgs.execline
        pkgs.s6
        package
        pkgs.virtiofsd
        pkgs.bubblewrap
        pkgs.strace
        pkgs.crosvm
      ]
      ++ [ uvmsPkgs.taps ]
    );
    # Absolute path of the supervise-vm executable defined below.
    superviseVm = getExe superviseVm';
superviseVm' = pkgs.writers.writePython3Bin "supervise-vm" { } ''
# NOTE: This would have been bash,
# and this was execlineb previously,
# but it was just easier to reason in terms of context managers
# and try-except-finally branches for the cleanup bit,
# than in terms of traps or such.
# Treat this as bash.
# Treat this as throwaway shitcode.
import os
import subprocess
import socket
from argparse import ArgumentParser
from contextlib import contextmanager, closing, ExitStack
parser = ArgumentParser("supervise-vm")
parser.add_argument("--vm")
parser.add_argument("--prefix", default="$HOME/uvms/$VM")
parser.add_argument("--sock", default="$PREFIX/supervisor.sock")
parser.add_argument("--vm-config")
MSG_SIZE = 16
ELB_DIR = "${lib.getBin pkgs.execline}/bin" # noqa: E501
S6_DIR = "${lib.getBin pkgs.s6}/bin" # noqa: E501
CH_DIR = "${lib.getBin package}/bin" # noqa: E501
UTIL_LINUX_DIR = "${lib.getBin pkgs.util-linux}/bin" # noqa: E501
SOCKETBINDER_PATH = S6_DIR + "/s6-ipcserver-socketbinder" # noqa: E501
CH_PATH = CH_DIR + "/cloud-hypervisor"
CHR_PATH = CH_DIR + "/ch-remote"
TAPS_PATH = "${lib.getExe uvmsPkgs.taps}" # noqa: E501
VIRTIOFSD_PATH = "${lib.getExe pkgs.virtiofsd}" # noqa: E501
BWRAP_PATH = "${lib.getExe pkgs.bubblewrap}" # noqa: E501
with open("${toolsClosure}", mode="r") as f: # noqa: E501
CLOSURE = [
*(ln.rstrip() for ln in f.readlines()),
"${placeholder "out"}", # noqa: E501
]
PASSTHRU_PATH = ":".join([ELB_DIR, S6_DIR, CH_DIR, UTIL_LINUX_DIR])
PASSTHRU_ENV = {
**{
k: v
for k, v in os.environ.items()
if k.startswith("RUST_")
or k.startswith("WAYLAND")
or k.startswith("XDG_")
or k.startswith("DBUS_")
or k in [
"TAPS_SOCK",
]
},
"HOME": os.environ.get("HOME", os.getcwd()),
"PATH": PASSTHRU_PATH,
}
def preprocess_args(args_mut):
    """Expand $HOME, $VM and $PREFIX in every string-valued argument.

    The namespace is modified in place and also returned. Placeholders
    are expanded in three passes over all string attributes, in this
    order: $HOME (from PASSTHRU_ENV), then $VM (from args_mut.vm), then
    $PREFIX (from args_mut.prefix, which by then has been expanded).

    Raises ValueError when a placeholder is used but its source
    attribute is not a string (for example --vm was not given).
    """
    # Only the attributes that are strings up front take part.
    keys = [k for k, v in vars(args_mut).items() if isinstance(v, str)]
    # Replacements are looked up lazily so that each pass sees the
    # results of the previous one.
    substitutions = (
        ("$HOME", lambda: PASSTHRU_ENV["HOME"]),
        ("$VM", lambda: args_mut.vm),
        ("$PREFIX", lambda: args_mut.prefix),
    )
    for placeholder, lookup in substitutions:
        for k in keys:
            v = getattr(args_mut, k)
            if placeholder not in v:
                continue
            replacement = lookup()
            if not isinstance(replacement, str):
                raise ValueError(
                    f"{placeholder} is used in {k!r} but has no value")
            setattr(args_mut, k, v.replace(placeholder, replacement))
    return args_mut
def alive_after(proc, timeout):
    """Report whether `proc` is still running after `timeout` seconds.

    Returns False for `proc is None`, for a process whose return code
    is already known, and for one that exits within the timeout.
    """
    still_running = False
    if proc is not None and proc.returncode is None:
        try:
            proc.wait(timeout)
        except subprocess.TimeoutExpired:
            # The wait timed out, so the process has not exited yet.
            still_running = True
    return still_running
class Processes:
    """Starts and supervises the helper processes belonging to one VM.

    prefix:   per-VM state directory; default working directory of every
              child and the place where sockets are created.
    vm:       VM name, exported to children as VM.
    check:    default `check=` for exec().
    defaults: extra keyword arguments applied to every exec() call.
    """

    def __init__(self, prefix, vm, check=True, **defaults):
        self.prefix = prefix
        self.vm = vm
        self.check = check
        self.defaults = defaults

    def make_env(self):
        """Environment for children: PASSTHRU_ENV plus PREFIX and VM."""
        return {
            **PASSTHRU_ENV,
            "PATH": PASSTHRU_PATH,
            "PREFIX": self.prefix,
            "VM": self.vm,
        }

    def exec(self, *args, **kwargs):
        """Run a command to completion with subprocess.run().

        Precedence, lowest to highest: built-in cwd/check/env, then
        self.defaults, then explicit keyword arguments. Merging into one
        dict avoids a duplicate-keyword TypeError when they overlap.
        """
        run_kwargs = {
            "cwd": self.prefix,
            "check": self.check,
            "env": self.make_env(),
            **self.defaults,
            **kwargs,
        }
        return subprocess.run([*args], **run_kwargs)

    def execline(self, *args, **kwargs):
        """Run an execline script; each positional argument is a line."""
        # Must be self.exec: the bare name resolves to the builtin exec().
        # exec() already applies self.defaults and cwd/check/env.
        return self.exec(
            "execlineb", "-c", "\n".join(args),
            executable=ELB_DIR + "/execlineb",
            **kwargs,
        )

    @contextmanager
    def popen(self, *args, **kwargs):
        """Start a background process and stop it when the block exits.

        stdin/stdout/stderr default to DEVNULL. Raises RuntimeError if
        the process is no longer running shortly after start. On exit a
        still-running process is terminated and then reaped.
        """
        kwargs.setdefault("pass_fds", ())
        kwargs.setdefault("env", self.make_env())
        kwargs.setdefault("cwd", self.prefix)
        kwargs.setdefault("stdin", subprocess.DEVNULL)
        kwargs.setdefault("stdout", subprocess.DEVNULL)
        kwargs.setdefault("stderr", subprocess.DEVNULL)
        proc = None
        try:
            proc = subprocess.Popen(args, **kwargs)
            if not alive_after(proc, 0.125):
                raise RuntimeError("Failed to start", args)
            yield proc
        finally:
            if alive_after(proc, 0.125):
                proc.terminate()
            if proc is not None:
                proc.wait()

    @contextmanager
    def bwrap(
            self,
            *bwrap_args,
            die_with_parent=True,
            # Based on the args from
            # `host/rootfs/image/usr/bin/run-vmm`
            unshare_all=True,
            unshare_user=True,
            unshare_ipc=None,
            unshare_pid=None,
            unshare_net=None,
            unshare_uts=None,
            unshare_cgroup_try=True,
            bind=(),
            dev_bind=(),
            dev_bind_implicit=("/dev/kvm", "/dev/vfio"),
            dev="/dev",
            proc="/proc",
            ro_bind_implicit=(
                "/etc",
                "/sys",
                "/proc/sys",
                "/dev/null",
                "/proc/kallsyms",
                *CLOSURE),
            ro_bind=(),
            remount_ro=("/proc/fs", "/proc/irq"),
            tmpfs_implicit=(
                "/dev/shm",
                "/tmp",
                "/var/tmp",
                "/proc/fs",
                "/proc/irq"),
            tmpfs=(),
            pass_fds=(2,),
            **popen_kwargs):
        """Run `bwrap_args` inside a bubblewrap sandbox via popen().

        Sandbox options are written NUL-separated to one end of a socket
        pair; bwrap reads them from the other end via `--args <fd>`.
        Bind entries are either a path (same source and destination) or
        a (source, destination) pair. Remaining keyword arguments are
        forwarded to popen(). Yields the bwrap process.
        """
        bwrap_args_sock, remote = socket.socketpair()
        remote.set_inheritable(True)
        bwrap_args_f = bwrap_args_sock.makefile("w")

        def print_arg(*args):
            print(*args, file=bwrap_args_f, sep="\0", end="\0")

        def print_binds(flag, paths):
            for p in paths:
                src, dst = (p, p) if isinstance(p, str) else p
                print_arg(flag, src, dst)

        with ExitStack() as cleanup:
            # `es` owns the socket pair and is entered before anything is
            # written, so the descriptors are closed even on a failure.
            # Closing our ends after the spawn gives bwrap end-of-file on
            # its argument stream.
            with ExitStack() as es:
                es.enter_context(closing(remote))
                es.enter_context(closing(bwrap_args_sock))
                es.enter_context(closing(bwrap_args_f))
                if unshare_all:
                    print_arg("--unshare-all")
                if unshare_user:
                    print_arg("--unshare-user")
                if unshare_ipc:
                    print_arg("--unshare-ipc")
                if unshare_pid:
                    print_arg("--unshare-pid")
                if unshare_net:
                    print_arg("--unshare-net")
                elif unshare_net is False:
                    # An explicit False re-shares the network; None
                    # leaves the decision to --unshare-all.
                    print_arg("--share-net")
                if unshare_uts:
                    print_arg("--unshare-uts")
                if unshare_cgroup_try:
                    print_arg("--unshare-cgroup-try")
                if die_with_parent:
                    print_arg("--die-with-parent")
                if dev:
                    print_arg("--dev", dev)
                if proc:
                    print_arg("--proc", proc)
                print_binds("--bind", bind)
                print_binds("--ro-bind", (*ro_bind, *ro_bind_implicit))
                print_binds("--dev-bind", (*dev_bind, *dev_bind_implicit))
                for p in (*tmpfs, *tmpfs_implicit):
                    print_arg("--tmpfs", p)
                # Hunch: order might matter...
                for p in remount_ro:
                    print_arg("--remount-ro", p)
                bwrap_args_f.flush()
                # Register the process on the outer stack so that it
                # outlives `es` and is stopped after the caller's block.
                child = cleanup.enter_context(self.popen(
                    "bwrap", "--args", str(remote.fileno()), *bwrap_args,
                    **popen_kwargs,
                    executable=BWRAP_PATH,
                    pass_fds=(*pass_fds, remote.fileno()),
                ))
            yield child

    @contextmanager
    def run_ch(self):
        """Run cloud-hypervisor in a sandbox with its API on vmm.sock.

        s6-ipcserver-socketbinder binds <prefix>/vmm.sock and hands it
        to cloud-hypervisor as fd 0. Yields the sandbox process. Raises
        RuntimeError if the socket is missing or the process has exited.
        The VMM and vsock socket files are removed on exit, but only if
        start-up got far enough to see vmm.sock.
        """
        # Set before the `try` so the `finally` can always read it.
        needs_cleanup = False
        try:
            args = [
                SOCKETBINDER_PATH,
                "-B",
                self.prefix + "/vmm.sock",
                # "${lib.getExe pkgs.strace}",  # noqa: E501
                # "-Z",
                # "-ff",
                CH_PATH,
                "--api-socket",
                "fd=0",
            ]
            with self.bwrap(
                    *args,
                    bind=[self.prefix],
                    # Probably just need the path to vmlinux
                    ro_bind=["/nix/store"],  # I give up
                    unshare_net=False,
                    shell=False,
                    stderr=None,
                    ) as proc:
                assert alive_after(proc, 0.125)
                if not os.path.exists(self.prefix + "/vmm.sock"):
                    raise RuntimeError(
                        f"{self.prefix}/vmm.sock should exist by now")
                needs_cleanup = True
                if proc.returncode is not None:
                    raise RuntimeError("CH exited early")
                yield proc
        finally:
            unlink_paths = [
                self.prefix + "/vmm.sock",
                self.prefix + "/vmm.sock.lock",
                self.prefix + "/vsock.sock",
            ] if needs_cleanup else []
            for p in unlink_paths:
                if os.path.exists(p):
                    os.remove(p)

    @contextmanager
    def start_gpu(
            self,
            ):
        """Serve a crosvm GPU device on <prefix>/gpu.sock.

        Yields (process, socket_path); the socket file is removed on
        exit. Raises KeyError if XDG_RUNTIME_DIR or WAYLAND_DISPLAY is
        absent from PASSTHRU_ENV.
        """
        sock_path = self.prefix + "/gpu.sock"
        args = [
            SOCKETBINDER_PATH,
            "-b", "1",
            sock_path,
            "s6-ipcserverd",
            "-1c1",
            # "${lib.getExe pkgs.strace}",  # noqa: E501
            # "-Z",
            # "-ff",
            "${lib.getExe pkgs.crosvm}",  # noqa: E501
            "--no-syslog",
            "device", "gpu",
            "--fd", "0",
            "--wayland-sock",
            f'{PASSTHRU_ENV["XDG_RUNTIME_DIR"]}/{PASSTHRU_ENV["WAYLAND_DISPLAY"]}',  # noqa: E501
            "--params",
            "{ \"context-types\": \"cross-domain:virgl2:venus\" }",
        ]
        with self.popen(
                *args,
                stderr=None,
                ) as proc, removing(sock_path):
            yield proc, sock_path

    @contextmanager
    def start_virtiofsd(
            self,
            root_dir,
            tag,
            ro=False,
            subdirs=None,
            extra_flags=("--posix-acl",)):
        """Share `root_dir` through virtiofsd under the given tag.

        Yields (process, socket_path), the socket being
        <prefix>/virtiofsd-<tag>.sock; the socket file is removed on
        exit. `ro` adds --readonly. `subdirs` and `extra_flags` are
        accepted but currently unused.
        """
        assert os.path.exists(root_dir)
        sock_path = self.prefix + f"/virtiofsd-{tag}.sock"
        # NOTE: handing virtiofsd a pre-bound socket via --fd was tried;
        # virtiofsd expects a blocking socket, so it binds the path
        # itself instead.
        args = [
            # virtiofsd runs in fresh user/mount namespaces with the
            # caller mapped to uid/gid 1000, not under bwrap().
            "unshare", "-rUm",
            "unshare", "--map-user", "1000", "--map-group", "1000",
            VIRTIOFSD_PATH,
            "--shared-dir",
            root_dir,
            "--tag",
            tag,
            "--socket-path",
            sock_path,
        ]
        if ro:
            args.append("--readonly")
        # Same clean-up helper as start_gpu(); the process is stopped
        # first, then the socket file is removed.
        with removing(sock_path), self.popen(*args) as p:
            yield p, sock_path
@contextmanager
def defer(f):
    """Context manager that calls `f()` when the block is left.

    `f` runs on normal exit and when the block raises.
    """
    with ExitStack() as stack:
        stack.callback(f)
        yield
@contextmanager
def removing(*paths):
    """Context manager that deletes the given files when the block ends.

    Paths are handled in order; ones that do not exist are skipped.
    """
    def remove_existing():
        # filter() is lazy, so each existence check happens right
        # before the corresponding removal.
        for path in filter(os.path.exists, paths):
            os.remove(path)

    try:
        yield
    finally:
        remove_existing()
if __name__ == "__main__":
    # Unknown arguments are tolerated and ignored.
    args, args_next = parser.parse_known_args()
    # Both options are needed below ($VM expansion, `ch-remote create`),
    # so report a usage error instead of failing later with a TypeError.
    required = {"--vm": args.vm, "--vm-config": args.vm_config}
    missing = [flag for flag, value in required.items() if value is None]
    if missing:
        parser.error("missing required argument(s): " + ", ".join(missing))
    preprocess_args(args)

    # Directory shared with the guest under the virtiofs tag "send".
    send_dir = PASSTHRU_ENV["HOME"] + f"/send/{args.vm}"
    os.makedirs(send_dir, exist_ok=True)
    os.makedirs(args.prefix, exist_ok=True)
    os.makedirs(args.prefix + "/pts", exist_ok=True)

    ps = Processes(
        prefix=args.prefix,
        vm=args.vm,
    )
    # Common prefix of every ch-remote call against this VM's API socket.
    ch_remote = [
        "ch-remote",
        "--api-socket",
        args.prefix + "/vmm.sock",
    ]
    with ExitStack() as cleanup:
        # Helper daemons first, then the VMM; the stack stops them in
        # reverse order on exit.
        vfsd, vfsd_path = cleanup.enter_context(
            ps.start_virtiofsd(
                send_dir,
                tag="send",
            ))
        gpud, gpud_path = cleanup.enter_context(
            ps.start_gpu()
        )
        ch = cleanup.enter_context(ps.run_ch())

        # Create the VM from the config file, attach devices, boot.
        ps.exec(*ch_remote, "create", args.vm_config)
        # The add-net call is wrapped in `taps pass`; the device
        # refers to fd 3.
        ps.exec(
            TAPS_PATH, "pass",
            *ch_remote, "add-net",
            "id=wan,fd=3,mac=00:00:00:00:00:01")
        ps.exec(
            *ch_remote, "add-fs",
            f"tag=send,socket={vfsd_path},id=send")
        ps.exec(*ch_remote, "add-gpu", f"socket={gpud_path}")
        ps.exec(*ch_remote, "boot")
        ps.exec(*ch_remote, "info")
        try:
            ch.wait()
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop; clean up and exit.
            pass
'';
in
# The runner itself: an execline wrapper named after the host that calls
# supervise-vm with the generated VM config file and the host name.
writeElb "run-${hostName}" ''
${superviseVm} --vm-config=${chSettingsFile} --vm=${hostName}
'';
}
# Guest-side configuration, applied only when the option is enabled:
# initrd modules plus an overlay /nix/store assembled from the layers.
(lib.mkIf cfg.enable (
  let
    # Mount point of one read-only layer image.
    roStoreDir = img: "/nix/.ro-stores/${toString img.seq}";

    # One mount per layer, located by its filesystem label.
    roStoreMounts = builtins.listToAttrs (
      map (img: {
        name = roStoreDir img;
        value = {
          device = "/dev/disk/by-label/${img.label}";
          neededForBoot = true;
          options = [ "x-systemd.device-timeout=5" ];
        };
      }) layers
    );
  in
  {
    boot.initrd.availableKernelModules = [
      "erofs"
      "overlay"
      "virtio_mmio"
      "virtio_pci"
      "virtio_blk"
      # "9pnet_virtio"
      # "9p"
      "virtiofs"
    ];
    boot.initrd.systemd.enable = lib.mkDefault true;

    fileSystems = {
      "/nix/store" = {
        fsType = "overlay";
        overlay.lowerdir = map roStoreDir layers;
        neededForBoot = true;
      };
    }
    // roStoreMounts;
  }
))
];
}