diff --git a/examples/dummy.nix b/examples/dummy.nix index 4689af1..7d8c813 100644 --- a/examples/dummy.nix +++ b/examples/dummy.nix @@ -13,6 +13,8 @@ system.stateVersion = "25.11"; + networking.hostName = "dummy"; + vmapps.enable = true; _module.args.inputs = import ../npins; diff --git a/pkgs/ch-proxy/meson.build b/pkgs/ch-proxy/meson.build index e3976a2..379f96f 100644 --- a/pkgs/ch-proxy/meson.build +++ b/pkgs/ch-proxy/meson.build @@ -1,3 +1,9 @@ project('ch-proxy', 'c') -executable('ch-proxy', 'proxy.c', install: true) +pkg = import('pkgconfig') + +sendfd = library('sendfd', [ 'sendfd.c', 'sendfd.h' ], install: true) +pkg.generate(sendfd) +install_headers('sendfd.h') + +executable('ch-proxy', 'proxy.c', link_with: [sendfd], install: true) diff --git a/pkgs/ch-proxy/package.nix b/pkgs/ch-proxy/package.nix index 9fd3b21..e94eec8 100644 --- a/pkgs/ch-proxy/package.nix +++ b/pkgs/ch-proxy/package.nix @@ -8,6 +8,7 @@ stdenv.mkDerivation { pname = "ch-proxy"; version = "0.0.0"; + outputs = [ "out" "lib" ]; nativeBuildInputs = [ meson ninja @@ -19,6 +20,8 @@ stdenv.mkDerivation { fs.toSource { fileset = fs.unions [ ./proxy.c + ./sendfd.c + ./sendfd.h ./meson.build ]; root = ./.; diff --git a/pkgs/ch-proxy/proxy.c b/pkgs/ch-proxy/proxy.c index ed1dea0..46730e6 100644 --- a/pkgs/ch-proxy/proxy.c +++ b/pkgs/ch-proxy/proxy.c @@ -9,9 +9,9 @@ #include -struct msghdr mk_msghdr(); +#include "sendfd.h" + int ch_connect(const char*, const char*); -ssize_t send_fd(int, int); #define _WRITE_CONFIRM(fd, buf, buflen) {if (write((fd), (buf), (buflen)) != (buflen)) { perror("ch-proxy/write/partial write"); exit(EXIT_FAILURE); }} @@ -168,19 +168,13 @@ int main(int argc, char** argv) { exit(EXIT_FAILURE); } - if (send_fd(1, s) == -1) { + if (send_fd(1, s, NULL) == -1) { perror("ssh-vsock-proxy/main/send_fd"); return EXIT_FAILURE; } return 0; } -struct msghdr mk_msghdr() { - struct msghdr msg; - memset(&msg, 0, sizeof(msg)); - - return msg; -} int ch_connect(const char *path, const 
/*
 * send_fd(dst_fd, fd, iov)
 *
 * Passes the open descriptor `fd` to the peer of the socket `dst_fd` as one
 * SCM_RIGHTS control message, carried by a single sendmsg() call.
 *
 * dst_fd: socket to sendmsg() over;
 * fd:     descriptor to pass;
 * iov:    exactly one iovec of payload to send alongside the descriptor,
 *         or NULL to send a single dummy zero byte.
 *
 * returns: the result of sendmsg() (number of payload bytes sent, or -1
 *          with errno set), or -1 if the control buffer could not be set
 *          up (errno is not meaningful in that case).
 */
ssize_t send_fd(int dst_fd, int fd, const struct iovec *iov) {
  struct msghdr msg = { 0 };

  /* openssh expects to receive a dummy length=1 iovec? */
  char ch = 0;
  struct iovec vecDefault = { 0 };
  vecDefault.iov_base = &ch;
  vecDefault.iov_len = 1;

  /* struct msghdr has no const-qualified msg_iov; the cast only drops the
   * qualifier, the caller's iovec is handed to sendmsg() unmodified. */
  msg.msg_iov = iov == NULL ? &vecDefault : (struct iovec *) iov;
  msg.msg_iovlen = 1;

  /* The union forces cmsghdr alignment onto the raw control buffer. */
  union {
    struct cmsghdr align;
    char buf[CMSG_SPACE(sizeof(int))];
  } u;

  msg.msg_control = u.buf;
  msg.msg_controllen = sizeof(u.buf);

  struct cmsghdr *cmptr = CMSG_FIRSTHDR(&msg);
  if (cmptr == NULL) {
    /* Not an errno-style failure, so no perror(); and do not fall through
     * to the dereferences below. */
    fprintf(stderr, "ch-proxy/send_fd/CMSG_FIRSTHDR: failed to initialize msg_control\n");
    return -1;
  }

  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
  cmptr->cmsg_level = SOL_SOCKET;
  cmptr->cmsg_type = SCM_RIGHTS;
  *((int *) CMSG_DATA(cmptr)) = fd;

  return sendmsg(dst_fd, &msg, 0);
}

/*
 * recv_fd(sock, flags)
 *
 * Receives one message from `sock` and extracts a descriptor passed with it
 * as SCM_RIGHTS. Only one byte of the payload is read into a scratch
 * buffer; the payload itself is not returned to the caller.
 *
 * sock:  socket to recvmsg() from;
 * flags: passed through to recvmsg(), e.g. 0 or MSG_CMSG_CLOEXEC.
 *
 * returns: the received descriptor, or -1 if recvmsg() failed or the
 *          message carried no (complete) SCM_RIGHTS control message.
 */
int recv_fd(int sock, int flags) {
  int out = -1;

  struct msghdr msg = { 0 };
  struct cmsghdr *cmsg = NULL;
  struct iovec iov = { 0 };
  char dummy = 0;

  iov.iov_base = &dummy;
  iov.iov_len = sizeof(dummy);

  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;

  union {
    struct cmsghdr align;
    char buf[CMSG_SPACE(sizeof(int))];
  } u;

  msg.msg_control = u.buf;
  msg.msg_controllen = sizeof(u.buf);

  if (recvmsg(sock, &msg, flags) < 0) {
    perror("recv_fd: recvmsg");
    return -1;
  }

  for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
    if (cmsg->cmsg_level != SOL_SOCKET) { continue; }
    if (cmsg->cmsg_type != SCM_RIGHTS) { continue; }
    /* CMSG_LEN() maps a payload size to a total length; the length of the
     * received message lives in cmsg->cmsg_len. */
    if (cmsg->cmsg_len < CMSG_LEN(sizeof(out))) { continue; }
    out = *(int *) CMSG_DATA(cmsg);
  }
  return out;
}
the number of bytes sent */ +ssize_t send_fd(int chanFd, int fd, const struct iovec *); + +/* recv_fd(chanFd, flags) + * + * chanFd: fd to recvmsg from; + * flags: recvmsg flags e.g. 0, or MSG_CMSG_CLOEXEC? + * + * returns: the received fd or -1 */ +int recv_fd(int chanFd, int flags); + +#endif /* _CH_PROXY_SENFD */ diff --git a/pkgs/taps/.envrc b/pkgs/taps/.envrc new file mode 100644 index 0000000..35f8c10 --- /dev/null +++ b/pkgs/taps/.envrc @@ -0,0 +1 @@ +use nix ../../ -A pkgs.taps diff --git a/pkgs/taps/main.c b/pkgs/taps/main.c new file mode 100644 index 0000000..3b276df --- /dev/null +++ b/pkgs/taps/main.c @@ -0,0 +1,342 @@ +#define _GNU_SOURCE + +#include +#include /* secure_getenv */ +#include +#include +#include +#include +#include +#include +#include +#include +#include /* open, O_NONBLOCK, &c */ +#include +#include +#include +#include +#include +#include + +#define __UAPI_DEF_IF_IFNAMSIZ 1 +#include +#include +#include + +#include "sendfd.h" + +// From `man unix` +#define SUN_PATH_SZ 108 +#define N_CONNS 16 + +char *TEMP_PATHS[1024] = { 0 }; +int LAST_TEMP_PATH = -1; + +#define IFR_FLAGS_ALLOWED (IFF_NO_PI | IFF_TAP | IFF_TUN | IFF_VNET_HDR | IFF_MULTI_QUEUE | IFF_PERSIST) +#define IFR_FLAGS_DEFAULT (IFF_NO_PI | IFF_TAP | IFF_VNET_HDR | IFF_PERSIST) + +#define PTR_OR_DIE(expr) TRUE_OR_DIE((expr) != NULL) +#define DO_OR_DIE(expr) TRUE_OR_DIE((expr) != -1) +#define TRUE_OR_DIE(expr, ...) TRUE_OR_(EXIT_FAILURE, expr, __VA_ARGS__) +#define TRUE_OR_WARN(expr, ...) TRUE_OR_(0, expr, __VA_ARGS__) +#define TRUE_OR_(status, expr, ...) \ + do if (!(expr)) { \ + error(status, errno, "Failed assertion: " #expr "." 
/*
 * Compares the first `n` bytes of `test_addr`, each ANDed with the
 * corresponding byte of `mask`, against `expected_addr`.
 *
 * Returns true when every masked byte equals the expected byte (and
 * trivially when n <= 0), false at the first mismatch. `expected_addr` is
 * compared as given; it is not masked.
 */
bool match_mask(const char *test_addr, const char *expected_addr, const char *mask, int n) {
  const char *probe = test_addr;
  const char *want = expected_addr;
  const char *bits = mask;

  for (int remaining = n; remaining > 0; --remaining, ++probe, ++want, ++bits) {
    const int masked = *probe & *bits;
    if (masked != *want) {
      return false;
    }
  }
  return true;
}
/*
 * Adapted from spectrum's `mktuntap.c` (2019 Alyssa Ross,
 * GPL-2.0-only), which in turn adapts `tun_alloc` from
 * `linux/Documentation/networking/tuntap.rst`.
 *
 * Opens /dev/net/tun and attaches the descriptor to a tun/tap interface
 * with TUNSETIFF, then sets the vnet header size to
 * sizeof(struct virtio_net_hdr_v1).
 *
 * dev:       requested interface name, or NULL to pass an empty name to the
 *            kernel. When non-NULL it must point to at least IFNAMSIZ bytes:
 *            the name reported back by the kernel is copied into it.
 * openFlags: flags for open(2) on /dev/net/tun.
 * ifrFlags:  IFF_TUN   - TUN device (no Ethernet headers)
 *            IFF_TAP   - TAP device
 *
 *            IFF_NO_PI - Do not provide packet information
 * out_fd:    receives the descriptor on success; untouched on failure.
 *
 * returns: 0 on success; EINVAL (errno also set) if `dev` does not fit in
 *          IFNAMSIZ; the non-zero ioctl result if TUNSETIFF fails.
 *          Failing to open /dev/net/tun or to set the vnet header size
 *          terminates the process (DO_OR_DIE).
 */
int tuntap_alloc(char *dev, short openFlags, short ifrFlags, int *out_fd) {
  struct ifreq ifr = { 0 };
  int fd = -1, err = 0;

  /* Validate before opening anything, so the early return cannot leak a
   * descriptor. */
  if (dev != NULL) {
    if (strlen(dev) >= IFNAMSIZ) {
      /* If client requests a name, we do want the entire name to fit */
      errno = EINVAL;
      return EINVAL;
    }
    strncpy(ifr.ifr_name, dev, IFNAMSIZ - 1);
  }
  ifr.ifr_flags = ifrFlags;

  DO_OR_DIE(fd = open("/dev/net/tun", openFlags));

  TRUE_OR_WARN((err = ioctl(fd, TUNSETIFF, (void *)&ifr)) == 0);
  if (err != 0) {
    close(fd);
    return err;
  }

  {
    int sz = sizeof(struct virtio_net_hdr_v1);
    DO_OR_DIE(ioctl(fd, TUNSETVNETHDRSZ, &sz));
  }

  /* Report the name back (it may differ from the request, e.g. for a
   * "%d" template) - but only if the caller gave us somewhere to put it. */
  if (dev != NULL) {
    strncpy(dev, ifr.ifr_name, IFNAMSIZ);
  }
  *out_fd = fd;
  return 0;
}
CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET) { continue; } + if (cmsg->cmsg_type != SCM_CREDENTIALS) { continue; } + if (CMSG_LEN(cmsg) < sizeof(struct ucred)) { continue; } + memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct ucred)); + break; + } + + if (req.ifrFlags == 0) { + req.ifrFlags = IFR_FLAGS_DEFAULT; + } + + bool allowed = false; + for (int i = 0; !allowed && i < patterns->n; ++i) { + bool ifnameOk = fnmatch(patterns->patterns[i].name, req.name, 0) == 0; + bool flagsOk = (req.ifrFlags & IFR_FLAGS_ALLOWED) == req.ifrFlags; + allowed = ifnameOk && flagsOk; + } + + struct tap_reply reply = { 0 }; + if (!allowed) { reply.status = AUTH_ERROR; } + if (allowed) { + /* O_CLOEXEC? */ + int fd = -1; + TRUE_OR_DIE(tuntap_alloc(req.name, O_RDWR | O_NONBLOCK, req.ifrFlags, &fd) == 0); + struct iovec iov = { 0 }; + iov.iov_base = &reply; + iov.iov_len = sizeof(struct tap_reply); + TRUE_OR_DIE(send_fd(sock, fd, &iov) > 0); + close(fd); + } + close(sock); + } + close(listener); +} + +struct allow_patterns parsePatterns(const char *raw) { + const size_t rawLen = strlen(raw); + + size_t nPatterns = 0; + for (int i = 0; i < rawLen; ++i) { + const int start = i; + if (isspace(raw[i])) { continue; } + for (; i < rawLen && !isspace(raw[i]); ++i) { } + if (start < i) { ++nPatterns; } + } + + struct allow_pattern *patterns = NULL; + PTR_OR_DIE(patterns = calloc(nPatterns, sizeof(struct allow_pattern))); + + int iPattern = 0; + for (int i = 0; i < rawLen; ++i) { + if (isspace(raw[i])) { continue; } + /* used to have per-group/per-user patterns, "u:$username:$pattern", &c - gone */ + { + const int start = i; + for (; i < rawLen && !isspace(raw[i]); ++i) { } + if (start < i) { + PTR_OR_DIE(patterns[iPattern].name = strndup(&raw[start], i - start)); + iPattern += 1; + } + } + } + struct allow_patterns out = { + .n = nPatterns, + .patterns = patterns + }; + return out; +} + +int get(const char *servePath, const char *ifname, short ifrFlags) { + /* TODO: sock: move out 
*/ + int sock; + struct sockaddr_un addr; + + DO_OR_DIE(sock = socket(AF_UNIX, SOCK_SEQPACKET, 0)); + + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, servePath, SUN_PATH_SZ - 1); + DO_OR_DIE (connect(sock, &addr, sizeof(addr))); + + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg = NULL; + struct iovec iov = { 0 }; + struct tap_request req = { 0 }; + strncpy(req.name, ifname, IFNAMSIZ - 1); + req.ifrFlags = ifrFlags; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + iov.iov_base = &req; + iov.iov_len = sizeof(struct tap_request); + + TRUE_OR_DIE(sendmsg(sock, &msg, 0) > 0); + + int tunFd = -1; + DO_OR_DIE(tunFd = recv_fd(sock, 0)); + close(sock); + return tunFd; +} + +void cleanup(int signo, siginfo_t *info, void *_context) { + for (int i = 0; i <= LAST_TEMP_PATH; ++i) { + TRUE_OR_DIE(unlink(TEMP_PATHS[i]) != -1 || errno == ENOENT); + } + if (signo == SIGINT) { + exit(EXIT_SUCCESS); + } + errx(EXIT_FAILURE, "Exiting with signal %d", signo); +} + +/* skarlibs under ISC */ +int uncoe (int fd) +{ + int flags = fcntl(fd, F_GETFD, 0) ; + return flags < 0 ? flags : flags & FD_CLOEXEC ? 
fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC) : 0 ; +} + +int main(int argc, char **argv) { + struct sigaction act = { 0 }; + act.sa_flags = SA_SIGINFO; + act.sa_sigaction = cleanup; + DO_OR_DIE(sigaction(SIGINT, &act, NULL)); + DO_OR_DIE(sigaction(SIGSEGV, &act, NULL)); + + bool cmdServe = false; + bool cmdPass = false; + char *ifname = "vt-%d"; + + char **rest = argv + 1; + char **end = argv + argc; + + TRUE_OR_DIE(argc > 1); + if (strcmp(rest[0], "serve") == 0) { + cmdServe = true; + ++rest; + } else if (strcmp(rest[0], "pass") == 0) { + cmdPass = true; + ++rest; + for (; rest != end && rest[0][0] == '-'; ++rest) { + if (strcmp(rest[0], "--")) { break; } + else if (strncmp(rest[0], "--ifname=", sizeof("--ifname="))) { + ifname = rest[0] + sizeof("--ifname="); + } + } + } else { + error(EINVAL, EINVAL, "no subcommand \"%s\"", rest[0]); + } + + int nextArgc = argc - (rest - argv); + char * const* nextArgv = rest; + + const char *patternsRaw = secure_getenv("TAPS_ALLOW"); + if (patternsRaw == NULL) { + patternsRaw = "*"; + } + + struct allow_patterns patterns = { 0 }; + if (cmdServe) { + PTR_OR_DIE((patterns = parsePatterns(patternsRaw)).patterns); + } + + const char *servePath = secure_getenv("TAPS_SOCK"); + if (servePath == NULL) { + servePath = "/run/taps/taps.sock"; + } + + if (cmdServe) { + acceptRequests(servePath, &patterns); + } else if (cmdPass) { + TRUE_OR_DIE(nextArgc > 0); + int fd = -1; + DO_OR_DIE(fd = get(servePath, ifname, 0)); + if (fd != 3) { + DO_OR_DIE(dup2(fd, 3)); + close(fd); + fd = 3; + } + uncoe(fd); + DO_OR_DIE(execvp(nextArgv[0], nextArgv)); + } else { + error(EINVAL, EINVAL, "subcommand args"); + } + + return 0; +} diff --git a/pkgs/taps/meson.build b/pkgs/taps/meson.build new file mode 100644 index 0000000..06057b4 --- /dev/null +++ b/pkgs/taps/meson.build @@ -0,0 +1,4 @@ +project('taps', 'c') + +sendfd = dependency('sendfd') +executable('taps', 'main.c', dependencies: [sendfd], install: true) diff --git a/pkgs/taps/package.nix 
b/pkgs/taps/package.nix new file mode 100644 index 0000000..c666cd9 --- /dev/null +++ b/pkgs/taps/package.nix @@ -0,0 +1,44 @@ +{ + lib, + stdenv, + meson, + pkg-config, + rustc, + ninja, + ch-proxy, +}: + +stdenv.mkDerivation { + pname = "taps"; + version = "0.0.0"; + src = + let + fs = lib.fileset; + in + fs.toSource { + root = ./.; + fileset = fs.unions [ + ./meson.build + ./main.c + ]; + }; + + nativeBuildInputs = [ + ninja + meson + pkg-config + rustc + ]; + buildInputs = [ ch-proxy ]; +} +# { lib, rustPlatform }: +# +# rustPlatform.buildRustPackage { +# pname = "taps"; +# version = "0.0.0"; +# src = let fs = lib.filesystem; in fs.toSource { +# root = ./.; +# fileset = fs.unions [ +# ]; +# }; +# }; diff --git a/profiles/ch-runner.nix b/profiles/ch-runner.nix index fbb09a4..f156705 100644 --- a/profiles/ch-runner.nix +++ b/profiles/ch-runner.nix @@ -10,21 +10,112 @@ let cfg = config.uvms.cloud-hypervisor; + + inherit (config.networking) hostName; inherit (config.debug.closure.erofs) layers; + inherit (lib) + mkOption + types + concatMapStringsSep + getExe + getExe' + getBin + ; + + package = pkgs.cloud-hypervisor.overrideAttrs (oldAttrs: { + patches = oldAttrs.patches or [ ] ++ [ + # ../patches/ch.patch + ]; + buildType = "debug"; + dontStrip = true; + }); + uvmsPkgs = pkgs.callPackage ../pkgs { }; + + chSettingsFile = (pkgs.formats.json { }).generate "vm.json" cfg.settings; + + uvmPrefix = "\${HOME}/uvms/${hostName}"; + vmmSock = "${uvmPrefix}/vmm.sock"; + elbPrefix = "${lib.getBin pkgs.execline}/bin"; + s6Prefix = "${lib.getBin pkgs.s6}/bin"; + writeElb = name: text: writeElb' name "-W" text; + writeElb' = + name: elArgs: text: + pkgs.writeTextFile { + inherit name; + destination = "/bin/${name}"; + executable = true; + text = '' + #!${getExe' pkgs.execline "execlineb"}${lib.optionalString (elArgs != null) " "}${elArgs} + importas OLDPATH PATH + export PATH "${elbPrefix}:${s6Prefix}:''${OLDPATH}" + ${text} + ''; + }; in { options = { 
uvms.cloud-hypervisor.enable = lib.mkEnableOption "Configure guest (e.g. fileSystems)"; - uvms.cloud-hypervisor.runner = lib.mkOption { - type = lib.types.package; + uvms.cloud-hypervisor.runner = mkOption { + type = types.package; description = "A naive script for running this system in cloud-hypervisor"; }; - uvms.cloud-hypervisor.extraArgv = lib.mkOption { - type = lib.types.listOf lib.types.str; - default = [ ]; + uvms.cloud-hypervisor.debugger = mkOption { + type = types.lazyAttrsOf types.anything; + description = "Same but you can debug the kernel"; }; - uvms.cloud-hypervisor.argv = lib.mkOption { - type = lib.types.listOf lib.types.str; + uvms.cloud-hypervisor.settingsFile = mkOption { + type = types.package; + default = chSettingsFile; + defaultText = "..."; + readOnly = true; + }; + uvms.cloud-hypervisor.settings = mkOption { + default = { }; + type = types.submodule { + freeformType = (pkgs.formats.json { }).type; + options = { + payload = { + cmdline = mkOption { type = types.str; }; + kernel = mkOption { type = types.str; }; + initramfs = mkOption { + type = types.str; + default = "${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}"; + }; + }; + vsock = { + cid = mkOption { + type = types.int; + default = 4; + }; + socket = mkOption { + type = types.str; + default = "vsock.sock"; + }; + }; + "api-socket" = mkOption { + type = types.str; + default = "vmm.sock"; + }; + "serial".mode = mkOption { + type = types.str; + default = "File"; + }; + "serial".file = mkOption { + type = types.nullOr types.str; + default = "serial"; + }; + "console".mode = mkOption { + type = types.str; + default = "Pty"; + }; + "console".file = mkOption { + type = types.nullOr types.str; + default = null; + }; + # "watchdog" = true; + # "seccomp" = true; + }; + }; }; uvms.cloud-hypervisor.extraCmdline = lib.mkOption { type = lib.types.listOf lib.types.str; @@ -45,34 +136,539 @@ in }; config = lib.mkMerge [ { - uvms.cloud-hypervisor.argv = lib.mkBefore ( - 
[ - (lib.getExe pkgs.cloud-hypervisor) - "--cmdline=${lib.concatStringsSep " " cfg.cmdline}" - "--kernel=${config.boot.kernelPackages.kernel}/${pkgs.stdenv.hostPlatform.linux-kernel.target}" - "--initramfs=${config.system.build.initialRamdisk}/${config.system.boot.loader.initrdFile}" - "--vsock=cid=4,socket=vsock.sock" - "--api-socket=vmm.sock" - "--serial=tty" - "--console=null" - "--watchdog" - "--seccomp=true" - ] - ++ cfg.extraArgv - ); - uvms.cloud-hypervisor.runner = pkgs.writeShellScriptBin "run-${config.networking.hostName}" '' - set -euo pipefail - GUESTNAME=${config.networking.hostName} - args=( - ${lib.concatMapStringsSep "\n" lib.escapeShellArg cfg.argv} - ) - mkdir -p "$HOME/uvms/$GUESTNAME" - cd "$HOME/uvms/$GUESTNAME" - cleanup() { - rm "$HOME/uvms/$GUESTNAME"/{vmm,vsock}.sock + uvms.cloud-hypervisor.settings = { + payload = { + cmdline = lib.concatStringsSep " " cfg.cmdline; + kernel = "${config.boot.kernelPackages.kernel}/${pkgs.stdenv.hostPlatform.linux-kernel.target}"; + }; + disks = map (img: { + path = img; + readonly = true; + id = toString img.label; + }) layers; + memory = { + size = 1536 * 1048576; + shared = true; + mergeable = true; + # hotplugged_size = 512 * 1048576; + # hotplugd_size = 1536 * 1048576; + # hotplug_method = "virtio-mem" + }; + cpus = { + boot_vcpus = 4; + max_vcpus = 4; + }; + }; + + uvms.cloud-hypervisor.debugger = pkgs.testers.runNixOSTest ( + { config, ... }: + { + name = "test-run-${hostName}"; + passthru = rec { + inherit (config.nodes.machine.system.build) gdbScript; + inherit (config.nodes.machine.boot.kernelPackages) kernel; + kernelSrc = pkgs.srcOnly kernel; + }; + nodes.machine = + { config, ... 
}: + let + kernel = config.boot.kernelPackages.kernel; + kernelSrc = pkgs.srcOnly kernel; + gdbScript = writeElb "attach-gdb" '' + if { rm -rf /tmp/gdb } + if { mkdir -p /tmp/gdb/kos } + cd /tmp/gdb + if { + elglob -0 files ${kernelSrc}/* + forx -E f { $files } + ln -s $f ./ + } + if { mkdir -p build } + cd build + if { + forx -E pattern { + ${kernel.modules}/lib/modules/*/kernel/drivers/net/tun* + ${kernel.modules}/lib/modules/*/kernel/drivers/net/tap* + } + elglob -0 files $pattern + forx -E f { $files } + if { cp $f . } + backtick -E COMPRESSED { basename $f } + xz -d $COMPRESSED + } + elglob -0 GDB_SCRIPT_DIR ${lib.getDev kernel}/lib/modules/*/build/scripts/gdb + if { + if { cp -r --no-preserve=all $GDB_SCRIPT_DIR gdb_scripts } + mv gdb_scripts/linux/constants.py.in gdb_scripts/linux/constants.py + } + ${getExe pkgs.gdb} + -ex "python import sys; sys.path.insert(0, \"''${GDB_SCRIPT_DIR}\")" + -ex "target remote :1234" + -ex "source ''${GDB_SCRIPT_DIR}/vmlinux-gdb.py" + -ex "lx-symbols" + ${kernel.dev}/vmlinux + ''; + in + { + boot.kernelPackages = pkgs.linuxPackagesFor ( + (pkgs.linux.override (oldArgs: { + # extraMakeFlags = oldArgs.extraMakeFlags or [ ] ++ [ + # "scripts_gdb" + # ]; + kernelPatches = oldArgs.kernelPatches or [ ] ++ [ + { + name = "debug"; + patch = null; + structuredExtraConfig = { + GDB_SCRIPTS = lib.kernel.yes; + DEBUG_INFO = lib.kernel.yes; + DEBUG_INFO_REDUCED = lib.kernel.no; + # FRAME_POINTER = lib.kernel.yes; # "unused option"??? 
+ KALLSYMS = lib.kernel.yes; + KGDB = lib.kernel.yes; + }; + } + ]; + })).overrideAttrs + (oldAttrs: { + dontStrip = true; + postInstall = oldAttrs.postInstall or "" + '' + cp "$buildRoot/scripts/gdb/linux/constants.py" $dev/lib/modules/*/build/scripts/gdb/linux/ || echo "$buildRoot/scripts/gdb/linux/constants.py doesn't exist" + ''; + }) + ); + boot.kernelParams = [ "nokaslr" ]; + networking.useNetworkd = true; + virtualisation.qemu.options = [ "-s" ]; + environment.systemPackages = [ + pkgs.gdb + package # CH + cfg.runner + uvmsPkgs.taps + ]; + system.build.gdbScript = gdbScript; + systemd.services.taps = { + wantedBy = [ "multi-user.target" ]; + environment.TAPS_SOCK = "/run/taps/taps.sock"; + serviceConfig = { + UMask = "0007"; + ExecStart = "${getExe uvmsPkgs.taps} serve"; + RuntimeDirectory = "taps"; + DynamicUser = true; + AmbientCapabilities = [ + "CAP_NET_BIND_SERVICE" + "CAP_NET_ADMIN" + ]; + NoNewPrivileges = true; + }; + }; + }; + testScript = '' + machine.succeed("${getExe cfg.runner}") + ''; } - exec -a "uuvm/$GUESTNAME" "''${args[@]}" - ''; + ); + + # NOTE: Used to be an even uglier bash script, but, for now, execline makes for easier comparisons against spectrum + uvms.cloud-hypervisor.runner = + let + toolsClosure = pkgs.writeClosure [ + (lib.getBin pkgs.execline) + (lib.getBin pkgs.s6) + (lib.getBin package) + (lib.getBin pkgs.virtiofsd) + (lib.getBin pkgs.bubblewrap) + uvmsPkgs.taps + ]; + + superviseVm = getExe superviseVm'; + superviseVm' = pkgs.writers.writePython3Bin "supervise-vm" { } '' + import os + import subprocess + import socket + from argparse import ArgumentParser + from contextlib import contextmanager, closing, ExitStack + + + parser = ArgumentParser("supervise-vm") + parser.add_argument("--vm") + parser.add_argument("--prefix", default="$HOME/uvms/$VM") + parser.add_argument("--sock", default="$PREFIX/supervisor.sock") + parser.add_argument("--vm-config") + + MSG_SIZE = 16 + ELB_DIR = "${lib.getBin pkgs.execline}/bin" # noqa: E501 
def preprocess_args(args_mut):
    """Expand $HOME, $VM and $PREFIX in every string attribute, in place.

    The placeholders are substituted in three passes over all string
    attributes: first $HOME (from PASSTHRU_ENV), then $VM (from
    ``args_mut.vm``), then $PREFIX (from ``args_mut.prefix``, which by
    then has had $HOME and $VM expanded).

    Values are read from ``args_mut`` itself, not from a module-level
    ``args``, so the function also works before or without one.

    Returns ``args_mut``.
    """
    keys = [k for k, v in vars(args_mut).items() if isinstance(v, str)]
    # Callables, so a value is only looked up if its placeholder occurs.
    substitutions = (
        ("$HOME", lambda: PASSTHRU_ENV["HOME"]),
        ("$VM", lambda: args_mut.vm),
        ("$PREFIX", lambda: args_mut.prefix),
    )
    for token, lookup in substitutions:
        for k in keys:
            v = getattr(args_mut, k)
            if token in v:
                setattr(args_mut, k, v.replace(token, lookup()))
    return args_mut
def execline(self, *args, **kwargs):
    """Run an execline script through ``self.exec``.

    The positional arguments are joined with newlines and handed to
    ``execlineb -c``. ``self.exec`` supplies the cwd/check/env defaults
    and ``self.defaults``, so they are not repeated here. The execlineb
    binary from ELB_DIR is used unless the caller passes ``executable``.
    """
    if "executable" not in kwargs:
        kwargs["executable"] = ELB_DIR + "/execlineb"
    # NB: self.exec, not the builtin exec().
    return self.exec(
        "execlineb", "-c", "\n".join(args),
        **kwargs,
    )
isinstance(p, str) else p + print_arg("--ro-bind", p1, p2) + for p in dev_bind: + p1, p2 = (p, p) if isinstance(p, str) else p + print_arg("--dev-bind", p1, p2) + for p in (*tmpfs, *tmpfs_extra): + print_arg("--tmpfs", p) + # Hunch: order might matter... + for p in remount_ro: + print_arg("--remount-ro", p) + + bwrap_args_f.flush() + + with closing(remote): + proc = self.popen( + "bwrap", "--args", str(remote.fileno()), *bwrap_args, + **popen_kwargs, + executable=BWRAP_PATH, + pass_fds=(*pass_fds, remote.fileno()), + ) + + with proc as p: + try: + yield p + finally: + try: + p.poll() + except: # noqa: E722 + pass + if p.returncode is None: + p.terminate() + p.wait() + + @contextmanager + def run_ch(self): + args = [ + SOCKETBINDER_PATH, + "-B", + self.prefix + "/vmm.sock", + CH_PATH, + "--api-socket", + "fd=0", + ] + p = self.popen( + *args, + shell=False, + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + pass_fds=(2,)) + try: + p.wait(0.125) + needs_cleanup = False + except subprocess.TimeoutExpired: + needs_cleanup = True + if not os.path.exists(self.prefix + "/vmm.sock"): + raise RuntimeError(f"{self.prefix}/vmm.sock should exist by now") + if p.returncode is not None: + raise RuntimeError("CH exited early") + try: + yield p + finally: + try: + p.poll() + except: # noqa: E722 + pass + if p.returncode is None: + p.terminate() # CH handles SIG{INT,TERM}? + p.wait() + unlink_paths = [ + self.prefix + "/vmm.sock", + self.prefix + "/vmm.sock.lock", + self.prefix + "/vsock.sock", + ] if needs_cleanup else [] + for p in unlink_paths: + if os.path.exists(p): + os.remove(p) + + @contextmanager + def add_virtiofsd( + self, + root_dir, + tag, + ro=False, + subdirs=None, + extra_flags=("--posix-acl",)): + + assert os.path.exists(root_dir) + + sock_path = self.prefix + f"/virtiofsd-{tag}.sock" + # s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + # NOTE: Nope. 
Virtiofsd actually expects a blocking socket + # s.setblocking(True) + + def rm_sock(): + if os.path.exists(sock_path): + os.remove(sock_path) + + with ExitStack() as cleanup: # noqa: F841 + # s.bind(sock_path.encode("utf8")) + # cleanup.enter_context(closing(s)) + cleanup.enter_context(defer(rm_sock)) + + args = [ + # If using bwrap(): + # "--argv0", "virtiofsd", + # "--uid", "1000", + # "--gid", "1000", + # "--", + "unshare", "-rUm", + "unshare", "--map-user", "1000", "--map-group", "1000", + VIRTIOFSD_PATH, + "--shared-dir", + root_dir, + "--tag", + tag, + + # "--fd", + # str(s.fileno()), + "--socket-path", + sock_path, + + # If relying on bwrap(): + # "--sandbox", + # "none", + ] + if ro: + args.append("--readonly") + kwargs = { + # If bwrap(): + # "bind": [], + # ("ro_bind_extra" if ro else "bind"): + # [*subdirs] + # if subdirs is not None + # else [root_dir], + + # "pass_fds": (2, s.fileno()), + } + proc_ctx = self.popen(*args, **kwargs) + with proc_ctx as p: + try: + try: + p.wait(0.125) + except subprocess.TimeoutExpired: + pass + if p.returncode is not None: + raise RuntimeError("virtiofsd exited too early") + yield p, sock_path + finally: + if p.returncode is None: + p.kill() + p.wait() + if os.path.exists(sock_path): + os.remove(sock_path) + + + @contextmanager + def defer(f): + try: + yield + finally: + f() + + + if __name__ == "__main__": + args, args_next = parser.parse_known_args() + preprocess_args(args) + + os.makedirs(args.prefix, exist_ok=True) + ps = Processes( + prefix=args.prefix, + vm=args.vm, + ) + + ch_remote = [ + "ch-remote", + "--api-socket", + args.prefix + "/vmm.sock", + ] + + with ExitStack() as cleanup: + ch = cleanup.enter_context(ps.run_ch()) + ps.exec(*ch_remote, "create", args.vm_config) + ps.exec( + TAPS_PATH, "pass", + *ch_remote, "add-net", + "id=wan,fd=3,mac=00:00:00:00:00:01") + + send_dir = PASSTHRU_ENV["HOME"] + f"/send/{args.vm}" + os.makedirs(send_dir, exist_ok=True) + vfsd, vfsd_path = cleanup.enter_context( + 
ps.add_virtiofsd( + send_dir, + tag="send", + )) + ps.exec(*ch_remote, "add-fs", f"tag=send,socket={vfsd_path},id=send") + ps.exec(*ch_remote, "boot") + ps.exec(*ch_remote, "info") + try: + ch.wait() + except KeyboardInterrupt: + pass + ''; + in + writeElb "run-${hostName}" '' + ${superviseVm} --vm-config=${chSettingsFile} --vm=${hostName} + ''; } (lib.mkIf cfg.enable { boot.initrd.availableKernelModules = [ @@ -103,12 +699,6 @@ in } ) layers ); - uvms.cloud-hypervisor.argv = [ - "--memory=size=1536M,hotplug_size=1536M,hotplugged_size=512M,hotplug_method=virtio-mem,mergeable=on,shared=on" - "--cpus=boot=4" - "--disk" - ] - ++ map (img: "path=${img},readonly=true,id=${toString img.label}") layers; }) ]; } diff --git a/profiles/uvms-guest.nix b/profiles/uvms-guest.nix index e8c307d..281f343 100644 --- a/profiles/uvms-guest.nix +++ b/profiles/uvms-guest.nix @@ -41,6 +41,7 @@ in volumes = [ { image = "swapfile.img"; + serial = "swapfiles"; mountPoint = "/var/swapfiles"; size = 1024; } diff --git a/profiles/uvms-users.nix b/profiles/uvms-users.nix index e75ac8f..e7bbacf 100644 --- a/profiles/uvms-users.nix +++ b/profiles/uvms-users.nix @@ -29,7 +29,6 @@ in }; config = mergeIf cfg.enable [ { - services.getty.autologinUser = "user"; security.sudo.wheelNeedsPassword = false; users.mutableUsers = false; users.users.user = { diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..f8bb9a7 --- /dev/null +++ b/shell.nix @@ -0,0 +1,13 @@ +with import { }; + +mkShell.override { stdenv = stdenvNoCC; } { + packages = map lib.getBin [ + cloud-hypervisor + virtiofsd + crosvm # virtio-gpu + npins + ] ++ [ + man-pages + linux-manual + ]; +}