First attempt at both Rust and snix-castore.
I'll start by putting together a single-file CLI app,
no splitting modules out until needed, no separation of concerns.

Currently, just the fetching is implemented:

```
$ sidx pkgs/dev*/cuda-*/cuda/manifests/*.json
```

commit 65326b2dcb ("npins: init")
Else, Someone, 2025-04-15 14:07:49 +00:00
15 changed files with 17844 additions and 0 deletions

Cargo.lock (generated, new file, 3759 lines): diff suppressed because it is too large.

Cargo.nix (new file, 13546 lines): diff suppressed because it is too large.

Cargo.toml (new file, 15 lines)

@@ -0,0 +1,15 @@
[package]
name = "sidx"
version = "0.1.0"
edition = "2024"

[dependencies]
anyhow = "1.0.98"
clap = "4.5.35"
futures = "0.3.31"
reqwest = "0.12.15"
rusqlite = "0.34.0"
snix-castore = { version = "0.1.0", git = "https://git.snix.dev/snix/snix.git" }
tokio = "1.44.2"
tokio-util = "0.7.14"
url = "2.5.4"

LICENSE (new file, 21 lines)

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2025 SomeoneSerge
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (new file, 20 lines)

@@ -0,0 +1,20 @@
sidx
===

Indexing archives and build outputs.

Roadmap
---

- [x] PoC: basic `reqwest` fetcher, minimal `snix-castore` integration, claims of the form "`$uri` referred to `$blake3` at `$time`" stored in sqlite (read-back sketched below).
- [ ] Zip, tar, git fetchers.
- [ ] ELF scanning: claims of the form "`$blake3` is ELF" and "`$blake3`'s `DT_RUNPATH` is `$runpath`", etc. (`DT_NEEDED`, `.interp`, symbols exported in `.data` and `.text`).
- [ ] `cuobjdump` scanner: claims of the form "`$blake3` contains `PTX` or `SASS` code for `$gencode`".
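A sketch of that read-back, assuming only the schema from `src/q/init.sql` (the query mirrors `src/q/latest-download.sql`):

```
use rusqlite::{params, Connection, OptionalExtension};

// Latest "$uri referred to $blake3 at $time" claim for a URI, if any.
fn latest_claim(con: &Connection, uri: &str) -> rusqlite::Result<Option<(String, i64)>> {
    con.query_row(
        "SELECT b.blake3, s.epoch
         FROM sidx_uri_sample AS s
         JOIN sidx_uri AS u ON s.uri_id = u.id
         JOIN sidx_blob AS b ON s.blob_id = b.id
         WHERE u.uri = ?
         ORDER BY s.epoch DESC
         LIMIT 1",
        params![uri],
        |row| Ok((row.get(0)?, row.get(1)?)),
    )
    .optional()
}
```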

Approach
---

Vapourware and means to an end.

default.nix (new file, 51 lines)

@@ -0,0 +1,51 @@
{
npins ? import ./npins,
nixpkgs ? npins.nixpkgs,
pkgs ? import nixpkgs { },
lib ? pkgs.lib,
...
}:
lib.makeScope pkgs.newScope (
self:
let
addProtobuf = oldAttrs: {
nativeBuildInputs = oldAttrs.nativeBuildInputs or [ ] ++ [
(self.callPackage ({ protobuf }: protobuf) { })
];
};
in
{
sidx-crate2nix = (
self.callPackage ./Cargo.nix {
defaultCrateOverrides = pkgs.defaultCrateOverrides // {
sidx = oldAttrs: {
src =
let
fs = pkgs.lib.fileset;
in
fs.toSource {
root = ./.;
fileset = fs.unions [
./src
./Cargo.toml
./Cargo.lock
];
};
};
snix-castore = addProtobuf;
# prost-wkt-types = addProtobuf;
# openssl-sys = oldAttrs: {
# nativeBuildInputs = oldAttrs.buildInputs or [ ] ++ [
# (self.callPackage ({ pkg-config }: pkg-config) { })
# ];
# buildInputs = oldAttrs.buildInputs or [ ] ++ [
# (self.callPackage ({ openssl }: openssl) { })
# ];
# };
};
}
);
sidx = self.sidx-crate2nix.rootCrate.build;
}
)

npins/default.nix (new file, 80 lines)

@@ -0,0 +1,80 @@
# Generated by npins. Do not modify; will be overwritten regularly
let
data = builtins.fromJSON (builtins.readFile ./sources.json);
version = data.version;
mkSource =
spec:
assert spec ? type;
let
path =
if spec.type == "Git" then
mkGitSource spec
else if spec.type == "GitRelease" then
mkGitSource spec
else if spec.type == "PyPi" then
mkPyPiSource spec
else if spec.type == "Channel" then
mkChannelSource spec
else
builtins.throw "Unknown source type ${spec.type}";
in
spec // { outPath = path; };
mkGitSource =
{
repository,
revision,
url ? null,
hash,
branch ? null,
...
}:
assert repository ? type;
# At the moment, either it is a plain git repository (which has an url), or it is a GitHub/GitLab repository
# In the latter case, there will always be a url to the tarball
if url != null then
(builtins.fetchTarball {
inherit url;
sha256 = hash; # FIXME: check nix version & use SRI hashes
})
else
assert repository.type == "Git";
let
urlToName =
url: rev:
let
matched = builtins.match "^.*/([^/]*)(\\.git)?$" repository.url;
short = builtins.substring 0 7 rev;
appendShort = if (builtins.match "[a-f0-9]*" rev) != null then "-${short}" else "";
in
"${if matched == null then "source" else builtins.head matched}${appendShort}";
name = urlToName repository.url revision;
in
builtins.fetchGit {
url = repository.url;
rev = revision;
inherit name;
# hash = hash;
};
mkPyPiSource =
{ url, hash, ... }:
builtins.fetchurl {
inherit url;
sha256 = hash;
};
mkChannelSource =
{ url, hash, ... }:
builtins.fetchTarball {
inherit url;
sha256 = hash;
};
in
if version == 3 then
builtins.mapAttrs (_: mkSource) data.pins
else
throw "Unsupported format version ${toString version} in sources.json. Try running `npins upgrade`"

npins/sources.json (new file, 11 lines)

@@ -0,0 +1,11 @@
{
"pins": {
"nixpkgs": {
"type": "Channel",
"name": "nixpkgs-unstable",
"url": "https://releases.nixos.org/nixpkgs/nixpkgs-25.05pre782598.18dd725c2960/nixexprs.tar.xz",
"hash": "1p7kgyph7xkj57p19nbxpycmbchc6d9gwdznsmxhymrzyzi3if21"
}
},
"version": 3
}

shell.nix (new file, 34 lines)

@@ -0,0 +1,34 @@
{
npins ? import ./npins,
nixpkgs ? npins.nixpkgs,
pkgs ? import nixpkgs { },
lib ? pkgs.lib,
mkShell ? pkgs.mkShell,
sqlite ? pkgs.sqlite,
openssl ? pkgs.openssl,
rust-analyzer ? pkgs.rust-analyzer,
rustc ? pkgs.rustc,
cargo ? pkgs.cargo,
pkg-config ? pkgs.pkg-config,
crate2nix ? pkgs.crate2nix,
protobuf ? pkgs.protobuf,
datasette ? pkgs.datasette,
...
}:
mkShell {
name = "sidx-shell";
nativeBuildInputs = [
(lib.getBin sqlite)
cargo
crate2nix
rustc
rust-analyzer
pkg-config
protobuf
datasette
];
buildInputs = [
openssl
sqlite
];
}

src/main.rs (new file, 240 lines)

@@ -0,0 +1,240 @@
use std::{
path::{absolute, PathBuf},
pin::Pin,
};
use anyhow::anyhow;
use clap::Parser;
use futures::{stream, StreamExt, TryStreamExt};
use rusqlite::{params, OptionalExtension};
use snix_castore::{blobservice, directoryservice};
use url::Url;
#[derive(Clone)]
enum Ingestable {
Url(Url),
Path(PathBuf),
}
#[derive(Debug)]
enum IngestedWhen {
Now,
Before,
}
#[derive(Debug)]
#[allow(dead_code)]
struct Ingested {
sample_id: u32,
uri: String,
blake3: String,
epoch: u32,
when: IngestedWhen,
}
impl std::fmt::Display for Ingestable {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Ingestable::Url(url) => write!(f, "{}", url),
            // Path::display() is lossy for non-UTF-8 paths, but never panics.
            Ingestable::Path(path_buf) => write!(f, "{}", path_buf.display()),
        }
    }
}
fn parse_url_or_path(s: &str) -> Result<Ingestable, anyhow::Error> {
if s.is_empty() {
Err(anyhow!("Empty path (url)"))
} else if s.starts_with("./") || s.starts_with("/") {
Ok(Ingestable::Path(PathBuf::from(s)))
} else {
let url = Url::parse(s)?;
if url.scheme() == "file" {
match url.to_file_path() {
Ok(s) => Ok(Ingestable::Path(s)),
Err(()) => Err(anyhow!(
"parse_url_or_path: couldn't convert Url ({}) to Path",
url
)),
}
} else {
Ok(Ingestable::Url(url))
}
}
}
fn data_path() -> PathBuf {
    // Follow the XDG base directory spec: $XDG_DATA_HOME, then
    // ~/.local/share, and failing both, a local .sidx directory.
    let xdg_data_dir = std::env::var("XDG_DATA_HOME")
        .map(PathBuf::from)
        .or_else(|_| -> Result<PathBuf, anyhow::Error> {
            match std::env::home_dir() {
                Some(p) => Ok(p.join(".local/share")),
                None => Err(anyhow!("no home directory")),
            }
        });
    match xdg_data_dir {
        Ok(p) => p.join("sidx"),
        Err(_) => PathBuf::from(".sidx"),
    }
}
fn default_castore_path() -> PathBuf {
data_path().join("castore")
}
fn default_db_path() -> PathBuf {
data_path().join("sidx.db")
}
#[derive(Parser)]
struct Cli {
#[clap(value_parser = parse_url_or_path, num_args = 1)]
inputs: Vec<Ingestable>,
#[clap(short, long, action)]
refetch: bool,
#[clap(short, long, value_parser, default_value_t = 5)]
max_parallel: usize,
#[clap(short, long, value_parser, default_value_os_t = default_db_path())]
db_path: PathBuf,
#[clap(short, long, value_parser, default_value_os_t = default_castore_path())]
castore_path: PathBuf,
}
#[tokio::main]
async fn main() {
let args = Cli::parse();
    if let Some(parent) = args.db_path.parent() {
        // Best-effort; Connection::open will surface the real error if this fails.
        let _ = std::fs::create_dir_all(parent);
    }
let con =
rusqlite::Connection::open(&args.db_path).expect("Failed to construct Database object");
con.execute_batch(include_str!("q/init.sql"))
.expect("Failed to execute init.sql");
let castore_path = absolute(args.castore_path).expect("Failed to canonicalize castore_path");
let blob_service = blobservice::from_addr(&std::format!(
"objectstore+file://{}",
castore_path
.join("blob")
.to_str()
.expect("Path::to_str unexpectedly broken")
))
.await
.expect("Couldn't initialize .castore/blob");
let dir_service = directoryservice::from_addr(&std::format!(
"objectstore+file://{}",
castore_path
.join("directory")
.to_str()
.expect("Path::to_str unexpectedly broken")
))
.await
.expect("Couldn't initialize .castore/directory");
let client = reqwest::Client::new();
let samples = stream::iter(args.inputs.iter().map(|uri| {
let client = &client;
let blob_service = &blob_service;
let _dir_service = &dir_service;
let con = &con;
let mut find_sample = con
.prepare(include_str!("q/latest-download.sql"))
.expect("Failed to prepare latest-download.sql");
let mut add_sample = con
.prepare(include_str!("q/add-sample.sql"))
.expect("Failed to prepare add-sample.sql");
let mut add_blob = con
.prepare(include_str!("q/upsert-blob.sql"))
.expect("Failed to prepare upsert-blob.sql");
let mut add_uri = con
.prepare(include_str!("q/upsert-uri.sql"))
.expect("Failed to prepare upsert-uri.sql");
async move {
let uri_s = uri.to_string();
let latest_download = find_sample
.query_row(params![uri_s], |r| <(u32, String, u32)>::try_from(r))
.optional()?;
if let Some((sample_id, blake3, epoch)) = latest_download {
if !args.refetch {
return Ok::<Option<Ingested>, anyhow::Error>(Some(Ingested {
sample_id,
uri: uri_s,
blake3,
epoch,
when: IngestedWhen::Before,
}));
}
}
let mut r: Pin<Box<dyn tokio::io::AsyncRead>> = {
match uri {
Ingestable::Path(path) => match tokio::fs::File::open(path).await {
Ok(f) => Box::pin(f),
Err(e) => {
return Err(anyhow!("Failed to read {:?}: {}", path, e));
}
},
Ingestable::Url(url) => {
let res = match client.get(url.clone()).send().await {
Ok(res) => res.error_for_status()?,
Err(e) => {
return Err(anyhow!("Failed to GET {}: {}", url, e));
}
};
let r = tokio_util::io::StreamReader::new(
res.bytes_stream().map_err(std::io::Error::other),
);
Box::pin(r)
}
}
};
let mut w = blob_service.open_write().await;
let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
Ok(n) => n,
Err(e) => {
return Err(anyhow!(
"tokio::io::copy failed for uri={} with {}",
uri_s,
e
));
}
};
let digest = w.close().await?;
let digest64 = format!("{}", digest);
add_blob.execute(params![digest64, n_bytes,])?;
add_uri.execute(params![uri_s])?;
let (sample_id, epoch) = add_sample
.query_row(params![uri_s, digest64], |row| <(u32, u32)>::try_from(row))?;
Ok(Some(Ingested {
sample_id,
uri: uri_s,
blake3: digest64,
epoch,
when: IngestedWhen::Now,
}))
}
}))
.buffer_unordered(args.max_parallel)
.collect::<Vec<Result<Option<Ingested>, _>>>()
.await;
for s in samples {
match s {
Err(e) => {
                eprintln!("Failed to fetch: {}", e);
}
Ok(None) => {}
Ok(Some(ingested)) => {
println!("{:?}", ingested)
}
}
}
}
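A quick check of `parse_url_or_path`'s dispatch logic; a sketch of a test module that could sit at the bottom of `src/main.rs` (not part of this commit):

```
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn dispatches_paths_and_urls() {
        // Relative and absolute filesystem paths stay paths,
        assert!(matches!(parse_url_or_path("./m.json"), Ok(Ingestable::Path(_))));
        assert!(matches!(parse_url_or_path("/tmp/m.json"), Ok(Ingestable::Path(_))));
        // file:// URLs are converted to paths,
        assert!(matches!(parse_url_or_path("file:///tmp/m.json"), Ok(Ingestable::Path(_))));
        // and everything else must parse as a URL.
        assert!(matches!(parse_url_or_path("https://example.com/m.json"), Ok(Ingestable::Url(_))));
        assert!(parse_url_or_path("").is_err());
    }
}
```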

src/q/add-sample.sql (new file, 21 lines)

@@ -0,0 +1,21 @@
INSERT INTO sidx_uri_sample(uri_id, blob_id)
VALUES(
(
SELECT
id
FROM
sidx_uri
WHERE
uri = ?
LIMIT 1
),
(
SELECT
id
FROM
sidx_blob
WHERE
blake3 = ?
)
)
RETURNING id, epoch;
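For reference, how the `RETURNING` clause is consumed from Rust; a hedged sketch mirroring the `add_sample` call in `src/main.rs` (the `include_str!` path assumes the caller lives next to the `q/` directory):

```
use rusqlite::{params, Connection};

// Record that `uri` currently resolves to `blake3`; returns (sample_id, epoch).
fn add_sample(con: &Connection, uri: &str, blake3: &str) -> rusqlite::Result<(u32, u32)> {
    // The INSERT ends with RETURNING id, epoch, so query_row can read the
    // inserted row's id and timestamp straight off the statement.
    con.query_row(include_str!("q/add-sample.sql"), params![uri, blake3], |row| {
        <(u32, u32)>::try_from(row)
    })
}
```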

src/q/init.sql (new file, 22 lines)

@@ -0,0 +1,22 @@
CREATE TABLE IF NOT EXISTS sidx_uri(
id INTEGER,
uri TEXT UNIQUE,
PRIMARY KEY(id)
);
CREATE TABLE IF NOT EXISTS sidx_blob(
id INTEGER,
blake3 TEXT UNIQUE,
n_bytes INTEGER NOT NULL,
PRIMARY KEY(id)
);
CREATE TABLE IF NOT EXISTS sidx_uri_sample(
id INTEGER,
uri_id INTEGER NOT NULL,
blob_id INTEGER,
epoch INTEGER NOT NULL DEFAULT (unixepoch()),
PRIMARY KEY(id),
FOREIGN KEY(uri_id) REFERENCES sidx_uri(id),
FOREIGN KEY(blob_id) REFERENCES sidx_blob(id)
);
CREATE INDEX IF NOT EXISTS sidx_uri_blob_idx
ON sidx_uri_sample(uri_id, blob_id, epoch);
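To see the schema in motion: a minimal round-trip sketch against an in-memory database (the URI and digest literals are made up; `include_str!` assumes the file layout of `src/main.rs`, and `RETURNING` needs SQLite >= 3.35):

```
use rusqlite::{params, Connection};

fn main() -> rusqlite::Result<()> {
    let con = Connection::open_in_memory()?;
    con.execute_batch(include_str!("q/init.sql"))?;
    // One URI, one blob; rowids start at 1 in a fresh database.
    con.execute(
        "INSERT INTO sidx_uri(uri) VALUES (?)",
        params!["https://example.com/m.json"],
    )?;
    con.execute(
        "INSERT INTO sidx_blob(blake3, n_bytes) VALUES (?, ?)",
        params!["fake-b3-digest", 42],
    )?;
    // epoch is filled in by the DEFAULT (unixepoch()) clause.
    let (id, epoch): (u32, i64) = con.query_row(
        "INSERT INTO sidx_uri_sample(uri_id, blob_id) VALUES (1, 1) RETURNING id, epoch",
        [],
        |row| Ok((row.get(0)?, row.get(1)?)),
    )?;
    println!("sample {id} recorded at unix time {epoch}");
    Ok(())
}
```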

src/q/latest-download.sql (new file, 16 lines)

@@ -0,0 +1,16 @@
SELECT
s.id AS sample_id,
b.blake3,
s.epoch
FROM
    sidx_uri_sample AS s
    JOIN sidx_uri AS u ON s.uri_id = u.id
    JOIN sidx_blob AS b ON s.blob_id = b.id
WHERE
u.uri = ?
ORDER BY
s.epoch DESC
LIMIT 1;

src/q/upsert-blob.sql (new file, 4 lines)

@@ -0,0 +1,4 @@
INSERT INTO sidx_blob(blake3, n_bytes)
VALUES
(?, ?)
ON CONFLICT DO NOTHING;

src/q/upsert-uri.sql (new file, 4 lines)

@@ -0,0 +1,4 @@
INSERT INTO sidx_uri(uri)
VALUES
(?)
ON CONFLICT DO NOTHING;