init
First attempt at both Rust and snix-castore. I'll start by putting together a single-file CLI app: no modules split out until needed, no separation of concerns. Currently only the fetching is implemented:

```
$ sidx pkgs/dev*/cuda-*/cuda/manifests/*.json
```

npins: init
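For a quick sanity check after a run, the recorded samples can be read back straight from sqlite (or browsed with `datasette`, which the dev shell includes). A minimal sketch, assuming the default database path `~/.local/share/sidx/sidx.db` derived from `data_path()` in `src/main.rs`; table and column names are the ones created by `src/q/init.sql`:

```sql
-- Newest-first list of observed (uri, blake3) samples.
SELECT
    u.uri,
    b.blake3,
    b.n_bytes,
    s.epoch
FROM
    sidx_uri_sample AS s
    JOIN sidx_uri AS u ON s.uri_id = u.id
    JOIN sidx_blob AS b ON s.blob_id = b.id
ORDER BY
    s.epoch DESC;
```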
commit 65326b2dcb
15 changed files with 17844 additions and 0 deletions
Cargo.lock (generated, new file, 3759 lines)
File diff suppressed because it is too large
Cargo.toml (new file, 15 lines)
@@ -0,0 +1,15 @@
[package]
name = "sidx"
version = "0.1.0"
edition = "2024"

[dependencies]
anyhow = "1.0.98"
clap = "4.5.35"
futures = "0.3.31"
reqwest = "0.12.15"
rusqlite = "0.34.0"
snix-castore = { version = "0.1.0", git = "https://git.snix.dev/snix/snix.git" }
tokio = "1.44.2"
tokio-util = "0.7.14"
url = "2.5.4"
LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2025 SomeoneSerge

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md (new file, 20 lines)
@@ -0,0 +1,20 @@
sidx
===

Indexing archives and build outputs.

Roadmap
---

- [x] PoC: basic `reqwest` fetcher, minimal `snix-castore` integration, claims of the form "`$uri` referred to `$blake3` at `$time`" stored in sqlite.
- [ ] Zip, tar, git fetchers.
- [ ] ELF scanning: claims of the form "`$blake3` is ELF" and "`$blake3`'s `DT_RUNPATH` is `$runpath`", etc. (`DT_NEEDED`, `.interp`, symbols exported in `.data` and `.text`)
- [ ] `cuobjdump` scanner: claims of the form "`$blake3` contains `PTX` or `SASS` code for `$gencode`".

Approach
---

Vapourware and means to an end.
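The claims in the PoC item above map one-to-one onto the schema added in `src/q/init.sql`: "`$uri` referred to `$blake3` at `$time`" is a `sidx_uri_sample` row joining a `sidx_uri` row to a `sidx_blob` row, with `epoch` as the timestamp. A sketch of checking one such claim (the two parameter placeholders stand in for a concrete URI and digest):

```sql
-- When did ?1 ($uri) last resolve to ?2 ($blake3), if ever?
SELECT
    s.epoch
FROM
    sidx_uri_sample AS s
    JOIN sidx_uri AS u ON s.uri_id = u.id
    JOIN sidx_blob AS b ON s.blob_id = b.id
WHERE
    u.uri = ?1
    AND b.blake3 = ?2
ORDER BY
    s.epoch DESC
LIMIT 1;
```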
default.nix (new file, 51 lines)
@@ -0,0 +1,51 @@
{
  npins ? import ./npins,
  nixpkgs ? npins.nixpkgs,
  pkgs ? import nixpkgs { },
  lib ? pkgs.lib,
  ...
}:

lib.makeScope pkgs.newScope (
  self:
  let
    addProtobuf = oldAttrs: {
      nativeBuildInputs = oldAttrs.nativeBuildInputs or [ ] ++ [
        (self.callPackage ({ protobuf }: protobuf) { })
      ];
    };
  in
  {
    sidx-crate2nix = (
      self.callPackage ./Cargo.nix {
        defaultCrateOverrides = pkgs.defaultCrateOverrides // {
          sidx = oldAttrs: {
            src =
              let
                fs = pkgs.lib.fileset;
              in
              fs.toSource {
                root = ./.;
                fileset = fs.unions [
                  ./src
                  ./Cargo.toml
                  ./Cargo.lock
                ];
              };
          };
          snix-castore = addProtobuf;
          # prost-wkt-types = addProtobuf;
          # openssl-sys = oldAttrs: {
          #   nativeBuildInputs = oldAttrs.buildInputs or [ ] ++ [
          #     (self.callPackage ({ pkg-config }: pkg-config) { })
          #   ];
          #   buildInputs = oldAttrs.buildInputs or [ ] ++ [
          #     (self.callPackage ({ openssl }: openssl) { })
          #   ];
          # };
        };
      }
    );
    sidx = self.sidx-crate2nix.rootCrate.build;
  }
)
npins/default.nix (new file, 80 lines)
@@ -0,0 +1,80 @@
# Generated by npins. Do not modify; will be overwritten regularly
let
  data = builtins.fromJSON (builtins.readFile ./sources.json);
  version = data.version;

  mkSource =
    spec:
    assert spec ? type;
    let
      path =
        if spec.type == "Git" then
          mkGitSource spec
        else if spec.type == "GitRelease" then
          mkGitSource spec
        else if spec.type == "PyPi" then
          mkPyPiSource spec
        else if spec.type == "Channel" then
          mkChannelSource spec
        else
          builtins.throw "Unknown source type ${spec.type}";
    in
    spec // { outPath = path; };

  mkGitSource =
    {
      repository,
      revision,
      url ? null,
      hash,
      branch ? null,
      ...
    }:
    assert repository ? type;
    # At the moment, either it is a plain git repository (which has an url), or it is a GitHub/GitLab repository
    # In the latter case, there will always be a url to the tarball
    if url != null then
      (builtins.fetchTarball {
        inherit url;
        sha256 = hash; # FIXME: check nix version & use SRI hashes
      })
    else
      assert repository.type == "Git";
      let
        urlToName =
          url: rev:
          let
            matched = builtins.match "^.*/([^/]*)(\\.git)?$" repository.url;

            short = builtins.substring 0 7 rev;

            appendShort = if (builtins.match "[a-f0-9]*" rev) != null then "-${short}" else "";
          in
          "${if matched == null then "source" else builtins.head matched}${appendShort}";
        name = urlToName repository.url revision;
      in
      builtins.fetchGit {
        url = repository.url;
        rev = revision;
        inherit name;
        # hash = hash;
      };

  mkPyPiSource =
    { url, hash, ... }:
    builtins.fetchurl {
      inherit url;
      sha256 = hash;
    };

  mkChannelSource =
    { url, hash, ... }:
    builtins.fetchTarball {
      inherit url;
      sha256 = hash;
    };
in
if version == 3 then
  builtins.mapAttrs (_: mkSource) data.pins
else
  throw "Unsupported format version ${toString version} in sources.json. Try running `npins upgrade`"
npins/sources.json (new file, 11 lines)
@@ -0,0 +1,11 @@
{
  "pins": {
    "nixpkgs": {
      "type": "Channel",
      "name": "nixpkgs-unstable",
      "url": "https://releases.nixos.org/nixpkgs/nixpkgs-25.05pre782598.18dd725c2960/nixexprs.tar.xz",
      "hash": "1p7kgyph7xkj57p19nbxpycmbchc6d9gwdznsmxhymrzyzi3if21"
    }
  },
  "version": 3
}
shell.nix (new file, 34 lines)
@@ -0,0 +1,34 @@
{
  npins ? import ./npins,
  nixpkgs ? npins.nixpkgs,
  pkgs ? import nixpkgs { },
  lib ? pkgs.lib,
  mkShell ? pkgs.mkShell,
  sqlite ? pkgs.sqlite,
  openssl ? pkgs.openssl,
  rust-analyzer ? pkgs.rust-analyzer,
  rustc ? pkgs.rustc,
  cargo ? pkgs.cargo,
  pkg-config ? pkgs.pkg-config,
  crate2nix ? pkgs.crate2nix,
  protobuf ? pkgs.protobuf,
  datasette ? pkgs.datasette,
  ...
}:
mkShell {
  name = "sidx-shell";
  nativeBuildInputs = [
    (lib.getBin sqlite)
    cargo
    crate2nix
    rustc
    rust-analyzer
    pkg-config
    protobuf
    datasette
  ];
  buildInputs = [
    openssl
    sqlite
  ];
}
src/main.rs (new file, 240 lines)
@@ -0,0 +1,240 @@
use std::{
    path::{absolute, PathBuf},
    pin::Pin,
};

use anyhow::anyhow;
use clap::Parser;
use futures::{stream, StreamExt, TryStreamExt};
use rusqlite::{params, OptionalExtension};
use snix_castore::{blobservice, directoryservice};
use url::Url;

#[derive(Clone)]
enum Ingestable {
    Url(Url),
    Path(PathBuf),
}

#[derive(Debug)]
enum IngestedWhen {
    Now,
    Before,
}

#[derive(Debug)]
#[allow(dead_code)]
struct Ingested {
    sample_id: u32,
    uri: String,
    blake3: String,
    epoch: u32,
    when: IngestedWhen,
}

impl std::fmt::Display for Ingestable {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Ingestable::Url(url) => write!(f, "{}", url),
            Ingestable::Path(path_buf) => match path_buf.to_str() {
                Some(s) => write!(f, "{}", s),
                None => {
                    panic!("PathBuf::to_str failed")
                }
            },
        }
    }
}

fn parse_url_or_path(s: &str) -> Result<Ingestable, anyhow::Error> {
    if s.is_empty() {
        Err(anyhow!("Empty path (url)"))
    } else if s.starts_with("./") || s.starts_with("/") {
        Ok(Ingestable::Path(PathBuf::from(s)))
    } else {
        let url = Url::parse(s)?;
        if url.scheme() == "file" {
            match url.to_file_path() {
                Ok(s) => Ok(Ingestable::Path(s)),
                Err(()) => Err(anyhow!(
                    "parse_url_or_path: couldn't convert Url ({}) to Path",
                    url
                )),
            }
        } else {
            Ok(Ingestable::Url(url))
        }
    }
}

fn data_path() -> PathBuf {
    let xdg_data_dir = std::env::var("XDG_DATA_DIR")
        .and_then(|s| Ok(PathBuf::from(s)))
        .or_else(|_| -> Result<PathBuf, anyhow::Error> {
            match std::env::home_dir() {
                Some(p) => Ok(p.join(".local/share")),
                None => Err(anyhow!("...")), // FIXME
            }
        });
    match xdg_data_dir {
        Ok(p) => p.join("sidx"),
        Err(_) => PathBuf::from(".sidx"),
    }
}
fn default_castore_path() -> PathBuf {
    data_path().join("castore")
}
fn default_db_path() -> PathBuf {
    data_path().join("sidx.db")
}

#[derive(Parser)]
struct Cli {
    #[clap(value_parser = parse_url_or_path, num_args = 1)]
    inputs: Vec<Ingestable>,

    #[clap(short, long, action)]
    refetch: bool,

    #[clap(short, long, value_parser, default_value_t = 5)]
    max_parallel: usize,

    #[clap(short, long, value_parser, default_value_os_t = default_db_path())]
    db_path: PathBuf,

    #[clap(short, long, value_parser, default_value_os_t = default_castore_path())]
    castore_path: PathBuf,
}

#[tokio::main]
async fn main() {
    let args = Cli::parse();

    args.db_path.parent().and_then(|p| {
        let _ = std::fs::create_dir_all(p);
        Some(())
    });

    let con =
        rusqlite::Connection::open(&args.db_path).expect("Failed to construct Database object");
    con.execute_batch(include_str!("q/init.sql"))
        .expect("Failed to execute init.sql");
    let castore_path = absolute(args.castore_path).expect("Failed to canonicalize castore_path");
    let blob_service = blobservice::from_addr(&std::format!(
        "objectstore+file://{}",
        castore_path
            .join("blob")
            .to_str()
            .expect("Path::to_str unexpectedly broken")
    ))
    .await
    .expect("Couldn't initialize .castore/blob");
    let dir_service = directoryservice::from_addr(&std::format!(
        "objectstore+file://{}",
        castore_path
            .join("directory")
            .to_str()
            .expect("Path::to_str unexpectedly broken")
    ))
    .await
    .expect("Couldn't initialize .castore/directory");

    let client = reqwest::Client::new();
    let samples = stream::iter(args.inputs.iter().map(|uri| {
        let client = &client;
        let blob_service = &blob_service;
        let _dir_service = &dir_service;
        let con = &con;
        let mut find_sample = con
            .prepare(include_str!("q/latest-download.sql"))
            .expect("Failed to prepare latest-download.sql");
        let mut add_sample = con
            .prepare(include_str!("q/add-sample.sql"))
            .expect("Failed to prepare add-sample.sql");
        let mut add_blob = con
            .prepare(include_str!("q/upsert-blob.sql"))
            .expect("Failed to prepare upsert-blob.sql");
        let mut add_uri = con
            .prepare(include_str!("q/upsert-uri.sql"))
            .expect("Failed to prepare upsert-uri.sql");

        async move {
            let uri_s = uri.to_string();
            let latest_download = find_sample
                .query_row(params![uri_s], |r| <(u32, String, u32)>::try_from(r))
                .optional()?;
            if let Some((sample_id, blake3, epoch)) = latest_download {
                if !args.refetch {
                    return Ok::<Option<Ingested>, anyhow::Error>(Some(Ingested {
                        sample_id,
                        uri: uri_s,
                        blake3,
                        epoch,
                        when: IngestedWhen::Before,
                    }));
                }
            }
            let mut r: Pin<Box<dyn tokio::io::AsyncRead>> = {
                match uri {
                    Ingestable::Path(path) => match tokio::fs::File::open(path).await {
                        Ok(f) => Box::pin(f),
                        Err(e) => {
                            return Err(anyhow!("Failed to read {:?}: {}", path, e));
                        }
                    },
                    Ingestable::Url(url) => {
                        let res = match client.get(url.clone()).send().await {
                            Ok(res) => res.error_for_status()?,
                            Err(e) => {
                                return Err(anyhow!("Failed to GET {}: {}", url, e));
                            }
                        };
                        let r = tokio_util::io::StreamReader::new(
                            res.bytes_stream().map_err(std::io::Error::other),
                        );
                        Box::pin(r)
                    }
                }
            };
            let mut w = blob_service.open_write().await;
            let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
                Ok(n) => n,
                Err(e) => {
                    return Err(anyhow!(
                        "tokio::io::copy failed for uri={} with {}",
                        uri_s,
                        e
                    ));
                }
            };
            let digest = w.close().await?;
            let digest64 = format!("{}", digest);
            add_blob.execute(params![digest64, n_bytes,])?;
            add_uri.execute(params![uri_s])?;
            let (sample_id, epoch) = add_sample
                .query_row(params![uri_s, digest64], |row| <(u32, u32)>::try_from(row))?;
            Ok(Some(Ingested {
                sample_id,
                uri: uri_s,
                blake3: digest64,
                epoch,
                when: IngestedWhen::Now,
            }))
        }
    }))
    .buffer_unordered(args.max_parallel)
    .collect::<Vec<Result<Option<Ingested>, _>>>()
    .await;

    for s in samples {
        match s {
            Err(e) => {
                println!("Failed to fetch ...: {}", e);
            }
            Ok(None) => {}
            Ok(Some(ingested)) => {
                println!("{:?}", ingested)
            }
        }
    }
}
src/q/add-sample.sql (new file, 21 lines)
@@ -0,0 +1,21 @@
INSERT INTO sidx_uri_sample(uri_id, blob_id)
VALUES(
    (
        SELECT
            id
        FROM
            sidx_uri
        WHERE
            uri = ?
        LIMIT 1
    ),
    (
        SELECT
            id
        FROM
            sidx_blob
        WHERE
            blake3 = ?
    )
)
RETURNING id, epoch;
src/q/init.sql (new file, 22 lines)
@@ -0,0 +1,22 @@
CREATE TABLE IF NOT EXISTS sidx_uri(
    id INTEGER,
    uri TEXT UNIQUE,
    PRIMARY KEY(id)
);
CREATE TABLE IF NOT EXISTS sidx_blob(
    id INTEGER,
    blake3 TEXT UNIQUE,
    n_bytes INTEGER NOT NULL,
    PRIMARY KEY(id)
);
CREATE TABLE IF NOT EXISTS sidx_uri_sample(
    id INTEGER,
    uri_id INTEGER NOT NULL,
    blob_id INTEGER,
    epoch INTEGER NOT NULL DEFAULT (unixepoch()),
    PRIMARY KEY(id),
    FOREIGN KEY(uri_id) REFERENCES sidx_uri(id),
    FOREIGN KEY(blob_id) REFERENCES sidx_blob(id)
);
CREATE INDEX IF NOT EXISTS sidx_uri_blob_idx
ON sidx_uri_sample(uri_id, blob_id, epoch);
src/q/latest-download.sql (new file, 16 lines)
@@ -0,0 +1,16 @@
SELECT
    s.id AS sample_id,
    b.blake3,
    s.epoch
FROM
    sidx_uri_sample AS s,
    sidx_uri AS u,
    sidx_blob AS b
ON
    s.uri_id = u.id
    AND s.blob_id = b.id
WHERE
    u.uri = ?
ORDER BY
    s.epoch DESC
LIMIT 1;
src/q/upsert-blob.sql (new file, 4 lines)
@@ -0,0 +1,4 @@
INSERT INTO sidx_blob(blake3, n_bytes)
VALUES
    (?, ?)
ON CONFLICT DO NOTHING;
src/q/upsert-uri.sql (new file, 4 lines)
@@ -0,0 +1,4 @@
INSERT INTO sidx_uri(uri)
VALUES
    (?)
ON CONFLICT DO NOTHING;