Ingestable: use ingest_path for local paths

This way we don't error-out when a path is a directory. That said, we're
still only including the root in sidx.db (e.g. manifests/ but not
manifests/*.json). We should change that next.

Also renamed "blob_id" to "blake3_id" because datasette has a special
branch for ${column}_id referencing a table that contains ${column}
This commit is contained in:
Else, Someone 2025-04-20 00:12:51 +00:00
parent 65326b2dcb
commit 56a0b346cd
5 changed files with 53 additions and 48 deletions

View file

@ -1,16 +1,14 @@
use std::{
path::{absolute, PathBuf},
pin::Pin,
};
use std::path::{absolute, PathBuf};
use anyhow::anyhow;
use anyhow::Context;
use clap::Parser;
use futures::{stream, StreamExt, TryStreamExt};
use rusqlite::{params, OptionalExtension};
use snix_castore::{blobservice, directoryservice};
use snix_castore::{blobservice, directoryservice, import::fs::ingest_path};
use url::Url;
#[derive(Clone)]
#[derive(Clone, Debug)]
enum Ingestable {
Url(Url),
Path(PathBuf),
@ -143,7 +141,7 @@ async fn main() {
let samples = stream::iter(args.inputs.iter().map(|uri| {
let client = &client;
let blob_service = &blob_service;
let _dir_service = &dir_service;
let dir_service = &dir_service;
let con = &con;
let mut find_sample = con
.prepare(include_str!("q/latest-download.sql"))
@ -174,28 +172,32 @@ async fn main() {
}));
}
}
let mut r: Pin<Box<dyn tokio::io::AsyncRead>> = {
match uri {
Ingestable::Path(path) => match tokio::fs::File::open(path).await {
Ok(f) => Box::pin(f),
Err(e) => {
return Err(anyhow!("Failed to read {:?}: {}", path, e));
let (digest, n_bytes) = match uri {
Ingestable::Path(path) => {
match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
.await?
{
snix_castore::Node::Directory { digest, size } => (digest, size),
snix_castore::Node::File {
digest,
size,
executable: _,
} => (digest, size),
snix_castore::Node::Symlink { target: _ } => {
return Err(anyhow!("TODO: Figure out what to do with symlink roots"))
}
}
}
},
Ingestable::Url(url) => {
let res = match client.get(url.clone()).send().await {
Ok(res) => res.error_for_status()?,
Err(e) => {
return Err(anyhow!("Failed to GET {}: {}", url, e));
}
};
let r = tokio_util::io::StreamReader::new(
let res = client
.get(url.clone())
.send()
.await
.context(format!("Request.send failed early for {:?}", uri))?
.error_for_status()?;
let mut r = tokio_util::io::StreamReader::new(
res.bytes_stream().map_err(std::io::Error::other),
);
Box::pin(r)
}
}
};
let mut w = blob_service.open_write().await;
let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
Ok(n) => n,
@ -208,6 +210,9 @@ async fn main() {
}
};
let digest = w.close().await?;
(digest, n_bytes)
}
};
let digest64 = format!("{}", digest);
add_blob.execute(params![digest64, n_bytes,])?;
add_uri.execute(params![uri_s])?;
@ -229,7 +234,7 @@ async fn main() {
for s in samples {
match s {
Err(e) => {
println!("Failed to fetch ...: {}", e);
println!("Failed to fetch: {}", e);
}
Ok(None) => {}
Ok(Some(ingested)) => {

View file

@ -1,4 +1,4 @@
INSERT INTO sidx_uri_sample(uri_id, blob_id)
INSERT INTO sidx_uri_sample(uri_id, blake3_id)
VALUES(
(
SELECT
@ -13,7 +13,7 @@ VALUES(
SELECT
id
FROM
sidx_blob
sidx_blake3
WHERE
blake3 = ?
)

View file

@ -3,20 +3,20 @@ CREATE TABLE IF NOT EXISTS sidx_uri(
uri TEXT UNIQUE,
PRIMARY KEY(id)
);
CREATE TABLE IF NOT EXISTS sidx_blob(
CREATE TABLE IF NOT EXISTS sidx_blake3(
id INTEGER,
blake3 TEXT UNIQUE,
blake3 TEXT UNIQUE, /* snix-castore node */
n_bytes INTEGER NOT NULL,
PRIMARY KEY(id)
);
CREATE TABLE IF NOT EXISTS sidx_uri_sample(
id INTEGER,
uri_id INTEGER NOT NULL,
blob_id INTEGER,
blake3_id INTEGER,
epoch INTEGER NOT NULL DEFAULT (unixepoch()),
PRIMARY KEY(id),
FOREIGN KEY(uri_id) REFERENCES sidx_uri(id),
FOREIGN KEY(blob_id) REFERENCES sidx_blob(id)
FOREIGN KEY(blake3_id) REFERENCES sidx_blake3(id)
);
CREATE INDEX IF NOT EXISTS sidx_uri_blob_idx
ON sidx_uri_sample(uri_id, blob_id, epoch);
CREATE INDEX IF NOT EXISTS sidx_uri_blake3_idx
ON sidx_uri_sample(uri_id, blake3_id, epoch);

View file

@ -5,10 +5,10 @@ SELECT
FROM
sidx_uri_sample AS s,
sidx_uri AS u,
sidx_blob AS b
sidx_blake3 AS b
ON
s.uri_id = u.id
AND s.blob_id = b.id
AND s.blake3_id = b.id
WHERE
u.uri = ?
ORDER BY

View file

@ -1,4 +1,4 @@
INSERT INTO sidx_blob(blake3, n_bytes)
INSERT INTO sidx_blake3(blake3, n_bytes)
VALUES
(?, ?)
ON CONFLICT DO NOTHING;