Ingestable: use ingest_path for local paths
This way we don't error out when a path is a directory. That said, we're still only recording the root in sidx.db (e.g. manifests/ but not manifests/*.json); we should change that next.

Also renamed "blob_id" to "blake3_id", because datasette has a special code path for ${column}_id columns that reference a table containing ${column}.
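As a sketch of what the rename buys us (a hypothetical query, not part of this commit; table and column names are the ones from the schema change below), datasette should now treat sidx_uri_sample.blake3_id as a reference into sidx_blake3:

    -- Resolve a URI to its most recently sampled castore root.
    -- blake3_id now points at the table (sidx_blake3) that has a blake3 column.
    SELECT u.uri, b.blake3, b.n_bytes, s.epoch
    FROM sidx_uri_sample AS s
    JOIN sidx_uri AS u ON s.uri_id = u.id
    JOIN sidx_blake3 AS b ON s.blake3_id = b.id
    WHERE u.uri = ?
    ORDER BY s.epoch DESC
    LIMIT 1;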
parent 65326b2dcb
commit 56a0b346cd
5 changed files with 53 additions and 48 deletions
src/main.rs | 79

@@ -1,16 +1,14 @@
-use std::{
-    path::{absolute, PathBuf},
-    pin::Pin,
-};
+use std::path::{absolute, PathBuf};
 
 use anyhow::anyhow;
+use anyhow::Context;
 use clap::Parser;
 use futures::{stream, StreamExt, TryStreamExt};
 use rusqlite::{params, OptionalExtension};
-use snix_castore::{blobservice, directoryservice};
+use snix_castore::{blobservice, directoryservice, import::fs::ingest_path};
 use url::Url;
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 enum Ingestable {
     Url(Url),
     Path(PathBuf),
@@ -143,7 +141,7 @@ async fn main() {
     let samples = stream::iter(args.inputs.iter().map(|uri| {
         let client = &client;
         let blob_service = &blob_service;
-        let _dir_service = &dir_service;
+        let dir_service = &dir_service;
         let con = &con;
         let mut find_sample = con
             .prepare(include_str!("q/latest-download.sql"))
@@ -174,40 +172,47 @@ async fn main() {
                     }));
                 }
             }
-            let mut r: Pin<Box<dyn tokio::io::AsyncRead>> = {
-                match uri {
-                    Ingestable::Path(path) => match tokio::fs::File::open(path).await {
-                        Ok(f) => Box::pin(f),
-                        Err(e) => {
-                            return Err(anyhow!("Failed to read {:?}: {}", path, e));
+            let (digest, n_bytes) = match uri {
+                Ingestable::Path(path) => {
+                    match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
+                        .await?
+                    {
+                        snix_castore::Node::Directory { digest, size } => (digest, size),
+                        snix_castore::Node::File {
+                            digest,
+                            size,
+                            executable: _,
+                        } => (digest, size),
+                        snix_castore::Node::Symlink { target: _ } => {
+                            return Err(anyhow!("TODO: Figure out what to do with symlink roots"))
                         }
-                    },
-                    Ingestable::Url(url) => {
-                        let res = match client.get(url.clone()).send().await {
-                            Ok(res) => res.error_for_status()?,
-                            Err(e) => {
-                                return Err(anyhow!("Failed to GET {}: {}", url, e));
-                            }
-                        };
-                        let r = tokio_util::io::StreamReader::new(
-                            res.bytes_stream().map_err(std::io::Error::other),
-                        );
-                        Box::pin(r)
                     }
                 }
-            };
-            let mut w = blob_service.open_write().await;
-            let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
-                Ok(n) => n,
-                Err(e) => {
-                    return Err(anyhow!(
-                        "tokio::io::copy failed for uri={} with {}",
-                        uri_s,
-                        e
-                    ));
+                Ingestable::Url(url) => {
+                    let res = client
+                        .get(url.clone())
+                        .send()
+                        .await
+                        .context(format!("Request.send failed early for {:?}", uri))?
+                        .error_for_status()?;
+                    let mut r = tokio_util::io::StreamReader::new(
+                        res.bytes_stream().map_err(std::io::Error::other),
+                    );
+                    let mut w = blob_service.open_write().await;
+                    let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
+                        Ok(n) => n,
+                        Err(e) => {
+                            return Err(anyhow!(
+                                "tokio::io::copy failed for uri={} with {}",
+                                uri_s,
+                                e
+                            ));
+                        }
+                    };
+                    let digest = w.close().await?;
+                    (digest, n_bytes)
                 }
             };
-            let digest = w.close().await?;
             let digest64 = format!("{}", digest);
             add_blob.execute(params![digest64, n_bytes,])?;
             add_uri.execute(params![uri_s])?;
@@ -229,7 +234,7 @@ async fn main() {
     for s in samples {
         match s {
             Err(e) => {
-                println!("Failed to fetch ...: {}", e);
+                println!("Failed to fetch: {}", e);
             }
             Ok(None) => {}
             Ok(Some(ingested)) => {

@@ -1,4 +1,4 @@
-INSERT INTO sidx_uri_sample(uri_id, blob_id)
+INSERT INTO sidx_uri_sample(uri_id, blake3_id)
 VALUES(
     (
         SELECT
@@ -13,7 +13,7 @@ VALUES(
         SELECT
             id
         FROM
-            sidx_blob
+            sidx_blake3
         WHERE
             blake3 = ?
     )

@@ -3,20 +3,20 @@ CREATE TABLE IF NOT EXISTS sidx_uri(
     uri TEXT UNIQUE,
     PRIMARY KEY(id)
 );
-CREATE TABLE IF NOT EXISTS sidx_blob(
+CREATE TABLE IF NOT EXISTS sidx_blake3(
     id INTEGER,
-    blake3 TEXT UNIQUE,
+    blake3 TEXT UNIQUE, /* snix-castore node */
     n_bytes INTEGER NOT NULL,
     PRIMARY KEY(id)
 );
 CREATE TABLE IF NOT EXISTS sidx_uri_sample(
     id INTEGER,
     uri_id INTEGER NOT NULL,
-    blob_id INTEGER,
+    blake3_id INTEGER,
     epoch INTEGER NOT NULL DEFAULT (unixepoch()),
     PRIMARY KEY(id),
     FOREIGN KEY(uri_id) REFERENCES sidx_uri(id),
-    FOREIGN KEY(blob_id) REFERENCES sidx_blob(id)
+    FOREIGN KEY(blake3_id) REFERENCES sidx_blake3(id)
 );
-CREATE INDEX IF NOT EXISTS sidx_uri_blob_idx
-    ON sidx_uri_sample(uri_id, blob_id, epoch);
+CREATE INDEX IF NOT EXISTS sidx_uri_blake3_idx
+    ON sidx_uri_sample(uri_id, blake3_id, epoch);

src/q/latest-download.sql

@@ -5,10 +5,10 @@ SELECT
 FROM
     sidx_uri_sample AS s,
     sidx_uri AS u,
-    sidx_blob AS b
+    sidx_blake3 AS b
 ON
     s.uri_id = u.id
-    AND s.blob_id = b.id
+    AND s.blake3_id = b.id
 WHERE
     u.uri = ?
 ORDER BY

@@ -1,4 +1,4 @@
-INSERT INTO sidx_blob(blake3, n_bytes)
+INSERT INTO sidx_blake3(blake3, n_bytes)
 VALUES
     (?, ?)
 ON CONFLICT DO NOTHING;