feat(fetch-listing): poc recursive fetching
parent 6df68a7e9c
commit cfff120c9b
5 changed files with 1370 additions and 133 deletions
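The new fetch-listing subcommand downloads a listing page into the castore blob service, parses the stored HTML for <a href> targets, and feeds each joined URL back into a work queue until max_depth runs out; URLs already seen are skipped, each ingest is recorded in sqlite, and a Semaphore bounds the number of parallel downloads. The sketch below is a reduced model of that coordinator loop, not the commit's code: fetch_children is a hypothetical stand-in for the download-and-extract step, and a Done message plus an active counter are added so the example terminates instead of keeping the stream open.

// Reduced sketch of the queue-driven recursion behind fetch_from_listing.
use std::collections::HashSet;
use std::sync::Arc;
use tokio::sync::{mpsc, Semaphore};

#[derive(Debug)]
enum Msg {
    Recurse(String, usize), // url to visit, remaining depth
    Ingested(String),       // url whose body has been stored
    Done,                   // one worker finished (added here, not in the commit)
}

async fn fetch_children(url: &str) -> Vec<String> {
    // Hypothetical stand-in for "download the page and extract its <a href> targets".
    if url.matches('/').count() < 5 {
        vec![format!("{url}/a"), format!("{url}/b")]
    } else {
        vec![]
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<Msg>(64);
    let limit = Arc::new(Semaphore::new(4)); // like --max-parallel
    let mut seen: HashSet<String> = HashSet::new();
    let mut active = 0usize; // workers still running

    tx.send(Msg::Recurse("https://example.com".into(), 3))
        .await
        .unwrap();

    while let Some(m) = rx.recv().await {
        match m {
            Msg::Ingested(url) => eprintln!("ingested {url}"),
            Msg::Done => active -= 1,
            Msg::Recurse(url, depth) => {
                // Depth limit and de-duplication, as in the commit's coordinator loop.
                if depth > 0 && seen.insert(url.clone()) {
                    active += 1;
                    let tx = tx.clone();
                    let limit = limit.clone();
                    tokio::spawn(async move {
                        // Awaiting the permit is what enforces the parallelism bound.
                        let _permit = limit.acquire_owned().await.unwrap();
                        let children = fetch_children(&url).await;
                        tx.send(Msg::Ingested(url)).await.unwrap();
                        for child in children {
                            tx.send(Msg::Recurse(child, depth - 1)).await.unwrap();
                        }
                        tx.send(Msg::Done).await.unwrap();
                    });
                }
            }
        }
        if active == 0 {
            break;
        }
    }
}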
src/main.rs (463 changes)
@@ -1,12 +1,24 @@
use std::collections::HashSet;
use std::path::{absolute, PathBuf};
use std::str::FromStr;
use std::sync::Arc;

use anyhow::anyhow;
use anyhow::Context;
use anyhow::{anyhow, Error};
use clap::Parser;
use clap::Subcommand;
use futures::{stream, StreamExt, TryStreamExt};
use rusqlite::{params, OptionalExtension};
use scraper::{Html, Selector};
use snix_castore::blobservice::BlobService;
use snix_castore::directoryservice::DirectoryService;
use snix_castore::B3Digest;
use snix_castore::{blobservice, directoryservice, import::fs::ingest_path};
use std::sync::Mutex;
use tokio::io::{AsyncReadExt, BufReader};
use tokio::sync::mpsc::{channel, Sender};
use tokio::sync::Semaphore;
use tokio_stream::wrappers::ReceiverStream;
use url::Url;

#[derive(Clone, Debug)]
@@ -15,22 +27,28 @@ enum Ingestable {
    Path(PathBuf),
}

#[derive(Debug)]
#[derive(Debug, Clone)]
enum IngestedWhen {
    Now,
    Before,
}

#[derive(Debug)]
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct Ingested {
    sample_id: u32,
    uri: String,
    blake3: String,
    blake3: B3Digest,
    epoch: u32,
    when: IngestedWhen,
}

#[derive(Clone)]
enum FetchListingMessage {
    Ingested(Url, Ingested),
    Recurse(Url, usize),
}

impl std::fmt::Display for Ingestable {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
@@ -45,7 +63,7 @@ impl std::fmt::Display for Ingestable {
    }
}

fn parse_url_or_path(s: &str) -> Result<Ingestable, anyhow::Error> {
fn parse_url_or_path(s: &str) -> Result<Ingestable, Error> {
    if s.is_empty() {
        Err(anyhow!("Empty path (url)"))
    } else if s.starts_with("./") || s.starts_with("/") {
@@ -69,7 +87,7 @@ fn parse_url_or_path(s: &str) -> Result<Ingestable, anyhow::Error> {
fn data_path() -> PathBuf {
    let xdg_data_dir = std::env::var("XDG_DATA_DIR")
        .and_then(|s| Ok(PathBuf::from(s)))
        .or_else(|_| -> Result<PathBuf, anyhow::Error> {
        .or_else(|_| -> Result<PathBuf, Error> {
            match std::env::home_dir() {
                Some(p) => Ok(p.join(".local/share")),
                None => Err(anyhow!("...")), // FIXME
@@ -93,6 +111,12 @@ enum Command {
        #[clap(value_parser = parse_url_or_path, num_args = 1)]
        inputs: Vec<Ingestable>,
    },
    FetchListing {
        #[clap(value_parser, long, default_value_t = 5)]
        max_depth: usize,
        #[clap(value_parser, num_args = 1)]
        inputs: Vec<Url>,
    },
}

#[derive(Parser)]
@@ -100,7 +124,7 @@ struct Cli {
    #[clap(short, long, action)]
    refetch: bool,

    #[clap(short, long, value_parser, default_value_t = 5)]
    #[clap(short, long, value_parser, default_value_t = 4)]
    max_parallel: usize,

    #[clap(short, long, value_parser, default_value_os_t = default_db_path())]
@@ -113,130 +137,33 @@ struct Cli {
    command: Option<Command>,
}

async fn ingest<BS, DS>(
    inputs: &Vec<Ingestable>,
struct SidxContext<BS, DS>
where
    BS: blobservice::BlobService + Clone + Send + 'static,
    DS: directoryservice::DirectoryService + Clone + Send + 'static,
{
    refetch: bool,
    max_parallel: usize,
    http_client: reqwest::Client,
    http: reqwest::Client,
    con: Arc<Mutex<rusqlite::Connection>>,
    blob_service: BS,
    dir_service: DS,
    con: rusqlite::Connection,
) -> Vec<Result<Option<Ingested>, anyhow::Error>>
where
    BS: blobservice::BlobService,
    DS: directoryservice::DirectoryService,
{
    let samples = stream::iter(inputs.iter().map(|uri| {
        let client = &http_client;
        let blob_service = &blob_service;
        let dir_service = &dir_service;
        let con = &con;

        let mut find_sample = con
            .prepare(include_str!("q/latest-download.sql"))
            .expect("Failed to prepare latest-download.sql");
        let mut add_sample = con
            .prepare(include_str!("q/add-sample.sql"))
            .expect("Failed to prepare add-sample.sql");
        let mut add_blob = con
            .prepare(include_str!("q/upsert-blob.sql"))
            .expect("Failed to prepare upsert-blob.sql");
        let mut add_uri = con
            .prepare(include_str!("q/upsert-uri.sql"))
            .expect("Failed to prepare upsert-uri.sql");

        async move {
            let uri_s = uri.to_string();
            let latest_download = find_sample
                .query_row(params![uri_s], |r| <(u32, String, u32)>::try_from(r))
                .optional()?;
            if let Some((sample_id, blake3, epoch)) = latest_download {
                if !refetch {
                    return Ok::<Option<Ingested>, anyhow::Error>(Some(Ingested {
                        sample_id,
                        uri: uri_s,
                        blake3,
                        epoch,
                        when: IngestedWhen::Before,
                    }));
                }
            }
            let (digest, n_bytes) = match uri {
                Ingestable::Path(path) => {
                    match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
                        .await?
                    {
                        snix_castore::Node::Directory { digest, size } => (digest, size),
                        snix_castore::Node::File {
                            digest,
                            size,
                            executable: _,
                        } => (digest, size),
                        snix_castore::Node::Symlink { target: _ } => {
                            return Err(anyhow!("TODO: Figure out what to do with symlink roots"))
                        }
                    }
                }
                Ingestable::Url(url) => {
                    let res = client
                        .get(url.clone())
                        .send()
                        .await
                        .context(format!("Request.send failed early for {:?}", uri))?
                        .error_for_status()?;
                    let mut r = tokio_util::io::StreamReader::new(
                        res.bytes_stream().map_err(std::io::Error::other),
                    );
                    let mut w = blob_service.open_write().await;
                    let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
                        Ok(n) => n,
                        Err(e) => {
                            return Err(anyhow!(
                                "tokio::io::copy failed for uri={} with {}",
                                uri_s,
                                e
                            ));
                        }
                    };
                    let digest = w.close().await?;
                    (digest, n_bytes)
                }
            };
            let digest64 = format!("{}", digest);
            add_blob.execute(params![digest64, n_bytes,])?;
            add_uri.execute(params![uri_s])?;
            let (sample_id, epoch) = add_sample
                .query_row(params![uri_s, digest64], |row| <(u32, u32)>::try_from(row))?;
            Ok(Some(Ingested {
                sample_id,
                uri: uri_s,
                blake3: digest64,
                epoch,
                when: IngestedWhen::Now,
            }))
        }
    }))
    .buffer_unordered(max_parallel)
    .collect::<Vec<Result<Option<Ingested>, _>>>()
    .await;

    samples
}

#[tokio::main]
async fn main() {
    let args = Cli::parse();

    args.db_path.parent().and_then(|p| {
async fn open_context(
    refetch: bool,
    max_parallel: usize,
    db_path: PathBuf,
    castore_path: PathBuf,
) -> SidxContext<Arc<dyn BlobService>, Arc<dyn DirectoryService>> {
    if let Some(p) = db_path.parent() {
        let _ = std::fs::create_dir_all(p);
        Some(())
    });
    }

    let con =
        rusqlite::Connection::open(&args.db_path).expect("Failed to construct Database object");
    let con = rusqlite::Connection::open(&db_path).expect("Failed to construct Database object");
    con.execute_batch(include_str!("q/init.sql"))
        .expect("Failed to execute init.sql");
    let castore_path = absolute(args.castore_path).expect("Failed to canonicalize castore_path");
    let castore_path = absolute(castore_path).expect("Failed to canonicalize castore_path");
    let blob_service = blobservice::from_addr(&std::format!(
        "objectstore+file://{}",
        castore_path
@@ -256,20 +183,279 @@ async fn main() {
        .await
        .expect("Couldn't initialize .castore/directory");

    let client = reqwest::Client::new();
    SidxContext::<Arc<dyn BlobService>, Arc<dyn DirectoryService>> {
        refetch,
        max_parallel,
        http: reqwest::Client::new(),
        con: Arc::new(Mutex::new(con)),
        blob_service,
        dir_service,
    }
}

impl<BS: BlobService + Clone, DS: DirectoryService + Clone> SidxContext<BS, DS> {
    async fn db_latest_download(&self, uri: &str) -> Result<Option<Ingested>, Error> {
        let lock = self.con.lock().unwrap();
        let mut find_sample = lock
            .prepare_cached(include_str!("q/latest-download.sql"))
            .expect("Failed to prepare latest-download.sql");
        find_sample
            .query_row(params![uri], |r| <(u32, String, u32)>::try_from(r))
            .optional()
            .context("db_latest_download.sql")
            .and_then(|maybe_triple| match maybe_triple {
                Some((sample_id, blake3, epoch)) => Ok(Some(Ingested {
                    sample_id,
                    uri: uri.to_string(),
                    blake3: B3Digest::from_str(&blake3)?,
                    epoch,
                    when: IngestedWhen::Before,
                })),
                None => Ok(None),
            })
    }
    async fn db_add_sample(&self, uri: &str, blake3: &str) -> Result<(u32, u32), rusqlite::Error> {
        let lock = self.con.lock().unwrap();
        let mut add_sample = lock
            .prepare_cached(include_str!("q/add-sample.sql"))
            .expect("Failed to prepare add-sample.sql");
        add_sample.query_row(params![uri, blake3], |row| <(u32, u32)>::try_from(row))
    }
    async fn db_add_blob(&self, blake3: &str, n_bytes: u64) -> Result<usize, rusqlite::Error> {
        let lock = self.con.lock().unwrap();
        let mut add_blob = lock
            .prepare_cached(include_str!("q/upsert-blob.sql"))
            .expect("Failed to prepare upsert-blob.sql");
        add_blob.execute(params![blake3, n_bytes,])
    }
    async fn db_add_uri(&self, uri: &str) -> Result<usize, rusqlite::Error> {
        let lock = self.con.lock().unwrap();
        let mut add_uri = lock
            .prepare_cached(include_str!("q/upsert-uri.sql"))
            .expect("Failed to prepare upsert-uri.sql");

        add_uri.execute(params![uri])
    }
    async fn record_ingested_node(
        &self,
        uri: &str,
        blake3: &snix_castore::B3Digest,
        n_bytes: u64,
    ) -> Result<Ingested, Error> {
        let digest64 = format!("{}", blake3);
        self.db_add_blob(&digest64, n_bytes).await?;
        self.db_add_uri(&uri).await?;
        let (sample_id, epoch) = self.db_add_sample(&uri, &digest64).await?;
        Ok(Ingested {
            sample_id,
            uri: uri.to_string(),
            blake3: blake3.clone(),
            epoch,
            when: IngestedWhen::Now,
        })
    }
    async fn download_no_cache(&self, uri: &Url) -> Result<Ingested, Error> {
        let uri_s = uri.to_string();
        let res = self
            .http
            .get(uri.clone())
            .send()
            .await
            .context(format!("Request::send failed early for {:?}", uri))?
            .error_for_status()?;
        let mut r =
            tokio_util::io::StreamReader::new(res.bytes_stream().map_err(std::io::Error::other));
        let mut w = self.blob_service.open_write().await;
        let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
            Ok(n) => n,
            Err(e) => {
                return Err(anyhow!(
                    "tokio::io::copy failed for uri={} with {}",
                    uri_s,
                    e
                ));
            }
        };
        let digest = w.close().await?;
        self.record_ingested_node(&uri_s, &digest, n_bytes).await
    }
    async fn download(&self, uri: &Url) -> Result<Ingested, Error> {
        if self.refetch {
            self.download_no_cache(&uri).await
        } else {
            match self.db_latest_download(&uri.to_string()).await? {
                Some(ingested) => Ok(ingested),
                None => self.download_no_cache(&uri).await,
            }
        }
    }
    async fn ingest(&self, inputs: &Vec<Ingestable>) -> Vec<Result<Option<Ingested>, Error>> {
        let samples = stream::iter(inputs.iter().map(|uri| {
            let blob_service = &self.blob_service;
            let dir_service = &self.dir_service;

            async move {
                let uri_s = uri.to_string();
                let latest_download = self.db_latest_download(&uri_s).await?;
                if latest_download.is_some() {
                    return Ok(latest_download);
                }
                match uri {
                    Ingestable::Path(path) => {
                        match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
                            .await?
                        {
                            snix_castore::Node::Directory { digest, size } => self
                                .record_ingested_node(&uri_s, &digest, size)
                                .await
                                .map(Some),

                            snix_castore::Node::File {
                                digest,
                                size,
                                executable: _,
                            } => self
                                .record_ingested_node(&uri_s, &digest, size)
                                .await
                                .map(Some),
                            snix_castore::Node::Symlink { target: _ } => {
                                Err(anyhow!("TODO: Figure out what to do with symlink roots"))
                            }
                        }
                    }
                    Ingestable::Url(url) => self.download(url).await.map(Some),
                }
            }
        }))
        .buffer_unordered(self.max_parallel)
        .collect::<Vec<Result<Option<Ingested>, _>>>()
        .await;

        samples
    }

    fn extract_hrefs(content: &str) -> Result<Vec<String>, Error> {
        let sel = Selector::parse("a").map_err(|e| anyhow!(e.to_string()))?;
        let html = Html::parse_document(&content);

        Ok(html
            .select(&sel)
            .flat_map(|elt| elt.value().attr("href"))
            .map(|s| s.to_string())
            .collect::<Vec<_>>())
    }

    async fn fetch_from_listing_impl(
        self: Arc<Self>,
        url: Url,
        max_depth: usize,
        tx: Sender<FetchListingMessage>,
    ) -> Result<(), Error> {
        eprintln!("Downloading {:?}", url.to_string());
        let root = self.download(&url).await?;
        tx.send(FetchListingMessage::Ingested(url.clone(), root.clone()))
            .await
            .context("Stopped accepting tasks before processing an Ingested notification")?;
        if max_depth <= 0 {
            return Ok(());
        }
        /* TODO: no need to load blobs to memory unless you know they're text/html */
        match self.blob_service.open_read(&root.blake3).await? {
            Some(mut reader) => {
                let content = {
                    let mut br = BufReader::new(&mut *reader);
                    let mut content = String::new();
                    br.read_to_string(&mut content).await?;
                    content
                };
                let hrefs = Self::extract_hrefs(&content).unwrap_or(vec![]);
                /* max_depth > 0 here */
                for href in hrefs {
                    let next_url = url.join(&href).context("Constructing next_url")?;
                    tx.send(FetchListingMessage::Recurse(
                        next_url.clone(),
                        max_depth - 1,
                    ))
                    .await
                    .context("Stopped accepting tasks before finishing all hrefs")?;
                }
                Ok(())
            }
            None => Err(anyhow!("Couldn't read the ingested blob")),
        }
    }

    async fn fetch_from_listing(
        self: Arc<Self>,
        url: Url,
        max_depth: usize,
    ) -> ReceiverStream<Ingested> {
        let mq_size = 10;

        /* TODO: move task queue to e.g. sqlite */
        let (tx, mut rx) = channel(mq_size);

        let (out_tx, out_rx) = channel(mq_size);

        let semaphore = Arc::new(Semaphore::new(self.max_parallel));

        tokio::spawn({
            async move {
                let mut seen: HashSet<String> = HashSet::new();
                tx.send(FetchListingMessage::Recurse(url, max_depth))
                    .await
                    .expect("fetch_from_listing failed populating the queue");
                while let Some(m) = rx.recv().await {
                    match m {
                        FetchListingMessage::Ingested(_url, ingested) => {
                            out_tx
                                .send(ingested)
                                .await
                                .expect("ReceiverStream failed to accept an Ingestable");
                        }
                        FetchListingMessage::Recurse(url, max_depth) => {
                            if max_depth > 0 && !seen.contains(&url.to_string()) {
                                seen.insert(url.to_string());
                                tokio::spawn({
                                    let s = self.clone();
                                    let url = url.clone();
                                    let tx = tx.clone();
                                    let semaphore = semaphore.clone();
                                    async move {
                                        let _permit = semaphore.acquire();
                                        s.fetch_from_listing_impl(url, max_depth, tx).await
                                    }
                                });
                            }
                        }
                    }
                }
            }
        });
        ReceiverStream::new(out_rx)
    }
}

#[tokio::main]
async fn main() {
    let args = Cli::parse();

    let _cwd = std::env::current_dir().expect("Couldn't get CWD");
    let _host_name = std::env::var("HOSTNAME").map_or(None, Some);

    let ctx = Arc::new(
        open_context(
            args.refetch,
            args.max_parallel,
            args.db_path,
            args.castore_path,
        )
        .await,
    );

    match args.command {
        Some(Command::Ingest { inputs }) => {
            let samples = ingest(
                &inputs,
                args.refetch,
                args.max_parallel,
                client,
                blob_service,
                dir_service,
                con,
            )
            .await;
            let samples = ctx.ingest(&inputs).await;
            for s in samples {
                match s {
                    Err(e) => {
@@ -282,6 +468,19 @@ async fn main() {
                }
            }
        }
        Some(Command::FetchListing { max_depth, inputs }) => {
            let ingested: Vec<Ingested> = stream::iter(inputs)
                .then(async |i| {
                    let i = i.clone();
                    ctx.clone().fetch_from_listing(i, max_depth).await
                })
                .flatten_unordered(args.max_parallel)
                .collect()
                .await;
            for i in ingested {
                eprintln!("{:?}", i);
            }
        }
        None => {}
    }
}
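With clap's derive conventions, the new variant should surface as a fetch-listing subcommand taking a --max-depth flag (default 5) and one or more positional listing URLs, next to the existing top-level --refetch and --max-parallel options (the latter's default drops from 5 to 4 in this commit). A hypothetical invocation, since the binary name does not appear in this diff:

    cargo run -- --max-parallel 4 fetch-listing --max-depth 3 https://example.com/pub/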