use std::collections::{HashMap, HashSet};
use std::marker::PhantomData;
use std::path::{PathBuf, absolute};
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use std::{fmt, io};

use anyhow::Context;
use anyhow::{Error, anyhow};
use clap::Parser;
use clap::Subcommand;
use futures::{StreamExt, TryStreamExt, stream};
use rusqlite::fallible_iterator::FallibleIterator as _;
use rusqlite::{OptionalExtension, named_params, params};
use scraper::{Html, Selector};
use serde::de::{self, Visitor};
use serde::{Deserialize, Serialize};
use snix_castore::B3Digest;
use snix_castore::blobservice::BlobService;
use snix_castore::directoryservice::DirectoryService;
use snix_castore::{blobservice, directoryservice, import::fs::ingest_path};
use tokio::io::{AsyncReadExt, BufReader};
use tokio::sync::Semaphore;
use tokio::sync::mpsc::{Sender, channel};
use tokio_stream::wrappers::ReceiverStream;
use url::Url;

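/// A source that can be ingested: either a URL fetched over HTTP(S) or a
/// local filesystem path imported into the castore.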
#[derive(Clone, Debug)]
enum Ingestable {
    Url(Url),
    Path(PathBuf),
}

#[derive(Debug, Clone)]
enum SampledWhen {
    Now,
    Before,
}

#[derive(Debug, Clone)]
struct SizedBlob {
    hash: B3Digest,
    n_bytes: u64,
}

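/// One recorded observation of a URI: the blob it resolved to (if any), the
/// HTTP status, the sampling epoch, and whether the record was produced by
/// this run (`Now`) or read back from the database (`Before`).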
#[derive(Debug, Clone)]
#[allow(dead_code)]
struct Sampled {
    sample_id: u32,
    uri: String,
    blob: Option<SizedBlob>,
    http_status: Option<u16>,
    epoch: u32,
    when: SampledWhen,
}

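/// Messages passed between `fetch_from_listing` and its workers: either a
/// finished sample to report, or a request to recurse into another URL with
/// the remaining depth budget.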
#[derive(Clone)]
enum FetchListingMessage {
    Sampled(Url, Sampled),
    Recurse(Url, usize, Sender<FetchListingMessage>),
}

impl std::fmt::Display for Ingestable {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Ingestable::Url(url) => write!(f, "{}", url),
            Ingestable::Path(path_buf) => match path_buf.to_str() {
                Some(s) => write!(f, "{}", s),
                None => {
                    panic!("PathBuf::to_str failed")
                }
            },
        }
    }
}

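/// Clap value parser for `Ingest` inputs: strings starting with `./` or `/`
/// and `file://` URLs become paths, anything else must parse as a URL.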
fn parse_url_or_path(s: &str) -> Result<Ingestable, Error> {
    if s.is_empty() {
        Err(anyhow!("Empty path (url)"))
    } else if s.starts_with("./") || s.starts_with("/") {
        Ok(Ingestable::Path(PathBuf::from(s)))
    } else {
        let url = Url::parse(s)?;
        if url.scheme() == "file" {
            match url.to_file_path() {
                Ok(s) => Ok(Ingestable::Path(s)),
                Err(()) => Err(anyhow!(
                    "parse_url_or_path: couldn't convert Url ({}) to Path",
                    url
                )),
            }
        } else {
            Ok(Ingestable::Url(url))
        }
    }
}

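/// Resolves the data directory: `$XDG_DATA_DIR/sidx` if the variable is set,
/// otherwise `~/.local/share/sidx`, falling back to a relative `.sidx`.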
fn data_path() -> PathBuf {
    let xdg_data_dir = std::env::var("XDG_DATA_DIR")
        .map(PathBuf::from)
        .or_else(|_| -> Result<PathBuf, Error> {
            match std::env::home_dir() {
                Some(p) => Ok(p.join(".local/share")),
                None => Err(anyhow!("...")), // FIXME
            }
        });
    match xdg_data_dir {
        Ok(p) => p.join("sidx"),
        Err(_) => PathBuf::from(".sidx"),
    }
}

fn default_castore_path() -> PathBuf {
    data_path().join("castore")
}

fn default_db_path() -> PathBuf {
    data_path().join("sidx.db")
}

#[derive(Subcommand)]
enum Command {
    Ingest {
        #[clap(value_parser = parse_url_or_path, num_args = 1)]
        inputs: Vec<Ingestable>,
    },
    FetchListing {
        #[clap(value_parser, long, default_value_t = 5)]
        max_depth: usize,
        #[clap(value_parser, long, default_value_t = 1024 * 1024)]
        html_max_bytes: u64,
        #[clap(value_parser, num_args = 1)]
        inputs: Vec<Url>,
    },
    ParseUrl {
        #[clap(value_parser, num_args = 1)]
        url: Vec<Url>,
    },
    DemoCudaManifest,
    FormatCudaManifest,
    ProcessCudaManifests {
        #[clap(short, long, action)]
        include_finished: bool,
    },
}

#[derive(Parser)]
struct Cli {
    #[clap(short, long, action)]
    refetch: bool,

    #[clap(short, long, value_parser, default_value_t = 2)]
    max_parallel: usize,

    #[clap(short, long, value_parser, default_value_os_t = default_db_path())]
    db_path: PathBuf,

    #[clap(short, long, value_parser, default_value_os_t = default_castore_path())]
    castore_path: PathBuf,

    #[command(subcommand)]
    command: Option<Command>,
}

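/// Shared state for all subcommands: the HTTP client and its concurrency
/// limit, the SQLite connection behind a mutex, and the castore blob and
/// directory services.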
struct SidxContext<BS, DS>
where
    BS: blobservice::BlobService + Clone + Send + 'static,
    DS: directoryservice::DirectoryService + Clone + Send + 'static,
{
    refetch: bool,
    max_parallel: usize,
    http: reqwest::Client,
    http_semaphore: Arc<Semaphore>,
    con: Arc<Mutex<rusqlite::Connection>>,
    blob_service: BS,
    dir_service: DS,
}

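/// Opens the SQLite database (creating parent directories if needed), sets
/// the connection pragmas, runs `q/sidx-init.sql`, and brings up file-backed
/// blob and directory services under `castore_path`.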
async fn open_context(
    refetch: bool,
    max_parallel: usize,
    db_path: PathBuf,
    castore_path: PathBuf,
) -> SidxContext<Arc<dyn BlobService>, Arc<dyn DirectoryService>> {
    if let Some(p) = db_path.parent() {
        let _ = std::fs::create_dir_all(p);
    }
    let con = rusqlite::Connection::open(&db_path).expect("Failed to construct Database object");
    con.pragma_update(None, "journal_mode", "wal").unwrap();
    con.pragma_update(None, "synchronous", "normal").unwrap();
    con.pragma_update(None, "temp_store", "memory").unwrap();
    con.pragma_update(None, "foreign_keys", "on").unwrap();
    con.execute_batch(include_str!("q/sidx-init.sql"))
        .expect("Failed to execute sidx-init.sql");
    let castore_path = absolute(castore_path).expect("Failed to canonicalize castore_path");
    let blob_service = blobservice::from_addr(&std::format!(
        "objectstore+file://{}",
        castore_path
            .join("blob")
            .to_str()
            .expect("Path::to_str unexpectedly broken")
    ))
    .await
    .expect("Couldn't initialize .castore/blob");
    let dir_service = directoryservice::from_addr(&std::format!(
        "objectstore+file://{}",
        castore_path
            .join("directories")
            .to_str()
            .expect("Path::to_str unexpectedly broken")
    ))
    .await
    .expect("Couldn't initialize .castore/directories");

    SidxContext::<Arc<dyn BlobService>, Arc<dyn DirectoryService>> {
        refetch,
        max_parallel,
        http: reqwest::Client::new(),
        http_semaphore: Arc::new(Semaphore::new(max_parallel)),
        con: Arc::new(Mutex::new(con)),
        blob_service,
        dir_service,
    }
}

impl<BS, DS> Drop for SidxContext<BS, DS>
where
    BS: BlobService + Clone,
    DS: DirectoryService + Clone,
{
    fn drop(&mut self) {
        let con = self
            .con
            .lock()
            .expect("Acquiring mutex for sqlite to run #pragma optimize before exit");
        con.pragma_update(None, "analysis_limit", 500).unwrap();
        con.pragma_query(None, "optimize", |_| Ok(())).unwrap();
    }
}

impl<BS: BlobService + Clone, DS: DirectoryService + Clone> SidxContext<BS, DS> {
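    /// Returns the most recent sample recorded for `uri`, or `None` if the
    /// URI has never been sampled (query: `q/latest-download.sql`).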
    async fn latest_sample(&self, uri: &str) -> Result<Option<Sampled>, Error> {
        let lock = self.con.lock().unwrap();
        let mut find_sample = lock
            .prepare_cached(include_str!("q/latest-download.sql"))
            .expect("Failed to prepare latest-download.sql");
        find_sample
            .query_row(params![uri], |r| {
                <(u32, String, u64, Option<u16>, u32)>::try_from(r)
            })
            .optional()
            .context("q/latest-download.sql")
            .and_then(|maybe_tuple| match maybe_tuple {
                Some((sample_id, hash, n_bytes, http_code, epoch)) => Ok(Some(Sampled {
                    sample_id,
                    uri: uri.to_string(),
                    blob: Some(SizedBlob {
                        hash: B3Digest::from_str(&hash)?,
                        n_bytes,
                    }),
                    http_status: http_code,
                    epoch,
                    when: SampledWhen::Before,
                })),
                None => Ok(None),
            })
    }

    async fn db_add_sample(
        &self,
        uri: &str,
        hash: &Option<String>,
        http_code: &Option<u16>,
        content_type: &Option<String>,
    ) -> Result<(u32, u32), Error> {
        let lock = self.con.lock().expect("Locking mutex for db_add_sample");
        let mut add_sample = lock
            .prepare_cached(include_str!("q/add-sample.sql"))
            .context("Failed to prepare add-sample.sql")?;
        Ok(add_sample.query_row(
            named_params! {
                ":uri": uri,
                ":hash": hash,
                ":http_code": http_code,
                ":content_type": content_type
            },
            |row| <(u32, u32)>::try_from(row),
        )?)
    }

    async fn db_add_blob(&self, hash: &str, n_bytes: u64) -> Result<usize, Error> {
        let lock = self.con.lock().expect("db_add_blob: couldn't lock mutex?");
        let mut add_blob = lock
            .prepare_cached(include_str!("q/upsert-blob.sql"))
            .context("Failed to prepare upsert-blob.sql")?;
        Ok(add_blob.execute(params![hash, n_bytes])?)
    }

    async fn db_add_uri(&self, uri: &str) -> Result<usize, Error> {
        let lock = self.con.lock().unwrap();
        let mut add_uri = lock
            .prepare_cached(include_str!("q/upsert-uri.sql"))
            .context("Failed to prepare upsert-uri.sql")?;

        Ok(add_uri.execute(params![uri])?)
    }

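    /// Persists the outcome of one ingestion: upserts the blob (when present)
    /// and the URI, adds a sample row, and returns it as a `Sampled` marked
    /// `Now`.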
    async fn record_ingested_node(
        &self,
        uri: &str,
        blob: &Option<SizedBlob>,
        http_code: Option<u16>,
        content_type: Option<String>,
    ) -> Result<Sampled, Error> {
        let digest64 = if let Some(SizedBlob { hash, n_bytes }) = blob {
            let digest64 = format!("{}", hash);
            self.db_add_blob(&digest64, n_bytes.clone()).await?;
            Some(digest64)
        } else {
            None
        };
        self.db_add_uri(&uri).await?;
        let (sample_id, epoch) = self
            .db_add_sample(&uri, &digest64, &http_code, &content_type)
            .await?;
        Ok(Sampled {
            sample_id,
            uri: uri.to_string(),
            blob: blob.clone(),
            http_status: http_code,
            epoch,
            when: SampledWhen::Now,
        })
    }

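    /// Downloads `uri` (bounded by the semaphore), streams the response body
    /// into the blob service, and records the sample together with the HTTP
    /// status and Content-Type.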
    async fn download(&self, uri: &Url) -> Result<Sampled, Error> {
        let _permit = self.http_semaphore.acquire().await.unwrap();
        eprintln!("Downloading {:?}", uri.to_string());
        let uri_s = uri.to_string();
        let res = self
            .http
            .get(uri.clone())
            .send()
            .await
            .context(format!("Request::send failed early for {:?}", uri))?;

        let status = res.status();
        let status_code = status.as_u16();
        let content_type = res
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|x| x.to_str().ok())
            .map(|x| x.to_string());

        if status.is_success() {
            let mut r = tokio_util::io::StreamReader::new(
                res.bytes_stream().map_err(std::io::Error::other),
            );
            let mut w = self.blob_service.open_write().await;
            let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
                Ok(n) => n,
                Err(e) => {
                    return Err(anyhow!(
                        "tokio::io::copy failed for uri={} with {}",
                        uri_s,
                        e
                    ));
                }
            };
            let digest = w.close().await?;

            self.record_ingested_node(
                &uri_s,
                &Some(SizedBlob {
                    hash: digest,
                    n_bytes,
                }),
                Some(status_code),
                content_type,
            )
            .await
        } else {
            self.record_ingested_node(&uri_s, &None, Some(status_code), content_type)
                .await
        }
    }

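    /// Ensures the blob with this hash is present in the blob service; if it
    /// is missing, retries every URI the database associates with the hash
    /// until one of them reproduces the same digest.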
    async fn ensure_blob(&self, hash: &B3Digest) -> Result<(), Error> {
        if self
            .blob_service
            .has(hash)
            .await
            .context("ensure_blob() accessing blob_service")?
        {
            Ok(())
        } else {
            let b64 = hash.to_string();
            let uris = {
                let con = self.con.lock().unwrap();
                let mut find_uris = con
                    .prepare_cached(include_str!("q/uris-of-hash.sql"))
                    .context("Preparing statement: q/uris-of-hash.sql")
                    .unwrap();
                find_uris
                    .query(named_params! {":hash": b64, ":limit": 100})?
                    .map(|b| b.get(0))
                    .collect::<Vec<String>>()?
            };
            if uris.is_empty() {
                return Err(anyhow!("No uris recorded for {}", b64));
            };
            for uri in uris {
                let url = match Url::parse(&uri) {
                    Ok(url) => url,
                    Err(_) => continue,
                };
                match self
                    .download(&url)
                    .await
                    .context("Redownloading missing blob for ensure_blob")
                {
                    Ok(Sampled {
                        sample_id: _,
                        uri: _,
                        blob,
                        http_status: _,
                        epoch: _,
                        when: _,
                    }) => {
                        if blob.map_or(false, |sb| sb.hash == *hash) {
                            return Ok(());
                        } else {
                            continue;
                        }
                    }
                    Err(_) => {
                        continue;
                    }
                }
            }
            Err(anyhow!(
                "All uris for {} are out of date (result in errors or different hashes)",
                b64
            ))
        }
    }

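    /// Returns a sample for `uri`, reusing the latest recorded one while its
    /// blob is still in the castore; downloads again when `--refetch` is set,
    /// when there is no prior sample, or when the blob has gone missing.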
    async fn ensure_sampled_uri(&self, uri: &Url) -> Result<Sampled, Error> {
        /* TODO: flatten */
        if self.refetch {
            self.download(&uri).await
        } else {
            /* TODO: Add negative TTL */
            match self.latest_sample(&uri.to_string()).await? {
                Some(ingested) => match ingested.blob.clone() {
                    Some(SizedBlob { hash, n_bytes: _ }) => {
                        if self.blob_service.has(&hash).await? {
                            Ok(ingested)
                        } else {
                            self.download(&uri).await
                        }
                    }
                    None => self.download(&uri).await,
                },
                None => self.download(&uri).await,
            }
        }
    }

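    /// Ingests a batch of inputs with at most `max_parallel` in flight:
    /// local paths go through `ingest_path` into the castore, URLs through
    /// `ensure_sampled_uri`; inputs that already have a sample are skipped.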
    async fn ingest(&self, inputs: &Vec<Ingestable>) -> Vec<Result<Option<Sampled>, Error>> {
        let samples = stream::iter(inputs.iter().map(|uri| {
            let blob_service = &self.blob_service;
            let dir_service = &self.dir_service;

            async move {
                let uri_s = uri.to_string();
                let latest_download = self.latest_sample(&uri_s).await?;
                if latest_download.is_some() {
                    return Ok(latest_download);
                }
                match uri {
                    Ingestable::Path(path) => {
                        match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
                            .await?
                        {
                            snix_castore::Node::Directory { digest, size } => self
                                .record_ingested_node(
                                    &uri_s,
                                    &Some(SizedBlob {
                                        hash: digest,
                                        n_bytes: size,
                                    }),
                                    None,
                                    None,
                                )
                                .await
                                .map(Some),
                            snix_castore::Node::File {
                                digest,
                                size,
                                executable: _,
                            } => self
                                .record_ingested_node(
                                    &uri_s,
                                    &Some(SizedBlob {
                                        hash: digest,
                                        n_bytes: size,
                                    }),
                                    None,
                                    None,
                                )
                                .await
                                .map(Some),
                            snix_castore::Node::Symlink { target: _ } => {
                                Err(anyhow!("TODO: Figure out what to do with symlink roots"))
                            }
                        }
                    }
                    Ingestable::Url(url) => self.ensure_sampled_uri(url).await.map(Some),
                }
            }
        }))
        .buffer_unordered(self.max_parallel)
        .collect::<Vec<Result<Option<Sampled>, _>>>()
        .await;

        samples
    }

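    /// Collects the `href` attributes of all `<a>` elements in an HTML
    /// document.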
    fn extract_hrefs(content: &str) -> Result<Vec<String>, Error> {
        let sel = Selector::parse("a").map_err(|e| anyhow!(e.to_string()))?;
        let html = Html::parse_document(&content);

        Ok(html
            .select(&sel)
            .flat_map(|elt| elt.value().attr("href"))
            .map(|s| s.to_string())
            .collect::<Vec<_>>())
    }

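    /// Worker for a single listing page: samples `url`, reports it on `tx`,
    /// and, while depth remains and the page is small enough, parses it as
    /// HTML, enqueues every href for recursion, and records the href edges
    /// in the database.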
    async fn fetch_from_listing_impl(
        self: Arc<Self>,
        url: Url,
        max_depth: usize,
        html_max_bytes: u64,
        tx: Sender<FetchListingMessage>,
    ) -> Result<(), Error> {
        let maybe_root = self.ensure_sampled_uri(&url).await;
        if let Err(ref e) = maybe_root {
            eprintln!("Couldn't download {}: {:?}", url, e);
        };
        let root = maybe_root?;
        tx.send(FetchListingMessage::Sampled(url.clone(), root.clone()))
            .await
            .context("Stopped accepting tasks before processing an Ingested notification")?;
        if max_depth == 0 {
            return Ok(());
        }

        match root.blob {
            None => Err(anyhow!(
                "Couldn't download {}. Status code: {:?}",
                url,
                root.http_status
            )),
            Some(SizedBlob { hash, n_bytes }) => {
                if n_bytes > html_max_bytes {
                    return Ok(());
                }
                match self.blob_service.open_read(&hash).await? {
                    Some(mut reader) => {
                        let content = {
                            let mut br = BufReader::new(&mut *reader);
                            let mut content = String::new();
                            br.read_to_string(&mut content).await?;
                            content
                        };
                        let hrefs = Self::extract_hrefs(&content).unwrap_or(vec![]);
                        /* max_depth > 0 here */
                        for href in hrefs.clone() {
                            let next_url = url.join(&href).context("Constructing next_url")?;
                            tx.send(FetchListingMessage::Recurse(
                                next_url.clone(),
                                max_depth - 1,
                                tx.clone(),
                            ))
                            .await
                            .context("Stopped accepting tasks before finishing all hrefs")?;
                        }
                        {
                            let lock = self.con.lock().expect("Couldn't acquire Mutex?");
                            for href in hrefs {
                                let mut stmt =
                                    lock.prepare_cached(include_str!("q/add-str.sql"))?;
                                stmt.execute(params![href])?;

                                let next_url = url.join(&href).context("Constructing next_url")?;
                                let mut stmt =
                                    lock.prepare_cached(include_str!("q/add-uri-ref.sql"))?;
                                let digest64 = hash.to_string();
                                stmt.execute(named_params! {":source": digest64, ":target": next_url.to_string(), ":why": "href"})?;
                            }
                        };
                        Ok(())
                    }
                    None => Err(anyhow!("Couldn't read the ingested blob")),
                }
            }
        }
    }

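    /// Recursively crawls a listing starting at `url`, up to `max_depth`
    /// hops, deduplicating URLs, and returns the finished samples as a
    /// stream.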
    async fn fetch_from_listing(
        self: Arc<Self>,
        url: Url,
        max_depth: usize,
        html_max_bytes: u64,
    ) -> ReceiverStream<Sampled> {
        let mq_size = 10;

        /* TODO: move task queue to e.g. sqlite */
        let (tx, mut rx) = channel(mq_size);

        let (out_tx, out_rx) = channel(mq_size);

        tokio::spawn({
            async move {
                let mut seen: HashSet<String> = HashSet::new();
                {
                    let tx_moved = tx;
                    tx_moved
                        .send(FetchListingMessage::Recurse(
                            url,
                            max_depth,
                            tx_moved.clone(),
                        ))
                        .await
                        .expect("fetch_from_listing failed populating the queue");
                };
                while let Some(m) = rx.recv().await {
                    match m {
                        FetchListingMessage::Sampled(_url, ingested) => {
                            out_tx
                                .send(ingested)
                                .await
                                .expect("ReceiverStream failed to accept an Ingestable");
                        }
                        FetchListingMessage::Recurse(url, max_depth, tx) => {
                            if max_depth > 0 && !seen.contains(&url.to_string()) {
                                seen.insert(url.to_string());
                                tokio::spawn({
                                    let s = self.clone();
                                    let url = url.clone();
                                    async move {
                                        s.fetch_from_listing_impl(
                                            url,
                                            max_depth,
                                            html_max_bytes,
                                            tx,
                                        )
                                        .await
                                    }
                                });
                            }
                        }
                    }
                }
            }
        });
        ReceiverStream::new(out_rx)
    }
}

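/// Serde helper that accepts either a JSON integer or a numeric string, used
/// for manifest fields that are sometimes exported as strings.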
fn string_or_int<'de, T, D>(deserializer: D) -> Result<T, D::Error>
where
    T: Deserialize<'de> + TryFrom<u64> + FromStr<Err = std::num::ParseIntError>,
    D: serde::Deserializer<'de>,
{
    struct StringOrInt<T>(PhantomData<fn() -> T>);

    impl<'de, T> Visitor<'de> for StringOrInt<T>
    where
        T: Deserialize<'de> + TryFrom<u64> + FromStr<Err = std::num::ParseIntError>,
    {
        type Value = T;

        fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
            formatter.write_str("string or int")
        }

        fn visit_u64<E>(self, value: u64) -> Result<T, E>
        where
            E: de::Error,
        {
            T::try_from(value).map_err(|_e| de::Error::custom("ignored error"))
        }

        fn visit_str<E>(self, value: &str) -> Result<T, E>
        where
            E: de::Error,
        {
            FromStr::from_str(value).map_err(de::Error::custom)
        }
    }

    deserializer.deserialize_any(StringOrInt(PhantomData))
}

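/// One artifact entry in a CUDA JSON manifest: a relative path plus its
/// checksums and declared size.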
#[derive(Serialize, Deserialize, Debug, Clone)]
struct CudaArtifact {
    relative_path: String,
    sha256: String,
    md5: Option<String>,

    // The manifests export size as a string instead of a number
    #[serde(deserialize_with = "string_or_int")]
    size: i64,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(untagged)]
enum CudaArtifactsByTag {
    Single(CudaArtifact),
    Many {
        #[serde(flatten)]
        by_tag: HashMap<String, CudaArtifact>,
    },
}

impl IntoIterator for CudaArtifactsByTag {
    type Item = (Option<String>, CudaArtifact);
    type IntoIter = std::vec::IntoIter<Self::Item>;

    fn into_iter(self) -> std::vec::IntoIter<Self::Item> {
        match self {
            CudaArtifactsByTag::Single(art) => vec![(None, art)].into_iter(),
            CudaArtifactsByTag::Many { by_tag: by_compat } => by_compat
                .iter()
                .map(|(k, x)| (Some(k.clone()), x.clone()))
                .collect::<Vec<Self::Item>>()
                .into_iter(),
        }
    }
}

#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(untagged)]
enum CudaArtifactsByPlatform {
    Binary {
        #[serde(flatten)]
        by_platform: HashMap<String, CudaArtifactsByTag>,
    },
    Source {
        source: CudaArtifact,
    },
}

impl IntoIterator for CudaArtifactsByPlatform {
    type Item = (String, Option<String>, CudaArtifact);

    /* TODO: Figure out which is the trait that doesn't involve copying */
    type IntoIter = std::vec::IntoIter<(String, Option<String>, CudaArtifact)>;

    fn into_iter(self) -> Self::IntoIter {
        match self {
            CudaArtifactsByPlatform::Binary { by_platform } => by_platform
                .iter()
                .flat_map(|(platform, by_tag)| {
                    by_tag
                        .clone()
                        .into_iter()
                        .map(|(tag, artifact)| (platform.clone(), tag.clone(), artifact))
                })
                .collect::<Vec<Self::Item>>()
                .into_iter(),
            CudaArtifactsByPlatform::Source { source } => {
                (vec![("source".to_string(), None, source)]).into_iter()
            }
        }
    }
}

#[derive(Serialize, Deserialize, Debug)]
struct CudaJsonPackage {
    name: Option<String>,
    license: String,
    license_path: Option<String>,
    version: String,

    cuda_variant: Option<Vec<String>>,

    #[serde(flatten)]
    artifacts: CudaArtifactsByPlatform,
}

#[derive(Serialize, Deserialize, Debug)]
struct CudaJsonManifest {
    release_date: Option<String>,
    release_label: Option<String>,
    release_product: Option<String>,

    #[serde(flatten)]
    by_pname: HashMap<String, CudaJsonPackage>,
}

#[tokio::main]
async fn main() {
    let args = Cli::parse();

    let _cwd = std::env::current_dir().expect("Couldn't get CWD");
    let _host_name = std::env::var("HOSTNAME").map_or(None, Some);

    let ctx = Arc::new(
        open_context(
            args.refetch,
            args.max_parallel,
            args.db_path,
            args.castore_path,
        )
        .await,
    );

    match args.command {
        Some(Command::Ingest { inputs }) => {
            let samples = ctx.ingest(&inputs).await;
            for s in samples {
                match s {
                    Err(e) => {
                        eprintln!("Failed to fetch: {}", e);
                    }
                    Ok(None) => {}
                    Ok(Some(ingested)) => {
                        eprintln!("{:?}", ingested)
                    }
                }
            }
        }
        Some(Command::FetchListing {
            max_depth,
            html_max_bytes,
            inputs,
        }) => {
            let ingested: Vec<Sampled> = stream::iter(inputs)
                .then(async |i| {
                    let i = i.clone();
                    ctx.clone()
                        .fetch_from_listing(i, max_depth, html_max_bytes)
                        .await
                })
                .flatten_unordered(args.max_parallel)
                .collect()
                .await;
            for i in ingested {
                eprintln!("{:?}", i);
            }
        }
        Some(Command::ParseUrl { url: urls }) => {
            for url in urls {
                println!("{:?}", url);
            }
        }
        Some(Command::FormatCudaManifest) => {
            println!(
                "{}",
                serde_json::to_string(
                    &serde_json::from_reader::<_, CudaJsonManifest>(io::stdin()).unwrap()
                )
                .unwrap()
            );
        }
        Some(Command::DemoCudaManifest) => {
            println!(
                "{}",
                serde_json::to_string(&CudaJsonManifest {
                    release_date: Some("1984-01-01".to_string()),
                    release_label: Some("8.9.x".to_string()),
                    release_product: Some("cudnn".to_string()),
                    by_pname: HashMap::from([
                        (
                            "cudnn".to_string(),
                            CudaJsonPackage {
                                name: Some("cuDNN Library".to_string()),
                                license: "cudnn".to_string(),
                                license_path: Some("bar/foo".to_string()),
                                version: "8.9.7.6".to_string(),
                                cuda_variant: Some(vec!["11".to_string(), "12".to_string()]),
                                artifacts: CudaArtifactsByPlatform::Binary {
                                    by_platform: HashMap::from([(
                                        "x86_64-linux".to_string(),
                                        CudaArtifactsByTag::Many {
                                            by_tag: HashMap::from([(
                                                "cuda11".to_string(),
                                                CudaArtifact {
                                                    relative_path: "kek".to_string(),
                                                    sha256: "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824".to_string(),
                                                    md5: Some("5d41402abc4b2a76b9719d911017c592".to_string()),
                                                    size: 5,
                                                },
                                            )]),
                                        },
                                    )]),
                                },
                            },
                        ),
                        (
                            "cuda_samples".to_string(),
                            CudaJsonPackage {
                                name: Some("NVIDIA cuDNN samples".to_string()),
                                license: "cudnn".to_string(),
                                license_path: Some("foo/bar".to_string()),
                                version: "8.9.7.6".to_string(),
                                cuda_variant: None,
                                artifacts: CudaArtifactsByPlatform::Source {
                                    source: CudaArtifact {
                                        relative_path: "/biba/boba/fifa".to_string(),
                                        sha256: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855".to_string(),
                                        md5: Some("d41d8cd98f00b204e9800998ecf8427e".to_string()),
                                        size: 0,
                                    },
                                },
                            },
                        ),
                    ]),
                })
                .unwrap()
            );
        }
        Some(Command::ProcessCudaManifests { include_finished }) => {
            let manifests: Vec<(String, String, Option<u64>)> = {
                let con = ctx.con.lock().unwrap();
                con.execute_batch(include_str!("q/cuda-init.sql"))
                    .context("q/cuda-init.sql")
                    .unwrap();
                let mut find_manifests = con
                    .prepare_cached(include_str!("q/find-cuda-manifests.sql"))
                    .context("q/find-cuda-manifests.sql")
                    .unwrap();
                find_manifests
                    .query(named_params! {":include_finished": include_finished})
                    .context("q/find-cuda-manifests.sql")
                    .unwrap()
                    .map(|row| <(String, String, Option<u64>)>::try_from(row))
                    .collect()
                    .expect("Casting result of q/find-cuda-manifests.sql")
            };
            for m in &manifests {
                let b64 = m.1.clone();
                let b3 = match B3Digest::from_str(&b64) {
                    Ok(b3) => b3,
                    Err(e) => {
                        eprintln!("Invalid hash recorded for {:?}: {}", m, e);
                        continue;
                    }
                };
                if let Err(e) = ctx.ensure_blob(&b3).await {
                    eprintln!("Couldn't provision the blob for {:?}: {}", m, e);
                    continue;
                };
                let json = {
                    let mut reader = match ctx.blob_service.open_read(&b3).await {
                        Ok(Some(reader)) => reader,
                        Ok(None) => {
                            eprintln!("Blob doesn't exist after ensure_blob: {:?}", m);
                            continue;
                        }
                        Err(e) => {
                            eprintln!("Couldn't query the blob for {:?}: {}", m, e);
                            continue;
                        }
                    };
                    let mut json = String::new();
                    match reader.read_to_string(&mut json).await {
                        Ok(_) => (),
                        Err(e) => {
                            eprintln!("Couldn't read blob {:?}: {:?}", m, e);
                            continue;
                        }
                    };
                    json
                };
                let parsed: CudaJsonManifest = match serde_json::from_str(&json) {
                    Ok(m) => m,
                    Err(e) => {
                        eprintln!("Couldn't parse JSON for {:?}: {:?}", m, e);
                        continue;
                    }
                };
                {
                    let mut lock = ctx.con.lock().unwrap();
                    let tx = lock.transaction().unwrap();
                    {
                        let mut add_str = tx
                            .prepare_cached(include_str!("q/add-str.sql"))
                            .context("q/add-str.sql")
                            .unwrap();
                        let mut add_hash = tx
                            .prepare_cached(include_str!("q/upsert-blob.sql"))
                            .context("q/upsert-blob.sql")
                            .unwrap();
                        let mut add_manifest = tx
                            .prepare_cached(include_str!("q/add-cuda-manifest.sql"))
                            .context("q/add-cuda-manifest.sql")
                            .unwrap();
                        let mut add_comp = tx
                            .prepare_cached(include_str!("q/add-cuda-artifact.sql"))
                            .context("q/add-cuda-artifact.sql")
                            .unwrap();

                        add_hash.execute(params![b64, None::<usize>]).unwrap();
                        for s in vec![
                            &parsed.release_date,
                            &parsed.release_label,
                            &parsed.release_product,
                        ] {
                            add_str.execute((s,)).unwrap();
                        }
                        add_manifest
                            .execute(named_params! {
                                ":hash": b64,
                                ":release_date": parsed.release_date,
                                ":release_label": parsed.release_label,
                                ":release_product": parsed.release_product,
                            })
                            .context("Executing q/add-cuda-manifest.sql")
                            .unwrap();

                        for (pname, pkg) in parsed.by_pname {
                            for (platform, maybe_tag, comp) in pkg.artifacts.into_iter() {
                                let ps = named_params! {
                                    ":manifest": b64,
                                    ":name": pkg.name,
                                    ":pname": pname,
                                    ":license_name": pkg.license,
                                    ":license_path": pkg.license_path,
                                    ":version": pkg.version,
                                    ":sha256": comp.sha256,
                                    ":md5": comp.md5,
                                    ":platform": platform,
                                    ":relative_path": comp.relative_path,
                                    ":n_bytes": comp.size,
                                    ":compat_tag": maybe_tag
                                };
                                for h in &vec![Some(&comp.sha256), comp.md5.as_ref()] {
                                    add_hash.execute(params![h, None::<usize>]).unwrap();
                                }
                                for s in &vec![
                                    Some(&pname),
                                    pkg.name.as_ref(),
                                    Some(&pkg.license),
                                    pkg.license_path.as_ref(),
                                    Some(&pkg.version),
                                    Some(&platform.to_string()),
                                    Some(&comp.relative_path),
                                    maybe_tag.as_ref(),
                                ] {
                                    add_str.execute(params![s]).unwrap();
                                }
                                add_comp
                                    .execute(ps)
                                    .context("Executing q/add-cuda-artifact.sql")
                                    .unwrap();
                            }
                        }
                    }
                    tx.commit()
                        .expect("Couldn't commit transaction adding manifest or its component");
                }
            }
        }
        None => {}
    }
}