From c8b8b564569965858d2fd2bf9824ea2d5a49d932 Mon Sep 17 00:00:00 2001
From: SomeoneSerge
Date: Fri, 9 May 2025 04:01:38 +0000
Subject: [PATCH] cuda: support cudnn manifests

Also changes the schemas a bit (still don't care about migrations),
switches to named_params!, fixes up perf issues when ingesting manifests
and/or hrefs into sqlite, etc.

Adds sample queries such as "CudaArtifact conflicts" to the datasette
config, which explain some of the issues associated with choosing an
evalModules schema on the cudaPackages side.
---
 Cargo.lock                    |   1 +
 Cargo.toml                    |   1 +
 README.md                     |   9 +-
 default.nix                   |  97 ++++++-
 shell.nix                     |   4 +
 src/main.rs                   | 509 ++++++++++++++++++++++++++++++++--
 src/q/add-cuda-artifact.sql   |  17 ++
 src/q/add-cuda-manifest.sql   |   9 +
 src/q/add-sample.sql          |  26 +-
 src/q/add-uri-ref.sql         |   7 +
 src/q/cuda-init.sql           |  49 ++++
 src/q/find-cuda-manifests.sql |  15 +
 src/q/sidx-init.sql           |  35 +--
 src/q/uris-of-hash.sql        |  16 ++
 static/some.css               |  48 ++++
 15 files changed, 770 insertions(+), 73 deletions(-)
 create mode 100644 src/q/add-cuda-artifact.sql
 create mode 100644 src/q/add-cuda-manifest.sql
 create mode 100644 src/q/add-uri-ref.sql
 create mode 100644 src/q/cuda-init.sql
 create mode 100644 src/q/find-cuda-manifests.sql
 create mode 100644 src/q/uris-of-hash.sql
 create mode 100644 static/some.css

diff --git a/Cargo.lock b/Cargo.lock
index dd4233c..107eccf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2695,6 +2695,7 @@ dependencies = [
  "reqwest",
  "rusqlite",
  "scraper",
+ "serde",
  "serde_json",
  "snix-castore",
  "tokio",
diff --git a/Cargo.toml b/Cargo.toml
index f8e09cf..a75186e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ futures = "0.3.31"
 reqwest = "0.12.15"
 rusqlite = "0.34.0"
 scraper = "0.23.1"
+serde = "1.0.219"
 serde_json = "1.0.140"
 snix-castore = { version = "0.1.0", git = "https://git.snix.dev/snix/snix.git" }
 tokio = "1.44.2"
diff --git a/README.md b/README.md
index 073d99a..1e072bb 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
-sidx
+[sidx](https://forge.someonex.net/else/sidx)
 ===
 
+Work in Progress.
 
 Indexing archives and build outputs.
@@ -17,4 +18,8 @@ Roadmap
 
 Approach
 ---
 
-Vapourware and means to an end.
+Vapourware and means to an end:
+[this project](https://forge.someonex.net/else/sidx) was originally motivated by the needs of maintaining
+`cudaPackages` in Nixpkgs.
+Specifically, it attempts to answer the question of "what is there to be maintained",
+and to improve [observability and debuggability of the package set (cf. demo)](https://cuda-index.someonex.net/sidx/UriReference).
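Context for the `named_params!` switch mentioned in the commit message: with
positional `params![...]`, bindings pair with the `?` placeholders by position
alone, so adding a column (as this patch does with `content_type`) can silently
shift every later binding. A minimal sketch of the difference, assuming nothing
beyond rusqlite itself (the `sample` table here is made up for illustration):

    use rusqlite::{named_params, params, Connection};

    fn main() -> rusqlite::Result<()> {
        let con = Connection::open_in_memory()?;
        con.execute_batch("CREATE TABLE sample(uri TEXT, http_code INTEGER);")?;

        // Positional: correctness depends on argument order matching the `?`s.
        con.execute(
            "INSERT INTO sample(uri, http_code) VALUES (?1, ?2)",
            params!["https://example.com", 200],
        )?;

        // Named: every binding is tied to its placeholder, so a statement can
        // grow new columns without silently shifting the remaining bindings.
        let mut stmt = con
            .prepare_cached("INSERT INTO sample(uri, http_code) VALUES (:uri, :http_code)")?;
        stmt.execute(named_params! {":uri": "https://example.com", ":http_code": 200})?;
        Ok(())
    }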
diff --git a/default.nix b/default.nix
index a822a80..01ecf46 100644
--- a/default.nix
+++ b/default.nix
@@ -50,25 +50,110 @@ lib.makeScope pkgs.newScope (
     datasette-wrapped = self.callPackage (
       {
         datasette,
-        datasette-metadata,
+        datasette-assets,
         makeWrapper,
         runCommand,
       }:
-      runCommand "datasettew" { nativeBuildInputs = [ makeWrapper ]; } ''
-        mkdir -p "$out/bin"
-        makeWrapper ${lib.getExe datasette} "$out/bin/datasettew" \
-          --add-flags --metadata=${datasette-metadata}
-      ''
+      runCommand "datasettew"
+        {
+          nativeBuildInputs = [ makeWrapper ];
+          preferLocalBuild = true;
+          allowSubstitutes = false;
+        }
+        ''
+          mkdir -p "$out/bin"
+          makeWrapper ${lib.getExe datasette} "$out/bin/datasettew" \
+            --append-flags --metadata=${datasette-assets}/metadata.json \
+            --append-flags --static=static:${datasette-assets}/static
+        ''
     ) { };
+    datasette-assets = self.callPackage (
+      {
+        runCommand,
+        datasette-metadata,
+        datasette-settings,
+      }:
+      runCommand "datasette-assets"
+        {
+          preferLocalBuild = true;
+          allowSubstitutes = false;
+        }
+        ''
+          mkdir "$out"
+          cp --no-preserve=mode -r ${./static} "$out"/static
+          cp ${datasette-metadata} "$out"/metadata.json
+          cp ${datasette-settings} "$out"/settings.json
+        ''
+    ) { };
+    datasette-settings = self.callPackage (
+      { formats }:
+      (formats.json { }).generate "datasette-settings.json" {
+        sql_time_limit_ms = 8000;
+      }
+    ) { };
     datasette-metadata = self.callPackage (
       { formats }:
       (formats.json { }).generate "datasette-metadata.json" {
+        title = "CUDA INDEX";
+        description_html = ''

+            Visualizing the contents of Nixpkgs' cudaPackages.
+            Generated via an ad-hoc indexing tool.

+ ''; + "extra_css_urls" = [ + "/static/some.css" + ]; "databases" = { "sidx" = { "tables" = { "Hash" = { "label_column" = "hash"; }; + "CudaArtifact" = { + facets = [ + "pname" + "platform" + ]; + }; + }; + queries.cuda_conflicts = { + title = "CudaArtifact Conflicts"; + description_html = '' + CudaArtifacts (identified by sha256) + claiming the same (pname, version, platform) triple + ''; + sql = '' + SELECT + COUNT(DISTINCT sha256) AS conflicts, + pname.str AS pname, + ver.str AS ver, + plat.str AS plat, + GROUP_CONCAT(name.str, char(10)) AS name, + GROUP_CONCAT(tag.str, char(10)) AS tag, + GROUP_CONCAT(h.hash, char(10)) AS sha256 + FROM + ( + CudaArtifact AS cc, + Str AS name, + Str AS pname, + Str as ver, + Str as plat, + Hash as h + ON cc.name=name.id + AND cc.pname=pname.id + AND cc.version = ver.id + AND cc.platform = plat.id + AND cc.sha256 = h.id + ) + LEFT JOIN Str AS tag + ON + cc.compat_tag=tag.id + GROUP BY + cc.pname, cc.version, cc.platform + HAVING + conflicts >= CAST(:min_conflicts AS INTEGER) + ORDER BY conflicts DESC + ''; }; }; }; diff --git a/shell.nix b/shell.nix index c168f3f..50c74c7 100644 --- a/shell.nix +++ b/shell.nix @@ -9,11 +9,13 @@ openssl ? pkgs.openssl, rust-analyzer ? pkgs.rust-analyzer, rustc ? pkgs.rustc, + rustfmt ? pkgs.rustfmt, cargo ? pkgs.cargo, pkg-config ? pkgs.pkg-config, crate2nix ? pkgs.crate2nix, protobuf ? pkgs.protobuf, datasette-wrapped ? self.datasette-wrapped, + datasette-assets ? self.datasette-assets, ... }: mkShell { @@ -23,6 +25,7 @@ mkShell { cargo crate2nix rustc + rustfmt rust-analyzer pkg-config protobuf @@ -32,6 +35,7 @@ mkShell { openssl sqlite ]; + DATASETTE_ASSETS = datasette-assets; # uploaded to cuda-index.someonex.net in bulk... shellHook = '' export DATABASE_PATH="$HOME/.local/share/sidx/sidx.db" unset out outputs phases diff --git a/src/main.rs b/src/main.rs index 43f7ee2..ec3a3e5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,23 +1,28 @@ -use std::collections::HashSet; -use std::path::{absolute, PathBuf}; +use std::collections::{HashMap, HashSet}; +use std::marker::PhantomData; +use std::path::{PathBuf, absolute}; use std::str::FromStr; use std::sync::Arc; +use std::{fmt, io}; use anyhow::Context; -use anyhow::{anyhow, Error}; +use anyhow::{Error, anyhow}; use clap::Parser; use clap::Subcommand; -use futures::{stream, StreamExt, TryStreamExt}; -use rusqlite::{params, OptionalExtension}; +use futures::{StreamExt, TryStreamExt, stream}; +use rusqlite::fallible_iterator::FallibleIterator as _; +use rusqlite::{OptionalExtension, named_params, params}; use scraper::{Html, Selector}; +use serde::de::{self, Visitor}; +use serde::{Deserialize, Serialize}; +use snix_castore::B3Digest; use snix_castore::blobservice::BlobService; use snix_castore::directoryservice::DirectoryService; -use snix_castore::B3Digest; use snix_castore::{blobservice, directoryservice, import::fs::ingest_path}; use std::sync::Mutex; use tokio::io::{AsyncReadExt, BufReader}; -use tokio::sync::mpsc::{channel, Sender}; use tokio::sync::Semaphore; +use tokio::sync::mpsc::{Sender, channel}; use tokio_stream::wrappers::ReceiverStream; use url::Url; @@ -130,6 +135,12 @@ enum Command { #[clap(value_parser, num_args = 1)] url: Vec, }, + DemoCudaManifest, + FormatCudaManifest, + ProcessCudaManifests { + #[clap(short, long, action)] + include_finished: bool, + }, } #[derive(Parser)] @@ -175,6 +186,10 @@ async fn open_context( } let con = rusqlite::Connection::open(&db_path).expect("Failed to construct Database object"); + con.pragma_update(None, "jorunal_mode", 
"wal").unwrap(); + con.pragma_update(None, "synchronous", "normal").unwrap(); + con.pragma_update(None, "temp_store", "memory").unwrap(); + con.pragma_update(None, "foreign_keys", "on").unwrap(); con.execute_batch(include_str!("q/sidx-init.sql")) .expect("Failed to execute sidx-init.sql"); let castore_path = absolute(castore_path).expect("Failed to canonicalize castore_path"); @@ -190,12 +205,12 @@ async fn open_context( let dir_service = directoryservice::from_addr(&std::format!( "objectstore+file://{}", castore_path - .join("directory") + .join("directories") .to_str() .expect("Path::to_str unexpectedly broken") )) .await - .expect("Couldn't initialize .castore/directory"); + .expect("Couldn't initialize .castore/directories"); SidxContext::, Arc> { refetch, @@ -208,8 +223,23 @@ async fn open_context( } } +impl Drop for SidxContext +where + BS: BlobService + Clone, + DS: DirectoryService + Clone, +{ + fn drop(&mut self) { + let con = self + .con + .lock() + .expect("Acquiring mutex for sqlite to run #pragma optimize before exit"); + con.pragma_update(None, "analysis_limit", 500).unwrap(); + con.pragma_query(None, "optimize", |_| Ok(())).unwrap(); + } +} + impl SidxContext { - async fn db_latest_download(&self, uri: &str) -> Result, Error> { + async fn latest_sample(&self, uri: &str) -> Result, Error> { let lock = self.con.lock().unwrap(); let mut find_sample = lock .prepare_cached(include_str!("q/latest-download.sql")) @@ -239,15 +269,22 @@ impl SidxContext &self, uri: &str, hash: &Option, - http_code: Option, + http_code: &Option, + content_type: &Option, ) -> Result<(u32, u32), Error> { - let lock = self.con.lock().expect("Couldn't lock mutex"); + let lock = self.con.lock().expect("Locking mutex for db_add_sample"); let mut add_sample = lock .prepare_cached(include_str!("q/add-sample.sql")) .context("Failed to prepare add-sample.sql")?; - Ok(add_sample.query_row(params![uri, hash, http_code], |row| { - <(u32, u32)>::try_from(row) - })?) + Ok(add_sample.query_row( + named_params! { + ":uri": uri, + ":hash": hash, + ":http_code": http_code, + ":content_type": content_type + }, + |row| <(u32, u32)>::try_from(row), + )?) 
     }
     async fn db_add_blob(&self, hash: &str, n_bytes: u64) -> Result {
         let lock = self.con.lock().expect("db_add_blob: couldn't lock mutex?");
@@ -269,6 +306,7 @@ impl SidxContext
         uri: &str,
         blob: &Option<SizedBlob>,
         http_code: Option<u16>,
+        content_type: Option<String>,
     ) -> Result<Sampled, Error> {
         let digest64 = if let Some(SizedBlob { hash, n_bytes }) = blob {
             let digest64 = format!("{}", hash);
@@ -279,7 +317,7 @@ impl SidxContext
         };
         self.db_add_uri(&uri).await?;
         let (sample_id, epoch) = self
-            .db_add_sample(&uri, &digest64, http_code.clone())
+            .db_add_sample(&uri, &digest64, &http_code, &content_type)
             .await?;
         Ok(Sampled {
             sample_id,
@@ -290,7 +328,7 @@ impl SidxContext
             when: SampledWhen::Now,
         })
     }
-    async fn download_no_cache(&self, uri: &Url) -> Result<Sampled, Error> {
+    async fn download(&self, uri: &Url) -> Result<Sampled, Error> {
         let _permit = self.http_semaphore.acquire().await.unwrap();
         eprintln!("Downloading {:?}", uri.to_string());
         let uri_s = uri.to_string();
@@ -303,6 +341,11 @@ impl SidxContext
         let status = res.status();
         let status_code = status.as_u16();
+        let content_type = res
+            .headers()
+            .get(reqwest::header::CONTENT_TYPE)
+            .and_then(|x| x.to_str().ok())
+            .map(|x| x.to_string());
 
         if status.is_success() {
             let mut r = tokio_util::io::StreamReader::new(
@@ -328,20 +371,91 @@ impl SidxContext
                     n_bytes,
                 }),
                 Some(status_code),
+                content_type,
             )
             .await
         } else {
-            self.record_ingested_node(&uri_s, &None, Some(status_code))
+            self.record_ingested_node(&uri_s, &None, Some(status_code), content_type)
                 .await
         }
     }
-    async fn download(&self, uri: &Url) -> Result<Sampled, Error> {
-        if self.refetch {
-            self.download_no_cache(&uri).await
+    async fn ensure_blob(&self, hash: &B3Digest) -> Result<(), Error> {
+        if self
+            .blob_service
+            .has(hash)
+            .await
+            .context("ensure_blob() accessing blob_service")?
+        {
+            Ok(())
         } else {
-            match self.db_latest_download(&uri.to_string()).await? {
-                Some(ingested) => Ok(ingested),
-                None => self.download_no_cache(&uri).await,
+            let b64 = hash.to_string();
+            let uris = {
+                let con = self.con.lock().unwrap();
+                let mut find_uris = con
+                    .prepare_cached(include_str!("q/uris-of-hash.sql"))
+                    .context("Preparing statement: q/uris-of-hash.sql")
+                    .unwrap();
+                find_uris
+                    .query(named_params! {":hash": b64, ":limit": 100})?
+                    .map(|b| b.get(0))
+                    .collect::<Vec<String>>()?
+            };
+            if uris.is_empty() {
+                return Err(anyhow!("No uris recorded for {}", b64));
+            };
+            for uri in uris {
+                let url = match Url::parse(&uri) {
+                    Ok(url) => url,
+                    Err(_) => continue,
+                };
+                match self
+                    .download(&url)
+                    .await
+                    .context("Redownloading missing blob for ensure_blob")
+                {
+                    Ok(Sampled {
+                        sample_id: _,
+                        uri: _,
+                        blob,
+                        http_status: _,
+                        epoch: _,
+                        when: _,
+                    }) => {
+                        if blob.map_or(false, |sb| sb.hash == *hash) {
+                            return Ok(());
+                        } else {
+                            continue;
+                        }
+                    }
+                    Err(_) => {
+                        continue;
+                    }
+                }
+            }
+            Err(anyhow!(
+                "All uris for {} are out of date (result in errors or different hashes)",
+                b64
+            ))
+        }
+    }
+    async fn ensure_sampled_uri(&self, uri: &Url) -> Result<Sampled, Error> {
+        /* TODO: flatten */
+        if self.refetch {
+            self.download(&uri).await
+        } else {
+            /* TODO: Add negative TTL */
+            match self.latest_sample(&uri.to_string()).await? {
+                Some(ingested) => match ingested.blob.clone() {
+                    Some(SizedBlob { hash, n_bytes: _ }) => {
+                        if self.blob_service.has(&hash).await? {
{ + Ok(ingested) + } else { + self.download(&uri).await + } + } + None => self.download(&uri).await, + }, + None => self.download(&uri).await, } } } @@ -352,7 +466,7 @@ impl SidxContext async move { let uri_s = uri.to_string(); - let latest_download = self.db_latest_download(&uri_s).await?; + let latest_download = self.latest_sample(&uri_s).await?; if latest_download.is_some() { return Ok(latest_download); } @@ -369,6 +483,7 @@ impl SidxContext n_bytes: size, }), None, + None, ) .await .map(Some), @@ -385,6 +500,7 @@ impl SidxContext n_bytes: size, }), None, + None, ) .await .map(Some), @@ -393,7 +509,7 @@ impl SidxContext } } } - Ingestable::Url(url) => self.download(url).await.map(Some), + Ingestable::Url(url) => self.ensure_sampled_uri(url).await.map(Some), } } })) @@ -422,7 +538,7 @@ impl SidxContext html_max_bytes: u64, tx: Sender, ) -> Result<(), Error> { - let maybe_root = self.download(&url).await; + let maybe_root = self.ensure_sampled_uri(&url).await; if let Err(ref e) = maybe_root { eprintln!("Couldn't download {}: {:?}", url, e); }; @@ -475,7 +591,7 @@ impl SidxContext let mut stmt = lock.prepare_cached(include_str!("q/add-uri-ref.sql"))?; let digest64 = hash.to_string(); - stmt.execute(params![digest64, next_url.to_string(), href])?; + stmt.execute(named_params! {":source": digest64, ":target": next_url.to_string(), ":why": "href"})?; } }; Ok(()) @@ -547,6 +663,136 @@ impl SidxContext } } +fn string_or_int<'de, T, D>(deserializer: D) -> Result +where + T: Deserialize<'de> + TryFrom + FromStr, + D: serde::Deserializer<'de>, +{ + struct StringOrInt(PhantomData T>); + + impl<'de, T> Visitor<'de> for StringOrInt + where + T: Deserialize<'de> + TryFrom + FromStr, + { + type Value = T; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("string or int") + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + T::try_from(value).map_err(|_e| de::Error::custom("ignored error")) + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + FromStr::from_str(value).map_err(de::Error::custom) + } + } + + deserializer.deserialize_any(StringOrInt(PhantomData)) +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +struct CudaArtifact { + relative_path: String, + sha256: String, + md5: Option, + + // Tha manifests export size as string instead of number + #[serde(deserialize_with = "string_or_int")] + size: i64, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(untagged)] +enum CudaArtifactsByTag { + Single(CudaArtifact), + Many { + #[serde(flatten)] + by_tag: HashMap, + }, +} +impl IntoIterator for CudaArtifactsByTag { + type Item = (Option, CudaArtifact); + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> std::vec::IntoIter { + match self { + CudaArtifactsByTag::Single(art) => vec![(None, art)].into_iter(), + CudaArtifactsByTag::Many { by_tag: by_compat } => by_compat + .iter() + .map(|(k, x)| (Some(k.clone()), x.clone())) + .collect::>() + .into_iter(), + } + } +} +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(untagged)] +enum CudaArtifactsByPlatform { + Binary { + #[serde(flatten)] + by_platform: HashMap, + }, + Source { + source: CudaArtifact, + }, +} + +impl IntoIterator for CudaArtifactsByPlatform { + type Item = (String, Option, CudaArtifact); + + /* TODO: Figure out which is the trait that doesn't involve copying */ + type IntoIter = std::vec::IntoIter<(String, Option, CudaArtifact)>; + + fn into_iter(self) -> Self::IntoIter { + match self { + 
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(untagged)]
+enum CudaArtifactsByTag {
+    Single(CudaArtifact),
+    Many {
+        #[serde(flatten)]
+        by_tag: HashMap<String, CudaArtifact>,
+    },
+}
+impl IntoIterator for CudaArtifactsByTag {
+    type Item = (Option<String>, CudaArtifact);
+    type IntoIter = std::vec::IntoIter<Self::Item>;
+
+    fn into_iter(self) -> std::vec::IntoIter<Self::Item> {
+        match self {
+            CudaArtifactsByTag::Single(art) => vec![(None, art)].into_iter(),
+            CudaArtifactsByTag::Many { by_tag: by_compat } => by_compat
+                .iter()
+                .map(|(k, x)| (Some(k.clone()), x.clone()))
+                .collect::<Vec<_>>()
+                .into_iter(),
+        }
+    }
+}
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(untagged)]
+enum CudaArtifactsByPlatform {
+    Binary {
+        #[serde(flatten)]
+        by_platform: HashMap<String, CudaArtifactsByTag>,
+    },
+    Source {
+        source: CudaArtifact,
+    },
+}
+
+impl IntoIterator for CudaArtifactsByPlatform {
+    type Item = (String, Option<String>, CudaArtifact);
+
+    /* TODO: Figure out which is the trait that doesn't involve copying */
+    type IntoIter = std::vec::IntoIter<(String, Option<String>, CudaArtifact)>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        match self {
+            CudaArtifactsByPlatform::Binary { by_platform } => by_platform
+                .iter()
+                .flat_map(|(platform, by_tag)| {
+                    by_tag
+                        .clone()
+                        .into_iter()
+                        .map(|(tag, artifact)| (platform.clone(), tag.clone(), artifact))
+                })
+                .collect::<Vec<_>>()
+                .into_iter(),
+            CudaArtifactsByPlatform::Source { source } => {
+                (vec![("source".to_string(), None, source)]).into_iter()
+            }
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct CudaJsonPackage {
+    name: Option<String>,
+    license: String,
+    license_path: Option<String>,
+    version: String,
+
+    cuda_variant: Option<Vec<String>>,
+
+    #[serde(flatten)]
+    artifacts: CudaArtifactsByPlatform,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct CudaJsonManifest {
+    release_date: Option<String>,
+    release_label: Option<String>,
+    release_product: Option<String>,
+
+    #[serde(flatten)]
+    by_pname: HashMap<String, CudaJsonPackage>,
+}
+
 #[tokio::main]
 async fn main() {
     let args = Cli::parse();
@@ -603,6 +849,213 @@ async fn main() {
                 println!("{:?}", url);
             }
         }
+        Some(Command::FormatCudaManifest) => {
+            println!(
+                "{}",
+                serde_json::to_string(
+                    &serde_json::from_reader::<_, CudaJsonManifest>(io::stdin()).unwrap()
+                )
+                .unwrap()
+            );
+        }
+        Some(Command::DemoCudaManifest) => {
+            println!(
+                "{}",
+                serde_json::to_string(&CudaJsonManifest {
+                    release_date: Some("1984-01-01".to_string()),
+                    release_label: Some("8.9.x".to_string()),
+                    release_product: Some("cudnn".to_string()),
+                    by_pname: HashMap::from([
+                        (
+                            "cudnn".to_string(),
+                            CudaJsonPackage {
+                                name: Some("cuDNN Library".to_string()),
+                                license: "cudnn".to_string(),
+                                license_path: Some("bar/foo".to_string()),
+                                version: "8.9.7.6".to_string(),
+                                cuda_variant: Some(vec!["11".to_string(), "12".to_string()]),
+                                artifacts: CudaArtifactsByPlatform::Binary {
+                                    by_platform: HashMap::from([(
+                                        "x86_64-linux".to_string(),
+                                        CudaArtifactsByTag::Many {
+                                            by_tag: HashMap::from([(
+                                                "cuda11".to_string(),
+                                                CudaArtifact {
+                                                    relative_path: "kek".to_string(),
+                                                    sha256: "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824".to_string(),
+                                                    md5: Some("5d41402abc4b2a76b9719d911017c592".to_string()),
+                                                    size: 5,
+                                                },
+                                            )]),
+                                        },
+                                    )]),
+                                },
+                            },
+                        ),
+                        (
+                            "cuda_samples".to_string(),
+                            CudaJsonPackage {
+                                name: Some("NVIDIA cuDNN samples".to_string()),
+                                license: "cudnn".to_string(),
+                                license_path: Some("foo/bar".to_string()),
+                                version: "8.9.7.6".to_string(),
+                                cuda_variant: None,
+                                artifacts: CudaArtifactsByPlatform::Source {
+                                    source: CudaArtifact {
+                                        relative_path: "/biba/boba/fifa".to_string(),
+                                        sha256: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855".to_string(),
+                                        md5: Some("d41d8cd98f00b204e9800998ecf8427e".to_string()),
+                                        size: 0,
+                                    },
+                                },
+                            },
+                        ),
+                    ]),
+                })
+                .unwrap()
+            );
+        }
+        Some(Command::ProcessCudaManifests { include_finished }) => {
+            let manifests: Vec<(String, String, Option)> = {
+                let con = ctx.con.lock().unwrap();
+                con.execute_batch(include_str!("q/cuda-init.sql"))
+                    .context("q/cuda-init.sql")
+                    .unwrap();
+                let mut find_manifests = con
+                    .prepare_cached(include_str!("q/find-cuda-manifests.sql"))
+                    .context("q/find-cuda-manifests.sql")
+                    .unwrap();
+                find_manifests
+                    .query(named_params! {":include_finished": include_finished})
{":include_finished": include_finished}) + .context("q/find-cuda-manifests.sql") + .unwrap() + .map(|row| <(String, String, Option)>::try_from(row)) + .collect() + .expect("Casting result of q/find-cuda-manifests.sql") + }; + for m in &manifests { + let b64 = m.1.clone(); + let b3 = match B3Digest::from_str(&b64) { + Ok(b3) => b3, + Err(e) => { + eprintln!("Invalid hash recorded for {:?}: {}", m, e); + continue; + } + }; + if let Err(e) = ctx.ensure_blob(&b3).await { + eprintln!("Couldn't provision the blob for {:?}: {}", m, e); + continue; + }; + let json = { + let mut reader = match ctx.blob_service.open_read(&b3).await { + Ok(Some(reader)) => reader, + Ok(None) => { + eprintln!("Blob doesn't exist after ensure_blob: {:?}", m); + continue; + } + Err(e) => { + eprintln!("Couldn't query the blob for {:?}: {}", m, e); + continue; + } + }; + let mut json = String::new(); + match reader.read_to_string(&mut json).await { + Ok(_) => (), + Err(e) => { + eprintln!("Couldn't read blob {:?}: {:?}", m, e); + continue; + } + }; + json + }; + let parsed: CudaJsonManifest = match serde_json::from_str(&json) { + Ok(m) => m, + Err(e) => { + eprintln!("Couldn't parse JSON for {:?}: {:?}", m, e); + continue; + } + }; + { + let mut lock = ctx.con.lock().unwrap(); + let tx = lock.transaction().unwrap(); + { + let mut add_str = tx + .prepare_cached(include_str!("q/add-str.sql")) + .context("q/add-str.sql") + .unwrap(); + let mut add_hash = tx + .prepare_cached(include_str!("q/upsert-blob.sql")) + .context("q/upsert-blob.sql") + .unwrap(); + let mut add_manifest = tx + .prepare_cached(include_str!("q/add-cuda-manifest.sql")) + .context("q/add-cuda-manifest.sql") + .unwrap(); + let mut add_comp = tx + .prepare_cached(include_str!("q/add-cuda-artifact.sql")) + .context("q/add-cuda-artifact.sql") + .unwrap(); + + add_hash.execute(params![b64, None::]).unwrap(); + for s in vec![ + &parsed.release_date, + &parsed.release_label, + &parsed.release_product, + ] { + add_str.execute((s,)).unwrap(); + } + add_manifest + .execute(named_params! { + ":hash": b64, + ":release_date": parsed.release_date, + ":release_label": parsed.release_label, + ":release_product": parsed.release_product, + }) + .context("Executing q/add-cuda-manifest.sql") + .unwrap(); + + for (pname, pkg) in parsed.by_pname { + for (platform, maybe_tag, comp) in pkg.artifacts.into_iter() { + let ps = named_params! 
{ + ":manifest": b64, + ":name": pkg.name, + ":pname": pname, + ":license_name": pkg.license, + ":license_path": pkg.license_path, + ":version": pkg.version, + ":sha256": comp.sha256, + ":md5": comp.md5, + ":platform": platform, + ":relative_path": comp.relative_path, + ":n_bytes": comp.size, + ":compat_tag": maybe_tag + }; + for h in &vec![Some(&comp.sha256), comp.md5.as_ref()] { + add_hash.execute(params![h, None::]).unwrap(); + } + for s in &vec![ + Some(&pname), + pkg.name.as_ref(), + Some(&pkg.license), + pkg.license_path.as_ref(), + Some(&pkg.version), + Some(&platform.to_string()), + Some(&comp.relative_path), + maybe_tag.as_ref(), + ] { + add_str.execute(params![s]).unwrap(); + } + add_comp + .execute(ps) + .context("Executing q/add-cuda-artifact.sql") + .unwrap(); + } + } + } + tx.commit() + .expect("Couldn't commit transaction adding manifest or its component"); + } + } + } None => {} } } diff --git a/src/q/add-cuda-artifact.sql b/src/q/add-cuda-artifact.sql new file mode 100644 index 0000000..2b860f6 --- /dev/null +++ b/src/q/add-cuda-artifact.sql @@ -0,0 +1,17 @@ +INSERT INTO + CudaArtifact(manifest, sha256, md5, name, pname, license_name, license_path, version, platform, compat_tag, relative_path, n_bytes) +VALUES ( + (SELECT id FROM Hash WHERE hash=:manifest LIMIT 1), + (SELECT id FROM Hash WHERE hash=:sha256 LIMIT 1), + (SELECT id FROM Hash WHERE hash=:md5 LIMIT 1), + (SELECT id FROM Str WHERE str=:name LIMIT 1), + (SELECT id FROM Str WHERE str=:pname LIMIT 1), + (SELECT id FROM Str WHERE str=:license_name LIMIT 1), + (SELECT id FROM Str WHERE str=:license_path LIMIT 1), + (SELECT id FROM Str WHERE str=:version LIMIT 1), + (SELECT id FROM Str WHERE str=:platform LIMIT 1), + (SELECT id FROM Str WHERE str=:compat_tag LIMIT 1), + (SELECT id FROM Str WHERE str=:relative_path LIMIT 1), + :n_bytes +) +ON CONFLICT DO NOTHING diff --git a/src/q/add-cuda-manifest.sql b/src/q/add-cuda-manifest.sql new file mode 100644 index 0000000..02e8b1c --- /dev/null +++ b/src/q/add-cuda-manifest.sql @@ -0,0 +1,9 @@ +INSERT INTO + CudaManifest(id, release_date, release_label, release_product) +VALUES ( + (SELECT id FROM Hash WHERE hash=:hash LIMIT 1), + (SELECT id FROM Str WHERE str=:release_date LIMIT 1), + (SELECT id FROM Str WHERE str=:release_label LIMIT 1), + (SELECT id FROM Str WHERE str=:release_product LIMIT 1) +) +ON CONFLICT DO NOTHING diff --git a/src/q/add-sample.sql b/src/q/add-sample.sql index 2817b13..71131ce 100644 --- a/src/q/add-sample.sql +++ b/src/q/add-sample.sql @@ -1,22 +1,8 @@ -INSERT INTO SidxUriSample(uri, hash, http_code) -VALUES( - ( - SELECT - id - FROM - Str - WHERE - str = ? - LIMIT 1 - ), - ( - SELECT - id - FROM - Hash - WHERE - hash = ? - ), - ? 
+INSERT INTO SidxUriSample(uri, hash, http_code, content_type)
+VALUES (
+    ( SELECT id FROM Str WHERE str = :uri LIMIT 1),
+    ( SELECT id FROM Hash WHERE hash = :hash LIMIT 1 ),
+    :http_code,
+    ( SELECT id FROM Str WHERE str = :content_type LIMIT 1)
 )
 RETURNING id, epoch;
diff --git a/src/q/add-uri-ref.sql b/src/q/add-uri-ref.sql
new file mode 100644
index 0000000..798310a
--- /dev/null
+++ b/src/q/add-uri-ref.sql
@@ -0,0 +1,7 @@
+INSERT INTO UriReference(content, target, why)
+VALUES (
+  (SELECT id FROM Hash WHERE hash=:source LIMIT 1),
+  (SELECT id FROM Str WHERE str=:target LIMIT 1),
+  (SELECT id FROM Str WHERE str=:why LIMIT 1)
+)
+ON CONFLICT DO UPDATE SET why=excluded.why;
diff --git a/src/q/cuda-init.sql b/src/q/cuda-init.sql
new file mode 100644
index 0000000..4c7d29d
--- /dev/null
+++ b/src/q/cuda-init.sql
@@ -0,0 +1,49 @@
+CREATE TABLE IF NOT EXISTS CudaManifest(
+    id INTEGER, /* Blake3/ca-node of the JSON */
+    release_date INTEGER, /* E.g. "2025-03-06" */
+    release_label INTEGER, /* E.g. "12.8.1" */
+    release_product INTEGER, /* E.g. "cuda" */
+    PRIMARY KEY(id),
+    FOREIGN KEY(id) REFERENCES Hash(id),
+    FOREIGN KEY(release_date) REFERENCES Str(id),
+    FOREIGN KEY(release_label) REFERENCES Str(id),
+    FOREIGN KEY(release_product) REFERENCES Str(id)
+) STRICT;
+
+CREATE TABLE IF NOT EXISTS CudaArtifact(
+    manifest INTEGER NOT NULL,
+    name INTEGER, /* E.g. "CUDA NVCC" */
+    pname INTEGER, /* E.g. "cuda_nvcc" */
+    license_name INTEGER, /* E.g. "CUDA Toolkit" */
+    license_path INTEGER, /* E.g. "cuda_cccl/LICENSE.txt" */
+    version INTEGER NOT NULL, /* E.g. "12.8.90" */
+    /* Consider making external */
+    compat_tag INTEGER, /* E.g. "cuda12" in cudnn */
+    sha256 INTEGER,
+    md5 INTEGER,
+    platform INTEGER, /* E.g. "linux-x86_64" */
+    /* E.g. "cuda_cccl/linux-x86_64/cuda_cccl-linux-x86_64-12.8.90-archive.tar.xz" */
+    relative_path INTEGER,
+    n_bytes INTEGER, /* May be a string in the JSON */
+    /* Tempting to have
+     * PRIMARY KEY(manifest, name, platform),
+     * however that's not unique at least because of `compat_tag`,
+     * which might also be `NULL`.
+     */
+    PRIMARY KEY(sha256, manifest),
+    FOREIGN KEY(manifest) REFERENCES CudaManifest(id),
+    FOREIGN KEY(manifest) REFERENCES Hash(id),
+    FOREIGN KEY(name) REFERENCES Str(id),
+    FOREIGN KEY(pname) REFERENCES Str(id),
+    FOREIGN KEY(license_name) REFERENCES Str(id),
+    FOREIGN KEY(license_path) REFERENCES Str(id),
+    FOREIGN KEY(version) REFERENCES Str(id),
+    FOREIGN KEY(compat_tag) REFERENCES Str(id),
+    FOREIGN KEY(sha256) REFERENCES Hash(id),
+    FOREIGN KEY(md5) REFERENCES Hash(id),
+    FOREIGN KEY(platform) REFERENCES Str(id),
+    FOREIGN KEY(relative_path) REFERENCES Str(id)
+) STRICT;
+
+CREATE UNIQUE INDEX IF NOT EXISTS CudaArtifactIdx
+ON CudaArtifact(pname, platform, version, compat_tag, name, manifest);
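On the PRIMARY KEY comment in cuda-init.sql above: the same (pname, platform)
legitimately recurs once per compat_tag (e.g. cudnn's cuda11/cuda12 builds),
and SQLite additionally treats NULLs as distinct in unique constraints, so a
key that includes the nullable compat_tag would not deduplicate rows that omit
the tag. A sketch demonstrating both effects on a toy table:

    use rusqlite::Connection;

    fn main() -> rusqlite::Result<()> {
        let con = Connection::open_in_memory()?;
        con.execute_batch(
            "CREATE TABLE a(pname TEXT, platform TEXT, compat_tag TEXT,
                            UNIQUE(pname, platform, compat_tag));",
        )?;
        // One row per compat_tag is legitimate...
        con.execute("INSERT INTO a VALUES ('cudnn', 'linux-x86_64', 'cuda11')", [])?;
        con.execute("INSERT INTO a VALUES ('cudnn', 'linux-x86_64', 'cuda12')", [])?;
        // ...and NULL compat_tags never collide with each other, so the
        // constraint does not deduplicate rows that omit the tag:
        con.execute("INSERT INTO a VALUES ('cudnn', 'linux-x86_64', NULL)", [])?;
        con.execute("INSERT INTO a VALUES ('cudnn', 'linux-x86_64', NULL)", [])?;
        let n: i64 = con.query_row("SELECT COUNT(*) FROM a", [], |r| r.get(0))?;
        assert_eq!(n, 4); // all four rows made it in
        Ok(())
    }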
diff --git a/src/q/find-cuda-manifests.sql b/src/q/find-cuda-manifests.sql
new file mode 100644
index 0000000..6074454
--- /dev/null
+++ b/src/q/find-cuda-manifests.sql
@@ -0,0 +1,15 @@
+SELECT
+    uri.str AS uri, h.hash, cm.id AS manifest
+FROM
+    SidxUriSample AS s
+    INNER JOIN Str AS uri
+    INNER JOIN (Hash AS h LEFT JOIN CudaManifest AS cm ON h.id=cm.id)
+ON
+    s.uri=uri.id
+    AND s.hash=h.id
+WHERE
+    uri.str LIKE 'https://developer.download.nvidia.com/compute/%.json'
+    AND (:include_finished OR cm.id IS NULL)
+GROUP BY
+    s.hash
+ORDER BY uri.str, s.id DESC;
"href" */ -); -CREATE INDEX IF NOT EXISTS UriReferenceIdx -ON UriReference(target, content); +CREATE TABLE IF NOT EXISTS "UriReference" ( + "id" INTEGER, + "content" INTEGER NOT NULL, + "target" INTEGER NOT NULL, + "why" INTEGER, + PRIMARY KEY("id"), + CONSTRAINT "NoDupRefs" UNIQUE("content","target","why"), + FOREIGN KEY("content") REFERENCES "Hash"("id"), + FOREIGN KEY("target") REFERENCES "Str"("id"), + FOREIGN KEY("why") REFERENCES "Str"("id") +) STRICT; diff --git a/src/q/uris-of-hash.sql b/src/q/uris-of-hash.sql new file mode 100644 index 0000000..827fdeb --- /dev/null +++ b/src/q/uris-of-hash.sql @@ -0,0 +1,16 @@ +SELECT + uri.str AS uri +FROM + SidxUriSample AS s + INNER JOIN Str AS uri + INNER JOIN Hash AS h +ON + s.uri=uri.id + AND s.hash=h.id +WHERE + h.hash=:hash +ORDER BY + s.epoch DESC +LIMIT + :limit +; diff --git a/static/some.css b/static/some.css new file mode 100644 index 0000000..5d4ca51 --- /dev/null +++ b/static/some.css @@ -0,0 +1,48 @@ +.index { + font-family: "Source Serif Pro", "Linux Libertine", monospace; +} + +.db-table > h3 { + font-variant: small-caps; +} + +h1, nav { + font-variant: small-caps; + font-family: "Inconsolata", monospace; +} + +h2 { + font-variant: small-caps; +} + +th { + font-variant: small-caps; +} + +header { + background-color: black; +} + +.ft { + background-color: black; +} + +form input[type="submit"] { + background-color: black; +} + +a:link { + color: #404040; +} + +a:visited { + color: darkgrey; +} + +a:hover { + color: black; +} + +.rows-and-columns > tbody > tr:nth-child(even) { + background-color: #F5F5F5; +}