diff --git a/Cargo.lock b/Cargo.lock index ee023c5..107eccf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -342,6 +342,12 @@ version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.10.1" @@ -512,6 +518,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.100", +] + [[package]] name = "darling" version = "0.20.11" @@ -563,6 +592,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_more" +version = "0.99.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "digest" version = "0.10.7" @@ -591,6 +631,27 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "dtoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + [[package]] name = "either" version = "1.15.0" @@ -727,6 +788,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.31" @@ -816,6 +887,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "gcp_auth" version = "0.12.3" @@ -853,6 +933,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width 0.1.14", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -958,6 +1047,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "html5ever" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c" +dependencies = [ + "log", + "mac", + "markup5ever", + "match_token", +] + [[package]] name = "http" version = "1.3.1" @@ -1429,6 +1530,37 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "match_token" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1509,6 +1641,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1685,6 +1823,58 @@ dependencies = [ "indexmap 2.9.0", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + 
"phf_shared", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn 2.0.100", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.10" @@ -1744,6 +1934,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettyplease" version = "0.2.32" @@ -2300,6 +2496,21 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527e65d9d888567588db4c12da1087598d0f6f8b346cc2c5abc91f05fc2dffe2" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "precomputed-hash", + "selectors", + "tendril", +] + [[package]] name = "security-framework" version = "2.11.1" @@ -2336,6 +2547,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" +dependencies = [ + "bitflags 2.9.0", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "serde" version = 
"1.0.219" @@ -2431,6 +2661,15 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "servo_arc" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2455,8 +2694,12 @@ dependencies = [ "futures", "reqwest", "rusqlite", + "scraper", + "serde", + "serde_json", "snix-castore", "tokio", + "tokio-stream", "tokio-util", "url", ] @@ -2470,6 +2713,12 @@ dependencies = [ "libc", ] +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.9" @@ -2588,6 +2837,31 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" @@ -2676,6 +2950,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + 
[[package]] name = "thiserror" version = "1.0.69" @@ -3129,6 +3414,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf16_iter" version = "1.0.5" diff --git a/Cargo.nix b/Cargo.nix index 22eb2d2..4cf77bc 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -1104,6 +1104,19 @@ rec { }; resolvedDefaultFeatures = [ "default" ]; }; + "byteorder" = rec { + crateName = "byteorder"; + version = "1.5.0"; + edition = "2021"; + sha256 = "0jzncxyf404mwqdbspihyzpkndfgda450l0893pz5xj685cg5l0z"; + authors = [ + "Andrew Gallant " + ]; + features = { + "default" = [ "std" ]; + }; + resolvedDefaultFeatures = [ "default" "std" ]; + }; "bytes" = rec { crateName = "bytes"; version = "1.10.1"; @@ -1588,6 +1601,65 @@ rec { "rand_core" = [ "dep:rand_core" ]; }; }; + "cssparser" = rec { + crateName = "cssparser"; + version = "0.34.0"; + edition = "2018"; + sha256 = "1qx3hha392szcl812l6hp0d4029gg8x62cl4nf0byqgdv0f6vimp"; + authors = [ + "Simon Sapin " + ]; + dependencies = [ + { + name = "cssparser-macros"; + packageId = "cssparser-macros"; + } + { + name = "dtoa-short"; + packageId = "dtoa-short"; + } + { + name = "itoa"; + packageId = "itoa"; + } + { + name = "phf"; + packageId = "phf"; + features = [ "macros" ]; + } + { + name = "smallvec"; + packageId = "smallvec"; + } + ]; + features = { + "serde" = [ "dep:serde" ]; + }; + }; + "cssparser-macros" = rec { + crateName = "cssparser-macros"; + version = "0.6.1"; + edition = "2018"; + sha256 = "0cfkzj60avrnskdmaf7f8zw6pp3di4ylplk455zrzaf19ax8id8k"; + procMacro = true; + libName = "cssparser_macros"; + libPath = "lib.rs"; + authors = [ + "Simon Sapin " + ]; + dependencies = [ + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + features = [ "full" "extra-traits" ]; + } + ]; + + }; "darling" = rec { 
crateName = "darling"; version = "0.20.11"; @@ -1731,6 +1803,49 @@ rec { }; resolvedDefaultFeatures = [ "alloc" "powerfmt" "serde" "std" ]; }; + "derive_more" = rec { + crateName = "derive_more"; + version = "0.99.19"; + edition = "2018"; + sha256 = "17y6g78dg31fsv7z4p455bzxs670spg476ww2ibg3mj3vww9m8ix"; + procMacro = true; + authors = [ + "Jelte Fennema " + ]; + dependencies = [ + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + } + ]; + features = { + "convert_case" = [ "dep:convert_case" ]; + "default" = [ "add_assign" "add" "as_mut" "as_ref" "constructor" "deref" "deref_mut" "display" "error" "from" "from_str" "index" "index_mut" "into" "into_iterator" "iterator" "mul_assign" "mul" "not" "sum" "try_into" "is_variant" "unwrap" ]; + "display" = [ "syn/extra-traits" ]; + "error" = [ "syn/extra-traits" ]; + "from" = [ "syn/extra-traits" ]; + "generate-parsing-rs" = [ "peg" ]; + "into" = [ "syn/extra-traits" ]; + "is_variant" = [ "convert_case" ]; + "mul" = [ "syn/extra-traits" ]; + "mul_assign" = [ "syn/extra-traits" ]; + "not" = [ "syn/extra-traits" ]; + "peg" = [ "dep:peg" ]; + "rustc_version" = [ "dep:rustc_version" ]; + "testing-helpers" = [ "rustc_version" ]; + "try_into" = [ "syn/extra-traits" ]; + "unwrap" = [ "convert_case" "rustc_version" ]; + }; + resolvedDefaultFeatures = [ "add" "add_assign" ]; + }; "digest" = rec { crateName = "digest"; version = "0.10.7"; @@ -1810,6 +1925,49 @@ rec { features = { }; }; + "dtoa" = rec { + crateName = "dtoa"; + version = "1.0.10"; + edition = "2018"; + sha256 = "016gid01rarcdv57h049d7nr9daxc2hc2gqzx0mji57krywd7bfn"; + authors = [ + "David Tolnay " + ]; + features = { + "no-panic" = [ "dep:no-panic" ]; + }; + }; + "dtoa-short" = rec { + crateName = "dtoa-short"; + version = "0.3.5"; + edition = "2015"; + sha256 = "11rwnkgql5jilsmwxpx6hjzkgyrbdmx1d71s0jyrjqm5nski25fd"; + libName = "dtoa_short"; + authors = [ + 
"Xidorn Quan " + ]; + dependencies = [ + { + name = "dtoa"; + packageId = "dtoa"; + } + ]; + + }; + "ego-tree" = rec { + crateName = "ego-tree"; + version = "0.10.0"; + edition = "2021"; + sha256 = "1n2csy99chk5v5vzjl0ff79vxpxhl76xmcb3aj6brrzzipmjz5xj"; + libName = "ego_tree"; + authors = [ + "June McEnroe " + "Carlo Federico Vescovo " + ]; + features = { + "serde" = [ "dep:serde" ]; + }; + }; "either" = rec { crateName = "either"; version = "1.15.0"; @@ -2155,6 +2313,26 @@ rec { }; resolvedDefaultFeatures = [ "alloc" "default" "std" ]; }; + "futf" = rec { + crateName = "futf"; + version = "0.1.5"; + edition = "2015"; + sha256 = "0hvqk2r7v4fnc34hvc3vkri89gn52d5m9ihygmwn75l1hhp0whnz"; + authors = [ + "Keegan McAllister " + ]; + dependencies = [ + { + name = "mac"; + packageId = "mac"; + } + { + name = "new_debug_unreachable"; + packageId = "new_debug_unreachable"; + } + ]; + + }; "futures" = rec { crateName = "futures"; version = "0.3.31"; @@ -2433,6 +2611,23 @@ rec { }; resolvedDefaultFeatures = [ "alloc" "async-await" "async-await-macro" "channel" "default" "futures-channel" "futures-io" "futures-macro" "futures-sink" "io" "memchr" "sink" "slab" "std" ]; }; + "fxhash" = rec { + crateName = "fxhash"; + version = "0.2.1"; + edition = "2015"; + sha256 = "037mb9ichariqi45xm6mz0b11pa92gj38ba0409z3iz239sns6y3"; + libPath = "lib.rs"; + authors = [ + "cbreeden " + ]; + dependencies = [ + { + name = "byteorder"; + packageId = "byteorder"; + } + ]; + + }; "gcp_auth" = rec { crateName = "gcp_auth"; version = "0.12.3"; @@ -2565,6 +2760,26 @@ rec { }; resolvedDefaultFeatures = [ "more_lengths" ]; }; + "getopts" = rec { + crateName = "getopts"; + version = "0.2.21"; + edition = "2015"; + sha256 = "1mgb3qvivi26gs6ihqqhh8iyhp3vgxri6vwyrwg28w0xqzavznql"; + authors = [ + "The Rust Project Developers" + ]; + dependencies = [ + { + name = "unicode-width"; + packageId = "unicode-width 0.1.14"; + } + ]; + features = { + "core" = [ "dep:core" ]; + "rustc-dep-of-std" = [ 
"unicode-width/rustc-dep-of-std" "std" "core" ]; + "std" = [ "dep:std" ]; + }; + }; "getrandom 0.2.15" = rec { crateName = "getrandom"; version = "0.2.15"; @@ -2938,6 +3153,35 @@ rec { ]; }; + "html5ever" = rec { + crateName = "html5ever"; + version = "0.29.1"; + edition = "2021"; + sha256 = "07518h5gbw0c6x7w5br76bgxvgphs6zlrb4q7ii7bg1ww7510x1v"; + authors = [ + "The html5ever Project Developers" + ]; + dependencies = [ + { + name = "log"; + packageId = "log"; + } + { + name = "mac"; + packageId = "mac"; + } + { + name = "markup5ever"; + packageId = "markup5ever"; + } + { + name = "match_token"; + packageId = "match_token"; + } + ]; + features = { + }; + }; "http" = rec { crateName = "http"; version = "1.3.1"; @@ -4471,6 +4715,78 @@ rec { }; resolvedDefaultFeatures = [ "std" ]; }; + "mac" = rec { + crateName = "mac"; + version = "0.1.1"; + edition = "2015"; + sha256 = "194vc7vrshqff72rl56f9xgb0cazyl4jda7qsv31m5l6xx7hq7n4"; + authors = [ + "Jonathan Reem " + ]; + + }; + "markup5ever" = rec { + crateName = "markup5ever"; + version = "0.14.1"; + edition = "2021"; + sha256 = "063sdq7hwxn2al9ygify8dd96mj57n9c4lig007lr1p128yj39y7"; + libPath = "lib.rs"; + authors = [ + "The html5ever Project Developers" + ]; + dependencies = [ + { + name = "log"; + packageId = "log"; + } + { + name = "phf"; + packageId = "phf"; + } + { + name = "string_cache"; + packageId = "string_cache"; + } + { + name = "tendril"; + packageId = "tendril"; + } + ]; + buildDependencies = [ + { + name = "phf_codegen"; + packageId = "phf_codegen"; + } + { + name = "string_cache_codegen"; + packageId = "string_cache_codegen"; + } + ]; + + }; + "match_token" = rec { + crateName = "match_token"; + version = "0.1.0"; + edition = "2021"; + sha256 = "0sx3212vkjqfblfhr556ayabbjflbigjf5j591j9kgs4infniac8"; + procMacro = true; + dependencies = [ + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + features = 
[ "full" ]; + } + ]; + + }; "matchers" = rec { crateName = "matchers"; version = "0.1.0"; @@ -4717,6 +5033,18 @@ rec { "vendored" = [ "openssl/vendored" ]; }; }; + "new_debug_unreachable" = rec { + crateName = "new_debug_unreachable"; + version = "1.0.6"; + edition = "2021"; + sha256 = "11phpf1mjxq6khk91yzcbd3ympm78m3ivl7xg6lg2c0lf66fy3k5"; + libName = "debug_unreachable"; + authors = [ + "Matt Brubeck " + "Jonathan Reem " + ]; + + }; "nu-ansi-term" = rec { crateName = "nu-ansi-term"; version = "0.46.0"; @@ -5280,6 +5608,142 @@ rec { "unstable" = [ "generate" ]; }; }; + "phf" = rec { + crateName = "phf"; + version = "0.11.3"; + edition = "2021"; + sha256 = "0y6hxp1d48rx2434wgi5g8j1pr8s5jja29ha2b65435fh057imhz"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_macros"; + packageId = "phf_macros"; + optional = true; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + usesDefaultFeatures = false; + } + ]; + features = { + "default" = [ "std" ]; + "macros" = [ "phf_macros" ]; + "phf_macros" = [ "dep:phf_macros" ]; + "serde" = [ "dep:serde" ]; + "std" = [ "phf_shared/std" ]; + "uncased" = [ "phf_shared/uncased" ]; + "unicase" = [ "phf_macros?/unicase" "phf_shared/unicase" ]; + }; + resolvedDefaultFeatures = [ "default" "macros" "phf_macros" "std" ]; + }; + "phf_codegen" = rec { + crateName = "phf_codegen"; + version = "0.11.3"; + edition = "2021"; + sha256 = "0si1n6zr93kzjs3wah04ikw8z6npsr39jw4dam8yi9czg2609y5f"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_generator"; + packageId = "phf_generator"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + } + ]; + + }; + "phf_generator" = rec { + crateName = "phf_generator"; + version = "0.11.3"; + edition = "2021"; + crateBin = []; + sha256 = "0gc4np7s91ynrgw73s2i7iakhb4lzdv1gcyx7yhlc0n214a2701w"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_shared"; + packageId = "phf_shared"; + usesDefaultFeatures = false; + } + { + 
name = "rand"; + packageId = "rand 0.8.5"; + usesDefaultFeatures = false; + features = [ "small_rng" ]; + } + ]; + features = { + "criterion" = [ "dep:criterion" ]; + }; + }; + "phf_macros" = rec { + crateName = "phf_macros"; + version = "0.11.3"; + edition = "2021"; + sha256 = "05kjfbyb439344rhmlzzw0f9bwk9fp95mmw56zs7yfn1552c0jpq"; + procMacro = true; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_generator"; + packageId = "phf_generator"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + usesDefaultFeatures = false; + } + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + features = [ "full" ]; + } + ]; + features = { + "unicase" = [ "unicase_" "phf_shared/unicase" ]; + "unicase_" = [ "dep:unicase_" ]; + }; + }; + "phf_shared" = rec { + crateName = "phf_shared"; + version = "0.11.3"; + edition = "2021"; + sha256 = "1rallyvh28jqd9i916gk5gk2igdmzlgvv5q0l3xbf3m6y8pbrsk7"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "siphasher"; + packageId = "siphasher"; + } + ]; + features = { + "default" = [ "std" ]; + "uncased" = [ "dep:uncased" ]; + "unicase" = [ "dep:unicase" ]; + }; + resolvedDefaultFeatures = [ "default" "std" ]; + }; "pin-project" = rec { crateName = "pin-project"; version = "1.1.10"; @@ -5397,6 +5861,17 @@ rec { }; resolvedDefaultFeatures = [ "simd" "std" ]; }; + "precomputed-hash" = rec { + crateName = "precomputed-hash"; + version = "0.1.1"; + edition = "2015"; + sha256 = "075k9bfy39jhs53cb2fpb9klfakx2glxnf28zdw08ws6lgpq6lwj"; + libName = "precomputed_hash"; + authors = [ + "Emilio Cobos Álvarez " + ]; + + }; "prettyplease" = rec { crateName = "prettyplease"; version = "0.2.32"; @@ -7485,6 +7960,56 @@ rec { "default" = [ "use_std" ]; }; }; + "scraper" = rec { + crateName = "scraper"; + version = "0.23.1"; + edition = "2021"; + crateBin = []; + sha256 = 
"1qpz5py0a7y9mg2w4v1lidphz3arhw8dl4jcvf47aml8v3cnazjj"; + authors = [ + "June McEnroe " + ]; + dependencies = [ + { + name = "cssparser"; + packageId = "cssparser"; + } + { + name = "ego-tree"; + packageId = "ego-tree"; + } + { + name = "getopts"; + packageId = "getopts"; + optional = true; + } + { + name = "html5ever"; + packageId = "html5ever"; + } + { + name = "precomputed-hash"; + packageId = "precomputed-hash"; + } + { + name = "selectors"; + packageId = "selectors"; + } + { + name = "tendril"; + packageId = "tendril"; + } + ]; + features = { + "default" = [ "main" "errors" ]; + "deterministic" = [ "indexmap" ]; + "getopts" = [ "dep:getopts" ]; + "indexmap" = [ "dep:indexmap" ]; + "main" = [ "getopts" ]; + "serde" = [ "dep:serde" ]; + }; + resolvedDefaultFeatures = [ "default" "errors" "getopts" "main" ]; + }; "security-framework 2.11.1" = rec { crateName = "security-framework"; version = "2.11.1"; @@ -7607,6 +8132,69 @@ rec { }; resolvedDefaultFeatures = [ "OSX_10_10" "OSX_10_11" "OSX_10_12" "OSX_10_9" "default" ]; }; + "selectors" = rec { + crateName = "selectors"; + version = "0.26.0"; + edition = "2021"; + sha256 = "1s3zv30rqgdvil7mnfr4xq5nb9m8yp0sai42l28y565mkd68lmpx"; + libPath = "lib.rs"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "bitflags"; + packageId = "bitflags 2.9.0"; + } + { + name = "cssparser"; + packageId = "cssparser"; + } + { + name = "derive_more"; + packageId = "derive_more"; + usesDefaultFeatures = false; + features = [ "add" "add_assign" ]; + } + { + name = "fxhash"; + packageId = "fxhash"; + } + { + name = "log"; + packageId = "log"; + } + { + name = "new_debug_unreachable"; + packageId = "new_debug_unreachable"; + } + { + name = "phf"; + packageId = "phf"; + } + { + name = "precomputed-hash"; + packageId = "precomputed-hash"; + } + { + name = "servo_arc"; + packageId = "servo_arc"; + } + { + name = "smallvec"; + packageId = "smallvec"; + } + ]; + buildDependencies = [ + { + name = "phf_codegen"; 
+ packageId = "phf_codegen"; + } + ]; + features = { + "to_shmem" = [ "dep:to_shmem" "dep:to_shmem_derive" ]; + }; + }; "serde" = rec { crateName = "serde"; version = "1.0.219"; @@ -7951,6 +8539,26 @@ rec { features = { }; }; + "servo_arc" = rec { + crateName = "servo_arc"; + version = "0.4.0"; + edition = "2021"; + sha256 = "06ljch4isnnbv1xpwhjajz4a4mpc7ki47ys9n9yn98kqjhjc8rdf"; + libPath = "lib.rs"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "stable_deref_trait"; + packageId = "stable_deref_trait"; + } + ]; + features = { + "serde" = [ "dep:serde" ]; + "servo" = [ "serde" "track_alloc_size" ]; + }; + }; "sharded-slab" = rec { crateName = "sharded-slab"; version = "0.1.7"; @@ -8021,6 +8629,14 @@ rec { name = "rusqlite"; packageId = "rusqlite"; } + { + name = "scraper"; + packageId = "scraper"; + } + { + name = "serde_json"; + packageId = "serde_json"; + } { name = "snix-castore"; packageId = "snix-castore"; @@ -8058,6 +8674,23 @@ rec { ]; }; + "siphasher" = rec { + crateName = "siphasher"; + version = "1.0.1"; + edition = "2018"; + sha256 = "17f35782ma3fn6sh21c027kjmd227xyrx06ffi8gw4xzv9yry6an"; + authors = [ + "Frank Denis " + ]; + features = { + "default" = [ "std" ]; + "serde" = [ "dep:serde" ]; + "serde_json" = [ "dep:serde_json" ]; + "serde_no_std" = [ "serde/alloc" ]; + "serde_std" = [ "std" "serde/std" ]; + }; + resolvedDefaultFeatures = [ "default" "std" ]; + }; "slab" = rec { crateName = "slab"; version = "0.4.9"; @@ -8477,7 +9110,75 @@ rec { "default" = [ "std" ]; "std" = [ "alloc" ]; }; - resolvedDefaultFeatures = [ "alloc" ]; + resolvedDefaultFeatures = [ "alloc" "default" "std" ]; + }; + "string_cache" = rec { + crateName = "string_cache"; + version = "0.8.9"; + edition = "2018"; + sha256 = "03z7km2kzlwiv2r2qifq5riv4g8phazwng9wnvs3py3lzainnxxz"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "new_debug_unreachable"; + packageId = "new_debug_unreachable"; + } + { + name = 
"parking_lot"; + packageId = "parking_lot"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + } + { + name = "precomputed-hash"; + packageId = "precomputed-hash"; + } + { + name = "serde"; + packageId = "serde"; + optional = true; + } + ]; + features = { + "default" = [ "serde_support" ]; + "malloc_size_of" = [ "dep:malloc_size_of" ]; + "serde" = [ "dep:serde" ]; + "serde_support" = [ "serde" ]; + }; + resolvedDefaultFeatures = [ "default" "serde" "serde_support" ]; + }; + "string_cache_codegen" = rec { + crateName = "string_cache_codegen"; + version = "0.5.4"; + edition = "2018"; + sha256 = "181ir4d6y053s1kka2idpjx5g9d9jgll6fy517jhzzpi2n3r44f7"; + libPath = "lib.rs"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "phf_generator"; + packageId = "phf_generator"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + } + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + ]; + }; "strsim" = rec { crateName = "strsim"; @@ -8718,6 +9419,35 @@ rec { }; resolvedDefaultFeatures = [ "default" "getrandom" ]; }; + "tendril" = rec { + crateName = "tendril"; + version = "0.4.3"; + edition = "2015"; + sha256 = "1c3vip59sqwxn148i714nmkrvjzbk7105vj0h92s6r64bw614jnj"; + authors = [ + "Keegan McAllister " + "Simon Sapin " + "Chris Morgan " + ]; + dependencies = [ + { + name = "futf"; + packageId = "futf"; + } + { + name = "mac"; + packageId = "mac"; + } + { + name = "utf-8"; + packageId = "utf-8"; + } + ]; + features = { + "encoding" = [ "dep:encoding" ]; + "encoding_rs" = [ "dep:encoding_rs" ]; + }; + }; "thiserror 1.0.69" = rec { crateName = "thiserror"; version = "1.0.69"; @@ -10325,6 +11055,17 @@ rec { }; resolvedDefaultFeatures = [ "default" "std" ]; }; + "utf-8" = rec { + crateName = "utf-8"; + version = "0.7.6"; + edition = "2015"; + sha256 = "1a9ns3fvgird0snjkd3wbdhwd3zdpc2h5gpyybrfr6ra5pkqxk09"; + libName = "utf8"; + authors = [ + "Simon Sapin " + ]; + + }; 
"utf16_iter" = rec { crateName = "utf16_iter"; version = "1.0.5"; diff --git a/Cargo.toml b/Cargo.toml index 9b6b61b..a75186e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,11 @@ clap = "4.5.35" futures = "0.3.31" reqwest = "0.12.15" rusqlite = "0.34.0" +scraper = "0.23.1" +serde = "1.0.219" +serde_json = "1.0.140" snix-castore = { version = "0.1.0", git = "https://git.snix.dev/snix/snix.git" } tokio = "1.44.2" +tokio-stream = "0.1.17" tokio-util = "0.7.14" url = "2.5.4" diff --git a/README.md b/README.md index 073d99a..1e072bb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -sidx +[sidx](https://forge.someonex.net/else/sidx) === +Work in Progress. Indexing archives and build outputs. @@ -17,4 +18,8 @@ Roadmap Approach --- -Vapourware and means to an end. +Vapourware and means to an end: +[this project](https://forge.someonex.net/else/sidx) was originally motivated by the needs of maintaining +`cudaPackages` in Nixpkgs. +Specifically, it attempts to answer the question of "what is there to be maintained", +improve [observability and debug-ability of the package set (cf. demo)](https://cuda-index.someonex.net/sidx/UriReference). 
diff --git a/default.nix b/default.nix index 4ae9a1a..01ecf46 100644 --- a/default.nix +++ b/default.nix @@ -47,5 +47,118 @@ lib.makeScope pkgs.newScope ( } ); sidx = self.sidx-crate2nix.rootCrate.build; + datasette-wrapped = self.callPackage ( + { + datasette, + datasette-assets, + makeWrapper, + runCommand, + }: + runCommand "datasettew" + { + nativeBuildInputs = [ makeWrapper ]; + preferLocalBuild = true; + allowSubstitutes = false; + } + '' + mkdir -p "$out/bin" + makeWrapper ${lib.getExe datasette} "$out/bin/datasettew" \ + --append-flags --metadata=${datasette-assets}/metadata.json \ + --append-flags --static=static:${datasette-assets}/static + '' + ) { }; + datasette-assets = self.callPackage ( + { + runCommand, + datasette-metadata, + datasette-settings, + }: + runCommand "datasette-assets" + { + preferLocalBuild = true; + allowSubstitutes = false; + } + '' + mkdir "$out" + cp --no-preserve=mode -r ${./static} "$out"/static + cp ${datasette-metadata} "$out"/metadata.json + cp ${datasette-settings} "$out"/settings.json + '' + ) { }; + datasette-settings = self.callPackage ( + { formats }: + (formats.json { }).generate "datasette-settings.json" { + sql_time_limit_ms = 8000; + } + ) { }; + datasette-metadata = self.callPackage ( + { formats }: + (formats.json { }).generate "datasette-metadata.json" { + title = "CUDA INDEX"; + description_html = '' +

Visualizing the contents of Nixpkgs' cudaPackages. + Generated via an ad-hoc indexing tool. +

+ ''; + "extra_css_urls" = [ + "/static/some.css" + ]; + "databases" = { + "sidx" = { + "tables" = { + "Hash" = { + "label_column" = "hash"; + }; + "CudaArtifact" = { + facets = [ + "pname" + "platform" + ]; + }; + }; + queries.cuda_conflicts = { + title = "CudaArtifact Conflicts"; + description_html = '' + CudaArtifacts (identified by sha256) + claiming the same (pname, version, platform) triple + ''; + sql = '' + SELECT + COUNT(DISTINCT sha256) AS conflicts, + pname.str AS pname, + ver.str AS ver, + plat.str AS plat, + GROUP_CONCAT(name.str, char(10)) AS name, + GROUP_CONCAT(tag.str, char(10)) AS tag, + GROUP_CONCAT(h.hash, char(10)) AS sha256 + FROM + ( + CudaArtifact AS cc, + Str AS name, + Str AS pname, + Str as ver, + Str as plat, + Hash as h + ON cc.name=name.id + AND cc.pname=pname.id + AND cc.version = ver.id + AND cc.platform = plat.id + AND cc.sha256 = h.id + ) + LEFT JOIN Str AS tag + ON + cc.compat_tag=tag.id + GROUP BY + cc.pname, cc.version, cc.platform + HAVING + conflicts >= CAST(:min_conflicts AS INTEGER) + ORDER BY conflicts DESC + ''; + }; + }; + }; + + } + ) { }; } ) diff --git a/shell.nix b/shell.nix index 0336a60..50c74c7 100644 --- a/shell.nix +++ b/shell.nix @@ -2,17 +2,20 @@ npins ? import ./npins, nixpkgs ? npins.nixpkgs, pkgs ? import nixpkgs { }, + self ? import ./. { inherit nixpkgs pkgs npins; }, lib ? pkgs.lib, mkShell ? pkgs.mkShell, sqlite ? pkgs.sqlite, openssl ? pkgs.openssl, rust-analyzer ? pkgs.rust-analyzer, rustc ? pkgs.rustc, + rustfmt ? pkgs.rustfmt, cargo ? pkgs.cargo, pkg-config ? pkgs.pkg-config, crate2nix ? pkgs.crate2nix, protobuf ? pkgs.protobuf, - datasette ? pkgs.datasette, + datasette-wrapped ? self.datasette-wrapped, + datasette-assets ? self.datasette-assets, ... 
}: mkShell { @@ -22,13 +25,19 @@ mkShell { cargo crate2nix rustc + rustfmt rust-analyzer pkg-config protobuf - datasette + datasette-wrapped ]; buildInputs = [ openssl sqlite ]; + DATASETTE_ASSETS = datasette-assets; # uploaded to cuda-index.someonex.net in bulk... + shellHook = '' + export DATABASE_PATH="$HOME/.local/share/sidx/sidx.db" + unset out outputs phases + ''; } diff --git a/src/main.rs b/src/main.rs index 482aae7..ec3a3e5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,29 @@ -use std::path::{absolute, PathBuf}; +use std::collections::{HashMap, HashSet}; +use std::marker::PhantomData; +use std::path::{PathBuf, absolute}; +use std::str::FromStr; +use std::sync::Arc; +use std::{fmt, io}; -use anyhow::anyhow; use anyhow::Context; +use anyhow::{Error, anyhow}; use clap::Parser; -use futures::{stream, StreamExt, TryStreamExt}; -use rusqlite::{params, OptionalExtension}; +use clap::Subcommand; +use futures::{StreamExt, TryStreamExt, stream}; +use rusqlite::fallible_iterator::FallibleIterator as _; +use rusqlite::{OptionalExtension, named_params, params}; +use scraper::{Html, Selector}; +use serde::de::{self, Visitor}; +use serde::{Deserialize, Serialize}; +use snix_castore::B3Digest; +use snix_castore::blobservice::BlobService; +use snix_castore::directoryservice::DirectoryService; use snix_castore::{blobservice, directoryservice, import::fs::ingest_path}; +use std::sync::Mutex; +use tokio::io::{AsyncReadExt, BufReader}; +use tokio::sync::Semaphore; +use tokio::sync::mpsc::{Sender, channel}; +use tokio_stream::wrappers::ReceiverStream; use url::Url; #[derive(Clone, Debug)] @@ -14,20 +32,33 @@ enum Ingestable { Path(PathBuf), } -#[derive(Debug)] -enum IngestedWhen { +#[derive(Debug, Clone)] +enum SampledWhen { Now, Before, } -#[derive(Debug)] +#[derive(Debug, Clone)] +struct SizedBlob { + hash: B3Digest, + n_bytes: u64, +} + +#[derive(Debug, Clone)] #[allow(dead_code)] -struct Ingested { +struct Sampled { sample_id: u32, uri: String, - blake3: String, + 
blob: Option, + http_status: Option, epoch: u32, - when: IngestedWhen, + when: SampledWhen, +} + +#[derive(Clone)] +enum FetchListingMessage { + Sampled(Url, Sampled), + Recurse(Url, usize, Sender), } impl std::fmt::Display for Ingestable { @@ -44,7 +75,7 @@ impl std::fmt::Display for Ingestable { } } -fn parse_url_or_path(s: &str) -> Result { +fn parse_url_or_path(s: &str) -> Result { if s.is_empty() { Err(anyhow!("Empty path (url)")) } else if s.starts_with("./") || s.starts_with("/") { @@ -68,7 +99,7 @@ fn parse_url_or_path(s: &str) -> Result { fn data_path() -> PathBuf { let xdg_data_dir = std::env::var("XDG_DATA_DIR") .and_then(|s| Ok(PathBuf::from(s))) - .or_else(|_| -> Result { + .or_else(|_| -> Result { match std::env::home_dir() { Some(p) => Ok(p.join(".local/share")), None => Err(anyhow!("...")), // FIXME @@ -86,15 +117,38 @@ fn default_db_path() -> PathBuf { data_path().join("sidx.db") } +#[derive(Subcommand)] +enum Command { + Ingest { + #[clap(value_parser = parse_url_or_path, num_args = 1)] + inputs: Vec, + }, + FetchListing { + #[clap(value_parser, long, default_value_t = 5)] + max_depth: usize, + #[clap(value_parser, long, default_value_t = 1024 * 1024)] + html_max_bytes: u64, + #[clap(value_parser, num_args = 1)] + inputs: Vec, + }, + ParseUrl { + #[clap(value_parser, num_args = 1)] + url: Vec, + }, + DemoCudaManifest, + FormatCudaManifest, + ProcessCudaManifests { + #[clap(short, long, action)] + include_finished: bool, + }, +} + #[derive(Parser)] struct Cli { - #[clap(value_parser = parse_url_or_path, num_args = 1)] - inputs: Vec, - #[clap(short, long, action)] refetch: bool, - #[clap(short, long, value_parser, default_value_t = 5)] + #[clap(short, long, value_parser, default_value_t = 2)] max_parallel: usize, #[clap(short, long, value_parser, default_value_os_t = default_db_path())] @@ -102,22 +156,43 @@ struct Cli { #[clap(short, long, value_parser, default_value_os_t = default_castore_path())] castore_path: PathBuf, + + #[command(subcommand)] 
+ command: Option, } -#[tokio::main] -async fn main() { - let args = Cli::parse(); +struct SidxContext +where + BS: blobservice::BlobService + Clone + Send + 'static, + DS: directoryservice::DirectoryService + Clone + Send + 'static, +{ + refetch: bool, + max_parallel: usize, + http: reqwest::Client, + http_semaphore: Arc, + con: Arc>, + blob_service: BS, + dir_service: DS, +} - args.db_path.parent().and_then(|p| { +async fn open_context( + refetch: bool, + max_parallel: usize, + db_path: PathBuf, + castore_path: PathBuf, +) -> SidxContext, Arc> { + if let Some(p) = db_path.parent() { let _ = std::fs::create_dir_all(p); - Some(()) - }); + } - let con = - rusqlite::Connection::open(&args.db_path).expect("Failed to construct Database object"); - con.execute_batch(include_str!("q/init.sql")) - .expect("Failed to execute init.sql"); - let castore_path = absolute(args.castore_path).expect("Failed to canonicalize castore_path"); + let con = rusqlite::Connection::open(&db_path).expect("Failed to construct Database object"); + con.pragma_update(None, "journal_mode", "wal").unwrap(); + con.pragma_update(None, "synchronous", "normal").unwrap(); + con.pragma_update(None, "temp_store", "memory").unwrap(); + con.pragma_update(None, "foreign_keys", "on").unwrap(); + con.execute_batch(include_str!("q/sidx-init.sql")) + .expect("Failed to execute sidx-init.sql"); + let castore_path = absolute(castore_path).expect("Failed to canonicalize castore_path"); let blob_service = blobservice::from_addr(&std::format!( "objectstore+file://{}", castore_path @@ -130,116 +205,857 @@ async fn main() { let dir_service = directoryservice::from_addr(&std::format!( "objectstore+file://{}", castore_path - .join("directory") + .join("directories") .to_str() .expect("Path::to_str unexpectedly broken") )) .await - .expect("Couldn't initialize .castore/directory"); + .expect("Couldn't initialize .castore/directories"); - let client = reqwest::Client::new(); - let samples = -
stream::iter(args.inputs.iter().map(|uri| { - let client = &client; - let blob_service = &blob_service; - let dir_service = &dir_service; - let con = &con; - let mut find_sample = con - .prepare(include_str!("q/latest-download.sql")) + SidxContext::, Arc> { + refetch, + max_parallel, + http: reqwest::Client::new(), + http_semaphore: Arc::new(Semaphore::new(max_parallel)), + con: Arc::new(Mutex::new(con)), + blob_service, + dir_service, + } +} + +impl Drop for SidxContext +where + BS: BlobService + Clone, + DS: DirectoryService + Clone, +{ + fn drop(&mut self) { + let con = self + .con + .lock() + .expect("Acquiring mutex for sqlite to run #pragma optimize before exit"); + con.pragma_update(None, "analysis_limit", 500).unwrap(); + con.pragma_query(None, "optimize", |_| Ok(())).unwrap(); + } +} + +impl SidxContext { + async fn latest_sample(&self, uri: &str) -> Result, Error> { + let lock = self.con.lock().unwrap(); + let mut find_sample = lock + .prepare_cached(include_str!("q/latest-download.sql")) .expect("Failed to prepare latest-download.sql"); - let mut add_sample = con - .prepare(include_str!("q/add-sample.sql")) - .expect("Failed to prepare add-sample.sql"); - let mut add_blob = con - .prepare(include_str!("q/upsert-blob.sql")) - .expect("Failed to prepare upsert-blob.sql"); - let mut add_uri = con - .prepare(include_str!("q/upsert-uri.sql")) - .expect("Failed to prepare upsert-uri.sql"); + find_sample + .query_row(params![uri], |r| { + <(u32, String, u64, Option, u32)>::try_from(r) + }) + .optional() + .context("db_latest_download.sql") + .and_then(|maybe_tuple| match maybe_tuple { + Some((sample_id, hash, n_bytes, http_code, epoch)) => Ok(Some(Sampled { + sample_id, + uri: uri.to_string(), + blob: Some(SizedBlob { + hash: B3Digest::from_str(&hash)?, + n_bytes, + }), + http_status: http_code, + epoch, + when: SampledWhen::Before, + })), + None => Ok(None), + }) + } + async fn db_add_sample( + &self, + uri: &str, + hash: &Option, + http_code: &Option, + 
content_type: &Option, + ) -> Result<(u32, u32), Error> { + let lock = self.con.lock().expect("Locking mutex for db_add_sample"); + let mut add_sample = lock + .prepare_cached(include_str!("q/add-sample.sql")) + .context("Failed to prepare add-sample.sql")?; + Ok(add_sample.query_row( + named_params! { + ":uri": uri, + ":hash": hash, + ":http_code": http_code, + ":content_type": content_type + }, + |row| <(u32, u32)>::try_from(row), + )?) + } + async fn db_add_blob(&self, hash: &str, n_bytes: u64) -> Result { + let lock = self.con.lock().expect("db_add_blob: couldn't lock mutex?"); + let mut add_blob = lock + .prepare_cached(include_str!("q/upsert-blob.sql")) + .context("Failed to prepare upsert-blob.sql")?; + Ok(add_blob.execute(params![hash, n_bytes,])?) + } + async fn db_add_uri(&self, uri: &str) -> Result { + let lock = self.con.lock().unwrap(); + let mut add_uri = lock + .prepare_cached(include_str!("q/upsert-uri.sql")) + .context("Failed to prepare upsert-uri.sql")?; - async move { - let uri_s = uri.to_string(); - let latest_download = find_sample - .query_row(params![uri_s], |r| <(u32, String, u32)>::try_from(r)) - .optional()?; - if let Some((sample_id, blake3, epoch)) = latest_download { - if !args.refetch { - return Ok::, anyhow::Error>(Some(Ingested { - sample_id, - uri: uri_s, - blake3, - epoch, - when: IngestedWhen::Before, - })); + Ok(add_uri.execute(params![uri])?) 
+ } + async fn record_ingested_node( + &self, + uri: &str, + blob: &Option, + http_code: Option, + content_type: Option, + ) -> Result { + let digest64 = if let Some(SizedBlob { hash, n_bytes }) = blob { + let digest64 = format!("{}", hash); + self.db_add_blob(&digest64, n_bytes.clone()).await?; + Some(digest64) + } else { + None + }; + self.db_add_uri(&uri).await?; + let (sample_id, epoch) = self + .db_add_sample(&uri, &digest64, &http_code, &content_type) + .await?; + Ok(Sampled { + sample_id, + uri: uri.to_string(), + blob: blob.clone(), + http_status: http_code, + epoch, + when: SampledWhen::Now, + }) + } + async fn download(&self, uri: &Url) -> Result { + let _permit = self.http_semaphore.acquire().await.unwrap(); + eprintln!("Downloading {:?}", uri.to_string()); + let uri_s = uri.to_string(); + let res = self + .http + .get(uri.clone()) + .send() + .await + .context(format!("Request::send failed early for {:?}", uri))?; + + let status = res.status(); + let status_code = status.as_u16(); + let content_type = res + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|x| x.to_str().ok()) + .map(|x| x.to_string()); + + if status.is_success() { + let mut r = tokio_util::io::StreamReader::new( + res.bytes_stream().map_err(std::io::Error::other), + ); + let mut w = self.blob_service.open_write().await; + let n_bytes = match tokio::io::copy(&mut r, &mut w).await { + Ok(n) => n, + Err(e) => { + return Err(anyhow!( + "tokio::io::copy failed for uri={} with {}", + uri_s, + e + )); + } + }; + let digest = w.close().await?; + + self.record_ingested_node( + &uri_s, + &Some(SizedBlob { + hash: digest, + n_bytes, + }), + Some(status_code), + content_type, + ) + .await + } else { + self.record_ingested_node(&uri_s, &None, Some(status_code), content_type) + .await + } + } + async fn ensure_blob(&self, hash: &B3Digest) -> Result<(), Error> { + if self + .blob_service + .has(hash) + .await + .context("ensure_has() accessing blob_service")? 
+ { + Ok(()) + } else { + let b64 = hash.to_string(); + let uris = { + let con = self.con.lock().unwrap(); + let mut find_uris = con + .prepare_cached(include_str!("q/uris-of-hash.sql")) + .context("Preparing statement: q/uris-of-hash.sql") + .unwrap(); + find_uris + .query(named_params! {":hash": b64, ":limit": 100})? + .map(|b| b.get(0)) + .collect::>()? + }; + if uris.is_empty() { + return Err(anyhow!("No uris recorded for {}", b64)); + }; + for uri in uris { + let url = match Url::parse(&uri) { + Ok(url) => url, + Err(_) => continue, + }; + match self + .download(&url) + .await + .context("Redownloading missing blob for ensure_hash") + { + Ok(Sampled { + sample_id: _, + uri: _, + blob, + http_status: _, + epoch: _, + when: _, + }) => { + if blob.map_or(false, |sb| sb.hash == *hash) { + return Ok(()); + } else { + continue; + } + } + Err(_) => { + continue; + } } } - let (digest, n_bytes) = match uri { - Ingestable::Path(path) => { - match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None) - .await? - { - snix_castore::Node::Directory { digest, size } => (digest, size), - snix_castore::Node::File { - digest, - size, - executable: _, - } => (digest, size), - snix_castore::Node::Symlink { target: _ } => { - return Err(anyhow!("TODO: Figure out what to do with symlink roots")) + Err(anyhow!( + "All uris for {} are out of date (result in errors or different hashes)", + b64 + )) + } + } + async fn ensure_sampled_uri(&self, uri: &Url) -> Result { + /* TODO: flatten */ + if self.refetch { + self.download(&uri).await + } else { + /* TODO: Add negative TTL */ + match self.latest_sample(&uri.to_string()).await? { + Some(ingested) => match ingested.blob.clone() { + Some(SizedBlob { hash, n_bytes: _ }) => { + if self.blob_service.has(&hash).await? 
{ + Ok(ingested) + } else { + self.download(&uri).await + } + } + None => self.download(&uri).await, + }, + None => self.download(&uri).await, + } + } + } + async fn ingest(&self, inputs: &Vec) -> Vec, Error>> { + let samples = stream::iter(inputs.iter().map(|uri| { + let blob_service = &self.blob_service; + let dir_service = &self.dir_service; + + async move { + let uri_s = uri.to_string(); + let latest_download = self.latest_sample(&uri_s).await?; + if !self.refetch && latest_download.is_some() { + return Ok(latest_download); + } + match uri { + Ingestable::Path(path) => { + match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None) + .await? + { + snix_castore::Node::Directory { digest, size } => self + .record_ingested_node( + &uri_s, + &Some(SizedBlob { + hash: digest, + n_bytes: size, + }), + None, + None, + ) + .await + .map(Some), + + snix_castore::Node::File { + digest, + size, + executable: _, + } => self + .record_ingested_node( + &uri_s, + &Some(SizedBlob { + hash: digest, + n_bytes: size, + }), + None, + None, + ) + .await + .map(Some), + snix_castore::Node::Symlink { target: _ } => { + Err(anyhow!("TODO: Figure out what to do with symlink roots")) + } + } + } + Ingestable::Url(url) => self.ensure_sampled_uri(url).await.map(Some), + } + } + })) + .buffer_unordered(self.max_parallel) + .collect::, _>>>() + .await; + + samples + } + + fn extract_hrefs(content: &str) -> Result, Error> { + let sel = Selector::parse("a").map_err(|e| anyhow!(e.to_string()))?; + let html = Html::parse_document(&content); + + Ok(html + .select(&sel) + .flat_map(|elt| elt.value().attr("href")) + .map(|s| s.to_string()) + .collect::>()) + } + + async fn fetch_from_listing_impl( + self: Arc, + url: Url, + max_depth: usize, + html_max_bytes: u64, + tx: Sender, + ) -> Result<(), Error> { + let maybe_root = self.ensure_sampled_uri(&url).await; + if let Err(ref e) = maybe_root { + eprintln!("Couldn't download {}: {:?}", url, e); + }; + let root = maybe_root?; +
tx.send(FetchListingMessage::Sampled(url.clone(), root.clone())) + .await + .context("Stopped accepting tasks before processing an Ingested notification")?; + if max_depth <= 0 { + return Ok(()); + } + + match root.blob { + None => Err(anyhow!( + "Couldn't download {}. Status code: {:?}", + url, + root.http_status + )), + Some(SizedBlob { hash, n_bytes }) => { + if n_bytes > html_max_bytes { + return Ok(()); + } + match self.blob_service.open_read(&hash).await? { + Some(mut reader) => { + let content = { + let mut br = BufReader::new(&mut *reader); + let mut content = String::new(); + br.read_to_string(&mut content).await?; + content + }; + let hrefs = Self::extract_hrefs(&content).unwrap_or(vec![]); + /* max_depth > 0 here */ + for href in hrefs.clone() { + let next_url = url.join(&href).context("Constructing next_url")?; + tx.send(FetchListingMessage::Recurse( + next_url.clone(), + max_depth - 1, + tx.clone(), + )) + .await + .context("Stopped accepting tasks before finishing all hrefs")?; + } + { + let lock = self.con.lock().expect("Couldn't acquire Mutex?"); + for href in hrefs { + let mut stmt = + lock.prepare_cached(include_str!("q/add-str.sql"))?; + stmt.execute(params!["href"])?; + + let next_url = url.join(&href).context("Constructing next_url")?; + let mut stmt = + lock.prepare_cached(include_str!("q/add-uri-ref.sql"))?; + let digest64 = hash.to_string(); + stmt.execute(named_params! {":source": digest64, ":target": next_url.to_string(), ":why": "href"})?; + } + }; + Ok(()) + } + None => Err(anyhow!("Couldn't read the ingested blob")), + } + } + } + } + + async fn fetch_from_listing( + self: Arc, + url: Url, + max_depth: usize, + html_max_bytes: u64, + ) -> ReceiverStream { + let mq_size = 10; + + /* TODO: move task queue to e.g. 
sqlite */ + let (tx, mut rx) = channel(mq_size); + + let (out_tx, out_rx) = channel(mq_size); + + tokio::spawn({ + async move { + let mut seen: HashSet = HashSet::new(); + { + let tx_moved = tx; + tx_moved + .send(FetchListingMessage::Recurse( + url, + max_depth, + tx_moved.clone(), + )) + .await + .expect("fetch_from_listing failed populating the queue"); + }; + while let Some(m) = rx.recv().await { + match m { + FetchListingMessage::Sampled(_url, ingested) => { + out_tx + .send(ingested) + .await + .expect("ReceiverStream failed to accept an Ingestable"); + } + FetchListingMessage::Recurse(url, max_depth, tx) => { + if max_depth > 0 && !seen.contains(&url.to_string()) { + seen.insert(url.to_string()); + tokio::spawn({ + let s = self.clone(); + let url = url.clone(); + async move { + s.fetch_from_listing_impl( + url, + max_depth, + html_max_bytes, + tx, + ) + .await + } + }); + } } } } - Ingestable::Url(url) => { - let res = client - .get(url.clone()) - .send() - .await - .context(format!("Request.send failed early for {:?}", uri))? 
- .error_for_status()?; - let mut r = tokio_util::io::StreamReader::new( - res.bytes_stream().map_err(std::io::Error::other), - ); - let mut w = blob_service.open_write().await; - let n_bytes = match tokio::io::copy(&mut r, &mut w).await { - Ok(n) => n, - Err(e) => { - return Err(anyhow!( - "tokio::io::copy failed for uri={} with {}", - uri_s, - e - )); - } - }; - let digest = w.close().await?; - (digest, n_bytes) - } - }; - let digest64 = format!("{}", digest); - add_blob.execute(params![digest64, n_bytes,])?; - add_uri.execute(params![uri_s])?; - let (sample_id, epoch) = add_sample - .query_row(params![uri_s, digest64], |row| <(u32, u32)>::try_from(row))?; - Ok(Some(Ingested { - sample_id, - uri: uri_s, - blake3: digest64, - epoch, - when: IngestedWhen::Now, - })) - } - })) - .buffer_unordered(args.max_parallel) - .collect::, _>>>() - .await; - - for s in samples { - match s { - Err(e) => { - println!("Failed to fetch: {}", e); } - Ok(None) => {} - Ok(Some(ingested)) => { - println!("{:?}", ingested) + }); + ReceiverStream::new(out_rx) + } +} + +fn string_or_int<'de, T, D>(deserializer: D) -> Result +where + T: Deserialize<'de> + TryFrom + FromStr, + D: serde::Deserializer<'de>, +{ + struct StringOrInt(PhantomData T>); + + impl<'de, T> Visitor<'de> for StringOrInt + where + T: Deserialize<'de> + TryFrom + FromStr, + { + type Value = T; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("string or int") + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + T::try_from(value).map_err(|_e| de::Error::custom("ignored error")) + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + FromStr::from_str(value).map_err(de::Error::custom) + } + } + + deserializer.deserialize_any(StringOrInt(PhantomData)) +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +struct CudaArtifact { + relative_path: String, + sha256: String, + md5: Option, + + // The manifests export size as string
instead of number + #[serde(deserialize_with = "string_or_int")] + size: i64, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(untagged)] +enum CudaArtifactsByTag { + Single(CudaArtifact), + Many { + #[serde(flatten)] + by_tag: HashMap, + }, +} +impl IntoIterator for CudaArtifactsByTag { + type Item = (Option, CudaArtifact); + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> std::vec::IntoIter { + match self { + CudaArtifactsByTag::Single(art) => vec![(None, art)].into_iter(), + CudaArtifactsByTag::Many { by_tag: by_compat } => by_compat + .iter() + .map(|(k, x)| (Some(k.clone()), x.clone())) + .collect::>() + .into_iter(), + } + } +} +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(untagged)] +enum CudaArtifactsByPlatform { + Binary { + #[serde(flatten)] + by_platform: HashMap, + }, + Source { + source: CudaArtifact, + }, +} + +impl IntoIterator for CudaArtifactsByPlatform { + type Item = (String, Option, CudaArtifact); + + /* TODO: Figure out which is the trait that doesn't involve copying */ + type IntoIter = std::vec::IntoIter<(String, Option, CudaArtifact)>; + + fn into_iter(self) -> Self::IntoIter { + match self { + CudaArtifactsByPlatform::Binary { by_platform } => by_platform + .iter() + .flat_map(|(platform, by_tag)| { + by_tag + .clone() + .into_iter() + .map(|(tag, artifact)| (platform.clone(), tag.clone(), artifact)) + }) + .collect::>() + .into_iter(), + CudaArtifactsByPlatform::Source { source } => { + (vec![("source".to_string(), None, source)]).into_iter() } } } } + +#[derive(Serialize, Deserialize, Debug)] +struct CudaJsonPackage { + name: Option, + license: String, + license_path: Option, + version: String, + + cuda_variant: Option>, + + #[serde(flatten)] + artifacts: CudaArtifactsByPlatform, +} + +#[derive(Serialize, Deserialize, Debug)] +struct CudaJsonManifest { + release_date: Option, + release_label: Option, + release_product: Option, + + #[serde(flatten)] + by_pname: HashMap, +} + +#[tokio::main] +async fn 
main() { + let args = Cli::parse(); + + let _cwd = std::env::current_dir().expect("Couldn't get CWD"); + let _host_name = std::env::var("HOSTNAME").map_or(None, Some); + + let ctx = Arc::new( + open_context( + args.refetch, + args.max_parallel, + args.db_path, + args.castore_path, + ) + .await, + ); + + match args.command { + Some(Command::Ingest { inputs }) => { + let samples = ctx.ingest(&inputs).await; + for s in samples { + match s { + Err(e) => { + eprintln!("Failed to fetch: {}", e); + } + Ok(None) => {} + Ok(Some(ingested)) => { + eprintln!("{:?}", ingested) + } + } + } + } + Some(Command::FetchListing { + max_depth, + html_max_bytes, + inputs, + }) => { + let ingested: Vec = stream::iter(inputs) + .then(async |i| { + let i = i.clone(); + ctx.clone() + .fetch_from_listing(i, max_depth, html_max_bytes) + .await + }) + .flatten_unordered(args.max_parallel) + .collect() + .await; + for i in ingested { + eprintln!("{:?}", i); + } + } + Some(Command::ParseUrl { url: urls }) => { + for url in urls { + println!("{:?}", url); + } + } + Some(Command::FormatCudaManifest) => { + println!( + "{}", + serde_json::to_string( + &serde_json::from_reader::<_, CudaJsonManifest>(io::stdin()).unwrap() + ) + .unwrap() + ); + } + Some(Command::DemoCudaManifest) => { + println!( + "{}", + serde_json::to_string(&CudaJsonManifest { + release_date: Some("1984-01-01".to_string()), + release_label: Some("8.9.x".to_string()), + release_product: Some("cudnn".to_string()), + by_pname: HashMap::from([ + ( + "cudnn".to_string(), + CudaJsonPackage { + name: Some("cuDNN Library".to_string()), + license: "cudnn".to_string(), + license_path: Some("bar/foo".to_string()), + version: "8.9.7.6".to_string(), + cuda_variant: Some(vec!["11".to_string(), "12".to_string()]), + artifacts: CudaArtifactsByPlatform::Binary { + by_platform: HashMap::from([ + ("x86_64-linux".to_string(), + CudaArtifactsByTag::Many { + by_tag: + HashMap::from([ + ("cuda11" + .to_string(), + CudaArtifact{ + relative_path: + 
"kek".to_string(), + sha256: "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824".to_string(), + md5: Some("5d41402abc4b2a76b9719d911017c592".to_string()), + size: 5 })])}) + + ]), + } + }), + ( + "cuda_samples".to_string(), + CudaJsonPackage { + name: Some("NVIDIA cuDNN samples".to_string()), + license: "cudnn".to_string(), + license_path: Some("foo/bar".to_string()), + version: "8.9.7.6".to_string(), + cuda_variant: None, + artifacts: CudaArtifactsByPlatform::Source { + source: CudaArtifact { + relative_path: "/biba/boba/fifa".to_string(), + sha256: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855".to_string(), + md5: Some("d41d8cd98f00b204e9800998ecf8427e".to_string()), + size: 0, + } + } + } + ),]) + }) + .unwrap() + ); + } + Some(Command::ProcessCudaManifests { include_finished }) => { + let manifests: Vec<(String, String, Option)> = { + let con = ctx.con.lock().unwrap(); + con.execute_batch(include_str!("q/cuda-init.sql")) + .context("q/cuda-init.sql") + .unwrap(); + let mut find_manifests = con + .prepare_cached(include_str!("q/find-cuda-manifests.sql")) + .context("q/find-cuda-manifests.sql") + .unwrap(); + find_manifests + .query(named_params! 
{":include_finished": include_finished}) + .context("q/find-cuda-manifests.sql") + .unwrap() + .map(|row| <(String, String, Option)>::try_from(row)) + .collect() + .expect("Casting result of q/find-cuda-manifests.sql") + }; + for m in &manifests { + let b64 = m.1.clone(); + let b3 = match B3Digest::from_str(&b64) { + Ok(b3) => b3, + Err(e) => { + eprintln!("Invalid hash recorded for {:?}: {}", m, e); + continue; + } + }; + if let Err(e) = ctx.ensure_blob(&b3).await { + eprintln!("Couldn't provision the blob for {:?}: {}", m, e); + continue; + }; + let json = { + let mut reader = match ctx.blob_service.open_read(&b3).await { + Ok(Some(reader)) => reader, + Ok(None) => { + eprintln!("Blob doesn't exist after ensure_blob: {:?}", m); + continue; + } + Err(e) => { + eprintln!("Couldn't query the blob for {:?}: {}", m, e); + continue; + } + }; + let mut json = String::new(); + match reader.read_to_string(&mut json).await { + Ok(_) => (), + Err(e) => { + eprintln!("Couldn't read blob {:?}: {:?}", m, e); + continue; + } + }; + json + }; + let parsed: CudaJsonManifest = match serde_json::from_str(&json) { + Ok(m) => m, + Err(e) => { + eprintln!("Couldn't parse JSON for {:?}: {:?}", m, e); + continue; + } + }; + { + let mut lock = ctx.con.lock().unwrap(); + let tx = lock.transaction().unwrap(); + { + let mut add_str = tx + .prepare_cached(include_str!("q/add-str.sql")) + .context("q/add-str.sql") + .unwrap(); + let mut add_hash = tx + .prepare_cached(include_str!("q/upsert-blob.sql")) + .context("q/upsert-blob.sql") + .unwrap(); + let mut add_manifest = tx + .prepare_cached(include_str!("q/add-cuda-manifest.sql")) + .context("q/add-cuda-manifest.sql") + .unwrap(); + let mut add_comp = tx + .prepare_cached(include_str!("q/add-cuda-artifact.sql")) + .context("q/add-cuda-artifact.sql") + .unwrap(); + + add_hash.execute(params![b64, None::]).unwrap(); + for s in vec![ + &parsed.release_date, + &parsed.release_label, + &parsed.release_product, + ] { + 
add_str.execute((s,)).unwrap(); + } + add_manifest + .execute(named_params! { + ":hash": b64, + ":release_date": parsed.release_date, + ":release_label": parsed.release_label, + ":release_product": parsed.release_product, + }) + .context("Executing q/add-cuda-manifest.sql") + .unwrap(); + + for (pname, pkg) in parsed.by_pname { + for (platform, maybe_tag, comp) in pkg.artifacts.into_iter() { + let ps = named_params! { + ":manifest": b64, + ":name": pkg.name, + ":pname": pname, + ":license_name": pkg.license, + ":license_path": pkg.license_path, + ":version": pkg.version, + ":sha256": comp.sha256, + ":md5": comp.md5, + ":platform": platform, + ":relative_path": comp.relative_path, + ":n_bytes": comp.size, + ":compat_tag": maybe_tag + }; + for h in &vec![Some(&comp.sha256), comp.md5.as_ref()] { + add_hash.execute(params![h, None::]).unwrap(); + } + for s in &vec![ + Some(&pname), + pkg.name.as_ref(), + Some(&pkg.license), + pkg.license_path.as_ref(), + Some(&pkg.version), + Some(&platform.to_string()), + Some(&comp.relative_path), + maybe_tag.as_ref(), + ] { + add_str.execute(params![s]).unwrap(); + } + add_comp + .execute(ps) + .context("Executing q/add-cuda-artifact.sql") + .unwrap(); + } + } + } + tx.commit() + .expect("Couldn't commit transaction adding manifest or its component"); + } + } + } + None => {} + } +} diff --git a/src/q/add-cuda-artifact.sql b/src/q/add-cuda-artifact.sql new file mode 100644 index 0000000..2b860f6 --- /dev/null +++ b/src/q/add-cuda-artifact.sql @@ -0,0 +1,17 @@ +INSERT INTO + CudaArtifact(manifest, sha256, md5, name, pname, license_name, license_path, version, platform, compat_tag, relative_path, n_bytes) +VALUES ( + (SELECT id FROM Hash WHERE hash=:manifest LIMIT 1), + (SELECT id FROM Hash WHERE hash=:sha256 LIMIT 1), + (SELECT id FROM Hash WHERE hash=:md5 LIMIT 1), + (SELECT id FROM Str WHERE str=:name LIMIT 1), + (SELECT id FROM Str WHERE str=:pname LIMIT 1), + (SELECT id FROM Str WHERE str=:license_name LIMIT 1), + (SELECT id FROM 
Str WHERE str=:license_path LIMIT 1), + (SELECT id FROM Str WHERE str=:version LIMIT 1), + (SELECT id FROM Str WHERE str=:platform LIMIT 1), + (SELECT id FROM Str WHERE str=:compat_tag LIMIT 1), + (SELECT id FROM Str WHERE str=:relative_path LIMIT 1), + :n_bytes +) +ON CONFLICT DO NOTHING diff --git a/src/q/add-cuda-manifest.sql b/src/q/add-cuda-manifest.sql new file mode 100644 index 0000000..02e8b1c --- /dev/null +++ b/src/q/add-cuda-manifest.sql @@ -0,0 +1,9 @@ +INSERT INTO + CudaManifest(id, release_date, release_label, release_product) +VALUES ( + (SELECT id FROM Hash WHERE hash=:hash LIMIT 1), + (SELECT id FROM Str WHERE str=:release_date LIMIT 1), + (SELECT id FROM Str WHERE str=:release_label LIMIT 1), + (SELECT id FROM Str WHERE str=:release_product LIMIT 1) +) +ON CONFLICT DO NOTHING diff --git a/src/q/add-sample.sql b/src/q/add-sample.sql index f26b3ad..71131ce 100644 --- a/src/q/add-sample.sql +++ b/src/q/add-sample.sql @@ -1,21 +1,8 @@ -INSERT INTO sidx_uri_sample(uri_id, blake3_id) -VALUES( - ( - SELECT - id - FROM - sidx_uri - WHERE - uri = ? - LIMIT 1 - ), - ( - SELECT - id - FROM - sidx_blake3 - WHERE - blake3 = ? - ) +INSERT INTO SidxUriSample(uri, hash, http_code, content_type) +VALUES ( + ( SELECT id FROM Str WHERE str = :uri LIMIT 1), + ( SELECT id FROM Hash WHERE hash = :hash LIMIT 1 ), + :http_code, + ( SELECT id FROM Str WHERE str = :content_type LIMIT 1) ) RETURNING id, epoch; diff --git a/src/q/add-str.sql b/src/q/add-str.sql new file mode 100644 index 0000000..feae104 --- /dev/null +++ b/src/q/add-str.sql @@ -0,0 +1,5 @@ +INSERT INTO Str(str) +VALUES +(?) 
+ON CONFLICT DO NOTHING; + diff --git a/src/q/add-uri-ref.sql b/src/q/add-uri-ref.sql new file mode 100644 index 0000000..798310a --- /dev/null +++ b/src/q/add-uri-ref.sql @@ -0,0 +1,7 @@ +INSERT INTO UriReference(content, target, why) +VALUES ( + (SELECT id FROM Hash WHERE hash=:source LIMIT 1), + (SELECT id FROM Str WHERE str=:target LIMIT 1), + (SELECT id FROM Str WHERE str=:why LIMIT 1) +) +ON CONFLICT DO UPDATE SET why=excluded.why; diff --git a/src/q/cuda-init.sql b/src/q/cuda-init.sql new file mode 100644 index 0000000..4c7d29d --- /dev/null +++ b/src/q/cuda-init.sql @@ -0,0 +1,49 @@ +CREATE TABLE IF NOT EXISTS CudaManifest( + id INTEGER, /* Blake3/ca-node of the JSON */ + release_date INTEGER, /* E.g. "2025-03-06" */ + release_label INTEGER, /* E.g. "12.8.1" */ + release_product INTEGER, /* E.g. "cuda" */ + PRIMARY KEY(id), + FOREIGN KEY(id) REFERENCES Hash(id), + FOREIGN KEY(release_date) REFERENCES Str(id), + FOREIGN KEY(release_label) REFERENCES Str(id), + FOREIGN KEY(release_product) REFERENCES Str(id) +) STRICT; + +CREATE TABLE IF NOT EXISTS CudaArtifact( + manifest INTEGER NOT NULL, + name INTEGER, /* E.g. "CUDA NVCC" */ + pname INTEGER, /* E.g. "cuda_nvcc" */ + license_name INTEGER, /* E.g. "CUDA Toolkit" */ + license_path INTEGER, /* E.g. "cuda_cccl/LICENSE.txt" */ + version INTEGER NOT NULL, /* E.g. "12.8.90" */ + /* Consider making external */ + compat_tag INTEGER, /* E.g. "cuda12" in cudnn */ + sha256 INTEGER, + md5 INTEGER, + platform INTEGER, /* E.g. "linux-x86_64" */ + /* E.g. "cuda_cccl/linux-x86_64/cuda_cccl-linux-x86_64-12.8.90-archive.tar.xz" */ + relative_path INTEGER, + n_bytes INTEGER, /* May be a string in the JSON */ + /* Tempting to have + * PRIMARY KEY(manifest, name, platform), + * however that's not unique at least because of `compat_tag`, + * which might also be `NULL`.
+ */ + PRIMARY KEY(sha256, manifest), + FOREIGN KEY(manifest) REFERENCES CudaManifest(id), + FOREIGN KEY(manifest) REFERENCES Hash(id), + FOREIGN KEY(name) REFERENCES Str(id), + FOREIGN KEY(pname) REFERENCES Str(id), + FOREIGN KEY(license_name) REFERENCES Str(id), + FOREIGN KEY(license_path) REFERENCES Str(id), + FOREIGN KEY(version) REFERENCES Str(id), + FOREIGN KEY(compat_tag) REFERENCES Str(id), + FOREIGN KEY(sha256) REFERENCES Hash(id), + FOREIGN KEY(md5) REFERENCES Hash(id), + FOREIGN KEY(platform) REFERENCES Str(id), + FOREIGN KEY(relative_path) REFERENCES Str(id) +) STRICT; + +CREATE UNIQUE INDEX IF NOT EXISTS CudaArtifactIdx +ON CudaArtifact(pname, platform, version, compat_tag, name, manifest); diff --git a/src/q/find-cuda-manifests.sql b/src/q/find-cuda-manifests.sql new file mode 100644 index 0000000..6074454 --- /dev/null +++ b/src/q/find-cuda-manifests.sql @@ -0,0 +1,15 @@ +SELECT + uri.str AS uri, h.hash, cm.id AS manifest +FROM + SidxUriSample AS s + INNER JOIN Str AS uri + INNER JOIN (Hash AS h LEFT JOIN CudaManifest AS cm ON h.id=cm.id) +ON + s.uri=uri.id + AND s.hash=h.id +WHERE + uri.str LIKE 'https://developer.download.nvidia.com/compute/%.json' + AND (:include_finished OR cm.id IS NULL) +GROUP BY + s.hash +ORDER BY uri.str, s.id DESC; diff --git a/src/q/init.sql b/src/q/init.sql deleted file mode 100644 index 94bd769..0000000 --- a/src/q/init.sql +++ /dev/null @@ -1,22 +0,0 @@ -CREATE TABLE IF NOT EXISTS sidx_uri( - id INTEGER, - uri TEXT UNIQUE, - PRIMARY KEY(id) - ); -CREATE TABLE IF NOT EXISTS sidx_blake3( - id INTEGER, - blake3 TEXT UNIQUE, /* snix-castore node */ - n_bytes INTEGER NOT NULL, - PRIMARY KEY(id) - ); -CREATE TABLE IF NOT EXISTS sidx_uri_sample( - id INTEGER, - uri_id INTEGER NOT NULL, - blake3_id INTEGER, - epoch INTEGER NOT NULL DEFAULT (unixepoch()), - PRIMARY KEY(id), - FOREIGN KEY(uri_id) REFERENCES sidx_uri(id), - FOREIGN KEY(blake3_id) REFERENCES sidx_blake3(id) -); -CREATE INDEX IF NOT EXISTS sidx_uri_blake3_idx -ON 
sidx_uri_sample(uri_id, blake3_id, epoch); diff --git a/src/q/latest-download.sql b/src/q/latest-download.sql index a0e6938..0161664 100644 --- a/src/q/latest-download.sql +++ b/src/q/latest-download.sql @@ -1,16 +1,19 @@ SELECT s.id AS sample_id, - b.blake3, + h.hash, + h.n_bytes, + s.http_code, s.epoch FROM - sidx_uri_sample AS s, - sidx_uri AS u, - sidx_blake3 AS b + SidxUriSample AS s, + Str AS u, + Hash AS h ON - s.uri_id = u.id - AND s.blake3_id = b.id + s.uri = u.id + AND s.hash = h.id WHERE - u.uri = ? + u.str = ? + AND s.hash IS NOT NULL ORDER BY s.epoch DESC LIMIT 1; diff --git a/src/q/sidx-init.sql b/src/q/sidx-init.sql new file mode 100644 index 0000000..5cf2492 --- /dev/null +++ b/src/q/sidx-init.sql @@ -0,0 +1,37 @@ +CREATE TABLE IF NOT EXISTS Hash( + id INTEGER, + hash TEXT UNIQUE, /* snix-castore node */ + n_bytes INTEGER, + PRIMARY KEY(id) +) STRICT; /* Essentially random strings */ +CREATE TABLE IF NOT EXISTS Str( + id INTEGER, + str TEXT UNIQUE, + PRIMARY KEY(id) +) STRICT; /* "Naturally occurring" strings */ +CREATE TABLE IF NOT EXISTS SidxUriSample( + id INTEGER, + uri INTEGER NOT NULL, + hash INTEGER, + epoch INTEGER NOT NULL DEFAULT (unixepoch()), + http_code INTEGER DEFAULT NULL, + content_type INTEGER DEFAULT NULL, + PRIMARY KEY(id), + FOREIGN KEY(uri) REFERENCES Str(id), + FOREIGN KEY(hash) REFERENCES Hash(id), + FOREIGN KEY(content_type) REFERENCES Str(id) +) STRICT; +CREATE INDEX IF NOT EXISTS SidxUriHashIdx +ON SidxUriSample(uri, hash, epoch); + +CREATE TABLE IF NOT EXISTS "UriReference" ( + "id" INTEGER, + "content" INTEGER NOT NULL, + "target" INTEGER NOT NULL, + "why" INTEGER, + PRIMARY KEY("id"), + CONSTRAINT "NoDupRefs" UNIQUE("content","target","why"), + FOREIGN KEY("content") REFERENCES "Hash"("id"), + FOREIGN KEY("target") REFERENCES "Str"("id"), + FOREIGN KEY("why") REFERENCES "Str"("id") +) STRICT; diff --git a/src/q/upsert-blob.sql b/src/q/upsert-blob.sql index 66fb7ec..3589e33 100644 --- a/src/q/upsert-blob.sql +++ 
b/src/q/upsert-blob.sql @@ -1,4 +1,4 @@ -INSERT INTO sidx_blake3(blake3, n_bytes) +INSERT INTO Hash(hash, n_bytes) VALUES (?, ?) ON CONFLICT DO NOTHING; diff --git a/src/q/upsert-uri.sql b/src/q/upsert-uri.sql index 555ede8..8702c5a 100644 --- a/src/q/upsert-uri.sql +++ b/src/q/upsert-uri.sql @@ -1,4 +1,4 @@ -INSERT INTO sidx_uri(uri) +INSERT INTO Str(str) VALUES (?) ON CONFLICT DO NOTHING; diff --git a/src/q/uris-of-hash.sql b/src/q/uris-of-hash.sql new file mode 100644 index 0000000..827fdeb --- /dev/null +++ b/src/q/uris-of-hash.sql @@ -0,0 +1,16 @@ +SELECT + uri.str AS uri +FROM + SidxUriSample AS s + INNER JOIN Str AS uri + INNER JOIN Hash AS h +ON + s.uri=uri.id + AND s.hash=h.id +WHERE + h.hash=:hash +ORDER BY + s.epoch DESC +LIMIT + :limit +; diff --git a/static/some.css b/static/some.css new file mode 100644 index 0000000..5d4ca51 --- /dev/null +++ b/static/some.css @@ -0,0 +1,48 @@ +.index { + font-family: "Source Serif Pro", "Linux Libertine", monospace; +} + +.db-table > h3 { + font-variant: small-caps; +} + +h1, nav { + font-variant: small-caps; + font-family: "Inconsolata", monospace; +} + +h2 { + font-variant: small-caps; +} + +th { + font-variant: small-caps; +} + +header { + background-color: black; +} + +.ft { + background-color: black; +} + +form input[type="submit"] { + background-color: black; +} + +a:link { + color: #404040; +} + +a:visited { + color: darkgrey; +} + +a:hover { + color: black; +} + +.rows-and-columns > tbody > tr:nth-child(even) { + background-color: #F5F5F5; +}