feat(fetch-listing): poc recursive fetching

This commit is contained in:
Else, Someone 2025-04-27 16:53:07 +00:00
parent 6df68a7e9c
commit cfff120c9b
5 changed files with 1370 additions and 133 deletions

290
Cargo.lock generated
View file

@ -342,6 +342,12 @@ version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.10.1"
@ -512,6 +518,29 @@ dependencies = [
"typenum",
]
[[package]]
name = "cssparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.100",
]
[[package]]
name = "darling"
version = "0.20.11"
@ -563,6 +592,17 @@ dependencies = [
"serde",
]
[[package]]
name = "derive_more"
version = "0.99.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "digest"
version = "0.10.7"
@ -591,6 +631,27 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "dtoa"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "either"
version = "1.15.0"
@ -727,6 +788,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.31"
@ -816,6 +887,15 @@ dependencies = [
"slab",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "gcp_auth"
version = "0.12.3"
@ -853,6 +933,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width 0.1.14",
]
[[package]]
name = "getrandom"
version = "0.2.15"
@ -958,6 +1047,18 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "html5ever"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
dependencies = [
"log",
"mac",
"markup5ever",
"match_token",
]
[[package]]
name = "http"
version = "1.3.1"
@ -1429,6 +1530,37 @@ version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "match_token"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "matchers"
version = "0.1.0"
@ -1509,6 +1641,12 @@ dependencies = [
"tempfile",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
@ -1685,6 +1823,58 @@ dependencies = [
"indexmap 2.9.0",
]
[[package]]
name = "phf"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
"phf_macros",
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand 0.8.5",
]
[[package]]
name = "phf_macros"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "phf_shared"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project"
version = "1.1.10"
@ -1744,6 +1934,12 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "prettyplease"
version = "0.2.32"
@ -2300,6 +2496,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "527e65d9d888567588db4c12da1087598d0f6f8b346cc2c5abc91f05fc2dffe2"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]]
name = "security-framework"
version = "2.11.1"
@ -2336,6 +2547,25 @@ dependencies = [
"libc",
]
[[package]]
name = "selectors"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
dependencies = [
"bitflags 2.9.0",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "serde"
version = "1.0.219"
@ -2431,6 +2661,15 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "servo_arc"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
@ -2455,8 +2694,11 @@ dependencies = [
"futures",
"reqwest",
"rusqlite",
"scraper",
"serde_json",
"snix-castore",
"tokio",
"tokio-stream",
"tokio-util",
"url",
]
@ -2470,6 +2712,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "slab"
version = "0.4.9"
@ -2588,6 +2836,31 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]]
name = "strsim"
version = "0.11.1"
@ -2676,6 +2949,17 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "thiserror"
version = "1.0.69"
@ -3129,6 +3413,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf16_iter"
version = "1.0.5"

743
Cargo.nix
View file

@ -1104,6 +1104,19 @@ rec {
};
resolvedDefaultFeatures = [ "default" ];
};
"byteorder" = rec {
crateName = "byteorder";
version = "1.5.0";
edition = "2021";
sha256 = "0jzncxyf404mwqdbspihyzpkndfgda450l0893pz5xj685cg5l0z";
authors = [
"Andrew Gallant <jamslam@gmail.com>"
];
features = {
"default" = [ "std" ];
};
resolvedDefaultFeatures = [ "default" "std" ];
};
"bytes" = rec {
crateName = "bytes";
version = "1.10.1";
@ -1588,6 +1601,65 @@ rec {
"rand_core" = [ "dep:rand_core" ];
};
};
"cssparser" = rec {
crateName = "cssparser";
version = "0.34.0";
edition = "2018";
sha256 = "1qx3hha392szcl812l6hp0d4029gg8x62cl4nf0byqgdv0f6vimp";
authors = [
"Simon Sapin <simon.sapin@exyr.org>"
];
dependencies = [
{
name = "cssparser-macros";
packageId = "cssparser-macros";
}
{
name = "dtoa-short";
packageId = "dtoa-short";
}
{
name = "itoa";
packageId = "itoa";
}
{
name = "phf";
packageId = "phf";
features = [ "macros" ];
}
{
name = "smallvec";
packageId = "smallvec";
}
];
features = {
"serde" = [ "dep:serde" ];
};
};
"cssparser-macros" = rec {
crateName = "cssparser-macros";
version = "0.6.1";
edition = "2018";
sha256 = "0cfkzj60avrnskdmaf7f8zw6pp3di4ylplk455zrzaf19ax8id8k";
procMacro = true;
libName = "cssparser_macros";
libPath = "lib.rs";
authors = [
"Simon Sapin <simon.sapin@exyr.org>"
];
dependencies = [
{
name = "quote";
packageId = "quote";
}
{
name = "syn";
packageId = "syn 2.0.100";
features = [ "full" "extra-traits" ];
}
];
};
"darling" = rec {
crateName = "darling";
version = "0.20.11";
@ -1731,6 +1803,49 @@ rec {
};
resolvedDefaultFeatures = [ "alloc" "powerfmt" "serde" "std" ];
};
"derive_more" = rec {
crateName = "derive_more";
version = "0.99.19";
edition = "2018";
sha256 = "17y6g78dg31fsv7z4p455bzxs670spg476ww2ibg3mj3vww9m8ix";
procMacro = true;
authors = [
"Jelte Fennema <github-tech@jeltef.nl>"
];
dependencies = [
{
name = "proc-macro2";
packageId = "proc-macro2";
}
{
name = "quote";
packageId = "quote";
}
{
name = "syn";
packageId = "syn 2.0.100";
}
];
features = {
"convert_case" = [ "dep:convert_case" ];
"default" = [ "add_assign" "add" "as_mut" "as_ref" "constructor" "deref" "deref_mut" "display" "error" "from" "from_str" "index" "index_mut" "into" "into_iterator" "iterator" "mul_assign" "mul" "not" "sum" "try_into" "is_variant" "unwrap" ];
"display" = [ "syn/extra-traits" ];
"error" = [ "syn/extra-traits" ];
"from" = [ "syn/extra-traits" ];
"generate-parsing-rs" = [ "peg" ];
"into" = [ "syn/extra-traits" ];
"is_variant" = [ "convert_case" ];
"mul" = [ "syn/extra-traits" ];
"mul_assign" = [ "syn/extra-traits" ];
"not" = [ "syn/extra-traits" ];
"peg" = [ "dep:peg" ];
"rustc_version" = [ "dep:rustc_version" ];
"testing-helpers" = [ "rustc_version" ];
"try_into" = [ "syn/extra-traits" ];
"unwrap" = [ "convert_case" "rustc_version" ];
};
resolvedDefaultFeatures = [ "add" "add_assign" ];
};
"digest" = rec {
crateName = "digest";
version = "0.10.7";
@ -1810,6 +1925,49 @@ rec {
features = {
};
};
"dtoa" = rec {
crateName = "dtoa";
version = "1.0.10";
edition = "2018";
sha256 = "016gid01rarcdv57h049d7nr9daxc2hc2gqzx0mji57krywd7bfn";
authors = [
"David Tolnay <dtolnay@gmail.com>"
];
features = {
"no-panic" = [ "dep:no-panic" ];
};
};
"dtoa-short" = rec {
crateName = "dtoa-short";
version = "0.3.5";
edition = "2015";
sha256 = "11rwnkgql5jilsmwxpx6hjzkgyrbdmx1d71s0jyrjqm5nski25fd";
libName = "dtoa_short";
authors = [
"Xidorn Quan <me@upsuper.org>"
];
dependencies = [
{
name = "dtoa";
packageId = "dtoa";
}
];
};
"ego-tree" = rec {
crateName = "ego-tree";
version = "0.10.0";
edition = "2021";
sha256 = "1n2csy99chk5v5vzjl0ff79vxpxhl76xmcb3aj6brrzzipmjz5xj";
libName = "ego_tree";
authors = [
"June McEnroe <june@causal.agency>"
"Carlo Federico Vescovo <vescovocarlofederico@gmail.com>"
];
features = {
"serde" = [ "dep:serde" ];
};
};
"either" = rec {
crateName = "either";
version = "1.15.0";
@ -2155,6 +2313,26 @@ rec {
};
resolvedDefaultFeatures = [ "alloc" "default" "std" ];
};
"futf" = rec {
crateName = "futf";
version = "0.1.5";
edition = "2015";
sha256 = "0hvqk2r7v4fnc34hvc3vkri89gn52d5m9ihygmwn75l1hhp0whnz";
authors = [
"Keegan McAllister <kmcallister@mozilla.com>"
];
dependencies = [
{
name = "mac";
packageId = "mac";
}
{
name = "new_debug_unreachable";
packageId = "new_debug_unreachable";
}
];
};
"futures" = rec {
crateName = "futures";
version = "0.3.31";
@ -2433,6 +2611,23 @@ rec {
};
resolvedDefaultFeatures = [ "alloc" "async-await" "async-await-macro" "channel" "default" "futures-channel" "futures-io" "futures-macro" "futures-sink" "io" "memchr" "sink" "slab" "std" ];
};
"fxhash" = rec {
crateName = "fxhash";
version = "0.2.1";
edition = "2015";
sha256 = "037mb9ichariqi45xm6mz0b11pa92gj38ba0409z3iz239sns6y3";
libPath = "lib.rs";
authors = [
"cbreeden <github@u.breeden.cc>"
];
dependencies = [
{
name = "byteorder";
packageId = "byteorder";
}
];
};
"gcp_auth" = rec {
crateName = "gcp_auth";
version = "0.12.3";
@ -2565,6 +2760,26 @@ rec {
};
resolvedDefaultFeatures = [ "more_lengths" ];
};
"getopts" = rec {
crateName = "getopts";
version = "0.2.21";
edition = "2015";
sha256 = "1mgb3qvivi26gs6ihqqhh8iyhp3vgxri6vwyrwg28w0xqzavznql";
authors = [
"The Rust Project Developers"
];
dependencies = [
{
name = "unicode-width";
packageId = "unicode-width 0.1.14";
}
];
features = {
"core" = [ "dep:core" ];
"rustc-dep-of-std" = [ "unicode-width/rustc-dep-of-std" "std" "core" ];
"std" = [ "dep:std" ];
};
};
"getrandom 0.2.15" = rec {
crateName = "getrandom";
version = "0.2.15";
@ -2938,6 +3153,35 @@ rec {
];
};
"html5ever" = rec {
crateName = "html5ever";
version = "0.29.1";
edition = "2021";
sha256 = "07518h5gbw0c6x7w5br76bgxvgphs6zlrb4q7ii7bg1ww7510x1v";
authors = [
"The html5ever Project Developers"
];
dependencies = [
{
name = "log";
packageId = "log";
}
{
name = "mac";
packageId = "mac";
}
{
name = "markup5ever";
packageId = "markup5ever";
}
{
name = "match_token";
packageId = "match_token";
}
];
features = {
};
};
"http" = rec {
crateName = "http";
version = "1.3.1";
@ -4471,6 +4715,78 @@ rec {
};
resolvedDefaultFeatures = [ "std" ];
};
"mac" = rec {
crateName = "mac";
version = "0.1.1";
edition = "2015";
sha256 = "194vc7vrshqff72rl56f9xgb0cazyl4jda7qsv31m5l6xx7hq7n4";
authors = [
"Jonathan Reem <jonathan.reem@gmail.com>"
];
};
"markup5ever" = rec {
crateName = "markup5ever";
version = "0.14.1";
edition = "2021";
sha256 = "063sdq7hwxn2al9ygify8dd96mj57n9c4lig007lr1p128yj39y7";
libPath = "lib.rs";
authors = [
"The html5ever Project Developers"
];
dependencies = [
{
name = "log";
packageId = "log";
}
{
name = "phf";
packageId = "phf";
}
{
name = "string_cache";
packageId = "string_cache";
}
{
name = "tendril";
packageId = "tendril";
}
];
buildDependencies = [
{
name = "phf_codegen";
packageId = "phf_codegen";
}
{
name = "string_cache_codegen";
packageId = "string_cache_codegen";
}
];
};
"match_token" = rec {
crateName = "match_token";
version = "0.1.0";
edition = "2021";
sha256 = "0sx3212vkjqfblfhr556ayabbjflbigjf5j591j9kgs4infniac8";
procMacro = true;
dependencies = [
{
name = "proc-macro2";
packageId = "proc-macro2";
}
{
name = "quote";
packageId = "quote";
}
{
name = "syn";
packageId = "syn 2.0.100";
features = [ "full" ];
}
];
};
"matchers" = rec {
crateName = "matchers";
version = "0.1.0";
@ -4717,6 +5033,18 @@ rec {
"vendored" = [ "openssl/vendored" ];
};
};
"new_debug_unreachable" = rec {
crateName = "new_debug_unreachable";
version = "1.0.6";
edition = "2021";
sha256 = "11phpf1mjxq6khk91yzcbd3ympm78m3ivl7xg6lg2c0lf66fy3k5";
libName = "debug_unreachable";
authors = [
"Matt Brubeck <mbrubeck@limpet.net>"
"Jonathan Reem <jonathan.reem@gmail.com>"
];
};
"nu-ansi-term" = rec {
crateName = "nu-ansi-term";
version = "0.46.0";
@ -5280,6 +5608,142 @@ rec {
"unstable" = [ "generate" ];
};
};
"phf" = rec {
crateName = "phf";
version = "0.11.3";
edition = "2021";
sha256 = "0y6hxp1d48rx2434wgi5g8j1pr8s5jja29ha2b65435fh057imhz";
authors = [
"Steven Fackler <sfackler@gmail.com>"
];
dependencies = [
{
name = "phf_macros";
packageId = "phf_macros";
optional = true;
}
{
name = "phf_shared";
packageId = "phf_shared";
usesDefaultFeatures = false;
}
];
features = {
"default" = [ "std" ];
"macros" = [ "phf_macros" ];
"phf_macros" = [ "dep:phf_macros" ];
"serde" = [ "dep:serde" ];
"std" = [ "phf_shared/std" ];
"uncased" = [ "phf_shared/uncased" ];
"unicase" = [ "phf_macros?/unicase" "phf_shared/unicase" ];
};
resolvedDefaultFeatures = [ "default" "macros" "phf_macros" "std" ];
};
"phf_codegen" = rec {
crateName = "phf_codegen";
version = "0.11.3";
edition = "2021";
sha256 = "0si1n6zr93kzjs3wah04ikw8z6npsr39jw4dam8yi9czg2609y5f";
authors = [
"Steven Fackler <sfackler@gmail.com>"
];
dependencies = [
{
name = "phf_generator";
packageId = "phf_generator";
}
{
name = "phf_shared";
packageId = "phf_shared";
}
];
};
"phf_generator" = rec {
crateName = "phf_generator";
version = "0.11.3";
edition = "2021";
crateBin = [];
sha256 = "0gc4np7s91ynrgw73s2i7iakhb4lzdv1gcyx7yhlc0n214a2701w";
authors = [
"Steven Fackler <sfackler@gmail.com>"
];
dependencies = [
{
name = "phf_shared";
packageId = "phf_shared";
usesDefaultFeatures = false;
}
{
name = "rand";
packageId = "rand 0.8.5";
usesDefaultFeatures = false;
features = [ "small_rng" ];
}
];
features = {
"criterion" = [ "dep:criterion" ];
};
};
"phf_macros" = rec {
crateName = "phf_macros";
version = "0.11.3";
edition = "2021";
sha256 = "05kjfbyb439344rhmlzzw0f9bwk9fp95mmw56zs7yfn1552c0jpq";
procMacro = true;
authors = [
"Steven Fackler <sfackler@gmail.com>"
];
dependencies = [
{
name = "phf_generator";
packageId = "phf_generator";
}
{
name = "phf_shared";
packageId = "phf_shared";
usesDefaultFeatures = false;
}
{
name = "proc-macro2";
packageId = "proc-macro2";
}
{
name = "quote";
packageId = "quote";
}
{
name = "syn";
packageId = "syn 2.0.100";
features = [ "full" ];
}
];
features = {
"unicase" = [ "unicase_" "phf_shared/unicase" ];
"unicase_" = [ "dep:unicase_" ];
};
};
"phf_shared" = rec {
crateName = "phf_shared";
version = "0.11.3";
edition = "2021";
sha256 = "1rallyvh28jqd9i916gk5gk2igdmzlgvv5q0l3xbf3m6y8pbrsk7";
authors = [
"Steven Fackler <sfackler@gmail.com>"
];
dependencies = [
{
name = "siphasher";
packageId = "siphasher";
}
];
features = {
"default" = [ "std" ];
"uncased" = [ "dep:uncased" ];
"unicase" = [ "dep:unicase" ];
};
resolvedDefaultFeatures = [ "default" "std" ];
};
"pin-project" = rec {
crateName = "pin-project";
version = "1.1.10";
@ -5397,6 +5861,17 @@ rec {
};
resolvedDefaultFeatures = [ "simd" "std" ];
};
"precomputed-hash" = rec {
crateName = "precomputed-hash";
version = "0.1.1";
edition = "2015";
sha256 = "075k9bfy39jhs53cb2fpb9klfakx2glxnf28zdw08ws6lgpq6lwj";
libName = "precomputed_hash";
authors = [
"Emilio Cobos Álvarez <emilio@crisal.io>"
];
};
"prettyplease" = rec {
crateName = "prettyplease";
version = "0.2.32";
@ -7485,6 +7960,56 @@ rec {
"default" = [ "use_std" ];
};
};
"scraper" = rec {
crateName = "scraper";
version = "0.23.1";
edition = "2021";
crateBin = [];
sha256 = "1qpz5py0a7y9mg2w4v1lidphz3arhw8dl4jcvf47aml8v3cnazjj";
authors = [
"June McEnroe <june@causal.agency>"
];
dependencies = [
{
name = "cssparser";
packageId = "cssparser";
}
{
name = "ego-tree";
packageId = "ego-tree";
}
{
name = "getopts";
packageId = "getopts";
optional = true;
}
{
name = "html5ever";
packageId = "html5ever";
}
{
name = "precomputed-hash";
packageId = "precomputed-hash";
}
{
name = "selectors";
packageId = "selectors";
}
{
name = "tendril";
packageId = "tendril";
}
];
features = {
"default" = [ "main" "errors" ];
"deterministic" = [ "indexmap" ];
"getopts" = [ "dep:getopts" ];
"indexmap" = [ "dep:indexmap" ];
"main" = [ "getopts" ];
"serde" = [ "dep:serde" ];
};
resolvedDefaultFeatures = [ "default" "errors" "getopts" "main" ];
};
"security-framework 2.11.1" = rec {
crateName = "security-framework";
version = "2.11.1";
@ -7607,6 +8132,69 @@ rec {
};
resolvedDefaultFeatures = [ "OSX_10_10" "OSX_10_11" "OSX_10_12" "OSX_10_9" "default" ];
};
"selectors" = rec {
crateName = "selectors";
version = "0.26.0";
edition = "2021";
sha256 = "1s3zv30rqgdvil7mnfr4xq5nb9m8yp0sai42l28y565mkd68lmpx";
libPath = "lib.rs";
authors = [
"The Servo Project Developers"
];
dependencies = [
{
name = "bitflags";
packageId = "bitflags 2.9.0";
}
{
name = "cssparser";
packageId = "cssparser";
}
{
name = "derive_more";
packageId = "derive_more";
usesDefaultFeatures = false;
features = [ "add" "add_assign" ];
}
{
name = "fxhash";
packageId = "fxhash";
}
{
name = "log";
packageId = "log";
}
{
name = "new_debug_unreachable";
packageId = "new_debug_unreachable";
}
{
name = "phf";
packageId = "phf";
}
{
name = "precomputed-hash";
packageId = "precomputed-hash";
}
{
name = "servo_arc";
packageId = "servo_arc";
}
{
name = "smallvec";
packageId = "smallvec";
}
];
buildDependencies = [
{
name = "phf_codegen";
packageId = "phf_codegen";
}
];
features = {
"to_shmem" = [ "dep:to_shmem" "dep:to_shmem_derive" ];
};
};
"serde" = rec {
crateName = "serde";
version = "1.0.219";
@ -7951,6 +8539,26 @@ rec {
features = {
};
};
"servo_arc" = rec {
crateName = "servo_arc";
version = "0.4.0";
edition = "2021";
sha256 = "06ljch4isnnbv1xpwhjajz4a4mpc7ki47ys9n9yn98kqjhjc8rdf";
libPath = "lib.rs";
authors = [
"The Servo Project Developers"
];
dependencies = [
{
name = "stable_deref_trait";
packageId = "stable_deref_trait";
}
];
features = {
"serde" = [ "dep:serde" ];
"servo" = [ "serde" "track_alloc_size" ];
};
};
"sharded-slab" = rec {
crateName = "sharded-slab";
version = "0.1.7";
@ -8021,6 +8629,14 @@ rec {
name = "rusqlite";
packageId = "rusqlite";
}
{
name = "scraper";
packageId = "scraper";
}
{
name = "serde_json";
packageId = "serde_json";
}
{
name = "snix-castore";
packageId = "snix-castore";
@ -8058,6 +8674,23 @@ rec {
];
};
"siphasher" = rec {
crateName = "siphasher";
version = "1.0.1";
edition = "2018";
sha256 = "17f35782ma3fn6sh21c027kjmd227xyrx06ffi8gw4xzv9yry6an";
authors = [
"Frank Denis <github@pureftpd.org>"
];
features = {
"default" = [ "std" ];
"serde" = [ "dep:serde" ];
"serde_json" = [ "dep:serde_json" ];
"serde_no_std" = [ "serde/alloc" ];
"serde_std" = [ "std" "serde/std" ];
};
resolvedDefaultFeatures = [ "default" "std" ];
};
"slab" = rec {
crateName = "slab";
version = "0.4.9";
@ -8477,7 +9110,75 @@ rec {
"default" = [ "std" ];
"std" = [ "alloc" ];
};
resolvedDefaultFeatures = [ "alloc" ];
resolvedDefaultFeatures = [ "alloc" "default" "std" ];
};
"string_cache" = rec {
crateName = "string_cache";
version = "0.8.9";
edition = "2018";
sha256 = "03z7km2kzlwiv2r2qifq5riv4g8phazwng9wnvs3py3lzainnxxz";
authors = [
"The Servo Project Developers"
];
dependencies = [
{
name = "new_debug_unreachable";
packageId = "new_debug_unreachable";
}
{
name = "parking_lot";
packageId = "parking_lot";
}
{
name = "phf_shared";
packageId = "phf_shared";
}
{
name = "precomputed-hash";
packageId = "precomputed-hash";
}
{
name = "serde";
packageId = "serde";
optional = true;
}
];
features = {
"default" = [ "serde_support" ];
"malloc_size_of" = [ "dep:malloc_size_of" ];
"serde" = [ "dep:serde" ];
"serde_support" = [ "serde" ];
};
resolvedDefaultFeatures = [ "default" "serde" "serde_support" ];
};
"string_cache_codegen" = rec {
crateName = "string_cache_codegen";
version = "0.5.4";
edition = "2018";
sha256 = "181ir4d6y053s1kka2idpjx5g9d9jgll6fy517jhzzpi2n3r44f7";
libPath = "lib.rs";
authors = [
"The Servo Project Developers"
];
dependencies = [
{
name = "phf_generator";
packageId = "phf_generator";
}
{
name = "phf_shared";
packageId = "phf_shared";
}
{
name = "proc-macro2";
packageId = "proc-macro2";
}
{
name = "quote";
packageId = "quote";
}
];
};
"strsim" = rec {
crateName = "strsim";
@ -8718,6 +9419,35 @@ rec {
};
resolvedDefaultFeatures = [ "default" "getrandom" ];
};
"tendril" = rec {
crateName = "tendril";
version = "0.4.3";
edition = "2015";
sha256 = "1c3vip59sqwxn148i714nmkrvjzbk7105vj0h92s6r64bw614jnj";
authors = [
"Keegan McAllister <mcallister.keegan@gmail.com>"
"Simon Sapin <simon.sapin@exyr.org>"
"Chris Morgan <me@chrismorgan.info>"
];
dependencies = [
{
name = "futf";
packageId = "futf";
}
{
name = "mac";
packageId = "mac";
}
{
name = "utf-8";
packageId = "utf-8";
}
];
features = {
"encoding" = [ "dep:encoding" ];
"encoding_rs" = [ "dep:encoding_rs" ];
};
};
"thiserror 1.0.69" = rec {
crateName = "thiserror";
version = "1.0.69";
@ -10325,6 +11055,17 @@ rec {
};
resolvedDefaultFeatures = [ "default" "std" ];
};
"utf-8" = rec {
crateName = "utf-8";
version = "0.7.6";
edition = "2015";
sha256 = "1a9ns3fvgird0snjkd3wbdhwd3zdpc2h5gpyybrfr6ra5pkqxk09";
libName = "utf8";
authors = [
"Simon Sapin <simon.sapin@exyr.org>"
];
};
"utf16_iter" = rec {
crateName = "utf16_iter";
version = "1.0.5";

View file

@ -9,7 +9,10 @@ clap = "4.5.35"
futures = "0.3.31"
reqwest = "0.12.15"
rusqlite = "0.34.0"
scraper = "0.23.1"
serde_json = "1.0.140"
snix-castore = { version = "0.1.0", git = "https://git.snix.dev/snix/snix.git" }
tokio = "1.44.2"
tokio-stream = "0.1.17"
tokio-util = "0.7.14"
url = "2.5.4"

View file

@ -31,4 +31,8 @@ mkShell {
openssl
sqlite
];
shellHook = ''
export DATABASE_PATH="$HOME/.local/share/sidx/sidx.db"
unset out outputs phases
'';
}

View file

@ -1,12 +1,24 @@
use std::collections::HashSet;
use std::path::{absolute, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use anyhow::anyhow;
use anyhow::Context;
use anyhow::{anyhow, Error};
use clap::Parser;
use clap::Subcommand;
use futures::{stream, StreamExt, TryStreamExt};
use rusqlite::{params, OptionalExtension};
use scraper::{Html, Selector};
use snix_castore::blobservice::BlobService;
use snix_castore::directoryservice::DirectoryService;
use snix_castore::B3Digest;
use snix_castore::{blobservice, directoryservice, import::fs::ingest_path};
use std::sync::Mutex;
use tokio::io::{AsyncReadExt, BufReader};
use tokio::sync::mpsc::{channel, Sender};
use tokio::sync::Semaphore;
use tokio_stream::wrappers::ReceiverStream;
use url::Url;
#[derive(Clone, Debug)]
@ -15,22 +27,28 @@ enum Ingestable {
Path(PathBuf),
}
#[derive(Debug)]
#[derive(Debug, Clone)]
/// Tells whether an [`Ingested`] record was produced by the current run or
/// reused from an earlier one.
enum IngestedWhen {
/// The input was fetched/ingested and a new sample row was recorded just now.
Now,
/// A previously recorded sample for the same URI was found and returned
/// as-is (the path taken when refetching is not requested).
Before,
}
#[derive(Debug)]
#[derive(Debug, Clone)]
#[allow(dead_code)]
/// Record describing one ingested input (a sample) as stored in the database.
struct Ingested {
/// Sample identifier as returned by the latest-download / add-sample queries.
sample_id: u32,
/// String form of the input (URL or path) that was ingested.
uri: String,
// Content digest of the ingested data.
// NOTE(review): both a `String` and a `B3Digest` declaration of this field
// are visible here; presumably the `B3Digest` one supersedes the `String`
// one — confirm which is current.
blake3: String,
blake3: B3Digest,
/// Epoch value returned by the database together with `sample_id`.
/// NOTE(review): exact meaning/units are defined by the SQL schema — confirm.
epoch: u32,
/// Whether this record was created now or found from an earlier run.
when: IngestedWhen,
}
/// Message type associated with the `FetchListing` command.
/// NOTE(review): the producer/consumer of these messages is not visible in
/// this excerpt; the variant descriptions below are assumptions to confirm.
#[derive(Clone)]
enum FetchListingMessage {
/// Presumably: the given URL has been ingested, with its resulting record.
Ingested(Url, Ingested),
/// Presumably: the given URL should be followed further; the `usize` looks
/// like a depth counter (cf. `max_depth: usize`) — TODO confirm.
Recurse(Url, usize),
}
impl std::fmt::Display for Ingestable {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
@ -45,7 +63,7 @@ impl std::fmt::Display for Ingestable {
}
}
fn parse_url_or_path(s: &str) -> Result<Ingestable, anyhow::Error> {
fn parse_url_or_path(s: &str) -> Result<Ingestable, Error> {
if s.is_empty() {
Err(anyhow!("Empty path (url)"))
} else if s.starts_with("./") || s.starts_with("/") {
@ -69,7 +87,7 @@ fn parse_url_or_path(s: &str) -> Result<Ingestable, anyhow::Error> {
fn data_path() -> PathBuf {
let xdg_data_dir = std::env::var("XDG_DATA_DIR")
.and_then(|s| Ok(PathBuf::from(s)))
.or_else(|_| -> Result<PathBuf, anyhow::Error> {
.or_else(|_| -> Result<PathBuf, Error> {
match std::env::home_dir() {
Some(p) => Ok(p.join(".local/share")),
None => Err(anyhow!("...")), // FIXME
@ -93,6 +111,12 @@ enum Command {
#[clap(value_parser = parse_url_or_path, num_args = 1)]
inputs: Vec<Ingestable>,
},
FetchListing {
#[clap(value_parser, long, default_value_t = 5)]
max_depth: usize,
#[clap(value_parser, num_args = 1)]
inputs: Vec<Url>,
},
}
#[derive(Parser)]
@ -100,7 +124,7 @@ struct Cli {
#[clap(short, long, action)]
refetch: bool,
#[clap(short, long, value_parser, default_value_t = 5)]
#[clap(short, long, value_parser, default_value_t = 4)]
max_parallel: usize,
#[clap(short, long, value_parser, default_value_os_t = default_db_path())]
@ -113,130 +137,33 @@ struct Cli {
command: Option<Command>,
}
async fn ingest<BS, DS>(
inputs: &Vec<Ingestable>,
struct SidxContext<BS, DS>
where
BS: blobservice::BlobService + Clone + Send + 'static,
DS: directoryservice::DirectoryService + Clone + Send + 'static,
{
refetch: bool,
max_parallel: usize,
http_client: reqwest::Client,
http: reqwest::Client,
con: Arc<Mutex<rusqlite::Connection>>,
blob_service: BS,
dir_service: DS,
con: rusqlite::Connection,
) -> Vec<Result<Option<Ingested>, anyhow::Error>>
where
BS: blobservice::BlobService,
DS: directoryservice::DirectoryService,
{
let samples = stream::iter(inputs.iter().map(|uri| {
let client = &http_client;
let blob_service = &blob_service;
let dir_service = &dir_service;
let con = &con;
let mut find_sample = con
.prepare(include_str!("q/latest-download.sql"))
.expect("Failed to prepare latest-download.sql");
let mut add_sample = con
.prepare(include_str!("q/add-sample.sql"))
.expect("Failed to prepare add-sample.sql");
let mut add_blob = con
.prepare(include_str!("q/upsert-blob.sql"))
.expect("Failed to prepare upsert-blob.sql");
let mut add_uri = con
.prepare(include_str!("q/upsert-uri.sql"))
.expect("Failed to prepare upsert-uri.sql");
async move {
let uri_s = uri.to_string();
let latest_download = find_sample
.query_row(params![uri_s], |r| <(u32, String, u32)>::try_from(r))
.optional()?;
if let Some((sample_id, blake3, epoch)) = latest_download {
if !refetch {
return Ok::<Option<Ingested>, anyhow::Error>(Some(Ingested {
sample_id,
uri: uri_s,
blake3,
epoch,
when: IngestedWhen::Before,
}));
}
}
let (digest, n_bytes) = match uri {
Ingestable::Path(path) => {
match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
.await?
{
snix_castore::Node::Directory { digest, size } => (digest, size),
snix_castore::Node::File {
digest,
size,
executable: _,
} => (digest, size),
snix_castore::Node::Symlink { target: _ } => {
return Err(anyhow!("TODO: Figure out what to do with symlink roots"))
}
}
}
Ingestable::Url(url) => {
let res = client
.get(url.clone())
.send()
.await
.context(format!("Request.send failed early for {:?}", uri))?
.error_for_status()?;
let mut r = tokio_util::io::StreamReader::new(
res.bytes_stream().map_err(std::io::Error::other),
);
let mut w = blob_service.open_write().await;
let n_bytes = match tokio::io::copy(&mut r, &mut w).await {
Ok(n) => n,
Err(e) => {
return Err(anyhow!(
"tokio::io::copy failed for uri={} with {}",
uri_s,
e
));
}
};
let digest = w.close().await?;
(digest, n_bytes)
}
};
let digest64 = format!("{}", digest);
add_blob.execute(params![digest64, n_bytes,])?;
add_uri.execute(params![uri_s])?;
let (sample_id, epoch) = add_sample
.query_row(params![uri_s, digest64], |row| <(u32, u32)>::try_from(row))?;
Ok(Some(Ingested {
sample_id,
uri: uri_s,
blake3: digest64,
epoch,
when: IngestedWhen::Now,
}))
}
}))
.buffer_unordered(max_parallel)
.collect::<Vec<Result<Option<Ingested>, _>>>()
.await;
samples
}
#[tokio::main]
async fn main() {
let args = Cli::parse();
args.db_path.parent().and_then(|p| {
async fn open_context(
refetch: bool,
max_parallel: usize,
db_path: PathBuf,
castore_path: PathBuf,
) -> SidxContext<Arc<dyn BlobService>, Arc<dyn DirectoryService>> {
if let Some(p) = db_path.parent() {
let _ = std::fs::create_dir_all(p);
Some(())
});
}
let con =
rusqlite::Connection::open(&args.db_path).expect("Failed to construct Database object");
let con = rusqlite::Connection::open(&db_path).expect("Failed to construct Database object");
con.execute_batch(include_str!("q/init.sql"))
.expect("Failed to execute init.sql");
let castore_path = absolute(args.castore_path).expect("Failed to canonicalize castore_path");
let castore_path = absolute(castore_path).expect("Failed to canonicalize castore_path");
let blob_service = blobservice::from_addr(&std::format!(
"objectstore+file://{}",
castore_path
@ -256,20 +183,279 @@ async fn main() {
.await
.expect("Couldn't initialize .castore/directory");
let client = reqwest::Client::new();
SidxContext::<Arc<dyn BlobService>, Arc<dyn DirectoryService>> {
refetch,
max_parallel,
http: reqwest::Client::new(),
con: Arc::new(Mutex::new(con)),
blob_service,
dir_service,
}
}
impl<BS: BlobService + Clone, DS: DirectoryService + Clone> SidxContext<BS, DS> {
/// Looks up the latest recorded download of `uri` in the database.
///
/// Runs the statement in `q/latest-download.sql` with `uri` as its only
/// parameter. Returns `Ok(None)` when the query yields no row; otherwise the
/// row is returned as an [`Ingested`] marked [`IngestedWhen::Before`].
///
/// # Errors
/// Fails if the statement cannot be prepared or executed, or if the stored
/// digest string does not parse as a `B3Digest`.
///
/// # Panics
/// Panics if the connection mutex is poisoned.
async fn db_latest_download(&self, uri: &str) -> Result<Option<Ingested>, Error> {
    let lock = self.con.lock().unwrap();
    // Propagate instead of panicking: a panic here would happen while the
    // guard is held and poison the mutex for every other task.
    let mut find_sample = lock
        .prepare_cached(include_str!("q/latest-download.sql"))
        .context("Failed to prepare latest-download.sql")?;
    let row = find_sample
        .query_row(params![uri], |r| <(u32, String, u32)>::try_from(r))
        .optional()
        .context("db_latest_download.sql")?;
    let Some((sample_id, blake3, epoch)) = row else {
        return Ok(None);
    };
    Ok(Some(Ingested {
        sample_id,
        uri: uri.to_string(),
        blake3: B3Digest::from_str(&blake3)?,
        epoch,
        when: IngestedWhen::Before,
    }))
}
/// Records a sample linking `uri` to the blob digest `blake3`.
///
/// Runs the statement in `q/add-sample.sql` with `(uri, blake3)` and returns
/// the two integers of the resulting row; the caller in this file binds them
/// as `(sample_id, epoch)`.
///
/// # Errors
/// Returns the `rusqlite::Error` raised while preparing or running the
/// statement.
///
/// # Panics
/// Panics if the connection mutex is poisoned.
async fn db_add_sample(&self, uri: &str, blake3: &str) -> Result<(u32, u32), rusqlite::Error> {
    let lock = self.con.lock().unwrap();
    // `?` rather than `expect`: panicking with the guard held would poison
    // the shared connection for all other tasks.
    let mut add_sample = lock.prepare_cached(include_str!("q/add-sample.sql"))?;
    add_sample.query_row(params![uri, blake3], |row| <(u32, u32)>::try_from(row))
}
/// Records a blob with digest `blake3` and size `n_bytes`.
///
/// Runs the statement in `q/upsert-blob.sql` with `(blake3, n_bytes)` and
/// returns the count reported by `execute`.
///
/// # Errors
/// Returns the `rusqlite::Error` raised while preparing or running the
/// statement.
///
/// # Panics
/// Panics if the connection mutex is poisoned.
async fn db_add_blob(&self, blake3: &str, n_bytes: u64) -> Result<usize, rusqlite::Error> {
    let lock = self.con.lock().unwrap();
    // `?` rather than `expect`: panicking with the guard held would poison
    // the shared connection for all other tasks.
    let mut add_blob = lock.prepare_cached(include_str!("q/upsert-blob.sql"))?;
    add_blob.execute(params![blake3, n_bytes])
}
/// Records `uri` in the database.
///
/// Runs the statement in `q/upsert-uri.sql` with `uri` as its only parameter
/// and returns the count reported by `execute`.
///
/// # Errors
/// Returns the `rusqlite::Error` raised while preparing or running the
/// statement.
///
/// # Panics
/// Panics if the connection mutex is poisoned.
async fn db_add_uri(&self, uri: &str) -> Result<usize, rusqlite::Error> {
    let lock = self.con.lock().unwrap();
    // `?` rather than `expect`: panicking with the guard held would poison
    // the shared connection for all other tasks.
    let mut add_uri = lock.prepare_cached(include_str!("q/upsert-uri.sql"))?;
    add_uri.execute(params![uri])
}
/// Writes the database rows for a freshly ingested blob and describes the result.
///
/// In order: registers the blob (`db_add_blob`), registers the URI
/// (`db_add_uri`), then adds a sample tying the two together
/// (`db_add_sample`). The digest is stored in its `Display` form.
///
/// Returns an [`Ingested`] marked [`IngestedWhen::Now`]; the first failing
/// database call is returned as the error.
async fn record_ingested_node(
    &self,
    uri: &str,
    blake3: &snix_castore::B3Digest,
    n_bytes: u64,
) -> Result<Ingested, Error> {
    let digest_text = format!("{blake3}");

    self.db_add_blob(&digest_text, n_bytes).await?;
    self.db_add_uri(uri).await?;
    let (sample_id, epoch) = self.db_add_sample(uri, &digest_text).await?;

    let recorded = Ingested {
        sample_id,
        uri: uri.to_string(),
        blake3: blake3.clone(),
        epoch,
        when: IngestedWhen::Now,
    };
    Ok(recorded)
}
/// Fetches `uri` over HTTP, streams the body into the blob service and
/// records the result, without consulting previously recorded downloads.
///
/// # Errors
/// Fails if the request cannot be sent, the response status is an error,
/// copying the body into the blob writer fails, closing the writer fails, or
/// the database bookkeeping in `record_ingested_node` fails.
async fn download_no_cache(&self, uri: &Url) -> Result<Ingested, Error> {
    let uri_s = uri.to_string();

    let response = self
        .http
        .get(uri.clone())
        .send()
        .await
        .with_context(|| format!("Request::send failed early for {:?}", uri))?
        .error_for_status()?;

    // Adapt the byte stream into an AsyncRead so it can be piped to the writer.
    let body = response.bytes_stream().map_err(std::io::Error::other);
    let mut reader = tokio_util::io::StreamReader::new(body);
    let mut writer = self.blob_service.open_write().await;

    let n_bytes = tokio::io::copy(&mut reader, &mut writer)
        .await
        .map_err(|e| anyhow!("tokio::io::copy failed for uri={} with {}", uri_s, e))?;

    // Closing the writer yields the digest of what was written.
    let digest = writer.close().await?;
    self.record_ingested_node(&uri_s, &digest, n_bytes).await
}
/// Returns an [`Ingested`] for `uri`, reusing a recorded download when allowed.
///
/// When `self.refetch` is set the URL is always downloaded again. Otherwise
/// the latest recorded download is returned if one exists, and the URL is
/// downloaded only when there is none.
async fn download(&self, uri: &Url) -> Result<Ingested, Error> {
    if !self.refetch {
        if let Some(ingested) = self.db_latest_download(&uri.to_string()).await? {
            return Ok(ingested);
        }
    }
    self.download_no_cache(uri).await
}
async fn ingest(&self, inputs: &Vec<Ingestable>) -> Vec<Result<Option<Ingested>, Error>> {
let samples = stream::iter(inputs.iter().map(|uri| {
let blob_service = &self.blob_service;
let dir_service = &self.dir_service;
async move {
let uri_s = uri.to_string();
let latest_download = self.db_latest_download(&uri_s).await?;
if latest_download.is_some() {
return Ok(latest_download);
}
match uri {
Ingestable::Path(path) => {
match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None)
.await?
{
snix_castore::Node::Directory { digest, size } => self
.record_ingested_node(&uri_s, &digest, size)
.await
.map(Some),
snix_castore::Node::File {
digest,
size,
executable: _,
} => self
.record_ingested_node(&uri_s, &digest, size)
.await
.map(Some),
snix_castore::Node::Symlink { target: _ } => {
Err(anyhow!("TODO: Figure out what to do with symlink roots"))
}
}
}
Ingestable::Url(url) => self.download(url).await.map(Some),
}
}
}))
.buffer_unordered(self.max_parallel)
.collect::<Vec<Result<Option<Ingested>, _>>>()
.await;
samples
}
/// Collects the `href` attribute of every `<a>` element in `content`.
///
/// `content` is parsed as an HTML document. Anchors without an `href`
/// attribute are skipped; values are returned verbatim, in the order the
/// selector yields them.
///
/// # Errors
/// Fails only if the `a` selector cannot be parsed.
fn extract_hrefs(content: &str) -> Result<Vec<String>, Error> {
    let anchors = Selector::parse("a").map_err(|e| anyhow!(e.to_string()))?;
    let document = Html::parse_document(content);

    let mut hrefs = Vec::new();
    for anchor in document.select(&anchors) {
        if let Some(href) = anchor.value().attr("href") {
            hrefs.push(href.to_string());
        }
    }
    Ok(hrefs)
}
/// Downloads `url`, reports it on `tx`, and queues every link found in it.
///
/// Steps:
/// 1. Download `url` (subject to the `download` cache policy) and send a
///    `FetchListingMessage::Ingested` for it.
/// 2. If `max_depth` is zero, stop.
/// 3. Read the stored blob back as text, extract the anchors' `href`s, resolve
///    each against `url`, and send a `FetchListingMessage::Recurse` with
///    `max_depth - 1` for it.
///
/// An `href` that cannot be joined onto `url` is logged to stderr and skipped,
/// so one malformed link does not drop the remaining links of the page.
/// Failure to extract anchors is treated as "no links".
///
/// # Errors
/// Fails if the download fails, the receiving side of `tx` is gone, the blob
/// cannot be opened, or its content cannot be read as a UTF-8 string.
async fn fetch_from_listing_impl(
    self: Arc<Self>,
    url: Url,
    max_depth: usize,
    tx: Sender<FetchListingMessage>,
) -> Result<(), Error> {
    eprintln!("Downloading {:?}", url.to_string());
    let root = self.download(&url).await?;
    // Keep only the digest; the `Ingested` itself is handed to the channel.
    let root_digest = root.blake3.clone();
    tx.send(FetchListingMessage::Ingested(url.clone(), root))
        .await
        .context("Stopped accepting tasks before processing an Ingested notification")?;
    if max_depth == 0 {
        return Ok(());
    }
    /* TODO: no need to load blobs to memory unless you know they're text/html */
    let Some(mut reader) = self.blob_service.open_read(&root_digest).await? else {
        return Err(anyhow!("Couldn't read the ingested blob"));
    };
    let content = {
        let mut br = BufReader::new(&mut *reader);
        let mut content = String::new();
        br.read_to_string(&mut content).await?;
        content
    };
    let hrefs = Self::extract_hrefs(&content).unwrap_or_default();
    /* max_depth > 0 here, so the subtraction below cannot underflow */
    for href in hrefs {
        let next_url = match url.join(&href) {
            Ok(next_url) => next_url,
            Err(e) => {
                eprintln!("Skipping href {:?} found at {}: {}", href, url, e);
                continue;
            }
        };
        tx.send(FetchListingMessage::Recurse(next_url, max_depth - 1))
            .await
            .context("Stopped accepting tasks before finishing all hrefs")?;
    }
    Ok(())
}
async fn fetch_from_listing(
self: Arc<Self>,
url: Url,
max_depth: usize,
) -> ReceiverStream<Ingested> {
let mq_size = 10;
/* TODO: move task queue to e.g. sqlite */
let (tx, mut rx) = channel(mq_size);
let (out_tx, out_rx) = channel(mq_size);
let semaphore = Arc::new(Semaphore::new(self.max_parallel));
tokio::spawn({
async move {
let mut seen: HashSet<String> = HashSet::new();
tx.send(FetchListingMessage::Recurse(url, max_depth))
.await
.expect("fetch_from_listing failed populating the queue");
while let Some(m) = rx.recv().await {
match m {
FetchListingMessage::Ingested(_url, ingested) => {
out_tx
.send(ingested)
.await
.expect("ReceiverStream failed to accept an Ingestable");
}
FetchListingMessage::Recurse(url, max_depth) => {
if max_depth > 0 && !seen.contains(&url.to_string()) {
seen.insert(url.to_string());
tokio::spawn({
let s = self.clone();
let url = url.clone();
let tx = tx.clone();
let semaphore = semaphore.clone();
async move {
let _permit = semaphore.acquire();
s.fetch_from_listing_impl(url, max_depth, tx).await
}
});
}
}
}
}
}
});
ReceiverStream::new(out_rx)
}
}
#[tokio::main]
async fn main() {
let args = Cli::parse();
let _cwd = std::env::current_dir().expect("Couldn't get CWD");
let _host_name = std::env::var("HOSTNAME").map_or(None, Some);
let ctx = Arc::new(
open_context(
args.refetch,
args.max_parallel,
args.db_path,
args.castore_path,
)
.await,
);
match args.command {
Some(Command::Ingest { inputs }) => {
let samples = ingest(
&inputs,
args.refetch,
args.max_parallel,
client,
blob_service,
dir_service,
con,
)
.await;
let samples = ctx.ingest(&inputs).await;
for s in samples {
match s {
Err(e) => {
@ -282,6 +468,19 @@ async fn main() {
}
}
}
Some(Command::FetchListing { max_depth, inputs }) => {
let ingested: Vec<Ingested> = stream::iter(inputs)
.then(async |i| {
let i = i.clone();
ctx.clone().fetch_from_listing(i, max_depth).await
})
.flatten_unordered(args.max_parallel)
.collect()
.await;
for i in ingested {
eprintln!("{:?}", i);
}
}
None => {}
}
}