diff --git a/Cargo.lock b/Cargo.lock index ee023c5..dd4233c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -342,6 +342,12 @@ version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.10.1" @@ -512,6 +518,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.100", +] + [[package]] name = "darling" version = "0.20.11" @@ -563,6 +592,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_more" +version = "0.99.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "digest" version = "0.10.7" @@ -591,6 +631,27 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "dtoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + [[package]] name = "either" version = "1.15.0" @@ -727,6 +788,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.31" @@ -816,6 +887,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "gcp_auth" version = "0.12.3" @@ -853,6 +933,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width 0.1.14", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -958,6 +1047,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "html5ever" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c" +dependencies = [ + "log", + "mac", + "markup5ever", + "match_token", +] + [[package]] name = "http" version = "1.3.1" @@ -1429,6 +1530,37 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "match_token" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1509,6 +1641,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1685,6 +1823,58 @@ dependencies = [ "indexmap 2.9.0", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn 2.0.100", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.10" @@ -1744,6 +1934,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettyplease" version = "0.2.32" @@ -2300,6 +2496,21 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527e65d9d888567588db4c12da1087598d0f6f8b346cc2c5abc91f05fc2dffe2" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "precomputed-hash", + "selectors", + "tendril", +] + [[package]] name = "security-framework" version = "2.11.1" @@ -2336,6 +2547,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" +dependencies = [ + "bitflags 2.9.0", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "serde" version = "1.0.219" @@ -2431,6 +2661,15 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "servo_arc" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -2455,8 +2694,11 @@ dependencies = [ "futures", "reqwest", "rusqlite", + "scraper", + "serde_json", "snix-castore", "tokio", + "tokio-stream", "tokio-util", "url", ] @@ -2470,6 +2712,12 @@ dependencies = [ "libc", ] +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.9" @@ -2588,6 +2836,31 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" @@ -2676,6 +2949,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -3129,6 +3413,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf16_iter" version = "1.0.5" diff --git a/Cargo.nix b/Cargo.nix index 22eb2d2..4cf77bc 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -1104,6 +1104,19 @@ rec { }; resolvedDefaultFeatures = [ "default" ]; }; + "byteorder" = rec { + crateName = "byteorder"; + version = "1.5.0"; + edition = "2021"; + sha256 = "0jzncxyf404mwqdbspihyzpkndfgda450l0893pz5xj685cg5l0z"; + authors = [ + "Andrew Gallant " + ]; + features = { + "default" = [ "std" ]; + }; + resolvedDefaultFeatures = [ "default" "std" ]; + }; "bytes" = rec { crateName = "bytes"; version = "1.10.1"; @@ -1588,6 +1601,65 @@ rec { "rand_core" = [ "dep:rand_core" ]; }; }; + "cssparser" = rec { + crateName = "cssparser"; + version = "0.34.0"; + edition = "2018"; + sha256 = "1qx3hha392szcl812l6hp0d4029gg8x62cl4nf0byqgdv0f6vimp"; + authors = [ + "Simon Sapin " + ]; + dependencies = [ + { + name = "cssparser-macros"; + packageId = "cssparser-macros"; + } + { + name = "dtoa-short"; + packageId = "dtoa-short"; + } + { + name = "itoa"; + packageId = "itoa"; + } + { + name = "phf"; + packageId = "phf"; + features = [ "macros" ]; + } + { + name = "smallvec"; + packageId = "smallvec"; + } + ]; + features = { + "serde" = [ "dep:serde" ]; + }; + }; + "cssparser-macros" = rec { + crateName = "cssparser-macros"; + version = "0.6.1"; + edition = "2018"; + sha256 = "0cfkzj60avrnskdmaf7f8zw6pp3di4ylplk455zrzaf19ax8id8k"; + procMacro = true; + libName = "cssparser_macros"; + libPath = "lib.rs"; + authors = [ + "Simon Sapin " + ]; + dependencies = [ + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + features = [ "full" "extra-traits" ]; + } + ]; + + }; "darling" = rec { crateName = "darling"; version = "0.20.11"; @@ -1731,6 +1803,49 @@ rec { }; resolvedDefaultFeatures = [ "alloc" "powerfmt" "serde" "std" ]; }; + "derive_more" = rec { + crateName = "derive_more"; + version = "0.99.19"; + edition = "2018"; + sha256 = "17y6g78dg31fsv7z4p455bzxs670spg476ww2ibg3mj3vww9m8ix"; + procMacro = true; + authors = [ + "Jelte Fennema " + ]; + dependencies = [ + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + } + ]; + features = { + "convert_case" = [ "dep:convert_case" ]; + "default" = [ "add_assign" "add" "as_mut" "as_ref" "constructor" "deref" "deref_mut" "display" "error" "from" "from_str" "index" "index_mut" "into" "into_iterator" "iterator" "mul_assign" "mul" "not" "sum" "try_into" "is_variant" "unwrap" ]; + "display" = [ "syn/extra-traits" ]; + "error" = [ "syn/extra-traits" ]; + "from" = [ "syn/extra-traits" ]; + "generate-parsing-rs" = [ "peg" ]; + "into" = [ "syn/extra-traits" ]; + "is_variant" = [ "convert_case" ]; + "mul" = [ "syn/extra-traits" ]; + "mul_assign" = [ "syn/extra-traits" ]; + "not" = [ "syn/extra-traits" ]; + "peg" = [ "dep:peg" ]; + "rustc_version" = [ "dep:rustc_version" ]; + "testing-helpers" = [ "rustc_version" ]; + "try_into" = [ "syn/extra-traits" ]; + "unwrap" = [ "convert_case" "rustc_version" ]; + }; + resolvedDefaultFeatures = [ "add" "add_assign" ]; + }; "digest" = rec { crateName = "digest"; version = "0.10.7"; @@ -1810,6 +1925,49 @@ rec { features = { }; }; + "dtoa" = rec { + crateName = "dtoa"; + version = "1.0.10"; + edition = "2018"; + sha256 = "016gid01rarcdv57h049d7nr9daxc2hc2gqzx0mji57krywd7bfn"; + authors = [ + "David Tolnay " + ]; + features = { + "no-panic" = [ "dep:no-panic" ]; + }; + }; + "dtoa-short" = rec { + crateName = "dtoa-short"; + version = "0.3.5"; + edition = "2015"; + sha256 = "11rwnkgql5jilsmwxpx6hjzkgyrbdmx1d71s0jyrjqm5nski25fd"; + libName = "dtoa_short"; + authors = [ + "Xidorn Quan " + ]; + dependencies = [ + { + name = "dtoa"; + packageId = "dtoa"; + } + ]; + + }; + "ego-tree" = rec { + crateName = "ego-tree"; + version = "0.10.0"; + edition = "2021"; + sha256 = "1n2csy99chk5v5vzjl0ff79vxpxhl76xmcb3aj6brrzzipmjz5xj"; + libName = "ego_tree"; + authors = [ + "June McEnroe " + "Carlo Federico Vescovo " + ]; + features = { + "serde" = [ "dep:serde" ]; + }; + }; "either" = rec { crateName = "either"; version = "1.15.0"; @@ -2155,6 +2313,26 @@ rec { }; resolvedDefaultFeatures = [ "alloc" "default" "std" ]; }; + "futf" = rec { + crateName = "futf"; + version = "0.1.5"; + edition = "2015"; + sha256 = "0hvqk2r7v4fnc34hvc3vkri89gn52d5m9ihygmwn75l1hhp0whnz"; + authors = [ + "Keegan McAllister " + ]; + dependencies = [ + { + name = "mac"; + packageId = "mac"; + } + { + name = "new_debug_unreachable"; + packageId = "new_debug_unreachable"; + } + ]; + + }; "futures" = rec { crateName = "futures"; version = "0.3.31"; @@ -2433,6 +2611,23 @@ rec { }; resolvedDefaultFeatures = [ "alloc" "async-await" "async-await-macro" "channel" "default" "futures-channel" "futures-io" "futures-macro" "futures-sink" "io" "memchr" "sink" "slab" "std" ]; }; + "fxhash" = rec { + crateName = "fxhash"; + version = "0.2.1"; + edition = "2015"; + sha256 = "037mb9ichariqi45xm6mz0b11pa92gj38ba0409z3iz239sns6y3"; + libPath = "lib.rs"; + authors = [ + "cbreeden " + ]; + dependencies = [ + { + name = "byteorder"; + packageId = "byteorder"; + } + ]; + + }; "gcp_auth" = rec { crateName = "gcp_auth"; version = "0.12.3"; @@ -2565,6 +2760,26 @@ rec { }; resolvedDefaultFeatures = [ "more_lengths" ]; }; + "getopts" = rec { + crateName = "getopts"; + version = "0.2.21"; + edition = "2015"; + sha256 = "1mgb3qvivi26gs6ihqqhh8iyhp3vgxri6vwyrwg28w0xqzavznql"; + authors = [ + "The Rust Project Developers" + ]; + dependencies = [ + { + name = "unicode-width"; + packageId = "unicode-width 0.1.14"; + } + ]; + features = { + "core" = [ "dep:core" ]; + "rustc-dep-of-std" = [ "unicode-width/rustc-dep-of-std" "std" "core" ]; + "std" = [ "dep:std" ]; + }; + }; "getrandom 0.2.15" = rec { crateName = "getrandom"; version = "0.2.15"; @@ -2938,6 +3153,35 @@ rec { ]; }; + "html5ever" = rec { + crateName = "html5ever"; + version = "0.29.1"; + edition = "2021"; + sha256 = "07518h5gbw0c6x7w5br76bgxvgphs6zlrb4q7ii7bg1ww7510x1v"; + authors = [ + "The html5ever Project Developers" + ]; + dependencies = [ + { + name = "log"; + packageId = "log"; + } + { + name = "mac"; + packageId = "mac"; + } + { + name = "markup5ever"; + packageId = "markup5ever"; + } + { + name = "match_token"; + packageId = "match_token"; + } + ]; + features = { + }; + }; "http" = rec { crateName = "http"; version = "1.3.1"; @@ -4471,6 +4715,78 @@ rec { }; resolvedDefaultFeatures = [ "std" ]; }; + "mac" = rec { + crateName = "mac"; + version = "0.1.1"; + edition = "2015"; + sha256 = "194vc7vrshqff72rl56f9xgb0cazyl4jda7qsv31m5l6xx7hq7n4"; + authors = [ + "Jonathan Reem " + ]; + + }; + "markup5ever" = rec { + crateName = "markup5ever"; + version = "0.14.1"; + edition = "2021"; + sha256 = "063sdq7hwxn2al9ygify8dd96mj57n9c4lig007lr1p128yj39y7"; + libPath = "lib.rs"; + authors = [ + "The html5ever Project Developers" + ]; + dependencies = [ + { + name = "log"; + packageId = "log"; + } + { + name = "phf"; + packageId = "phf"; + } + { + name = "string_cache"; + packageId = "string_cache"; + } + { + name = "tendril"; + packageId = "tendril"; + } + ]; + buildDependencies = [ + { + name = "phf_codegen"; + packageId = "phf_codegen"; + } + { + name = "string_cache_codegen"; + packageId = "string_cache_codegen"; + } + ]; + + }; + "match_token" = rec { + crateName = "match_token"; + version = "0.1.0"; + edition = "2021"; + sha256 = "0sx3212vkjqfblfhr556ayabbjflbigjf5j591j9kgs4infniac8"; + procMacro = true; + dependencies = [ + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + features = [ "full" ]; + } + ]; + + }; "matchers" = rec { crateName = "matchers"; version = "0.1.0"; @@ -4717,6 +5033,18 @@ rec { "vendored" = [ "openssl/vendored" ]; }; }; + "new_debug_unreachable" = rec { + crateName = "new_debug_unreachable"; + version = "1.0.6"; + edition = "2021"; + sha256 = "11phpf1mjxq6khk91yzcbd3ympm78m3ivl7xg6lg2c0lf66fy3k5"; + libName = "debug_unreachable"; + authors = [ + "Matt Brubeck " + "Jonathan Reem " + ]; + + }; "nu-ansi-term" = rec { crateName = "nu-ansi-term"; version = "0.46.0"; @@ -5280,6 +5608,142 @@ rec { "unstable" = [ "generate" ]; }; }; + "phf" = rec { + crateName = "phf"; + version = "0.11.3"; + edition = "2021"; + sha256 = "0y6hxp1d48rx2434wgi5g8j1pr8s5jja29ha2b65435fh057imhz"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_macros"; + packageId = "phf_macros"; + optional = true; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + usesDefaultFeatures = false; + } + ]; + features = { + "default" = [ "std" ]; + "macros" = [ "phf_macros" ]; + "phf_macros" = [ "dep:phf_macros" ]; + "serde" = [ "dep:serde" ]; + "std" = [ "phf_shared/std" ]; + "uncased" = [ "phf_shared/uncased" ]; + "unicase" = [ "phf_macros?/unicase" "phf_shared/unicase" ]; + }; + resolvedDefaultFeatures = [ "default" "macros" "phf_macros" "std" ]; + }; + "phf_codegen" = rec { + crateName = "phf_codegen"; + version = "0.11.3"; + edition = "2021"; + sha256 = "0si1n6zr93kzjs3wah04ikw8z6npsr39jw4dam8yi9czg2609y5f"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_generator"; + packageId = "phf_generator"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + } + ]; + + }; + "phf_generator" = rec { + crateName = "phf_generator"; + version = "0.11.3"; + edition = "2021"; + crateBin = []; + sha256 = "0gc4np7s91ynrgw73s2i7iakhb4lzdv1gcyx7yhlc0n214a2701w"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_shared"; + packageId = "phf_shared"; + usesDefaultFeatures = false; + } + { + name = "rand"; + packageId = "rand 0.8.5"; + usesDefaultFeatures = false; + features = [ "small_rng" ]; + } + ]; + features = { + "criterion" = [ "dep:criterion" ]; + }; + }; + "phf_macros" = rec { + crateName = "phf_macros"; + version = "0.11.3"; + edition = "2021"; + sha256 = "05kjfbyb439344rhmlzzw0f9bwk9fp95mmw56zs7yfn1552c0jpq"; + procMacro = true; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "phf_generator"; + packageId = "phf_generator"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + usesDefaultFeatures = false; + } + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + { + name = "syn"; + packageId = "syn 2.0.100"; + features = [ "full" ]; + } + ]; + features = { + "unicase" = [ "unicase_" "phf_shared/unicase" ]; + "unicase_" = [ "dep:unicase_" ]; + }; + }; + "phf_shared" = rec { + crateName = "phf_shared"; + version = "0.11.3"; + edition = "2021"; + sha256 = "1rallyvh28jqd9i916gk5gk2igdmzlgvv5q0l3xbf3m6y8pbrsk7"; + authors = [ + "Steven Fackler " + ]; + dependencies = [ + { + name = "siphasher"; + packageId = "siphasher"; + } + ]; + features = { + "default" = [ "std" ]; + "uncased" = [ "dep:uncased" ]; + "unicase" = [ "dep:unicase" ]; + }; + resolvedDefaultFeatures = [ "default" "std" ]; + }; "pin-project" = rec { crateName = "pin-project"; version = "1.1.10"; @@ -5397,6 +5861,17 @@ rec { }; resolvedDefaultFeatures = [ "simd" "std" ]; }; + "precomputed-hash" = rec { + crateName = "precomputed-hash"; + version = "0.1.1"; + edition = "2015"; + sha256 = "075k9bfy39jhs53cb2fpb9klfakx2glxnf28zdw08ws6lgpq6lwj"; + libName = "precomputed_hash"; + authors = [ + "Emilio Cobos Álvarez " + ]; + + }; "prettyplease" = rec { crateName = "prettyplease"; version = "0.2.32"; @@ -7485,6 +7960,56 @@ rec { "default" = [ "use_std" ]; }; }; + "scraper" = rec { + crateName = "scraper"; + version = "0.23.1"; + edition = "2021"; + crateBin = []; + sha256 = "1qpz5py0a7y9mg2w4v1lidphz3arhw8dl4jcvf47aml8v3cnazjj"; + authors = [ + "June McEnroe " + ]; + dependencies = [ + { + name = "cssparser"; + packageId = "cssparser"; + } + { + name = "ego-tree"; + packageId = "ego-tree"; + } + { + name = "getopts"; + packageId = "getopts"; + optional = true; + } + { + name = "html5ever"; + packageId = "html5ever"; + } + { + name = "precomputed-hash"; + packageId = "precomputed-hash"; + } + { + name = "selectors"; + packageId = "selectors"; + } + { + name = "tendril"; + packageId = "tendril"; + } + ]; + features = { + "default" = [ "main" "errors" ]; + "deterministic" = [ "indexmap" ]; + "getopts" = [ "dep:getopts" ]; + "indexmap" = [ "dep:indexmap" ]; + "main" = [ "getopts" ]; + "serde" = [ "dep:serde" ]; + }; + resolvedDefaultFeatures = [ "default" "errors" "getopts" "main" ]; + }; "security-framework 2.11.1" = rec { crateName = "security-framework"; version = "2.11.1"; @@ -7607,6 +8132,69 @@ rec { }; resolvedDefaultFeatures = [ "OSX_10_10" "OSX_10_11" "OSX_10_12" "OSX_10_9" "default" ]; }; + "selectors" = rec { + crateName = "selectors"; + version = "0.26.0"; + edition = "2021"; + sha256 = "1s3zv30rqgdvil7mnfr4xq5nb9m8yp0sai42l28y565mkd68lmpx"; + libPath = "lib.rs"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "bitflags"; + packageId = "bitflags 2.9.0"; + } + { + name = "cssparser"; + packageId = "cssparser"; + } + { + name = "derive_more"; + packageId = "derive_more"; + usesDefaultFeatures = false; + features = [ "add" "add_assign" ]; + } + { + name = "fxhash"; + packageId = "fxhash"; + } + { + name = "log"; + packageId = "log"; + } + { + name = "new_debug_unreachable"; + packageId = "new_debug_unreachable"; + } + { + name = "phf"; + packageId = "phf"; + } + { + name = "precomputed-hash"; + packageId = "precomputed-hash"; + } + { + name = "servo_arc"; + packageId = "servo_arc"; + } + { + name = "smallvec"; + packageId = "smallvec"; + } + ]; + buildDependencies = [ + { + name = "phf_codegen"; + packageId = "phf_codegen"; + } + ]; + features = { + "to_shmem" = [ "dep:to_shmem" "dep:to_shmem_derive" ]; + }; + }; "serde" = rec { crateName = "serde"; version = "1.0.219"; @@ -7951,6 +8539,26 @@ rec { features = { }; }; + "servo_arc" = rec { + crateName = "servo_arc"; + version = "0.4.0"; + edition = "2021"; + sha256 = "06ljch4isnnbv1xpwhjajz4a4mpc7ki47ys9n9yn98kqjhjc8rdf"; + libPath = "lib.rs"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "stable_deref_trait"; + packageId = "stable_deref_trait"; + } + ]; + features = { + "serde" = [ "dep:serde" ]; + "servo" = [ "serde" "track_alloc_size" ]; + }; + }; "sharded-slab" = rec { crateName = "sharded-slab"; version = "0.1.7"; @@ -8021,6 +8629,14 @@ rec { name = "rusqlite"; packageId = "rusqlite"; } + { + name = "scraper"; + packageId = "scraper"; + } + { + name = "serde_json"; + packageId = "serde_json"; + } { name = "snix-castore"; packageId = "snix-castore"; @@ -8058,6 +8674,23 @@ rec { ]; }; + "siphasher" = rec { + crateName = "siphasher"; + version = "1.0.1"; + edition = "2018"; + sha256 = "17f35782ma3fn6sh21c027kjmd227xyrx06ffi8gw4xzv9yry6an"; + authors = [ + "Frank Denis " + ]; + features = { + "default" = [ "std" ]; + "serde" = [ "dep:serde" ]; + "serde_json" = [ "dep:serde_json" ]; + "serde_no_std" = [ "serde/alloc" ]; + "serde_std" = [ "std" "serde/std" ]; + }; + resolvedDefaultFeatures = [ "default" "std" ]; + }; "slab" = rec { crateName = "slab"; version = "0.4.9"; @@ -8477,7 +9110,75 @@ rec { "default" = [ "std" ]; "std" = [ "alloc" ]; }; - resolvedDefaultFeatures = [ "alloc" ]; + resolvedDefaultFeatures = [ "alloc" "default" "std" ]; + }; + "string_cache" = rec { + crateName = "string_cache"; + version = "0.8.9"; + edition = "2018"; + sha256 = "03z7km2kzlwiv2r2qifq5riv4g8phazwng9wnvs3py3lzainnxxz"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "new_debug_unreachable"; + packageId = "new_debug_unreachable"; + } + { + name = "parking_lot"; + packageId = "parking_lot"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + } + { + name = "precomputed-hash"; + packageId = "precomputed-hash"; + } + { + name = "serde"; + packageId = "serde"; + optional = true; + } + ]; + features = { + "default" = [ "serde_support" ]; + "malloc_size_of" = [ "dep:malloc_size_of" ]; + "serde" = [ "dep:serde" ]; + "serde_support" = [ "serde" ]; + }; + resolvedDefaultFeatures = [ "default" "serde" "serde_support" ]; + }; + "string_cache_codegen" = rec { + crateName = "string_cache_codegen"; + version = "0.5.4"; + edition = "2018"; + sha256 = "181ir4d6y053s1kka2idpjx5g9d9jgll6fy517jhzzpi2n3r44f7"; + libPath = "lib.rs"; + authors = [ + "The Servo Project Developers" + ]; + dependencies = [ + { + name = "phf_generator"; + packageId = "phf_generator"; + } + { + name = "phf_shared"; + packageId = "phf_shared"; + } + { + name = "proc-macro2"; + packageId = "proc-macro2"; + } + { + name = "quote"; + packageId = "quote"; + } + ]; + }; "strsim" = rec { crateName = "strsim"; @@ -8718,6 +9419,35 @@ rec { }; resolvedDefaultFeatures = [ "default" "getrandom" ]; }; + "tendril" = rec { + crateName = "tendril"; + version = "0.4.3"; + edition = "2015"; + sha256 = "1c3vip59sqwxn148i714nmkrvjzbk7105vj0h92s6r64bw614jnj"; + authors = [ + "Keegan McAllister " + "Simon Sapin " + "Chris Morgan " + ]; + dependencies = [ + { + name = "futf"; + packageId = "futf"; + } + { + name = "mac"; + packageId = "mac"; + } + { + name = "utf-8"; + packageId = "utf-8"; + } + ]; + features = { + "encoding" = [ "dep:encoding" ]; + "encoding_rs" = [ "dep:encoding_rs" ]; + }; + }; "thiserror 1.0.69" = rec { crateName = "thiserror"; version = "1.0.69"; @@ -10325,6 +11055,17 @@ rec { }; resolvedDefaultFeatures = [ "default" "std" ]; }; + "utf-8" = rec { + crateName = "utf-8"; + version = "0.7.6"; + edition = "2015"; + sha256 = "1a9ns3fvgird0snjkd3wbdhwd3zdpc2h5gpyybrfr6ra5pkqxk09"; + libName = "utf8"; + authors = [ + "Simon Sapin " + ]; + + }; "utf16_iter" = rec { crateName = "utf16_iter"; version = "1.0.5"; diff --git a/Cargo.toml b/Cargo.toml index 9b6b61b..f8e09cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,10 @@ clap = "4.5.35" futures = "0.3.31" reqwest = "0.12.15" rusqlite = "0.34.0" +scraper = "0.23.1" +serde_json = "1.0.140" snix-castore = { version = "0.1.0", git = "https://git.snix.dev/snix/snix.git" } tokio = "1.44.2" +tokio-stream = "0.1.17" tokio-util = "0.7.14" url = "2.5.4" diff --git a/shell.nix b/shell.nix index 0336a60..b628aff 100644 --- a/shell.nix +++ b/shell.nix @@ -31,4 +31,8 @@ mkShell { openssl sqlite ]; + shellHook = '' + export DATABASE_PATH="$HOME/.local/share/sidx/sidx.db" + unset out outputs phases + ''; } diff --git a/src/main.rs b/src/main.rs index 835d79d..faf4ef7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,24 @@ +use std::collections::HashSet; use std::path::{absolute, PathBuf}; +use std::str::FromStr; +use std::sync::Arc; -use anyhow::anyhow; use anyhow::Context; +use anyhow::{anyhow, Error}; use clap::Parser; use clap::Subcommand; use futures::{stream, StreamExt, TryStreamExt}; use rusqlite::{params, OptionalExtension}; +use scraper::{Html, Selector}; +use snix_castore::blobservice::BlobService; +use snix_castore::directoryservice::DirectoryService; +use snix_castore::B3Digest; use snix_castore::{blobservice, directoryservice, import::fs::ingest_path}; +use std::sync::Mutex; +use tokio::io::{AsyncReadExt, BufReader}; +use tokio::sync::mpsc::{channel, Sender}; +use tokio::sync::Semaphore; +use tokio_stream::wrappers::ReceiverStream; use url::Url; #[derive(Clone, Debug)] @@ -15,22 +27,28 @@ enum Ingestable { Path(PathBuf), } -#[derive(Debug)] +#[derive(Debug, Clone)] enum IngestedWhen { Now, Before, } -#[derive(Debug)] +#[derive(Debug, Clone)] #[allow(dead_code)] struct Ingested { sample_id: u32, uri: String, - blake3: String, + blake3: B3Digest, epoch: u32, when: IngestedWhen, } +#[derive(Clone)] +enum FetchListingMessage { + Ingested(Url, Ingested), + Recurse(Url, usize), +} + impl std::fmt::Display for Ingestable { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -45,7 +63,7 @@ impl std::fmt::Display for Ingestable { } } -fn parse_url_or_path(s: &str) -> Result { +fn parse_url_or_path(s: &str) -> Result { if s.is_empty() { Err(anyhow!("Empty path (url)")) } else if s.starts_with("./") || s.starts_with("/") { @@ -69,7 +87,7 @@ fn parse_url_or_path(s: &str) -> Result { fn data_path() -> PathBuf { let xdg_data_dir = std::env::var("XDG_DATA_DIR") .and_then(|s| Ok(PathBuf::from(s))) - .or_else(|_| -> Result { + .or_else(|_| -> Result { match std::env::home_dir() { Some(p) => Ok(p.join(".local/share")), None => Err(anyhow!("...")), // FIXME @@ -93,6 +111,12 @@ enum Command { #[clap(value_parser = parse_url_or_path, num_args = 1)] inputs: Vec, }, + FetchListing { + #[clap(value_parser, long, default_value_t = 5)] + max_depth: usize, + #[clap(value_parser, num_args = 1)] + inputs: Vec, + }, } #[derive(Parser)] @@ -100,7 +124,7 @@ struct Cli { #[clap(short, long, action)] refetch: bool, - #[clap(short, long, value_parser, default_value_t = 5)] + #[clap(short, long, value_parser, default_value_t = 4)] max_parallel: usize, #[clap(short, long, value_parser, default_value_os_t = default_db_path())] @@ -113,130 +137,33 @@ struct Cli { command: Option, } -async fn ingest( - inputs: &Vec, +struct SidxContext +where + BS: blobservice::BlobService + Clone + Send + 'static, + DS: directoryservice::DirectoryService + Clone + Send + 'static, +{ refetch: bool, max_parallel: usize, - http_client: reqwest::Client, + http: reqwest::Client, + con: Arc>, blob_service: BS, dir_service: DS, - con: rusqlite::Connection, -) -> Vec, anyhow::Error>> -where - BS: blobservice::BlobService, - DS: directoryservice::DirectoryService, -{ - let samples = stream::iter(inputs.iter().map(|uri| { - let client = &http_client; - let blob_service = &blob_service; - let dir_service = &dir_service; - let con = &con; - - let mut find_sample = con - .prepare(include_str!("q/latest-download.sql")) - .expect("Failed to prepare latest-download.sql"); - let mut add_sample = con - .prepare(include_str!("q/add-sample.sql")) - .expect("Failed to prepare add-sample.sql"); - let mut add_blob = con - .prepare(include_str!("q/upsert-blob.sql")) - .expect("Failed to prepare upsert-blob.sql"); - let mut add_uri = con - .prepare(include_str!("q/upsert-uri.sql")) - .expect("Failed to prepare upsert-uri.sql"); - - async move { - let uri_s = uri.to_string(); - let latest_download = find_sample - .query_row(params![uri_s], |r| <(u32, String, u32)>::try_from(r)) - .optional()?; - if let Some((sample_id, blake3, epoch)) = latest_download { - if !refetch { - return Ok::, anyhow::Error>(Some(Ingested { - sample_id, - uri: uri_s, - blake3, - epoch, - when: IngestedWhen::Before, - })); - } - } - let (digest, n_bytes) = match uri { - Ingestable::Path(path) => { - match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None) - .await? - { - snix_castore::Node::Directory { digest, size } => (digest, size), - snix_castore::Node::File { - digest, - size, - executable: _, - } => (digest, size), - snix_castore::Node::Symlink { target: _ } => { - return Err(anyhow!("TODO: Figure out what to do with symlink roots")) - } - } - } - Ingestable::Url(url) => { - let res = client - .get(url.clone()) - .send() - .await - .context(format!("Request.send failed early for {:?}", uri))? - .error_for_status()?; - let mut r = tokio_util::io::StreamReader::new( - res.bytes_stream().map_err(std::io::Error::other), - ); - let mut w = blob_service.open_write().await; - let n_bytes = match tokio::io::copy(&mut r, &mut w).await { - Ok(n) => n, - Err(e) => { - return Err(anyhow!( - "tokio::io::copy failed for uri={} with {}", - uri_s, - e - )); - } - }; - let digest = w.close().await?; - (digest, n_bytes) - } - }; - let digest64 = format!("{}", digest); - add_blob.execute(params![digest64, n_bytes,])?; - add_uri.execute(params![uri_s])?; - let (sample_id, epoch) = add_sample - .query_row(params![uri_s, digest64], |row| <(u32, u32)>::try_from(row))?; - Ok(Some(Ingested { - sample_id, - uri: uri_s, - blake3: digest64, - epoch, - when: IngestedWhen::Now, - })) - } - })) - .buffer_unordered(max_parallel) - .collect::, _>>>() - .await; - - samples } -#[tokio::main] -async fn main() { - let args = Cli::parse(); - - args.db_path.parent().and_then(|p| { +async fn open_context( + refetch: bool, + max_parallel: usize, + db_path: PathBuf, + castore_path: PathBuf, +) -> SidxContext, Arc> { + if let Some(p) = db_path.parent() { let _ = std::fs::create_dir_all(p); - Some(()) - }); + } - let con = - rusqlite::Connection::open(&args.db_path).expect("Failed to construct Database object"); + let con = rusqlite::Connection::open(&db_path).expect("Failed to construct Database object"); con.execute_batch(include_str!("q/init.sql")) .expect("Failed to execute init.sql"); - let castore_path = absolute(args.castore_path).expect("Failed to canonicalize castore_path"); + let castore_path = absolute(castore_path).expect("Failed to canonicalize castore_path"); let blob_service = blobservice::from_addr(&std::format!( "objectstore+file://{}", castore_path @@ -256,20 +183,279 @@ async fn main() { .await .expect("Couldn't initialize .castore/directory"); - let client = reqwest::Client::new(); + SidxContext::, Arc> { + refetch, + max_parallel, + http: reqwest::Client::new(), + con: Arc::new(Mutex::new(con)), + blob_service, + dir_service, + } +} + +impl SidxContext { + async fn db_latest_download(&self, uri: &str) -> Result, Error> { + let lock = self.con.lock().unwrap(); + let mut find_sample = lock + .prepare_cached(include_str!("q/latest-download.sql")) + .expect("Failed to prepare latest-download.sql"); + find_sample + .query_row(params![uri], |r| <(u32, String, u32)>::try_from(r)) + .optional() + .context("db_latest_download.sql") + .and_then(|maybe_triple| match maybe_triple { + Some((sample_id, blake3, epoch)) => Ok(Some(Ingested { + sample_id, + uri: uri.to_string(), + blake3: B3Digest::from_str(&blake3)?, + epoch, + when: IngestedWhen::Before, + })), + None => Ok(None), + }) + } + async fn db_add_sample(&self, uri: &str, blake3: &str) -> Result<(u32, u32), rusqlite::Error> { + let lock = self.con.lock().unwrap(); + let mut add_sample = lock + .prepare_cached(include_str!("q/add-sample.sql")) + .expect("Failed to prepare add-sample.sql"); + add_sample.query_row(params![uri, blake3], |row| <(u32, u32)>::try_from(row)) + } + async fn db_add_blob(&self, blake3: &str, n_bytes: u64) -> Result { + let lock = self.con.lock().unwrap(); + let mut add_blob = lock + .prepare_cached(include_str!("q/upsert-blob.sql")) + .expect("Failed to prepare upsert-blob.sql"); + add_blob.execute(params![blake3, n_bytes,]) + } + async fn db_add_uri(&self, uri: &str) -> Result { + let lock = self.con.lock().unwrap(); + let mut add_uri = lock + .prepare_cached(include_str!("q/upsert-uri.sql")) + .expect("Failed to prepare upsert-uri.sql"); + + add_uri.execute(params![uri]) + } + async fn record_ingested_node( + &self, + uri: &str, + blake3: &snix_castore::B3Digest, + n_bytes: u64, + ) -> Result { + let digest64 = format!("{}", blake3); + self.db_add_blob(&digest64, n_bytes).await?; + self.db_add_uri(&uri).await?; + let (sample_id, epoch) = self.db_add_sample(&uri, &digest64).await?; + Ok(Ingested { + sample_id, + uri: uri.to_string(), + blake3: blake3.clone(), + epoch, + when: IngestedWhen::Now, + }) + } + async fn download_no_cache(&self, uri: &Url) -> Result { + let uri_s = uri.to_string(); + let res = self + .http + .get(uri.clone()) + .send() + .await + .context(format!("Request::send failed early for {:?}", uri))? + .error_for_status()?; + let mut r = + tokio_util::io::StreamReader::new(res.bytes_stream().map_err(std::io::Error::other)); + let mut w = self.blob_service.open_write().await; + let n_bytes = match tokio::io::copy(&mut r, &mut w).await { + Ok(n) => n, + Err(e) => { + return Err(anyhow!( + "tokio::io::copy failed for uri={} with {}", + uri_s, + e + )); + } + }; + let digest = w.close().await?; + self.record_ingested_node(&uri_s, &digest, n_bytes).await + } + async fn download(&self, uri: &Url) -> Result { + if self.refetch { + self.download_no_cache(&uri).await + } else { + match self.db_latest_download(&uri.to_string()).await? { + Some(ingested) => Ok(ingested), + None => self.download_no_cache(&uri).await, + } + } + } + async fn ingest(&self, inputs: &Vec) -> Vec, Error>> { + let samples = stream::iter(inputs.iter().map(|uri| { + let blob_service = &self.blob_service; + let dir_service = &self.dir_service; + + async move { + let uri_s = uri.to_string(); + let latest_download = self.db_latest_download(&uri_s).await?; + if latest_download.is_some() { + return Ok(latest_download); + } + match uri { + Ingestable::Path(path) => { + match ingest_path::<_, _, _, &[u8]>(&blob_service, &dir_service, path, None) + .await? + { + snix_castore::Node::Directory { digest, size } => self + .record_ingested_node(&uri_s, &digest, size) + .await + .map(Some), + + snix_castore::Node::File { + digest, + size, + executable: _, + } => self + .record_ingested_node(&uri_s, &digest, size) + .await + .map(Some), + snix_castore::Node::Symlink { target: _ } => { + Err(anyhow!("TODO: Figure out what to do with symlink roots")) + } + } + } + Ingestable::Url(url) => self.download(url).await.map(Some), + } + } + })) + .buffer_unordered(self.max_parallel) + .collect::, _>>>() + .await; + + samples + } + + fn extract_hrefs(content: &str) -> Result, Error> { + let sel = Selector::parse("a").map_err(|e| anyhow!(e.to_string()))?; + let html = Html::parse_document(&content); + + Ok(html + .select(&sel) + .flat_map(|elt| elt.value().attr("href")) + .map(|s| s.to_string()) + .collect::>()) + } + + async fn fetch_from_listing_impl( + self: Arc, + url: Url, + max_depth: usize, + tx: Sender, + ) -> Result<(), Error> { + eprintln!("Downloading {:?}", url.to_string()); + let root = self.download(&url).await?; + tx.send(FetchListingMessage::Ingested(url.clone(), root.clone())) + .await + .context("Stopped accepting tasks before processing an Ingested notification")?; + if max_depth <= 0 { + return Ok(()); + } + /* TODO: no need to load blobs to memory unless you know they're text/html */ + match self.blob_service.open_read(&root.blake3).await? { + Some(mut reader) => { + let content = { + let mut br = BufReader::new(&mut *reader); + let mut content = String::new(); + br.read_to_string(&mut content).await?; + content + }; + let hrefs = Self::extract_hrefs(&content).unwrap_or(vec![]); + /* max_depth > 0 here */ + for href in hrefs { + let next_url = url.join(&href).context("Constructing next_url")?; + tx.send(FetchListingMessage::Recurse( + next_url.clone(), + max_depth - 1, + )) + .await + .context("Stopped accepting tasks before finishing all hrefs")?; + } + Ok(()) + } + None => Err(anyhow!("Couldn't read the ingested blob")), + } + } + + async fn fetch_from_listing( + self: Arc, + url: Url, + max_depth: usize, + ) -> ReceiverStream { + let mq_size = 10; + + /* TODO: move task queue to e.g. sqlite */ + let (tx, mut rx) = channel(mq_size); + + let (out_tx, out_rx) = channel(mq_size); + + let semaphore = Arc::new(Semaphore::new(self.max_parallel)); + + tokio::spawn({ + async move { + let mut seen: HashSet = HashSet::new(); + tx.send(FetchListingMessage::Recurse(url, max_depth)) + .await + .expect("fetch_from_listing failed populating the queue"); + while let Some(m) = rx.recv().await { + match m { + FetchListingMessage::Ingested(_url, ingested) => { + out_tx + .send(ingested) + .await + .expect("ReceiverStream failed to accept an Ingestable"); + } + FetchListingMessage::Recurse(url, max_depth) => { + if max_depth > 0 && !seen.contains(&url.to_string()) { + seen.insert(url.to_string()); + tokio::spawn({ + let s = self.clone(); + let url = url.clone(); + let tx = tx.clone(); + let semaphore = semaphore.clone(); + async move { + let _permit = semaphore.acquire(); + s.fetch_from_listing_impl(url, max_depth, tx).await + } + }); + } + } + } + } + } + }); + ReceiverStream::new(out_rx) + } +} + +#[tokio::main] +async fn main() { + let args = Cli::parse(); + + let _cwd = std::env::current_dir().expect("Couldn't get CWD"); + let _host_name = std::env::var("HOSTNAME").map_or(None, Some); + + let ctx = Arc::new( + open_context( + args.refetch, + args.max_parallel, + args.db_path, + args.castore_path, + ) + .await, + ); match args.command { Some(Command::Ingest { inputs }) => { - let samples = ingest( - &inputs, - args.refetch, - args.max_parallel, - client, - blob_service, - dir_service, - con, - ) - .await; + let samples = ctx.ingest(&inputs).await; for s in samples { match s { Err(e) => { @@ -282,6 +468,19 @@ async fn main() { } } } + Some(Command::FetchListing { max_depth, inputs }) => { + let ingested: Vec = stream::iter(inputs) + .then(async |i| { + let i = i.clone(); + ctx.clone().fetch_from_listing(i, max_depth).await + }) + .flatten_unordered(args.max_parallel) + .collect() + .await; + for i in ingested { + eprintln!("{:?}", i); + } + } None => {} } }