Get the basic implementation working
- ID
d6a28f6- date
2025-01-09 22:49:23+00:00- author
Alex Chan <alex@alexwlchan.net>- parent
05e93a8- message
Get the basic implementation working- changed files
4 files, 168 additions, 234 deletions
Changed files
Cargo.lock (6532) → Cargo.lock (3555)
diff --git a/Cargo.lock b/Cargo.lock
index bd5be36..51a772e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3,144 +3,99 @@
version = 3
[[package]]
-name = "anstream"
-version = "0.6.18"
+name = "byteorder"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
-dependencies = [
- "anstyle",
- "anstyle-parse",
- "anstyle-query",
- "anstyle-wincon",
- "colorchoice",
- "is_terminal_polyfill",
- "utf8parse",
-]
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
-name = "anstyle"
-version = "1.0.10"
+name = "cfg-if"
+version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
-name = "anstyle-parse"
-version = "0.2.6"
+name = "getrandom"
+version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
- "utf8parse",
+ "cfg-if",
+ "libc",
+ "wasi",
]
[[package]]
-name = "anstyle-query"
-version = "1.1.2"
+name = "libc"
+version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
-dependencies = [
- "windows-sys",
-]
+checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
-name = "anstyle-wincon"
-version = "3.0.6"
+name = "ppv-lite86"
+version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
+checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
- "anstyle",
- "windows-sys",
+ "zerocopy",
]
[[package]]
-name = "clap"
-version = "4.5.26"
+name = "proc-macro2"
+version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
- "clap_builder",
- "clap_derive",
+ "unicode-ident",
]
[[package]]
-name = "clap_builder"
-version = "4.5.26"
+name = "quote"
+version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121"
+checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
dependencies = [
- "anstream",
- "anstyle",
- "clap_lex",
- "strsim",
+ "proc-macro2",
]
[[package]]
-name = "clap_derive"
-version = "4.5.24"
+name = "rand"
+version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "syn",
+ "libc",
+ "rand_chacha",
+ "rand_core",
]
[[package]]
-name = "clap_lex"
-version = "0.7.4"
+name = "rand_chacha"
+version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
-
-[[package]]
-name = "colorchoice"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
-
-[[package]]
-name = "heck"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-
-[[package]]
-name = "is_terminal_polyfill"
-version = "1.70.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
-
-[[package]]
-name = "proc-macro2"
-version = "1.0.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
- "unicode-ident",
+ "ppv-lite86",
+ "rand_core",
]
[[package]]
-name = "quote"
-version = "1.0.38"
+name = "rand_core"
+version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
- "proc-macro2",
+ "getrandom",
]
[[package]]
name = "randline"
version = "0.1.0"
dependencies = [
- "clap",
+ "rand",
]
[[package]]
-name = "strsim"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
-
-[[package]]
name = "syn"
version = "2.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -158,80 +113,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
-name = "utf8parse"
-version = "0.2.2"
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
-name = "windows-sys"
-version = "0.59.0"
+name = "zerocopy"
+version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
- "windows-targets",
+ "byteorder",
+ "zerocopy-derive",
]
[[package]]
-name = "windows-targets"
-version = "0.52.6"
+name = "zerocopy-derive"
+version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_gnullvm",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "proc-macro2",
+ "quote",
+ "syn",
]
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
-
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
Cargo.toml (132) → Cargo.toml (92)
diff --git a/Cargo.toml b/Cargo.toml
index bfa0d23..4d61b21 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,4 +4,4 @@ version = "0.1.0"
edition = "2021"
[dependencies]
-clap = { version = "4.5.26", features = ["derive"] }
+rand = "0.8"
src/main.rs (918) → src/main.rs (1101)
diff --git a/src/main.rs b/src/main.rs
index 1051b3f..b61ef4d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,4 @@
-#![deny(warnings)]
+// #![deny(warnings)]
use std::iter::Iterator;
@@ -11,30 +11,30 @@ fn main() {
// 1 argument N = get that number of lines
// >1 arguments = error
//
- let args: Vec<_> = std::env::args().collect();
-
- let n = match args.len() {
- 1 => 0,
- 2 => match args[1].parse::<usize>() {
- Ok(parsed_n) => parsed_n,
- Err(_) => {
- eprintln!("Usage: randline [N]");
- std::process::exit(1)
- }
- },
- _ => {
- eprintln!("Usage: randline [N]");
- std::process::exit(1)
- }
- };
-
- // Read the first N lines from stdout
- // let stdin = io::stdin();
-
- let a = [1, 2, 3];
- let iter = a.iter();
-
- println!("{:?}", sampling::reservoir_sample(iter, n));
-
- println!("n = {:?}", n);
+ // let args: Vec<_> = std::env::args().collect();
+ //
+ // let n = match args.len() {
+ // 1 => 0,
+ // 2 => match args[1].parse::<i32>() {
+ // Ok(parsed_n) => parsed_n,
+ // Err(_) => {
+ // eprintln!("Usage: randline [N]");
+ // std::process::exit(1)
+ // }
+ // },
+ // _ => {
+ // eprintln!("Usage: randline [N]");
+ // std::process::exit(1)
+ // }
+ // };
+ //
+ // // Read the first N lines from stdout
+ // // let stdin = io::stdin();
+ //
+ // let a = [1, 2, 3, 4, 5, 6];
+ // let iter = a.iter();
+ //
+ // println!("{:?}", sampling::reservoir_sample(iter, n));
+ //
+ // println!("n = {:?}", n);
}
src/sampling.rs (1634) → src/sampling.rs (2787)
diff --git a/src/sampling.rs b/src/sampling.rs
index 4433c05..2b9e0b6 100644
--- a/src/sampling.rs
+++ b/src/sampling.rs
@@ -1,61 +1,92 @@
+use rand::Rng;
+use std::collections::HashMap;
+
+fn random_weight() -> i32 {
+ rand::thread_rng().gen_range(i32::MIN..i32::MAX)
+}
+
pub fn reservoir_sample<T: std::fmt::Debug>(
- mut items: impl Iterator<Item = T>,
- n: usize
+ mut items: impl Iterator<Item = T>,
+ n: usize,
) -> Vec<T> {
+ // Create an empty reservoir.
+ //
+ // This is a map (weight) -> (item).
+ let mut reservoir: HashMap<i32, T> = HashMap::with_capacity(n);
+
+ // Fill the reservoir with the first n items. If there are less
+ // than n items, we can exit immediately.
+ for _ in 1..=n {
+ match items.next() {
+ Some(this_item) => reservoir.insert(random_weight(), this_item),
+ None => return reservoir.into_values().collect(),
+ };
+ }
- // Create an empty reservoir.
- let mut reservoir: Vec<T> = Vec::with_capacity(n);
+ // What's the largest weight seen so far?
+ let mut max_weight: i32 = *reservoir.keys().max().unwrap();
- // Fill the reservoir with the first n items. If there are less
- // than n items, we can exit immediately.
- for _ in 1..=n {
- match items.next() {
- Some(this_item) => reservoir.push(this_item),
- None => return reservoir,
- };
- }
+ // Now go through the remaining items.
+ for this_item in items {
+ // Choose a weight for this item.
+ let this_weight = random_weight();
- vec![]
+ // If this is greater than the weights seen so far, we can ignore
+ // this item and move on to the next one.
+ if this_weight > max_weight {
+ continue;
+ }
+
+ // Replace the item that had the max weight with the new item,
+ // then recalculate the max weight.
+ assert!(reservoir.remove(&max_weight).is_some());
+ reservoir.insert(this_weight, this_item);
+ max_weight = *reservoir.keys().max().unwrap();
+ }
+
+ reservoir.into_values().collect()
}
#[cfg(test)]
mod reservoir_sample_tests {
- use crate::sampling::reservoir_sample;
-
- // If there are no items, then the sample is empty.
- #[test]
- fn it_returns_an_empty_sample_for_an_empty_input() {
- let items: Vec<usize> = vec![];
- let sample = reservoir_sample(items.iter(), 5);
-
- assert_eq!(sample.len(), 0);
- }
-
- // If there are less items than the sample size, then the sample is
- // the complete set.
- #[test]
- fn it_returns_complete_sample_if_less_items_than_sample_size() {
- let items = vec!["a", "b", "c"];
- let sample = reservoir_sample(items.iter(), 5);
-
- assert_eq!(sample.len(), 3);
- assert_eq!(*sample[0], "a");
- assert_eq!(*sample[1], "b");
- assert_eq!(*sample[2], "c");
- }
-
- // If there's an equal number of items to the sample size, then the
- // sample is the complete set.
- #[test]
- fn it_returns_complete_sample_if_item_count_equal_to_sample_size() {
- let items = vec!["a", "b", "c"];
- let sample = reservoir_sample(items.iter(), 3);
-
- println!("AWLC = {:?}", sample);
-
- assert_eq!(sample.len(), 3);
- assert_eq!(*sample[0], "a");
- assert_eq!(*sample[1], "b");
- assert_eq!(*sample[2], "c");
- }
-}
\ No newline at end of file
+ use crate::sampling::reservoir_sample;
+
+ // If there are no items, then the sample is empty.
+ #[test]
+ fn it_returns_an_empty_sample_for_an_empty_input() {
+ let items: Vec<usize> = vec![];
+ let sample = reservoir_sample(items.into_iter(), 5);
+
+ assert_eq!(sample.len(), 0);
+ }
+
+ // If there are less items than the sample size, then the sample is
+ // the complete set.
+ #[test]
+ fn it_returns_complete_sample_if_less_items_than_sample_size() {
+ let items = vec!["a", "b", "c"];
+ let sample = reservoir_sample(items.into_iter(), 5);
+
+ assert!(equivalent_items(sample, vec!["a", "b", "c"]));
+ }
+
+ // If there's an equal number of items to the sample size, then the
+ // sample is the complete set.
+ #[test]
+ fn it_returns_complete_sample_if_item_count_equal_to_sample_size() {
+ let items = vec!["a", "b", "c"];
+ let sample = reservoir_sample(items.into_iter(), 3);
+
+ assert!(equivalent_items(sample, vec!["a", "b", "c"]));
+ }
+
+ fn equivalent_items<T: std::cmp::PartialEq + std::cmp::Ord>(
+ mut vec1: Vec<T>,
+ mut vec2: Vec<T>,
+ ) -> bool {
+ vec1.sort();
+ vec2.sort();
+
+ vec1 == vec2
+ }
+}