Skip to main content

Get the basic implementation working

ID
d6a28f6
date
2025-01-09 22:49:23+00:00
author
Alex Chan <alex@alexwlchan.net>
parent
05e93a8
message
Get the basic implementation working
changed files
4 files, 168 additions, 234 deletions

Changed files

Cargo.lock (6532) → Cargo.lock (3555)

diff --git a/Cargo.lock b/Cargo.lock
index bd5be36..51a772e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3,144 +3,99 @@
 version = 3
 
 [[package]]
-name = "anstream"
-version = "0.6.18"
+name = "byteorder"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
-dependencies = [
- "anstyle",
- "anstyle-parse",
- "anstyle-query",
- "anstyle-wincon",
- "colorchoice",
- "is_terminal_polyfill",
- "utf8parse",
-]
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
-name = "anstyle"
-version = "1.0.10"
+name = "cfg-if"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
-name = "anstyle-parse"
-version = "0.2.6"
+name = "getrandom"
+version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
 dependencies = [
- "utf8parse",
+ "cfg-if",
+ "libc",
+ "wasi",
 ]
 
 [[package]]
-name = "anstyle-query"
-version = "1.1.2"
+name = "libc"
+version = "0.2.169"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
-dependencies = [
- "windows-sys",
-]
+checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
 
 [[package]]
-name = "anstyle-wincon"
-version = "3.0.6"
+name = "ppv-lite86"
+version = "0.2.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
+checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
 dependencies = [
- "anstyle",
- "windows-sys",
+ "zerocopy",
 ]
 
 [[package]]
-name = "clap"
-version = "4.5.26"
+name = "proc-macro2"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
- "clap_builder",
- "clap_derive",
+ "unicode-ident",
 ]
 
 [[package]]
-name = "clap_builder"
-version = "4.5.26"
+name = "quote"
+version = "1.0.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121"
+checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
 dependencies = [
- "anstream",
- "anstyle",
- "clap_lex",
- "strsim",
+ "proc-macro2",
 ]
 
 [[package]]
-name = "clap_derive"
-version = "4.5.24"
+name = "rand"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
 dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "syn",
+ "libc",
+ "rand_chacha",
+ "rand_core",
 ]
 
 [[package]]
-name = "clap_lex"
-version = "0.7.4"
+name = "rand_chacha"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
-
-[[package]]
-name = "colorchoice"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
-
-[[package]]
-name = "heck"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-
-[[package]]
-name = "is_terminal_polyfill"
-version = "1.70.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
-
-[[package]]
-name = "proc-macro2"
-version = "1.0.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
 dependencies = [
- "unicode-ident",
+ "ppv-lite86",
+ "rand_core",
 ]
 
 [[package]]
-name = "quote"
-version = "1.0.38"
+name = "rand_core"
+version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
 dependencies = [
- "proc-macro2",
+ "getrandom",
 ]
 
 [[package]]
 name = "randline"
 version = "0.1.0"
 dependencies = [
- "clap",
+ "rand",
 ]
 
 [[package]]
-name = "strsim"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
-
-[[package]]
 name = "syn"
 version = "2.0.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -158,80 +113,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
-name = "utf8parse"
-version = "0.2.2"
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
-name = "windows-sys"
-version = "0.59.0"
+name = "zerocopy"
+version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
 dependencies = [
- "windows-targets",
+ "byteorder",
+ "zerocopy-derive",
 ]
 
 [[package]]
-name = "windows-targets"
-version = "0.52.6"
+name = "zerocopy-derive"
+version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_gnullvm",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "proc-macro2",
+ "quote",
+ "syn",
 ]
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
-
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

Cargo.toml (132) → Cargo.toml (92)

diff --git a/Cargo.toml b/Cargo.toml
index bfa0d23..4d61b21 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,4 +4,4 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-clap = { version = "4.5.26", features = ["derive"] }
+rand = "0.8"

src/main.rs (918) → src/main.rs (1101)

diff --git a/src/main.rs b/src/main.rs
index 1051b3f..b61ef4d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,4 @@
-#![deny(warnings)]
+// #![deny(warnings)]
 
 use std::iter::Iterator;
 
@@ -11,30 +11,30 @@ fn main() {
     //   1 argument N = get that number of lines
     //  >1 arguments  = error
     //
-    let args: Vec<_> = std::env::args().collect();
-
-    let n = match args.len() {
-        1 => 0,
-        2 => match args[1].parse::<usize>() {
-            Ok(parsed_n) => parsed_n,
-            Err(_) => {
-                eprintln!("Usage: randline [N]");
-                std::process::exit(1)
-            }
-        },
-        _ => {
-            eprintln!("Usage: randline [N]");
-            std::process::exit(1)
-        }
-    };
-
-    // Read the first N lines from stdout
-    // let stdin = io::stdin();
-
-    let a = [1, 2, 3];
-    let iter = a.iter();
-
-    println!("{:?}", sampling::reservoir_sample(iter, n));
-
-    println!("n = {:?}", n);
+    // let args: Vec<_> = std::env::args().collect();
+    //
+    //     let n = match args.len() {
+    //         1 => 0,
+    //         2 => match args[1].parse::<i32>() {
+    //             Ok(parsed_n) => parsed_n,
+    //             Err(_) => {
+    //                 eprintln!("Usage: randline [N]");
+    //                 std::process::exit(1)
+    //             }
+    //         },
+    //         _ => {
+    //             eprintln!("Usage: randline [N]");
+    //             std::process::exit(1)
+    //         }
+    //     };
+    //
+    //     // Read the first N lines from stdout
+    //     // let stdin = io::stdin();
+    //
+    //     let a = [1, 2, 3, 4, 5, 6];
+    //     let iter = a.iter();
+    //
+    //     println!("{:?}", sampling::reservoir_sample(iter, n));
+    //
+    //     println!("n = {:?}", n);
 }

src/sampling.rs (1634) → src/sampling.rs (2787)

diff --git a/src/sampling.rs b/src/sampling.rs
index 4433c05..2b9e0b6 100644
--- a/src/sampling.rs
+++ b/src/sampling.rs
@@ -1,61 +1,92 @@
+use rand::Rng;
+use std::collections::HashMap;
+
+fn random_weight() -> i32 {
+    rand::thread_rng().gen_range(i32::MIN..i32::MAX)
+}
+
 pub fn reservoir_sample<T: std::fmt::Debug>(
-  mut items: impl Iterator<Item = T>,
-  n: usize
+    mut items: impl Iterator<Item = T>,
+    n: usize,
 ) -> Vec<T> {
+    // Create an empty reservoir.
+    //
+    // This is a map (weight) -> (item).
+    let mut reservoir: HashMap<i32, T> = HashMap::with_capacity(n);
+
+    // Fill the reservoir with the first n items.  If there are less
+    // than n items, we can exit immediately.
+    for _ in 1..=n {
+        match items.next() {
+            Some(this_item) => reservoir.insert(random_weight(), this_item),
+            None => return reservoir.into_values().collect(),
+        };
+    }
 
-  // Create an empty reservoir.
-  let mut reservoir: Vec<T> = Vec::with_capacity(n);
+    // What's the largest weight seen so far?
+    let mut max_weight: i32 = *reservoir.keys().max().unwrap();
 
-  // Fill the reservoir with the first n items.  If there are less
-  // than n items, we can exit immediately.
-  for _ in 1..=n {
-    match items.next() {
-      Some(this_item) => reservoir.push(this_item),
-      None => return reservoir,
-    };
-  }
+    // Now go through the remaining items.
+    for this_item in items {
+        // Choose a weight for this item.
+        let this_weight = random_weight();
 
-  vec![]
+        // If this is greater than the weights seen so far, we can ignore
+        // this item and move on to the next one.
+        if this_weight > max_weight {
+            continue;
+        }
+
+        // Replace the item that had the max weight with the new item,
+        // then recalculate the max weight.
+        assert!(reservoir.remove(&max_weight).is_some());
+        reservoir.insert(this_weight, this_item);
+        max_weight = *reservoir.keys().max().unwrap();
+    }
+
+    reservoir.into_values().collect()
 }
 
 #[cfg(test)]
 mod reservoir_sample_tests {
-  use crate::sampling::reservoir_sample;
-
-  // If there are no items, then the sample is empty.
-  #[test]
-  fn it_returns_an_empty_sample_for_an_empty_input() {
-    let items: Vec<usize> = vec![];
-    let sample = reservoir_sample(items.iter(), 5);
-
-    assert_eq!(sample.len(), 0);
-  }
-
-  // If there are less items than the sample size, then the sample is
-  // the complete set.
-  #[test]
-  fn it_returns_complete_sample_if_less_items_than_sample_size() {
-    let items = vec!["a", "b", "c"];
-    let sample = reservoir_sample(items.iter(), 5);
-
-    assert_eq!(sample.len(), 3);
-    assert_eq!(*sample[0], "a");
-    assert_eq!(*sample[1], "b");
-    assert_eq!(*sample[2], "c");
-  }
-
-  // If there's an equal number of items to the sample size, then the
-  // sample is the complete set.
-  #[test]
-  fn it_returns_complete_sample_if_item_count_equal_to_sample_size() {
-    let items = vec!["a", "b", "c"];
-    let sample = reservoir_sample(items.iter(), 3);
-
-    println!("AWLC = {:?}", sample);
-
-    assert_eq!(sample.len(), 3);
-    assert_eq!(*sample[0], "a");
-    assert_eq!(*sample[1], "b");
-    assert_eq!(*sample[2], "c");
-  }
-}
\ No newline at end of file
+    use crate::sampling::reservoir_sample;
+
+    // If there are no items, then the sample is empty.
+    #[test]
+    fn it_returns_an_empty_sample_for_an_empty_input() {
+        let items: Vec<usize> = vec![];
+        let sample = reservoir_sample(items.into_iter(), 5);
+
+        assert_eq!(sample.len(), 0);
+    }
+
+    // If there are less items than the sample size, then the sample is
+    // the complete set.
+    #[test]
+    fn it_returns_complete_sample_if_less_items_than_sample_size() {
+        let items = vec!["a", "b", "c"];
+        let sample = reservoir_sample(items.into_iter(), 5);
+
+        assert!(equivalent_items(sample, vec!["a", "b", "c"]));
+    }
+
+    // If there's an equal number of items to the sample size, then the
+    // sample is the complete set.
+    #[test]
+    fn it_returns_complete_sample_if_item_count_equal_to_sample_size() {
+        let items = vec!["a", "b", "c"];
+        let sample = reservoir_sample(items.into_iter(), 3);
+
+        assert!(equivalent_items(sample, vec!["a", "b", "c"]));
+    }
+
+    fn equivalent_items<T: std::cmp::PartialEq + std::cmp::Ord>(
+        mut vec1: Vec<T>,
+        mut vec2: Vec<T>,
+    ) -> bool {
+        vec1.sort();
+        vec2.sort();
+
+        vec1 == vec2
+    }
+}