use std::{ cmp::Ordering, ops::{Add, Div, Mul, Sub}, path::Path, }; use eyre::{eyre, Result}; use rustc_hash::FxHashMap; type GramMap = FxHashMap<[u8; N], T>; #[derive(Default, Debug)] struct Grams { grams1: GramMap<1, T>, grams2: GramMap<2, T>, grams3: GramMap<3, T>, grams4: GramMap<4, T>, } impl Grams where T: Div + Copy, { fn divide_by(&mut self, n: T) { divide_by(&mut self.grams1, n); divide_by(&mut self.grams2, n); divide_by(&mut self.grams3, n); divide_by(&mut self.grams4, n); } } impl Grams where T: Copy + PartialOrd + Default + Sub + Add + Div + Mul, { fn normalize(&mut self, omin: T, omax: T) { normalize(&mut self.grams1, omin, omax); normalize(&mut self.grams2, omin, omax); normalize(&mut self.grams3, omin, omax); normalize(&mut self.grams4, omin, omax); } } fn divide_by + Copy>( grams: &mut GramMap, n: T, ) { for v in grams.values_mut() { *v = *v / n; } } fn normalize(grams: &mut GramMap, omin: T, omax: T) where T: Copy + PartialOrd + Default + Sub + Add + Div + Mul, { let max = grams .values() .copied() .max_by(|&a, &b| { if a > b { Ordering::Greater } else { Ordering::Less } }) .unwrap_or(Default::default()); let min = grams .values() .copied() .min_by(|&a, &b| { if a > b { Ordering::Greater } else { Ordering::Less } }) .unwrap_or(Default::default()); for v in grams.values_mut() { *v = map_to_range(*v, min, max, omin, omax); } } #[test] fn test_normalize() { let mut input = GramMap::<1, f64>::default(); input.insert([b'a'], 500.); input.insert([b'b'], 300.); input.insert([b'c'], 100.); input.insert([b'd'], 125.); normalize(&mut input, 0., 100.); assert_eq!(input[b"a"], 100.); assert_eq!(input[b"b"], 50.); assert_eq!(input[b"c"], 0.); assert_eq!(input[b"d"], 6.25); } // maps a number from range [amin, amax] to range [bmin, bmax] fn map_to_range(v: V, amin: V, amax: V, bmin: V, bmax: V) -> V where V: Sub + Add + Mul + Div + Copy, { bmin + (((v - amin) * (bmax - bmin)) / (amax - amin)) } #[test] fn test_map_to_range() { assert_eq!(map_to_range(40, 0, 100, 0, 10), 4); assert_eq!(map_to_range(60, 50, 100, 5, 10), 6); assert_eq!(map_to_range(55.5, 55., 56., 0., 1.), 0.5); } type GramsCounts = Grams; type GramsFreqs = Grams; fn gen_data_file(path: &Path) -> Result { let data = std::fs::read_to_string(path)?; let mut grams = Grams::default(); for win in data.as_bytes().windows(4) { *grams.grams1.entry([win[0]]).or_insert(0) += 1; *grams.grams2.entry([win[0], win[1]]).or_insert(0) += 1; *grams.grams3.entry([win[0], win[1], win[2]]).or_insert(0) += 1; *grams .grams4 .entry([win[0], win[1], win[2], win[3]]) .or_insert(0) += 1; } // TODO: We lose a few N<4 grams here, but it's probably not that big of a deal Ok(grams) } fn gen_data(inputs: Vec) -> Result { let mut grams = Grams::default(); for dir in inputs { for de in walkdir::WalkDir::new(dir).into_iter() { let de = de?; if de.file_type().is_file() { grams = grams.combine(gen_data_file(de.path())?); } } } Ok(grams) } fn main() -> Result<()> { let mut dirs: Vec = std::env::args().skip(1).collect(); let grams = gen_data(dirs)?; Ok(()) }