From 8d5f9a771c0e8610b8985a57a5dfd4476ec82af5 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 25 Nov 2024 17:13:54 +0100 Subject: [PATCH] Avoiding self-loops in vocab cooc --distrib Related to #383 --- src/cmd/vocab.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/cmd/vocab.rs b/src/cmd/vocab.rs index fd624f2..52d5e06 100644 --- a/src/cmd/vocab.rs +++ b/src/cmd/vocab.rs @@ -1217,6 +1217,7 @@ impl Cooccurrences { Ok(()) } + // NOTE: currently we avoid self loops because they are fiddly fn for_each_distrib_cooc_record<F, E>(self, min_count: usize, mut callback: F) -> Result<(), E> where F: FnMut(&csv::ByteRecord) -> Result<(), E>, @@ -1232,12 +1233,16 @@ impl Cooccurrences { let mut sums: Vec<Metrics> = Vec::with_capacity(self.token_entries.len()); - for source_entry in self.token_entries.iter() { + for (source_id, source_entry) in self.token_entries.iter().enumerate() { let x = source_entry.gcf; let mut sum = Metrics::default(); for (target_id, count) in source_entry.cooc.iter() { + if source_id == *target_id { + continue; + } + let target_entry = &self.token_entries[*target_id]; let y = target_entry.gcf; @@ -1283,6 +1288,10 @@ impl Cooccurrences { let mut min_g2_sum = 0.0; for (other_id, source_other_count) in source_entry.cooc.iter() { + if source_id == *other_id { + continue; + } + let target_other_count = match target_entry.cooc.get(other_id) { Some(c) => c, None => continue,