From 4e60cd97276a2202a0195374071e996fe7a45507 Mon Sep 17 00:00:00 2001 From: Doug Kelkhoff <18220321+dgkf@users.noreply.github.com> Date: Fri, 29 Sep 2023 13:14:39 -0700 Subject: [PATCH] Rewriting `List` internals and adding named subsets (#56) --- README.md | 95 +++++++++++++++++++++-------- src/CHANGELOG.md | 14 +++++ src/lang.rs | 16 +++-- src/object/list.rs | 115 ++++++++++++++++++++++++++++------- src/object/vector/subset.rs | 32 +++++----- src/object/vector/subsets.rs | 72 ++++++++++++++++------ src/repl/release.rs | 2 +- 7 files changed, 259 insertions(+), 87 deletions(-) diff --git a/README.md b/README.md index 699af0f4..4e005148 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # R -_An experimental implementation of R_ +_An experimental implementation of R, with embellishments_ + +Check out the [live demo](https://dgkf.github.io/R/) ## What can it do? @@ -8,7 +10,7 @@ _An experimental implementation of R_ cargo run ``` ```r -# R version 0.0.1 -- "Why Not?" +# R version 0.3.0 -- "Days of Abandon" x <- function(a = 1, ...) { a + c(...) } # function(a = 1, ...) { @@ -27,43 +29,82 @@ y(4, 3, 2, 1) This amounts to (most) of R's grammar parsing, basic primitives, scope management and ellipsis argument passing. +## What's different? + +This project is not just a rewrite of R, but a playground for features and +reinterpretations. It is not meant to reimplement a compatible R layer, but +to rethink some of R's assumptions. + +To start, there are a few superficial changes: + +```r +# 'fn' keyword +f <- fn(a, b, c) { + a + b + c +} + +# vector syntax +v <- [1, 2, 3, 4] + +# list syntax +l <- (a = 1, b = 2, c = 3) + +# lowercase keywords +kws <- (na, null, inf, true, false) +``` + ## Why -First and foremost, to learn. +This project is primarily a personal exploration into language design. + +At the outset, many of the choices are researched one-by-one and are almost +certainly naive implementations. 
My goal is to learn and explore, and in +that way the project is already a success in my eyes. Beyond advancing my own +understanding of language internals, I'd love to see the project garner enough +interest to become self-sustaining. + +If you see value in the project for anything beyond prototyping ideas, then +pushing the project toward something practical is contingent on your support. +Contributions, suggestions, feedback and testing are all appreciated. + +### Values -I've always been interested in language design. I know `R` well and think it's a -super expressive language, so it felt like a good target to shoot for. Like any -long-time user of a language, I also have dreamt of how the language could be -improved. This project also offered a small testing ground for a few of those. +Being primarily a one-person project, the values currently map closely to my +own. Some things I want to aim for: -## Long-term Goals +- A reasonably approachable language for R users (possibly with the ability to + interpret R code). +- Improved R constructs for complex calls, including argument packing and + unpacking, partial function calls, destructuring assignment. +- Guardrails on non-standard-evaluation, allowing for user-facing + domain-specific-languages, while allowing a more rigid evaluation scheme + internally. +- Lean into the things that `rust` does well, such as threading, arguably + async evaluation, first-class data structures and algebraic error types. +- Learn from more general languages like `TypeScript` to better understand + how static typing can be comfortably embedded in a high-level language. -As to not mislead anyone, I want to be upfront in saying that this project is -well beyond what I can achieve alone. +## Contribution Guide -For this project to mature, it is going to need a community of contributors -with diverse expertise. 
I welcome anyone interested to help out, and I'm happy -to find an intersection of interests as we hash out what the language aims to -deliver. +If you also want to learn some `rust` or want to explore language design with +me, I'm happy to have you along for the ride. There are plenty of ways to +contribute. In order of increasing complexity, this might include: -That said, my personal ambitions for any spiritual successor to R would be: +- Documenting internals +- Improving documentation throughout +- Helping to improve the demo page hosted on GitHub pages +- Implementing new language concepts +- Providing feedback on internals -- Built with `R` code as a first-class input. Even if the language evolves past -`R`, I'd like for it to be able to leverage `R`'s package ecosystem. -- Reimagine many recent `R` language features without the confines of backwards -compatibility. -- Take Jan Vitek's analysis of R's performance to heart and bake in constructs -for isolating non-standard evaluation (though admittedly performance is a -distant goal at the moment). -- Leverage things that `rust` excels at, like its strong iterator support, -async/multithread execution and its error model. +Any and all contributions are appreciated, and you'll earn yourself a mention +in release notes! ## License I welcome other contributors, but also have not thoughtfully selected a long- -term license yet. For now there's a CLA in place so that the license can -be altered later on. I don't intend to keep it around forever. If you have -suggestions or considerations for selecting an appropriate license, your +term license yet. For now there's a CLA in place so that the license can +be altered later on. I don't intend to keep it around forever. If you have +suggestions or considerations for selecting an appropriate license, your feedback would be much appreciated. 
My current preference is toward a copyleft license like GPL as opposed to a diff --git a/src/CHANGELOG.md b/src/CHANGELOG.md index 0f9d147a..8aa9bc24 100644 --- a/src/CHANGELOG.md +++ b/src/CHANGELOG.md @@ -13,11 +13,25 @@ # [1] 1 2 3 1000 5 6 7 8 9 10 ``` +* Mutating assignment implemented for `List`s, including by named index. + + ```r + x <- list(a = 1, b = 2, c = 3, d = 4, e = 5) + x[2:3][[1]] <- 200 + x[1:4][c("d", "c")] <- 1000 + x + # list(a = 1, b = 200, c = 1000, d = 1000, e = 5) + ``` + ## Internals * "altreps" are now supported internally, though currently only a "Subset" (used for indexed assignment) is implemented. +* `List`s were reworked to use a `HashMap` of named values, allowing for + more immediate access to named values without repeated traversals of a + vector of pairs. + # 0.2.0 "In Bloom" diff --git a/src/lang.rs b/src/lang.rs index 95a7f008..594dce29 100644 --- a/src/lang.rs +++ b/src/lang.rs @@ -1,7 +1,7 @@ -use crate::object::types::*; -use crate::object::*; use crate::callable::core::{builtin, Callable}; use crate::error::*; +use crate::object::types::*; +use crate::object::*; use core::fmt; use std::fmt::Display; @@ -287,8 +287,12 @@ fn display_list(x: &List, f: &mut fmt::Formatter<'_>, bc: Option) -> fmt let v = x.values.borrow(); let s = x.subsets.clone(); - let names: Vec<_> = x.values.borrow().clone().into_iter().map(|(n, _)| n).collect(); - for (i, (_, si)) in s.bind_names(names).into_iter().take(v.len()).enumerate() { + for (i, (_, si)) in s + .bind_names(x.names.clone()) + .into_iter() + .take(v.len()) + .enumerate() + { let name; let value; @@ -787,7 +791,9 @@ impl Context for Rc { match expr { Expr::Null => Ok(Obj::Null), Expr::NA => Ok(Obj::Vector(Vector::from(vec![OptionNA::NA as Logical]))), - Expr::Inf => Ok(Obj::Vector(Vector::from(vec![OptionNA::Some(f64::INFINITY)]))), + Expr::Inf => Ok(Obj::Vector(Vector::from(vec![OptionNA::Some( + f64::INFINITY, + )]))), Expr::Number(x) => Ok(Obj::Vector(Vector::from(vec![x]))), 
Expr::Integer(x) => Ok(Obj::Vector(Vector::from(vec![x]))), Expr::Bool(x) => Ok(Obj::Vector(Vector::from(vec![OptionNA::Some(x)]))), diff --git a/src/object/list.rs b/src/object/list.rs index 123877a2..4c4a5b49 100644 --- a/src/object/list.rs +++ b/src/object/list.rs @@ -1,5 +1,6 @@ -use std::rc::Rc; use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; use crate::error::RError; use crate::lang::EvalResult; @@ -8,56 +9,99 @@ use super::*; #[derive(Debug, Clone, PartialEq, Default)] pub struct List { + pub names: Rc>>>, pub values: Rc, Obj)>>>, pub subsets: Subsets, } impl From, Obj)>> for List { fn from(value: Vec<(Option, Obj)>) -> Self { - List { + let mut result = List { values: Rc::new(RefCell::new(value)), ..Default::default() - } + }; + + result.reindex(); + result } } impl List { + pub fn reindex(&mut self) { + let mut names = self.names.borrow_mut(); + names.drain(); + + for (i, (k, _)) in self.values.borrow().iter().enumerate() { + if let Some(name) = k { + let indices = names.entry(name.clone()).or_insert(vec![]); + if !indices.contains(&i) { + indices.push(i) + } + } + } + } + pub fn subset(&self, by: Subset) -> List { let Subsets(mut inner) = self.subsets.clone(); inner.push(by); List { + names: self.names.clone(), values: self.values.clone(), subsets: Subsets(inner), } } pub fn assign(&mut self, value: Obj) -> EvalResult { - // TODO(performance): Avoid having to split vector and collect into - // separate names vec for binding during subsetting. Ideally just - // need a reference. 
- let names: Vec<_> = self.values.borrow().clone().into_iter().map(|(n, _)| n).collect(); match value { // remove elements from list Obj::Null => { - let mut v = self.values.borrow_mut(); - let n = v.len(); - let indices = self.subsets.clone().bind_names(names).into_iter().take(n); - for (i, _) in indices { - v.remove(i); + let n = self.values.borrow().len(); + let indices = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .take(n); + + { + let mut values = self.values.borrow_mut(); + for (i, _) in indices { + values.remove(i); + } } + + self.reindex(); + + // TODO(feat): need to return list with NULL elements when + // index is NA + Ok(Obj::List(List { + names: self.names.clone(), values: self.values.clone(), subsets: self.subsets.clone(), })) } + // any single length R value any if any.len() == Some(1) => { let mut v = self.values.borrow_mut(); let n = v.len(); - let indices = self.subsets.clone().bind_names(names.clone()).into_iter().take(n); + let indices = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .take(n); // first check to see if we need to extend - if let Some(max) = self.subsets.clone().bind_names(names).into_iter().map(|(i, _)| i).max() { + if let Some(max) = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .map(|(i, _)| i) + .max() + { v.reserve(max.saturating_sub(n)) } @@ -69,6 +113,7 @@ impl List { } Ok(Obj::List(List { + names: self.names.clone(), values: self.values.clone(), subsets: self.subsets.clone(), })) @@ -78,10 +123,22 @@ impl List { any if any.len() == Some(self.len()) => { let mut v = self.values.borrow_mut(); let n = v.len(); - let indices = self.subsets.clone().bind_names(names.clone()).into_iter().take(n); + let indices = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .take(n); // first check to see if we need to extend - if let Some(max) = self.subsets.clone().bind_names(names).into_iter().map(|(i, _)| i).max() { 
+ if let Some(max) = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .map(|(i, _)| i) + .max() + { v.reserve(max.saturating_sub(n)) } @@ -93,6 +150,7 @@ impl List { } Ok(Obj::List(List { + names: self.names.clone(), values: self.values.clone(), subsets: self.subsets.clone(), })) @@ -100,10 +158,22 @@ impl List { other => { let mut v = self.values.borrow_mut(); let n = v.len(); - let indices = self.subsets.clone().bind_names(names.clone()).into_iter().take(n); + let indices = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .take(n); // first check to see if we need to extend - if let Some(max) = self.subsets.clone().bind_names(names).into_iter().map(|(i, _)| i).max() { + if let Some(max) = self + .subsets + .clone() + .bind_names(self.names.clone()) + .into_iter() + .map(|(i, _)| i) + .max() + { v.reserve(max.saturating_sub(n)) } @@ -115,6 +185,7 @@ impl List { } Ok(Obj::List(List { + names: self.names.clone(), values: self.values.clone(), subsets: self.subsets.clone(), })) @@ -132,13 +203,16 @@ impl List { pub fn try_get_inner(&self, index: Obj) -> EvalResult { let err = RError::Other("Cannot use object for indexing.".to_string()); - let names: Vec<_> = self.values.borrow().clone().into_iter().map(|(n, _)| n).collect(); match index.as_vector()? 
{ Obj::Vector(v) if v.len() == 1 => { let Subsets(mut subsets) = self.subsets.clone(); subsets.push(v.try_into()?); - if let Some((i, _)) = Subsets(subsets).bind_names(names).into_iter().next() { + if let Some((i, _)) = Subsets(subsets) + .bind_names(self.names.clone()) + .into_iter() + .next() + { self.values .borrow() .get(i) @@ -159,4 +233,3 @@ impl List { } } } - diff --git a/src/object/vector/subset.rs b/src/object/vector/subset.rs index 791b895d..5b5a995c 100644 --- a/src/object/vector/subset.rs +++ b/src/object/vector/subset.rs @@ -1,10 +1,11 @@ -use std::{cell::RefCell, ops::Range, rc::Rc}; +use std::cell::RefCell; +use std::ops::Range; +use std::rc::Rc; use crate::lang::RSignal; use super::{types::*, OptionNA, Vector}; - /// Subsets /// /// Representations of how data views might be specified. Indices are 0-indexed, @@ -57,9 +58,12 @@ impl Subset { } } - pub fn filter<'a, I>(&self, mut iter: I) -> Box)> + 'a> + pub fn filter<'a, I>( + &self, + mut iter: I, + ) -> Box)> + 'a> where - I: Iterator)> + 'a + I: Iterator)> + 'a, { match self.clone() { Subset::Indices(i) => { @@ -151,7 +155,7 @@ impl Subset { // and finally we convert our new indices into an iterator Box::new(indices.into_iter()) } - }, + } Subset::Mask(mask) => { Box::new( mask.borrow() @@ -166,14 +170,12 @@ impl Subset { }), ) } - Subset::Range(range) => { - Box::new( - iter.skip(range.start) - .enumerate() - .take_while(move |(i, _)| i < &(range.end - range.start)) - .map(|(_, v)| v), - ) - } + Subset::Range(range) => Box::new( + iter.skip(range.start) + .enumerate() + .take_while(move |(i, _)| i < &(range.end - range.start)) + .map(|(_, v)| v), + ), Subset::Names(_) => unimplemented!(), } } @@ -235,9 +237,7 @@ impl TryFrom for Subset { Ok(Subset::Mask(v.inner())) } } - Vector::Character(v) => { - Ok(Subset::Names(v.inner())) - } + Vector::Character(v) => Ok(Subset::Names(v.inner())), } } } diff --git a/src/object/vector/subsets.rs b/src/object/vector/subsets.rs index 835801dc..bed09894 100644 
--- a/src/object/vector/subsets.rs +++ b/src/object/vector/subsets.rs @@ -1,11 +1,13 @@ -use super::{Subset, OptionNA}; +use std::{cell::RefCell, collections::HashMap, rc::Rc}; + +use super::Subset; #[derive(Debug, Clone, PartialEq, Default)] pub struct Subsets(pub Vec); pub struct NamedSubsets { subsets: Subsets, - names: Vec>, + names: Rc>>>, } impl Subsets { @@ -37,8 +39,11 @@ impl Subsets { v.push(subset.into()); } - pub fn bind_names(self, names: Vec>) -> NamedSubsets { - NamedSubsets { subsets: self, names } + pub fn bind_names(self, names: Rc>>>) -> NamedSubsets { + NamedSubsets { + subsets: self, + names, + } } } @@ -62,20 +67,53 @@ impl IntoIterator for NamedSubsets { for subset in subsets { match subset { Subset::Names(names) => { - // TODO(performance): extract named elements without - // repeatedly iterating through named values - let mut indices = vec![(0, None); names.borrow().len()]; - for (i, _) in iter.take(self.names.len()) { - if let Some(Some(ni)) = self.names.get(i) { - for (si, sn) in names.borrow().iter().enumerate() { - if &OptionNA::Some(ni.to_string()) == sn { - indices[si] = (i, Some(i)) - } + use super::OptionNA; + const NOTFOUND: (usize, Option) = (0, None); + + let snames = self.names.borrow(); + + // grab indices within subset to find first named index + let (_, hint_n_max) = iter.size_hint(); + let subset_indices: Vec<_> = match hint_n_max { + Some(n) => iter.map(|(i, _)| i).take(n).collect(), + None => { + // figure out the absolute maximum value we may require + let mut n = 0 as usize; + for name in names.borrow().iter() { + let OptionNA::Some(name) = name else { continue }; + let name_max = snames + .get(name) + .and_then(|name| name.iter().reduce(|l, r| std::cmp::max(l, r))) + .unwrap_or(&0); + + n = std::cmp::max(n, *name_max) } - } - } - iter = Box::new(indices.into_iter()) - }, + iter.map(|(i, _)| i).take(n + 1).collect() + }, + }; + + // for each name, find the first index in the subset + let named_indices = names + .borrow() 
+ .iter() + .map(|name| match name { + OptionNA::NA => NOTFOUND, + OptionNA::Some(name) => snames + .get(name) + .and_then(|name_indices| { + for i in name_indices { + if subset_indices.contains(i) { + return Some((*i, Some(*i))); + } + } + Some(NOTFOUND) + }) + .unwrap_or(NOTFOUND), + }) + .collect::>(); + + iter = Box::new(named_indices.into_iter()) as Self::IntoIter + } _ => iter = subset.filter(iter), } } diff --git a/src/repl/release.rs b/src/repl/release.rs index d0da6f17..d17bffb5 100644 --- a/src/repl/release.rs +++ b/src/repl/release.rs @@ -7,5 +7,5 @@ pub fn session_header() -> String { String::from("") }; - format!("R version 0.2.0 -- \"In Bloom\"{dev}") + format!("R version 0.3.0 -- \"Days of Abandon\"{dev}") }