From 103fd6b0dfc81539ff69c0b7f89026d57cee9323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 10:52:57 +0200 Subject: [PATCH 1/8] Add tests to show how #135 fails --- test/commands.jl | 2 ++ test/parse.jl | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/commands.jl b/test/commands.jl index 71dd87d..75ff264 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -379,6 +379,8 @@ end @testset "Complex conditions" begin df2 = @with dfxz @egen y = sum(x) @if z == 4 && x > 2 @test all(df2.y .=== [missing, missing, missing, 4]) + df2 = @with dfxz @egen y = sum(x) @if z == 4 && x > 2 && !ismissing(x) + @test all(df2.y .=== [missing, missing, missing, 4]) end end diff --git a/test/parse.jl b/test/parse.jl index 844a341..42626e7 100644 --- a/test/parse.jl +++ b/test/parse.jl @@ -12,8 +12,8 @@ TEST_CASES = [ (ex="@mockmacro x @if x < 0", command=:summarize, arguments=[:x], condition=:(x < 0), options=[]), (ex="@mockmacro x @if ln(x) < 0", command=:summarize, arguments=[:x], condition=:(ln(x) < 0), options=[]), (ex="@mockmacro x @if x < 0, detail", command=:summarize, arguments=[:x], condition=:(x < 0), options=[:detail]), - (ex="@mockmacro x @if x < 0 && y > 0", command=:summarize, arguments=[:x], condition=:(x < 0 .&& y > 0), options=[]), - (ex="@mockmacro x @if x < 0 && y > 0, detail", command=:summarize, arguments=[:x], condition=:(x < 0 .&& y > 0), options=[:detail]), + (ex="@mockmacro x @if x < 0 && y > 0", command=:summarize, arguments=[:x], condition=:(x < 0 && y > 0), options=[]), + (ex="@mockmacro x @if x < 0 && y > 0, detail", command=:summarize, arguments=[:x], condition=:(x < 0 && y > 0), options=[:detail]), ] From 40769f4ba461966009f763dc499480ec833080cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 10:56:43 +0200 Subject: [PATCH 2/8] Do not vectorize && and || in parser, this is a codegen issue #135 --- src/parse.jl | 18 ++---------------- 1 file changed, 2 
insertions(+), 16 deletions(-) diff --git a/src/parse.jl b/src/parse.jl index 2225e86..dbacfec 100644 --- a/src/parse.jl +++ b/src/parse.jl @@ -46,12 +46,12 @@ function construct_call(node::Node) end if node.type in [:&&, :||] && typeof(node.content) != Expr - return Expr(Symbol("." * String(node.type)), replace_logical_operators.(node.content)...) + return Expr(node.type, node.content...) end if typeof(node.content) == Expr if node.type in [:&&, :||] - return Expr(Symbol("." * String(node.type)), node.content.args...) + return Expr(node.type, node.content.args...) end return node.content end @@ -59,20 +59,6 @@ function construct_call(node::Node) return Expr(node.type, node.content...) end -function replace_logical_operators(args) - if args in [:&&, :||] - return Symbol("." * String(args)) - end - return args -end - -function replace_logical_operators(args::Expr)::Expr - if args.head in [:&&, :||] - return Expr(Symbol("." * String(args.head)), replace_logical_operators.(args.args)...) - end - return args -end - function transition(state::Int64, arg::Node)::Int64 ## from command to condition if arg.content == Symbol("@if") && state == 1 From 38c1735dd08fd63d63756d4c524466fbba1921aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 11:28:03 +0200 Subject: [PATCH 3/8] Add syntactic operators and vectorize them (fix #135) But not =, we don't want to mess with assignment 297 tests pass --- src/codegen.jl | 7 +++++++ src/consts.jl | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/codegen.jl b/src/codegen.jl index cdadac0..094e861 100644 --- a/src/codegen.jl +++ b/src/codegen.jl @@ -220,6 +220,13 @@ function vectorize_function_calls(expr::Any) vectorize_function_calls.(expr.args[2:end])...) ) end + elseif is_operator(expr.head) && !is_dotted_operator(expr.head) && expr.head in SYNTACTIC_OPERATORS + # special handling of syntactic operators like &&, ||, etc. 
+ # these are not called as a function + op = expr.head + dot_op = Symbol("." * String(op)) + return Expr(dot_op, + vectorize_function_calls.(expr.args)...) elseif is_operator(expr.args[1]) && !is_dotted_operator(expr.args[1]) op = expr.args[1] dot_op = Symbol("." * String(op)) diff --git a/src/consts.jl b/src/consts.jl index 841ac6e..ed62404 100644 --- a/src/consts.jl +++ b/src/consts.jl @@ -116,7 +116,11 @@ const OPTIONS = ( :variables ) -const OPERATORS = tuple([Symbol(x) for x in split(raw"= += -= −= *= /= //= |\\=| ^= ÷= %= <<= >>= >>>= |\|=| &= ⊻= ≔ ⩴ ≕ ← → ↔ ↚ ↛ ↞ ↠ ↢ ↣ ↦ ↤ ↮ ⇎ ⇍ ⇏ ⇐ ⇒ ⇔ ⇴ ⇶ ⇷ ⇸ ⇹ ⇺ ⇻ ⇼ ⇽ ⇾ ⇿ ⟵ ⟶ ⟷ ⟹ ⟺ ⟻ ⟼ ⟽ ⟾ ⟿ ⤀ ⤁ ⤂ ⤃ ⤄ ⤅ ⤆ ⤇ ⤌ ⤍ ⤎ ⤏ ⤐ ⤑ ⤔ ⤕ ⤖ ⤗ ⤘ ⤝ ⤞ ⤟ ⤠ ⥄ ⥅ ⥆ ⥇ ⥈ ⥊ ⥋ ⥎ ⥐ ⥒ ⥓ ⥖ ⥗ ⥚ ⥛ ⥞ ⥟ ⥢ ⥤ ⥦ ⥧ ⥨ ⥩ ⥪ ⥫ ⥬ ⥭ ⥰ ⧴ ⬱ ⬰ ⬲ ⬳ ⬴ ⬵ ⬶ ⬷ ⬸ ⬹ ⬺ ⬻ ⬼ ⬽ ⬾ ⬿ ⭀ ⭁ ⭂ ⭃ ⥷ ⭄ ⥺ ⭇ ⭈ ⭉ ⭊ ⭋ ⭌ ← → ⇜ ⇝ ↜ ↝ ↩ ↪ ↫ ↬ ↼ ↽ ⇀ ⇁ ⇄ ⇆ ⇇ ⇉ ⇋ ⇌ ⇚ ⇛ ⇠ ⇢ ↷ ↶ ↺ ↻ ~ --> <-- <--> > < >= ≥ <= ≤ == === ≡ != ≠ !== ≢ ∈ ∉ ∋ ∌ ⊆ ⊈ ⊂ ⊄ ⊊ ∝ ∊ ∍ ∥ ∦ ∷ ∺ ∻ ∽ ∾ ≁ ≃ ≂ ≄ ≅ ≆ ≇ ≈ ≉ ≊ ≋ ≌ ≍ ≎ ≐ ≑ ≒ ≓ ≖ ≗ ≘ ≙ ≚ ≛ ≜ ≝ ≞ ≟ ≣ ≦ ≧ ≨ ≩ ≪ ≫ ≬ ≭ ≮ ≯ ≰ ≱ ≲ ≳ ≴ ≵ ≶ ≷ ≸ ≹ ≺ ≻ ≼ ≽ ≾ ≿ ⊀ ⊁ ⊃ ⊅ ⊇ ⊉ ⊋ ⊏ ⊐ ⊑ ⊒ ⊜ ⊩ ⊬ ⊮ ⊰ ⊱ ⊲ ⊳ ⊴ ⊵ ⊶ ⊷ ⋍ ⋐ ⋑ ⋕ ⋖ ⋗ ⋘ ⋙ ⋚ ⋛ ⋜ ⋝ ⋞ ⋟ ⋠ ⋡ ⋢ ⋣ ⋤ ⋥ ⋦ ⋧ ⋨ ⋩ ⋪ ⋫ ⋬ ⋭ ⋲ ⋳ ⋴ ⋵ ⋶ ⋷ ⋸ ⋹ ⋺ ⋻ ⋼ ⋽ ⋾ ⋿ ⟈ ⟉ ⟒ ⦷ ⧀ ⧁ ⧡ ⧣ ⧤ ⧥ ⩦ ⩧ ⩪ ⩫ ⩬ ⩭ ⩮ ⩯ ⩰ ⩱ ⩲ ⩳ ⩵ ⩶ ⩷ ⩸ ⩹ ⩺ ⩻ ⩼ ⩽ ⩾ ⩿ ⪀ ⪁ ⪂ ⪃ ⪄ ⪅ ⪆ ⪇ ⪈ ⪉ ⪊ ⪋ ⪌ ⪍ ⪎ ⪏ ⪐ ⪑ ⪒ ⪓ ⪔ ⪕ ⪖ ⪗ ⪘ ⪙ ⪚ ⪛ ⪜ ⪝ ⪞ ⪟ ⪠ ⪡ ⪢ ⪣ ⪤ ⪥ ⪦ ⪧ ⪨ ⪩ ⪪ ⪫ ⪬ ⪭ ⪮ ⪯ ⪰ ⪱ ⪲ ⪳ ⪴ ⪵ ⪶ ⪷ ⪸ ⪹ ⪺ ⪻ ⪼ ⪽ ⪾ ⪿ ⫀ ⫁ ⫂ ⫃ ⫄ ⫅ ⫆ ⫇ ⫈ ⫉ ⫊ ⫋ ⫌ ⫍ ⫎ ⫏ ⫐ ⫑ ⫒ ⫓ ⫔ ⫕ ⫖ ⫗ ⫘ ⫙ ⫷ ⫸ ⫹ ⫺ ⊢ ⊣ ⟂ ⫪ ⫫ <: >: + - − ¦ |\|| ⊕ ⊖ ⊞ ⊟ |++| ∪ ∨ ⊔ ± ∓ ∔ ∸ ≏ ⊎ ⊻ ⊽ ⋎ ⋓ ⟇ ⧺ ⧻ ⨈ ⨢ ⨣ ⨤ ⨥ ⨦ ⨧ ⨨ ⨩ ⨪ ⨫ ⨬ ⨭ ⨮ ⨹ ⨺ ⩁ ⩂ ⩅ ⩊ ⩌ ⩏ ⩐ ⩒ ⩔ ⩖ ⩗ ⩛ ⩝ ⩡ ⩢ ⩣ * / ⌿ ÷ % & · · ⋅ ∘ × |\\| ∩ ∧ ⊗ ⊘ ⊙ ⊚ ⊛ ⊠ ⊡ ⊓ ∗ ∙ ∤ ⅋ ≀ ⊼ ⋄ ⋆ ⋇ ⋉ ⋊ ⋋ ⋌ ⋏ ⋒ ⟑ ⦸ ⦼ ⦾ ⦿ ⧶ ⧷ ⨇ ⨰ ⨱ ⨲ ⨳ ⨴ ⨵ ⨶ ⨷ ⨸ ⨻ ⨼ ⨽ ⩀ ⩃ ⩄ ⩋ ⩍ ⩎ ⩑ ⩓ ⩕ ⩘ ⩚ ⩜ ⩞ ⩟ ⩠ ⫛ ⊍ ▷ ⨝ ⟕ ⟖ ⟗ ⨟ // ^ ↑ ↓ ⇵ ⟰ ⟱ ⤈ ⤉ ⤊ ⤋ ⤒ ⤓ ⥉ ⥌ ⥍ ⥏ ⥑ ⥔ ⥕ ⥘ ⥙ ⥜ ⥝ ⥠ ⥡ ⥣ ⥥ ⥮ ⥯ ↑ ↓ << >> >>>")]...) +const SYNTACTIC_OPERATORS = tuple([Symbol(x) for x in split(raw"&& || += -= *= /= //= \= ^= ÷= %= <<= >>= >>>= |= &= ⊻=")]...) 
+const OPERATORS = tuple( + vcat( + [Symbol(x) for x in split(raw"= += -= −= *= /= //= \= ^= ÷= %= <<= >>= >>>= |= &= ⊻= ≔ ⩴ ≕ ← → ↔ ↚ ↛ ↞ ↠ ↢ ↣ ↦ ↤ ↮ ⇎ ⇍ ⇏ ⇐ ⇒ ⇔ ⇴ ⇶ ⇷ ⇸ ⇹ ⇺ ⇻ ⇼ ⇽ ⇾ ⇿ ⟵ ⟶ ⟷ ⟹ ⟺ ⟻ ⟼ ⟽ ⟾ ⟿ ⤀ ⤁ ⤂ ⤃ ⤄ ⤅ ⤆ ⤇ ⤌ ⤍ ⤎ ⤏ ⤐ ⤑ ⤔ ⤕ ⤖ ⤗ ⤘ ⤝ ⤞ ⤟ ⤠ ⥄ ⥅ ⥆ ⥇ ⥈ ⥊ ⥋ ⥎ ⥐ ⥒ ⥓ ⥖ ⥗ ⥚ ⥛ ⥞ ⥟ ⥢ ⥤ ⥦ ⥧ ⥨ ⥩ ⥪ ⥫ ⥬ ⥭ ⥰ ⧴ ⬱ ⬰ ⬲ ⬳ ⬴ ⬵ ⬶ ⬷ ⬸ ⬹ ⬺ ⬻ ⬼ ⬽ ⬾ ⬿ ⭀ ⭁ ⭂ ⭃ ⥷ ⭄ ⥺ ⭇ ⭈ ⭉ ⭊ ⭋ ⭌ ← → ⇜ ⇝ ↜ ↝ ↩ ↪ ↫ ↬ ↼ ↽ ⇀ ⇁ ⇄ ⇆ ⇇ ⇉ ⇋ ⇌ ⇚ ⇛ ⇠ ⇢ ↷ ↶ ↺ ↻ ~ --> <-- <--> > < >= ≥ <= ≤ == === ≡ != ≠ !== ≢ ∈ ∉ ∋ ∌ ⊆ ⊈ ⊂ ⊄ ⊊ ∝ ∊ ∍ ∥ ∦ ∷ ∺ ∻ ∽ ∾ ≁ ≃ ≂ ≄ ≅ ≆ ≇ ≈ ≉ ≊ ≋ ≌ ≍ ≎ ≐ ≑ ≒ ≓ ≖ ≗ ≘ ≙ ≚ ≛ ≜ ≝ ≞ ≟ ≣ ≦ ≧ ≨ ≩ ≪ ≫ ≬ ≭ ≮ ≯ ≰ ≱ ≲ ≳ ≴ ≵ ≶ ≷ ≸ ≹ ≺ ≻ ≼ ≽ ≾ ≿ ⊀ ⊁ ⊃ ⊅ ⊇ ⊉ ⊋ ⊏ ⊐ ⊑ ⊒ ⊜ ⊩ ⊬ ⊮ ⊰ ⊱ ⊲ ⊳ ⊴ ⊵ ⊶ ⊷ ⋍ ⋐ ⋑ ⋕ ⋖ ⋗ ⋘ ⋙ ⋚ ⋛ ⋜ ⋝ ⋞ ⋟ ⋠ ⋡ ⋢ ⋣ ⋤ ⋥ ⋦ ⋧ ⋨ ⋩ ⋪ ⋫ ⋬ ⋭ ⋲ ⋳ ⋴ ⋵ ⋶ ⋷ ⋸ ⋹ ⋺ ⋻ ⋼ ⋽ ⋾ ⋿ ⟈ ⟉ ⟒ ⦷ ⧀ ⧁ ⧡ ⧣ ⧤ ⧥ ⩦ ⩧ ⩪ ⩫ ⩬ ⩭ ⩮ ⩯ ⩰ ⩱ ⩲ ⩳ ⩵ ⩶ ⩷ ⩸ ⩹ ⩺ ⩻ ⩼ ⩽ ⩾ ⩿ ⪀ ⪁ ⪂ ⪃ ⪄ ⪅ ⪆ ⪇ ⪈ ⪉ ⪊ ⪋ ⪌ ⪍ ⪎ ⪏ ⪐ ⪑ ⪒ ⪓ ⪔ ⪕ ⪖ ⪗ ⪘ ⪙ ⪚ ⪛ ⪜ ⪝ ⪞ ⪟ ⪠ ⪡ ⪢ ⪣ ⪤ ⪥ ⪦ ⪧ ⪨ ⪩ ⪪ ⪫ ⪬ ⪭ ⪮ ⪯ ⪰ ⪱ ⪲ ⪳ ⪴ ⪵ ⪶ ⪷ ⪸ ⪹ ⪺ ⪻ ⪼ ⪽ ⪾ ⪿ ⫀ ⫁ ⫂ ⫃ ⫄ ⫅ ⫆ ⫇ ⫈ ⫉ ⫊ ⫋ ⫌ ⫍ ⫎ ⫏ ⫐ ⫑ ⫒ ⫓ ⫔ ⫕ ⫖ ⫗ ⫘ ⫙ ⫷ ⫸ ⫹ ⫺ ⊢ ⊣ ⟂ ⫪ ⫫ <: >: + - − ¦ | ⊕ ⊖ ⊞ ⊟ ++ ∪ ∨ ⊔ ± ∓ ∔ ∸ ≏ ⊎ ⊻ ⊽ ⋎ ⋓ ⟇ ⧺ ⧻ ⨈ ⨢ ⨣ ⨤ ⨥ ⨦ ⨧ ⨨ ⨩ ⨪ ⨫ ⨬ ⨭ ⨮ ⨹ ⨺ ⩁ ⩂ ⩅ ⩊ ⩌ ⩏ ⩐ ⩒ ⩔ ⩖ ⩗ ⩛ ⩝ ⩡ ⩢ ⩣ * / ⌿ ÷ % & · · ⋅ ∘ × \ ∩ ∧ ⊗ ⊘ ⊙ ⊚ ⊛ ⊠ ⊡ ⊓ ∗ ∙ ∤ ⅋ ≀ ⊼ ⋄ ⋆ ⋇ ⋉ ⋊ ⋋ ⋌ ⋏ ⋒ ⟑ ⦸ ⦼ ⦾ ⦿ ⧶ ⧷ ⨇ ⨰ ⨱ ⨲ ⨳ ⨴ ⨵ ⨶ ⨷ ⨸ ⨻ ⨼ ⨽ ⩀ ⩃ ⩄ ⩋ ⩍ ⩎ ⩑ ⩓ ⩕ ⩘ ⩚ ⩜ ⩞ ⩟ ⩠ ⫛ ⊍ ▷ ⨝ ⟕ ⟖ ⟗ ⨟ // ^ ↑ ↓ ⇵ ⟰ ⟱ ⤈ ⤉ ⤊ ⤋ ⤒ ⤓ ⥉ ⥌ ⥍ ⥏ ⥑ ⥔ ⥕ ⥘ ⥙ ⥜ ⥝ ⥠ ⥡ ⥣ ⥥ ⥮ ⥯ ↑ ↓ << >> >>>")], + SYNTACTIC_OPERATORS...)...) 
# not really a const, but anyway global _global_dataframe::Union{AbstractDataFrame, Nothing} = nothing From 3280d87d376445ccf8e4b1edcd56d8ff4f77f8e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 11:50:22 +0200 Subject: [PATCH 4/8] Drop NaNs, Infs, Nothings from analysis, not only Missings (#130) --- docs/src/index.md | 4 ++++ src/Kezdi.jl | 4 +++- src/codegen.jl | 7 ++----- src/functions.jl | 12 ++++++++++++ test/codegen.jl | 10 +++++----- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 911c48f..617aa37 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -331,6 +331,10 @@ rowcount DNV ``` +```@docs +keep_only_values +``` + ## Acknowledgements [^stata]: Stata is a registered trademark of StataCorp LLC. Kezdi.jl is not affiliated with StataCorp LLC. diff --git a/src/Kezdi.jl b/src/Kezdi.jl index b350841..4c1caab 100644 --- a/src/Kezdi.jl +++ b/src/Kezdi.jl @@ -2,7 +2,9 @@ Kezdi.jl is a Julia package for data manipulation and analysis. It is inspired by Stata, but it is written in Julia, which makes it faster and more flexible. It is designed to be used in the Julia REPL, but it can also be used in Jupyter notebooks or in scripts. 
""" module Kezdi -export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, use, @use, @tabulate, rowcount, distinct, @count, @sort, @order, getdf, setdf, @list, @head, @tail, @names, display_and_return, @rename +export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, use, @use, @tabulate, @count, @sort, @order, getdf, setdf, @list, @head, @tail, @names, @rename + +export display_and_return, keep_only_values, rowcount, distinct using Reexport using Logging diff --git a/src/codegen.jl b/src/codegen.jl index 094e861..164b95a 100644 --- a/src/codegen.jl +++ b/src/codegen.jl @@ -209,11 +209,8 @@ function vectorize_function_calls(expr::Any) elseif fname == :DNV return expr.args[2] elseif fname in DO_NOT_VECTORIZE || (!(fname in ALWAYS_VECTORIZE) && (length(methodswith(Vector, eval(fname); supertypes=true)) > 0)) - return Expr(expr.head, fname, - Expr(:call, :collect, - Expr(:call, :skipmissing, - vectorize_function_calls.(expr.args[2:end])...) - )) + skipmissing_each_arg = [Expr(:call, :keep_only_values, vectorize_function_calls(arg)) for arg in expr.args[2:end]] + return Expr(expr.head, fname, skipmissing_each_arg...) else return Expr(Symbol("."), fname, Expr(:tuple, diff --git a/src/functions.jl b/src/functions.jl index da0c950..ce8f401 100644 --- a/src/functions.jl +++ b/src/functions.jl @@ -82,3 +82,15 @@ counter(gdf::GroupedDataFrame) = [nrow(df) for df in gdf] Indicate that the function `f` should not be vectorized. The name DNV is only used for parsing, do not call it directly. """ DNV(args...; kwargs...) = error("This function should not be directly called. It is used to indicate that a function should not be vectorized. For example, @generate y = DNV(log(x))") + +isvalue(x) = true +isvalue(::Missing) = false +isvalue(::Nothing) = false +isvalue(x::Number) = isinf(x) || isnan(x) ? 
false : true + +""" + keep_only_values(x::AbstractVector) -> AbstractVector + +Return a vector with only the values of `x`, excluding any `missing` values, `nothing`s, `Inf`s and `NaN`s. +""" +keep_only_values(x) = filter(isvalue, x) \ No newline at end of file diff --git a/test/codegen.jl b/test/codegen.jl index a39ad61..517be8a 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -10,12 +10,12 @@ end @test_expr vectorize_function_calls(:(div(x, y))) == :(div.(x, y)) @test_expr vectorize_function_calls(:(1 + div(x, y, z))) == :(1 .+ div.(x, y, z)) @testset "Do not vectorize" begin - @test_expr vectorize_function_calls(:(mean(x))) == :(mean(collect(skipmissing(x)))) - @test_expr vectorize_function_calls(:(mean(x) + log(y))) == :(mean(collect(skipmissing(x))) .+ log.(y)) + @test_expr vectorize_function_calls(:(mean(x))) == :(mean(keep_only_values(x))) + @test_expr vectorize_function_calls(:(mean(x) + log(y))) == :(mean(keep_only_values(x)) .+ log.(y)) @test_expr vectorize_function_calls(:(log.(x))) == :(log.(x)) - @test_expr vectorize_function_calls(:(log(x) + sum(y))) == :(log.(x) .+ sum(collect(skipmissing(y)))) - @test_expr vectorize_function_calls(:(wsum(x))) == :(wsum(collect(skipmissing(x)))) - @test_expr vectorize_function_calls(:(std(x))) == :(std(collect(skipmissing(x)))) + @test_expr vectorize_function_calls(:(log(x) + sum(y))) == :(log.(x) .+ sum(keep_only_values(y))) + @test_expr vectorize_function_calls(:(wsum(x))) == :(wsum(keep_only_values(x))) + @test_expr vectorize_function_calls(:(std(x))) == :(std(keep_only_values(x))) end @testset "Explicit DNV request" begin From 64adbd1bbb4cc89ab0ce8eebeca15c22477c93e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 11:54:25 +0200 Subject: [PATCH 5/8] Add tests for NaN and Inf exclusion (#130) --- test/commands.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/commands.jl b/test/commands.jl index 75ff264..7282a0c 100644 --- a/test/commands.jl +++ 
b/test/commands.jl @@ -121,6 +121,8 @@ end @test df2.y == [4] df2 = @with df @collapse y = mean(x) @test df2.y == [2.0] + df2 = @with DataFrame(x=[1, Inf, 3]) @collapse y = sum(x) + @test df2.y == [4.0] end @testset "Vectorized does not collapse" begin df = DataFrame(x=1:4, z= 5:8) @@ -446,6 +448,17 @@ end @test s.N == 2 @test s.mean == 2.0 end + + @testset "Other NaN values" begin + df = DataFrame(x=[1, NaN, 3]) + s = @with df @summarize x + @test s.N == 2 + @test s.mean == 2.0 + df = DataFrame(x=[1, Inf, 3]) + s = @with df @summarize x + @test s.N == 2 + @test s.mean == 2.0 + end end @testset "Boolean-valued column" begin From 1b3a069fda1125c2dfdf44f96d70086309153361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 12:01:03 +0200 Subject: [PATCH 6/8] Filter on values (fix #130) 302 tests pass --- src/functions.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/functions.jl b/src/functions.jl index ce8f401..047f7c4 100644 --- a/src/functions.jl +++ b/src/functions.jl @@ -24,17 +24,17 @@ distinct(x::AbstractVector) = unique(x) distinct(x::Base.SkipMissing) = distinct(collect(x)) """ - rowcount(x::AbstractVector) = length(collect(skipmissing(x))) + rowcount(x::AbstractVector) = length(keep_only_values(x)) -Count the number of non-missing values in a vector. +Count the number of valid values in a vector. """ -rowcount(x::AbstractVector) = length(collect(skipmissing(x))) +rowcount(x::AbstractVector) = length(keep_only_values(x)) rowcount(x::Base.SkipMissing) = length(collect(x)) tabulate(df::AbstractDataFrame, columns::Vector{Symbol}) = freqtable(df, columns...) function summarize(df::AbstractDataFrame, column::Symbol)::Summarize - data = df[!, column] |> skipmissing |> collect + data = df[!, column] |> keep_only_values n = length(data) sum_val = sum(data) mean_val = mean(data) @@ -93,4 +93,4 @@ isvalue(x::Number) = isinf(x) || isnan(x) ? 
false : true Return a vector with only the values of `x`, excluding any `missing` values, `nothing`s, `Inf`s and `NaN`s. """ -keep_only_values(x) = filter(isvalue, x) \ No newline at end of file +keep_only_values(x) = filter(isvalue, collect(skipmissing(x))) \ No newline at end of file From 2cc1c1fc940477248cc86577e543b9e491273b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 12:04:56 +0200 Subject: [PATCH 7/8] Collapse is noticeably slower with `keep_only_values` --- docs/src/index.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 617aa37..81eac77 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -69,11 +69,11 @@ end | Command | Stata | Julia 1st run | Julia 2nd run | Speedup | | ------------ | ----- | ------------- | ------------- | ------- | -| `@egen` | 4.90s | 1.60s | 0.41s | 10x | -| `@collapse` | 0.92s | 0.18s | 0.13s | 8x | -| `@tabulate` | 2.14s | 0.46s | 0.10s | 20x | -| `@summarize` | 10.40s | 0.58s | 0.37s | 28x | -| `@regress` | 0.89s | 1.93s | 0.16s | 6x | +| `@egen` | 4.90s | 1.36s | 0.59s | 8x | +| `@collapse` | 0.92s | 0.39s | 0.39s | 2x | +| `@tabulate` | 2.14s | 0.68s | 0.10s | 21x | +| `@summarize` | 10.40s | 0.58s | 0.46s | 22x | +| `@regress` | 0.89s | 1.95s | 0.14s | 6x | See the benchmarking code for [Stata](https://github.com/codedthinking/Kezdi.jl/blob/main/examples/benchmark.do) and [Kezdi.jl](https://github.com/codedthinking/Kezdi.jl/blob/main/examples/benchmark.jl). 
From d2f14fc07f44868d2b88d62b79a689d5864c8707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 5 Jul 2024 12:08:18 +0200 Subject: [PATCH 8/8] Bump version number --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index a77b499..161825b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Kezdi" uuid = "48308a23-c29e-446c-b4c0-d9446a767439" authors = ["Miklos Koren ", "Gergely Attila Kiss "] -version = "0.4.3" +version = "0.4.4" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"