From 6119a75dd7a06ef5c1991614484d681ea471d355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Sun, 21 Jul 2024 11:56:30 +0200 Subject: [PATCH 01/15] implement reshape wide (#172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit no tests yet ```julia julia> df 4×3 DataFrame Row │ i j x │ Int64 Int64 Int64 ─────┼───────────────────── 1 │ 1 1 1 2 │ 2 1 2 3 │ 1 2 3 4 │ 2 2 4 julia> @with df @reshape wide x, i(i) j(j) Kezdi.jl> @reshape_wide x, i(i) j(j) 2×3 DataFrame Row │ i x1 x2 │ Int64 Int64? Int64? ─────┼─────────────────────── 1 │ 1 1 3 2 │ 2 2 4 ``` --- src/Kezdi.jl | 2 +- src/commands.jl | 12 ++++++++++++ src/macros.jl | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/Kezdi.jl b/src/Kezdi.jl index 9348046..c5657bb 100644 --- a/src/Kezdi.jl +++ b/src/Kezdi.jl @@ -2,7 +2,7 @@ Kezdi.jl is a Julia package for data manipulation and analysis. It is inspired by Stata, but it is written in Julia, which makes it faster and more flexible. It is designed to be used in the Julia REPL, but it can also be used in Jupyter notebooks or in scripts. 
""" module Kezdi -export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, @use, @tabulate, @count, @sort, @order, @list, @head, @tail, @names, @rename, @clear, @describe +export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, @use, @tabulate, @count, @sort, @order, @list, @head, @tail, @names, @rename, @clear, @describe, @reshape export getdf, setdf, display_and_return, keep_only_values, rowcount, distinct, cond diff --git a/src/commands.jl b/src/commands.jl index faac0f8..0599a83 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -1,6 +1,18 @@ # use multiple dispatch to generate code rewrite(command::Command) = rewrite(Val(command.command), command) +function rewrite(::Val{:reshape_wide}, command::Command) + gc = generate_command(command; options=[:variables], allowed=[:i, :j]) + (; local_copy, target_df, setup, teardown, arguments, options) = gc + i = get_option(command, :i)[1] |> replace_column_references + j = get_option(command, :j)[1] |> replace_column_references + var = collect(arguments)[1] |> replace_column_references + quote + $setup + unstack($local_copy, $i, $j, $var, renamecols = x -> Symbol($var, x)) |> $teardown |> setdf + end |> esc +end + function rewrite(::Val{:rename}, command::Command) gc = generate_command(command; options=[:variables], allowed=[]) (; local_copy, target_df, setup, teardown, arguments, options) = gc diff --git a/src/macros.jl b/src/macros.jl index f95f1d0..bbeedf5 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -198,3 +198,18 @@ macro describe(exprs...) :describe |> parse(exprs) |> rewrite end +""" + @reshape long y1 y2 ... i(var) j(var) + @reshape wide y1 y2 ... i(var) j(var) + +Reshape the data frame from wide to long or from long to wide format. The variables `y1`, `y2`, etc. are the variables to be reshaped. The `i(var)` and `j(var)` are the variables that define the row and column indices in the reshaped data frame. +""" +macro reshape(exprs...) 
+ if exprs[1] == :long + :reshape_long |> parse(exprs[2:end]) |> rewrite + elseif exprs[1] == :wide + :reshape_wide |> parse(exprs[2:end]) |> rewrite + else + ArgumentError("Invalid option $(exprs[1]). Correct syntax:\n@reshape long y1 y2 ... i(var) j(var)\n@reshape wide y1 y2 ... i(var) j(var)") |> throw + end +end \ No newline at end of file From a79bc3aa0e845ec23c25e82842eb667e1ee45626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 26 Jul 2024 17:55:40 +0200 Subject: [PATCH 02/15] bugfix: double vectorization in replace - fixes #182 - new test added - 405/405 tests pass - version bumped to 0.5.1 --- src/commands.jl | 2 +- test/commands.jl | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/commands.jl b/src/commands.jl index 0599a83..467621e 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -43,7 +43,7 @@ function rewrite(::Val{:replace}, command::Command) target_column = get_LHS(command.arguments[1]) LHS, RHS = split_assignment(arguments[1]) third_vector = gensym() - bitmask = build_bitmask(local_copy, vectorize_function_calls(replace_column_references(local_copy, command.condition))) + bitmask = build_bitmask(local_copy, command.condition) quote !($target_column in names(getdf())) && ArgumentError("Column \"$($target_column)\" does not exist in $(names(getdf()))") |> throw $setup diff --git a/test/commands.jl b/test/commands.jl index de7517a..b100762 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -112,6 +112,11 @@ end @testset "Error handling" begin @test_throws Exception @with df @replace y = 1 end + + @testset "Double vectorization bug (#182)" begin + positive(x) = x > 0 + @test (@with DataFrame(x=1:4, y=5:8) @replace y = 0 @if positive(x - 2)).y == [5, 6, 0, 0] + end end @testset "Missing values" begin From c07d23b7ec0053a0cff79ea1dec496344faaec5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Fri, 26 Jul 2024 17:55:40 +0200 Subject: [PATCH 03/15] bugfix: double 
vectorization in replace - fixes #182 - new test added - 405/405 tests pass - version bumped to 0.5.1 --- Project.toml | 2 +- src/commands.jl | 2 +- test/commands.jl | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 15406e8..b29eec1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Kezdi" uuid = "48308a23-c29e-446c-b4c0-d9446a767439" authors = ["Miklos Koren ", "Gergely Attila Kiss "] -version = "0.5.0" +version = "0.5.1" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/src/commands.jl b/src/commands.jl index 0599a83..467621e 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -43,7 +43,7 @@ function rewrite(::Val{:replace}, command::Command) target_column = get_LHS(command.arguments[1]) LHS, RHS = split_assignment(arguments[1]) third_vector = gensym() - bitmask = build_bitmask(local_copy, vectorize_function_calls(replace_column_references(local_copy, command.condition))) + bitmask = build_bitmask(local_copy, command.condition) quote !($target_column in names(getdf())) && ArgumentError("Column \"$($target_column)\" does not exist in $(names(getdf()))") |> throw $setup diff --git a/test/commands.jl b/test/commands.jl index de7517a..b100762 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -112,6 +112,11 @@ end @testset "Error handling" begin @test_throws Exception @with df @replace y = 1 end + + @testset "Double vectorization bug (#182)" begin + positive(x) = x > 0 + @test (@with DataFrame(x=1:4, y=5:8) @replace y = 0 @if positive(x - 2)).y == [5, 6, 0, 0] + end end @testset "Missing values" begin From ec1b71ba87df2ae9a46a96b44f21dfc15e19b39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Mon, 29 Jul 2024 22:49:33 +0200 Subject: [PATCH 04/15] Reshape wide for single and multiple vars (WIP) --- src/commands.jl | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index 
0845c4f..8472d44 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -6,10 +6,28 @@ function rewrite(::Val{:reshape_wide}, command::Command) (; local_copy, target_df, setup, teardown, arguments, options) = gc i = get_option(command, :i)[1] |> replace_column_references j = get_option(command, :j)[1] |> replace_column_references - var = collect(arguments)[1] |> replace_column_references + vars = collect(arguments) |> replace_column_references + df_list = gensym() + combined_df = gensym() + #= + TODO: + - multiple vqribales + - unstack can only do 1 variable at a time + - varlist in i + =# + length(vars) > 1 ? quote $setup - unstack($local_copy, $i, $j, $var, renamecols = x -> Symbol($var, x)) |> $teardown |> setdf + $df_list = [unstack($target_df, $i, $j, var, renamecols = x -> Symbol(var, x)) for var in $vars] + $combined_df = $df_list[1] + for df in $df_list[2:end] + $combined_df = innerjoin($combined_df, df, on = $i) + end + $combined_df |> $teardown |> setdf + end |> esc : + quote + $setup + unstack($target_df, $i, $j, $vars[1], renamecols = x -> Symbol($vars[1], x)) |> $teardown |> setdf end |> esc end From 4d0fc3bc99545e65abfbe825194d0fc4851381b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Tue, 30 Jul 2024 08:25:18 +0200 Subject: [PATCH 05/15] Add tests for reshape wide (#172) - 451/451 pass --- src/commands.jl | 8 ++++++-- test/commands.jl | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index 8472d44..7a5e125 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -1,9 +1,15 @@ # use multiple dispatch to generate code rewrite(command::Command) = rewrite(Val(command.command), command) +function rewrite(::Val{:reshape_long}, command::Command) + error("@reshape long not implemented yet") +end + function rewrite(::Val{:reshape_wide}, command::Command) gc = generate_command(command; options=[:variables], allowed=[:i, :j]) (; local_copy, target_df, 
setup, teardown, arguments, options) = gc + get_option(command, :i) isa Nothing && ArgumentError("i() is mandatory. Syntax is @reshape wide y1 y2 ... i(var) j(var)") |> throw + get_option(command, :j) isa Nothing && ArgumentError("j() is mandatory. Syntax is @reshape wide y1 y2 ... i(var) j(var)") |> throw i = get_option(command, :i)[1] |> replace_column_references j = get_option(command, :j)[1] |> replace_column_references vars = collect(arguments) |> replace_column_references @@ -11,8 +17,6 @@ function rewrite(::Val{:reshape_wide}, command::Command) combined_df = gensym() #= TODO: - - multiple vqribales - - unstack can only do 1 variable at a time - varlist in i =# length(vars) > 1 ? diff --git a/test/commands.jl b/test/commands.jl index ebdbc8e..b77fdb0 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -768,4 +768,43 @@ end @use "test.dta", clear @test df == getdf() try @use "test.dta" @if x<5, clear; catch e; @test e isa LoadError; end +end + +@testset "Reshape wide" begin + df = DataFrame(i=[1, 1, 2, 2], j=[1, 2, 1, 2], x=1:4, y=5:8) + @testset "Known values" begin + df2 = @with df @reshape wide x y, i(i) j(j) + @test names(df2) == ["i", "x1", "x2", "y1", "y2"] + @test all(df2.x1 .== [1, 3]) + @test all(df2.x2 .== [2, 4]) + @test all(df2.y1 .== [5, 7]) + @test all(df2.y2 .== [6, 8]) + df2 = @with df @reshape wide x, i(i) j(j) + @test names(df2) == ["i", "x1", "x2"] + @test all(df2.x1 .== [1, 3]) + @test all(df2.x2 .== [2, 4]) + df2 = @with df @reshape wide x, i(j) j(i) + @test names(df2) == ["j", "x1", "x2"] + @test all(df2.x1 .== [1, 2]) + @test all(df2.x2 .== [3, 4]) + end + + @testset "Unbalanced panel" begin + df = DataFrame(i=[1, 1, 2, 2, 2], j=[1, 2, 1, 2, 3], x=1:5, y=5:9) + df2 = @with df @reshape wide x y, i(i) j(j) + @test names(df2) == ["i", "x1", "x2", "x3", "y1", "y2", "y3"] + @test all(df2.x1 .== [1, 3]) + @test all(df2.x2 .== [2, 4]) + @test all(df2.x3 .=== [missing, 5]) + @test all(df2.y1 .== [5, 7]) + @test all(df2.y2 .== [6, 8]) + @test 
all(df2.y3 .=== [missing, 9]) + df2 = @with df @reshape wide x y, i(j) j(i) + @test names(df2) == ["j", "x1", "x2", "y1", "y2"] + @test all(df2.j .== [1, 2, 3]) + @test all(df2.x1 .=== [1, 2, missing]) + @test all(df2.x2 .== [3, 4, 5]) + @test all(df2.y1 .=== [5, 6, missing]) + @test all(df2.y2 .== [7, 8, 9]) + end end \ No newline at end of file From eb68c2339a45cf77a2cca0f4c7e526fb87fa3f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Tue, 30 Jul 2024 08:34:58 +0200 Subject: [PATCH 06/15] Implement multiple i() variables in `@reshape wide` - re #172 - update documentation - tests added - 458/458 pass --- src/commands.jl | 7 ++----- src/macros.jl | 6 ++++-- test/commands.jl | 12 ++++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index 7a5e125..bdfd524 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -10,15 +10,12 @@ function rewrite(::Val{:reshape_wide}, command::Command) (; local_copy, target_df, setup, teardown, arguments, options) = gc get_option(command, :i) isa Nothing && ArgumentError("i() is mandatory. Syntax is @reshape wide y1 y2 ... i(var) j(var)") |> throw get_option(command, :j) isa Nothing && ArgumentError("j() is mandatory. Syntax is @reshape wide y1 y2 ... i(var) j(var)") |> throw - i = get_option(command, :i)[1] |> replace_column_references + length(get_option(command, :j)) > 1 && ArgumentError("Only one variable can be specified for j() in @reshape wide") |> throw + i = get_option(command, :i) |> replace_column_references j = get_option(command, :j)[1] |> replace_column_references vars = collect(arguments) |> replace_column_references df_list = gensym() combined_df = gensym() - #= - TODO: - - varlist in i - =# length(vars) > 1 ? quote $setup diff --git a/src/macros.jl b/src/macros.jl index bbeedf5..2d61817 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -199,10 +199,12 @@ macro describe(exprs...) end """ - @reshape long y1 y2 ... 
i(var) j(var) - @reshape wide y1 y2 ... i(var) j(var) + @reshape long y1 y2 ... i(varlist) j(var) + @reshape wide y1 y2 ... i(varlist) j(var) Reshape the data frame from wide to long or from long to wide format. The variables `y1`, `y2`, etc. are the variables to be reshaped. The `i(var)` and `j(var)` are the variables that define the row and column indices in the reshaped data frame. + +The option `i()` may include multiple variables, like `i(var1, var2, var3)`. The option `j()` must include only one variable. """ macro reshape(exprs...) if exprs[1] == :long diff --git a/test/commands.jl b/test/commands.jl index b77fdb0..2df535b 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -807,4 +807,16 @@ end @test all(df2.y1 .=== [5, 6, missing]) @test all(df2.y2 .== [7, 8, 9]) end + + @testset "Multiple i variables" begin + df = DataFrame(i1=[1, 1, 2, 2], i2=[0, 0, 0, 1], j=[1, 2, 1, 2], x=1:4, y=5:8) + df2 = @with df @reshape wide x y, i(i1, i2) j(j) + @test names(df2) == ["i1", "i2", "x1", "x2", "y1", "y2"] + @test all(df2.i1 .== [1, 2, 2]) + @test all(df2.i2 .== [0, 0, 1]) + @test all(df2.x1 .=== [1, 3, missing]) + @test all(df2.x2 .=== [2, missing, 4]) + @test all(df2.y1 .=== [5, 7, missing]) + @test all(df2.y2 .=== [6, missing, 8]) + end end \ No newline at end of file From 72878936686a69d2d1e4736610790a50c8adee7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 08:26:14 +0200 Subject: [PATCH 07/15] Add `@generate` and `@replace` to speed benchmarks --- docs/examples/benchmark.do | 24 ++++++++++++++++++++---- docs/examples/benchmark.jl | 6 ++++++ docs/src/index.md | 16 +++++++++------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/docs/examples/benchmark.do b/docs/examples/benchmark.do index 2aa8fc1..3ebd9bc 100644 --- a/docs/examples/benchmark.do +++ b/docs/examples/benchmark.do @@ -10,11 +10,27 @@ gen i = _n set seed 12345 gen g = floor(runiform() * 100) +timer clear 1 +preserve +timer on 1 + generate ln_i = log(i) +timer off 1
+restore +timer list 1 + +timer clear 1 +preserve +timer on 1 + replace g = 2*i +timer off 1 +restore +timer list 1 + * Measure time for mean calculation by group timer clear 1 preserve timer on 1 -egen mean_i = mean(i), by(g) + egen mean_i = mean(i), by(g) timer off 1 restore timer list 1 @@ -23,7 +39,7 @@ timer list 1 preserve timer clear 3 timer on 3 -collapse (mean) mean_i=i, by(g) + collapse (mean) mean_i=i, by(g) timer off 3 restore timer list 3 @@ -38,7 +54,7 @@ timer list 5 * Measure time for summarize timer clear 7 timer on 7 -summarize g, detail + summarize g, detail timer off 7 timer list 7 @@ -46,7 +62,7 @@ timer list 7 preserve timer clear 9 timer on 9 -regress i g if g > 50 + regress i g if g > 50 timer off 9 restore timer list 9 diff --git a/docs/examples/benchmark.jl b/docs/examples/benchmark.jl index 7630ec8..bd26e83 100644 --- a/docs/examples/benchmark.jl +++ b/docs/examples/benchmark.jl @@ -6,6 +6,12 @@ using Pkg; Pkg.precompile() df = DataFrame(i = 1:10_000_000) df.g = rand(0:99, nrow(df)) +println("Generate") +@btime @with df @generate ln_i = log(i) + +println("Replace") +@btime @with df @replace g = 2*i + println("Egen") @btime @with df @egen mean_i = mean(i), by(g) diff --git a/docs/src/index.md b/docs/src/index.md index a6b48e7..24f1afe 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -65,13 +65,15 @@ end ### Free and open-source ### Speed -| Command | Stata | Julia 1st run | Julia 2nd run | Speedup | -| ------------ | ----- | ------------- | ------------- | ------- | -| `@egen` | 4.90s | 1.36s | 0.36s | 14x | -| `@collapse` | 0.92s | 0.39s | 0.28s | 3x | -| `@tabulate` | 2.14s | 0.68s | 0.09s | 24x | -| `@summarize` | 10.40s | 0.58s | 0.36s | 29x | -| `@regress` | 0.89s | 1.95s | 0.11s | 8x | +| Command | Stata | Julia 2nd run | Speedup | +| ------------ | ----- | ------------- | ------- | +| `@generate` | 230ms | 46ms | 5x | +| `@replace` | 232ms | 43ms | 5x | +| `@egen` | 5.00s | 0.37s | 13x | +| `@collapse` | 0.94s | 0.28s | 3x 
| +| `@tabulate` | 2.19s | 0.09s | 24x | +| `@summarize` | 10.56s | 0.35s | 30x | +| `@regress` | 0.85s | 0.14s | 6x | See the benchmarking code for [Stata](https://github.com/codedthinking/Kezdi.jl/blob/main/docs/examples/benchmark.do) and [Kezdi.jl](https://github.com/codedthinking/Kezdi.jl/blob/main/docs/examples/benchmark.jl). From ac881ffa3f3316d767d5ac5b8e62668b80fb351b Mon Sep 17 00:00:00 2001 From: "Gergely Attila Kiss (Geri)" <47605029+gergelyattilakiss@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:25:59 +0100 Subject: [PATCH 08/15] Update commands.jl --- test/commands.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/commands.jl b/test/commands.jl index de4d429..ad563e4 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -874,7 +874,6 @@ end @test all(df2.y2 .=== [6, missing, 8]) end end -end @testset "Save" begin @clear From eb638a1ff163e2978c3bd733d662a293567dad39 Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Fri, 8 Nov 2024 07:59:47 +0100 Subject: [PATCH 09/15] cover reshape long lines with tests --- src/macros.jl | 32 ++++++++++++++++++-------------- test/commands.jl | 28 +++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index dd8fb46..e745202 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -132,14 +132,14 @@ Read the data from the file `filename.dta` and set it as the global data frame. """ macro use(exprs...) command = parse(exprs, :use) - length(command.arguments) == 1 || ArgumentError("@use takes a single file name as an argument:\n@use \"filename.dta\"[, clear]") |> throw + length(command.arguments) == 1 || ArgumentError("@use takes a single file name as an argument:\n@use \"filename.dta\"[, clear]") |> throw # clear is the only permissible option isempty(filter(x -> x != :clear, command.options)) || ArgumentError("Invalid options $(string.(command.options)). 
Correct syntax:\n@use \"filename.dta\"[, clear]") |> throw fname = command.arguments[1] clear = :clear in command.options isnothing(getdf()) || clear || ArgumentError("There is already a global data frame set. If you want to replace it, use the \", clear\" option.") |> throw - :(println("$(Kezdi.prompt())$($command)\n");Kezdi.use($fname)) |> esc + :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.use($fname)) |> esc end """ @@ -154,7 +154,7 @@ macro save(exprs...) fname = command.arguments[1] replace = :replace in command.options ispath(fname) && !replace && ArgumentError("File $fname already exists.") |> throw - :(println("$(Kezdi.prompt())$($command)\n");Kezdi.save($fname)) |> esc + :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.save($fname)) |> esc end """ @@ -167,7 +167,7 @@ macro append(exprs...) length(command.arguments) == 1 || ArgumentError("@append takes a single file name as an argument:\n@append \"filename.dta\"") |> throw isnothing(getdf()) && ArgumentError("There is no data frame to append to.") |> throw fname = command.arguments[1] - :(println("$(Kezdi.prompt())$($command)\n");Kezdi.append($fname)) |> esc + :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.append($fname)) |> esc end """ @head [n] @@ -175,7 +175,7 @@ end Display the first `n` rows of the data frame. By default, `n` is 5. """ macro head(n=5) - :(println("$(Kezdi.prompt())@head $($n)\n");first(getdf(), $n) |> display_and_return) |> esc + :(println("$(Kezdi.prompt())@head $($n)\n"); first(getdf(), $n) |> display_and_return) |> esc end """ @@ -184,7 +184,7 @@ end Display the last `n` rows of the data frame. By default, `n` is 5. """ macro tail(n=5) - :(println("$(Kezdi.prompt())@tail $($n)\n");last(getdf(), $n) |> display_and_return) |> esc + :(println("$(Kezdi.prompt())@tail $($n)\n"); last(getdf(), $n) |> display_and_return) |> esc end """ @@ -193,7 +193,7 @@ end Display the names of the variables in the data frame. 
""" macro names() - :(println("$(Kezdi.prompt())@names\n");names(getdf()) |> display_and_return) |> esc + :(println("$(Kezdi.prompt())@names\n"); names(getdf()) |> display_and_return) |> esc end """ @@ -211,7 +211,7 @@ end Clears the global dataframe. """ macro clear() - :(println("$(Kezdi.prompt())@clear\n");setdf(nothing)) + :(println("$(Kezdi.prompt())@clear\n"); setdf(nothing)) end """ @@ -220,7 +220,7 @@ end Show the names and data types of columns of the data frame. If no variable names given, all are shown. """ macro describe(exprs...) - :describe |> parse(exprs) |> rewrite + :describe |> parse(exprs) |> rewrite end """ @@ -233,19 +233,23 @@ The option `i()` may include multiple variables, like `i(var1, var2, var3)`. The """ macro reshape(exprs...) if exprs[1] == :long - :reshape_long |> parse(exprs[2:end]) |> rewrite + return quote + :reshape_long |> parse(exprs[2:end]) |> rewrite + end elseif exprs[1] == :wide :reshape_wide |> parse(exprs[2:end]) |> rewrite else - ArgumentError("Invalid option $(exprs[1]). Correct syntax:\n@reshape long y1 y2 ... i(var) j(var)\n@reshape wide y1 y2 ... i(var) j(var)") |> throw + return quote + ArgumentError("Invalid option $(exprs[1]). Correct syntax:\n@reshape long y1 y2 ... i(var) j(var)\n@reshape wide y1 y2 ... i(var) j(var)") |> throw + end end -end +end """ @mvencode y1 y2 [_all] ... [if condition], [mv(value)] -Encode missing values in the variables `y1`, `y2`, etc. in the data frame. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `mv` is provided, the missing values are encoded with the value `value`. By default value is `missing` making no changes on the dataframe. Using `_all` encodes all varibles of the DataFrame. +Encode missing values in the variables `y1`, `y2`, etc. in the data frame. If `condition` is provided, the operation is executed only on rows for which the condition is true. 
If `mv` is provided, the missing values are encoded with the value `value`. By default value is `missing` making no changes on the dataframe. Using `_all` encodes all variables of the DataFrame. """ macro mvencode(exprs...) :mvencode |> parse(exprs) |> rewrite -end \ No newline at end of file +end diff --git a/test/commands.jl b/test/commands.jl index ad563e4..76d9b0f 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -821,7 +821,11 @@ end df = DataFrame(x=1:10, y=11:20) @use "test.dta", clear @test df == getdf() - try @use "test.dta" @if x<5, clear; catch e; @test e isa LoadError; end + try + @use "test.dta" @if x < 5, clear + catch e + @test e isa UndefVarError + end end @testset "Reshape wide" begin @@ -875,6 +879,28 @@ end end end +@testset "Reshape long" begin + df = DataFrame(i=[1, 1, 2, 2], j=[1, 2, 1, 2], x=1:4, y=5:8) + @testset "Known values" begin + @test_throws UndefVarError df2 = @with df @reshape long x y, i(i) j(j) + end + + @testset "Unbalanced panel" begin + df = DataFrame(i=[1, 1, 2, 2, 2], j=[1, 2, 1, 2, 3], x=1:5, y=5:9) + @test_throws UndefVarError df2 = @with df @reshape long x y, i(i) j(j) + end + + @testset "Multiple i variables" begin + df = DataFrame(i1=[1, 1, 2, 2], i2=[0, 0, 0, 1], j=[1, 2, 1, 2], x=1:4, y=5:8) + @test_throws UndefVarError df2 = @with df @reshape long x y, i(i1, i2) j(j) + end +end + +@testset "Reshape invalid" begin + df = DataFrame(i=[1, 1, 2, 2], j=[1, 2, 1, 2], x=1:4, y=5:8) + @test_throws UndefVarError df2 = @with df @reshape invalid x y, i(i) j(j) +end + @testset "Save" begin @clear df = DataFrame(x=Vector{Any}(1:11), y=11:21) From 643a6728a38f816ecce56ad7060f1de3805ecfcf Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Tue, 12 Nov 2024 18:33:56 +0100 Subject: [PATCH 10/15] implement reshape long --- src/commands.jl | 53 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index abae987..e38181d 100644 --- 
a/src/commands.jl +++ b/src/commands.jl @@ -2,7 +2,33 @@ rewrite(command::Command) = rewrite(Val(command.command), command) function rewrite(::Val{:reshape_long}, command::Command) - error("@reshape long not implemented yet") + gc = generate_command(command; options=[:variables], allowed=[:i, :j]) + (; local_copy, target_df, setup, teardown, arguments, options) = gc + get_option(command, :i) isa Nothing && ArgumentError("i() is mandatory. Syntax is @reshape long y1 y2 ... i(var) j(var)") |> throw + get_option(command, :j) isa Nothing && ArgumentError("j() is mandatory. Syntax is @reshape long y1 y2 ... i(var) j(var)") |> throw + length(get_option(command, :j)) > 1 && ArgumentError("Only one variable can be specified for j() in @reshape long") |> throw + i = get_option(command, :i) |> replace_column_references + j = get_option(command, :j)[1] |> replace_column_references + vars = collect(arguments) |> replace_column_references + var_lists = gensym() + combined_df = gensym() + combined_list = gensym() + quote + $setup + $var_lists = [[Symbol(name) for name in names($target_df) if startswith(name, var)] for var in $vars] + $combined_list = [stack($target_df, list, view=true) for list in $var_lists] + for (i, df) in enumerate(combined_list) + df[:, :j] = df[:, :variable] .|> x -> parse(Int, x[length(String(vars[i]))+1:end]) + rename!(df, :value => String(vars[i])) + select!(df, Not(:variable)) + end + $combined_df = $combined_list[1] + for df in $combined_list[2:end] + $combined_df = leftjoin($combined_df, df, on=[$i, $j], makeunique=true) + end + $combined_df = select($combined_df, collect(union(intersect(names.($combined_list)...), String.($vars)))) + $combined_df |> $teardown |> setdf + end |> esc end function rewrite(::Val{:reshape_wide}, command::Command) @@ -19,16 +45,17 @@ function rewrite(::Val{:reshape_wide}, command::Command) length(vars) > 1 ? 
quote $setup - $df_list = [unstack($target_df, $i, $j, var, renamecols = x -> Symbol(var, x)) for var in $vars] + $df_list = [unstack($target_df, $i, $j, var, renamecols=x -> Symbol(var, x)) for var in $vars] + $combined_df = innerjoin($df_list, on=$i) $combined_df = $df_list[1] for df in $df_list[2:end] - $combined_df = innerjoin($combined_df, df, on = $i) + $combined_df = innerjoin($combined_df, df, on=$i) end $combined_df |> $teardown |> setdf - end |> esc : + end |> esc : quote $setup - unstack($target_df, $i, $j, $vars[1], renamecols = x -> Symbol($vars[1], x)) |> $teardown |> setdf + unstack($target_df, $i, $j, $vars[1], renamecols=x -> Symbol($vars[1], x)) |> $teardown |> setdf end |> esc end @@ -88,7 +115,7 @@ function rewrite(::Val{:keep}, command::Command) cols = isempty(command.arguments) ? :(:) : :(collect($command.arguments)) quote $setup - $target_df[!, $cols] |> $teardown |> setdf + $target_df[!, $cols] |> $teardown |> setdf end |> esc end @@ -100,7 +127,7 @@ function rewrite(::Val{:drop}, command::Command) $setup select!($local_copy, Not(collect($(command.arguments)))) |> $teardown |> setdf end |> esc - end + end bitmask = build_bitmask(local_copy, command.condition) return quote $setup @@ -143,7 +170,7 @@ function rewrite(::Val{:sort}, command::Command) end function rewrite(::Val{:order}, command::Command) - gc = generate_command(command; options = [:variables, :nofunction], allowed=[:desc, :last, :after, :before , :alphabetical]) + gc = generate_command(command; options=[:variables, :nofunction], allowed=[:desc, :last, :after, :before, :alphabetical]) (; local_copy, target_df, setup, teardown, arguments, options) = gc desc = :desc in get_top_symbol.(options) last = :last in get_top_symbol.(options) @@ -160,7 +187,7 @@ function rewrite(::Val{:order}, command::Command) if desc && !alphabetical ArgumentError("Cannot use `desc` without `alphabetical` option in @order") |> throw end - + if before var = get_option(command, :before) elseif after @@ 
-181,7 +208,7 @@ function rewrite(::Val{:order}, command::Command) $setup $cols = [Symbol(col) for col in names($target_df) if Symbol(col) ∉ $target_cols] if $alphabetical - $cols = sort($cols, rev = $desc) + $cols = sort($cols, rev=$desc) end if $after @@ -204,7 +231,7 @@ function rewrite(::Val{:order}, command::Command) $cols = pushfirst!($cols, $target_cols...) end - $target_df[!, $cols]|> $teardown + $target_df[!, $cols] |> $teardown end |> esc end @@ -224,7 +251,7 @@ function rewrite(::Val{:mvencode}, command::Command) coltype = gensym() quote $setup - $valtype = typeof($value) + $valtype = typeof($value) for col in $cols $coltype = eltype($local_copy[.!($bitmask), col]) if $valtype != $coltype @@ -235,4 +262,4 @@ function rewrite(::Val{:mvencode}, command::Command) $local_copy[$bitmask, $cols] = mvreplace.($local_copy[$bitmask, $cols], $value) $local_copy |> $teardown end |> esc -end \ No newline at end of file +end From e2845d31565663bc371f575dc4aced197187ed8a Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Wed, 13 Nov 2024 12:57:43 +0100 Subject: [PATCH 11/15] implement working version of reshape long --- src/commands.jl | 19 +++++++++---------- src/functions.jl | 17 +++++++++++------ src/macros.jl | 4 +--- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index e38181d..ed3a3b9 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -12,21 +12,20 @@ function rewrite(::Val{:reshape_long}, command::Command) vars = collect(arguments) |> replace_column_references var_lists = gensym() combined_df = gensym() - combined_list = gensym() + df_list = gensym() quote $setup - $var_lists = [[Symbol(name) for name in names($target_df) if startswith(name, var)] for var in $vars] - $combined_list = [stack($target_df, list, view=true) for list in $var_lists] - for (i, df) in enumerate(combined_list) - df[:, :j] = df[:, :variable] .|> x -> parse(Int, x[length(String(vars[i]))+1:end]) - rename!(df, :value => 
String(vars[i])) + $var_lists = [[Symbol(name) for name in names($target_df) if startswith(name, String(var))] for var in $vars] + $df_list = [stack($target_df, list) for list in $var_lists] + for (i, df) in enumerate($df_list) + df[!, $j] = df[:, :variable] .|> x -> parse(Int, x[length(String($vars[i]))+1:end]) + rename!(df, :value => String($vars[i])) select!(df, Not(:variable)) end - $combined_df = $combined_list[1] - for df in $combined_list[2:end] - $combined_df = leftjoin($combined_df, df, on=[$i, $j], makeunique=true) + $combined_df = $df_list[1] + for df in $df_list[2:end] + $combined_df = innerjoin($combined_df, df, on=[$i..., $j], makeunique=true) end - $combined_df = select($combined_df, collect(union(intersect(names.($combined_list)...), String.($vars)))) $combined_df |> $teardown |> setdf end |> esc end diff --git a/src/functions.jl b/src/functions.jl index 3a2a502..435dc33 100644 --- a/src/functions.jl +++ b/src/functions.jl @@ -4,19 +4,19 @@ save(fname::AbstractString) = writestat(fname, getdf()) function append(fname::AbstractString) ispath(fname) || ArgumentError("File $fname does not exist.") |> throw _, ext = splitext(fname) - if ext in [".dta", ".sav", ".por", ".sas7bdat", ".xpt"] + if ext in [".dta", ".save", ".por", ".sas7bdat", ".xpt"] df = readstat(fname) |> DataFrame else df = CSV.read(fname, DataFrame) end cdf = getdf() cdf, df = create_cols(cdf, df) - df = vcat(cdf,df) + df = vcat(cdf, df) setdf(df) end function append(df::DataFrame) - cdf, df = create_cols(getdf(), df) + cdf, df = create_cols(getdf(), df) setdf(vcat(cdf, df)) end @@ -49,7 +49,7 @@ getdf() = _global_dataframe Set the global data frame. """ -setdf(df::Union{AbstractDataFrame, Nothing}) = global _global_dataframe = isnothing(df) ? nothing : copy(df) +setdf(df::Union{AbstractDataFrame,Nothing}) = global _global_dataframe = isnothing(df) ? 
nothing : copy(df) display_and_return(x) = (display(x); x) """ @@ -80,7 +80,7 @@ function summarize(df::AbstractDataFrame, column::Symbol)::Summarize skewness_val = skewness(data) # julia reports excess kurtosis, so we add 3 to get the kurtosis kurtosis_val = 3.0 + kurtosis(data) - + percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99] percentiles_values = quantile(data, percentiles ./ 100; alpha=0.5, beta=0.5) @@ -149,4 +149,9 @@ function _describe(df::AbstractDataFrame, cols::Vector{Symbol}=Symbol[]) table[!, [:variable, :eltype]] end -mvreplace(x, y) = ismissing(x) ? y : x \ No newline at end of file +""" + mvreplace(x, y) + +Return `y` if `x` is `missing`, otherwise return `x`. If `x` is a vector, the operation is vectorized. This function mimics `x ? y : z`, which cannot be vectorized. +""" +mvreplace(x, y) = ismissing(x) ? y : x diff --git a/src/macros.jl b/src/macros.jl index e745202..e36dd3c 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -233,9 +233,7 @@ The option `i()` may include multiple variables, like `i(var1, var2, var3)`. The """ macro reshape(exprs...) if exprs[1] == :long - return quote - :reshape_long |> parse(exprs[2:end]) |> rewrite - end + :reshape_long |> parse(exprs[2:end]) |> rewrite elseif exprs[1] == :wide :reshape_wide |> parse(exprs[2:end]) |> rewrite else From 0b9cc37731b86e64d2da2569497266e7aeec16a2 Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Wed, 13 Nov 2024 13:58:45 +0100 Subject: [PATCH 12/15] add test for reshape long --- README.md | 6 +++--- src/commands.jl | 8 ++++---- test/commands.jl | 25 +++++++++++++++++++------ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 45ba810..7f2b1e4 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ It imports and reexports [CSV](https://csv.juliadata.org/stable/), [DataFrames]( ## Getting started -> `Kezdi.jl` is currently in beta. We have more than 400 unit tests and a large code coverage. 
[![Coverage](https://codecov.io/gh/codedthinking/Kezdi.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/codedthinking/Kezdi.jl) The package, however, is not guaranteed to be bug-free. If you encounter any issues, please report them as a [GitHub issue](https://github.com/codedthinking/Kezdi.jl/issues/new). +> `Kezdi.jl` is currently in beta. We have more than 500 unit tests and a large code coverage. [![Coverage](https://codecov.io/gh/codedthinking/Kezdi.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/codedthinking/Kezdi.jl) The package, however, is not guaranteed to be bug-free. If you encounter any issues, please report them as a [GitHub issue](https://github.com/codedthinking/Kezdi.jl/issues/new). > > If you would like to receive updates on the package, please star the repository on GitHub and sign up for [email notifications here](https://relentless-producer-1210.ck.page/62d7ebb237). @@ -161,7 +161,7 @@ If you need to apply a function to individual elements of a column, you need to @generate n_words = length.(words) ``` -> Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `legth.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame. +> Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `length.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame. ### The `@if` condition Almost every command can be followed by an `@if` condition that filters the data frame. 
The command will only be executed on the subset of rows for which the condition evaluates to `true`. The condition can use any combination of column names and functions. @@ -221,4 +221,4 @@ Inspiration for the package came from [Tidier.jl](https://tidierorg.github.io/Ti The package is built on top of [DataFrames.jl](https://dataframes.juliadata.org/stable/), [FreqTables.jl](https://github.com/nalimilan/FreqTables.jl) and [FixedEffectModels.jl](https://github.com/FixedEffects/FixedEffectModels.jl). The `@with` function relies on [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) by Julius Krumbiegel. -The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/). \ No newline at end of file +The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/). 
diff --git a/src/commands.jl b/src/commands.jl index ed3a3b9..53d57de 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -17,15 +17,16 @@ function rewrite(::Val{:reshape_long}, command::Command) $setup $var_lists = [[Symbol(name) for name in names($target_df) if startswith(name, String(var))] for var in $vars] $df_list = [stack($target_df, list) for list in $var_lists] - for (i, df) in enumerate($df_list) - df[!, $j] = df[:, :variable] .|> x -> parse(Int, x[length(String($vars[i]))+1:end]) - rename!(df, :value => String($vars[i])) + for (n, df) in enumerate($df_list) + df[!, $j] = df[:, :variable] .|> x -> Base.parse(Int, x[length(String($vars[n]))+1:end]) + rename!(df, :value => String($vars[n])) select!(df, Not(:variable)) end $combined_df = $df_list[1] for df in $df_list[2:end] $combined_df = innerjoin($combined_df, df, on=[$i..., $j], makeunique=true) end + $combined_df = select!($combined_df, collect(union(intersect(names.($df_list)...), String.($vars)))) $combined_df |> $teardown |> setdf end |> esc end @@ -45,7 +46,6 @@ function rewrite(::Val{:reshape_wide}, command::Command) quote $setup $df_list = [unstack($target_df, $i, $j, var, renamecols=x -> Symbol(var, x)) for var in $vars] - $combined_df = innerjoin($df_list, on=$i) $combined_df = $df_list[1] for df in $df_list[2:end] $combined_df = innerjoin($combined_df, df, on=$i) diff --git a/test/commands.jl b/test/commands.jl index 76d9b0f..2d6b1ff 100644 --- a/test/commands.jl +++ b/test/commands.jl @@ -880,19 +880,32 @@ end end @testset "Reshape long" begin - df = DataFrame(i=[1, 1, 2, 2], j=[1, 2, 1, 2], x=1:4, y=5:8) + df = DataFrame(i=[1, 1, 2, 2], x1=1:4, x2=5:8) @testset "Known values" begin - @test_throws UndefVarError df2 = @with df @reshape long x y, i(i) j(j) + df2 = @with df @reshape long x, i(i) j(j) + @test names(df2) == ["i", "x", "j"] + @test all(df2.i .== [1, 1, 2, 2, 1, 1, 2, 2]) + @test all(df2.j .== [1, 1, 1, 1, 2, 2, 2, 2]) + @test all(df2.x .== [1, 2, 3, 4, 5, 6, 7, 8]) end @testset 
"Unbalanced panel" begin - df = DataFrame(i=[1, 1, 2, 2, 2], j=[1, 2, 1, 2, 3], x=1:5, y=5:9) - @test_throws UndefVarError df2 = @with df @reshape long x y, i(i) j(j) + df = DataFrame(i=[1, 1, 2, 2, 2], x1=1:5, x2=[missing, 7, missing, 9, 10], y1=5:9, y2=[10, missing, 12, missing, missing]) + df2 = @with df @reshape long x y, i(i) j(j) + @test names(df2) == ["i", "j", "x", "y"] + @test all(df2.j .== [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) + @test all(df2.x .=== [1, 2, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, missing, 7, missing, 7, missing, 9, 10, missing, 9, 10, missing, 9, 10]) + @test all(df2.y .=== [5, 5, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, missing, missing, 12, 12, 12, missing, missing, missing, missing, missing, missing]) end @testset "Multiple i variables" begin - df = DataFrame(i1=[1, 1, 2, 2], i2=[0, 0, 0, 1], j=[1, 2, 1, 2], x=1:4, y=5:8) - @test_throws UndefVarError df2 = @with df @reshape long x y, i(i1, i2) j(j) + df = DataFrame(i1=[1, 1, 2, 2], i2=[0, 0, 0, 1], x1=1:4, x2=5:8) + df2 = @with df @reshape long x, i(i1, i2) j(j) + @test names(df2) == ["i1", "i2", "x", "j"] + @test all(df2.i1 .== [1, 1, 2, 2, 1, 1, 2, 2]) + @test all(df2.i2 .== [0, 0, 0, 1, 0, 0, 0, 1]) + @test all(df2.j .== [1, 1, 1, 1, 2, 2, 2, 2]) + @test all(df2.x .== [1, 2, 3, 4, 5, 6, 7, 8]) end end From fc286ce5a0357fa39ce663815e67542e4cb2ea0e Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Wed, 13 Nov 2024 14:02:26 +0100 Subject: [PATCH 13/15] bump project version --- Project.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 26e9454..93c4555 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Kezdi" uuid = "48308a23-c29e-446c-b4c0-d9446a767439" authors = ["Miklos Koren ", "Gergely Attila Kiss "] -version = "0.5.2" +version = "0.5.3" [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" @@ -14,7 +14,7 @@ FixedEffectModels = 
"9d5cd8c9-2029-5cab-9928-427838db53e3" FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +Missing = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -32,7 +32,7 @@ FixedEffectModels = "1" FreqTables = "0.4" InteractiveUtils = "1" Logging = "1" -Missings = "1" +Missing = "1" RDatasets = "0.7" ReadStatTables = "0.3" Reexport = "1" From cfd447bbec5d76b281aa1806c5323aafb8da4b90 Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Wed, 13 Nov 2024 14:43:14 +0100 Subject: [PATCH 14/15] update documentation and macros.jl to mimic the doc structure --- docs/src/index.md | 26 ++++- src/macros.jl | 265 +++++++++++++++++++++++----------------------- 2 files changed, 159 insertions(+), 132 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 5110ebc..abe681a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -108,6 +108,10 @@ setdf @use ``` +```@docs +@save +``` + ```@docs getdf ``` @@ -132,6 +136,9 @@ getdf @clear ``` +```@docs +@describe +``` ### Filtering columns and rows ```@docs @keep @@ -154,6 +161,10 @@ getdf @replace ``` +```@docs +@mvencode +``` + ```@docs @egen ``` @@ -166,6 +177,17 @@ getdf @sort ``` +```@docs +@order +``` + +```@docs +@reshape +``` + +```@docs +@append +``` ### Summarizing and analyzing data ```@docs @@ -278,7 +300,7 @@ If you need to apply a function to individual elements of a column, you need to ``` !!! tip "Note: `length(words)` vs `length.(words)`" - Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `legth.` will operate on each cell in `words`, counting the number of words in each `Model` string. 
By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame. + Here, `words` becomes a vector of vectors, where each element is a vector of words in the corresponding `Model` string. The function `length.` will operate on each cell in `words`, counting the number of words in each `Model` string. By contrast, `length(words)` would return the number of elements in the `words` vector, which is the number of rows in the DataFrame. ### The `@if` condition Almost every command can be followed by an `@if` condition that filters the data frame. The command will only be executed on the subset of rows for which the condition evaluates to `true`. The condition can use any combination of column names and functions. @@ -470,4 +492,4 @@ Inspiration for the package came from [Tidier.jl](https://tidierorg.github.io/Ti The package is built on top of [DataFrames.jl](https://dataframes.juliadata.org/stable/), [FreqTables.jl](https://github.com/nalimilan/FreqTables.jl) and [FixedEffectModels.jl](https://github.com/FixedEffects/FixedEffectModels.jl). The `@with` function relies on [Chain.jl](https://github.com/jkrumbiegel/Chain.jl) by Julius Krumbiegel. -The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/). \ No newline at end of file +The package is named after [Gabor Kezdi](https://kezdigabor.life/), a Hungarian economist who has made significant contributions to [teaching data analysis](https://gabors-data-analysis.com/). diff --git a/src/macros.jl b/src/macros.jl index e36dd3c..3fe7ed4 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3,251 +3,256 @@ macro mockmacro(exprs...) parse(exprs, command) end +### Setting and inspecting the global data frame """ - @keep y1 y2 ... [@if condition] + @use "filename.dta", [clear] -Keep only the variables `y1`, `y2`, etc. in `df`.
If `condition` is provided, only the rows for which the condition is true are kept. +Read the data from the file `filename.dta` and set it as the global data frame. If there is already a global data frame, `@use` will throw an error unless the `clear` option is provided """ -macro keep(exprs...) - :keep |> parse(exprs) |> rewrite +macro use(exprs...) + command = parse(exprs, :use) + length(command.arguments) == 1 || ArgumentError("@use takes a single file name as an argument:\n@use \"filename.dta\"[, clear]") |> throw + # clear is the only permissible option + isempty(filter(x -> x != :clear, command.options)) || ArgumentError("Invalid options $(string.(command.options)). Correct syntax:\n@use \"filename.dta\"[, clear]") |> throw + fname = command.arguments[1] + clear = :clear in command.options + isnothing(getdf()) || clear || ArgumentError("There is already a global data frame set. If you want to replace it, use the \", clear\" option.") |> throw + + :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.use($fname)) |> esc end """ - @drop y1 y2 ... -or - @drop [@if condition] + @save "filename.dta", [replace] -Drop the variables `y1`, `y2`, etc. from `df`. If `condition` is provided, the rows for which the condition is true are dropped. +Save the global data frame to the file `filename.dta`. If the file already exists, the `replace` option must be provided. """ -macro drop(exprs...) - :drop |> parse(exprs) |> rewrite +macro save(exprs...) 
+ command = parse(exprs, :save) + length(command.arguments) == 1 || ArgumentError("@save takes a single file name as an argument:\n@save \"filename.dta\"") |> throw + isnothing(getdf()) && ArgumentError("There is no data frame to save.") |> throw + fname = command.arguments[1] + replace = :replace in command.options + ispath(fname) && !replace && ArgumentError("File $fname already exists.") |> throw + :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.save($fname)) |> esc end """ - @generate y = expr [@if condition] + @names -Create a new variable `y` in `df` by evaluating `expr`. If `condition` is provided, the operation is executed only on rows for which the condition is true. When the condition is false, the variable will be missing. +Display the names of the variables in the data frame. """ -macro generate(exprs...) - :generate |> parse(exprs) |> rewrite +macro names() + :(println("$(Kezdi.prompt())@names\n"); names(getdf()) |> display_and_return) |> esc end """ - @replace y = expr [@if condition] -Replace the values of `y` in `df` with the result of evaluating `expr`. If `condition` is provided, the operation is executed only on rows for which the condition is true. When the condition is false, the variable will be left unchanged. + @list [y1 y2...] [@if condition] + +Display the entire data frame or the rows for which the condition is true. If variable names are provided, only the variables in the list are displayed. """ -macro replace(exprs...) - :replace |> parse(exprs) |> rewrite +macro list(exprs...) + :list |> parse(exprs) |> rewrite end """ - @egen y1 = expr1 y2 = expr2 ... [@if condition], [by(group1, group2, ...)] + @head [n] -Generate new variables in `df` by evaluating expressions `expr1`, `expr2`, etc. If `condition` is provided, the operation is executed only on rows for which the condition is true. When the condition is false, the variables will be missing. If `by` is provided, the operation is executed by group. 
+Display the first `n` rows of the data frame. By default, `n` is 5. """ -macro egen(exprs...) - :egen |> parse(exprs) |> rewrite +macro head(n=5) + :(println("$(Kezdi.prompt())@head $($n)\n"); first(getdf(), $n) |> display_and_return) |> esc end """ - @collapse y1 = expr1 y2 = expr2 ... [@if condition], [by(group1, group2, ...)] + @tail [n] -Collapse `df` by evaluating expressions `expr1`, `expr2`, etc. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `by` is provided, the operation is executed by group. +Display the last `n` rows of the data frame. By default, `n` is 5. """ -macro collapse(exprs...) - :collapse |> parse(exprs) |> rewrite +macro tail(n=5) + :(println("$(Kezdi.prompt())@tail $($n)\n"); last(getdf(), $n) |> display_and_return) |> esc end """ - @summarize y [@if condition] + @clear -Summarize the variable `y` in `df`. If `condition` is provided, the operation is executed only on rows for which the condition is true. +Clears the global dataframe. """ -macro summarize(exprs...) - :summarize |> parse(exprs) |> rewrite +macro clear() + :(println("$(Kezdi.prompt())@clear\n"); setdf(nothing)) end """ - @regress y x1 x2 ... [@if condition], [robust] [cluster(var1, var2, ...)] - -Estimate a regression model in `df` with dependent variable `y` and independent variables `x1`, `x2`, etc. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `robust` is provided, robust standard errors are calculated. If `cluster` is provided, clustered standard errors are calculated. + @describe [y1] [y2]... -The regression is limited to rows for which all variables are values. Missing values, infinity, and NaN are automatically excluded. +Show the names and data types of columns of the data frame. If no variable names given, all are shown. """ -macro regress(exprs...) - :regress |> parse(exprs) |> rewrite +macro describe(exprs...) 
+ :describe |> parse(exprs) |> rewrite end +### Filtering columns and rows """ - @tabulate y1 y2 ... [@if condition] + @keep y1 y2 ... [@if condition] -Create a frequency table for the variables `y1`, `y2`, etc. in `df`. If `condition` is provided, the operation is executed only on rows for which the condition is true. +Keep only the variables `y1`, `y2`, etc. in `df`. If `condition` is provided, only the rows for which the condition is true are kept. """ -macro tabulate(exprs...) - :tabulate |> parse(exprs) |> rewrite +macro keep(exprs...) + :keep |> parse(exprs) |> rewrite end """ - @count [@if condition] + @drop y1 y2 ... +or + @drop [@if condition] -Count the number of rows for which the condition is true. If `condition` is not provided, the total number of rows is counted. +Drop the variables `y1`, `y2`, etc. from `df`. If `condition` is provided, the rows for which the condition is true are dropped. """ -macro count(exprs...) - :count |> parse(exprs) |> rewrite +macro drop(exprs...) + :drop |> parse(exprs) |> rewrite end +### Modifying the data """ - @sort y1 y2 ... , [desc] + @rename oldname newname -Sort the data frame by the variables `y1`, `y2`, etc. By default, the variables are sorted in ascending order. If `desc` is provided, the variables are sorted in descending order +Rename the variable `oldname` to `newname` in the data frame. """ -macro sort(exprs...) - :sort |> parse(exprs) |> rewrite +macro rename(exprs...) + :rename |> parse(exprs) |> rewrite end """ - @order y1 y2 ... , [desc] [last] [after=var] [before=var] [alphabetical] + @generate y = expr [@if condition] -Reorder the variables `y1`, `y2`, etc. in the data frame. By default, the variables are ordered in the order they are listed. If `desc` is provided, the variables are ordered in descending order. If `last` is provided, the variables are moved to the end of the data frame. If `after` is provided, the variables are moved after the variable `var`. 
If `before` is provided, the variables are moved before the variable `var`. If `alphabetical` is provided, the variables are ordered alphabetically. +Create a new variable `y` in `df` by evaluating `expr`. If `condition` is provided, the operation is executed only on rows for which the condition is true. When the condition is false, the variable will be missing. """ -macro order(exprs...) - :order |> parse(exprs) |> rewrite +macro generate(exprs...) + :generate |> parse(exprs) |> rewrite end """ - @list [y1 y2...] [@if condition] + @replace y = expr [@if condition] -Display the entire data frame or the rows for which the condition is true. If variable names are provided, only the variables in the list are displayed. +Replace the values of `y` in `df` with the result of evaluating `expr`. If `condition` is provided, the operation is executed only on rows for which the condition is true. When the condition is false, the variable will be left unchanged. """ -macro list(exprs...) - :list |> parse(exprs) |> rewrite +macro replace(exprs...) + :replace |> parse(exprs) |> rewrite end - """ - @use "filename.dta", [clear] + @mvencode y1 y2 [_all] ... [if condition], [mv(value)] -Read the data from the file `filename.dta` and set it as the global data frame. If there is already a global data frame, `@use` will throw an error unless the `clear` option is provided +Encode missing values in the variables `y1`, `y2`, etc. in the data frame. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `mv` is provided, the missing values are encoded with the value `value`. By default value is `missing` making no changes on the dataframe. Using `_all` encodes all variables of the DataFrame. """ -macro use(exprs...) 
- command = parse(exprs, :use) - length(command.arguments) == 1 || ArgumentError("@use takes a single file name as an argument:\n@use \"filename.dta\"[, clear]") |> throw - # clear is the only permissible option - isempty(filter(x -> x != :clear, command.options)) || ArgumentError("Invalid options $(string.(command.options)). Correct syntax:\n@use \"filename.dta\"[, clear]") |> throw - fname = command.arguments[1] - clear = :clear in command.options - isnothing(getdf()) || clear || ArgumentError("There is already a global data frame set. If you want to replace it, use the \", clear\" option.") |> throw - - :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.use($fname)) |> esc +macro mvencode(exprs...) + :mvencode |> parse(exprs) |> rewrite end """ - @save "filename.dta", [replace] + @egen y1 = expr1 y2 = expr2 ... [@if condition], [by(group1, group2, ...)] -Save the global data frame to the file `filename.dta`. If the file already exists, the `replace` option must be provided. +Generate new variables in `df` by evaluating expressions `expr1`, `expr2`, etc. If `condition` is provided, the operation is executed only on rows for which the condition is true. When the condition is false, the variables will be missing. If `by` is provided, the operation is executed by group. """ -macro save(exprs...) - command = parse(exprs, :save) - length(command.arguments) == 1 || ArgumentError("@save takes a single file name as an argument:\n@save \"filename.dta\"") |> throw - isnothing(getdf()) && ArgumentError("There is no data frame to save.") |> throw - fname = command.arguments[1] - replace = :replace in command.options - ispath(fname) && !replace && ArgumentError("File $fname already exists.") |> throw - :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.save($fname)) |> esc +macro egen(exprs...) + :egen |> parse(exprs) |> rewrite end """ - @append "filename.dta" + @collapse y1 = expr1 y2 = expr2 ... 
[@if condition], [by(group1, group2, ...)] -Append the data from the file `filename.dta` to the global data frame. Columns that are not common filled with missing values. +Collapse `df` by evaluating expressions `expr1`, `expr2`, etc. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `by` is provided, the operation is executed by group. """ -macro append(exprs...) - command = parse(exprs, :append) - length(command.arguments) == 1 || ArgumentError("@append takes a single file name as an argument:\n@append \"filename.dta\"") |> throw - isnothing(getdf()) && ArgumentError("There is no data frame to append to.") |> throw - fname = command.arguments[1] - :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.append($fname)) |> esc +macro collapse(exprs...) + :collapse |> parse(exprs) |> rewrite end + """ - @head [n] + @sort y1 y2 ... , [desc] -Display the first `n` rows of the data frame. By default, `n` is 5. +Sort the data frame by the variables `y1`, `y2`, etc. By default, the variables are sorted in ascending order. If `desc` is provided, the variables are sorted in descending order """ -macro head(n=5) - :(println("$(Kezdi.prompt())@head $($n)\n"); first(getdf(), $n) |> display_and_return) |> esc +macro sort(exprs...) + :sort |> parse(exprs) |> rewrite end """ - @tail [n] + @order y1 y2 ... , [desc] [last] [after=var] [before=var] [alphabetical] -Display the last `n` rows of the data frame. By default, `n` is 5. +Reorder the variables `y1`, `y2`, etc. in the data frame. By default, the variables are ordered in the order they are listed. If `desc` is provided, the variables are ordered in descending order. If `last` is provided, the variables are moved to the end of the data frame. If `after` is provided, the variables are moved after the variable `var`. If `before` is provided, the variables are moved before the variable `var`. If `alphabetical` is provided, the variables are ordered alphabetically. 
""" -macro tail(n=5) - :(println("$(Kezdi.prompt())@tail $($n)\n"); last(getdf(), $n) |> display_and_return) |> esc +macro order(exprs...) + :order |> parse(exprs) |> rewrite end """ - @names + @reshape long y1 y2 ... i(varlist) j(var) + @reshape wide y1 y2 ... i(varlist) j(var) -Display the names of the variables in the data frame. +Reshape the data frame from wide to long or from long to wide format. The variables `y1`, `y2`, etc. are the variables to be reshaped. The `i(var)` and `j(var)` are the variables that define the row and column indices in the reshaped data frame. + +The option `i()` may include multiple variables, like `i(var1, var2, var3)`. The option `j()` must include only one variable. """ -macro names() - :(println("$(Kezdi.prompt())@names\n"); names(getdf()) |> display_and_return) |> esc +macro reshape(exprs...) + if exprs[1] == :long + :reshape_long |> parse(exprs[2:end]) |> rewrite + elseif exprs[1] == :wide + :reshape_wide |> parse(exprs[2:end]) |> rewrite + else + return quote + ArgumentError("Invalid option $(exprs[1]). Correct syntax:\n@reshape long y1 y2 ... i(var) j(var)\n@reshape wide y1 y2 ... i(var) j(var)") |> throw + end + end end """ - @rename oldname newname + @append "filename.dta" / @append df -Rename the variable `oldname` to `newname` in the data frame. +Append the data from the file `filename.dta` or `df` DataFrame to the global data frame. Columns that are not common filled with missing values. """ -macro rename(exprs...) - :rename |> parse(exprs) |> rewrite +macro append(exprs...) 
+ command = parse(exprs, :append) + length(command.arguments) == 1 || ArgumentError("@append takes a single file name as an argument:\n@append \"filename.dta\"") |> throw + isnothing(getdf()) && ArgumentError("There is no data frame to append to.") |> throw + fname = command.arguments[1] + :(println("$(Kezdi.prompt())$($command)\n"); Kezdi.append($fname)) |> esc end +### Summarizing and analyzing data """ - @clear + @count [@if condition] -Clears the global dataframe. +Count the number of rows for which the condition is true. If `condition` is not provided, the total number of rows is counted. """ -macro clear() - :(println("$(Kezdi.prompt())@clear\n"); setdf(nothing)) +macro count(exprs...) + :count |> parse(exprs) |> rewrite end """ - @describe [y1] [y2]... + @tabulate y1 y2 ... [@if condition] -Show the names and data types of columns of the data frame. If no variable names given, all are shown. +Create a frequency table for the variables `y1`, `y2`, etc. in `df`. If `condition` is provided, the operation is executed only on rows for which the condition is true. """ -macro describe(exprs...) - :describe |> parse(exprs) |> rewrite +macro tabulate(exprs...) + :tabulate |> parse(exprs) |> rewrite end """ - @reshape long y1 y2 ... i(varlist) j(var) - @reshape wide y1 y2 ... i(varlist) j(var) - -Reshape the data frame from wide to long or from long to wide format. The variables `y1`, `y2`, etc. are the variables to be reshaped. The `i(var)` and `j(var)` are the variables that define the row and column indices in the reshaped data frame. + @summarize y [@if condition] -The option `i()` may include multiple variables, like `i(var1, var2, var3)`. The option `j()` must include only one variable. +Summarize the variable `y` in `df`. If `condition` is provided, the operation is executed only on rows for which the condition is true. """ -macro reshape(exprs...) 
- if exprs[1] == :long - :reshape_long |> parse(exprs[2:end]) |> rewrite - elseif exprs[1] == :wide - :reshape_wide |> parse(exprs[2:end]) |> rewrite - else - return quote - ArgumentError("Invalid option $(exprs[1]). Correct syntax:\n@reshape long y1 y2 ... i(var) j(var)\n@reshape wide y1 y2 ... i(var) j(var)") |> throw - end - end +macro summarize(exprs...) + :summarize |> parse(exprs) |> rewrite end """ - @mvencode y1 y2 [_all] ... [if condition], [mv(value)] + @regress y x1 x2 ... [@if condition], [robust] [cluster(var1, var2, ...)] -Encode missing values in the variables `y1`, `y2`, etc. in the data frame. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `mv` is provided, the missing values are encoded with the value `value`. By default value is `missing` making no changes on the dataframe. Using `_all` encodes all variables of the DataFrame. +Estimate a regression model in `df` with dependent variable `y` and independent variables `x1`, `x2`, etc. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `robust` is provided, robust standard errors are calculated. If `cluster` is provided, clustered standard errors are calculated. + +The regression is limited to rows for which all variables are values. Missing values, infinity, and NaN are automatically excluded. """ -macro mvencode(exprs...) - :mvencode |> parse(exprs) |> rewrite +macro regress(exprs...) 
+ :regress |> parse(exprs) |> rewrite end From ec538179bdb73c09455a4dc09e31c673b4a4921d Mon Sep 17 00:00:00 2001 From: Gergely Attila Kiss Date: Wed, 13 Nov 2024 14:51:57 +0100 Subject: [PATCH 15/15] correct Project.toml --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 93c4555..4f71e0a 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ FixedEffectModels = "9d5cd8c9-2029-5cab-9928-427838db53e3" FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -Missing = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -32,7 +32,7 @@ FixedEffectModels = "1" FreqTables = "0.4" InteractiveUtils = "1" Logging = "1" -Missing = "1" +Missings = "1" RDatasets = "0.7" ReadStatTables = "0.3" Reexport = "1"