From 831b86c6db9256b8ee1e114abde44565ad17f9ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 07:35:49 +0200 Subject: [PATCH 01/10] Test speed with 20 million rows and 20 columns --- test/speed.jl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 test/speed.jl diff --git a/test/speed.jl b/test/speed.jl new file mode 100644 index 0000000..bef6ada --- /dev/null +++ b/test/speed.jl @@ -0,0 +1,17 @@ +begin + N = 20_000_000 + K = 20 + df = DataFrame(rand(N, K), :auto) + @time @with df begin + @generate ln_x1 = log(x1) + @generate ln_x2 = log(x2) + @generate ln_x3 = log(x3) + @generate ln_x4 = log(x4) + @generate ln_x5 = log(x5) + @generate ln_x6 = log(x6) + @generate ln_x7 = log(x7) + @generate ln_x8 = log(x8) + @generate ln_x9 = log(x9) + end + # 30.076813 seconds (9.99 k allocations: 33.528 GiB, 31.13% gc time, 1.26% compilation time) +end \ No newline at end of file From 0b1386eeabf412eb111238f9cfacc38e845b5a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 07:49:38 +0200 Subject: [PATCH 02/10] Move copy() to @with, do not do it for each command. Speed is not better --- src/codegen.jl | 2 +- src/functions.jl | 3 ++- test/speed.jl | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/codegen.jl b/src/codegen.jl index 6f71853..6379897 100644 --- a/src/codegen.jl +++ b/src/codegen.jl @@ -30,7 +30,7 @@ function generate_command(command::Command; options=[], allowed=[]) push!(setup, :(println("$(Kezdi.prompt())$($(string(command)))\n"))) push!(setup, :(getdf() isa AbstractDataFrame || error("Kezdi.jl commands can only operate on a global DataFrame set by setdf()"))) - push!(setup, :(local $df2 = copy(getdf()))) + push!(setup, :(local $df2 = Kezdi._global_dataframe)) variables_condition = (:ifable in options) ? vcat(extract_column_references(command.condition)...) : Symbol[] variables_RHS = (:variables in options) ? 
vcat(extract_column_references.(command.arguments)...) : Symbol[] variables = vcat(variables_condition, variables_RHS) diff --git a/src/functions.jl b/src/functions.jl index 50a0a60..3a2a502 100644 --- a/src/functions.jl +++ b/src/functions.jl @@ -1,5 +1,6 @@ use(fname::AbstractString) = readstat(fname) |> DataFrame |> setdf save(fname::AbstractString) = writestat(fname, getdf()) + function append(fname::AbstractString) ispath(fname) || ArgumentError("File $fname does not exist.") |> throw _, ext = splitext(fname) @@ -48,7 +49,7 @@ getdf() = _global_dataframe Set the global data frame. """ -setdf(df::Union{AbstractDataFrame, Nothing}) = global _global_dataframe = df +setdf(df::Union{AbstractDataFrame, Nothing}) = global _global_dataframe = isnothing(df) ? nothing : copy(df) display_and_return(x) = (display(x); x) """ diff --git a/test/speed.jl b/test/speed.jl index bef6ada..b9646f9 100644 --- a/test/speed.jl +++ b/test/speed.jl @@ -1,3 +1,5 @@ +using Kezdi + begin N = 20_000_000 K = 20 @@ -13,5 +15,5 @@ begin @generate ln_x8 = log(x8) @generate ln_x9 = log(x9) end - # 30.076813 seconds (9.99 k allocations: 33.528 GiB, 31.13% gc time, 1.26% compilation time) + # 43.221702 seconds (154.87 k allocations: 37.859 GiB, 27.68% gc time, 0.47% compilation time) end \ No newline at end of file From a4b498095e20aae27bbaf3e95bd98e907bdb033b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 07:53:58 +0200 Subject: [PATCH 03/10] Do not setdf() after command: speed is 10x --- src/commands.jl | 2 +- test/speed.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index 10470dc..b1359a8 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -21,7 +21,7 @@ function rewrite(::Val{:generate}, command::Command) $setup $local_copy[!, $target_column] .= missing $target_df[!, $target_column] .= $RHS - $local_copy |> $teardown |> setdf + $local_copy |> $teardown end |> esc end diff --git a/test/speed.jl 
b/test/speed.jl index b9646f9..31f5942 100644 --- a/test/speed.jl +++ b/test/speed.jl @@ -15,5 +15,5 @@ begin @generate ln_x8 = log(x8) @generate ln_x9 = log(x9) end - # 43.221702 seconds (154.87 k allocations: 37.859 GiB, 27.68% gc time, 0.47% compilation time) + # 4.397261 seconds (153.62 k allocations: 4.332 GiB, 32.32% gc time, 1.78% compilation time) end \ No newline at end of file From e297875a592a778e8da79652d02cdb61a4293fbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 08:05:54 +0200 Subject: [PATCH 04/10] Execute all commands in place, remove setdf from commands. 474/474 tests pass --- src/commands.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/commands.jl b/src/commands.jl index b1359a8..5283b9a 100644 --- a/src/commands.jl +++ b/src/commands.jl @@ -7,7 +7,7 @@ function rewrite(::Val{:rename}, command::Command) quote (length($arguments) != 2) && ArgumentError("Syntax is @rename oldname newname") |> throw $setup - rename!($local_copy, $arguments[1] => $arguments[2]) |> $teardown |> setdf + rename!($local_copy, $arguments[1] => $arguments[2]) |> $teardown end |> esc end @@ -47,7 +47,7 @@ function rewrite(::Val{:replace}, command::Command) else $target_df[!, $target_column] .= $RHS end - $local_copy |> $teardown |> setdf + $local_copy |> $teardown end |> esc end @@ -67,7 +67,7 @@ function rewrite(::Val{:drop}, command::Command) if isnothing(command.condition) return quote $setup - select($local_copy, Not(collect($(command.arguments)))) |> $teardown |> setdf + select!($local_copy, Not(collect($(command.arguments)))) |> $teardown |> setdf end |> esc end bitmask = build_bitmask(local_copy, command.condition) @@ -96,7 +96,7 @@ function rewrite(::Val{:egen}, command::Command) ($target_column in names(getdf())) && ArgumentError("Column \"$($target_column)\" already exists in $(names(getdf()))") |> throw $setup $transform_expression - $local_copy |> $teardown |> setdf + $local_copy |> 
$teardown end |> esc end @@ -107,7 +107,7 @@ function rewrite(::Val{:sort}, command::Command) desc = :desc in get_top_symbol.(options) ? true : false quote $setup - sort($target_df, $columns, rev=$desc) |> $teardown |> setdf + sort!($target_df, $columns, rev=$desc) |> $teardown end |> esc end @@ -173,7 +173,7 @@ function rewrite(::Val{:order}, command::Command) $cols = pushfirst!($cols, $target_cols...) end - $target_df[!, $cols]|> $teardown |> setdf + $target_df[!, $cols]|> $teardown end |> esc end @@ -202,6 +202,6 @@ function rewrite(::Val{:mvencode}, command::Command) end end $local_copy[$bitmask, $cols] = mvreplace.($local_copy[$bitmask, $cols], $value) - $local_copy |> $teardown |> setdf + $local_copy |> $teardown end |> esc end \ No newline at end of file From 6824593db037309af1e45bf708bd0831f82bac81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 08:15:56 +0200 Subject: [PATCH 05/10] Update speed benchmarks --- docs/src/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index a6b48e7..9b19827 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -65,13 +65,13 @@ end ### Free and open-source ### Speed -| Command | Stata | Julia 1st run | Julia 2nd run | Speedup | -| ------------ | ----- | ------------- | ------------- | ------- | -| `@egen` | 4.90s | 1.36s | 0.36s | 14x | -| `@collapse` | 0.92s | 0.39s | 0.28s | 3x | -| `@tabulate` | 2.14s | 0.68s | 0.09s | 24x | -| `@summarize` | 10.40s | 0.58s | 0.36s | 29x | -| `@regress` | 0.89s | 1.95s | 0.11s | 8x | +| Command | Stata | Julia 2nd run | Speedup | +| ------------ | ----- | ------------- | ------- | +| `@egen` | 4.90s | 0.37s | 13x | +| `@collapse` | 0.92s | 0.28s | 3x | +| `@tabulate` | 2.14s | 0.09s | 24x | +| `@summarize` | 10.40s | 0.35s | 30x | +| `@regress` | 0.89s | 0.14s | 6x | See the benchmarking code for 
[Stata](https://github.com/codedthinking/Kezdi.jl/blob/main/docs/examples/benchmark.do) and [Kezdi.jl](https://github.com/codedthinking/Kezdi.jl/blob/main/docs/examples/benchmark.jl). From c3ae3eafe33008c260dbab201e8f07d6fbc0ed3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 08:26:14 +0200 Subject: [PATCH 06/10] Add `generate` and `replace` to speed benchmarks --- docs/examples/benchmark.do | 24 ++++++++++++++++++++---- docs/examples/benchmark.jl | 6 ++++++ docs/src/index.md | 12 +++++++----- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/docs/examples/benchmark.do b/docs/examples/benchmark.do index 2aa8fc1..3ebd9bc 100644 --- a/docs/examples/benchmark.do +++ b/docs/examples/benchmark.do @@ -10,11 +10,27 @@ gen i = _n set seed 12345 gen g = floor(runiform() * 100) +timer clear 1 +preserve +timer on 1 + generate ln_i = log(i) +timer off 1 +restore +timer list 1 + +timer clear 1 +preserve +timer on 1 + replace g = 2*i +timer off 1 +restore +timer list 1 + * Measure time for mean calculation by group timer clear 1 preserve timer on 1 -egen mean_i = mean(i), by(g) + egen mean_i = mean(i), by(g) timer off 1 restore timer list 1 @@ -23,7 +39,7 @@ timer list 1 preserve timer clear 3 timer on 3 -collapse (mean) mean_i=i, by(g) + collapse (mean) mean_i=i, by(g) timer off 3 restore timer list 3 @@ -38,7 +54,7 @@ timer list 5 * Measure time for summarize timer clear 7 timer on 7 -summarize g, detail + summarize g, detail timer off 7 timer list 7 @@ -46,7 +62,7 @@ timer list 7 preserve timer clear 9 timer on 9 -regress i g if g > 50 + regress i g if g > 50 timer off 9 restore timer list 9 diff --git a/docs/examples/benchmark.jl b/docs/examples/benchmark.jl index 7630ec8..bd26e83 100644 --- a/docs/examples/benchmark.jl +++ b/docs/examples/benchmark.jl @@ -6,6 +6,12 @@ using Pkg; Pkg.precompile() df = DataFrame(i = 1:10_000_000) df.g = rand(0:99, nrow(df)) +println("Generate") +@btime @with df @generate ln_i = log(i) + 
+println("Replace") +@btime @with df @replace g = 2*i + println("Egen") @btime @with df @egen mean_i = mean(i), by(g) diff --git a/docs/src/index.md b/docs/src/index.md index 9b19827..24f1afe 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -67,11 +67,13 @@ end | Command | Stata | Julia 2nd run | Speedup | | ------------ | ----- | ------------- | ------- | -| `@egen` | 4.90s | 0.37s | 13x | -| `@collapse` | 0.92s | 0.28s | 3x | -| `@tabulate` | 2.14s | 0.09s | 24x | -| `@summarize` | 10.40s | 0.35s | 30x | -| `@regress` | 0.89s | 0.14s | 6x | +| `@generate` | 230ms | 46ms | 5x | +| `@replace` | 232ms | 43ms | 5x | +| `@egen` | 5.00s | 0.37s | 13x | +| `@collapse` | 0.94s | 0.28s | 3x | +| `@tabulate` | 2.19s | 0.09s | 24x | +| `@summarize` | 10.56s | 0.35s | 30x | +| `@regress` | 0.85s | 0.14s | 6x | See the benchmarking code for [Stata](https://github.com/codedthinking/Kezdi.jl/blob/main/docs/examples/benchmark.do) and [Kezdi.jl](https://github.com/codedthinking/Kezdi.jl/blob/main/docs/examples/benchmark.jl). 
From 61e2e96ab665b86b7f77e24f30b4dbe43d721db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 08:31:19 +0200 Subject: [PATCH 07/10] Bump patch version: no API change --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 5c5ef12..dcded7f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Kezdi" uuid = "48308a23-c29e-446c-b4c0-d9446a767439" authors = ["Miklos Koren ", "Gergely Attila Kiss "] -version = "0.5.1" +version = "0.5.2" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" From bb9862ee4d54f5ecda8403e6403d1f53b5e8dc78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 08:39:35 +0200 Subject: [PATCH 08/10] Test `generate` and `replace` without `@with` --- docs/examples/benchmark.jl | 10 ++++++++-- docs/src/index.md | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/examples/benchmark.jl b/docs/examples/benchmark.jl index bd26e83..6d75d95 100644 --- a/docs/examples/benchmark.jl +++ b/docs/examples/benchmark.jl @@ -6,11 +6,17 @@ using Pkg; Pkg.precompile() df = DataFrame(i = 1:10_000_000) df.g = rand(0:99, nrow(df)) + println("Generate") -@btime @with df @generate ln_i = log(i) +setdf(df) +@time @generate ln_i = log(i) +setdf(df) +@time @generate ln_i = log(i) + +setdf(df) println("Replace") -@btime @with df @replace g = 2*i +@btime @replace g = 2*i println("Egen") @btime @with df @egen mean_i = mean(i), by(g) diff --git a/docs/src/index.md b/docs/src/index.md index 24f1afe..5110ebc 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -68,7 +68,7 @@ end | Command | Stata | Julia 2nd run | Speedup | | ------------ | ----- | ------------- | ------- | | `@generate` | 230ms | 46ms | 5x | -| `@replace` | 232ms | 43ms | 5x | +| `@replace` | 232ms | 32ms | 7x | | `@egen` | 5.00s | 0.37s | 13x | | `@collapse` | 0.94s | 0.28s | 3x | | `@tabulate` | 2.19s | 0.09s | 24x | From 
a105985bdf20048addec101cd4e9b0464768579c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikl=C3=B3s=20Koren?= Date: Thu, 19 Sep 2024 12:54:18 +0200 Subject: [PATCH 09/10] Test that generate completes in less than 10 seconds - This is for 20m rows and 20 columns, 9 executions of generate log(x) - Current median time is 4-5 seconds, raising bar to 10 seconds to have some slack --- Project.toml | 2 ++ test/runtests.jl | 5 +++++ test/speed.jl | 35 ++++++++++++++++++----------------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/Project.toml b/Project.toml index dcded7f..e660fc5 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Miklos Koren ", "Gergely Attila Kiss Date: Thu, 19 Sep 2024 13:37:03 +0200 Subject: [PATCH 10/10] Raise benchmark to 30 seconds - Julia nightly builds were slower --- test/speed.jl | 4 ++-- test/test.dta | Bin 1915 -> 1915 bytes 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/speed.jl b/test/speed.jl index 5f4ba4f..4f8d5cd 100644 --- a/test/speed.jl +++ b/test/speed.jl @@ -1,4 +1,4 @@ -@testset "Generate completes within 10 seconds" begin +@testset "Generate completes within 30 seconds" begin df = DataFrame(rand(20_000_000, 20), :auto) t = @benchmark let df = $df @@ -16,5 +16,5 @@ end time = median(t).time / 1e9 - @test time < 10.0 + @test time < 30.0 end \ No newline at end of file diff --git a/test/test.dta b/test/test.dta index 4a22a4d66a7c6b7e954965473318b84bca248912..849b2ee4d1b88f2363292c2921f9d41e36219e7d 100644 GIT binary patch delta 27 icmey(_nU7*rGTYEaB6{qk%5tkf}x3(so}=XPiz2#BM9sO delta 27 hcmey(_nU7*rGTk|V`;jAk%5s3kgzhg+}Qbv4FG}^2;cw!