Skip to content

Commit

Permalink
Merge pull request #189 from codedthinking/0.5-dev
Browse files Browse the repository at this point in the history
fix issues with using variable references in `@mvencode` options
  • Loading branch information
korenmiklos authored Sep 19, 2024
2 parents fcf18b4 + 0986765 commit 720b846
Show file tree
Hide file tree
Showing 11 changed files with 174 additions and 40 deletions.
4 changes: 2 additions & 2 deletions src/Kezdi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
Kezdi.jl is a Julia package for data manipulation and analysis. It is inspired by Stata, but it is written in Julia, which makes it faster and more flexible. It is designed to be used in the Julia REPL, but it can also be used in Jupyter notebooks or in scripts.
"""
module Kezdi
export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, @use, @tabulate, @count, @sort, @order, @list, @head, @tail, @names, @rename, @clear, @describe, @mvencode, @save
export @generate, @replace, @egen, @collapse, @keep, @drop, @summarize, @regress, @use, @tabulate, @count, @sort, @order, @list, @head, @tail, @names, @rename, @clear, @describe, @mvencode, @save, @append

export getdf, setdf, display_and_return, keep_only_values, rowcount, distinct, cond, mvreplace
export getdf, setdf, display_and_return, keep_only_values, rowcount, distinct, cond, mvreplace, append

using Reexport
using Logging
Expand Down
10 changes: 9 additions & 1 deletion src/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -265,4 +265,12 @@ isfunctioncall(ex::Expr) =
isfunctioncall(ex.args[end])) ||
Meta.isexpr(ex, :., 3) # Vectorized function call (broadcasting)
=#
isfunctioncall(x::Expr) = x.head == :call || (x.head == Symbol(".") && x.args[1] isa Symbol && x.args[2] isa Expr && x.args[2].head == :tuple) || x.head in SYNTACTIC_OPERATORS
# Return `true` when `x` is treated as a function call, i.e. when any of these holds:
#   * its head is `:call`;
#   * its head is `.`, its first argument is a `Symbol` and its second argument is a
#     `:tuple` expression (the form a dotted call such as `f.(args...)` parses to);
#   * its head is one of `SYNTACTIC_OPERATORS` (defined outside this hunk).
isfunctioncall(x::Expr) = x.head == :call || (x.head == Symbol(".") && x.args[1] isa Symbol && x.args[2] isa Expr && x.args[2].head == :tuple) || x.head in SYNTACTIC_OPERATORS

"""
    add_skipmissing(expr)

Rewrite `expr` so that every field access of the form `a.b` found among the
arguments of an expression is wrapped as `skipmissing(a.b)`.

Values that are not an `Expr` (symbols, literals, ...) are returned unchanged.
For an `Expr`, the first argument is kept as is and only the remaining arguments
are rewritten recursively. For a `:call` the first argument is the callee, so a
qualified function name such as `Statistics.mean` is not wrapped.
Expressions without arguments are returned unchanged.
"""
add_skipmissing(expr::Any) = expr
function add_skipmissing(expr::Expr)
    # Nothing to rewrite in an argument-less expression such as `Expr(:tuple)`;
    # without this guard `expr.args[1]` below throws a `BoundsError`.
    isempty(expr.args) && return expr
    # `a.b` parses as `Expr(:., :a, QuoteNode(:b))`. The length check protects
    # against one-argument dot expressions such as `:(.+)`.
    if expr.head == Symbol(".") && length(expr.args) >= 2 && expr.args[2] isa QuoteNode
        return Expr(:call, :skipmissing, expr)
    end
    rewritten = [add_skipmissing(arg) for arg in expr.args[2:end]]
    return Expr(expr.head, expr.args[1], rewritten...)
end
10 changes: 7 additions & 3 deletions src/commands.jl
Original file line number Diff line number Diff line change
Expand Up @@ -178,18 +178,22 @@ function rewrite(::Val{:order}, command::Command)
end

function rewrite(::Val{:mvencode}, command::Command)
gc = generate_command(command; options=[:variables, :ifable, :nofunction], allowed=[:mv])
gc = generate_command(command; options=[:variables, :ifable, :nofunction, :replace_options], allowed=[:mv])
(; local_copy, target_df, setup, teardown, arguments, options) = gc
cols = :(collect($command.arguments))
value = isnothing(get_option(command, :mv)) ? missing : get_option(command, :mv)[1]
if :_all in collect(command.arguments)
cols = :(names($local_copy))
end
value = isnothing(get_option(command, :mv)) ? missing : replace_column_references(local_copy, get_option(command, :mv)[1])
value isa AbstractVector && ArgumentError("The value for @mvencode cannot be a vector") |> throw
value = add_skipmissing(value)
bitmask = build_bitmask(local_copy, command.condition)
third_vector = gensym()
valtype = gensym()
coltype = gensym()
quote
$setup
$valtype = typeof($value)
$valtype = typeof($value)
for col in $cols
$coltype = eltype($local_copy[.!($bitmask), col])
if $valtype != $coltype
Expand Down
35 changes: 35 additions & 0 deletions src/functions.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,40 @@
"""
    use(fname::AbstractString)

Read `fname` with `readstat`, convert the result to a `DataFrame` and pass it to `setdf`.
Returns whatever `setdf` returns.
"""
function use(fname::AbstractString)
    table = readstat(fname)
    return setdf(DataFrame(table))
end
"""
    save(fname::AbstractString)

Write the data frame returned by `getdf()` to `fname` using `writestat`.
"""
function save(fname::AbstractString)
    current = getdf()
    return writestat(fname, current)
end
"""
    append(fname::AbstractString)

Read the data set stored in `fname`, append its rows to the data frame returned by
`getdf()` and store the combined data frame with `setdf`.

Files whose extension is `.dta`, `.sav`, `.por`, `.sas7bdat` or `.xpt` (compared
case-insensitively) are read with `readstat`; any other file is read with `CSV.read`.
Columns that exist in only one of the two data frames are added to the other one by
`create_cols` before the two are concatenated.

# Throws
- `ArgumentError` if `fname` is not an existing file.
"""
function append(fname::AbstractString)
    # `isfile` instead of `ispath`: a directory is an existing path but cannot be read.
    isfile(fname) || throw(ArgumentError("File $fname does not exist."))
    _, ext = splitext(fname)
    # Compare the lower-cased extension so that e.g. `DATA.DTA` is not parsed as CSV.
    if lowercase(ext) in (".dta", ".sav", ".por", ".sas7bdat", ".xpt")
        df = readstat(fname) |> DataFrame
    else
        df = CSV.read(fname, DataFrame)
    end
    cdf, df = create_cols(getdf(), df)
    return setdf(vcat(cdf, df))
end

"""
    append(df::DataFrame)

Append the rows of `df` to the data frame returned by `getdf()` and store the
combined data frame with `setdf`. Columns that exist in only one of the two data
frames are added to the other one by `create_cols`.

`df` itself is not modified.
"""
function append(df::DataFrame)
    # `create_cols` adds columns in place; operate on a copy so that the caller's
    # data frame does not silently gain columns.
    cdf, newdf = create_cols(getdf(), copy(df))
    return setdf(vcat(cdf, newdf))
end

"""
    create_cols(cdf::DataFrame, df::DataFrame)

Make `cdf` and `df` share the same set of column names: every column that exists in
only one of them is added to the other one, filled with `missing`.

Both arguments are modified in place and returned as the tuple `(cdf, df)`.
"""
function create_cols(cdf::DataFrame, df::DataFrame)
    # Columns only present in `df` are added to `cdf` ...
    for col in setdiff(names(df), names(cdf))
        cdf[!, col] .= missing
    end
    # ... and columns only present in `cdf` are added to `df`. The columns just added
    # to `cdf` all exist in `df`, so they are not selected here.
    for col in setdiff(names(cdf), names(df))
        df[!, col] .= missing
    end
    return cdf, df
end


"""
getdf() -> AbstractDataFrame
Expand Down
24 changes: 20 additions & 4 deletions src/macros.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
global_logger(Logging.ConsoleLogger(stderr, Logging.Info))

macro mockmacro(exprs...)
command = :mockmacro
parse(exprs, command)
Expand Down Expand Up @@ -144,6 +142,11 @@ macro use(exprs...)
:(println("$(Kezdi.prompt())$($command)\n");Kezdi.use($fname)) |> esc
end

"""
@save "filename.dta", [replace]
Save the global data frame to the file `filename.dta`. If the file already exists, the `replace` option must be provided.
"""
macro save(exprs...)
command = parse(exprs, :save)
length(command.arguments) == 1 || ArgumentError("@save takes a single file name as an argument:\n@save \"filename.dta\"") |> throw
Expand All @@ -153,6 +156,19 @@ macro save(exprs...)
ispath(fname) && !replace && ArgumentError("File $fname already exists.") |> throw
:(println("$(Kezdi.prompt())$($command)\n");Kezdi.save($fname)) |> esc
end

"""
    @append "filename.dta"
    @append df

Append the rows of the file `filename.dta`, or of the data frame `df`, to the global data frame. Columns that are not common to both are filled with missing values.
"""
macro append(exprs...)
    command = parse(exprs, :append)
    length(command.arguments) == 1 || throw(ArgumentError("@append takes a single file name as an argument:\n@append \"filename.dta\""))
    fname = command.arguments[1]
    # The check for an existing data frame is emitted into the generated code so that it
    # runs when the command executes, not once while the macro is being expanded.
    code = quote
        isnothing(Kezdi.getdf()) && throw(ArgumentError("There is no data frame to append to."))
        println("$(Kezdi.prompt())$($command)\n")
        Kezdi.append($fname)
    end
    return esc(code)
end
"""
@head [n]
Expand Down Expand Up @@ -209,9 +225,9 @@ end


"""
@mvencode y1 y2 ... [if condition], [mv(value)]
@mvencode y1 y2 [_all] ... [if condition], [mv(value)]
Encode missing values in the variables `y1`, `y2`, etc. in the data frame. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `mv` is provided, the missing values are encoded with the value `value`. Default value is `missing` making no changes on the dataframe.
Encode missing values in the variables `y1`, `y2`, etc. in the data frame. If `condition` is provided, the operation is executed only on rows for which the condition is true. If `mv` is provided, the missing values are encoded with the value `value`. By default, `value` is `missing`, which leaves the data frame unchanged. Using `_all` encodes all variables of the data frame.
"""
macro mvencode(exprs...)
:mvencode |> parse(exprs) |> rewrite
Expand Down
6 changes: 6 additions & 0 deletions test/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,4 +125,10 @@ end
@test Kezdi.get_dot_parts(:(x.y)) == [:x, :y]
@test Kezdi.get_dot_parts(:(x.y.z)) == [:x, :y, :z]
end

@testset "Add skipmissing" begin
    # Each pair maps an input expression to the expression expected after wrapping
    # every `df.<column>` argument in `skipmissing`.
    cases = [
        :(log(df.x)) => :(log(skipmissing(df.x))),
        :(log(df.x)-log(df.y)) => :(log(skipmissing(df.x)) - log(skipmissing(df.y))),
        :(log(df.x - df.y)) => :(log(skipmissing(df.x) - skipmissing(df.y))),
    ]
    for (input, expected) in cases
        @test Kezdi.add_skipmissing(input) == expected
    end
end
end
114 changes: 84 additions & 30 deletions test/commands.jl
Original file line number Diff line number Diff line change
Expand Up @@ -763,29 +763,8 @@ end
end
end

@testset "Use" begin
df = DataFrame(x=1:10, y=11:20)
@use "test.dta", clear
@test df == getdf()
end

@testset "Save" begin
@clear
df = DataFrame(x=Vector{Any}(1:11), y=11:21)
setdf(df)
try @save "test.dta", replace catch e @test e == ErrorException("element type Any is not supported") end
df = DataFrame(x=1:11, y=11:21)
setdf(df)
@save "test.dta", replace
df2 = @use "test.dta", clear
@test df == df2
df = DataFrame(x=1:10, y=11:20)
setdf(df)
@save "test.dta", replace
end

@testset "Missing encode" begin
df = DataFrame(x=[1, 2, missing, 3, missing, 4], y=[missing, 0, 1, 2, missing, 1])
df = DataFrame(x=[1, 2, missing, 3, missing, 4], y=[missing, 0, 1, 2, missing, 2])
@testset "Known values" begin
df2 = @with df @mvencode x
@test all(df2.x .=== [1, 2, missing, 3, missing, 4])
Expand All @@ -795,14 +774,26 @@ end
df2 = @with df @mvencode x, mv(-99)
@test all(df2.x .== [1, 2, -99, 3, -99, 4])
@test typeof(df2.x) == Vector{Union{Missing, Int64}}
df2 = @with df @mvencode x, mv(mean(skipmissing(getdf().x)))
df2 = @with df @mvencode x, mv(mean(x))
@test all(df2.x .== [1, 2, 2.5, 3, 2.5, 4])
@test typeof(df2.x) == Vector{Union{Missing, Float64}}
df2 = @with df @mvencode x, mv(mean(x)/mean(y))
@test all(df2.x .== [1, 2, 2, 3, 2, 4])
@test typeof(df2.x) == Vector{Union{Missing, Float64}}
df2 = @with df @mvencode y, mv(-99)
@test all(df2.y .== [-99, 0, 1, 2, -99, 1])
@test all(df2.y .== [-99, 0, 1, 2, -99, 2])
df2 = @with df @mvencode x y, mv(-99)
@test all(df2.x .== [1, 2, -99, 3, -99, 4])
@test all(df2.y .== [-99, 0, 1, 2, -99, 1])
@test all(df2.y .== [-99, 0, 1, 2, -99, 2])
df2 = @with df @mvencode _all, mv(-99)
@test all(df2.x .== [1, 2, -99, 3, -99, 4])
@test all(df2.y .== [-99, 0, 1, 2, -99, 2])
df2 = @with df @mvencode x _all, mv(-99)
@test all(df2.x .== [1, 2, -99, 3, -99, 4])
@test all(df2.y .== [-99, 0, 1, 2, -99, 2])
df2 = @with df @mvencode _all x, mv(-99)
@test all(df2.x .== [1, 2, -99, 3, -99, 4])
@test all(df2.y .== [-99, 0, 1, 2, -99, 2])
end

@testset "If" begin
Expand All @@ -811,17 +802,80 @@ end
df2 = @with df @mvencode x @if ismissing(x), mv(-99)
@test all(df2.x .=== [1, 2, -99, 3, -99, 4])
df2 = @with df @mvencode y @if ismissing(y), mv(-99)
@test all(df2.y .=== [-99, 0, 1, 2, -99, 1])
@test all(df2.y .=== [-99, 0, 1, 2, -99, 2])
df2 = @with df @mvencode y @if ismissing(x), mv(-99)
@test all(df2.y .=== [missing, 0, 1, 2, -99, 1])
@test all(df2.y .=== [missing, 0, 1, 2, -99, 2])
df2 = @with df @mvencode x y @if ismissing(y), mv(-99)
@test all(df2.x .=== [1, 2, missing, 3, -99, 4])
@test all(df2.y .=== [-99, 0, 1, 2, -99, 1])
@test all(df2.y .=== [-99, 0, 1, 2, -99, 2])
df2 = @with df @mvencode x y @if ismissing(x), mv(-99)
@test all(df2.x .=== [1, 2, -99, 3, -99, 4])
@test all(df2.y .=== [missing, 0, 1, 2, -99, 1])
@test all(df2.y .=== [missing, 0, 1, 2, -99, 2])
df2 = @with df @mvencode x y @if ismissing(x) || !ismissing(y), mv(-99)
@test all(df2.x .=== [1, 2, -99, 3, -99, 4])
@test all(df2.y .=== [missing, 0, 1, 2, -99, 1])
@test all(df2.y .=== [missing, 0, 1, 2, -99, 2])
end
end

# `@use` should load test.dta into the global data frame; the expected content is
# the two-column table x = 1:10, y = 11:20.
@testset "Use" begin
df = DataFrame(x=1:10, y=11:20)
@use "test.dta", clear
@test df == getdf()
end

# `@save` should reject a column of element type `Any` and round-trip a supported
# data frame through test.dta.
@testset "Save" begin
@clear
df = DataFrame(x=Vector{Any}(1:11), y=11:21)
setdf(df)
# NOTE(review): the assertion only runs inside `catch`; if `@save` does not throw,
# nothing is tested on this line.
try @save "test.dta", replace catch e @test e == ErrorException("element type Any is not supported") end
df = DataFrame(x=1:11, y=11:21)
setdf(df)
@save "test.dta", replace
# Reading the file back must reproduce the saved data frame.
df2 = @use "test.dta", clear
@test df == df2
# Overwrite test.dta with the 10-row table again — presumably so that other testsets
# reading test.dta see x = 1:10, y = 11:20; confirm against the rest of the suite.
df = DataFrame(x=1:10, y=11:20)
setdf(df)
@save "test.dta", replace
end

# Tests for `@append` with files on disk and with in-memory data frames.
@testset "Append" begin
df = @use "test.dta", clear
@testset "Same columns" begin
# After one append the global data frame must have twice the rows of `df`, and the
# appended rows (11 onwards) must equal `df`.
@append "test.sas7bdat"
@test nrow(df) == nrow(getdf()) / 2
@test df == getdf()[11:nrow(getdf()),:]
# A file with a non-ReadStat extension (CSV) is appended on top of the previous result.
@append "test.csv"
@test nrow(df) == nrow(getdf()) / 3
@test df == getdf()[21:nrow(getdf()),:]
end

df2 = @use "test2.sas7bdat", clear
df = @use "test.dta", clear
df = convert.(Float64, df)
@testset "Different columns" begin
# test2.sas7bdat is expected to carry an extra column `z`; the original rows must
# get `missing` in that column and the appended rows must equal `df2`.
@append "test2.sas7bdat"
@test nrow(df) == nrow(getdf()) / 2
@test df == getdf()[1:10,[:x,:y]]
@test all(getdf()[1:10, :z] .=== missing)
@test getdf()[11:end,:] == df2
end

# Same scenarios, but appending a data frame variable instead of a file name.
@testset "With in-memory dataframe" begin
df = @use "test.dta", clear
@testset "Same columns" begin
df2 = copy(df)
@append df2
@test nrow(df) == nrow(getdf()) / 2
@test df == getdf()[11:nrow(getdf()),:]
end
df = @use "test.dta", clear
@testset "Different Columns" begin
# `df2` has the additional column `z = x + y`.
df2 = @with df @generate z=x+y
@append df2
@test nrow(df) == nrow(getdf()) / 2
@test df[:,[:x,:y]] == getdf()[1:10,[:x,:y]]
@test all(getdf()[1:10, :z] .=== missing)
@test getdf()[11:end,:] == df2
end
end
end
11 changes: 11 additions & 0 deletions test/test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
x,y
1,11
2,12
3,13
4,14
5,15
6,16
7,17
8,18
9,19
10,20
Binary file modified test/test.dta
Binary file not shown.
Binary file added test/test.sas7bdat
Binary file not shown.
Binary file added test/test2.sas7bdat
Binary file not shown.

0 comments on commit 720b846

Please sign in to comment.