Skip to content

Commit

Permalink
fix nonunique bug (#3393)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Oct 21, 2023
1 parent 1a5da8a commit 831f010
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 5 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
instead of using the interactive thread pool when Julia was started
with `-tM,N` with N > 0
([#3385](https://github.com/JuliaData/DataFrames.jl/pull/3385))
* Correctly return `Bool[]` in the `nonunique` function applied to a data frame
with a pooled column that has zero levels in the pool
([#3393](https://github.com/JuliaData/DataFrames.jl/pull/3393))

# DataFrames.jl v1.6.1 Release Notes

Expand Down
2 changes: 1 addition & 1 deletion src/abstractdataframe/unique.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ function nonunique(df::AbstractDataFrame; keep::Symbol=:first)
if !(keep in (:first, :last, :noduplicates))
throw(ArgumentError("`keep` must be :first, :last, or :noduplicates"))
end
ncol(df) == 0 && return Bool[]
nrow(df) == 0 && return Bool[]
res = fill(true, nrow(df))
cols = ntuple(i -> df[!, i], ncol(df))
if keep == :first
Expand Down
6 changes: 5 additions & 1 deletion src/groupeddataframe/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,11 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
nt = max(1, lg ÷ 100_000)
end
# if there are few rows per group limit the number of threads used
nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt)
if ngroups == 0
nt = 1
else
nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt)
end

seen = fill(false, ngroups)
seen_vec = Vector{Vector{Bool}}(undef, nt)
Expand Down
9 changes: 6 additions & 3 deletions test/duplicates.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module TestDuplicates

using Test, DataFrames, CategoricalArrays, Random
using Test, DataFrames, CategoricalArrays, Random, PooledArrays
const ≅ = isequal

@testset "nonunique" begin
Expand Down Expand Up @@ -30,15 +30,18 @@ const ≅ = isequal
@test_throws ArgumentError unique!(df)
@test_throws ArgumentError unique(df, true)

pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]),
b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :)
pdf = view(DataFrame(a=CategoricalArray(["a", "a", missing, missing, "b", missing, "a", missing]),
b=CategoricalArray(["a", "b", missing, missing, "b", "a", "a", "a"])), :, :)
updf = DataFrame(a=CategoricalArray(["a", "a", missing, "b", missing]),
b=CategoricalArray(["a", "b", missing, "b", "a"]))
@test nonunique(pdf) == [false, false, false, true, false, false, true, true]
@test nonunique(updf) == falses(5)
@test updf ≅ unique(pdf)
@test_throws ArgumentError unique!(pdf)
@test_throws ArgumentError unique(pdf, true)

@test isempty(nonunique(DataFrame(a=PooledArray(Int[]))))
@test typeof(nonunique(DataFrame(a=PooledArray(Int[])))) === Vector{Bool}
end

@testset "nonunique, nonunique, unique! with extra argument" begin
Expand Down
5 changes: 5 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4531,4 +4531,9 @@ end
end
end

# Regression test for #3393: group on a column whose only value is `missing`
# while passing `skipmissing=true`, and require the grouping result to be empty.
# NOTE(review): presumably no groups remain because the single missing row is
# skipped, leaving zero levels to group on — confirm against `row_group_slots!`.
@testset "no levels in pooled grouping bug #3393" begin
# Column backed by a PooledArray.
@test isempty(groupby_checked(DataFrame(x=PooledArray([missing])), :x, skipmissing=true))
# Same scenario with a column built by `categorical`.
@test isempty(groupby_checked(DataFrame(x=categorical([missing])), :x, skipmissing=true))
end

end # module

0 comments on commit 831f010

Please sign in to comment.