diff --git a/src/datasets/misc/boston_housing.jl b/src/datasets/misc/boston_housing.jl index 53cc1f4d..a85ec31e 100644 --- a/src/datasets/misc/boston_housing.jl +++ b/src/datasets/misc/boston_housing.jl @@ -77,21 +77,24 @@ end function BostonHousing(; as_df = true, dir = nothing) @assert dir === nothing "custom `dir` is not supported at the moment." path = joinpath(@__DIR__, "..", "..", "..", "data", "boston_housing.csv") - df = read_csv(path) - features = df[!, DataFrames.Not(:MEDV)] - targets = df[!, [:MEDV]] + t = read_csv(path, CSV.File) + colnames = Tables.columnnames(t) + features = table_to_matrix(t, select = colnames[1:end-1]) + targets = table_to_matrix(t, select = colnames[end:end]) + metadata = Dict{String, Any}() metadata["path"] = path - metadata["feature_names"] = names(features) - metadata["target_names"] = names(targets) - metadata["n_observations"] = size(targets, 1) + metadata["feature_names"] = colnames[1:end-1] + metadata["target_names"] = colnames[end:end] + metadata["n_observations"] = size(features, 1) metadata["description"] = BOSTONHOUSING_DESCR - if !as_df - features = df_to_matrix(features) - targets = df_to_matrix(targets) - df = nothing + df = nothing + if as_df + df = table_to_df(t, names = colnames) + features = matrix_to_df(features, names = colnames[1:end-1]) + targets = matrix_to_df(targets, names = colnames[end:end]) end return BostonHousing(metadata, features, targets, df) diff --git a/src/datasets/misc/iris.jl b/src/datasets/misc/iris.jl index 0a5cd09a..53162451 100644 --- a/src/datasets/misc/iris.jl +++ b/src/datasets/misc/iris.jl @@ -79,22 +79,23 @@ end function Iris(; dir = nothing, as_df = true) path = datafile("Iris", "iris.data", dir) - df = read_csv(path, header=0) - DataFrames.rename!(df, ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"]) - - features = df[!, DataFrames.Not(:class)] - targets = df[!, [:class]] - + t = read_csv(path, CSV.File, header=0) + colnames = Tables.columnnames(t) + truecolnames = 
["sepallength", "sepalwidth", "petallength", "petalwidth", "class"] + features = table_to_matrix(t, select = colnames[1:end-1]) + targets = table_to_matrix(t, select = colnames[end:end]) + metadata = Dict{String, Any}() metadata["path"] = path - metadata["n_observations"] = size(df, 1) - metadata["feature_names"] = names(features) - metadata["target_names"] = names(targets) - - if !as_df - features = df_to_matrix(features) - targets = df_to_matrix(targets) - df = nothing + metadata["n_observations"] = size(features, 1) + metadata["feature_names"] = truecolnames[1:end-1] + metadata["target_names"] = truecolnames[end:end] + + df = nothing + if as_df + df = table_to_df(t, names = truecolnames) + features = matrix_to_df(features, names = truecolnames[1:end-1]) + targets = matrix_to_df(targets, names = truecolnames[end:end]) end return Iris(metadata, features, targets, df) diff --git a/src/datasets/misc/titanic.jl b/src/datasets/misc/titanic.jl index 2bf4a04a..53fc0d5b 100644 --- a/src/datasets/misc/titanic.jl +++ b/src/datasets/misc/titanic.jl @@ -61,22 +61,25 @@ end function Titanic(; as_df = true, dir = nothing) @assert dir === nothing "custom `dir` is not supported at the moment." 
path = joinpath(@__DIR__, "..", "..", "..", "data", "titanic.csv") - df = read_csv(path) - - features = df[!, DataFrames.Not(:Survived)] - targets = df[!, [:Survived]] + t = read_csv(path, CSV.File) + colnames = Tables.columnnames(t) + ncols = length(colnames) + # :Survived is the second column + features = table_to_matrix(t, select = colnames[[1; 3:ncols]]) + targets = table_to_matrix(t, select = colnames[2:2]) metadata = Dict{String, Any}() metadata["path"] = path - metadata["feature_names"] = names(features) - metadata["target_names"] = names(targets) - metadata["n_observations"] = size(df, 1) + metadata["feature_names"] = colnames[[1; 3:ncols]] + metadata["target_names"] = colnames[2:2] + metadata["n_observations"] = size(features, 1) metadata["description"] = TITANIC_DESCR - if !as_df - features = df_to_matrix(features) - targets = df_to_matrix(targets) - df = nothing + df = nothing + if as_df + df = table_to_df(t, names = colnames) + features = matrix_to_df(features, names = colnames[[1; 3:ncols]]) + targets = matrix_to_df(targets, names = colnames[2:2]) end return Titanic(metadata, features, targets, df) diff --git a/src/io.jl b/src/io.jl index 168323b4..b4d0e467 100644 --- a/src/io.jl +++ b/src/io.jl @@ -3,13 +3,12 @@ function read_csv(path; kws...) return read_csv_asdf(path; kws...) end -# function read_csv(path, sink::Type{<:AbstractMatrix{T}}; delim=nothing, kws...) where T -# x = delim === nothing ? readdlm(path, T; kws...) : readdlm(path, delim, T; kws...) -# return x -# end - function read_csv(path, sink::Type{A}; kws...) where A <: AbstractMatrix - return A(read_csv(path; kws...)) + return table_to_matrix(read_csv(path, CSV.File; kws...)) +end + +function read_csv(path, sink::Type{CSV.File}; kws...) + return CSV.File(path; kws...) end function read_csv_asdf(path; kws...) 
diff --git a/src/utils.jl b/src/utils.jl index ada82445..e5e4ca30 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -21,6 +21,31 @@ function restrict_array_type(res::AbstractArray) end end +function table_to_matrix(t; select = nothing) + if select === nothing + cnames = Tables.columnnames(t) + else + cnames = select + end + return hcat((Tables.getcolumn(t, n) for n in cnames)...) +end + +function table_to_df(t; names = nothing) + df = DataFrames.DataFrame(t) + if names !== nothing + DataFrames.rename!(df, names) + end + return df +end + +function matrix_to_df(a::AbstractMatrix; names = nothing) + df = DataFrames.DataFrame(a, :auto) + if names !== nothing + DataFrames.rename!(df, names) + end + return df +end + function df_to_matrix(df) x = Matrix(df) if size(x, 2) == 1 diff --git a/test/test_utils.jl b/test/test_utils.jl index 60277c11..e26123e0 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -29,12 +29,11 @@ function test_inmemory_supervised_table_dataset(d::D; @test size(d.features) == (n_obs, n_features) @test size(d.targets) == (n_obs, n_targets) - # check that dataframe shares the same storage of features and targets for c in names(d.dataframe) if c in names(d.targets) - @test d.dataframe[!, c] === d.targets[!,c] + @test d.dataframe[!, c] == d.targets[!,c] else - @test d.dataframe[!, c] === d.features[!,c] + @test d.dataframe[!, c] == d.features[!,c] end end