diff --git a/Project.toml b/Project.toml
index 651b150..c6238e6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["xKDR Forum, Sourish Das"]
 version = "0.1.0"
 
 [deps]
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
diff --git a/src/CRRao.jl b/src/CRRao.jl
index 59c062d..501de06 100644
--- a/src/CRRao.jl
+++ b/src/CRRao.jl
@@ -397,6 +397,7 @@ export Prior_Ridge, Prior_Laplace, Prior_Cauchy, Prior_TDist, Prior_HorseShoe, P
 export CRRaoLink, Logit, Probit, Cloglog, Cauchit, fit
 export coef, coeftable, r2, adjr2, loglikelihood, aic, bic, sigma, predict, residuals, cooksdistance, BPTest, pvalue
 export FrequentistRegression, BayesianRegression
+export KMeansClustering
 
 include("random_number_generator.jl")
 include("general_stats.jl")
diff --git a/src/clustering/kmeans_clustering.jl b/src/clustering/kmeans_clustering.jl
new file mode 100644
index 0000000..69c8b5c
--- /dev/null
+++ b/src/clustering/kmeans_clustering.jl
@@ -0,0 +1,91 @@
+using Clustering, DataFrames
+
+struct KMeansClustering end
+
+"""
+    fit(df::DataFrame, modelClass::KMeansClustering, K::Int64; max_iters::Int=100, tol::Float64=1e-4)
+
+Perform K-means clustering on the entire DataFrame using `K` clusters.
+
+# Arguments
+- `df`: A DataFrame where each row is an observation and each column is a feature.
+- `K`: The number of clusters to form.
+- `max_iters`: (Optional) Maximum number of iterations for the K-means algorithm. Default is 100.
+- `tol`: (Optional) Tolerance for convergence. Default is 1e-4.
+
+# Returns
+A `KmeansResult` object whose fields describe the clustering outcome:
+
+- `centers`: A matrix whose columns are the cluster centroids, with one row per feature and one column per cluster (`K` columns in total).
+- `assignments`: A vector with one entry per observation, giving the index of the cluster to which that observation was assigned.
+- `costs`: A vector of per-observation costs, typically the squared distance from each observation to its assigned cluster center.
+- `counts`: A vector giving the number of observations assigned to each cluster.
+- `totalcost`: The total cost of the clustering solution, i.e. the sum of all per-observation costs; lower values indicate a better fit.
+- `converged`: Whether the algorithm converged, i.e. the centroids stopped changing significantly before the maximum number of iterations was reached.
+- `iterations`: The number of iterations the algorithm ran before stopping, either because it converged or because it hit the iteration limit.
+"""
+function fit(
+    df::DataFrame,
+    modelClass::KMeansClustering,
+    K::Int64;
+    max_iters::Int = 100,
+    tol::Float64 = 1e-4
+)
+    # Convert the DataFrame to a Float64 matrix, as required by Clustering.jl
+    data = Matrix{Float64}(df)
+
+    # Perform K-means clustering. Clustering.jl expects observations in columns, so we transpose.
+    result = kmeans(data', K; maxiter = max_iters, tol = tol)
+
+    return result
+end
+
+"""
+    fit(VarName, df::DataFrame, modelClass::KMeansClustering, K::Int64; max_iters::Int=100, tol::Float64=1e-4)
+
+Perform K-means clustering on selected variables from the DataFrame using `K` clusters.
+
+# Arguments
+- `VarName`: The column(s) of `df` to cluster on, e.g. a single column name or a vector of names such as `[:A, :B]`.
+- `df`: A DataFrame where each row is an observation and each column is a feature.
+- `K`: The number of clusters to form.
+- `max_iters`: (Optional) Maximum number of iterations for the K-means algorithm. Default is 100.
+- `tol`: (Optional) Tolerance for convergence. Default is 1e-4.
+
+# Returns
+A `KmeansResult` object whose fields describe the clustering outcome:
+
+- `centers`: A matrix whose columns are the cluster centroids, with one row per selected feature and one column per cluster (`K` columns in total).
+- `assignments`: A vector with one entry per observation, giving the index of the cluster to which that observation was assigned.
+- `costs`: A vector of per-observation costs, typically the squared distance from each observation to its assigned cluster center.
+- `counts`: A vector giving the number of observations assigned to each cluster.
+- `totalcost`: The total cost of the clustering solution, i.e. the sum of all per-observation costs; lower values indicate a better fit.
+- `converged`: Whether the algorithm converged, i.e. the centroids stopped changing significantly before the maximum number of iterations was reached.
+- `iterations`: The number of iterations the algorithm ran before stopping, either because it converged or because it hit the iteration limit.
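+
+# Example
+
+A minimal illustration; the DataFrame and column names below are placeholders, not a dataset shipped with CRRao:
+
+```julia
+df = DataFrame(A = rand(100), B = rand(100), C = rand(100))
+result = fit([:A, :B], df, KMeansClustering(), 3)
+result.assignments   # cluster index for each of the 100 rows
+```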
+"""
+function fit(
+    VarName,
+    df::DataFrame,
+    modelClass::KMeansClustering,
+    K::Int64;
+    max_iters::Int = 100,
+    tol::Float64 = 1e-4
+)
+    # Select the specified variables from the DataFrame
+    selected_data = select(df, VarName)
+
+    # Convert the selected DataFrame columns to a Float64 matrix
+    data = Matrix{Float64}(selected_data)
+
+    # Perform K-means clustering; Clustering.jl expects observations in columns, so we transpose.
+    result = kmeans(data', K; maxiter = max_iters, tol = tol)
+
+    return result
+end
diff --git a/src/fitmodel.jl b/src/fitmodel.jl
index 046fa98..e24a44a 100644
--- a/src/fitmodel.jl
+++ b/src/fitmodel.jl
@@ -62,3 +62,6 @@ include("bayesian/poisson_regression.jl")
 
 # Bayesian getter functions
 include("bayesian/getter.jl")
+
+# K-means Clustering function
+include("clustering/kmeans_clustering.jl")
diff --git a/test/numerical/clustering/KMeansClustering.jl b/test/numerical/clustering/KMeansClustering.jl
new file mode 100644
index 0000000..6559119
--- /dev/null
+++ b/test/numerical/clustering/KMeansClustering.jl
@@ -0,0 +1,26 @@
+using Test
+using DataFrames
+
+@testset "KMeans Clustering Tests" begin
+    df = DataFrame(A = rand(100), B = rand(100), C = rand(100))
+    K = 3
+
+    # Test fit function without variable selection
+    @testset "Fit without variable selection" begin
+        result = fit(df, KMeansClustering(), K)
+
+        @test length(result.assignments) == nrow(df)
+        @test size(result.centers, 1) == ncol(df) # Centroid dimension should match the number of input features
+        @test size(result.centers, 2) == K        # There should be K centroids
+    end
+
+    # Test fit function with variable selection
+    @testset "Fit with variable selection" begin
+        selected_vars = [:A] # Select only one column for clustering
+        result = fit(selected_vars, df, KMeansClustering(), K)
+
+        @test length(result.assignments) == nrow(df)
+        @test size(result.centers, 1) == length(selected_vars) # Centroid dimension should match the selected features
+        @test size(result.centers, 2) == K
+    end
+end
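
For reference, a minimal usage sketch of the new API (not part of the patch; the toy DataFrame below is only for illustration):

    using CRRao, DataFrames

    df = DataFrame(A = rand(100), B = rand(100), C = rand(100))

    # K-means on all columns of the DataFrame, with K = 3 clusters
    result = fit(df, KMeansClustering(), 3)

    result.assignments   # cluster index for each of the 100 rows
    result.counts        # number of rows assigned to each cluster
    result.totalcost     # sum of squared distances to the assigned centroids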