Skip to content

Commit

Permalink
Add docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
tompollard committed Jun 13, 2024
1 parent 35095f4 commit 2426880
Showing 1 changed file with 194 additions and 37 deletions.
231 changes: 194 additions & 37 deletions tableone/modality.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,24 @@

import numpy as np
from scipy.special import beta as betafun
# import matplotlib.pyplot as plt
from scipy.optimize import brentq
import pandas as pd
np.random.seed(1337)
# import pickle


def generate_data(peaks=2, n=None, mu=None, std=None):
# Generate parameters if not provided
"""
Generates synthetic data consisting of several Gaussian distributions.
Parameters:
peaks (int): Number of distinct Gaussian distributions to generate.
n (list of int, optional): List of sizes for each Gaussian distribution.
mu (list of float, optional): List of means for each Gaussian distribution.
std (list of float, optional): List of standard deviations for each distribution.
Returns:
numpy.ndarray: Array of concatenated data from the generated Gaussian distributions.
"""
if not n:
n = [5000] * peaks
if not mu:
Expand All @@ -37,20 +46,17 @@ def generate_data(peaks=2, n=None, mu=None, std=None):

def hartigan_diptest(data):
"""
P-value according to Hartigan's dip test for unimodality.
The dip is computed using the function
dip_and_closest_unimodal_from_cdf. From this the p-value is
interpolated using a table imported from the R package diptest.
Performs Hartigan's dip test to assess the unimodality of a dataset.
References:
Hartigan and Hartigan (1985): The dip test of unimodality.
The Annals of Statistics. 13(1).
References:
Hartigan and Hartigan (1985): The dip test of unimodality.
The Annals of Statistics. 13(1).
Input:
data - one-dimensional data set.
Parameters:
data (numpy.ndarray): Array containing the dataset to be tested.
Value:
p-value for the test.
Returns:
float: P-value indicating the probability of the dataset being unimodal.
"""

try:
Expand All @@ -63,7 +69,13 @@ def hartigan_diptest(data):

def pval_hartigan(data):
"""
Add Docstring
Computes the p-value for Hartigan's dip test using a precomputed table.
Parameters:
data (numpy.ndarray): Array containing the dataset for which the dip test is performed.
Returns:
float: The interpolated p-value from precomputed Hartigan's dip test tables.
"""
xF, yF = cum_distr(data)
dip = dip_from_cdf(xF, yF)
Expand All @@ -72,7 +84,14 @@ def pval_hartigan(data):

def cum_distr(data, w=None):
"""
Add Docstring
Computes the cumulative distribution function of the given data.
Parameters:
data (numpy.ndarray): Array of data points.
w (numpy.ndarray, optional): Weights for the data points.
Returns:
tuple: Arrays representing the x and y values of the cumulative distribution.
"""
if w is None:
w = np.ones(len(data))*1./len(data)
Expand Down Expand Up @@ -103,7 +122,16 @@ def cum_distr(data, w=None):

def unique(data, return_index, eps, is_sorted=True):
"""
Add Docstring
Identifies and returns unique elements from the sorted data array.
Parameters:
data (numpy.ndarray): Array of data points.
return_index (bool): Whether to return the indices of unique elements.
eps (float): Small threshold to consider differences in data points.
is_sorted (bool): Whether the input data array is sorted.
Returns:
tuple: Array of unique data and, optionally, their indices.
"""
if not is_sorted:
ord = np.argsort(data)
Expand Down Expand Up @@ -136,16 +164,32 @@ def unique(data, return_index, eps, is_sorted=True):

def dip_from_cdf(xF, yF, plotting=False, verbose=False, eps=1e-12):
"""
Add Docstring
Computes the dip statistic from the cumulative distribution functions.
Parameters:
xF (numpy.ndarray): X-coordinates of the empirical distribution function.
yF (numpy.ndarray): Y-coordinates of the empirical distribution function.
plotting (bool, optional): If True, plots the analysis (not implemented).
verbose (bool, optional): If True, prints debug information.
eps (float): Tolerance for floating-point comparison.
Returns:
float: The computed dip statistic.
"""
dip, _ = dip_and_closest_unimodal_from_cdf(xF, yF, plotting, verbose, eps)
return dip


def dip_pval_tabinterpol(dip, N):
"""
dip - dip value computed from dip_from_cdf
N - number of observations
Interpolates a p-value from a table based on the dip statistic and sample size.
Parameters:
dip (float): Computed dip statistic.
N (int): Sample size of the data.
Returns:
float: Interpolated p-value based on the dip statistic and sample size.
"""

# if qDiptab_df is None:
Expand Down Expand Up @@ -733,21 +777,37 @@ def dip_pval_tabinterpol(dip, N):

def transform_dip_to_other_nbr_pts(dip_n, n, m):
"""
Add Docstring
Transforms a dip statistic calculated for a given sample size to another.
Parameters:
dip_n (float): The dip statistic for the original sample size.
n (int): Original sample size.
m (int): New sample size to which the dip statistic will be transformed.
Returns:
float: Transformed dip statistic appropriate for the new sample size.
"""
dip_m = np.sqrt(n/m)*dip_n
return dip_m


def dip_and_closest_unimodal_from_cdf(xF, yF, plotting=False, verbose=False, eps=1e-12):
"""
Dip computed as distance between empirical distribution function (EDF) and
cumulative distribution function for the unimodal distribution with
smallest such distance. The optimal unimodal distribution is found by
the algorithm presented in
Finds the dip of the empirical distribution function and the closest unimodal distribution.
Reference:
Hartigan (1985): Computation of the dip statistic to test for
unimodaliy. Applied Statistics, vol. 34, no. 3
Hartigan (1985): Computation of the dip statistic to test for
unimodaliy. Applied Statistics, vol. 34, no. 3
Parameters:
xF (numpy.ndarray): X-coordinates of the empirical distribution function.
yF (numpy.ndarray): Y-coordinates of the empirical distribution function.
plotting (bool, optional): If True, enables plotting of the distributions (not implemented).
verbose (bool, optional): If True, prints detailed debugging information.
eps (float): Tolerance for floating-point comparison.
Returns:
tuple: Dip statistic and the closest unimodal distribution function.
If the plotting option is enabled the optimal unimodal distribution
function is plotted along with (xF, yF-dip) and (xF, yF+dip)
Expand Down Expand Up @@ -967,15 +1027,30 @@ def dip_and_closest_unimodal_from_cdf(xF, yF, plotting=False, verbose=False, eps

def greatest_convex_minorant_sorted(x, y):
"""
Add Docstring
Determines the greatest convex minorant for sorted data.
Parameters:
x (numpy.ndarray): Sorted x-coordinates of the data points.
y (numpy.ndarray): Corresponding y-coordinates of the data points.
Returns:
numpy.ndarray: Indices of the points forming the greatest convex minorant.
"""
i = least_concave_majorant_sorted(x, -y)
return i


def least_concave_majorant_sorted(x, y, eps=1e-12):
"""
Add Docstring
Finds the least concave majorant for sorted data.
Parameters:
x (numpy.ndarray): Sorted x-coordinates of the data points.
y (numpy.ndarray): Corresponding y-coordinates of the data points.
eps (float): Small threshold to avoid division errors.
Returns:
numpy.ndarray: Indices of the points forming the least concave majorant.
"""
i = [0]
icurr = 0
Expand All @@ -1001,7 +1076,17 @@ def least_concave_majorant_sorted(x, y, eps=1e-12):

class KernelDensityDerivative(object):
"""
Add Docstring
Kernel density derivative estimator.
Parameters:
data (numpy.ndarray): Data points for density estimation.
deriv_order (int): Order of the derivative (0 for density, 2 for second derivative).
Attributes:
kernel (function): Kernel function used for density estimation.
deriv_order (int): Order of the derivative being estimated.
h (float): Bandwidth used for the kernel density estimation.
datah (numpy.ndarray): Scaled data by the bandwidth.
"""
def __init__(self, data, deriv_order):
if deriv_order == 0:
Expand Down Expand Up @@ -1038,15 +1123,32 @@ def score_samples(self, x):

def silverman_bandwidth(data, deriv_order=0):
"""
Add Docstring
Calculates the Silverman bandwidth for kernel density estimation.
Parameters:
data (numpy.ndarray): The data for which the bandwidth is to be calculated.
deriv_order (int): Order of the derivative for which the bandwidth is calculated.
Returns:
float: The estimated bandwidth.
"""
sigmahat = np.std(data, ddof=1)
return sigmahat*bandwidth_factor(data.shape[0], deriv_order)


def bandwidth_factor(nbr_data_pts, deriv_order=0):
"""
Scale factor for one-dimensional plug-in bandwidth selection.
Calculates the bandwidth scaling factor based on the number of data points and derivative order.
Parameters:
nbr_data_pts (int): Number of data points.
deriv_order (int): Order of the derivative (0 for density estimation, 2 for second derivative).
Returns:
float: Scaling factor for bandwidth calculation.
Raises:
ValueError: If the derivative order is not supported.
"""
if deriv_order == 0:
return (3.0*nbr_data_pts/4)**(-1.0/5)
Expand All @@ -1059,7 +1161,14 @@ def bandwidth_factor(nbr_data_pts, deriv_order=0):

def calibrated_dip_test(data, N_bootstrap=1000):
"""
Add Docstring
Performs a calibrated dip test by bootstrapping and comparing to reference distributions.
Parameters:
data (numpy.ndarray): The data to test for unimodality.
N_bootstrap (int): Number of bootstrap samples to generate for calibration.
Returns:
float: Proportion of bootstrap samples where the computed dip is greater than the observed dip.
"""
xF, yF = cum_distr(data)
dip = dip_from_cdf(xF, yF)
Expand All @@ -1081,7 +1190,13 @@ def calibrated_dip_test(data, N_bootstrap=1000):

def select_calibration_distribution(d_hat):
"""
Add Docstring
Selects an appropriate reference distribution for dip test calibration based on a d_hat value.
Parameters:
d_hat (float): Calculated measure from the original data used to select the reference distribution.
Returns:
object: An instance of a reference distribution class for generating samples.
"""
# data_dir = os.path.join('.', 'data')
# print(data_dir)
Expand Down Expand Up @@ -1189,30 +1304,72 @@ def gamma(beta): return 2*beta*betafun(beta-1./2, 1./2)**2 - d_hat

class RefGaussian(object):
"""
Add Docstring
Reference Gaussian distribution for generating random samples.
Methods:
sample(int): Generates a sample from a Gaussian distribution.
"""
def sample(self, n):
"""
Generates random samples from a Gaussian distribution.
Parameters:
n (int): Number of samples to generate.
Returns:
numpy.ndarray: Array of Gaussian random samples.
"""
return np.random.randn(n)


class RefBeta(object):
"""
Add Docstring
Reference Beta distribution for generating random samples, parameterized by a shape parameter.
Parameters:
beta (float): Shape parameter for both alpha and beta of the Beta distribution.
Methods:
sample(int): Generates a sample from a Beta distribution.
"""
def __init__(self, beta):
self.beta = beta

def sample(self, n):
"""
Generates random samples from a Beta distribution.
Parameters:
n (int): Number of samples to generate.
Returns:
numpy.ndarray: Array of Beta random samples.
"""
return np.random.beta(self.beta, self.beta, n)


class RefStudentt(object):
"""
Add Docstring
Reference Student's t distribution for generating random samples, parameterized by a degrees of freedom parameter.
Parameters:
beta (float): Shape parameter that influences the degrees of freedom of the Student's t distribution.
Methods:
sample(int): Generates a sample from a Student's t distribution.
"""
def __init__(self, beta):
self.beta = beta

def sample(self, n):
"""
Generates random samples from a Student's t distribution.
Parameters:
n (int): Number of samples to generate.
Returns:
numpy.ndarray: Array of Student's t random samples.
"""
dof = 2*self.beta-1
return 1./np.sqrt(dof)*np.random.standard_t(dof, n)

0 comments on commit 2426880

Please sign in to comment.