Commit d91b75e

Format all .py files using autopep8
sbaldu committed Nov 2, 2023
1 parent 218b6f3 commit d91b75e
Showing 14 changed files with 199 additions and 143 deletions.
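
The commit message does not record the exact command that was run; a typical whole-tree invocation would be `autopep8 --in-place --recursive .` (an assumption, not taken from the commit). The same fixer can also be driven from Python; a minimal sketch, assuming only that the autopep8 package is installed:

    # Minimal sketch: apply autopep8's default PEP 8 fixes to a source string.
    # fix_code() is autopep8's documented library entry point.
    import autopep8

    fixed = autopep8.fix_code("x=[1,2 ,3]\n")
    print(fixed)  # x = [1, 2, 3]

The diffs below are consistent with autopep8's standard pycodestyle-driven fixes: whitespace after commas (E231), blank-line conventions around definitions (E302/E305), at least two spaces before inline comments (E261), no spaces around keyword-argument '=' (E251), operator spacing, and block comments starting with a single '#' (E266).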
93 changes: 47 additions & 46 deletions CLUEstering/CLUEstering.py
@@ -15,7 +15,8 @@
from sklearn.preprocessing import StandardScaler
import CLUEsteringCPP as Algo

def test_blobs(n_samples: int, n_dim: int , n_blobs: int = 4, mean: float = 0,

def test_blobs(n_samples: int, n_dim: int, n_blobs: int = 4, mean: float = 0,
sigma: float = 0.5, x_max: float = 30, y_max: float = 30) -> pd.DataFrame:
"""
Returns a dataframe containing randomly generated 2-dimensional or 3-dimensional blobs.
@@ -65,15 +66,14 @@ def test_blobs(n_samples: int, n_dim: int , n_blobs: int = 4, mean: float = 0,
data['x1'] = blob_data.T[1]
data['weight'] = np.full(shape=len(blob_data.T[0]), fill_value=1)


return pd.DataFrame(data)
if n_dim == 3:
data = {'x0': [], 'x1': [], 'x2': [], 'weight': []}
sqrt_samples = int(sqrt(n_samples))
z_values = np.random.normal(mean,sigma,sqrt_samples)
z_values = np.random.normal(mean, sigma, sqrt_samples)
centers = [[x_max * rnd.random(), y_max * rnd.random()] for _ in range(n_blobs)]

for value in z_values: # for every z value, a layer is generated.
for value in z_values: # for every z value, a layer is generated.
blob_data = make_blobs(n_samples=sqrt_samples, centers=np.array(centers))[0]
data['x0'] = np.concatenate([data['x0'], blob_data.T[0]])
data['x1'] = np.concatenate([data['x1'], blob_data.T[1]])
@@ -106,12 +106,13 @@ class clustering_data:
Number of points in the clustering data.
"""

coords : np.ndarray
original_coords : np.ndarray
weight : np.ndarray
domain_ranges : list
n_dim : int
n_points : int
coords: np.ndarray
original_coords: np.ndarray
weight: np.ndarray
domain_ranges: list
n_dim: int
n_points: int


@dataclass(eq=False)
class cluster_properties:
@@ -135,12 +136,12 @@ class cluster_properties:
Dataframe containing is_seed and cluster_ids as columns.
"""

n_clusters : int
cluster_ids : np.ndarray
is_seed : np.ndarray
cluster_points : np.ndarray
points_per_cluster : np.ndarray
output_df : pd.DataFrame
n_clusters: int
cluster_ids: np.ndarray
is_seed: np.ndarray
cluster_points: np.ndarray
points_per_cluster: np.ndarray
output_df: pd.DataFrame

def __eq__(self, other):
if self.n_clusters != other.n_clusters:
@@ -195,18 +196,18 @@ def __init__(self, dc_: float, rhoc_: float, outlier_: float, ppbin: int = 10):
self.ppbin = ppbin

# Initialize attributes
## Data containers
# Data containers
self.clust_data = None
self.scaler = StandardScaler()

## Kernel for calculation of local density
# Kernel for calculation of local density
self.kernel = Algo.flatKernel(0.5)

## Output attributes
# Output attributes
self.clust_prop = None
self.elapsed_time = 0.

def _read_array(self, input_data: Union[list,np.ndarray]) -> None:
def _read_array(self, input_data: Union[list, np.ndarray]) -> None:
"""
Reads data provided with lists or np.ndarrays
@@ -235,7 +236,7 @@ def _read_array(self, input_data: Union[list,np.ndarray]) -> None:
len(input_data[:-1]),
len(input_data[-1]))

def _read_string(self, input_data: str) -> Union[pd.DataFrame,None]:
def _read_string(self, input_data: str) -> Union[pd.DataFrame, None]:
"""
Reads data provided by passing a string containing the path to a csv file
@@ -259,7 +260,7 @@ def _read_string(self, input_data: str) -> Union[pd.DataFrame,None]:
df_ = pd.read_csv(input_data)
return df_

def _read_dict_df(self, input_data: Union[dict,pd.DataFrame]) -> pd.DataFrame:
def _read_dict_df(self, input_data: Union[dict, pd.DataFrame]) -> pd.DataFrame:
"""
Reads data provided using dictionaries or pandas dataframes
@@ -314,7 +315,7 @@ def _handle_dataframe(self, df_: pd.DataFrame) -> None:
n_points = len(df_.index)
coords = np.zeros(shape=(n_dim, n_points))
for dim in range(n_dim):
coords[dim] = np.array(df_.iloc[:,dim])
coords[dim] = np.array(df_.iloc[:, dim])

self.clust_data = clustering_data(coords,
np.copy(coords),
@@ -339,10 +340,10 @@ def _rescale(self) -> None:

for dim in range(self.clust_data.n_dim):
self.clust_data.coords[dim] = \
self.scaler.fit_transform(self.clust_data.coords[dim].reshape(-1, 1)).reshape(1, -1)[0]
self.scaler.fit_transform(self.clust_data.coords[dim].reshape(-1, 1)).reshape(1, -1)[0]

def read_data(self,
input_data: Union[pd.DataFrame,str,dict,list,np.ndarray],
input_data: Union[pd.DataFrame, str, dict, list, np.ndarray],
rescale: bool = True,
**kwargs: tuple) -> None:
"""
Expand Down Expand Up @@ -435,7 +436,7 @@ def change_coordinates(self, **kwargs: types.FunctionType) -> None:
self.clust_data.coords[int(coord[1])] = \
self.scaler.fit_transform(
self.clust_data.coords[int(coord[1])].reshape(-1, 1)
).reshape(1, -1)[0]
).reshape(1, -1)[0]

def change_domains(self, **kwargs: tuple) -> None:
"""
Expand Down Expand Up @@ -471,7 +472,7 @@ def change_domains(self, **kwargs: tuple) -> None:

def choose_kernel(self,
choice: str,
parameters: Union[list,None] = None,
parameters: Union[list, None] = None,
function: types.FunctionType = lambda: 0) -> None:
"""
Changes the kernel used in the calculation of local density. The default kernel
Expand Down Expand Up @@ -552,9 +553,9 @@ def run_clue(self, verbose: bool = False) -> None:
"""

start = time.time_ns()
cluster_id_is_seed = Algo.mainRun(self.dc_,self.rhoc,self.outlier,self.ppbin,
self.clust_data.domain_ranges,self.kernel,
self.clust_data.coords,self.clust_data.weight,
cluster_id_is_seed = Algo.mainRun(self.dc_, self.rhoc, self.outlier, self.ppbin,
self.clust_data.domain_ranges, self.kernel,
self.clust_data.coords, self.clust_data.weight,
self.clust_data.n_dim)
finish = time.time_ns()
cluster_ids = np.array(cluster_id_is_seed[0])
@@ -577,12 +578,12 @@ def run_clue(self, verbose: bool = False) -> None:
points_per_cluster,
output_df)

self.elapsed_time = (finish - start)/(10**6)
self.elapsed_time = (finish - start) / (10**6)
if verbose:
print(f'CLUE run in {self.elapsed_time} ms')
print(f'Number of clusters found: {self.clust_prop.n_clusters}')

def input_plotter(self, plot_title: str='', title_size: float = 16,
def input_plotter(self, plot_title: str = '', title_size: float = 16,
x_label: str = 'x', y_label: str = 'y', z_label: str = 'z',
label_size: float = 16, pt_size: float = 1, pt_colour: str = 'b',
grid: bool = True, grid_style: str = '--', grid_size: float = 0.2,
@@ -667,10 +668,10 @@ def input_plotter(self, plot_title: str='', title_size: float = 16,
fig = plt.figure()
ax_ = fig.add_subplot(projection='3d')
ax_.scatter(cartesian_coords[0],
cartesian_coords[1],
cartesian_coords[2],
s=pt_size,
color=pt_colour)
cartesian_coords[1],
cartesian_coords[2],
s=pt_size,
color=pt_colour)

# Customization of the plot title
ax_.set_title(plot_title, fontsize=title_size)
@@ -765,12 +766,12 @@ def cluster_plotter(self, plot_title: str = '', title_size: float = 16,

max_clusterid = max(df_["cluster_ids"])

df_out = df_[df_.cluster_ids == -1] # Outliers
df_out = df_[df_.cluster_ids == -1] # Outliers
plt.scatter(df_out.x0, df_out.x1, s=outl_size, marker='x', color='0.4')
for i in range(0, max_clusterid+1):
dfi = df_[df_.cluster_ids == i] # ith cluster
for i in range(0, max_clusterid + 1):
dfi = df_[df_.cluster_ids == i] # ith cluster
plt.scatter(dfi.x0, dfi.x1, s=pt_size, marker='.')
df_seed = df_[df_.isSeed == 1] # Only Seeds
df_seed = df_[df_.isSeed == 1] # Only Seeds
plt.scatter(df_seed.x0, df_seed.x1, s=seed_size, color='r', marker='*')

# Customization of the plot title
@@ -804,13 +805,13 @@ def cluster_plotter(self, plot_title: str = '', title_size: float = 16,
ax_ = fig.add_subplot(projection='3d')

df_out = df_[df_.cluster_ids == -1]
ax_.scatter(df_out.x0, df_out.x1, df_out.x2, s=outl_size, color = 'grey', marker = 'x')
for i in range(0, max_clusterid+1):
ax_.scatter(df_out.x0, df_out.x1, df_out.x2, s=outl_size, color='grey', marker='x')
for i in range(0, max_clusterid + 1):
dfi = df_[df_.cluster_ids == i]
ax_.scatter(dfi.x0, dfi.x1, dfi.x2, s=pt_size, marker = '.')
ax_.scatter(dfi.x0, dfi.x1, dfi.x2, s=pt_size, marker='.')

df_seed = df_[df_.isSeed == 1] # Only Seeds
ax_.scatter(df_seed.x0, df_seed.x1, df_seed.x2, s=seed_size, color = 'r', marker = '*')
df_seed = df_[df_.isSeed == 1] # Only Seeds
ax_.scatter(df_seed.x0, df_seed.x1, df_seed.x2, s=seed_size, color='r', marker='*')

# Customization of the plot title
ax_.set_title(plot_title, fontsize=title_size)
@@ -862,4 +863,4 @@ def to_csv(self, output_folder: str, file_name: str) -> None:
data['is_seed'] = self.clust_prop.is_seed

df_ = pd.DataFrame(data)
df_.to_csv(out_path,index=False)
df_.to_csv(out_path, index=False)
2 changes: 1 addition & 1 deletion CLUEstering/__init__.py
@@ -1,2 +1,2 @@
from CLUEstering.CLUEstering import clusterer
from CLUEstering.CLUEstering import test_blobs
from CLUEstering.CLUEstering import test_blobs
40 changes: 20 additions & 20 deletions setup.py
@@ -4,14 +4,14 @@

__version__ = "1.4.0"
this_directory = Path(__file__).parent
long_description = (this_directory/'README.md').read_text()
long_description = (this_directory / 'README.md').read_text()

ext_modules = [
Pybind11Extension(
"CLUEsteringCPP",
['CLUEstering/binding.cc'],
include_dirs = ['CLUEstering/include/']
),
Pybind11Extension(
"CLUEsteringCPP",
['CLUEstering/binding.cc'],
include_dirs=['CLUEstering/include/']
),
]

setup(
@@ -21,18 +21,18 @@
author_email="[email protected]",
description='''A library that generalizes the original 2-dimensional CLUE
algorithm made at CERN.''',
long_description=long_description,
long_description_content_type='text/markdown',
packages=find_packages(),
install_requires=['scikit-learn','numpy','matplotlib','pandas'],
ext_modules=ext_modules,
keywords=['Python','Clustering','Binding'],
python_requires='>=3.7',
classifiers=[
'Intended Audience :: Developers',
'Programming Language :: Python :: 3',
'Operating System :: Unix',
'Operating System :: MacOS :: MacOS X',
'Operating System :: Microsoft :: Windows',
]
long_description=long_description,
long_description_content_type='text/markdown',
packages=find_packages(),
install_requires=['scikit-learn', 'numpy', 'matplotlib', 'pandas'],
ext_modules=ext_modules,
keywords=['Python', 'Clustering', 'Binding'],
python_requires='>=3.7',
classifiers=[
'Intended Audience :: Developers',
'Programming Language :: Python :: 3',
'Operating System :: Unix',
'Operating System :: MacOS :: MacOS X',
'Operating System :: Microsoft :: Windows',
]
)
13 changes: 8 additions & 5 deletions tests/test_blob_dataset.py
@@ -1,24 +1,27 @@
from filecmp import cmp
import CLUEstering as clue
import numpy as np
import os
import pandas as pd
import pytest
import sys
sys.path.insert(1, '../CLUEstering/')
import CLUEstering as clue
from filecmp import cmp


@pytest.fixture
def blobs():
return pd.read_csv("./test_datasets/blob.csv")


def test_blobs_clustering(blobs):
# Check if the output file already exists and if it does, delete it
if os.path.isfile('./blobs_output.csv'):
os.remove('./blobs_output.csv')

c = clue.clusterer(0.8,5,1.5)
c = clue.clusterer(0.8, 5, 1.5)
c.read_data(blobs)
c.run_clue()
c.to_csv('./','blobs_output.csv')
c.to_csv('./', 'blobs_output.csv')

assert cmp('./blobs_output.csv', './test_datasets/truth_files/blobs_truth.csv')
assert cmp('./blobs_output.csv',
'./test_datasets/truth_files/blobs_truth.csv')
24 changes: 16 additions & 8 deletions tests/test_change_domains.py
@@ -1,15 +1,17 @@
from math import pi
import CLUEstering as clue
import numpy as np
import pytest
import sys
sys.path.insert(1, '../CLUEstering/')
import CLUEstering as clue
from math import pi


@pytest.fixture
def blob():
csv_file = './test_datasets/blob.csv'
return csv_file


def test_default_domains(blob):
clust = clue.clusterer(0.5, 5., 1.2)
clust.read_data(blob)
@@ -21,6 +23,7 @@ def test_default_domains(blob):
assert clust.clust_data.domain_ranges[1].min == -3.4028234663852886e+38
assert clust.clust_data.domain_ranges[1].max == 3.4028234663852886e+38


def test_change_domains_1():
# We generate data with zero mean and standard deviation, so that the
# domain extremes are not normalized by the standard scaler
@@ -43,10 +46,13 @@ def test_change_domains_1():
clust.change_domains(x0=(0., 2.), x1=(-pi, pi))

# Check that the new domains are (0, 2) and (-pi, pi)
assert clust.clust_data.domain_ranges[0].min == 0.
assert clust.clust_data.domain_ranges[0].min == 0.
assert clust.clust_data.domain_ranges[0].max == 2.
assert clust.clust_data.domain_ranges[1].min == pytest.approx(-pi, 0.0000001)
assert clust.clust_data.domain_ranges[1].max == pytest.approx(pi, 0.0000001)
assert clust.clust_data.domain_ranges[1].min == pytest.approx(
-pi, 0.0000001)
assert clust.clust_data.domain_ranges[1].max == pytest.approx(
pi, 0.0000001)


def test_change_domains_2():
# We generate data with non-zero mean and standard deviation, and we check
@@ -70,7 +76,9 @@ def test_change_domains_2():
clust.change_domains(x0=(0., 2.), x1=(-pi, pi))

# Check that the new domains are (0, 2) and (-pi, pi)
assert clust.clust_data.domain_ranges[0].min == pytest.approx(-1.41, 0.01)
assert clust.clust_data.domain_ranges[0].min == pytest.approx(-1.41, 0.01)
assert clust.clust_data.domain_ranges[0].max == 0.
assert clust.clust_data.domain_ranges[1].min == pytest.approx(-3.6356550, 0.0000001)
assert clust.clust_data.domain_ranges[1].max == pytest.approx(0.8072279, 0.0000001)
assert clust.clust_data.domain_ranges[1].min == pytest.approx(
-3.6356550, 0.0000001)
assert clust.clust_data.domain_ranges[1].max == pytest.approx(
0.8072279, 0.0000001)
(9 more changed files not shown)