Skip to content

Commit

Permalink
Remove automatic scaling (#57)
Browse files Browse the repository at this point in the history
* Remove default use of scaler

* Update parameters in tests

* Update version

* Update test of clusterer equality

* Update run_test.sh

* Add sync on alpaka memcpy

* Update test parameters and truth files

* Add backends to __init__.py
  • Loading branch information
sbaldu authored Aug 6, 2024
1 parent 92245d0 commit 7410770
Show file tree
Hide file tree
Showing 14 changed files with 3,064 additions and 3,098 deletions.
29 changes: 0 additions & 29 deletions CLUEstering/CLUEstering.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ def __init__(self, dc_: float, rhoc_: float, outlier_: float, ppbin: int = 10):
# Initialize attributes
## Data containers
self.clust_data = None
self.scaler = StandardScaler()

## Kernel for calculation of local density
self.kernel = clue_kernels.FlatKernel(0.5)
Expand Down Expand Up @@ -393,25 +392,6 @@ def _handle_dataframe(self, df_: pd.DataFrame) -> None:
n_dim,
n_points)

def _rescale(self) -> None:
"""
Normalizes the input data using a standard scaler
Modified attributes
-------------------
clust_data.coords : np.ndarray
Array containing the coordinates and weight values of the data points
Returns
-------
None
"""

for dim in range(self.clust_data.n_dim):
self.clust_data.coords.T[dim] = \
self.scaler.fit_transform(
self.clust_data.coords.T[dim].reshape(-1, 1)).reshape(1, -1)[0]

def read_data(self,
input_data: Union[pd.DataFrame,str,dict,list,np.ndarray]) -> None:
"""
Expand Down Expand Up @@ -467,9 +447,6 @@ def read_data(self,
df = self._read_dict_df(input_data)
self._handle_dataframe(df)

# Rescale the coordinates with a standard scaler
self._rescale()

def change_coordinates(self, **kwargs: types.FunctionType) -> None:
"""
Change the coordinate system
Expand All @@ -494,12 +471,6 @@ def change_coordinates(self, **kwargs: types.FunctionType) -> None:
for coord, func in kwargs.items():
self.clust_data.coords[int(coord[1])] = func(self.clust_data.original_coords)

# Normalize the coordinate with a standard scaler
self.clust_data.coords[int(coord[1])] = \
self.scaler.fit_transform(
self.clust_data.coords[int(coord[1])].reshape(-1, 1)
).reshape(1, -1)[0]

def choose_kernel(self,
choice: str,
parameters: Union[list,None] = None,
Expand Down
1 change: 1 addition & 0 deletions CLUEstering/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from CLUEstering.CLUEstering import clusterer
from CLUEstering.CLUEstering import test_blobs
from CLUEstering.CLUEstering import backends
1 change: 1 addition & 0 deletions CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
queue_,
cms::alpakatools::make_device_view(device, (*d_tiles)->tileSize(), Ndim),
cms::alpakatools::make_host_view(tile_size, Ndim));
alpaka::wait(queue_);

const Idx tiles_grid_size = cms::alpakatools::divide_up_by(nTiles, block_size);
const auto tiles_working_div =
Expand Down
2 changes: 1 addition & 1 deletion tests/check_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def check_result(output_file, truth_file):
truth = pd.read_csv(truth_file)

# Check if the number of clusters is the same
n_clusters_o = len(truth['cluster_ids'].unique())
n_clusters_o = len(output['cluster_ids'].unique())
n_clusters_t = len(truth['cluster_ids'].unique())
if n_clusters_o != n_clusters_t:
return False
Expand Down
8 changes: 0 additions & 8 deletions tests/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,6 @@ python3 -m pytest test_test_blobs.py
echo "## Test the equality operator for the results of the clustering"
python3 -m pytest test_clusterer_equality.py

# Test the method of changing the domain extremes of the coordinates
echo "## Test the change_domains method"
python3 -m pytest test_change_domains.py

# Test the clustering of points at the opposite extremes of a finite domain
echo "## Test the clustering of points at the opposite extremes of a finite domain"
python3 -m pytest test_domain_extremes.py

if [[ $1 == "-" || $1 == "--clean" ]]
then
rm -f ./*_output.csv
Expand Down
19 changes: 10 additions & 9 deletions tests/test_blob_dataset.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
'''
Testing the algorithm on the blob dataset, a dataset where points are distributed to form
round clusters
Testing the algorithm on the blob dataset, a dataset where points are
distributed to form round clusters
'''

from check_result import check_result
import os
import sys
import pandas as pd
import pytest
sys.path.insert(1, '.')
from check_result import check_result
sys.path.insert(1, '../CLUEstering/')
import CLUEstering as clue


@pytest.fixture
def blobs():
'''
Expand All @@ -22,24 +22,25 @@ def blobs():

def test_clustering(blobs):
'''
Checks that the output of the clustering is the one given by the truth dataset
Checks that the output of the clustering is the one given by the
truth dataset
'''

# Check if the output file already exists and if it does, delete it
if os.path.isfile('./blobs_output.csv'):
os.remove('./blobs_output.csv')

c = clue.clusterer(0.8, 5, 1.5)
c = clue.clusterer(1., 5, 2.)
c.read_data(blobs)
c.run_clue()
c.to_csv('./', 'blobs_output.csv')

check_result('./blobs_output.csv',
'./test_datasets/truth_files/blobs_truth.csv')
assert check_result('./blobs_output.csv',
'./test_datasets/truth_files/blobs_truth.csv')


if __name__ == "__main__":
c = clue.clusterer(0.8, 5, 1.5)
c = clue.clusterer(1., 5, 2.)
c.read_data("./test_datasets/blob.csv")
c.run_clue()
c.cluster_plotter()
36 changes: 18 additions & 18 deletions tests/test_clusterer_equality.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,43 @@


@pytest.fixture
def moons():
def sissa():
'''
Returns the dataframe containing the moon dataset
Returns the dataframe containing the sissa dataset
'''
return pd.read_csv("./test_datasets/moons.csv")
return pd.read_csv("./test_datasets/sissa.csv")


@pytest.fixture
def circles():
def toyDet():
'''
Returns the dataframe containing the circle dataset
Returns the dataframe containing the toy detector dataset
'''
return pd.read_csv("./test_datasets/circles.csv")
return pd.read_csv("./test_datasets/toyDetector.csv")


def test_clusterer_equality(moons, circles):
def test_clusterer_equality(sissa, toyDet):
'''
Test the equality operator for clusterer objects
'''
# Moons dataset
clust1 = clue.clusterer(0.5, 5, 1.)
clust1.read_data(moons)
# Sissa dataset
clust1 = clue.clusterer(20., 10., 1.)
clust1.read_data(sissa)
clust1.run_clue()

# Create a copy of the moons clusterer to check the equality of clusterers
clust1_copy = clue.clusterer(0.5, 5, 1.)
clust1_copy.read_data(moons)
# Create a copy of the sissa clusterer to check the equality of clusterers
clust1_copy = clue.clusterer(20., 10., 1.)
clust1_copy.read_data(sissa)
clust1_copy.run_clue()

# Circles dataset
clust2 = clue.clusterer(0.9, 5, 1.5)
clust2.read_data(circles)
# toyDet dataset
clust2 = clue.clusterer(5., 2.5, 1.)
clust2.read_data(toyDet)
clust2.run_clue()

# Create a copy to check the equality of clusterers
clust2_copy = clue.clusterer(0.9, 5, 1.5)
clust2_copy.read_data(circles)
clust2_copy = clue.clusterer(5., 2.5, 1.)
clust2_copy.read_data(toyDet)
clust2_copy.run_clue()

# Check equality
Expand Down
Loading

0 comments on commit 7410770

Please sign in to comment.