Remove automatic scaling (#57)

* Remove default use of scaler * Update parameters in tests * Update version * Update test of clusterer equality * Update run_test.sh * Add sync on alpaka memcpy * Update test parameters and truth files * Add backends to __init__.py
cms-patatrack · Aug 6, 2024 · 7410770 · 7410770
1 parent 92245d0
commit 7410770
Show file tree

Hide file tree

Showing 14 changed files with 3,064 additions and 3,098 deletions.
diff --git a/CLUEstering/CLUEstering.py b/CLUEstering/CLUEstering.py
@@ -252,7 +252,6 @@ def __init__(self, dc_: float, rhoc_: float, outlier_: float, ppbin: int = 10):
         # Initialize attributes
         ## Data containers
         self.clust_data = None
-        self.scaler = StandardScaler()
 
         ## Kernel for calculation of local density
         self.kernel = clue_kernels.FlatKernel(0.5)
@@ -393,25 +392,6 @@ def _handle_dataframe(self, df_: pd.DataFrame) -> None:
                                           n_dim,
                                           n_points)
 
-    def _rescale(self) -> None:
-        """
-        Normalizes the input data using a standard scaler
-
-        Modified attributes
-        -------------------
-        clust_data.coords : np.ndarray
-            Array containing the coordinates and weight values of the data points
-
-        Returns
-        -------
-        None
-        """
-
-        for dim in range(self.clust_data.n_dim):
-            self.clust_data.coords.T[dim] = \
-            self.scaler.fit_transform(
-                    self.clust_data.coords.T[dim].reshape(-1, 1)).reshape(1, -1)[0]
-
     def read_data(self,
                   input_data: Union[pd.DataFrame,str,dict,list,np.ndarray]) -> None:
         """
@@ -467,9 +447,6 @@ def read_data(self,
             df = self._read_dict_df(input_data)
             self._handle_dataframe(df)
 
-        # Rescale the coordinates with a standard scaler
-        self._rescale()
-
     def change_coordinates(self, **kwargs: types.FunctionType) -> None:
         """
         Change the coordinate system
@@ -494,12 +471,6 @@ def change_coordinates(self, **kwargs: types.FunctionType) -> None:
         for coord, func in kwargs.items():
             self.clust_data.coords[int(coord[1])] = func(self.clust_data.original_coords)
 
-            # Normalize the coordinate with a standard scaler
-            self.clust_data.coords[int(coord[1])] = \
-                self.scaler.fit_transform(
-                    self.clust_data.coords[int(coord[1])].reshape(-1, 1)
-                ).reshape(1, -1)[0]
-
     def choose_kernel(self,
                       choice: str,
                       parameters: Union[list,None] = None,

diff --git a/CLUEstering/__init__.py b/CLUEstering/__init__.py
@@ -1,2 +1,3 @@
 from CLUEstering.CLUEstering import clusterer
 from CLUEstering.CLUEstering import test_blobs
+from CLUEstering.CLUEstering import backends
diff --git a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h
@@ -146,6 +146,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
         queue_,
         cms::alpakatools::make_device_view(device, (*d_tiles)->tileSize(), Ndim),
         cms::alpakatools::make_host_view(tile_size, Ndim));
+    alpaka::wait(queue_);
 
     const Idx tiles_grid_size = cms::alpakatools::divide_up_by(nTiles, block_size);
     const auto tiles_working_div =

diff --git a/tests/check_result.py b/tests/check_result.py
@@ -11,7 +11,7 @@ def check_result(output_file, truth_file):
     truth = pd.read_csv(truth_file)
 
     # Check if the number of clusters is the same
-    n_clusters_o = len(truth['cluster_ids'].unique())
+    n_clusters_o = len(output['cluster_ids'].unique())
     n_clusters_t = len(truth['cluster_ids'].unique())
     if n_clusters_o != n_clusters_t:
         return False

diff --git a/tests/run_test.sh b/tests/run_test.sh
@@ -27,14 +27,6 @@ python3 -m pytest test_test_blobs.py
 echo "## Test the equality operator for the results of the clustering"
 python3 -m pytest test_clusterer_equality.py
 
-# Test the method of changind the domain extremes of the coordinates
-echo "## Test the change_domains method"
-python3 -m pytest test_change_domains.py
-
-# Test the clustering of points at the opposite extremes of a finite domain
-echo "## Test the clustering of points at the opposite extremes of a finite domain"
-python3 -m pytest test_domain_extremes.py
-
 if [[ $1 == "-" || $1 == "--clean" ]]
 then
   rm -f ./*_output.csv

diff --git a/tests/test_blob_dataset.py b/tests/test_blob_dataset.py
@@ -1,17 +1,17 @@
 '''
-Testing the algorithm on the blob dataset, a dataset where points are distributed to form
-round clusters
+Testing the algorithm on the blob dataset, a dataset where points are
+distributed to form round clusters
 '''
 
+from check_result import check_result
 import os
 import sys
 import pandas as pd
 import pytest
-sys.path.insert(1, '.')
-from check_result import check_result
 sys.path.insert(1, '../CLUEstering/')
 import CLUEstering as clue
 
+
 @pytest.fixture
 def blobs():
     '''
@@ -22,24 +22,25 @@ def blobs():
 
 def test_clustering(blobs):
     '''
-    Checks that the output of the clustering is the one given by the truth dataset
+    Checks that the output of the clustering is the one given by the
+    truth dataset
     '''
 
     # Check if the output file already exists and if it does, delete it
     if os.path.isfile('./blobs_output.csv'):
         os.remove('./blobs_output.csv')
 
-    c = clue.clusterer(0.8, 5, 1.5)
+    c = clue.clusterer(1., 5, 2.)
     c.read_data(blobs)
     c.run_clue()
     c.to_csv('./', 'blobs_output.csv')
 
-    check_result('./blobs_output.csv',
-                 './test_datasets/truth_files/blobs_truth.csv')
+    assert check_result('./blobs_output.csv',
+                        './test_datasets/truth_files/blobs_truth.csv')
 
 
 if __name__ == "__main__":
-    c = clue.clusterer(0.8, 5, 1.5)
+    c = clue.clusterer(1., 5, 2.)
     c.read_data("./test_datasets/blob.csv")
     c.run_clue()
     c.cluster_plotter()
diff --git a/tests/test_clusterer_equality.py b/tests/test_clusterer_equality.py
@@ -10,43 +10,43 @@
 
 
 @pytest.fixture
-def moons():
+def sissa():
     '''
-    Returns the dataframe containing the moon dataset
+    Returns the dataframe containing the sissa dataset
     '''
-    return pd.read_csv("./test_datasets/moons.csv")
+    return pd.read_csv("./test_datasets/sissa.csv")
 
 
 @pytest.fixture
-def circles():
+def toyDet():
     '''
-    Returns the dataframe containing the circle dataset
+    Returns the dataframe containing the toy detector dataset
     '''
-    return pd.read_csv("./test_datasets/circles.csv")
+    return pd.read_csv("./test_datasets/toyDetector.csv")
 
 
-def test_clusterer_equality(moons, circles):
+def test_clusterer_equality(sissa, toyDet):
     '''
     Test the equality operator for clusterer objects
     '''
-    # Moons dataset
-    clust1 = clue.clusterer(0.5, 5, 1.)
-    clust1.read_data(moons)
+    # Sissa dataset
+    clust1 = clue.clusterer(20., 10., 1.)
+    clust1.read_data(sissa)
     clust1.run_clue()
 
-    # Create a copy of the moons clusterer to check the equality of clusterers
-    clust1_copy = clue.clusterer(0.5, 5, 1.)
-    clust1_copy.read_data(moons)
+    # Create a copy of the sissa clusterer to check the equality of clusterers
+    clust1_copy = clue.clusterer(20., 10., 1.)
+    clust1_copy.read_data(sissa)
     clust1_copy.run_clue()
 
-    # Circles dataset
-    clust2 = clue.clusterer(0.9, 5, 1.5)
-    clust2.read_data(circles)
+    # toyDet dataset
+    clust2 = clue.clusterer(5., 2.5, 1.)
+    clust2.read_data(toyDet)
     clust2.run_clue()
 
     # Create a copy to check the equality of clusterers
-    clust2_copy = clue.clusterer(0.9, 5, 1.5)
-    clust2_copy.read_data(circles)
+    clust2_copy = clue.clusterer(5., 2.5, 1.)
+    clust2_copy.read_data(toyDet)
     clust2_copy.run_clue()
 
     # Check equality