Commit 3005f7c

add dbscan tests, pom file changes, pip changes

jameswillis committed Sep 17, 2024
1 parent 8e89ad6
Showing 6 changed files with 236 additions and 0 deletions.
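For orientation, here is a minimal sketch of the clustering API the new tests exercise, inferred from the calls in test_dbscan.py below; an existing SedonaContext session named `spark` is assumed (see python/tests/test_base.py):

```python
# Sketch only: assumes an existing SedonaContext session named `spark`.
from sedona.sql.st_constructors import ST_MakePoint
from sedona.stats.clustering.dbscan import dbscan

df = spark.createDataFrame(
    [{"id": 1, "x": 1.0, "y": 2.0}, {"id": 2, "x": 1.2, "y": 2.1}]
).select(ST_MakePoint("x", "y").alias("geometry"), "id")

# epsilon is the neighborhood radius and min_pts the core-point threshold;
# the result adds a `cluster` column, with -1 marking outliers unless
# include_outliers=False drops them.
clustered = dbscan(df, 0.5, 2, "geometry")
clustered.show()
```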
13 changes: 13 additions & 0 deletions pom.xml
@@ -85,6 +85,7 @@
        <spark.version>3.3.0</spark.version>
        <spark.compat.version>3.3</spark.compat.version>
        <log4j.version>2.17.2</log4j.version>
        <graphframe.version>0.8.3-spark3.4</graphframe.version>

        <flink.version>1.19.0</flink.version>
        <slf4j.version>1.7.36</slf4j.version>
@@ -394,6 +395,10 @@
                <enabled>true</enabled>
            </releases>
        </repository>
        <repository>
            <id>Spark Packages</id>
            <url>https://repos.spark-packages.org/</url>
        </repository>
    </repositories>
    <build>
        <pluginManagement>
@@ -578,6 +583,8 @@
                        <scala.compat.version>${scala.compat.version}</scala.compat.version>
                        <spark.version>${spark.version}</spark.version>
                        <scala.version>${scala.version}</scala.version>
                        <log4j.version>${log4j.version}</log4j.version>
                        <graphframe.version>${graphframe.version}</graphframe.version>
                    </properties>
                </configuration>
                <executions>
@@ -686,6 +693,7 @@
                <spark.version>3.0.3</spark.version>
                <spark.compat.version>3.0</spark.compat.version>
                <log4j.version>2.17.2</log4j.version>
                <graphframe.version>0.8.1-spark3.0</graphframe.version>
                <!-- Skip deploying the parent module; it will be deployed with sedona-spark-3.3 -->
                <skip.deploy.common.modules>true</skip.deploy.common.modules>
            </properties>
@@ -703,6 +711,7 @@
                <spark.version>3.1.2</spark.version>
                <spark.compat.version>3.1</spark.compat.version>
                <log4j.version>2.17.2</log4j.version>
                <graphframe.version>0.8.2-spark3.1</graphframe.version>
                <!-- Skip deploying the parent module; it will be deployed with sedona-spark-3.3 -->
                <skip.deploy.common.modules>true</skip.deploy.common.modules>
            </properties>
@@ -720,6 +729,7 @@
                <spark.version>3.2.0</spark.version>
                <spark.compat.version>3.2</spark.compat.version>
                <log4j.version>2.17.2</log4j.version>
                <graphframe.version>0.8.2-spark3.2</graphframe.version>
                <!-- Skip deploying the parent module; it will be deployed with sedona-spark-3.3 -->
                <skip.deploy.common.modules>true</skip.deploy.common.modules>
            </properties>
@@ -738,6 +748,7 @@
                <spark.version>3.3.0</spark.version>
                <spark.compat.version>3.3</spark.compat.version>
                <log4j.version>2.17.2</log4j.version>
                <graphframe.version>0.8.3-spark3.4</graphframe.version>
            </properties>
        </profile>
        <profile>
@@ -752,6 +763,7 @@
                <spark.version>3.4.0</spark.version>
                <spark.compat.version>3.4</spark.compat.version>
                <log4j.version>2.19.0</log4j.version>
                <graphframe.version>0.8.3-spark3.4</graphframe.version>
                <!-- Skip deploying the parent module; it will be deployed with sedona-spark-3.3 -->
                <skip.deploy.common.modules>true</skip.deploy.common.modules>
            </properties>
@@ -768,6 +780,7 @@
                <spark.version>3.5.0</spark.version>
                <spark.compat.version>3.5</spark.compat.version>
                <log4j.version>2.20.0</log4j.version>
                <graphframe.version>0.8.3-spark3.5</graphframe.version>
                <!-- Skip deploying the parent module; it will be deployed with sedona-spark-3.3 -->
                <skip.deploy.common.modules>true</skip.deploy.common.modules>
            </properties>
2 changes: 2 additions & 0 deletions python/Pipfile
@@ -10,6 +10,8 @@ jupyter="*"
mkdocs="*"
pytest-cov = "*"

scikit-learn = "*"

[packages]
pandas="<=1.5.3"
numpy="<2"
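scikit-learn joins the dev dependencies because the new tests use its DBSCAN as the reference implementation for Sedona's cluster labels, along these lines:

```python
# Reference labels, as the tests compute them (see get_expected_result below).
from sklearn.cluster import DBSCAN

points = [[1.0, 2.0], [1.2, 2.1], [10.0, 10.0]]
labels = DBSCAN(eps=0.5, min_samples=2).fit(points).labels_
# -1 marks noise/outliers, the same convention the Sedona tests rely on.
print(labels.tolist())  # [0, 0, -1]
```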
Empty file added python/tests/stats/__init__.py
214 changes: 214 additions & 0 deletions python/tests/stats/test_dbscan.py
@@ -0,0 +1,214 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pyspark.sql.functions as f
import pytest
from sklearn.cluster import DBSCAN as sklearnDBSCAN

from sedona.sql.st_constructors import ST_MakePoint
from sedona.sql.st_functions import ST_Buffer
from sedona.stats.clustering.dbscan import dbscan

from tests.test_base import TestBase


class TestDBScan(TestBase):

    def get_data(self):
        return [
            {"id": 1, "x": 1.0, "y": 2.0},
            {"id": 2, "x": 3.0, "y": 4.0},
            {"id": 3, "x": 2.5, "y": 4.0},
            {"id": 4, "x": 1.5, "y": 2.5},
            {"id": 5, "x": 3.0, "y": 5.0},
            {"id": 6, "x": 12.8, "y": 4.5},
            {"id": 7, "x": 2.5, "y": 4.5},
            {"id": 8, "x": 1.2, "y": 2.5},
            {"id": 9, "x": 1.0, "y": 3.0},
            {"id": 10, "x": 1.0, "y": 5.0},
            {"id": 11, "x": 1.0, "y": 2.5},
            {"id": 12, "x": 5.0, "y": 6.0},
            {"id": 13, "x": 4.0, "y": 3.0},
        ]

    def create_sample_dataframe(self):
        return (
            self.spark.createDataFrame(self.get_data())
            .select(ST_MakePoint("x", "y").alias("arealandmark"), "id")
            .repartition(9)
        )

    def get_expected_result(self, input_data, epsilon, min_pts, include_outliers=True):
        labels = (
            sklearnDBSCAN(eps=epsilon, min_samples=min_pts)
            .fit([[datum["x"], datum["y"]] for datum in input_data])
            .labels_
        )
        # Sample ids are 1-based, so pair each label with its 1-based position.
        expected = list(enumerate(labels, start=1))
        clusters = [x for x in set(labels) if (x != -1 or include_outliers)]
        cluster_members = {
            frozenset([y[0] for y in expected if y[1] == x]) for x in clusters
        }
        return cluster_members

    def get_actual_results(
        self,
        input_data,
        epsilon,
        min_pts,
        geometry=None,
        id=None,
        include_outliers=True,
    ):
        result = dbscan(
            input_data, epsilon, min_pts, geometry, include_outliers=include_outliers
        )
        id = id or "id"
        cluster_members = [
            (x[id], x.cluster)
            for x in result.collect()
            if x.cluster != -1 or include_outliers
        ]

        clusters = {
            frozenset([y[0] for y in cluster_members if y[1] == x])
            for x in {y[1] for y in cluster_members}
        }

        return clusters

    def test_dbscan_valid_parameters(self):
        # Repeated broadcast joins on this small dataset use a lot of RAM
        # holding broadcast references, so disable broadcast joins here.
        prior_join_threshold = self.spark.conf.get(
            "sedona.join.autoBroadcastJoinThreshold", None
        )
        self.spark.conf.set("sedona.join.autoBroadcastJoinThreshold", -1)
        try:
            df = self.create_sample_dataframe()
            for epsilon in [0.6, 0.7, 0.8]:
                for min_pts in [3, 4, 5]:
                    assert self.get_expected_result(
                        self.get_data(), epsilon, min_pts
                    ) == self.get_actual_results(df, epsilon, min_pts)
        finally:
            # Restore the prior setting so other tests are unaffected.
            if prior_join_threshold is None:
                self.spark.conf.unset("sedona.join.autoBroadcastJoinThreshold")
            else:
                self.spark.conf.set(
                    "sedona.join.autoBroadcastJoinThreshold", prior_join_threshold
                )

    def test_dbscan_valid_parameters_default_column_name(self):
        df = self.create_sample_dataframe().select(
            "id", f.col("arealandmark").alias("geometryFieldName")
        )
        epsilon = 0.6
        min_pts = 4

        assert self.get_expected_result(
            self.get_data(), epsilon, min_pts
        ) == self.get_actual_results(df, epsilon, min_pts)

    def test_dbscan_valid_parameters_polygons(self):
        df = self.create_sample_dataframe().select(
            "id", ST_Buffer(f.col("arealandmark"), 0.000001).alias("geometryFieldName")
        )
        epsilon = 0.6
        min_pts = 4

        assert self.get_expected_result(
            self.get_data(), epsilon, min_pts
        ) == self.get_actual_results(df, epsilon, min_pts)

    def test_dbscan_supports_other_distance_function(self):
        df = self.create_sample_dataframe().select(
            "id", ST_Buffer(f.col("arealandmark"), 0.000001).alias("geometryFieldName")
        )
        epsilon = 0.6
        min_pts = 4

        dbscan(
            df,
            epsilon,
            min_pts,
            "geometryFieldName",
            use_spheroid=True,
        )

    def test_dbscan_invalid_epsilon(self):
        df = self.create_sample_dataframe()

        with pytest.raises(Exception):
            dbscan(df, -0.1, 5, "arealandmark")

    def test_dbscan_invalid_min_pts(self):
        df = self.create_sample_dataframe()

        with pytest.raises(Exception):
            dbscan(df, 0.1, -5, "arealandmark")

    def test_dbscan_invalid_geometry_column(self):
        df = self.create_sample_dataframe()

        with pytest.raises(Exception):
            dbscan(df, 0.1, 5, "invalid_column")

    def test_return_empty_df_when_no_clusters(self):
        df = self.create_sample_dataframe()
        epsilon = 0.1
        min_pts = 10000

        assert (
            dbscan(df, epsilon, min_pts, "arealandmark", include_outliers=False).count()
            == 0
        )
        # Use parameters known to yield clusters so the schema comparison hits
        # the happy path.
        assert (
            dbscan(df, epsilon, min_pts, "arealandmark", include_outliers=False).schema
            == dbscan(df, 0.6, 3, "arealandmark").schema
        )

    def test_dbscan_doesnt_duplicate_border_points_in_two_clusters(self):
        input_df = self.spark.createDataFrame(
            [
                {"id": 10, "x": 1.0, "y": 1.8},
                {"id": 11, "x": 1.0, "y": 1.9},
                {"id": 12, "x": 1.0, "y": 2.0},
                {"id": 13, "x": 1.0, "y": 2.1},
                {"id": 14, "x": 2.0, "y": 2.0},
                {"id": 15, "x": 3.0, "y": 1.9},
                {"id": 16, "x": 3.0, "y": 2.0},
                {"id": 17, "x": 3.0, "y": 2.1},
                {"id": 18, "x": 3.0, "y": 2.2},
            ]
        ).select(ST_MakePoint("x", "y").alias("geometry"), "id")

        output_df = dbscan(input_df, 1.0, 4)

        # 9 inputs and 9 outputs: no id occurs in more than one cluster.
        assert output_df.count() == 9
        assert output_df.select("cluster").distinct().count() == 2

    def test_return_outliers_false_doesnt_return_outliers(self):
        df = self.create_sample_dataframe()
        for epsilon in [0.6, 0.7, 0.8]:
            for min_pts in [3, 4, 5]:
                assert self.get_expected_result(
                    self.get_data(), epsilon, min_pts, include_outliers=False
                ) == self.get_actual_results(
                    df, epsilon, min_pts, include_outliers=False
                )
2 changes: 2 additions & 0 deletions python/tests/test_base.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.

from tempfile import mkdtemp
from sedona.spark import *
from sedona.utils.decorators import classproperty

@@ -25,6 +26,7 @@ class TestBase:
    def spark(self):
        if not hasattr(self, "__spark"):
            spark = SedonaContext.create(
                SedonaContext.builder().master("local[*]").getOrCreate()
            )
            # GraphFrames connected components (used by dbscan) needs a checkpoint dir.
            spark.sparkContext.setCheckpointDir(mkdtemp())
            setattr(self, "__spark", spark)
        return getattr(self, "__spark")

Expand Down
5 changes: 5 additions & 0 deletions spark/common/pom.xml
@@ -157,6 +157,11 @@
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>graphframes</groupId>
            <artifactId>graphframes</artifactId>
            <version>${graphframe.version}-s_${scala.compat.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
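The graphframes artifact resolves from the Spark Packages repository added to pom.xml above; it is not published to Maven Central. Its connected-components step is also why test_base.py now sets a checkpoint directory. Here is a sketch of equivalent session wiring on the Python side, with the package coordinate assembled from the pom's `${graphframe.version}-s_${scala.compat.version}` pattern (Spark 3.5 / Scala 2.12 assumed):

```python
# Sketch only: mirrors what these changes provide for the tests.
from tempfile import mkdtemp

from sedona.spark import SedonaContext

spark = SedonaContext.create(
    SedonaContext.builder()
    .master("local[*]")
    # Coordinate assembled from the pom properties; adjust per Spark/Scala version.
    .config("spark.jars.packages", "graphframes:graphframes:0.8.3-spark3.5-s_2.12")
    .getOrCreate()
)
# GraphFrames connected components requires a checkpoint directory.
spark.sparkContext.setCheckpointDir(mkdtemp())
```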
