add dbscan python

apache · Sep 17, 2024 · 8e89ad6 · 8e89ad6
1 parent 97769e3
commit 8e89ad6
Show file tree

Hide file tree

Showing 4 changed files with 138 additions and 0 deletions.
diff --git a/python/sedona/stats/__init__.py b/python/sedona/stats/__init__.py
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
diff --git a/python/sedona/stats/clustering/__init__.py b/python/sedona/stats/clustering/__init__.py
@@ -0,0 +1,21 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+"""The clustering module contains Spark-based implementations of popular geospatial clustering algorithms.
+
+These implementations are designed to scale to larger datasets and support various geometric feature types.
+"""
diff --git a/python/sedona/stats/clustering/dbscan.py b/python/sedona/stats/clustering/dbscan.py
@@ -0,0 +1,68 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+"""DBSCAN is a popular clustering algorithm for spatial data.
+
+It identifies groups of data where enough records are close enough to each other. This implementation leverages Spark,
+Sedona and GraphFrames to support large-scale datasets and heterogeneous geometric feature types.
+"""
+from typing import Optional
+
+from pyspark.sql import DataFrame, SparkSession
+
+# NOTE(review): neither constant is referenced elsewhere in this module as shown in the diff —
+# confirm whether they are consumed by other modules or can be removed.
+ID_COLUMN_NAME = "__id"
+DEFAULT_MAX_SAMPLE_SIZE = 1000000  # 1 million
+
+
+def dbscan(
+    dataframe: DataFrame,
+    epsilon: float,
+    min_pts: int,
+    geometry: Optional[str] = None,
+    include_outliers: bool = True,
+    use_spheroid=False,
+):
+    """Annotates a dataframe with a cluster label for each data record using the DBSCAN algorithm.
+
+    The dataframe should contain at least one GeometryType column. Rows must be unique. If one geometry column is
+    present it will be used automatically. If two are present, the one named 'geometry' will be used. If more than one
+    are present and neither is named 'geometry', the column name must be provided.
+
+    Args:
+        dataframe: spark dataframe containing the geometries
+        epsilon: minimum distance parameter of DBSCAN algorithm
+        min_pts: minimum number of points parameter of DBSCAN algorithm
+        geometry: name of the geometry column
+        include_outliers: whether to return outlier points. If True, outliers are returned with a cluster value of -1.
+            Default is False
+        use_spheroid: whether to use a cartesian or spheroidal distance calculation. Default is false
+
+    Returns:
+        A PySpark DataFrame containing the cluster label for each row
+    """
+    sedona = SparkSession.getActiveSession()
+
+    result_df = sedona._jvm.org.apache.sedona.stats.clustering.DBSCAN.dbscan(
+        dataframe._jdf,
+        float(epsilon),
+        min_pts,
+        geometry,
+        include_outliers,
+        use_spheroid,
+    )
+
+    return DataFrame(result_df, sedona)
diff --git a/python/sedona/stats/utils/__init__.py b/python/sedona/stats/utils/__init__.py
@@ -0,0 +1,33 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+from pyspark.sql import DataFrame, Column, SparkSession
+from sedona.sql.types import GeometryType
+
+
+def get_geometry_column_name(df: DataFrame) -> Column:
+    geom_fields = [
+        field.name for field in df.schema.fields if field.dataType == GeometryType()
+    ]
+    if len(geom_fields) > 1:
+        if "geometry" in geom_fields:
+            return "geometry"
+        else:
+            raise ValueError("Multiple geometry columns found in DataFrame")
+    if len(geom_fields) == 0:
+        raise ValueError("No geometry column found in DataFrame")
+    return geom_fields[0]