From 3c4be6efd14ded0548bd0bcf8303f15fbc5f6c05 Mon Sep 17 00:00:00 2001
From: jana-starkova <45129167+jana-starkova@users.noreply.github.com>
Date: Wed, 2 Aug 2023 15:44:01 +0200
Subject: [PATCH] Include docstrings in schemas - improved readability (#136)

* incl docstrings in schema

* lint

* incl docstring in schema

* lint

* fix lint

* lint

* add include_documentation flag

* lint

* lint

* lint

* add test

* lint

* lint

* fix test

* fix

* fix indentation

* fix indentation

* remove commented lines

* rename variables

* rename variables

* rename variables

* rename variables

* rename variables

* add docstring

* improve readability

* improve readability

* fix lint

* fix lint

* rename

* rename

* improve readability

* improve readability
---
 typedspark/_schema/get_schema_definition.py | 89 ++++++++++++---------
 1 file changed, 53 insertions(+), 36 deletions(-)

diff --git a/typedspark/_schema/get_schema_definition.py b/typedspark/_schema/get_schema_definition.py
index 9bf546c..237b722 100644
--- a/typedspark/_schema/get_schema_definition.py
+++ b/typedspark/_schema/get_schema_definition.py
@@ -35,18 +35,6 @@ def get_schema_definition_as_string(
     return imports + schema_string
 
 
-def _get_comment(schema: Type[Schema], col_name: str) -> str:
-    """Return the comment of a given column."""
-    if (
-        hasattr(schema.__annotations__[col_name], "__metadata__")
-        and schema.__annotations__[col_name].__metadata__ is not None
-    ):
-        comment = schema.__annotations__[col_name].__metadata__[0]
-    else:
-        comment = ""
-    return comment
-
-
 def _build_schema_definition_string(
     schema: Type[Schema],
     include_documentation: bool,
@@ -55,31 +43,11 @@ def _build_schema_definition_string(
 ) -> str:
     """Return the code for a given ``Schema`` as a string."""
     lines = f"class {class_name}(Schema):\n"
+
     if include_documentation:
-        if schema.get_docstring() is not None:
-            lines += f'    """{schema.get_docstring()}"""\n\n'
-        else:
-            lines += '    """Add documentation here."""\n\n'
-
-    for col_name, col_object in get_type_hints(schema).items():
-        typehint = (
-            str(col_object)
-            .replace("typedspark._core.column.", "")
-            .replace("typedspark._core.datatypes.", "")
-            .replace("typedspark._schema.schema.", "")
-            .replace("pyspark.sql.types.", "")
-            .replace("typing.", "")
-        )
-        typehint = _replace_literals(
-            typehint, replace_literals_in=DayTimeIntervalType, replace_literals_by=IntervalType
-        )
-        if include_documentation:
-            col_annotated_start = f"    {col_name}: Annotated[{typehint}, "
-            if col_name in schema.__annotations__:
-                comment = _get_comment(schema, col_name)
-                lines += f'{col_annotated_start}ColumnMeta(comment="{comment}")]\n'
-        else:
-            lines += f"    {col_name}: {typehint}\n"
+        lines += _create_docstring(schema)
+
+    lines += _add_lines_with_typehint(include_documentation, schema)
 
     if add_subschemas:
         lines += _add_subschemas(schema, add_subschemas, include_documentation)
@@ -87,6 +55,55 @@ def _build_schema_definition_string(
     return lines
 
 
+def _create_docstring(schema: Type[Schema]) -> str:
+    """Create the docstring line for the generated code of a given ``Schema``."""
+    # Ask the schema once and reuse the result for both the check and the output.
+    docstring = schema.get_docstring()
+    if docstring is None:
+        docstring = "Add documentation here."  # placeholder for undocumented schemas
+    return f'    """{docstring}"""\n\n'
+
+
+def _add_lines_with_typehint(include_documentation: bool, schema: Type[Schema]) -> str:
+    """Return one indented ``name: typehint`` line of code per column in the ``Schema``."""
+    lines = []
+    # ``include_extras=True`` keeps ``Annotated`` metadata, which carries the column comment.
+    for col_name, col_type in get_type_hints(schema, include_extras=True).items():
+        typehint, comment = _create_typehint_and_comment(col_type)
+        if include_documentation:
+            # Documented output always wraps the type, even when the comment is empty.
+            typehint = f'Annotated[{typehint}, ColumnMeta(comment="{comment}")]'
+        lines.append(f"    {col_name}: {typehint}\n")
+    return "".join(lines)
+
+
+def _create_typehint_and_comment(col_type) -> tuple[str, str]:
+    """Create the ``(typehint, comment)`` pair for a given column type."""
+    typehint = str(col_type)
+    # Drop module paths so only the bare class names remain in the typehint.
+    prefixes = ("typedspark._core.column.", "typedspark._core.datatypes.")
+    prefixes += ("typedspark._schema.schema.", "pyspark.sql.types.", "typing.")
+    for prefix in prefixes:
+        typehint = typehint.replace(prefix, "")
+    # Split off the comment first, so literals are only replaced inside the type itself.
+    typehint, comment = _extract_comment(typehint)
+    typehint = _replace_literals(
+        typehint, replace_literals_in=DayTimeIntervalType, replace_literals_by=IntervalType
+    )
+    # A fixed-size pair is returned as a tuple, matching ``_extract_comment``.
+    return typehint, comment
+
+
+def _extract_comment(typehint: str) -> tuple[str, str]:
+    """Split a stringified ``Annotated[type, 'comment']`` into its type and comment."""
+    # ``repr`` uses double quotes when the comment contains ``'``, so accept either quote.
+    match = re.search(r"Annotated\[(.*), (['\"])(.*)\2\]", typehint)
+    if match is None:
+        return typehint, ""  # no string-valued Annotated metadata: nothing to split
+    # Group 2 is only the quote character; it is not part of the result.
+    return match.group(1), match.group(3)
+
+
 def _replace_literals(
     typehint: str,
     replace_literals_in: Type[TypedSparkDataType],