Skip to content

Commit

Permalink
Include docstrings in schemas - improved readability (#136)
Browse files Browse the repository at this point in the history
* incl docstrings in schema

* lint

* incl docstring in schema

* lint

* fix lint

* lint

* add include_documentation flag

* lint

* lint

* lint

* add test

* lint

* lint

* fix test

* fix

* fix indentation

* fix indentation

* remove commented lines

* rename variables

* rename variables

* rename variables

* rename variables

* rename variables

* add docstring

* improve readability

* improve readability

* fix lint

* fix lint

* rename

* rename

* improve readability

* improve readability
  • Loading branch information
jana-starkova authored Aug 2, 2023
1 parent d3c4781 commit 3c4be6e
Showing 1 changed file with 53 additions and 36 deletions.
89 changes: 53 additions & 36 deletions typedspark/_schema/get_schema_definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,6 @@ def get_schema_definition_as_string(
return imports + schema_string


def _get_comment(schema: Type[Schema], col_name: str) -> str:
"""Return the comment of a given column."""
if (
hasattr(schema.__annotations__[col_name], "__metadata__")
and schema.__annotations__[col_name].__metadata__ is not None
):
comment = schema.__annotations__[col_name].__metadata__[0]
else:
comment = ""
return comment


def _build_schema_definition_string(
schema: Type[Schema],
include_documentation: bool,
Expand All @@ -55,38 +43,67 @@ def _build_schema_definition_string(
) -> str:
"""Return the code for a given ``Schema`` as a string."""
lines = f"class {class_name}(Schema):\n"

if include_documentation:
if schema.get_docstring() is not None:
lines += f' """{schema.get_docstring()}"""\n\n'
else:
lines += ' """Add documentation here."""\n\n'

for col_name, col_object in get_type_hints(schema).items():
typehint = (
str(col_object)
.replace("typedspark._core.column.", "")
.replace("typedspark._core.datatypes.", "")
.replace("typedspark._schema.schema.", "")
.replace("pyspark.sql.types.", "")
.replace("typing.", "")
)
typehint = _replace_literals(
typehint, replace_literals_in=DayTimeIntervalType, replace_literals_by=IntervalType
)
if include_documentation:
col_annotated_start = f" {col_name}: Annotated[{typehint}, "
if col_name in schema.__annotations__:
comment = _get_comment(schema, col_name)
lines += f'{col_annotated_start}ColumnMeta(comment="{comment}")]\n'
else:
lines += f" {col_name}: {typehint}\n"
lines += _create_docstring(schema)

lines += _add_lines_with_typehint(include_documentation, schema)

if add_subschemas:
lines += _add_subschemas(schema, add_subschemas, include_documentation)

return lines


def _create_docstring(schema: Type[Schema]) -> str:
"""Create the docstring for a given ``Schema``."""
if schema.get_docstring() is not None:
docstring = f' """{schema.get_docstring()}"""\n\n'
else:
docstring = ' """Add documentation here."""\n\n'
return docstring


def _add_lines_with_typehint(include_documentation, schema):
    """Render one source line per column of the ``Schema``.

    When ``include_documentation`` is truthy, each column is written as an
    ``Annotated[...]`` typehint carrying a ``ColumnMeta`` with the column's
    comment; otherwise only the bare typehint is written. The rendered lines
    are returned concatenated into a single string.
    """
    # include_extras=True keeps any Annotated[...] wrapper on the resolved hint.
    column_types = get_type_hints(schema, include_extras=True)

    rendered = []
    for col_name, col_type in column_types.items():
        typehint, comment = _create_typehint_and_comment(col_type)
        if include_documentation:
            rendered.append(
                f' {col_name}: Annotated[{typehint}, ColumnMeta(comment="{comment}")]\n'
            )
        else:
            rendered.append(f" {col_name}: {typehint}\n")
    return "".join(rendered)


def _create_typehint_and_comment(col_type) -> list[str]:
    """Turn a column's type into a ``[typehint, comment]`` pair of strings.

    The string form of ``col_type`` is stripped of the known module prefixes,
    the comment is split off via ``_extract_comment``, and the literals of
    ``DayTimeIntervalType`` are rewritten via ``_replace_literals``.
    """
    # Removed in this exact order, mirroring a chain of str.replace() calls.
    module_prefixes = (
        "typedspark._core.column.",
        "typedspark._core.datatypes.",
        "typedspark._schema.schema.",
        "pyspark.sql.types.",
        "typing.",
    )

    rendered = str(col_type)
    for prefix in module_prefixes:
        rendered = rendered.replace(prefix, "")

    rendered, comment = _extract_comment(rendered)
    rendered = _replace_literals(
        rendered,
        replace_literals_in=DayTimeIntervalType,
        replace_literals_by=IntervalType,
    )
    return [rendered, comment]


def _extract_comment(typehint: str) -> tuple[str, str]:
"""Extract the comment from a typehint."""
comment = ""
if "Annotated" in typehint:
match = re.search(r"Annotated\[(.*), '(.*)'\]", typehint)
if match is not None:
typehint, comment = match.groups()
return typehint, comment


def _replace_literals(
typehint: str,
replace_literals_in: Type[TypedSparkDataType],
Expand Down

0 comments on commit 3c4be6e

Please sign in to comment.