From 3c4be6efd14ded0548bd0bcf8303f15fbc5f6c05 Mon Sep 17 00:00:00 2001 From: jana-starkova <45129167+jana-starkova@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:44:01 +0200 Subject: [PATCH] Include docstrings in schemas - improved readability (#136) * incl docstrings in schema * lint * incl docstring in schema * lint * fix lint * lint * add include_documentation flag * lint * lint * lint * add test * lint * lint * fix test * fix * fix indentation * fix indentation * remove commented lines * rename variables * rename variables * rename variables * rename variables * rename variables * add docstring * improve readability * improve readability * fix lint * fix lint * rename * rename * improve readability * improve readability --- typedspark/_schema/get_schema_definition.py | 89 ++++++++++++--------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/typedspark/_schema/get_schema_definition.py b/typedspark/_schema/get_schema_definition.py index 9bf546c..237b722 100644 --- a/typedspark/_schema/get_schema_definition.py +++ b/typedspark/_schema/get_schema_definition.py @@ -35,18 +35,6 @@ def get_schema_definition_as_string( return imports + schema_string -def _get_comment(schema: Type[Schema], col_name: str) -> str: - """Return the comment of a given column.""" - if ( - hasattr(schema.__annotations__[col_name], "__metadata__") - and schema.__annotations__[col_name].__metadata__ is not None - ): - comment = schema.__annotations__[col_name].__metadata__[0] - else: - comment = "" - return comment - - def _build_schema_definition_string( schema: Type[Schema], include_documentation: bool, @@ -55,31 +43,11 @@ def _build_schema_definition_string( ) -> str: """Return the code for a given ``Schema`` as a string.""" lines = f"class {class_name}(Schema):\n" + if include_documentation: - if schema.get_docstring() is not None: - lines += f' """{schema.get_docstring()}"""\n\n' - else: - lines += ' """Add documentation here."""\n\n' - - for col_name, col_object in 
get_type_hints(schema).items(): - typehint = ( - str(col_object) - .replace("typedspark._core.column.", "") - .replace("typedspark._core.datatypes.", "") - .replace("typedspark._schema.schema.", "") - .replace("pyspark.sql.types.", "") - .replace("typing.", "") - ) - typehint = _replace_literals( - typehint, replace_literals_in=DayTimeIntervalType, replace_literals_by=IntervalType - ) - if include_documentation: - col_annotated_start = f" {col_name}: Annotated[{typehint}, " - if col_name in schema.__annotations__: - comment = _get_comment(schema, col_name) - lines += f'{col_annotated_start}ColumnMeta(comment="{comment}")]\n' - else: - lines += f" {col_name}: {typehint}\n" + lines += _create_docstring(schema) + + lines += _add_lines_with_typehint(include_documentation, schema) if add_subschemas: lines += _add_subschemas(schema, add_subschemas, include_documentation) @@ -87,6 +55,55 @@ def _build_schema_definition_string( return lines +def _create_docstring(schema: Type[Schema]) -> str: + """Create the docstring for a given ``Schema``.""" + if schema.get_docstring() is not None: + docstring = f' """{schema.get_docstring()}"""\n\n' + else: + docstring = ' """Add documentation here."""\n\n' + return docstring + + +def _add_lines_with_typehint(include_documentation, schema): + """Add a line with the typehint for each column in the ``Schema``.""" + lines = "" + for col_name, col_type in get_type_hints(schema, include_extras=True).items(): + typehint, comment = _create_typehint_and_comment(col_type) + + if include_documentation: + lines += f' {col_name}: Annotated[{typehint}, ColumnMeta(comment="{comment}")]\n' + else: + lines += f" {col_name}: {typehint}\n" + return lines + + +def _create_typehint_and_comment(col_type) -> list[str]: + """Create a typehint and comment for a given column.""" + typehint = ( + str(col_type) + .replace("typedspark._core.column.", "") + .replace("typedspark._core.datatypes.", "") + .replace("typedspark._schema.schema.", "") + 
.replace("pyspark.sql.types.", "") + .replace("typing.", "") + ) + typehint, comment = _extract_comment(typehint) + typehint = _replace_literals( + typehint, replace_literals_in=DayTimeIntervalType, replace_literals_by=IntervalType + ) + return [typehint, comment] + + +def _extract_comment(typehint: str) -> tuple[str, str]: + """Extract the comment from a typehint.""" + comment = "" + if "Annotated" in typehint: + match = re.search(r"Annotated\[(.*), '(.*)'\]", typehint) + if match is not None: + typehint, comment = match.groups() + return typehint, comment + + def _replace_literals( typehint: str, replace_literals_in: Type[TypedSparkDataType],