Skip to content

Commit

Permalink
update repr_llm and add DataFrameSummarizer with customizable sum…
Browse files Browse the repository at this point in the history
…marizing function (#323)

* update repr_llm

* wip

* getter

* compat

* cleanup
  • Loading branch information
shouples authored Nov 2, 2023
1 parent 0bf33e6 commit 957e805
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 10 deletions.
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ mkdocstrings = { version = ">=0.19,<0.22", optional = true }
mkdocstrings-python = { version = ">=0.7.1,<0.10.0", optional = true }
duckdb-engine = "^0.9.2"
exceptiongroup = "^1.0.4"
repr-llm = "^0.2.1"
repr-llm = "^0.3.0"
structlog = "^23.2.0"

[tool.poetry.group.dev.dependencies]
Expand Down
4 changes: 2 additions & 2 deletions src/dx/formatters/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display as ipydisplay
from pandas.io.json import build_table_schema
from repr_llm.pandas import summarize_dataframe

from dx.formatters.summarizing import make_df_summary
from dx.sampling import get_column_string_lengths, get_df_dimensions, sample_if_too_big
from dx.settings import get_settings
from dx.types.main import DXDisplayMode
Expand Down Expand Up @@ -216,7 +216,7 @@ def format_output(
# add additional payload for LLM consumption; if any parsing/summarizing errors occur, we
# shouldn't block displaying the bundle
try:
payload["text/llm+plain"] = summarize_dataframe(df)
payload["text/llm+plain"] = make_df_summary(df)
except Exception as e:
logger.debug(f"Error in summarize_dataframe: {e}")

Expand Down
54 changes: 54 additions & 0 deletions src/dx/formatters/summarizing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import Callable, Optional

import pandas as pd


class DataFrameSummarizer:
    """Holder for the callable used to turn a DataFrame into a text summary.

    A single shared instance is available through ``instance()``; the
    callable it holds can be read or replaced via ``summarizing_func``.
    When no callable is supplied, ``repr_llm.pandas.summarize_dataframe`` is
    used if that package can be imported. If it cannot, ``summarize()``
    falls back to plain pandas output.
    """

    # Shared instance, created on the first call to ``instance()``.
    _instance: Optional["DataFrameSummarizer"] = None
    # Callable applied by ``summarize()``; ``None`` selects the pandas fallback.
    summarizing_func: Optional[Callable] = None

    def __init__(self, summarizing_func: Optional[Callable] = None):
        """Store ``summarizing_func``, or try repr_llm's summarizer when omitted."""
        if summarizing_func is None:
            self._try_to_load_repr_llm()
        else:
            self.summarizing_func = summarizing_func

    def _try_to_load_repr_llm(self) -> None:
        """Load repr_llm's summarize_dataframe into the summarizing_func if it's available."""
        try:
            from repr_llm.pandas import summarize_dataframe
        except ImportError:
            # repr_llm is not importable: keep summarizing_func as None so
            # that summarize() uses its pandas-only fallback.
            return
        self.summarizing_func = summarize_dataframe

    @classmethod
    def instance(cls) -> "DataFrameSummarizer":
        """Return the shared summarizer, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def summarize(self, df: pd.DataFrame) -> str:
        """Generate a summary of a dataframe using the configured summarizing_func.

        Args:
            df: The DataFrame to summarize.

        Returns:
            Whatever the configured ``summarizing_func`` returns for ``df``.
            Without a configured function, the string form of
            ``df.describe()``, or of ``df`` itself when it has no columns.

        Raises:
            ValueError: If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError("`df` must be a pandas DataFrame")

        if self.summarizing_func is not None:
            return self.summarizing_func(df)

        # ``DataFrame.describe()`` raises ValueError on a frame without
        # columns, so render such a frame directly instead of failing.
        if df.columns.empty:
            return df.to_string()
        return df.describe().to_string()


def get_summarizing_function() -> Optional[Callable]:
    """Return the summarizing function held by the shared DataFrameSummarizer."""
    shared_summarizer = DataFrameSummarizer.instance()
    return shared_summarizer.summarizing_func


def set_summarizing_function(func: Callable) -> None:
    """Install ``func`` as the summarizing function on the shared DataFrameSummarizer."""
    shared_summarizer = DataFrameSummarizer.instance()
    shared_summarizer.summarizing_func = func


def make_df_summary(df: pd.DataFrame) -> str:
    """Summarize ``df`` by delegating to the shared DataFrameSummarizer's ``summarize``."""
    shared_summarizer = DataFrameSummarizer.instance()
    summary = shared_summarizer.summarize(df)
    return summary

0 comments on commit 957e805

Please sign in to comment.