Skip to content

Commit

Permalink
fixed merge issues, upgraded replicate to work with master's new version of urllib
Browse files Browse the repository at this point in the history
  • Loading branch information
SanderGi committed Oct 18, 2023
1 parent 588b4a5 commit ea8d035
Show file tree
Hide file tree
Showing 3 changed files with 1,882 additions and 1,646 deletions.
25 changes: 14 additions & 11 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,13 @@ def _download_doc_content(f_url: str, doc_meta: DocMetadata):
return ext, f_name, f_bytes


def download_content_bytes(f_url: str, mime_type: str):
    """Download the document at ``f_url`` and return its bytes and extension.

    A placeholder ``DocMetadata`` (empty name and etag) carrying only the
    given ``mime_type`` is built and handed to ``_download_doc_content``.

    Args:
        f_url: URL of the document to download.
        mime_type: MIME type stored on the placeholder metadata.

    Returns:
        A ``(f_bytes, ext)`` tuple; note the order is the reverse of what
        ``_download_doc_content`` yields, and the file name is discarded.
    """
    placeholder_meta = DocMetadata(name="", etag="", mime_type=mime_type)
    ext, _f_name, f_bytes = _download_doc_content(f_url, placeholder_meta)
    return f_bytes, ext


@redis_cache_decorator
def doc_url_to_text_pages(
*,
Expand Down Expand Up @@ -486,7 +493,7 @@ def doc_url_to_text_pages(
)
pages = [run_asr(f_url, selected_model=selected_asr_model, language="en")]
case _:
df = bytes_to_df(f_name=f_name, f_bytes=f_bytes, ext=ext)
df = bytes_to_df(f_name=f_name, f_bytes=f_bytes, ext=ext).fillna("")
assert (
"snippet" in df.columns or "sections" in df.columns
), f'uploaded spreadsheet must contain a "snippet" or "sections" column - {f_name !r}'
Expand Down Expand Up @@ -514,9 +521,13 @@ def bytes_to_df(
df = pd.read_json(f, dtype=str)
case ".xml":
df = pd.read_xml(f, dtype=str)
case ".ods":
df = pd.read_excel(f, engine="odf", dtype=str)
case ".gsheet":
df = pd.read_csv(f, dtype=str)
case _:
raise ValueError(f"Unsupported document format {ext!r} ({f_name})")
return df.fillna("")
return df


def pdf_to_text_pages(f: typing.BinaryIO) -> list[str]:
Expand Down Expand Up @@ -558,15 +569,7 @@ def pandoc_to_text(f_name: str, f_bytes: bytes, to="plain") -> str:

def download_table_doc(f_url: str, doc_meta: DocMetadata) -> "pd.DataFrame":
    """Download a tabular document and parse it into a DataFrame.

    The raw bytes are fetched with ``_download_doc_content`` and parsed by
    ``bytes_to_df``, so each extension goes through its own reader instead of
    every format being pushed through ``pd.read_csv`` (which cannot parse
    spreadsheet formats such as ``.xlsx`` or ``.ods``).

    Args:
        f_url: URL of the document to download.
        doc_meta: Metadata passed through to ``_download_doc_content``.

    Returns:
        The parsed table with every row that contains a missing value removed.

    Raises:
        ValueError: Raised by ``bytes_to_df`` when the extension is not one
            it supports.
    """
    ext, f_name, f_bytes = _download_doc_content(f_url, doc_meta)
    # dropna() discards any row with at least one missing cell.
    # NOTE(review): this only has an effect if bytes_to_df leaves missing
    # cells as NaN rather than filling them - confirm against bytes_to_df.
    return bytes_to_df(f_name=f_name, f_bytes=f_bytes, ext=ext).dropna()


def render_sources_widget(refs: list[SearchReference]):
Expand Down
Loading

0 comments on commit ea8d035

Please sign in to comment.