diff --git a/daras_ai_v2/vector_search.py b/daras_ai_v2/vector_search.py
index 59e399a8c..3b6282cfd 100644
--- a/daras_ai_v2/vector_search.py
+++ b/daras_ai_v2/vector_search.py
@@ -424,7 +424,7 @@ def doc_url_to_text_pages(
     doc_meta: DocMetadata,
     google_translate_target: str | None,
     selected_asr_model: str | None,
-) -> list[str]:
+) -> typing.Union[list[str], "pd.DataFrame"]:
     """
     Download document from url and convert to text pages.
 
@@ -437,33 +437,61 @@ def doc_url_to_text_pages(
     Returns:
         list of text pages
     """
+    f_bytes, ext = download_content_bytes(f_url=f_url, mime_type=doc_meta.mime_type)
+    if not f_bytes:
+        return []
+    pages = bytes_to_text_pages_or_df(
+        f_url=f_url,
+        f_name=doc_meta.name,
+        f_bytes=f_bytes,
+        ext=ext,
+        mime_type=doc_meta.mime_type,
+        selected_asr_model=selected_asr_model,
+    )
+    # optionally, translate text
+    if google_translate_target and isinstance(pages, list):
+        pages = run_google_translate(pages, google_translate_target)
+    return pages
+
+
+def download_content_bytes(*, f_url: str, mime_type: str) -> tuple[bytes, str]:
     f = furl(f_url)
-    f_name = doc_meta.name
     if is_gdrive_url(f):
         # download from google drive
-        f_bytes, ext = gdrive_download(f, doc_meta.mime_type)
-    else:
+        return gdrive_download(f, mime_type)
+    try:
         # download from url
+        r = requests.get(
+            f_url,
+            headers={"User-Agent": random.choice(FAKE_USER_AGENTS)},
+            timeout=settings.EXTERNAL_REQUEST_TIMEOUT_SEC,
+        )
+        r.raise_for_status()
+    except requests.RequestException as e:
+        print(f"ignore error while downloading {f_url}: {e}")
+        return b"", ""
+    f_bytes = r.content
+    # if it's a known encoding, standardize to utf-8
+    if r.encoding:
         try:
-            r = requests.get(
-                f_url,
-                headers={"User-Agent": random.choice(FAKE_USER_AGENTS)},
-                timeout=settings.EXTERNAL_REQUEST_TIMEOUT_SEC,
-            )
-            r.raise_for_status()
-        except requests.RequestException as e:
-            print(f"ignore error while downloading {f_url}: {e}")
-            return []
-        f_bytes = r.content
-        # if it's a known encoding, standardize to utf-8
-        if r.encoding:
-            try:
-                codec = codecs.lookup(r.encoding)
-            except LookupError:
-                pass
-            else:
-                f_bytes = codec.decode(f_bytes)[0].encode()
-        ext = guess_ext_from_response(r)
+            codec = codecs.lookup(r.encoding)
+        except LookupError:
+            pass
+        else:
+            f_bytes = codec.decode(f_bytes)[0].encode()
+    ext = guess_ext_from_response(r)
+    return f_bytes, ext
+
+
+def bytes_to_text_pages_or_df(
+    *,
+    f_url: str,
+    f_name: str,
+    f_bytes: bytes,
+    ext: str,
+    mime_type: str,
+    selected_asr_model: str | None,
+) -> typing.Union[list[str], "pd.DataFrame"]:
     # convert document to text pages
     match ext:
         case ".pdf":
@@ -477,25 +505,42 @@ def doc_url_to_text_pages(
                 raise ValueError(
                     "For transcribing audio/video, please choose an ASR model from the settings!"
                 )
-            if is_gdrive_url(f):
-                f_url = upload_file_from_bytes(
-                    f_name, f_bytes, content_type=doc_meta.mime_type
-                )
+            if is_gdrive_url(furl(f_url)):
+                f_url = upload_file_from_bytes(f_name, f_bytes, content_type=mime_type)
             pages = [run_asr(f_url, selected_model=selected_asr_model, language="en")]
-        case ".csv" | ".xlsx" | ".tsv" | ".ods":
-            import pandas as pd
-
-            df = pd.read_csv(io.BytesIO(f_bytes), dtype=str).fillna("")
+        case _:
+            df = bytes_to_df(f_name=f_name, f_bytes=f_bytes, ext=ext)
             assert (
                 "snippet" in df.columns or "sections" in df.columns
             ), f'uploaded spreadsheet must contain a "snippet" or "sections" column - {f_name !r}'
-            pages = df
+            return df
+
+    return pages
+
+
+def bytes_to_df(
+    *,
+    f_name: str,
+    f_bytes: bytes,
+    ext: str,
+) -> "pd.DataFrame":
+    import pandas as pd
+
+    f = io.BytesIO(f_bytes)
+    match ext:
+        case ".csv":
+            df = pd.read_csv(f, dtype=str)
+        case ".tsv":
+            df = pd.read_csv(f, sep="\t", dtype=str)
+        case ".xls" | ".xlsx":
+            df = pd.read_excel(f, dtype=str)
+        case ".json":
+            df = pd.read_json(f, dtype=str)
+        case ".xml":
+            df = pd.read_xml(f, dtype=str)
         case _:
             raise ValueError(f"Unsupported document format {ext!r} ({f_name})")
-    # optionally, translate text
-    if google_translate_target:
-        pages = run_google_translate(pages, google_translate_target)
-    return pages
+    return df.fillna("")
 
 
 def pdf_to_text_pages(f: typing.BinaryIO) -> list[str]:
@@ -534,10 +579,6 @@ def pandoc_to_text(f_name: str, f_bytes: bytes, to="plain") -> str:
         subprocess.check_call(args)
         return outfile.read()
 
-    refs = st.session_state.get("references", [])
-    if not refs:
-        return
-
 
 def render_sources_widget(refs: list[SearchReference]):
     if not refs:
diff --git a/recipes/BulkRunner.py b/recipes/BulkRunner.py
index 5ba9f44c1..e7ef5f373 100644
--- a/recipes/BulkRunner.py
+++ b/recipes/BulkRunner.py
@@ -1,8 +1,6 @@
 import io
 import typing
 
-import pandas as pd
-import requests
 from fastapi import HTTPException
 from furl import furl
 from pydantic import BaseModel, Field
@@ -14,6 +12,10 @@
 from daras_ai_v2.doc_search_settings_widgets import document_uploader
 from daras_ai_v2.functional import map_parallel
 from daras_ai_v2.query_params_util import extract_query_params
+from daras_ai_v2.vector_search import (
+    doc_url_to_metadata,
+    download_content_bytes,
+)
 from recipes.DocSearch import render_documents
 
 CACHED_COLUMNS = "__cached_columns"
@@ -71,15 +73,17 @@ def render_form_v2(self):
         )
 
         if files:
+            dfs = map_parallel(_read_df, files)
             st.session_state[CACHED_COLUMNS] = list(
                 {
                     col: None
-                    for df in map_parallel(_read_df, files)
+                    for df in dfs
                    for col in df.columns
                     if not col.startswith("Unnamed:")
                 }
             )
         else:
+            dfs = []
             st.session_state.pop(CACHED_COLUMNS, None)
 
         required_input_fields = {}
@@ -145,12 +149,26 @@ def render_form_v2(self):
         st.write(
             """
 ##### Input Data Preview
-Here's how we've parsed your data.
+Here's how we've parsed your data. 
             """
         )
 
-        for file in files:
-            st.data_table(file)
+        for df in dfs:
+            st.text_area(
+                "",
+                value=df.to_string(
+                    max_cols=10, max_rows=10, max_colwidth=40, show_dimensions=True
+                ),
+                label_visibility="collapsed",
+                disabled=True,
+                style={
+                    "white-space": "pre",
+                    "overflow": "scroll",
+                    "font-family": "monospace",
+                    "font-size": "0.9rem",
+                },
+                height=250,
+            )
 
         if not (required_input_fields or optional_input_fields):
             return
@@ -218,6 +236,8 @@ def run_v2(
         request: "BulkRunnerPage.RequestModel",
         response: "BulkRunnerPage.ResponseModel",
     ) -> typing.Iterator[str | None]:
+        import pandas as pd
+
         response.output_documents = []
 
         for doc_ix, doc in enumerate(request.documents):
@@ -408,21 +428,25 @@ def is_arr(field_props: dict) -> bool:
     return False
 
 
-def _read_df(f: str) -> "pd.DataFrame":
+def _read_df(f_url: str) -> "pd.DataFrame":
     import pandas as pd
 
-    r = requests.get(f)
-    r.raise_for_status()
-    if f.endswith(".csv"):
-        df = pd.read_csv(io.StringIO(r.text))
-    elif f.endswith(".xlsx") or f.endswith(".xls"):
-        df = pd.read_excel(io.BytesIO(r.content))
-    elif f.endswith(".json"):
-        df = pd.read_json(io.StringIO(r.text))
-    elif f.endswith(".tsv"):
-        df = pd.read_csv(io.StringIO(r.text), sep="\t")
-    elif f.endswith(".xml"):
-        df = pd.read_xml(io.StringIO(r.text))
-    else:
-        raise ValueError(f"Unsupported file type: {f}")
-    return df.dropna(how="all", axis=1).dropna(how="all", axis=0)
+    doc_meta = doc_url_to_metadata(f_url)
+    f_bytes, ext = download_content_bytes(f_url=f_url, mime_type=doc_meta.mime_type)
+
+    f = io.BytesIO(f_bytes)
+    match ext:
+        case ".csv":
+            df = pd.read_csv(f)
+        case ".tsv":
+            df = pd.read_csv(f, sep="\t")
+        case ".xls" | ".xlsx":
+            df = pd.read_excel(f)
+        case ".json":
+            df = pd.read_json(f)
+        case ".xml":
+            df = pd.read_xml(f)
+        case _:
+            raise ValueError(f"Unsupported file type: {f_url}")
+
+    return df.dropna(how="all", axis=1).dropna(how="all", axis=0).fillna("")