Skip to content

Commit

Permalink
expand gdrive folders in parallel
Browse files Browse the repository at this point in the history
avoid mutating the list that is being iterated
  • Loading branch information
devxpy committed Feb 8, 2024
1 parent bd3a206 commit 93d786d
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 29 deletions.
26 changes: 17 additions & 9 deletions daras_ai_v2/doc_search_settings_widgets.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import os
import typing

from furl import furl
from sentry_sdk import capture_exception

import gooey_ui as st
from daras_ai_v2 import settings
from daras_ai_v2.asr import AsrModels, google_translate_language_selector
from daras_ai_v2.enum_selector_widget import enum_selector
from daras_ai_v2.gdrive_downloader import gdrive_list_urls_of_files_in_folder
from daras_ai_v2.search_ref import CitationStyles

_user_media_url_prefix = os.path.join(
Expand Down Expand Up @@ -76,20 +80,24 @@ def document_uploader(
accept_multiple_files=accept_multiple_files,
)
documents = st.session_state.get(key, [])
for document in documents:
if not document.startswith("https://drive.google.com/drive/folders"):
continue
from daras_ai_v2.gdrive_downloader import gdrive_list_urls_of_files_in_folder
from furl import furl

folder_content_urls = gdrive_list_urls_of_files_in_folder(furl(document))
documents.remove(document)
documents.extend(folder_content_urls)
try:
documents = list(_expand_gdrive_folders(documents))
except Exception as e:
capture_exception(e)
st.error(f"Error expanding gdrive folders: {e}")
st.session_state[key] = documents
st.session_state[custom_key] = "\n".join(documents)
return documents


def _expand_gdrive_folders(documents: list[str]) -> list[str]:
for url in documents:
if url.startswith("https://drive.google.com/drive/folders"):
yield from gdrive_list_urls_of_files_in_folder(furl(url))
else:
yield url


def doc_search_settings(
asr_allowed: bool = False,
keyword_instructions_allowed: bool = False,
Expand Down
41 changes: 21 additions & 20 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from googleapiclient import discovery
from googleapiclient.http import MediaIoBaseDownload

from daras_ai_v2.functional import flatmap_parallel


def is_gdrive_url(f: furl) -> bool:
    """Report whether the URL's host is drive.google.com or docs.google.com."""
    gdrive_hosts = ("drive.google.com", "docs.google.com")
    return f.host in gdrive_hosts
Expand All @@ -25,34 +27,33 @@ def url_to_gdrive_file_id(f: furl) -> str:
return file_id


def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str]:
    """Return the web-view URLs of all files under a Google Drive folder.

    Sub-folders are expanded recursively (through ``flatmap_parallel``), with
    ``max_depth`` bounding how many folder levels are descended.

    Args:
        f: URL of the Drive folder; its last path segment is used as the
            folder id.
        max_depth: Remaining folder levels to descend. When it is zero or
            negative, an empty list is returned without contacting the API.

    Returns:
        A list of ``webViewLink`` URLs. Entries without a link are dropped.

    Raises:
        AssertionError: If the URL's host is not ``drive.google.com``.
    """
    if max_depth <= 0:
        return []
    assert f.host == "drive.google.com", f"Bad google drive folder url: {f}"
    # get drive folder id from url (e.g. https://drive.google.com/drive/folders/1Xijcsj7oBvDn1OWx4UmNAT8POVKG4W73?usp=drive_link)
    folder_id = f.path.segments[-1]
    service = discovery.build("drive", "v3")
    request = service.files().list(
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        q=f"'{folder_id}' in parents",
        fields="files(mimeType,webViewLink)",
    )
    # NOTE(review): only a single response page is read here; very large
    # folders may need nextPageToken handling -- confirm against the Drive API.
    response = request.execute()
    files = response.get("files", [])

    def _urls_for(file: dict) -> list[str]:
        # Expand one listing entry: folder -> its (recursive) contents,
        # regular file -> its own link, entry without a link -> nothing.
        url = file.get("webViewLink")
        if not url:
            return []
        if file.get("mimeType") == "application/vnd.google-apps.folder":
            return gdrive_list_urls_of_files_in_folder(
                furl(url), max_depth=max_depth - 1
            )
        return [url]

    # Materialize the result so it matches the declared ``list[str]`` return
    # type; a lazy iterator would break len(), indexing and re-iteration.
    return list(flatmap_parallel(_urls_for, files))


def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
Expand Down

0 comments on commit 93d786d

Please sign in to comment.