Skip to content

Commit

Permalink
Merge pull request #268 from GooeyAI/google_folder_expansion
Browse files Browse the repository at this point in the history
Expand Google Drive Folders Into Individual URLs
  • Loading branch information
devxpy authored Feb 9, 2024
2 parents f9723fe + 93d786d commit a8382ae
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 2 deletions.
22 changes: 21 additions & 1 deletion daras_ai_v2/doc_search_settings_widgets.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import os
import typing

from furl import furl
from sentry_sdk import capture_exception

import gooey_ui as st
from daras_ai_v2 import settings
from daras_ai_v2.asr import AsrModels, google_translate_language_selector
from daras_ai_v2.enum_selector_widget import enum_selector
from daras_ai_v2.gdrive_downloader import gdrive_list_urls_of_files_in_folder
from daras_ai_v2.search_ref import CitationStyles

_user_media_url_prefix = os.path.join(
Expand Down Expand Up @@ -75,7 +79,23 @@ def document_uploader(
accept=accept,
accept_multiple_files=accept_multiple_files,
)
return st.session_state.get(key, [])
documents = st.session_state.get(key, [])
try:
documents = list(_expand_gdrive_folders(documents))
except Exception as e:
capture_exception(e)
st.error(f"Error expanding gdrive folders: {e}")
st.session_state[key] = documents
st.session_state[custom_key] = "\n".join(documents)
return documents


def _expand_gdrive_folders(documents: typing.Iterable[str]) -> typing.Iterator[str]:
    """Lazily replace Google Drive folder URLs with the URLs of their files.

    This is a generator: nothing is fetched until the result is iterated, so
    any error raised while listing a folder surfaces at iteration time (the
    caller is expected to consume it inside its own ``try`` block).

    Args:
        documents: Document URLs, in the order they should be emitted.

    Yields:
        Each non-folder URL unchanged; for every URL starting with
        ``https://drive.google.com/drive/folders``, the URLs returned by
        ``gdrive_list_urls_of_files_in_folder`` in its place.
    """
    for url in documents:
        if url.startswith("https://drive.google.com/drive/folders"):
            # Folder link: substitute the files found inside the folder.
            yield from gdrive_list_urls_of_files_in_folder(furl(url))
        else:
            yield url


def doc_search_settings(
Expand Down
31 changes: 31 additions & 0 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from googleapiclient import discovery
from googleapiclient.http import MediaIoBaseDownload

from daras_ai_v2.functional import flatmap_parallel


def is_gdrive_url(f: furl) -> bool:
    """Report whether the host of *f* is ``drive.google.com`` or ``docs.google.com``."""
    host = f.host
    return host == "drive.google.com" or host == "docs.google.com"
Expand All @@ -25,6 +27,35 @@ def url_to_gdrive_file_id(f: furl) -> str:
return file_id


def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str]:
    """Recursively collect the ``webViewLink`` URLs of the files in a Drive folder.

    Sub-folders (mime type ``application/vnd.google-apps.folder``) are expanded
    in place by recursing with ``max_depth - 1``; entries without a
    ``webViewLink`` are dropped.

    Args:
        f: Folder URL on ``drive.google.com`` whose last non-empty path segment
            is the folder id, e.g.
            https://drive.google.com/drive/folders/1Xijcsj7oBvDn1OWx4UmNAT8POVKG4W73?usp=drive_link
        max_depth: How many folder levels may still be descended. When it is
            ``<= 0`` nothing is listed and an empty list is returned.

    Returns:
        A list of file URLs (always a real ``list``, on every code path).

    Raises:
        AssertionError: If *f* is not a ``drive.google.com`` URL or carries no
            folder id in its path.
    """
    if max_depth <= 0:
        return []
    assert f.host == "drive.google.com", f"Bad google drive folder url: {f}"

    # A trailing slash leaves an empty final segment, so take the last
    # non-empty one as the folder id.
    segments = [segment for segment in f.path.segments if segment]
    assert segments, f"Bad google drive folder url: {f}"
    folder_id = segments[-1]
    # The id comes from a user-supplied URL: escape it before embedding it in
    # the single-quoted Drive query literal.
    quoted_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")

    service = discovery.build("drive", "v3")
    files_api = service.files()
    request = files_api.list(
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        q=f"'{quoted_id}' in parents",
        # nextPageToken must be requested explicitly or paging cannot continue
        fields="nextPageToken,files(mimeType,webViewLink)",
    )

    # files.list is paginated; keep fetching until the client reports that no
    # further page exists (list_next returns None).
    files = []
    while request is not None:
        response = request.execute()
        files.extend(response.get("files", []))
        request = files_api.list_next(request, response)

    def _urls_for(file):
        # Folders expand to their (recursive) contents, anything else to itself.
        url = file.get("webViewLink")
        if url and file.get("mimeType") == "application/vnd.google-apps.folder":
            return gdrive_list_urls_of_files_in_folder(
                furl(url), max_depth=max_depth - 1
            )
        return [url]

    urls = flatmap_parallel(_urls_for, files)
    # Drop entries that had no webViewLink and materialise the result so the
    # return value matches the annotated list type.
    return [url for url in urls if url]


def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
# get drive file id
file_id = url_to_gdrive_file_id(f)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,4 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
force-exclude = "migrations"
--force-exclude = "migrations|node_modules|\\.git|\\.venv|\\.env|\\.pytest_cache|\\.vscode|\\.github|\\.to"

0 comments on commit a8382ae

Please sign in to comment.