Skip to content

Commit

Permalink
refactor: docs_export_mimetype as top level constant
Browse files Browse the repository at this point in the history
  • Loading branch information
milovate committed Dec 23, 2024
1 parent 7172465 commit b91b325
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 12 deletions.
21 changes: 12 additions & 9 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
import io

import typing
from furl import furl
import requests

from daras_ai_v2.exceptions import UserError
from daras_ai_v2.functional import flatmap_parallel
from daras_ai_v2.exceptions import raise_for_status

# Export targets for Google-native document types: each key is the MIME type
# Drive reports for the file, each value is the MIME type to request when
# exporting it. Lookups elsewhere in this module use
# `.get(mime_type, mime_type)`, so any type not listed here is left unchanged.
docs_export_mimetype = {
    "application/vnd.google-apps.document": "text/plain",
    "application/vnd.google-apps.spreadsheet": "text/csv",
    "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.google-apps.drawing": "application/pdf",
}


def is_gdrive_url(f: furl) -> bool:
    """Return True when the URL's host is one of the two Google Drive/Docs hosts.

    Only ``f.host`` is inspected; the comparison is an exact string match
    against ``drive.google.com`` and ``docs.google.com``.
    """
    host = f.host
    return host == "drive.google.com" or host == "docs.google.com"
Expand Down Expand Up @@ -63,23 +70,19 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str


def gdrive_download(
f: furl, mime_type: str, export_links: dict = {}
f: furl, mime_type: str, export_links: typing.Optional[dict] = None
) -> tuple[bytes, str]:
from googleapiclient import discovery
from googleapiclient.http import MediaIoBaseDownload

if export_links is None:
export_links = {}

# get drive file id
file_id = url_to_gdrive_file_id(f)
# get metadata
service = discovery.build("drive", "v3")

docs_export_mimetype = {
"application/vnd.google-apps.document": "text/plain",
"application/vnd.google-apps.spreadsheet": "text/csv",
"application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.google-apps.drawing": "application/pdf",
}

if f.host != "drive.google.com":
# export google docs to appropriate type
export_mime_type = docs_export_mimetype.get(mime_type, mime_type)
Expand Down
6 changes: 3 additions & 3 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = meta.get("md5Checksum") or meta.get("modifiedTime")
mime_type = meta["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = meta.get("exportLinks", {})
export_links = meta.get("exportLinks", None)
else:
try:
if is_user_uploaded_url(f_url):
Expand All @@ -328,7 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
mime_type = None
etag = None
total_bytes = 0
export_links = {}
export_links = None
else:
name = (
r.headers.get("content-disposition", "")
Expand All @@ -340,7 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = etag.strip('"')
mime_type = get_mimetype_from_response(r)
total_bytes = int(r.headers.get("content-length") or 0)
export_links = {}
export_links = None
# extract filename from url as a fallback
if not name:
if is_user_uploaded_url(f_url):
Expand Down

0 comments on commit b91b325

Please sign in to comment.