Skip to content

Commit

Permalink
add export links to FileMetadata
Browse files Browse the repository at this point in the history
  • Loading branch information
milovate committed Dec 17, 2024
1 parent 855adce commit 677531b
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 34 deletions.
75 changes: 45 additions & 30 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import io

from furl import furl
import requests
from loguru import logger

from daras_ai_v2.exceptions import UserError
from daras_ai_v2.functional import flatmap_parallel
from daras_ai_v2.exceptions import raise_for_status


def is_gdrive_url(f: furl) -> bool:
Expand Down Expand Up @@ -60,7 +63,7 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
return filter(None, urls)


def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
from googleapiclient import discovery

# get drive file id
Expand All @@ -70,7 +73,7 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:

request, mime_type = service_request(service, file_id, f, mime_type)
file_bytes, mime_type = download_blob_file_content(
service, request, file_id, f, mime_type
service, request, file_id, f, mime_type, export_links
)

return file_bytes, mime_type
Expand All @@ -96,7 +99,7 @@ def service_request(


def download_blob_file_content(
service, request, file_id: str, f: furl, mime_type: str
service, request, file_id: str, f: furl, mime_type: str, export_links: dict
) -> tuple[bytes, str]:
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.errors import HttpError
Expand All @@ -105,38 +108,48 @@ def download_blob_file_content(
file = io.BytesIO()
downloader = MediaIoBaseDownload(file, request)

done = False
try:
while done is False:
_, done = downloader.next_chunk()
# print(f"Download {int(status.progress() * 100)}%")
except HttpError as error:
# retry if error exporting google docs format files e.g .pptx/.docx files uploaded to docs.google.com
if "presentation" in f.path.segments:
# update mime_type to download the file directly
mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
request, _ = service_request(
service, file_id, f, mime_type, retried_request=True
)
downloader = MediaIoBaseDownload(file, request)
done = False
while done is False:
_, done = downloader.next_chunk()
if (
mime_type
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
logger.debug(f"Downloading {str(f)!r} using export links")
f_url_export = export_links.get(mime_type, None)
if f_url_export:
logger.debug(f"Downloading {str(f_url_export)!r} using export links")
f_bytes, mime_type = download_from_exportlinks(f_url_export, mime_type)
else:

elif "document" in f.path.segments:
mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
request, _ = service_request(
service, file_id, f, mime_type, retried_request=True
request = service.files().get_media(
fileId=file_id,
supportsAllDrives=True,
)
downloader = MediaIoBaseDownload(file, request)
done = False
while done is False:
_, done = downloader.next_chunk()
# print(f"Download {int(status.progress() * 100)}%")
f_bytes = file.getvalue()

else:
done = False
while done is False:
_, done = downloader.next_chunk()
# print(f"Download {int(status.progress() * 100)}%")
f_bytes = file.getvalue()

return f_bytes, mime_type

else:
raise error

f_bytes = file.getvalue()
def download_from_exportlinks(f: furl, mime_type: str) -> tuple[bytes, str]:

logger.debug(f"Downloading {str(f)!r} using export links")
try:
r = requests.get(f)
f_bytes = r.content
logger.debug(f"Downloaded {len(f_bytes)} bytes from {str(f)!r}")

except requests.RequestException as e:
raise_for_status(e)
return f_bytes, mime_type


Expand All @@ -157,8 +170,10 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
mime_type = "text/csv"
ext = ".csv"
elif "presentation" in f.path.segments:
mime_type = "application/pdf"
ext = ".pdf"
mime_type = (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
ext = ".pptx"
elif "drawings" in f.path.segments:
mime_type = "application/pdf"
ext = ".pdf"
Expand All @@ -176,7 +191,7 @@ def gdrive_metadata(file_id: str) -> dict:
.get(
supportsAllDrives=True,
fileId=file_id,
fields="name,md5Checksum,modifiedTime,mimeType,size",
fields="name,md5Checksum,modifiedTime,mimeType,size,exportLinks",
)
.execute()
)
Expand Down
12 changes: 8 additions & 4 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = meta.get("md5Checksum") or meta.get("modifiedTime")
mime_type = meta["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = meta.get("exportLinks")
else:
try:
if is_user_uploaded_url(f_url):
Expand Down Expand Up @@ -347,9 +348,12 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
# guess mimetype from name as a fallback
if not mime_type:
mime_type = mimetypes.guess_type(name)[0]
return FileMetadata(

file_metadata = FileMetadata(
name=name, etag=etag, mime_type=mime_type or "", total_bytes=total_bytes
)
file_metadata.export_links = export_links or {}
return file_metadata


def yt_dlp_get_video_entries(url: str) -> list[dict]:
Expand Down Expand Up @@ -650,7 +654,7 @@ def doc_url_to_text_pages(
Download document from url and convert to text pages.
"""
f_bytes, mime_type = download_content_bytes(
f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url
f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url,export_links=file_meta.export_links
)
if not f_bytes:
return []
Expand All @@ -664,14 +668,14 @@ def doc_url_to_text_pages(


def download_content_bytes(
*, f_url: str, mime_type: str, is_user_url: bool = True
*, f_url: str, mime_type: str, is_user_url: bool = True, export_links:dict = None
) -> tuple[bytes, str]:
if is_yt_dlp_able_url(f_url):
return download_youtube_to_wav(f_url), "audio/wav"
f = furl(f_url)
if is_gdrive_url(f):
# download from google drive
return gdrive_download(f, mime_type)
return gdrive_download(f, mime_type,export_links)
try:
# download from url
if is_user_uploaded_url(f_url):
Expand Down
5 changes: 5 additions & 0 deletions files/models.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from django.db import models
from django.template.defaultfilters import filesizeformat
from collections import defaultdict


class FileMetadata(models.Model):
name = models.TextField(default="", blank=True)
etag = models.CharField(max_length=255, null=True)
mime_type = models.CharField(max_length=255, default="", blank=True)
total_bytes = models.PositiveIntegerField(default=0, blank=True)

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.export_links = kwargs.get('export_links', defaultdict(dict))

def __str__(self):
ret = f"{self.name or 'Unnamed'} - {self.mime_type}"
Expand Down

0 comments on commit 677531b

Please sign in to comment.