Skip to content

Commit

Permalink
handle error instead of long atomic block
Browse files (browse the repository at this point in the history)
  • Loading branch information
SanderGi committed Sep 27, 2023
1 parent bd3581c commit 44500d9
Showing 1 changed file with 32 additions and 34 deletions.
66 changes: 32 additions & 34 deletions daras_ai_v2/glossary.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,8 @@
from contextlib import contextmanager
from glossary_resources.models import GlossaryResources
from django.db import transaction
import requests
from time import sleep

DEFAULT_GLOSSARY_URL = "https://docs.google.com/spreadsheets/d/1IRHKcOC86oZXwMB0hR7eej7YVg5kUHpriZymwYQcQX4/edit?usp=sharing" # only viewing access
PROJECT_ID = "dara-c1b52" # GCP project id
Expand Down Expand Up @@ -33,47 +35,42 @@ def glossary_input(

# ================================ Glossary Logic ================================
@contextmanager
def glossary_resource(f_url: str = DEFAULT_GLOSSARY_URL):
def glossary_resource(f_url: str = DEFAULT_GLOSSARY_URL, max_tries=3):
"""
Obtains a glossary resource for use in translation requests.
"""
from daras_ai_v2.vector_search import doc_url_to_metadata

if not f_url:
yield None, None
yield None
return

# I could not get this to work with concurrent translate requests without locking everything :(
with transaction.atomic():
resource, created = GlossaryResources.objects.select_for_update().get_or_create(
f_url=f_url
)
resource, created = GlossaryResources.objects.get_or_create(f_url=f_url)

# make sure we don't exceed the max number of glossary resources allowed by GCP (we add a safety buffer of 100 for local development)
if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES - 100:
for gloss in GlossaryResources.objects.order_by("uses", "last_used")[:10]:
_delete_glossary(glossary_name=gloss.get_clean_name())
gloss.delete()

# make sure we don't exceed the max number of glossary resources allowed by GCP (we add a safety buffer of 100 for local development)
if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES - 100:
first_nonlocked = (
GlossaryResources.objects.order_by("uses", "last_used")
.select_for_update(
skip_locked=True
) # important: prevents deadlock and locks this row from being selected for read
.first()
)
assert first_nonlocked
first_nonlocked.delete()
try:
_delete_glossary(glossary_name=first_nonlocked.get_clean_name())
except:
pass # great error handling

doc_meta = doc_url_to_metadata(f_url)
_update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name())
path = _get_glossary(glossary_name=resource.get_clean_name())

try:
yield path
finally:
resource.uses += 1
resource.save()
doc_meta = doc_url_to_metadata(f_url)
# create glossary if it doesn't exist, update if it has changed
_update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name())
path = _get_glossary(glossary_name=resource.get_clean_name())

try:
yield path
except requests.exceptions.HTTPError as e:
if e.response.status_code == 400 and e.response.json().get("error", {}).get(
"message", ""
).startswith("Invalid resource name"):
sleep(1)
yield glossary_resource(f_url, max_tries - 1)
else:
raise e
finally:
resource.uses += 1
resource.save()


@redis_cache_decorator
Expand All @@ -82,15 +79,16 @@ def _update_glossary(
) -> "pd.DataFrame":
"""Goes through the full process of uploading the glossary from the url"""
from daras_ai_v2.vector_search import download_table_doc
from google.api_core.exceptions import NotFound

df = download_table_doc(f_url, doc_meta)

_upload_glossary_to_bucket(df, glossary_name=glossary_name)
# delete existing glossary
try:
_delete_glossary(glossary_name=glossary_name)
except:
pass # great error handling
except NotFound:
pass # glossary already deleted, moving on
# create new glossary
languages = [
lan_code
Expand Down

0 comments on commit 44500d9

Please sign in to comment.