From f594d027e0b2f366f71eb78eb451ca63c3fc6a70 Mon Sep 17 00:00:00 2001 From: Alexander Metzger Date: Tue, 19 Sep 2023 20:38:26 -0700 Subject: [PATCH] inefficient lock, but it always works --- daras_ai_v2/asr.py | 6 ++- daras_ai_v2/glossary.py | 42 ++++++------------- ...glossaryresources_times_locked_for_read.py | 16 +++++++ glossary_resources/models.py | 1 - 4 files changed, 32 insertions(+), 33 deletions(-) create mode 100644 glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py index 2dd4cffb7..ebed57117 100644 --- a/daras_ai_v2/asr.py +++ b/daras_ai_v2/asr.py @@ -189,7 +189,9 @@ def run_google_translate( language_codes = [detection["language"] for detection in detections] return map_parallel( - lambda text, source: _translate_text(text, source, target_language), + lambda text, source: _translate_text( + text, source, target_language, glossary_url or DEFAULT_GLOSSARY_URL + ), texts, language_codes, ) @@ -225,7 +227,7 @@ def _translate_text( "transliteration_config": {"enable_transliteration": enable_transliteration}, } - with glossary_resource(glossary_url) as (uri, _): + with glossary_resource(glossary_url) as uri: config.update( { "glossaryConfig": { diff --git a/daras_ai_v2/glossary.py b/daras_ai_v2/glossary.py index 2ee9adf74..0c1610250 100644 --- a/daras_ai_v2/glossary.py +++ b/daras_ai_v2/glossary.py @@ -42,54 +42,36 @@ def glossary_resource(f_url: str = DEFAULT_GLOSSARY_URL): yield None, None return - # obtain read lock (to allow multiple translation requests to use the same glossary resource) + # I could not get this to work with concurrent translate requests without locking everything :( with AsyncAtomic(): resource, created = GlossaryResources.objects.select_for_update().get_or_create( f_url=f_url ) - resource.times_locked_for_read += 1 - resource.uses += 1 - resource.save() - # make sure we don't exceed the max number of glossary resources allowed by GCP - first_nonlocked = None - with AsyncAtomic(): - if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES: + # make sure we don't exceed the max number of glossary resources allowed by GCP (we add a safety buffer of 100 for local development) + if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES - 100: first_nonlocked = ( GlossaryResources.objects.order_by("uses", "last_used") .select_for_update( skip_locked=True ) # important: prevents deadlock and locks this row from being selected for read - .filter(times_locked_for_read=0) .first() ) assert first_nonlocked first_nonlocked.delete() - if first_nonlocked: - try: - _delete_glossary(glossary_name=first_nonlocked.get_clean_name()) - except: - pass # great error handling + try: + _delete_glossary(glossary_name=first_nonlocked.get_clean_name()) + except: + pass # great error handling - # write lock to prevent bad interleavings of _update_glossary's internal create and delete operations - with AsyncAtomic(): - resource = GlossaryResources.objects.select_for_update().get(f_url=f_url) doc_meta = doc_url_to_metadata(f_url) - df = _update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name()) + _update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name()) path = _get_glossary(glossary_name=resource.get_clean_name()) - try: - # The read lock should prevent most race conditions. - # The only possible data race, I believe, is if the glossary resource is deleted during translation - # which would only happen if the glossary is a google sheet that has just been updated and two - # translate requests are running concurrently. I couldn't find a better way to allow concurrent - # translation requests. - yield path, df - finally: - # release read lock - with AsyncAtomic(): - resource = GlossaryResources.objects.select_for_update().get(f_url=f_url) - resource.times_locked_for_read -= 1 + try: + yield path + finally: + resource.uses += 1 resource.save() diff --git a/glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py b/glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py new file mode 100644 index 000000000..0bfd5269c --- /dev/null +++ b/glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py @@ -0,0 +1,16 @@ +# Generated by Django 4.2.5 on 2023-09-20 03:25 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("glossary_resources", "0002_glossaryresources_glossary_name"), + ] + + operations = [ + migrations.RemoveField( + model_name="glossaryresources", + name="times_locked_for_read", + ), + ] diff --git a/glossary_resources/models.py b/glossary_resources/models.py index 19a4b8d65..f714dcced 100644 --- a/glossary_resources/models.py +++ b/glossary_resources/models.py @@ -8,7 +8,6 @@ class GlossaryResources(models.Model): f_url = CustomURLField(unique=True) uses = models.IntegerField(default=0) last_used = models.DateTimeField(auto_now=True) - times_locked_for_read = models.IntegerField(default=0) glossary_name = models.UUIDField(unique=True, default=uuid.uuid4, editable=False) class Meta: