From f594d027e0b2f366f71eb78eb451ca63c3fc6a70 Mon Sep 17 00:00:00 2001
From: Alexander Metzger <alexander@metzger.dk>
Date: Tue, 19 Sep 2023 20:38:26 -0700
Subject: [PATCH] inefficient lock, but it always works

---
 daras_ai_v2/asr.py                            |  6 ++-
 daras_ai_v2/glossary.py                       | 42 ++++++-------------
 ...glossaryresources_times_locked_for_read.py | 16 +++++++
 glossary_resources/models.py                  |  1 -
 4 files changed, 32 insertions(+), 33 deletions(-)
 create mode 100644 glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py

diff --git a/daras_ai_v2/asr.py b/daras_ai_v2/asr.py
index 2dd4cffb7..ebed57117 100644
--- a/daras_ai_v2/asr.py
+++ b/daras_ai_v2/asr.py
@@ -189,7 +189,9 @@ def run_google_translate(
         language_codes = [detection["language"] for detection in detections]
 
     return map_parallel(
-        lambda text, source: _translate_text(text, source, target_language),
+        lambda text, source: _translate_text(
+            text, source, target_language, glossary_url or DEFAULT_GLOSSARY_URL
+        ),
         texts,
         language_codes,
     )
@@ -225,7 +227,7 @@ def _translate_text(
         "transliteration_config": {"enable_transliteration": enable_transliteration},
     }
 
-    with glossary_resource(glossary_url) as (uri, _):
+    with glossary_resource(glossary_url) as uri:
         config.update(
             {
                 "glossaryConfig": {
diff --git a/daras_ai_v2/glossary.py b/daras_ai_v2/glossary.py
index 2ee9adf74..0c1610250 100644
--- a/daras_ai_v2/glossary.py
+++ b/daras_ai_v2/glossary.py
@@ -42,54 +42,36 @@ def glossary_resource(f_url: str = DEFAULT_GLOSSARY_URL):
         yield None, None
         return
 
-    # obtain read lock (to allow multiple translation requests to use the same glossary resource)
+    # I could not get this to work with concurrent translate requests without locking everything :(
     with AsyncAtomic():
         resource, created = GlossaryResources.objects.select_for_update().get_or_create(
             f_url=f_url
         )
-        resource.times_locked_for_read += 1
-        resource.uses += 1
-        resource.save()
 
-    # make sure we don't exceed the max number of glossary resources allowed by GCP
-    first_nonlocked = None
-    with AsyncAtomic():
-        if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES:
+        # make sure we don't exceed the max number of glossary resources allowed by GCP (we add a safety buffer of 100 for local development)
+        if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES - 100:
             first_nonlocked = (
                 GlossaryResources.objects.order_by("uses", "last_used")
                 .select_for_update(
                     skip_locked=True
                 )  # important: prevents deadlock and locks this row from being selected for read
-                .filter(times_locked_for_read=0)
                 .first()
             )
             assert first_nonlocked
             first_nonlocked.delete()
-    if first_nonlocked:
-        try:
-            _delete_glossary(glossary_name=first_nonlocked.get_clean_name())
-        except:
-            pass  # great error handling
+            try:
+                _delete_glossary(glossary_name=first_nonlocked.get_clean_name())
+            except:
+                pass  # great error handling
 
-    # write lock to prevent bad interleavings of _update_glossary's internal create and delete operations
-    with AsyncAtomic():
-        resource = GlossaryResources.objects.select_for_update().get(f_url=f_url)
         doc_meta = doc_url_to_metadata(f_url)
-        df = _update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name())
+        _update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name())
         path = _get_glossary(glossary_name=resource.get_clean_name())
 
-    try:
-        # The read lock should prevent most race conditions.
-        # The only possible data race, I believe, is if the glossary resource is deleted during translation
-        # which would only happen if the glossary is a google sheet that has just been updated and two
-        # translate requests are running concurrently. I couldn't find a better way to allow concurrent
-        # translation requests.
-        yield path, df
-    finally:
-        # release read lock
-        with AsyncAtomic():
-            resource = GlossaryResources.objects.select_for_update().get(f_url=f_url)
-            resource.times_locked_for_read -= 1
+        try:
+            yield path
+        finally:
+            resource.uses += 1
             resource.save()
 
 
diff --git a/glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py b/glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py
new file mode 100644
index 000000000..0bfd5269c
--- /dev/null
+++ b/glossary_resources/migrations/0003_remove_glossaryresources_times_locked_for_read.py
@@ -0,0 +1,16 @@
+# Generated by Django 4.2.5 on 2023-09-20 03:25
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("glossary_resources", "0002_glossaryresources_glossary_name"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="glossaryresources",
+            name="times_locked_for_read",
+        ),
+    ]
diff --git a/glossary_resources/models.py b/glossary_resources/models.py
index 19a4b8d65..f714dcced 100644
--- a/glossary_resources/models.py
+++ b/glossary_resources/models.py
@@ -8,7 +8,6 @@ class GlossaryResources(models.Model):
     f_url = CustomURLField(unique=True)
     uses = models.IntegerField(default=0)
     last_used = models.DateTimeField(auto_now=True)
-    times_locked_for_read = models.IntegerField(default=0)
     glossary_name = models.UUIDField(unique=True, default=uuid.uuid4, editable=False)
 
     class Meta: