Skip to content

Commit

Permalink
inefficient lock, but it always works
Browse files (browse the repository at this point in the history)
  • Loading branch information
SanderGi committed Sep 20, 2023
1 parent becb7c9 commit f594d02
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 33 deletions.
6 changes: 4 additions & 2 deletions daras_ai_v2/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,9 @@ def run_google_translate(
language_codes = [detection["language"] for detection in detections]

return map_parallel(
lambda text, source: _translate_text(text, source, target_language),
lambda text, source: _translate_text(
text, source, target_language, glossary_url or DEFAULT_GLOSSARY_URL
),
texts,
language_codes,
)
Expand Down Expand Up @@ -225,7 +227,7 @@ def _translate_text(
"transliteration_config": {"enable_transliteration": enable_transliteration},
}

with glossary_resource(glossary_url) as (uri, _):
with glossary_resource(glossary_url) as uri:
config.update(
{
"glossaryConfig": {
Expand Down
42 changes: 12 additions & 30 deletions daras_ai_v2/glossary.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,54 +42,36 @@ def glossary_resource(f_url: str = DEFAULT_GLOSSARY_URL):
yield None, None
return

# obtain read lock (to allow multiple translation requests to use the same glossary resource)
# I could not get this to work with concurrent translate requests without locking everything :(
with AsyncAtomic():
resource, created = GlossaryResources.objects.select_for_update().get_or_create(
f_url=f_url
)
resource.times_locked_for_read += 1
resource.uses += 1
resource.save()

# make sure we don't exceed the max number of glossary resources allowed by GCP
first_nonlocked = None
with AsyncAtomic():
if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES:
# make sure we don't exceed the max number of glossary resources allowed by GCP (we add a safety buffer of 100 for local development)
if created and GlossaryResources.objects.count() > MAX_GLOSSARY_RESOURCES - 100:
first_nonlocked = (
GlossaryResources.objects.order_by("uses", "last_used")
.select_for_update(
skip_locked=True
) # important: prevents deadlock and locks this row from being selected for read
.filter(times_locked_for_read=0)
.first()
)
assert first_nonlocked
first_nonlocked.delete()
if first_nonlocked:
try:
_delete_glossary(glossary_name=first_nonlocked.get_clean_name())
except:
pass # great error handling
try:
_delete_glossary(glossary_name=first_nonlocked.get_clean_name())
except:
pass # great error handling

# write lock to prevent bad interleavings of _update_glossary's internal create and delete operations
with AsyncAtomic():
resource = GlossaryResources.objects.select_for_update().get(f_url=f_url)
doc_meta = doc_url_to_metadata(f_url)
df = _update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name())
_update_glossary(f_url, doc_meta, glossary_name=resource.get_clean_name())
path = _get_glossary(glossary_name=resource.get_clean_name())

try:
# The read lock should prevent most race conditions.
# The only possible data race, I believe, is if the glossary resource is deleted during translation
# which would only happen if the glossary is a google sheet that has just been updated and two
# translate requests are running concurrently. I couldn't find a better way to allow concurrent
# translation requests.
yield path, df
finally:
# release read lock
with AsyncAtomic():
resource = GlossaryResources.objects.select_for_update().get(f_url=f_url)
resource.times_locked_for_read -= 1
try:
yield path
finally:
resource.uses += 1
resource.save()


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Django 4.2.5 on 2023-09-20 03:25

from django.db import migrations


class Migration(migrations.Migration):
    """Remove the ``times_locked_for_read`` field from ``glossaryresources``."""

    # Must be applied after the migration named below in the same app.
    dependencies = [("glossary_resources", "0002_glossaryresources_glossary_name")]

    # Single schema operation: remove one field from the model.
    operations = [
        migrations.RemoveField(
            name="times_locked_for_read",
            model_name="glossaryresources",
        )
    ]
1 change: 0 additions & 1 deletion glossary_resources/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class GlossaryResources(models.Model):
f_url = CustomURLField(unique=True)
uses = models.IntegerField(default=0)
last_used = models.DateTimeField(auto_now=True)
times_locked_for_read = models.IntegerField(default=0)
glossary_name = models.UUIDField(unique=True, default=uuid.uuid4, editable=False)

class Meta:
Expand Down

0 comments on commit f594d02

Please sign in to comment.