From 50c848af71ae20956fc06a2e339ea20196dcdea9 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 23 Aug 2024 19:34:08 +0100 Subject: [PATCH 01/56] Add `PROCESS_REPEATERS` toggle --- corehq/toggles/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/corehq/toggles/__init__.py b/corehq/toggles/__init__.py index 1de3ab96ad02..4d67b6719528 100644 --- a/corehq/toggles/__init__.py +++ b/corehq/toggles/__init__.py @@ -1956,6 +1956,18 @@ def _commtrackify(domain_name, toggle_is_enabled): [NAMESPACE_DOMAIN] ) +PROCESS_REPEATERS = FeatureRelease( + 'process_repeaters', + 'Process repeaters instead of processing repeat records independently', + TAG_INTERNAL, + [NAMESPACE_DOMAIN], + owner='Norman Hooper', + description=""" + Manages repeat records through their repeater in order to make + smarter decisions about remote endpoints. + """ +) + DO_NOT_RATE_LIMIT_SUBMISSIONS = StaticToggle( 'do_not_rate_limit_submissions', 'Do not rate limit submissions for this project, on a temporary basis.', From 0db6a6c4f4ca90d3650f7d7beaf753042b97df8f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 23 Aug 2024 20:51:51 +0100 Subject: [PATCH 02/56] `process_repeaters()` task --- corehq/motech/repeaters/const.py | 2 + corehq/motech/repeaters/models.py | 10 +++ corehq/motech/repeaters/tasks.py | 67 +++++++++++++++++++-- corehq/motech/repeaters/tests/test_tasks.py | 59 +++++++++++++++++- 4 files changed, 131 insertions(+), 7 deletions(-) diff --git a/corehq/motech/repeaters/const.py b/corehq/motech/repeaters/const.py index cec83aa1d125..669dd1306965 100644 --- a/corehq/motech/repeaters/const.py +++ b/corehq/motech/repeaters/const.py @@ -13,6 +13,8 @@ CHECK_REPEATERS_INTERVAL = timedelta(minutes=5) CHECK_REPEATERS_PARTITION_COUNT = settings.CHECK_REPEATERS_PARTITION_COUNT CHECK_REPEATERS_KEY = 'check-repeaters-key' +PROCESS_REPEATERS_INTERVAL = timedelta(minutes=1) +PROCESS_REPEATERS_KEY = 'process-repeaters-key' ENDPOINT_TIMER = 'endpoint_timer' # Number of attempts to an online endpoint before cancelling payload MAX_ATTEMPTS = 3 diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 8e693a81b529..c5d5c2386198 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1171,6 +1171,9 @@ def attempt_forward_now(self, *, is_retry=False, fire_synchronously=False): retry_process_datasource_repeat_record, ) + if toggles.PROCESS_REPEATERS.enabled(self.domain, toggles.NAMESPACE_DOMAIN): + return + if self.next_check is None or self.next_check > datetime.utcnow(): return @@ -1327,3 +1330,10 @@ def domain_can_forward(domain): domain_has_privilege(domain, ZAPIER_INTEGRATION) or domain_has_privilege(domain, DATA_FORWARDING) ) + + +def domain_can_forward_now(domain): + return ( + domain_can_forward(domain) + and not toggles.PAUSE_DATA_FORWARDING.enabled(domain) + ) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index eb6e5403594a..0ae80661456d 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -11,6 +11,11 @@ from corehq import toggles from corehq.apps.celery import periodic_task, task from corehq.motech.models import RequestLog +from corehq.motech.rate_limiter import ( + rate_limit_repeater, + report_repeater_attempt, + report_repeater_usage, +) from corehq.util.metrics import ( make_buckets_from_timedeltas, metrics_counter, @@ -27,15 +32,16 @@ CHECK_REPEATERS_PARTITION_COUNT, ENDPOINT_TIMER, MAX_RETRY_WAIT, + PROCESS_REPEATERS_INTERVAL, + PROCESS_REPEATERS_KEY, RATE_LIMITER_DELAY_RANGE, State, ) -from .models import RepeatRecord, domain_can_forward - -from ..rate_limiter import ( - rate_limit_repeater, - report_repeater_attempt, - report_repeater_usage, +from .models import ( + Repeater, + RepeatRecord, + domain_can_forward, + domain_can_forward_now, ) _check_repeaters_buckets = make_buckets_from_timedeltas( @@ -215,6 +221,55 @@ def _process_repeat_record(repeat_record): ) +@periodic_task( + run_every=PROCESS_REPEATERS_INTERVAL, + queue=settings.CELERY_PERIODIC_QUEUE, +) +def process_repeaters(): + """ + Processes repeaters, instead of processing repeat records + independently the way that ``check_repeaters()`` does. + """ + process_repeater_lock = get_redis_lock( + PROCESS_REPEATERS_KEY, + timeout=24 * 60 * 60, + name=PROCESS_REPEATERS_KEY, + ) + if not process_repeater_lock.acquire(blocking=False): + return + try: + for domain, repeater_id in iter_ready_repeater_ids_forever(): + process_repeater.delay(domain, repeater_id) + finally: + process_repeater_lock.release() + + +def iter_ready_repeater_ids_forever(): + """ + Cycles through repeaters (repeatedly ;) ) until there are no more + repeat records ready to be sent. + """ + while True: + yielded = False + for repeater in Repeater.objects.all_ready(): + if not toggles.PROCESS_REPEATERS.enabled(repeater.domain): + continue + if not domain_can_forward_now(repeater.domain): + continue + yielded = True + yield repeater.domain, repeater.repeater_id + + if not yielded: + # No repeaters are ready, or their domains can't forward or + # are paused. + return + + +@task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) +def process_repeater(domain, repeater_id): + ... + + metrics_gauge_task( 'commcare.repeaters.overdue', RepeatRecord.objects.count_overdue, diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 455d28142945..4552429016ed 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -2,7 +2,7 @@ from datetime import datetime, timedelta from unittest.mock import patch -from django.test import TestCase +from django.test import SimpleTestCase, TestCase from corehq.apps.receiverwrapper.util import submit_form_locally from corehq.form_processor.models import XFormInstance @@ -15,6 +15,7 @@ from corehq.motech.repeaters.tasks import ( _process_repeat_record, delete_old_request_logs, + iter_ready_repeater_ids_forever, ) from ..const import State @@ -241,3 +242,59 @@ def patch(self): self.mock_domain_can_forward = patch_domain_can_forward.start() self.mock_domain_can_forward.return_value = True self.addCleanup(patch_domain_can_forward.stop) + + +class TestIterReadyRepeaterIDsForever(SimpleTestCase): + + def test_no_ready_repeaters(self): + with ( + patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', + return_value=[]), # <-- + patch('corehq.motech.repeaters.tasks.domain_can_forward_now', + return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + return_value=True) + ): + self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) + + def test_domain_cant_forward_now(self): + with ( + patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', + return_value=[Repeater()]), + patch('corehq.motech.repeaters.tasks.domain_can_forward_now', + return_value=False), # <-- + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + return_value=True) + ): + self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) + + def test_process_repeaters_not_enabled(self): + with ( + patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', + return_value=[Repeater()]), + patch('corehq.motech.repeaters.models.domain_can_forward', + return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + return_value=False) # <-- + ): + self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) + + def test_successive_loops(self): + repeater_1 = Repeater(domain=DOMAIN, name='foo') + repeater_2 = Repeater(domain=DOMAIN, name='bar') + with ( + patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', + side_effect=[[repeater_1, repeater_2], [repeater_1], []]), + patch('corehq.motech.repeaters.tasks.domain_can_forward_now', + return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + return_value=True) + ): + repeaters = list(iter_ready_repeater_ids_forever()) + self.assertEqual(len(repeaters), 3) + repeater_ids = [r[1] for r in repeaters] + self.assertEqual(repeater_ids, [ + repeater_1.repeater_id, + repeater_2.repeater_id, + repeater_1.repeater_id, + ]) From e36296c833ccd97263a7e2fad226256e4df65196 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 23 Aug 2024 21:10:04 +0100 Subject: [PATCH 03/56] `get_repeater_lock()` --- corehq/motech/repeaters/tasks.py | 26 +++++++++++++++++---- corehq/motech/repeaters/tests/test_tasks.py | 3 ++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 0ae80661456d..f16b8b7c907e 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -1,10 +1,13 @@ import random +import uuid from datetime import datetime, timedelta from django.conf import settings from celery.schedules import crontab from celery.utils.log import get_task_logger +from django_redis import get_redis_connection +from redis.lock import Lock from dimagi.utils.couch import get_redis_lock @@ -24,6 +27,7 @@ metrics_histogram_timer, ) from corehq.util.metrics.const import MPM_MAX +from corehq.util.metrics.lockmeter import MeteredLock from corehq.util.timer import TimingContext from .const import ( @@ -238,8 +242,8 @@ def process_repeaters(): if not process_repeater_lock.acquire(blocking=False): return try: - for domain, repeater_id in iter_ready_repeater_ids_forever(): - process_repeater.delay(domain, repeater_id) + for domain, repeater_id, lock_token in iter_ready_repeater_ids_forever(): + process_repeater.delay(domain, repeater_id, lock_token) finally: process_repeater_lock.release() @@ -256,8 +260,12 @@ def iter_ready_repeater_ids_forever(): continue if not domain_can_forward_now(repeater.domain): continue - yielded = True - yield repeater.domain, repeater.repeater_id + + lock = get_repeater_lock(repeater.repeater_id) + lock_token = uuid.uuid1().hex # The same way Lock does it + if lock.acquire(blocking=False, token=lock_token): + yielded = True + yield repeater.domain, repeater.repeater_id, lock_token if not yielded: # No repeaters are ready, or their domains can't forward or @@ -265,8 +273,16 @@ def iter_ready_repeater_ids_forever(): return +def get_repeater_lock(repeater_id): + redis = get_redis_connection() + name = f'process_repeater_{repeater_id}' + three_hours = 3 * 60 * 60 + lock = Lock(redis, name, timeout=three_hours) + return MeteredLock(lock, name) + + @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) -def process_repeater(domain, repeater_id): +def process_repeater(domain, repeater_id, lock_token): ... diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 4552429016ed..7b3454ea32d9 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -288,7 +288,8 @@ def test_successive_loops(self): patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=True) + return_value=True), + patch('corehq.motech.repeaters.tasks.get_repeater_lock'), ): repeaters = list(iter_ready_repeater_ids_forever()) self.assertEqual(len(repeaters), 3) From aeb10baf021dcdf2e88a35f243233bd2a3078b2a Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 23 Aug 2024 23:22:37 +0100 Subject: [PATCH 04/56] `iter_ready_repeater_ids_once()` --- corehq/motech/repeaters/models.py | 7 ++ corehq/motech/repeaters/tasks.py | 47 ++++++++++-- corehq/motech/repeaters/tests/test_models.py | 8 ++ corehq/motech/repeaters/tests/test_tasks.py | 77 ++++++++++++++++---- 4 files changed, 120 insertions(+), 19 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index c5d5c2386198..8ccc929abcb6 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -235,6 +235,13 @@ def all_ready(self): .filter(next_attempt_not_in_the_future) .filter(repeat_records_ready_to_send)) + def get_all_ready_ids_by_domain(self): + results = defaultdict(list) + query = self.all_ready().values_list('domain', 'id') + for (domain, id_uuid) in query.all(): + results[domain].append(id_uuid.hex) + return results + def get_queryset(self): repeater_obj = self.model() if type(repeater_obj).__name__ == "Repeater": diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index f16b8b7c907e..9a04a7a5eba5 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -255,17 +255,17 @@ def iter_ready_repeater_ids_forever(): """ while True: yielded = False - for repeater in Repeater.objects.all_ready(): - if not toggles.PROCESS_REPEATERS.enabled(repeater.domain): + for domain, repeater_id in iter_ready_repeater_ids_once(): + if not toggles.PROCESS_REPEATERS.enabled(domain, toggles.NAMESPACE_DOMAIN): continue - if not domain_can_forward_now(repeater.domain): + if not domain_can_forward_now(domain): continue - lock = get_repeater_lock(repeater.repeater_id) + lock = get_repeater_lock(repeater_id) lock_token = uuid.uuid1().hex # The same way Lock does it if lock.acquire(blocking=False, token=lock_token): yielded = True - yield repeater.domain, repeater.repeater_id, lock_token + yield domain, repeater_id, lock_token if not yielded: # No repeaters are ready, or their domains can't forward or @@ -273,6 +273,43 @@ def iter_ready_repeater_ids_forever(): return +def iter_ready_repeater_ids_once(): + """ + Yields domain-repeater_id tuples in a round-robin fashion. + + e.g. :: + ('domain1', 'repeater_id1'), + ('domain2', 'repeater_id2'), + ('domain3', 'repeater_id3'), + ('domain1', 'repeater_id4'), + ('domain2', 'repeater_id5'), + ... + + """ + + def iter_domain_repeaters(dom): + try: + rep_id = repeater_ids_by_domain[dom].pop(0) + except IndexError: + return + else: + yield rep_id + + repeater_ids_by_domain = Repeater.objects.get_all_ready_ids_by_domain() + while True: + if not repeater_ids_by_domain: + return + for domain in list(repeater_ids_by_domain.keys()): + try: + repeater_id = next(iter_domain_repeaters(domain)) + except StopIteration: + # We've exhausted the repeaters for this domain + del repeater_ids_by_domain[domain] + continue + else: + yield domain, repeater_id + + def get_repeater_lock(repeater_id): redis = get_redis_connection() name = f'process_repeater_{repeater_id}' diff --git a/corehq/motech/repeaters/tests/test_models.py b/corehq/motech/repeaters/tests/test_models.py index 75905c7231d5..80ff8046bfd1 100644 --- a/corehq/motech/repeaters/tests/test_models.py +++ b/corehq/motech/repeaters/tests/test_models.py @@ -193,6 +193,14 @@ def test_all_ready_next_past(self): self.assertEqual(len(repeaters), 1) self.assertEqual(repeaters[0].id, self.repeater.id) + def test_all_ready_ids(self): + with make_repeat_record(self.repeater, RECORD_PENDING_STATE): + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual( + dict(repeater_ids), + {self.repeater.domain: [self.repeater.repeater_id]} + ) + @contextmanager def make_repeat_record(repeater, state): diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 7b3454ea32d9..901e515ae39b 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -4,6 +4,8 @@ from django.test import SimpleTestCase, TestCase +from nose.tools import assert_equal + from corehq.apps.receiverwrapper.util import submit_form_locally from corehq.form_processor.models import XFormInstance from corehq.form_processor.utils.xform import ( @@ -16,6 +18,7 @@ _process_repeat_record, delete_old_request_logs, iter_ready_repeater_ids_forever, + iter_ready_repeater_ids_once, ) from ..const import State @@ -248,7 +251,7 @@ class TestIterReadyRepeaterIDsForever(SimpleTestCase): def test_no_ready_repeaters(self): with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', + patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', return_value=[]), # <-- patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), @@ -259,8 +262,8 @@ def test_no_ready_repeaters(self): def test_domain_cant_forward_now(self): with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', - return_value=[Repeater()]), + patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', + return_value=[(DOMAIN, 'abc123')]), patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=False), # <-- patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', @@ -270,8 +273,8 @@ def test_domain_cant_forward_now(self): def test_process_repeaters_not_enabled(self): with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', - return_value=[Repeater()]), + patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', + return_value=[(DOMAIN, 'abc123')]), patch('corehq.motech.repeaters.models.domain_can_forward', return_value=True), patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', @@ -280,11 +283,23 @@ def test_process_repeaters_not_enabled(self): self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) def test_successive_loops(self): - repeater_1 = Repeater(domain=DOMAIN, name='foo') - repeater_2 = Repeater(domain=DOMAIN, name='bar') with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.all_ready', - side_effect=[[repeater_1, repeater_2], [repeater_1], []]), + patch( + 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + side_effect=[ + { + # See test_iter_ready_repeater_ids_once() + 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], + 'domain2': ['repeater_id4', 'repeater_id5'], + 'domain3': ['repeater_id6'], + }, + { + 'domain1': ['repeater_id1', 'repeater_id2'], + 'domain2': ['repeater_id4'] + }, + {}, + ] + ), patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', @@ -292,10 +307,44 @@ def test_successive_loops(self): patch('corehq.motech.repeaters.tasks.get_repeater_lock'), ): repeaters = list(iter_ready_repeater_ids_forever()) - self.assertEqual(len(repeaters), 3) - repeater_ids = [r[1] for r in repeaters] + self.assertEqual(len(repeaters), 9) + repeater_ids = [(r[0], r[1]) for r in repeaters] self.assertEqual(repeater_ids, [ - repeater_1.repeater_id, - repeater_2.repeater_id, - repeater_1.repeater_id, + # First loop + ('domain1', 'repeater_id1'), + ('domain2', 'repeater_id4'), + ('domain3', 'repeater_id6'), + ('domain1', 'repeater_id2'), + ('domain2', 'repeater_id5'), + ('domain1', 'repeater_id3'), + + # Second loop + ('domain1', 'repeater_id1'), + ('domain2', 'repeater_id4'), + ('domain1', 'repeater_id2'), ]) + + +def test_iter_ready_repeater_ids_once(): + with patch( + 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + return_value={ + 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], + 'domain2': ['repeater_id4', 'repeater_id5'], + 'domain3': ['repeater_id6'], + } + ): + pairs = list(iter_ready_repeater_ids_once()) + assert_equal(pairs, [ + # First round of domains + ('domain1', 'repeater_id1'), + ('domain2', 'repeater_id4'), + ('domain3', 'repeater_id6'), + + # Second round + ('domain1', 'repeater_id2'), + ('domain2', 'repeater_id5'), + + # Third round + ('domain1', 'repeater_id3'), + ]) From 01e4bc788c606a9e72707c4c761b943fa04f3d5f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 23 Aug 2024 23:51:02 +0100 Subject: [PATCH 05/56] Skip rate-limited repeaters --- corehq/motech/repeaters/tasks.py | 2 + corehq/motech/repeaters/tests/test_tasks.py | 61 +++++++++++++++------ 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 9a04a7a5eba5..95cd4da6f098 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -260,6 +260,8 @@ def iter_ready_repeater_ids_forever(): continue if not domain_can_forward_now(domain): continue + if rate_limit_repeater(domain, repeater_id): + continue lock = get_repeater_lock(repeater_id) lock_token = uuid.uuid1().hex # The same way Lock does it diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 901e515ae39b..a1889033b5fd 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -249,6 +249,22 @@ def patch(self): class TestIterReadyRepeaterIDsForever(SimpleTestCase): + @staticmethod + def all_ready_ids_by_domain(): + return [ + { + # See test_iter_ready_repeater_ids_once() + 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], + 'domain2': ['repeater_id4', 'repeater_id5'], + 'domain3': ['repeater_id6'], + }, + { + 'domain1': ['repeater_id1', 'repeater_id2'], + 'domain2': ['repeater_id4'] + }, + {}, + ] + def test_no_ready_repeaters(self): with ( patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', @@ -284,26 +300,14 @@ def test_process_repeaters_not_enabled(self): def test_successive_loops(self): with ( - patch( - 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - side_effect=[ - { - # See test_iter_ready_repeater_ids_once() - 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], - 'domain2': ['repeater_id4', 'repeater_id5'], - 'domain3': ['repeater_id6'], - }, - { - 'domain1': ['repeater_id1', 'repeater_id2'], - 'domain2': ['repeater_id4'] - }, - {}, - ] - ), + patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + side_effect=self.all_ready_ids_by_domain()), patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', return_value=True), + patch('corehq.motech.repeaters.tasks.rate_limit_repeater', + return_value=False), patch('corehq.motech.repeaters.tasks.get_repeater_lock'), ): repeaters = list(iter_ready_repeater_ids_forever()) @@ -324,6 +328,31 @@ def test_successive_loops(self): ('domain1', 'repeater_id2'), ]) + def test_rate_limit(self): + with ( + patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + side_effect=self.all_ready_ids_by_domain()), + patch('corehq.motech.repeaters.tasks.domain_can_forward_now', + return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + return_value=True), + patch('corehq.motech.repeaters.tasks.rate_limit_repeater', + side_effect=lambda dom, rep: dom == 'domain2' and rep == 'repeater_id4'), + patch('corehq.motech.repeaters.tasks.get_repeater_lock'), + ): + repeaters = list(iter_ready_repeater_ids_forever()) + self.assertEqual(len(repeaters), 7) + repeater_ids = [(r[0], r[1]) for r in repeaters] + self.assertEqual(repeater_ids, [ + ('domain1', 'repeater_id1'), + ('domain3', 'repeater_id6'), + ('domain1', 'repeater_id2'), + ('domain2', 'repeater_id5'), + ('domain1', 'repeater_id3'), + ('domain1', 'repeater_id1'), + ('domain1', 'repeater_id2'), + ]) + def test_iter_ready_repeater_ids_once(): with patch( From db2fec2159cf88bee2d92ae312abdf197d8a5356 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 24 Aug 2024 00:51:59 +0100 Subject: [PATCH 06/56] `process_repeater()` task --- corehq/motech/repeaters/models.py | 23 ++++- corehq/motech/repeaters/tasks.py | 95 ++++++++++++++++++- .../motech/repeaters/tests/test_repeater.py | 8 +- corehq/util/metrics/lockmeter.py | 6 ++ corehq/util/metrics/tests/test_lockmeter.py | 20 +++- settings.py | 4 + 6 files changed, 146 insertions(+), 10 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 8ccc929abcb6..0e03cc29fdd0 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -73,6 +73,7 @@ from http import HTTPStatus from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse +from django.conf import settings from django.db import models, router from django.db.models.base import Deferred from django.dispatch import receiver @@ -357,9 +358,21 @@ def _repeater_type(cls): @property def repeat_records_ready(self): - return self.repeat_records.filter(state__in=(State.Pending, State.Fail)) + """ + A QuerySet of repeat records in the Pending or Fail state in the + order in which they were registered + """ + return ( + self.repeat_records + .filter(state__in=(State.Pending, State.Fail)) + .order_by('registered_at') + ) - def set_next_attempt(self): + @property + def num_workers(self): + return settings.DEFAULT_REPEATER_WORKERS + + def set_backoff(self): now = datetime.utcnow() interval = _get_retry_interval(self.last_attempt_at, now) self.last_attempt_at = now @@ -372,7 +385,7 @@ def set_next_attempt(self): next_attempt_at=now + interval, ) - def reset_next_attempt(self): + def reset_backoff(self): if self.last_attempt_at or self.next_attempt_at: self.last_attempt_at = None self.next_attempt_at = None @@ -1168,7 +1181,9 @@ def fire(self, force_send=False, timing_context=None): self.repeater.fire_for_record(self, timing_context=timing_context) except Exception as e: self.handle_payload_error(str(e), traceback_str=traceback.format_exc()) - raise + finally: + return self.state + return None def attempt_forward_now(self, *, is_retry=False, fire_synchronously=False): from corehq.motech.repeaters.tasks import ( diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 95cd4da6f098..63103ffdfe1b 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -4,6 +4,7 @@ from django.conf import settings +from celery import chord from celery.schedules import crontab from celery.utils.log import get_task_logger from django_redis import get_redis_connection @@ -322,7 +323,99 @@ def get_repeater_lock(repeater_id): @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) def process_repeater(domain, repeater_id, lock_token): - ... + + def get_task_signature(repeat_record): + task_ = { + State.Pending: process_pending_repeat_record, + State.Fail: process_failed_repeat_record, + }[repeat_record.state] + return task_.s(repeat_record.id, repeat_record.domain) + + repeater = Repeater.objects.get(domain=domain, id=repeater_id) + repeat_records = repeater.repeat_records_ready[:repeater.num_workers] + header_tasks = [get_task_signature(rr) for rr in repeat_records] + chord(header_tasks)(update_repeater.s(repeater_id, lock_token)) + + +@task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) +def process_pending_repeat_record(repeat_record_id, domain): + # NOTE: Keep separate from `process_failed_repeat_record()` for + # monitoring purposes. `domain` is for tagging in Datadog + return process_ready_repeat_record(repeat_record_id) + + +@task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) +def process_failed_repeat_record(repeat_record_id, domain): + # NOTE: Keep separate from `process_pending_repeat_record()` for + # monitoring purposes. `domain` is for tagging in Datadog + return process_ready_repeat_record(repeat_record_id) + + +def process_ready_repeat_record(repeat_record_id): + state_or_none = None + with TimingContext('process_repeat_record') as timer: + try: + repeat_record = ( + RepeatRecord.objects + .prefetch_related('repeater', 'attempt_set') + .get(id=repeat_record_id) + ) + report_repeater_attempt(repeat_record.repeater.repeater_id) + if not is_repeat_record_ready(repeat_record): + return None + with timer('fire_timing') as fire_timer: + state_or_none = repeat_record.fire(timing_context=fire_timer) + report_repeater_usage( + repeat_record.domain, + # round up to the nearest millisecond, meaning always at least 1ms + milliseconds=int(fire_timer.duration * 1000) + 1 + ) + except Exception: + logging.exception(f'Failed to process repeat record {repeat_record_id}') + return state_or_none + + +def is_repeat_record_ready(repeat_record): + # Fail loudly if repeat_record is not ready. + # process_ready_repeat_record() will log an exception. + assert repeat_record.state in (State.Pending, State.Fail) + + # The repeater could have been paused or rate-limited while it was + # being processed + return ( + not repeat_record.repeater.is_paused + and not toggles.PAUSE_DATA_FORWARDING.enabled(repeat_record.domain) + and not rate_limit_repeater( + repeat_record.domain, + repeat_record.repeater.repeater_id + ) + ) + + +@task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) +def update_repeater(repeat_record_states, repeater_id, lock_token): + """ + Determines whether the repeater should back off, based on the + results of ``_process_repeat_record()`` tasks. + """ + try: + repeater = Repeater.objects.get(id=repeater_id) + if any(s == State.Success for s in repeat_record_states): + # At least one repeat record was sent successfully. The + # remote endpoint is healthy. + repeater.reset_backoff() + elif all(s in (State.Empty, State.InvalidPayload, None) + for s in repeat_record_states): + # We can't tell anything about the remote endpoint. + # (_process_repeat_record() can return None on an exception.) + pass + else: + # All sent payloads failed. Try again later. + repeater.set_backoff() + finally: + lock = get_repeater_lock(repeater_id) + lock.local.token = lock_token + lock.release() metrics_gauge_task( diff --git a/corehq/motech/repeaters/tests/test_repeater.py b/corehq/motech/repeaters/tests/test_repeater.py index 7115b6133ccc..84a31f2a23c3 100644 --- a/corehq/motech/repeaters/tests/test_repeater.py +++ b/corehq/motech/repeaters/tests/test_repeater.py @@ -690,10 +690,10 @@ def test_payload_exception_on_fire(self): with patch('corehq.motech.repeaters.models.simple_request') as mock_simple_post: mock_simple_post.return_value.status_code = 503 # Fail and retry rr = self.repeater.register(case) - with self.assertRaises(Exception): - with patch.object(CaseRepeater, 'get_payload', side_effect=Exception('Boom!')): - rr.fire() + with patch.object(CaseRepeater, 'get_payload', side_effect=Exception('Boom!')): + state_or_none = rr.fire() + self.assertEqual(state_or_none, State.InvalidPayload) repeat_record = RepeatRecord.objects.get(id=rr.id) self.assertEqual(repeat_record.state, State.InvalidPayload) self.assertEqual(repeat_record.failure_reason, 'Boom!') @@ -1348,7 +1348,7 @@ def test_pause_and_set_next_attempt(self): self.assertIsNone(repeater_a.next_attempt_at) self.assertFalse(repeater_b.is_paused) - repeater_a.set_next_attempt() + repeater_a.set_backoff() repeater_b.pause() repeater_c = Repeater.objects.get(id=self.repeater.repeater_id) diff --git a/corehq/util/metrics/lockmeter.py b/corehq/util/metrics/lockmeter.py index 692793b24863..27b77dbc113f 100644 --- a/corehq/util/metrics/lockmeter.py +++ b/corehq/util/metrics/lockmeter.py @@ -56,6 +56,12 @@ def release(self): self.lock_trace.finish() self.lock_trace = None + @property + def local(self): + # Used for setting lock token + # Raises AttributeError if lock does not have a `local` attribute + return self.lock.local + def __enter__(self): self.acquire(blocking=True) return self diff --git a/corehq/util/metrics/tests/test_lockmeter.py b/corehq/util/metrics/tests/test_lockmeter.py index cd5a96b53aee..e658b2423463 100644 --- a/corehq/util/metrics/tests/test_lockmeter.py +++ b/corehq/util/metrics/tests/test_lockmeter.py @@ -1,9 +1,14 @@ +import threading from unittest import TestCase +from unittest.mock import call, patch +from uuid import uuid1 import attr -from unittest.mock import call, patch +from django_redis import get_redis_connection +from redis.lock import Lock from corehq.util.metrics.tests.utils import capture_metrics + from ..lockmeter import MeteredLock @@ -155,6 +160,19 @@ def test_del_untracked(self): lock.__del__() self.assertListEqual(tracer.mock_calls, []) + def test_local(self): + redis = get_redis_connection() + name = uuid1().hex + with Lock(redis, name, timeout=5) as redis_lock: + lock = MeteredLock(redis_lock, name) + self.assertEqual(type(lock.local), threading.local) + + def test_no_local(self): + fake = FakeLock() + lock = MeteredLock(fake, "test") + with self.assertRaises(AttributeError): + lock.local + @attr.s class FakeLock(object): diff --git a/settings.py b/settings.py index 5131ef9eb623..e9a3331eedcd 100755 --- a/settings.py +++ b/settings.py @@ -628,6 +628,10 @@ "ucr_queue": None, } +# The default number of repeat_record_queue workers that one repeater +# can use to send repeat records at the same time. +DEFAULT_REPEATER_WORKERS = 7 + # websockets config WEBSOCKET_URL = '/ws/' WS4REDIS_PREFIX = 'ws' From 85b952e7a0d1fefbf1412ae99a9446f13ee1641f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sun, 4 Aug 2024 04:38:01 +0100 Subject: [PATCH 07/56] Add tests --- corehq/motech/repeaters/tests/test_tasks.py | 120 +++++++++++++++++++- 1 file changed, 118 insertions(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index a1889033b5fd..47ab52aa2bf9 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -1,6 +1,8 @@ +from collections import namedtuple from contextlib import contextmanager from datetime import datetime, timedelta -from unittest.mock import patch +from unittest.mock import MagicMock, patch +from uuid import uuid4 from django.test import SimpleTestCase, TestCase @@ -13,13 +15,16 @@ TestFormMetadata, ) from corehq.motech.models import ConnectionSettings, RequestLog -from corehq.motech.repeaters.models import Repeater, RepeatRecord +from corehq.motech.repeaters.models import FormRepeater, Repeater, RepeatRecord from corehq.motech.repeaters.tasks import ( _process_repeat_record, delete_old_request_logs, iter_ready_repeater_ids_forever, iter_ready_repeater_ids_once, + process_repeater, + update_repeater, ) +from corehq.util.test_utils import _create_case, flag_enabled from ..const import State @@ -28,6 +33,9 @@ 'naoi', 'deich'] +ResponseMock = namedtuple('ResponseMock', 'status_code reason') + + class TestDeleteOldRequestLogs(TestCase): def test_raw_delete_logs_old(self): @@ -377,3 +385,111 @@ def test_iter_ready_repeater_ids_once(): # Third round ('domain1', 'repeater_id3'), ]) + + +@flag_enabled('PROCESS_REPEATERS') +class TestProcessRepeater(TestCase): + + @classmethod + def setUpClass(cls): + super().setUpClass() + + cls.set_backoff_patch = patch.object(FormRepeater, 'set_backoff') + cls.set_backoff_patch.start() + cls.addClassCleanup(cls.set_backoff_patch.stop) + + connection_settings = ConnectionSettings.objects.create( + domain=DOMAIN, + url='http://www.example.com/api/' + ) + cls.repeater = FormRepeater.objects.create( + domain=DOMAIN, + connection_settings=connection_settings, + ) + + def test_process_repeater_sends_repeat_record(self): + payload, __ = _create_case( + domain=DOMAIN, + case_id=str(uuid4()), + case_type='case', + owner_id='abc123' + ) + self.repeater.register(payload) + + with ( + patch('corehq.motech.repeaters.models.simple_request') as request_mock, + patch('corehq.motech.repeaters.tasks.get_repeater_lock') + ): + request_mock.return_value = ResponseMock(status_code=200, reason='OK') + process_repeater(DOMAIN, self.repeater.repeater_id, 'token') + + request_mock.assert_called_once() + + def test_process_repeater_updates_repeater(self): + payload, __ = _create_case( + domain=DOMAIN, + case_id=str(uuid4()), + case_type='case', + owner_id='abc123' + ) + self.repeater.register(payload) + + with ( + patch('corehq.motech.repeaters.models.simple_request') as request_mock, + patch('corehq.motech.repeaters.tasks.get_repeater_lock') + ): + request_mock.return_value = ResponseMock( + status_code=429, + reason='Too Many Requests', + ) + process_repeater(DOMAIN, self.repeater.repeater_id, 'token') + + self.repeater.set_backoff.assert_called_once() + + +@flag_enabled('PROCESS_REPEATERS') +class TestUpdateRepeater(SimpleTestCase): + + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + + update_repeater([State.Success, State.Fail, State.Empty, None], 1, 'token') + + mock_repeater.set_backoff.assert_not_called() + mock_repeater.reset_backoff.assert_called_once() + + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + + update_repeater([State.Fail, State.Empty, None], 1, 'token') + + mock_repeater.set_backoff.assert_called_once() + mock_repeater.reset_backoff.assert_not_called() + + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + + update_repeater([State.Empty], 1, 'token') + + mock_repeater.set_backoff.assert_not_called() + mock_repeater.reset_backoff.assert_not_called() + + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + + update_repeater([None], 1, 'token') + + mock_repeater.set_backoff.assert_not_called() + mock_repeater.reset_backoff.assert_not_called() From c28c11b84578ef5613931fa7012278cf438a07a9 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 24 Aug 2024 01:49:48 +0100 Subject: [PATCH 08/56] `Repeater.max_workers` field --- .../migrations/0014_repeater_max_workers.py | 16 ++++++++++++++++ corehq/motech/repeaters/models.py | 6 +++++- migrations.lock | 1 + settings.py | 5 +++++ 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 corehq/motech/repeaters/migrations/0014_repeater_max_workers.py diff --git a/corehq/motech/repeaters/migrations/0014_repeater_max_workers.py b/corehq/motech/repeaters/migrations/0014_repeater_max_workers.py new file mode 100644 index 000000000000..66240342f454 --- /dev/null +++ b/corehq/motech/repeaters/migrations/0014_repeater_max_workers.py @@ -0,0 +1,16 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("repeaters", "0013_alter_repeatrecord_state_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="repeater", + name="max_workers", + field=models.IntegerField(default=0), + ), + ] diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 0e03cc29fdd0..65eee77157d2 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -268,6 +268,7 @@ class Repeater(RepeaterSuperProxy): is_paused = models.BooleanField(default=False) next_attempt_at = models.DateTimeField(null=True, blank=True) last_attempt_at = models.DateTimeField(null=True, blank=True) + max_workers = models.IntegerField(default=0) options = JSONField(default=dict) connection_settings_id = models.IntegerField(db_index=True) is_deleted = models.BooleanField(default=False, db_index=True) @@ -370,7 +371,10 @@ def repeat_records_ready(self): @property def num_workers(self): - return settings.DEFAULT_REPEATER_WORKERS + # If num_workers is 1, repeat records are sent in the order in + # which they were registered. + num_workers = self.max_workers or settings.DEFAULT_REPEATER_WORKERS + return min(num_workers, settings.MAX_REPEATER_WORKERS) def set_backoff(self): now = datetime.utcnow() diff --git a/migrations.lock b/migrations.lock index 0e1a62d79cda..74031295af7f 100644 --- a/migrations.lock +++ b/migrations.lock @@ -833,6 +833,7 @@ repeaters 0011_remove_obsolete_entities 0012_formexpressionrepeater_arcgisformexpressionrepeater 0013_alter_repeatrecord_state_and_more + 0014_repeater_max_workers reports 0001_initial 0002_auto_20171121_1803 diff --git a/settings.py b/settings.py index e9a3331eedcd..42e8ce0953f4 100755 --- a/settings.py +++ b/settings.py @@ -631,6 +631,11 @@ # The default number of repeat_record_queue workers that one repeater # can use to send repeat records at the same time. DEFAULT_REPEATER_WORKERS = 7 +# The hard limit for the number of repeat_record_queue workers that one +# repeater can use to send repeat records at the same time. This is a +# guardrail to prevent one repeater from hogging repeat_record_queue +# workers and to ensure that repeaters are iterated fairly. +MAX_REPEATER_WORKERS = 144 # websockets config WEBSOCKET_URL = '/ws/' From d8d9642341d6b91815827008d9c1c69f78485f5e Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 28 Aug 2024 15:41:11 +0100 Subject: [PATCH 09/56] Index fields used by `RepeaterManager.all_ready()` --- .../0015_alter_repeatrecord_state_and_more.py | 35 +++++++++++++++++++ corehq/motech/repeaters/models.py | 33 ++++++++++++----- migrations.lock | 1 + 3 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py diff --git a/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py b/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py new file mode 100644 index 000000000000..83036dc7302c --- /dev/null +++ b/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py @@ -0,0 +1,35 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("repeaters", "0014_repeater_max_workers"), + ] + + operations = [ + migrations.AlterField( + model_name="repeatrecord", + name="state", + field=models.PositiveSmallIntegerField( + choices=[ + (1, "Pending"), + (2, "Failed"), + (4, "Succeeded"), + (8, "Cancelled"), + (16, "Empty"), + (32, "Invalid Payload"), + ], + db_index=True, + default=1, + ), + ), + migrations.AddIndex( + model_name="repeater", + index=models.Index( + condition=models.Q(("is_paused", False)), + fields=["next_attempt_at"], + name="next_attempt_at_not_paused_idx", + ), + ), + ] diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 65eee77157d2..f36a7318b0d7 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -231,10 +231,12 @@ def all_ready(self): repeat_records_ready_to_send = models.Q( repeat_records__state__in=(State.Pending, State.Fail) ) - return (self.get_queryset() - .filter(not_paused) - .filter(next_attempt_not_in_the_future) - .filter(repeat_records_ready_to_send)) + return ( + self.get_queryset() + .filter(not_paused) + .filter(next_attempt_not_in_the_future) + .filter(repeat_records_ready_to_send) + ) def get_all_ready_ids_by_domain(self): results = defaultdict(list) @@ -280,6 +282,13 @@ class Repeater(RepeaterSuperProxy): class Meta: db_table = 'repeaters_repeater' + indexes = [ + models.Index( + fields=['next_attempt_at'], + condition=models.Q(is_paused=False), + name='next_attempt_at_not_paused_idx', + ), + ] payload_generator_classes = () @@ -1001,11 +1010,17 @@ def get_domains_with_records(self): class RepeatRecord(models.Model): domain = models.CharField(max_length=126) payload_id = models.CharField(max_length=255) - repeater = models.ForeignKey(Repeater, - on_delete=DB_CASCADE, - db_column="repeater_id_", - related_name='repeat_records') - state = models.PositiveSmallIntegerField(choices=State.choices, default=State.Pending) + repeater = models.ForeignKey( + Repeater, + on_delete=DB_CASCADE, + db_column="repeater_id_", + related_name='repeat_records', + ) + state = models.PositiveSmallIntegerField( + choices=State.choices, + default=State.Pending, + db_index=True, + ) registered_at = models.DateTimeField() next_check = models.DateTimeField(null=True, default=None) max_possible_tries = models.IntegerField(default=MAX_BACKOFF_ATTEMPTS) diff --git a/migrations.lock b/migrations.lock index 74031295af7f..6c2ca3472698 100644 --- a/migrations.lock +++ b/migrations.lock @@ -834,6 +834,7 @@ repeaters 0012_formexpressionrepeater_arcgisformexpressionrepeater 0013_alter_repeatrecord_state_and_more 0014_repeater_max_workers + 0015_alter_repeatrecord_state_and_more reports 0001_initial 0002_auto_20171121_1803 From 48c3d7cc9f5080cf84ae366f4633926fa1dc7f09 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 28 Aug 2024 17:57:46 +0100 Subject: [PATCH 10/56] Use quickcache. Prefilter enabled domains. --- corehq/motech/repeaters/models.py | 1 + corehq/motech/repeaters/tasks.py | 16 ++- corehq/motech/repeaters/tests/test_tasks.py | 106 +++++++++++++++----- 3 files changed, 95 insertions(+), 28 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index f36a7318b0d7..3484dec0b4b7 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1373,6 +1373,7 @@ def domain_can_forward(domain): ) +@quickcache(['domain'], timeout=60) def domain_can_forward_now(domain): return ( domain_can_forward(domain) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 63103ffdfe1b..57fc27835cdc 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -257,8 +257,6 @@ def iter_ready_repeater_ids_forever(): while True: yielded = False for domain, repeater_id in iter_ready_repeater_ids_once(): - if not toggles.PROCESS_REPEATERS.enabled(domain, toggles.NAMESPACE_DOMAIN): - continue if not domain_can_forward_now(domain): continue if rate_limit_repeater(domain, repeater_id): @@ -298,7 +296,7 @@ def iter_domain_repeaters(dom): else: yield rep_id - repeater_ids_by_domain = Repeater.objects.get_all_ready_ids_by_domain() + repeater_ids_by_domain = get_repeater_ids_by_domain() while True: if not repeater_ids_by_domain: return @@ -321,6 +319,16 @@ def get_repeater_lock(repeater_id): return MeteredLock(lock, name) +def get_repeater_ids_by_domain(): + repeater_ids_by_domain = Repeater.objects.get_all_ready_ids_by_domain() + enabled_domains = set(toggles.PROCESS_REPEATERS.get_enabled_domains()) + return { + domain: repeater_ids + for domain, repeater_ids in repeater_ids_by_domain.items() + if domain in enabled_domains + } + + @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) def process_repeater(domain, repeater_id, lock_token): @@ -384,7 +392,7 @@ def is_repeat_record_ready(repeat_record): # being processed return ( not repeat_record.repeater.is_paused - and not toggles.PAUSE_DATA_FORWARDING.enabled(repeat_record.domain) + and domain_can_forward_now(repeat_record.domain) and not rate_limit_repeater( repeat_record.domain, repeat_record.repeater.repeater_id diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 47ab52aa2bf9..ae3c2629fb09 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -19,6 +19,7 @@ from corehq.motech.repeaters.tasks import ( _process_repeat_record, delete_old_request_logs, + get_repeater_ids_by_domain, iter_ready_repeater_ids_forever, iter_ready_repeater_ids_once, process_repeater, @@ -275,34 +276,34 @@ def all_ready_ids_by_domain(): def test_no_ready_repeaters(self): with ( - patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', - return_value=[]), # <-- + patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + return_value={}), # <-- patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=True) + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain1', 'domain2', 'domain3']), ): self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) def test_domain_cant_forward_now(self): with ( - patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', - return_value=[(DOMAIN, 'abc123')]), + patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + side_effect=self.all_ready_ids_by_domain()), patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=False), # <-- - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=True) + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain1', 'domain2', 'domain3']), ): self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) def test_process_repeaters_not_enabled(self): with ( - patch('corehq.motech.repeaters.tasks.iter_ready_repeater_ids_once', - return_value=[(DOMAIN, 'abc123')]), - patch('corehq.motech.repeaters.models.domain_can_forward', + patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + side_effect=self.all_ready_ids_by_domain()), + patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=False) # <-- + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=[]), # <-- ): self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) @@ -312,8 +313,8 @@ def test_successive_loops(self): side_effect=self.all_ready_ids_by_domain()), patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain1', 'domain2', 'domain3']), patch('corehq.motech.repeaters.tasks.rate_limit_repeater', return_value=False), patch('corehq.motech.repeaters.tasks.get_repeater_lock'), @@ -342,8 +343,8 @@ def test_rate_limit(self): side_effect=self.all_ready_ids_by_domain()), patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain1', 'domain2', 'domain3']), patch('corehq.motech.repeaters.tasks.rate_limit_repeater', side_effect=lambda dom, rep: dom == 'domain2' and rep == 'repeater_id4'), patch('corehq.motech.repeaters.tasks.get_repeater_lock'), @@ -361,15 +362,43 @@ def test_rate_limit(self): ('domain1', 'repeater_id2'), ]) + def test_disabled_domains_excluded(self): + with ( + patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + side_effect=self.all_ready_ids_by_domain()), + patch('corehq.motech.repeaters.tasks.domain_can_forward_now', + return_value=True), + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain2', 'domain3']), # <-- + patch('corehq.motech.repeaters.tasks.rate_limit_repeater', + return_value=False), + patch('corehq.motech.repeaters.tasks.get_repeater_lock'), + ): + repeaters = list(iter_ready_repeater_ids_forever()) + self.assertEqual(len(repeaters), 4) + repeater_ids = [(r[0], r[1]) for r in repeaters] + self.assertEqual(repeater_ids, [ + ('domain2', 'repeater_id4'), + ('domain3', 'repeater_id6'), + ('domain2', 'repeater_id5'), + ('domain2', 'repeater_id4'), + ]) + def test_iter_ready_repeater_ids_once(): - with patch( - 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - return_value={ - 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], - 'domain2': ['repeater_id4', 'repeater_id5'], - 'domain3': ['repeater_id6'], - } + with ( + patch( + 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + return_value={ + 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], + 'domain2': ['repeater_id4', 'repeater_id5'], + 'domain3': ['repeater_id6'], + } + ), + patch( + 'corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain1', 'domain2', 'domain3'], + ), ): pairs = list(iter_ready_repeater_ids_once()) assert_equal(pairs, [ @@ -387,6 +416,28 @@ def test_iter_ready_repeater_ids_once(): ]) +def test_get_repeater_ids_by_domain(): + with ( + patch( + 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', + return_value={ + 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], + 'domain2': ['repeater_id4', 'repeater_id5'], + 'domain3': ['repeater_id6'], + } + ), + patch( + 'corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', + return_value=['domain2', 'domain3', 'domain4'], + ), + ): + repeater_ids_by_domain = get_repeater_ids_by_domain() + assert_equal(repeater_ids_by_domain, { + 'domain2': ['repeater_id4', 'repeater_id5'], + 'domain3': ['repeater_id6'], + }) + + @flag_enabled('PROCESS_REPEATERS') class TestProcessRepeater(TestCase): @@ -394,6 +445,13 @@ class TestProcessRepeater(TestCase): def setUpClass(cls): super().setUpClass() + can_forward_now_patch = patch( + 'corehq.motech.repeaters.tasks.domain_can_forward_now', + return_value=True, + ) + can_forward_now_patch = can_forward_now_patch.start() + cls.addClassCleanup(can_forward_now_patch.stop) + cls.set_backoff_patch = patch.object(FormRepeater, 'set_backoff') cls.set_backoff_patch.start() cls.addClassCleanup(cls.set_backoff_patch.stop) From 418ed3afe7f853edeeb1160bd025608ce85a0c99 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Aug 2024 13:23:59 +0100 Subject: [PATCH 11/56] Check randomly-enabled domains --- corehq/motech/repeaters/tasks.py | 8 ++++++-- corehq/motech/repeaters/tests/test_tasks.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 57fc27835cdc..dad7e09cfd14 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -321,11 +321,15 @@ def get_repeater_lock(repeater_id): def get_repeater_ids_by_domain(): repeater_ids_by_domain = Repeater.objects.get_all_ready_ids_by_domain() - enabled_domains = set(toggles.PROCESS_REPEATERS.get_enabled_domains()) + always_enabled_domains = set(toggles.PROCESS_REPEATERS.get_enabled_domains()) return { domain: repeater_ids for domain, repeater_ids in repeater_ids_by_domain.items() - if domain in enabled_domains + if ( + domain in always_enabled_domains + # FeatureRelease toggle: Check whether domain is randomly enabled + or toggles.PROCESS_REPEATERS.enabled(domain, toggles.NAMESPACE_DOMAIN) + ) } diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index ae3c2629fb09..ed633d7404da 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -304,6 +304,8 @@ def test_process_repeaters_not_enabled(self): return_value=True), patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', return_value=[]), # <-- + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + return_value=False), # <-- ): self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) @@ -369,7 +371,9 @@ def test_disabled_domains_excluded(self): patch('corehq.motech.repeaters.tasks.domain_can_forward_now', return_value=True), patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain2', 'domain3']), # <-- + return_value=['domain2']), # <-- + patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + side_effect=lambda dom, __: dom == 'domain3'), # <-- patch('corehq.motech.repeaters.tasks.rate_limit_repeater', return_value=False), patch('corehq.motech.repeaters.tasks.get_repeater_lock'), @@ -428,8 +432,11 @@ def test_get_repeater_ids_by_domain(): ), patch( 'corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain2', 'domain3', 'domain4'], + return_value=['domain2', 'domain4'], ), + patch( + 'corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', + side_effect=lambda dom, __: dom == 'domain3'), ): repeater_ids_by_domain = get_repeater_ids_by_domain() assert_equal(repeater_ids_by_domain, { From 85bbfa337942042f5539d143b6b6238d44eb9a1f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Aug 2024 13:43:34 +0100 Subject: [PATCH 12/56] Forward new records for synchronous case repeaters --- corehq/motech/repeaters/models.py | 12 ++++++++++- corehq/motech/repeaters/tests/test_models.py | 21 +++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 3484dec0b4b7..a628382c2cc4 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1212,7 +1212,17 @@ def attempt_forward_now(self, *, is_retry=False, fire_synchronously=False): retry_process_datasource_repeat_record, ) - if toggles.PROCESS_REPEATERS.enabled(self.domain, toggles.NAMESPACE_DOMAIN): + def is_new_synchronous_case_repeater_record(): + """ + Repeat record is a new record for a synchronous case repeater + See corehq.motech.repeaters.signals.fire_synchronous_case_repeaters + """ + return fire_synchronously and self.state == State.Pending + + if ( + toggles.PROCESS_REPEATERS.enabled(self.domain, toggles.NAMESPACE_DOMAIN) + and not is_new_synchronous_case_repeater_record() + ): return if self.next_check is None or self.next_check > datetime.utcnow(): diff --git a/corehq/motech/repeaters/tests/test_models.py b/corehq/motech/repeaters/tests/test_models.py index 80ff8046bfd1..946495bc0e08 100644 --- a/corehq/motech/repeaters/tests/test_models.py +++ b/corehq/motech/repeaters/tests/test_models.py @@ -14,7 +14,7 @@ from nose.tools import assert_in, assert_raises from corehq.motech.models import ConnectionSettings -from corehq.util.test_utils import _create_case +from corehq.util.test_utils import _create_case, flag_enabled from ..const import ( MAX_ATTEMPTS, @@ -484,6 +484,25 @@ def test_fire_synchronously(self, process, retry_process): process.assert_called_once() self.assert_not_called(retry_process) + @flag_enabled('PROCESS_REPEATERS') + def test_process_repeaters_enabled(self, process, retry_process): + rec = self.new_record() + rec.attempt_forward_now() + + self.assert_not_called(process, retry_process) + + @flag_enabled('PROCESS_REPEATERS') + def test_fire_synchronously_process_repeaters_enabled( + self, + process, + retry_process, + ): + rec = self.new_record() + rec.attempt_forward_now(fire_synchronously=True) + + process.assert_called_once() + self.assert_not_called(retry_process) + def test_retry(self, process, retry_process): rec = self.new_record() rec.attempt_forward_now(is_retry=True) From d1119bbec475cf3cdbffb78b6ef8b07c36e7efd3 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 9 Sep 2024 15:38:14 +0100 Subject: [PATCH 13/56] Add explanatory docstrings and comments --- corehq/motech/repeaters/models.py | 12 ++++++++++++ corehq/motech/repeaters/tasks.py | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index a628382c2cc4..1dd9de434f27 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1377,6 +1377,12 @@ def is_response(duck): def domain_can_forward(domain): + """ + Returns whether ``domain`` has data forwarding or Zapier integration + privileges. + + Used for determining whether to register a repeat record. + """ return domain and ( domain_has_privilege(domain, ZAPIER_INTEGRATION) or domain_has_privilege(domain, DATA_FORWARDING) @@ -1385,6 +1391,12 @@ def domain_can_forward(domain): @quickcache(['domain'], timeout=60) def domain_can_forward_now(domain): + """ + Returns ``True`` if ``domain`` has the requisite privileges and data + forwarding is not paused. + + Used for determining whether to send a repeat record now. + """ return ( domain_can_forward(domain) and not toggles.PAUSE_DATA_FORWARDING.enabled(domain) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index dad7e09cfd14..8741d8bab1d9 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -263,7 +263,12 @@ def iter_ready_repeater_ids_forever(): continue lock = get_repeater_lock(repeater_id) - lock_token = uuid.uuid1().hex # The same way Lock does it + # Generate a lock token using `uuid1()` the same way that + # `redis.lock.Lock` does. The `Lock` class uses the token to + # determine ownership, so that one process can acquire a + # lock and a different process can release it. This lock + # will be released by the `update_repeater()` task. + lock_token = uuid.uuid1().hex if lock.acquire(blocking=False, token=lock_token): yielded = True yield domain, repeater_id, lock_token From 03b26cfb41df9e2d57b91fa723754687b8bfac10 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 9 Sep 2024 18:50:40 +0100 Subject: [PATCH 14/56] get_redis_lock() ... acquire(): No TypeError ?! --- .../dimagi/utils/couch/tests/__init__.py | 0 .../utils/couch/tests/test_redis_lock.py | 31 +++++++++++++++++++ corehq/motech/repeaters/tasks.py | 7 +---- 3 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 corehq/ex-submodules/dimagi/utils/couch/tests/__init__.py create mode 100644 corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py diff --git a/corehq/ex-submodules/dimagi/utils/couch/tests/__init__.py b/corehq/ex-submodules/dimagi/utils/couch/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py b/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py new file mode 100644 index 000000000000..c5a8bd2d38f5 --- /dev/null +++ b/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py @@ -0,0 +1,31 @@ +import uuid + +from redis.lock import Lock as RedisLock + +from dimagi.utils.couch import get_redis_lock + +from corehq.tests.noseplugins.redislocks import TestLock +from corehq.util.metrics.lockmeter import MeteredLock + + +def test_get_redis_lock_with_token(): + lock_name = 'test-1' + metered_lock = get_redis_lock(key=lock_name, name=lock_name, timeout=1) + assert isinstance(metered_lock, MeteredLock) + # metered_lock.lock is a TestLock instance because of + # corehq.tests.noseplugins.redislocks.RedisLockTimeoutPlugin + test_lock = metered_lock.lock + assert isinstance(test_lock, TestLock) + redis_lock = test_lock.lock + assert isinstance(redis_lock, RedisLock) + + token = uuid.uuid1().hex + acquired = redis_lock.acquire(blocking=False, token=token) + assert acquired + + # What we want to be able to do in a separate process: + metered_lock_2 = get_redis_lock(key=lock_name, name=lock_name, timeout=1) + redis_lock_2 = metered_lock_2.lock.lock + redis_lock_2.local.token = token + # Does not raise LockNotOwnedError: + redis_lock_2.release() diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 8741d8bab1d9..388bde5a6f90 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -7,8 +7,6 @@ from celery import chord from celery.schedules import crontab from celery.utils.log import get_task_logger -from django_redis import get_redis_connection -from redis.lock import Lock from dimagi.utils.couch import get_redis_lock @@ -28,7 +26,6 @@ metrics_histogram_timer, ) from corehq.util.metrics.const import MPM_MAX -from corehq.util.metrics.lockmeter import MeteredLock from corehq.util.timer import TimingContext from .const import ( @@ -317,11 +314,9 @@ def iter_domain_repeaters(dom): def get_repeater_lock(repeater_id): - redis = get_redis_connection() name = f'process_repeater_{repeater_id}' three_hours = 3 * 60 * 60 - lock = Lock(redis, name, timeout=three_hours) - return MeteredLock(lock, name) + return get_redis_lock(key=name, name=name, timeout=three_hours) def get_repeater_ids_by_domain(): From de27ba0da31c6326068ea897bed20be8dd77da4f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 10 Sep 2024 15:22:02 +0100 Subject: [PATCH 15/56] Drop unnecessary `iter_domain_repeaters()` --- corehq/motech/repeaters/tasks.py | 16 +++-------- corehq/motech/repeaters/tests/test_tasks.py | 30 ++++++++++----------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 388bde5a6f90..2d83319ec9b6 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -289,28 +289,18 @@ def iter_ready_repeater_ids_once(): ... """ - - def iter_domain_repeaters(dom): - try: - rep_id = repeater_ids_by_domain[dom].pop(0) - except IndexError: - return - else: - yield rep_id - repeater_ids_by_domain = get_repeater_ids_by_domain() while True: if not repeater_ids_by_domain: return for domain in list(repeater_ids_by_domain.keys()): try: - repeater_id = next(iter_domain_repeaters(domain)) - except StopIteration: + repeater_id = repeater_ids_by_domain[domain].pop() + except IndexError: # We've exhausted the repeaters for this domain del repeater_ids_by_domain[domain] continue - else: - yield domain, repeater_id + yield domain, repeater_id def get_repeater_lock(repeater_id): diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index ed633d7404da..2a25d223f1cc 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -326,17 +326,17 @@ def test_successive_loops(self): repeater_ids = [(r[0], r[1]) for r in repeaters] self.assertEqual(repeater_ids, [ # First loop - ('domain1', 'repeater_id1'), - ('domain2', 'repeater_id4'), + ('domain1', 'repeater_id3'), + ('domain2', 'repeater_id5'), ('domain3', 'repeater_id6'), ('domain1', 'repeater_id2'), - ('domain2', 'repeater_id5'), - ('domain1', 'repeater_id3'), + ('domain2', 'repeater_id4'), + ('domain1', 'repeater_id1'), # Second loop - ('domain1', 'repeater_id1'), - ('domain2', 'repeater_id4'), ('domain1', 'repeater_id2'), + ('domain2', 'repeater_id4'), + ('domain1', 'repeater_id1'), ]) def test_rate_limit(self): @@ -355,13 +355,13 @@ def test_rate_limit(self): self.assertEqual(len(repeaters), 7) repeater_ids = [(r[0], r[1]) for r in repeaters] self.assertEqual(repeater_ids, [ - ('domain1', 'repeater_id1'), + ('domain1', 'repeater_id3'), + ('domain2', 'repeater_id5'), ('domain3', 'repeater_id6'), ('domain1', 'repeater_id2'), - ('domain2', 'repeater_id5'), - ('domain1', 'repeater_id3'), ('domain1', 'repeater_id1'), ('domain1', 'repeater_id2'), + ('domain1', 'repeater_id1'), ]) def test_disabled_domains_excluded(self): @@ -382,9 +382,9 @@ def test_disabled_domains_excluded(self): self.assertEqual(len(repeaters), 4) repeater_ids = [(r[0], r[1]) for r in repeaters] self.assertEqual(repeater_ids, [ - ('domain2', 'repeater_id4'), - ('domain3', 'repeater_id6'), ('domain2', 'repeater_id5'), + ('domain3', 'repeater_id6'), + ('domain2', 'repeater_id4'), ('domain2', 'repeater_id4'), ]) @@ -407,16 +407,16 @@ def test_iter_ready_repeater_ids_once(): pairs = list(iter_ready_repeater_ids_once()) assert_equal(pairs, [ # First round of domains - ('domain1', 'repeater_id1'), - ('domain2', 'repeater_id4'), + ('domain1', 'repeater_id3'), + ('domain2', 'repeater_id5'), ('domain3', 'repeater_id6'), # Second round ('domain1', 'repeater_id2'), - ('domain2', 'repeater_id5'), + ('domain2', 'repeater_id4'), # Third round - ('domain1', 'repeater_id3'), + ('domain1', 'repeater_id1'), ]) From 4955ef4a9628c5238956ea07dc6e592cc73b65a6 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 24 Sep 2024 15:47:54 +0200 Subject: [PATCH 16/56] Don't quickcache `domain_can_forward_now()` --- corehq/motech/repeaters/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 1dd9de434f27..c8c17ada3593 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1389,7 +1389,6 @@ def domain_can_forward(domain): ) -@quickcache(['domain'], timeout=60) def domain_can_forward_now(domain): """ Returns ``True`` if ``domain`` has the requisite privileges and data From 59aae7169773ecbef86a659e2709c34ebf67df87 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 24 Sep 2024 15:52:22 +0200 Subject: [PATCH 17/56] Migration to create indexes concurrently --- .../0015_alter_repeatrecord_state_and_more.py | 72 ++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py b/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py index 83036dc7302c..3d7f8ccd6ef5 100644 --- a/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py +++ b/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py @@ -1,35 +1,59 @@ -from django.db import migrations, models +""" +Adds an index for RepeatRecord.state and a partial index for +next_attempt_at + not_paused +""" +from django.db import migrations class Migration(migrations.Migration): + atomic = False dependencies = [ ("repeaters", "0014_repeater_max_workers"), ] operations = [ - migrations.AlterField( - model_name="repeatrecord", - name="state", - field=models.PositiveSmallIntegerField( - choices=[ - (1, "Pending"), - (2, "Failed"), - (4, "Succeeded"), - (8, "Cancelled"), - (16, "Empty"), - (32, "Invalid Payload"), - ], - db_index=True, - default=1, - ), - ), - migrations.AddIndex( - model_name="repeater", - index=models.Index( - condition=models.Q(("is_paused", False)), - fields=["next_attempt_at"], - name="next_attempt_at_not_paused_idx", - ), + # migrations.AlterField( + # model_name="repeatrecord", + # name="state", + # field=models.PositiveSmallIntegerField( + # choices=[ + # (1, "Pending"), + # (2, "Failed"), + # (4, "Succeeded"), + # (8, "Cancelled"), + # (16, "Empty"), + # (32, "Invalid Payload"), + # ], + # db_index=True, + # default=1, + # ), + # ), + # Equivalent to the migration above, but builds the index concurrently + migrations.RunSQL( + sql=""" + CREATE INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b" + ON "repeaters_repeatrecord" ("state"); + """, + reverse_sql=""" + DROP INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b"; + """ ), + + # migrations.AddIndex( + # model_name="repeater", + # index=models.Index( + # condition=models.Q(("is_paused", False)), + # fields=["next_attempt_at"], + # name="next_attempt_at_not_paused_idx", + # ), + # ), + migrations.RunSQL( + sql=""" + CREATE INDEX "next_attempt_at_not_paused_idx" ON "repeaters_repeater" ("next_attempt_at") WHERE NOT "is_paused"; + """, + reverse_sql=""" + DROP INDEX CONCURRENTLY "next_attempt_at_not_paused_idx"; + """ + ) ] From b70fc523d63012d06e337bffac15887a5b9f3f4d Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 19 Oct 2024 13:34:23 +0100 Subject: [PATCH 18/56] Add comment --- corehq/motech/repeaters/models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 2b83a1388eff..77627a43e826 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -415,6 +415,10 @@ def set_backoff(self): def reset_backoff(self): if self.last_attempt_at or self.next_attempt_at: + # `_get_retry_interval()` implements exponential backoff by + # multiplying the previous interval by 3. Set last_attempt_at + # to None so that the next time we need to back off, we + # know it is the first interval. self.last_attempt_at = None self.next_attempt_at = None # Avoid a possible race condition with self.pause(), etc. From 7e65b3b06614e6d75f044540d2b3bbf4d17199f6 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 19 Oct 2024 15:40:42 +0100 Subject: [PATCH 19/56] Don't squash BaseExceptions --- corehq/motech/repeaters/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 77627a43e826..da8af4f523fa 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1218,8 +1218,7 @@ def fire(self, force_send=False, timing_context=None): self.repeater.fire_for_record(self, timing_context=timing_context) except Exception as e: self.handle_payload_error(str(e), traceback_str=traceback.format_exc()) - finally: - return self.state + return self.state return None def attempt_forward_now(self, *, is_retry=False, fire_synchronously=False): From 4c4189661f8d7af95d6d45797eebb56ce59ed05f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 19 Oct 2024 16:45:14 +0100 Subject: [PATCH 20/56] Drop timeout for `process_repeater_lock`. --- corehq/motech/repeaters/tasks.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 2d83319ec9b6..cac724254467 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -234,11 +234,20 @@ def process_repeaters(): """ process_repeater_lock = get_redis_lock( PROCESS_REPEATERS_KEY, - timeout=24 * 60 * 60, + timeout=None, # Iterating repeaters forever is fine name=PROCESS_REPEATERS_KEY, ) + # How to recover from a crash: If `process_repeaters()` needs to be + # restarted and `process_repeater_lock` was not released, expire the + # lock to allow `process_repeaters()` to start: + # + # >>> from dimagi.utils.couch.cache.cache_core import get_redis_client + # >>> from corehq.motech.repeaters.const import PROCESS_REPEATERS_KEY + # >>> client = get_redis_client() + # >>> client.expire(PROCESS_REPEATERS_KEY, timeout=0) if not process_repeater_lock.acquire(blocking=False): return + try: for domain, repeater_id, lock_token in iter_ready_repeater_ids_forever(): process_repeater.delay(domain, repeater_id, lock_token) From 30d4a6fac0d0c1c5b4a06db941c6e8e57bcf1b28 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 19 Oct 2024 16:46:25 +0100 Subject: [PATCH 21/56] Add metric for monitoring health --- corehq/motech/repeaters/tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index cac724254467..409398df05c8 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -435,3 +435,12 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): run_every=crontab(), # every minute multiprocess_mode=MPM_MAX ) + +# This metric monitors the number of Repeaters waiting to be sent. A +# steep increase indicates a problem with `process_repeaters()`. +metrics_gauge_task( + 'commcare.repeaters.all_ready', + lambda: Repeater.objects.all_ready().count(), + run_every=crontab(minute='*/5'), # every five minutes + multiprocess_mode=MPM_MAX +) From e32b46522c77544562f632691023f1038e6a94ec Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 19 Oct 2024 17:03:51 +0100 Subject: [PATCH 22/56] Resolve migration conflict, fix index --- .../0015_alter_repeatrecord_state_and_more.py | 59 ------------------ ...orkers.py => 0015_repeater_max_workers.py} | 2 +- .../repeaters/migrations/0016_add_indexes.py | 62 +++++++++++++++++++ corehq/motech/repeaters/models.py | 4 +- migrations.lock | 2 + 5 files changed, 67 insertions(+), 62 deletions(-) delete mode 100644 corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py rename corehq/motech/repeaters/migrations/{0014_repeater_max_workers.py => 0015_repeater_max_workers.py} (81%) create mode 100644 corehq/motech/repeaters/migrations/0016_add_indexes.py diff --git a/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py b/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py deleted file mode 100644 index 3d7f8ccd6ef5..000000000000 --- a/corehq/motech/repeaters/migrations/0015_alter_repeatrecord_state_and_more.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Adds an index for RepeatRecord.state and a partial index for -next_attempt_at + not_paused -""" -from django.db import migrations - - -class Migration(migrations.Migration): - atomic = False - - dependencies = [ - ("repeaters", "0014_repeater_max_workers"), - ] - - operations = [ - # migrations.AlterField( - # model_name="repeatrecord", - # name="state", - # field=models.PositiveSmallIntegerField( - # choices=[ - # (1, "Pending"), - # (2, "Failed"), - # (4, "Succeeded"), - # (8, "Cancelled"), - # (16, "Empty"), - # (32, "Invalid Payload"), - # ], - # db_index=True, - # default=1, - # ), - # ), - # Equivalent to the migration above, but builds the index concurrently - migrations.RunSQL( - sql=""" - CREATE INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b" - ON "repeaters_repeatrecord" ("state"); - """, - reverse_sql=""" - DROP INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b"; - """ - ), - - # migrations.AddIndex( - # model_name="repeater", - # index=models.Index( - # condition=models.Q(("is_paused", False)), - # fields=["next_attempt_at"], - # name="next_attempt_at_not_paused_idx", - # ), - # ), - migrations.RunSQL( - sql=""" - CREATE INDEX "next_attempt_at_not_paused_idx" ON "repeaters_repeater" ("next_attempt_at") WHERE NOT "is_paused"; - """, - reverse_sql=""" - DROP INDEX CONCURRENTLY "next_attempt_at_not_paused_idx"; - """ - ) - ] diff --git a/corehq/motech/repeaters/migrations/0014_repeater_max_workers.py b/corehq/motech/repeaters/migrations/0015_repeater_max_workers.py similarity index 81% rename from corehq/motech/repeaters/migrations/0014_repeater_max_workers.py rename to corehq/motech/repeaters/migrations/0015_repeater_max_workers.py index 66240342f454..d55d9a2e4047 100644 --- a/corehq/motech/repeaters/migrations/0014_repeater_max_workers.py +++ b/corehq/motech/repeaters/migrations/0015_repeater_max_workers.py @@ -4,7 +4,7 @@ class Migration(migrations.Migration): dependencies = [ - ("repeaters", "0013_alter_repeatrecord_state_and_more"), + ("repeaters", "0014_alter_repeater_request_method"), ] operations = [ diff --git a/corehq/motech/repeaters/migrations/0016_add_indexes.py b/corehq/motech/repeaters/migrations/0016_add_indexes.py new file mode 100644 index 000000000000..2409b1df064d --- /dev/null +++ b/corehq/motech/repeaters/migrations/0016_add_indexes.py @@ -0,0 +1,62 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + atomic = False + + dependencies = [ + ("repeaters", "0015_repeater_max_workers"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AlterField( + model_name="repeatrecord", + name="state", + field=models.PositiveSmallIntegerField( + choices=[ + (1, "Pending"), + (2, "Failed"), + (4, "Succeeded"), + (8, "Cancelled"), + (16, "Empty"), + (32, "Invalid Payload"), + ], + db_index=True, + default=1, + ), + ), + migrations.AddIndex( + model_name="repeater", + index=models.Index( + condition=models.Q(("is_deleted", False), ("is_paused", False)), + fields=["next_attempt_at"], + name="next_attempt_at_partial_idx", + ), + ), + ], + + database_operations=[ + migrations.RunSQL( + sql=""" + CREATE INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b" + ON "repeaters_repeatrecord" ("state"); + """, + reverse_sql=""" + DROP INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b"; + """ + ), + migrations.RunSQL( + sql=""" + CREATE INDEX CONCURRENTLY "next_attempt_at_partial_idx" + ON "repeaters_repeater" ("next_attempt_at") + WHERE (NOT "is_deleted" AND NOT "is_paused"); + """, + reverse_sql=""" + DROP INDEX CONCURRENTLY "next_attempt_at_partial_idx"; + """ + ), + ] + ) + ] diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 9699c70154e3..2bcdbfe3ae5d 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -300,8 +300,8 @@ class Meta: indexes = [ models.Index( fields=['next_attempt_at'], - condition=models.Q(is_paused=False), - name='next_attempt_at_not_paused_idx', + condition=models.Q(("is_deleted", False), ("is_paused", False)), + name='next_attempt_at_partial_idx', ), ] diff --git a/migrations.lock b/migrations.lock index 7dc0874ac264..170568b378bf 100644 --- a/migrations.lock +++ b/migrations.lock @@ -837,6 +837,8 @@ repeaters 0012_formexpressionrepeater_arcgisformexpressionrepeater 0013_alter_repeatrecord_state_and_more 0014_alter_repeater_request_method + 0015_repeater_max_workers + 0016_add_indexes reports 0001_initial 0002_auto_20171121_1803 From e3bcd748894852990155ee69258c111fc05eaaa0 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sun, 20 Oct 2024 00:04:42 +0100 Subject: [PATCH 23/56] Fix metric --- corehq/motech/repeaters/tasks.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 409398df05c8..6b67c27158d8 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -436,11 +436,16 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): multiprocess_mode=MPM_MAX ) -# This metric monitors the number of Repeaters waiting to be sent. A -# steep increase indicates a problem with `process_repeaters()`. + +def get_all_ready_count(): + return Repeater.objects.all_ready().count() + + +# This metric monitors the number of Repeaters with RepeatRecords ready to +# be sent. A steep increase indicates a problem with `process_repeaters()`. metrics_gauge_task( 'commcare.repeaters.all_ready', - lambda: Repeater.objects.all_ready().count(), + get_all_ready_count, run_every=crontab(minute='*/5'), # every five minutes multiprocess_mode=MPM_MAX ) From efc4dde3b96803301a2a5ec888b15c38c8941623 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 22 Oct 2024 20:22:26 +0100 Subject: [PATCH 24/56] Change indexes After some analyzing, it turns out that the query is between two and three times as fast using a partial index on RepeatRecord.repeater_id where state is pending or failed. Performance is also improved using a partial index with Repeater.is_deleted instead of a normal B-tree index. When these two indexes are used, the next_attempt_at_partial_idx is not used. --- .../repeaters/migrations/0016_add_indexes.py | 60 +++++++++++-------- corehq/motech/repeaters/models.py | 16 +++-- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/corehq/motech/repeaters/migrations/0016_add_indexes.py b/corehq/motech/repeaters/migrations/0016_add_indexes.py index 2409b1df064d..18866dcc5673 100644 --- a/corehq/motech/repeaters/migrations/0016_add_indexes.py +++ b/corehq/motech/repeaters/migrations/0016_add_indexes.py @@ -12,49 +12,61 @@ class Migration(migrations.Migration): migrations.SeparateDatabaseAndState( state_operations=[ migrations.AlterField( - model_name="repeatrecord", - name="state", - field=models.PositiveSmallIntegerField( - choices=[ - (1, "Pending"), - (2, "Failed"), - (4, "Succeeded"), - (8, "Cancelled"), - (16, "Empty"), - (32, "Invalid Payload"), - ], - db_index=True, - default=1, - ), + model_name="repeater", + name="is_deleted", + field=models.BooleanField(default=False), ), migrations.AddIndex( model_name="repeater", index=models.Index( - condition=models.Q(("is_deleted", False), ("is_paused", False)), - fields=["next_attempt_at"], - name="next_attempt_at_partial_idx", + condition=models.Q(("is_deleted", False)), + fields=["id"], + name="is_deleted_partial_idx", + ), + ), + migrations.AddIndex( + model_name="repeatrecord", + index=models.Index( + condition=models.Q(("state__in", (1, 2))), + fields=["repeater_id"], + name="state_partial_idx", ), ), ], database_operations=[ + # Drop `Repeater.id_deleted` index migrations.RunSQL( sql=""" - CREATE INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b" - ON "repeaters_repeatrecord" ("state"); + DROP INDEX CONCURRENTLY "repeaters_repeater_is_deleted_08441bf0"; """, reverse_sql=""" - DROP INDEX CONCURRENTLY "repeaters_repeatrecord_state_8055083b"; + CREATE INDEX CONCURRENTLY "repeaters_repeater_is_deleted_08441bf0" + ON "repeaters_repeater" ("is_deleted"); """ ), + + # Replace with a partial index on `id_` column + migrations.RunSQL( + sql=""" + CREATE INDEX CONCURRENTLY "is_deleted_partial_idx" + ON "repeaters_repeater" ("id_") + WHERE NOT "is_deleted"; + """, + reverse_sql=""" + DROP INDEX CONCURRENTLY "is_deleted_partial_idx"; + """ + ), + + # Add partial index for RepeatRecord.state on `repeater_id` migrations.RunSQL( sql=""" - CREATE INDEX CONCURRENTLY "next_attempt_at_partial_idx" - ON "repeaters_repeater" ("next_attempt_at") - WHERE (NOT "is_deleted" AND NOT "is_paused"); + CREATE INDEX CONCURRENTLY "state_partial_idx" + ON "repeaters_repeatrecord" ("repeater_id_") + WHERE "state" IN (1, 2); """, reverse_sql=""" - DROP INDEX CONCURRENTLY "next_attempt_at_partial_idx"; + DROP INDEX CONCURRENTLY "state_partial_idx"; """ ), ] diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 2bcdbfe3ae5d..82713ff14a1d 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -288,7 +288,7 @@ class Repeater(RepeaterSuperProxy): max_workers = models.IntegerField(default=0) options = JSONField(default=dict) connection_settings_id = models.IntegerField(db_index=True) - is_deleted = models.BooleanField(default=False, db_index=True) + is_deleted = models.BooleanField(default=False) last_modified = models.DateTimeField(auto_now=True) date_created = models.DateTimeField(auto_now_add=True) @@ -299,9 +299,9 @@ class Meta: db_table = 'repeaters_repeater' indexes = [ models.Index( - fields=['next_attempt_at'], - condition=models.Q(("is_deleted", False), ("is_paused", False)), - name='next_attempt_at_partial_idx', + name='is_deleted_partial_idx', + fields=['id'], + condition=models.Q(is_deleted=False), ), ] @@ -1037,7 +1037,6 @@ class RepeatRecord(models.Model): state = models.PositiveSmallIntegerField( choices=State.choices, default=State.Pending, - db_index=True, ) registered_at = models.DateTimeField() next_check = models.DateTimeField(null=True, default=None) @@ -1053,7 +1052,12 @@ class Meta: name="next_check_not_null", fields=["next_check"], condition=models.Q(next_check__isnull=False), - ) + ), + models.Index( + name="state_partial_idx", + fields=["repeater_id"], + condition=models.Q(state__in=(State.Pending, State.Fail)), + ), ] constraints = [ models.CheckConstraint( From bd37a009ced50048aa57076ec52a73d4330b2066 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 23 Oct 2024 18:54:00 +0100 Subject: [PATCH 25/56] Add one more index. Use UNION ALL queries. --- .../repeaters/migrations/0016_add_indexes.py | 34 +++++++++++-- corehq/motech/repeaters/models.py | 51 +++++++++++++------ corehq/motech/repeaters/tasks.py | 6 +-- corehq/motech/repeaters/tests/test_models.py | 5 ++ 4 files changed, 72 insertions(+), 24 deletions(-) diff --git a/corehq/motech/repeaters/migrations/0016_add_indexes.py b/corehq/motech/repeaters/migrations/0016_add_indexes.py index 18866dcc5673..5f7f0b15af97 100644 --- a/corehq/motech/repeaters/migrations/0016_add_indexes.py +++ b/corehq/motech/repeaters/migrations/0016_add_indexes.py @@ -21,7 +21,18 @@ class Migration(migrations.Migration): index=models.Index( condition=models.Q(("is_deleted", False)), fields=["id"], - name="is_deleted_partial_idx", + name="deleted_partial_idx", + ), + ), + migrations.AddIndex( + model_name="repeater", + index=models.Index( + condition=models.Q( + ("is_deleted", False), + ("is_paused", False), + ), + fields=["id"], + name="deleted_paused_partial_idx", ), ), migrations.AddIndex( @@ -46,19 +57,34 @@ class Migration(migrations.Migration): """ ), - # Replace with a partial index on `id_` column + # Replace with a partial index on `id_` column. Used + # when next_attempt_at is null migrations.RunSQL( sql=""" - CREATE INDEX CONCURRENTLY "is_deleted_partial_idx" + CREATE INDEX CONCURRENTLY "deleted_partial_idx" ON "repeaters_repeater" ("id_") WHERE NOT "is_deleted"; """, reverse_sql=""" - DROP INDEX CONCURRENTLY "is_deleted_partial_idx"; + DROP INDEX CONCURRENTLY "deleted_partial_idx"; + """ + ), + + # Add partial index for is_deleted and is_paused on `id_` + # column. Used when next_attempt_at is not null + migrations.RunSQL( + sql=""" + CREATE INDEX CONCURRENTLY "deleted_paused_partial_idx" + ON "repeaters_repeater" ("id_") + WHERE (NOT "is_deleted" AND NOT "is_paused"); + """, + reverse_sql=""" + DROP INDEX CONCURRENTLY "deleted_paused_partial_idx"; """ ), # Add partial index for RepeatRecord.state on `repeater_id` + # column. Used when next_attempt_at is not null migrations.RunSQL( sql=""" CREATE INDEX CONCURRENTLY "state_partial_idx" diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 82713ff14a1d..2019a6b40f5e 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -238,28 +238,44 @@ def all_ready(self): """ Return all Repeaters ready to be forwarded. """ - not_paused = models.Q(is_paused=False) - next_attempt_not_in_the_future = ( - models.Q(next_attempt_at__isnull=True) - | models.Q(next_attempt_at__lte=timezone.now()) - ) - repeat_records_ready_to_send = models.Q( - repeat_records__state__in=(State.Pending, State.Fail) - ) - return ( - self.get_queryset() - .filter(not_paused) - .filter(next_attempt_not_in_the_future) - .filter(repeat_records_ready_to_send) + return self._all_ready_next_attempt_null().union( + self._all_ready_next_attempt_now(), + all=True, ) def get_all_ready_ids_by_domain(self): + next_attempt_null = self._all_ready_next_attempt_null().values_list('domain', 'id') + next_attempt_now = self._all_ready_next_attempt_now().values_list('domain', 'id') + query = next_attempt_null.union(next_attempt_now, all=True) results = defaultdict(list) - query = self.all_ready().values_list('domain', 'id') for (domain, id_uuid) in query.all(): results[domain].append(id_uuid.hex) return results + def all_ready_count(self): + return ( + self._all_ready_next_attempt_null().count() + + self._all_ready_next_attempt_now().count() + ) + + def _all_ready_next_attempt_null(self): + # Slower query. Uses deleted_partial_idx + return ( + self.get_queryset() + .filter(is_paused=False) + .filter(next_attempt_at__isnull=True) + .filter(repeat_records__state__in=(State.Pending, State.Fail)) + ) + + def _all_ready_next_attempt_now(self): + # Fast query. Uses deleted_paused_partial_idx and state_partial_idx + return ( + self.get_queryset() + .filter(is_paused=False) + .filter(next_attempt_at__lte=timezone.now()) + .filter(repeat_records__state__in=(State.Pending, State.Fail)) + ) + def get_queryset(self): repeater_obj = self.model() if type(repeater_obj).__name__ == "Repeater": @@ -299,10 +315,15 @@ class Meta: db_table = 'repeaters_repeater' indexes = [ models.Index( - name='is_deleted_partial_idx', + name='deleted_partial_idx', fields=['id'], condition=models.Q(is_deleted=False), ), + models.Index( + name="deleted_paused_partial_idx", + fields=["id"], + condition=models.Q(("is_deleted", False), ("is_paused", False)), + ), ] payload_generator_classes = () diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 6b67c27158d8..2bc5aeea5ec4 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -437,15 +437,11 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): ) -def get_all_ready_count(): - return Repeater.objects.all_ready().count() - - # This metric monitors the number of Repeaters with RepeatRecords ready to # be sent. A steep increase indicates a problem with `process_repeaters()`. metrics_gauge_task( 'commcare.repeaters.all_ready', - get_all_ready_count, + Repeater.objects.all_ready_count, run_every=crontab(minute='*/5'), # every five minutes multiprocess_mode=MPM_MAX ) diff --git a/corehq/motech/repeaters/tests/test_models.py b/corehq/motech/repeaters/tests/test_models.py index 6e24d144202f..5a5c1f4511a4 100644 --- a/corehq/motech/repeaters/tests/test_models.py +++ b/corehq/motech/repeaters/tests/test_models.py @@ -977,3 +977,8 @@ def test_status_404_response(self): def test_none_response(self): self.assertFalse(is_success_response(None)) + + +def test_repeater_all_ready_union_all_sql(): + sql_str = str(Repeater.objects.all_ready().query) + assert_in('UNION ALL', sql_str) From a448b9ee7bca28ffba8456f3c6178f0a7bf169f3 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 26 Oct 2024 12:19:15 +0100 Subject: [PATCH 26/56] Don't report attempt too soon --- corehq/motech/repeaters/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 2bc5aeea5ec4..46a3e7d9c31b 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -371,9 +371,10 @@ def process_ready_repeat_record(repeat_record_id): .prefetch_related('repeater', 'attempt_set') .get(id=repeat_record_id) ) - report_repeater_attempt(repeat_record.repeater.repeater_id) if not is_repeat_record_ready(repeat_record): return None + + report_repeater_attempt(repeat_record.repeater.repeater_id) with timer('fire_timing') as fire_timer: state_or_none = repeat_record.fire(timing_context=fire_timer) report_repeater_usage( From 4321fb7e4e0136baa02b454daefc96283c1dbe36 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 26 Oct 2024 13:13:08 +0100 Subject: [PATCH 27/56] Add metrics --- corehq/motech/repeaters/tasks.py | 37 +++++++++++- corehq/motech/repeaters/tests/test_tasks.py | 63 +++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 46a3e7d9c31b..527b8e75d2fd 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -1,6 +1,7 @@ import random import uuid from datetime import datetime, timedelta +from inspect import cleandoc from django.conf import settings @@ -248,6 +249,7 @@ def process_repeaters(): if not process_repeater_lock.acquire(blocking=False): return + metrics_counter('commcare.repeaters.process_repeaters.start') try: for domain, repeater_id, lock_token in iter_ready_repeater_ids_forever(): process_repeater.delay(domain, repeater_id, lock_token) @@ -298,6 +300,7 @@ def iter_ready_repeater_ids_once(): ... """ + metrics_counter('commcare.repeaters.process_repeaters.iter_once') repeater_ids_by_domain = get_repeater_ids_by_domain() while True: if not repeater_ids_by_domain: @@ -374,6 +377,7 @@ def process_ready_repeat_record(repeat_record_id): if not is_repeat_record_ready(repeat_record): return None + _metrics_wait_duration(repeat_record) report_repeater_attempt(repeat_record.repeater.repeater_id) with timer('fire_timing') as fire_timer: state_or_none = repeat_record.fire(timing_context=fire_timer) @@ -404,6 +408,37 @@ def is_repeat_record_ready(repeat_record): ) +def _metrics_wait_duration(repeat_record): + """ + The duration since ``repeat_record`` was registered or last attempted. + + Buckets are exponential: [1m, 6m, 36m, 3.6h, 21.6h, 5.4d] + """ + buckets = [60 * (6 ** exp) for exp in range(6)] + metrics_histogram( + 'commcare.repeaters.process_repeaters.repeat_record_wait', + _get_wait_duration_seconds(repeat_record), + bucket_tag='duration', + buckets=buckets, + bucket_unit='s', + tags={ + 'domain': repeat_record.domain, + 'repeater': f'{repeat_record.domain}: {repeat_record.repeater.name}', + }, + documentation=cleandoc(_metrics_wait_duration.__doc__) + ) + + +def _get_wait_duration_seconds(repeat_record): + last_attempt = repeat_record.attempt_set.last() + if last_attempt: + duration_start = last_attempt.created_at + else: + duration_start = repeat_record.registered_at + wait_duration = datetime.utcnow() - duration_start + return int(wait_duration.total_seconds()) + + @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) def update_repeater(repeat_record_states, repeater_id, lock_token): """ @@ -441,7 +476,7 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): # This metric monitors the number of Repeaters with RepeatRecords ready to # be sent. A steep increase indicates a problem with `process_repeaters()`. metrics_gauge_task( - 'commcare.repeaters.all_ready', + 'commcare.repeaters.process_repeaters.all_ready_count', Repeater.objects.all_ready_count, run_every=crontab(minute='*/5'), # every five minutes multiprocess_mode=MPM_MAX diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 2a25d223f1cc..ee0b15999665 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -17,6 +17,7 @@ from corehq.motech.models import ConnectionSettings, RequestLog from corehq.motech.repeaters.models import FormRepeater, Repeater, RepeatRecord from corehq.motech.repeaters.tasks import ( + _get_wait_duration_seconds, _process_repeat_record, delete_old_request_logs, get_repeater_ids_by_domain, @@ -558,3 +559,65 @@ def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() + + +class TestGetWaitDurationSeconds(TestCase): + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.repeater = FormRepeater.objects.create( + domain=DOMAIN, + connection_settings=ConnectionSettings.objects.create( + domain=DOMAIN, + url='http://www.example.com/api/' + ), + ) + + def test_repeat_record_no_attempts(self): + five_minutes_ago = datetime.utcnow() - timedelta(minutes=5) + repeat_record = RepeatRecord.objects.create( + repeater=self.repeater, + domain=DOMAIN, + payload_id='abc123', + registered_at=five_minutes_ago, + ) + wait_duration = _get_wait_duration_seconds(repeat_record) + self.assertEqual(wait_duration, 300) + + def test_repeat_record_one_attempt(self): + five_minutes_ago = datetime.utcnow() - timedelta(minutes=5) + repeat_record = RepeatRecord.objects.create( + repeater=self.repeater, + domain=DOMAIN, + payload_id='abc123', + registered_at=five_minutes_ago, + ) + thirty_seconds_ago = datetime.utcnow() - timedelta(seconds=30) + repeat_record.attempt_set.create( + created_at=thirty_seconds_ago, + state=State.Fail, + ) + wait_duration = _get_wait_duration_seconds(repeat_record) + self.assertEqual(wait_duration, 30) + + def test_repeat_record_two_attempts(self): + an_hour_ago = datetime.utcnow() - timedelta(hours=1) + repeat_record = RepeatRecord.objects.create( + repeater=self.repeater, + domain=DOMAIN, + payload_id='abc123', + registered_at=an_hour_ago, + ) + thirty_minutes = datetime.utcnow() - timedelta(minutes=30) + repeat_record.attempt_set.create( + created_at=thirty_minutes, + state=State.Fail, + ) + five_seconds_ago = datetime.utcnow() - timedelta(seconds=5) + repeat_record.attempt_set.create( + created_at=five_seconds_ago, + state=State.Fail, + ) + wait_duration = _get_wait_duration_seconds(repeat_record) + self.assertEqual(wait_duration, 5) From 74137f99d53d70639b48ea9c01e6eefabd80b5e1 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 26 Oct 2024 23:28:09 +0100 Subject: [PATCH 28/56] Improve backoff logic --- corehq/motech/repeaters/models.py | 13 ++++++++ corehq/motech/repeaters/tasks.py | 16 +++++----- .../motech/repeaters/tests/test_repeater.py | 2 +- corehq/motech/repeaters/tests/test_tasks.py | 30 ++++++++++++++----- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 2019a6b40f5e..55543a917029 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1243,6 +1243,19 @@ def fire(self, force_send=False, timing_context=None): self.repeater.fire_for_record(self, timing_context=timing_context) except Exception as e: self.handle_payload_error(str(e), traceback_str=traceback.format_exc()) + # Repeat records with State.Fail are retried, and repeat + # records with State.InvalidPayload are not. + # + # But a repeat record can have State.InvalidPayload + # because it was sent and rejected, so we know that the + # remote endpoint is healthy and responding, or because + # this exception occurred and it was not sent, so we + # don't know anything about the remote endpoint. + # + # Return None so that `tasks.update_repeater()` treats + # the repeat record as unsent, and does not apply or + # reset a backoff. + return None return self.state return None diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 527b8e75d2fd..2317e05e0783 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -446,18 +446,16 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): results of ``_process_repeat_record()`` tasks. """ try: + if all(s in (State.Empty, None) for s in repeat_record_states): + # We can't tell anything about the remote endpoint. + return + success_or_invalid = (State.Success, State.InvalidPayload) repeater = Repeater.objects.get(id=repeater_id) - if any(s == State.Success for s in repeat_record_states): - # At least one repeat record was sent successfully. The - # remote endpoint is healthy. + if any(s in success_or_invalid for s in repeat_record_states): + # The remote endpoint appears to be healthy. repeater.reset_backoff() - elif all(s in (State.Empty, State.InvalidPayload, None) - for s in repeat_record_states): - # We can't tell anything about the remote endpoint. - # (_process_repeat_record() can return None on an exception.) - pass else: - # All sent payloads failed. Try again later. + # All the payloads that were sent failed. Try again later. repeater.set_backoff() finally: lock = get_repeater_lock(repeater_id) diff --git a/corehq/motech/repeaters/tests/test_repeater.py b/corehq/motech/repeaters/tests/test_repeater.py index 2f93dfa7b519..a5b0cd15dfa5 100644 --- a/corehq/motech/repeaters/tests/test_repeater.py +++ b/corehq/motech/repeaters/tests/test_repeater.py @@ -701,7 +701,7 @@ def test_payload_exception_on_fire(self): with patch.object(CaseRepeater, 'get_payload', side_effect=Exception('Boom!')): state_or_none = rr.fire() - self.assertEqual(state_or_none, State.InvalidPayload) + self.assertIsNone(state_or_none) repeat_record = RepeatRecord.objects.get(id=rr.id) self.assertEqual(repeat_record.state, State.InvalidPayload) self.assertEqual(repeat_record.failure_reason, 'Boom!') diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index ee0b15999665..17eb0561ebda 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -519,10 +519,23 @@ class TestUpdateRepeater(SimpleTestCase): @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): + repeat_record_states = [State.Success, State.Fail, State.Empty, None] + mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater + update_repeater(repeat_record_states, 1, 'token') + + mock_repeater.set_backoff.assert_not_called() + mock_repeater.reset_backoff.assert_called_once() - update_repeater([State.Success, State.Fail, State.Empty, None], 1, 'token') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater, __): + repeat_record_states = [State.InvalidPayload, State.Fail, State.Empty, None] + + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() @@ -530,10 +543,11 @@ def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): + repeat_record_states = [State.Fail, State.Empty, None] + mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - - update_repeater([State.Fail, State.Empty, None], 1, 'token') + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_called_once() mock_repeater.reset_backoff.assert_not_called() @@ -541,10 +555,11 @@ def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): + repeat_record_states = [State.Empty] + mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - - update_repeater([State.Empty], 1, 'token') + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() @@ -552,10 +567,11 @@ def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): + repeat_record_states = [None] + mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - - update_repeater([None], 1, 'token') + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() From 968a922a59d621ea2017462f868ee416567af47a Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 28 Oct 2024 11:50:21 +0000 Subject: [PATCH 29/56] Update comments --- .../repeaters/migrations/0016_add_indexes.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/corehq/motech/repeaters/migrations/0016_add_indexes.py b/corehq/motech/repeaters/migrations/0016_add_indexes.py index 5f7f0b15af97..fd4d476fd32e 100644 --- a/corehq/motech/repeaters/migrations/0016_add_indexes.py +++ b/corehq/motech/repeaters/migrations/0016_add_indexes.py @@ -57,8 +57,13 @@ class Migration(migrations.Migration): """ ), - # Replace with a partial index on `id_` column. Used - # when next_attempt_at is null + # Replace `Repeater.id_deleted` index with a partial + # index on `id_` column. + # > One major reason for using a partial index is to + # > avoid indexing common values. ... This reduces the + # > size of the index, which will speed up those queries + # > that do use the index. + # > -- https://www.postgresql.org/docs/current/indexes-partial.html migrations.RunSQL( sql=""" CREATE INDEX CONCURRENTLY "deleted_partial_idx" @@ -71,7 +76,7 @@ class Migration(migrations.Migration): ), # Add partial index for is_deleted and is_paused on `id_` - # column. Used when next_attempt_at is not null + # column. migrations.RunSQL( sql=""" CREATE INDEX CONCURRENTLY "deleted_paused_partial_idx" @@ -84,7 +89,8 @@ class Migration(migrations.Migration): ), # Add partial index for RepeatRecord.state on `repeater_id` - # column. Used when next_attempt_at is not null + # column. This does the heavy lifting for the queries + # related to `RepeaterManager.all_ready()`. migrations.RunSQL( sql=""" CREATE INDEX CONCURRENTLY "state_partial_idx" From 4fd14a0f2bead91d4218dfc109e8dfc07b8b429a Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 28 Oct 2024 17:31:40 +0000 Subject: [PATCH 30/56] Show "Next attempt at" in Forwarders page --- .../repeaters/templates/repeaters/partials/repeater_row.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/corehq/motech/repeaters/templates/repeaters/partials/repeater_row.html b/corehq/motech/repeaters/templates/repeaters/partials/repeater_row.html index 069941a94f02..18e5d91026f3 100644 --- a/corehq/motech/repeaters/templates/repeaters/partials/repeater_row.html +++ b/corehq/motech/repeaters/templates/repeaters/partials/repeater_row.html @@ -5,6 +5,9 @@ {% if repeater.white_listed_case_types %}
Case Type: {{ repeater.white_listed_case_types|join:", " }} {% endif %} + {% if repeater.next_attempt_at and request|toggle_enabled:"PROCESS_REPEATERS" and not repeater.is_paused %} +
Next attempt at {{ repeater.next_attempt_at|date:"Y-m-d H:i" }} + {% endif %} From b1eb1715c86448f34ca64212619142f7e7f3f54e Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 28 Oct 2024 17:56:46 +0000 Subject: [PATCH 31/56] Fixes migration --- .../repeaters/migrations/0015_drop_receiverwrapper_couchdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/motech/repeaters/migrations/0015_drop_receiverwrapper_couchdb.py b/corehq/motech/repeaters/migrations/0015_drop_receiverwrapper_couchdb.py index 54dbd606bbdb..a75498273af5 100644 --- a/corehq/motech/repeaters/migrations/0015_drop_receiverwrapper_couchdb.py +++ b/corehq/motech/repeaters/migrations/0015_drop_receiverwrapper_couchdb.py @@ -8,7 +8,7 @@ @skip_on_fresh_install -def _delete_receiverwrapper_couchdb(): +def _delete_receiverwrapper_couchdb(apps, schema_editor): db = Database(f"{settings.COUCH_DATABASE}__receiverwrapper") try: db.server.delete_db(db.dbname) From 8a7f3432d3901b080601fd232aa10e8df94c50c8 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 19 Nov 2024 15:05:49 +0000 Subject: [PATCH 32/56] Add comment on other `True` return value --- corehq/motech/repeaters/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 83dc62458d58..770ec109363f 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -1275,6 +1275,9 @@ def is_new_synchronous_case_repeater_record(): Repeat record is a new record for a synchronous case repeater See corehq.motech.repeaters.signals.fire_synchronous_case_repeaters """ + # This will also return True if a user clicked "Resend" on a + # Pending repeat record in the Repeat Records Report. This + # is not intended, but it's also not harmful. return fire_synchronously and self.state == State.Pending if ( From 0f72ba93399d404522c72c1edc146d66be2304d8 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 2 Dec 2024 22:24:11 +0000 Subject: [PATCH 33/56] Count repeater backoffs --- corehq/motech/repeaters/tasks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 2317e05e0783..f99d107d4bd9 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -456,6 +456,13 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): repeater.reset_backoff() else: # All the payloads that were sent failed. Try again later. + metrics_counter( + 'commcare.repeaters.process_repeaters.repeater_backoff', + tags={ + 'domain': repeater.domain, + 'repeater': f'{repeater.domain}: {repeater.name}', + }, + ) repeater.set_backoff() finally: lock = get_repeater_lock(repeater_id) From 7f18e5223396aeb4dbab90c20c794b6b32079914 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 2 Dec 2024 22:24:36 +0000 Subject: [PATCH 34/56] Add documentation --- corehq/motech/repeaters/tasks.py | 73 ++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index f99d107d4bd9..f4a239314cd4 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -1,3 +1,76 @@ +""" +check_repeaters() and process_repeaters() +========================================= + +check_repeaters() +----------------- + +The ``check_repeaters()`` task is how repeat records are sent, and its +workflow was shaped by the fact that repeaters and repeat records were +stored in CouchDB. + +``check_repeaters()`` iterates all **repeat records** where the value of +``RepeatRecord.next_check`` is in the past. + +We iterate them in parallel by dividing them into partitions (four +partitions in production). Repeat records are partitioned using their +ID. (``partition = RepeatRecord.id % num_partitions``.) + +For each repeat record, ``check_repeaters_in_partition()`` calls +``RepeatRecord.attempt_forward_now(is_retry=True)``. (``is_retry`` is +set to ``True`` because when a repeat record is registered, +``RepeatRecord.attempt_forward_now()`` is called immediately, and the +repeat record is only enqueued if sending fails.) + +Execution ends up back in the ``tasks`` module when +``RepeatRecord.attempt_forward_now()`` calls +``_process_repeat_record()``. It runs a battery of checks, and if they +all succeed, ``RepeatRecord.fire()`` is called. + +This process has several disadvantages: + +* It iterates many repeat records that will not be sent. It has no way + to filter out the repeat records of paused or deleted repeaters. + +* It attempts to forward all the repeat records of a repeater, even if + every previous repeat record has failed. + + +process_repeaters() +------------------- + +The ``process_repeaters()`` task sends repeat records, but does so in a +way that takes advantage of foreign keys between repeaters and their +repeat records. + +This process is enabled using the ``PROCESS_REPEATERS`` feature flag. + +The ``iter_ready_repeater_ids_once()`` generator yields the IDs of +repeaters that have repeat records ready to be sent. It does so in a +round-robin fashion, cycling through the domains. It does this so that: + +* Domains and repeaters are not rate-limited unnecessarily. +* Remote APIs are not unintentionally DDoS-attacked by CommCare HQ. +* No domain has to wait while another domain consumes all the repeat + record queue workers. + +As long as there are repeat records ready to be sent, the +``iter_ready_repeater_ids_forever()`` generator will continue to yield +from ``iter_ready_repeater_ids_once()``. The ``process_repeaters()`` +task iterates these repeater IDs, and passes each one to the +``process_repeater()`` task. + +``process_repeater()`` fetches a batch of repeat records (the number is +set per repeater) and spawns tasks to send them in parallel. The results +of all the send attempts are passed to ``update_repeater()``. If all the +send attempts failed, the **repeater** (not the repeat record) is backed +off. If any send attempts succeeded, the backoff is reset. + +The intention of this process is not only to share repeat record queue +workers fairly across domains, but also to optimise workers by not +trying to send repeat records that are unlikely to succeed. + +""" import random import uuid from datetime import datetime, timedelta From 7cbfadbeeac689980903f9e3a222b33d4eff55cb Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sun, 22 Dec 2024 23:07:13 +0200 Subject: [PATCH 35/56] Use less aggressive backoff for repeaters --- corehq/motech/repeaters/const.py | 1 + corehq/motech/repeaters/models.py | 48 +++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/corehq/motech/repeaters/const.py b/corehq/motech/repeaters/const.py index 669dd1306965..63b91147c14e 100644 --- a/corehq/motech/repeaters/const.py +++ b/corehq/motech/repeaters/const.py @@ -6,6 +6,7 @@ MAX_RETRY_WAIT = timedelta(days=7) MIN_RETRY_WAIT = timedelta(minutes=60) +MIN_REPEATER_RETRY_WAIT = timedelta(minutes=5) # Repeaters back off slower RATE_LIMITER_DELAY_RANGE = ( timedelta(minutes=getattr(settings, 'MIN_REPEATER_RATE_LIMIT_DELAY', 0)), timedelta(minutes=getattr(settings, 'MAX_REPEATER_RATE_LIMIT_DELAY', 15)), diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index ae208142ac19..ea739b3bf313 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -103,8 +103,8 @@ XFormInstance, ) from corehq.motech.const import ( - MAX_REQUEST_LOG_LENGTH, ALL_REQUEST_METHODS, + MAX_REQUEST_LOG_LENGTH, REQUEST_POST, ) from corehq.motech.models import ConnectionSettings @@ -126,6 +126,7 @@ MAX_ATTEMPTS, MAX_BACKOFF_ATTEMPTS, MAX_RETRY_WAIT, + MIN_REPEATER_RETRY_WAIT, MIN_RETRY_WAIT, State, ) @@ -422,32 +423,49 @@ def num_workers(self): return min(num_workers, settings.MAX_REPEATER_WORKERS) def set_backoff(self): - now = datetime.utcnow() - interval = _get_retry_interval(self.last_attempt_at, now) - self.last_attempt_at = now - self.next_attempt_at = now + interval + self.next_attempt_at = self._get_next_attempt_at(self.last_attempt_at) + self.last_attempt_at = datetime.utcnow() # Save using QuerySet.update() to avoid a possible race condition # with self.pause(), etc. and to skip the unnecessary functionality # in RepeaterSuperProxy.save(). Repeater.objects.filter(id=self.repeater_id).update( - last_attempt_at=now, - next_attempt_at=now + interval, + last_attempt_at=self.last_attempt_at, + next_attempt_at=self.next_attempt_at, ) def reset_backoff(self): if self.last_attempt_at or self.next_attempt_at: - # `_get_retry_interval()` implements exponential backoff by - # multiplying the previous interval by 3. Set last_attempt_at - # to None so that the next time we need to back off, we - # know it is the first interval. + # `self._get_next_attempt_at()` uses `last_attempt_at` to + # determine the previous interval. Set it to `None` so that + # the next time we need to back off, we know it is the first + # interval. self.last_attempt_at = None self.next_attempt_at = None # Avoid a possible race condition with self.pause(), etc. Repeater.objects.filter(id=self.repeater_id).update( - last_attempt_at=None, - next_attempt_at=None, + last_attempt_at=self.last_attempt_at, + next_attempt_at=self.next_attempt_at, ) + @staticmethod + def _get_next_attempt_at(last_attempt_at): + """ + Calculates exponential backoff. + """ + # Repeat records back off aggressively: The previous interval is + # multiplied by 3, and the minimum wait time is an hour. + # Repeaters are more cautious because backing off will hold up + # all their repeat records. If a remote endpoint is just + # rate-limiting, backing off for an hour is too much. + # MIN_REPEATER_RETRY_WAIT is only 5 minutes. + if last_attempt_at: + interval = 2 * (datetime.utcnow() - last_attempt_at) + else: + interval = timedelta(0) + interval = max(MIN_REPEATER_RETRY_WAIT, interval) + interval = min(MAX_RETRY_WAIT, interval) + return datetime.utcnow() + interval + @property def verify(self): return not self.connection_settings.skip_cert_verify @@ -1267,10 +1285,10 @@ def fire(self, force_send=False, timing_context=None): def attempt_forward_now(self, *, is_retry=False, fire_synchronously=False): from corehq.motech.repeaters.tasks import ( - process_repeat_record, process_datasource_repeat_record, - retry_process_repeat_record, + process_repeat_record, retry_process_datasource_repeat_record, + retry_process_repeat_record, ) def is_new_synchronous_case_repeater_record(): From 7373ba6434fdc57edcef389785b72a00d54023df Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sun, 22 Dec 2024 23:08:36 +0200 Subject: [PATCH 36/56] Add TODOs --- corehq/motech/repeaters/const.py | 3 +++ corehq/motech/repeaters/models.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/corehq/motech/repeaters/const.py b/corehq/motech/repeaters/const.py index 63b91147c14e..d81cdae76337 100644 --- a/corehq/motech/repeaters/const.py +++ b/corehq/motech/repeaters/const.py @@ -20,6 +20,9 @@ # Number of attempts to an online endpoint before cancelling payload MAX_ATTEMPTS = 3 # Number of exponential backoff attempts to an offline endpoint +# TODO: Drop MAX_BACKOFF_ATTEMPTS. We don't need MAX_BACKOFF_ATTEMPTS +# because we are using MAX_RETRY_WAIT, and MAX_BACKOFF_ATTEMPTS is +# being conflated with MAX_ATTEMPTS. MAX_BACKOFF_ATTEMPTS = 6 diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index ea739b3bf313..c0deaf6dca0d 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -458,6 +458,9 @@ def _get_next_attempt_at(last_attempt_at): # all their repeat records. If a remote endpoint is just # rate-limiting, backing off for an hour is too much. # MIN_REPEATER_RETRY_WAIT is only 5 minutes. + + # TODO: With a less aggressive backoff, should MAX_ATTEMPTS be + # increased? if last_attempt_at: interval = 2 * (datetime.utcnow() - last_attempt_at) else: From f312b5a07dc50416748746841570f3cd3c808ef1 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 23 Dec 2024 00:24:15 +0200 Subject: [PATCH 37/56] Add tests --- .../motech/repeaters/tests/test_repeater.py | 54 +++++++++++++++---- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/corehq/motech/repeaters/tests/test_repeater.py b/corehq/motech/repeaters/tests/test_repeater.py index 1a85e027c880..c270859f7f0c 100644 --- a/corehq/motech/repeaters/tests/test_repeater.py +++ b/corehq/motech/repeaters/tests/test_repeater.py @@ -4,11 +4,8 @@ from datetime import datetime, timedelta from unittest.mock import Mock, patch -from corehq.util.json import CommCareJSONEncoder -from corehq.util.test_utils import flag_enabled - from django.test import SimpleTestCase, TestCase -from corehq.apps.userreports.pillow import ConfigurableReportPillowProcessor, ConfigurableReportTableManager + import attr from requests import RequestException @@ -27,10 +24,16 @@ DuplicateFormatException, IgnoreDocument, ) -from corehq.pillows.case import get_case_pillow -from corehq.apps.userreports.tests.utils import get_sample_data_source, get_sample_doc_and_indicators -from corehq.apps.userreports.util import get_indicator_adapter from corehq.apps.receiverwrapper.util import submit_form_locally +from corehq.apps.userreports.pillow import ( + ConfigurableReportPillowProcessor, + ConfigurableReportTableManager, +) +from corehq.apps.userreports.tests.utils import ( + get_sample_data_source, + get_sample_doc_and_indicators, +) +from corehq.apps.userreports.util import get_indicator_adapter from corehq.apps.users.models import CommCareUser from corehq.form_processor.models import CommCareCase, XFormInstance from corehq.form_processor.tests.utils import FormProcessorTestUtils @@ -38,6 +41,7 @@ from corehq.motech.repeaters.const import ( MAX_BACKOFF_ATTEMPTS, MAX_RETRY_WAIT, + MIN_REPEATER_RETRY_WAIT, MIN_RETRY_WAIT, State, ) @@ -47,8 +51,8 @@ FormRepeater, LocationRepeater, Repeater, - ShortFormRepeater, RepeatRecord, + ShortFormRepeater, UserRepeater, _get_retry_interval, format_response, @@ -62,6 +66,9 @@ _process_repeat_record, check_repeaters, ) +from corehq.pillows.case import get_case_pillow +from corehq.util.json import CommCareJSONEncoder +from corehq.util.test_utils import flag_enabled MockResponse = namedtuple('MockResponse', 'status_code reason') CASE_ID = "ABC123CASEID" @@ -1327,7 +1334,7 @@ def _create_log_and_repeat_record(self): return sample_doc, expected_indicators -class TestRaceCondition(TestCase): +class TestSetBackoff(TestCase): domain = 'test-race-condition' @classmethod @@ -1350,7 +1357,7 @@ def tearDownClass(cls): cls.connx.delete() super().tearDownClass() - def test_pause_and_set_next_attempt(self): + def test_race_condition(self): repeater_a = Repeater.objects.get(id=self.repeater.repeater_id) repeater_b = Repeater.objects.get(id=self.repeater.repeater_id) self.assertIsNone(repeater_a.next_attempt_at) @@ -1363,6 +1370,33 @@ def test_pause_and_set_next_attempt(self): self.assertIsNotNone(repeater_c.next_attempt_at) self.assertTrue(repeater_c.is_paused) + def test_initial_get_next_attempt_at(self): + next_attempt_at = self.repeater._get_next_attempt_at(None) + in_5_mins = datetime.utcnow() + MIN_REPEATER_RETRY_WAIT + self.assertEqual( + next_attempt_at.isoformat(timespec='seconds'), + in_5_mins.isoformat(timespec='seconds') + ) + + def test_get_next_attempt_at(self): + five_mins_ago = datetime.utcnow() - timedelta(minutes=5) + next_attempt_at = self.repeater._get_next_attempt_at(five_mins_ago) + # Double the last interval + in_10_mins = datetime.utcnow() + timedelta(minutes=10) + self.assertEqual( + next_attempt_at.isoformat(timespec='seconds'), + in_10_mins.isoformat(timespec='seconds') + ) + + def test_max_get_next_attempt_at(self): + last_month = datetime(2020, 1, 1, 0, 0, 0) + next_attempt_at = self.repeater._get_next_attempt_at(last_month) + in_7_days = datetime.utcnow() + MAX_RETRY_WAIT + self.assertEqual( + next_attempt_at.isoformat(timespec='seconds'), + in_7_days.isoformat(timespec='seconds') + ) + def fromisoformat(isoformat): """ From 9c8d9fb71f0e410cd3d78edb068b0c6bd22b9ada Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 6 Jan 2025 22:01:33 +0000 Subject: [PATCH 38/56] Return one row per repeater --- corehq/motech/repeaters/models.py | 2 ++ corehq/motech/repeaters/tests/test_models.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index c0deaf6dca0d..9ff703520d8e 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -266,6 +266,7 @@ def _all_ready_next_attempt_null(self): .filter(is_paused=False) .filter(next_attempt_at__isnull=True) .filter(repeat_records__state__in=(State.Pending, State.Fail)) + .distinct() ) def _all_ready_next_attempt_now(self): @@ -275,6 +276,7 @@ def _all_ready_next_attempt_now(self): .filter(is_paused=False) .filter(next_attempt_at__lte=timezone.now()) .filter(repeat_records__state__in=(State.Pending, State.Fail)) + .distinct() ) def get_queryset(self): diff --git a/corehq/motech/repeaters/tests/test_models.py b/corehq/motech/repeaters/tests/test_models.py index 296fd3e6db93..6faa2a32faf6 100644 --- a/corehq/motech/repeaters/tests/test_models.py +++ b/corehq/motech/repeaters/tests/test_models.py @@ -204,6 +204,15 @@ def test_all_ready_ids(self): {self.repeater.domain: [self.repeater.repeater_id]} ) + def test_all_ready_count_distinct(self): + with ( + make_repeat_record(self.repeater, RECORD_PENDING_STATE), + make_repeat_record(self.repeater, RECORD_PENDING_STATE), + make_repeat_record(self.repeater, RECORD_PENDING_STATE), + ): + count = Repeater.objects.all_ready_count() + self.assertEqual(count, 1) + @contextmanager def make_repeat_record(repeater, state): From 2a8c9811d3f55b76c5d6f5ef65290bbba3b312ff Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 8 Jan 2025 11:26:23 +0000 Subject: [PATCH 39/56] Drop RepeaterManager.all_ready() --- corehq/motech/repeaters/models.py | 9 ---- corehq/motech/repeaters/tests/test_models.py | 46 +++++++++++--------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 9ff703520d8e..017d0518d087 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -235,15 +235,6 @@ def __new__(cls, *args, **kwargs): class RepeaterManager(models.Manager): - def all_ready(self): - """ - Return all Repeaters ready to be forwarded. - """ - return self._all_ready_next_attempt_null().union( - self._all_ready_next_attempt_now(), - all=True, - ) - def get_all_ready_ids_by_domain(self): next_attempt_null = self._all_ready_next_attempt_null().values_list('domain', 'id') next_attempt_now = self._all_ready_next_attempt_now().values_list('domain', 'id') diff --git a/corehq/motech/repeaters/tests/test_models.py b/corehq/motech/repeaters/tests/test_models.py index 6faa2a32faf6..7edfca7bacca 100644 --- a/corehq/motech/repeaters/tests/test_models.py +++ b/corehq/motech/repeaters/tests/test_models.py @@ -150,51 +150,57 @@ def test_soft_delete(self): class RepeaterManagerTests(RepeaterTestCase): def test_all_ready_no_repeat_records(self): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 0) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual(len(repeater_ids), 0) def test_all_ready_pending_repeat_record(self): with make_repeat_record(self.repeater, RECORD_PENDING_STATE): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 1) - self.assertEqual(repeaters[0].id, self.repeater.id) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual( + dict(repeater_ids), + {self.repeater.domain: [self.repeater.repeater_id]} + ) def test_all_ready_failed_repeat_record(self): with make_repeat_record(self.repeater, RECORD_FAILURE_STATE): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 1) - self.assertEqual(repeaters[0].id, self.repeater.id) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual( + dict(repeater_ids), + {self.repeater.domain: [self.repeater.repeater_id]} + ) def test_all_ready_succeeded_repeat_record(self): with make_repeat_record(self.repeater, RECORD_SUCCESS_STATE): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 0) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual(len(repeater_ids), 0) def test_all_ready_cancelled_repeat_record(self): with make_repeat_record(self.repeater, RECORD_CANCELLED_STATE): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 0) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual(len(repeater_ids), 0) def test_all_ready_paused(self): with make_repeat_record(self.repeater, RECORD_PENDING_STATE), \ pause(self.repeater): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 0) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual(len(repeater_ids), 0) def test_all_ready_next_future(self): in_five_mins = timezone.now() + timedelta(minutes=5) with make_repeat_record(self.repeater, RECORD_PENDING_STATE), \ set_next_attempt_at(self.repeater, in_five_mins): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 0) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual(len(repeater_ids), 0) def test_all_ready_next_past(self): five_mins_ago = timezone.now() - timedelta(minutes=5) with make_repeat_record(self.repeater, RECORD_PENDING_STATE), \ set_next_attempt_at(self.repeater, five_mins_ago): - repeaters = Repeater.objects.all_ready() - self.assertEqual(len(repeaters), 1) - self.assertEqual(repeaters[0].id, self.repeater.id) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual( + dict(repeater_ids), + {self.repeater.domain: [self.repeater.repeater_id]} + ) def test_all_ready_ids(self): with make_repeat_record(self.repeater, RECORD_PENDING_STATE): @@ -998,5 +1004,5 @@ def test_none_response(self): def test_repeater_all_ready_union_all_sql(): - sql_str = str(Repeater.objects.all_ready().query) + sql_str = str(Repeater.objects.get_all_ready_ids_by_domain().query) assert_in('UNION ALL', sql_str) From c869ec65e84e6534969f5aa45ebb415f685e8754 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 8 Jan 2025 16:00:09 +0000 Subject: [PATCH 40/56] Refactor queries to count RepeatRecords --- corehq/motech/repeaters/models.py | 41 +++++++++++++------- corehq/motech/repeaters/tasks.py | 8 ++-- corehq/motech/repeaters/tests/test_models.py | 23 +++++++---- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index 017d0518d087..de4f1e7885f3 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -236,38 +236,42 @@ def __new__(cls, *args, **kwargs): class RepeaterManager(models.Manager): def get_all_ready_ids_by_domain(self): - next_attempt_null = self._all_ready_next_attempt_null().values_list('domain', 'id') - next_attempt_now = self._all_ready_next_attempt_now().values_list('domain', 'id') + next_attempt_null = ( + self.get_all_ready_next_attempt_null() + .distinct() + .values_list('domain', 'id') + ) + next_attempt_now = ( + self.get_all_ready_next_attempt_now() + .distinct() + .values_list('domain', 'id') + ) query = next_attempt_null.union(next_attempt_now, all=True) results = defaultdict(list) for (domain, id_uuid) in query.all(): results[domain].append(id_uuid.hex) return results - def all_ready_count(self): - return ( - self._all_ready_next_attempt_null().count() - + self._all_ready_next_attempt_now().count() - ) - - def _all_ready_next_attempt_null(self): - # Slower query. Uses deleted_partial_idx + def get_all_ready_next_attempt_null(self): + # See `_all_ready_next_attempt_now()`. Splitting the queries + # speeds up total query time by using different indexes. + # NOTE: Returns one row per RepeatRecord, not per Repeater. return ( self.get_queryset() .filter(is_paused=False) .filter(next_attempt_at__isnull=True) .filter(repeat_records__state__in=(State.Pending, State.Fail)) - .distinct() ) - def _all_ready_next_attempt_now(self): - # Fast query. Uses deleted_paused_partial_idx and state_partial_idx + def get_all_ready_next_attempt_now(self): + # See `_all_ready_next_attempt_null()`. Splitting the queries + # speeds up total query time by using different indexes. + # NOTE: Returns one row per RepeatRecord, not per Repeater. return ( self.get_queryset() .filter(is_paused=False) .filter(next_attempt_at__lte=timezone.now()) .filter(repeat_records__state__in=(State.Pending, State.Fail)) - .distinct() ) def get_queryset(self): @@ -1004,6 +1008,15 @@ def count_overdue(self, threshold=timedelta(minutes=10)): next_check__lt=datetime.utcnow() - threshold ).count() + @staticmethod + def count_all_ready(): + # Uses *Repeater* queries that return one row per RepeatRecord. + # See `RepeaterManager.get_all_ready_ids_by_domain()`. + return ( + Repeater.objects.get_all_ready_next_attempt_null().count() + + Repeater.objects.get_all_ready_next_attempt_now().count() + ) + def iterate(self, domain, repeater_id=None, state=None, chunk_size=1000): db = router.db_for_read(self.model) where = models.Q(domain=domain) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index f4a239314cd4..800d63e30441 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -551,11 +551,11 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): ) -# This metric monitors the number of Repeaters with RepeatRecords ready to -# be sent. A steep increase indicates a problem with `process_repeaters()`. +# This metric monitors the number of RepeatRecords ready to be sent. An +# unexpected increase indicates a problem with `process_repeaters()`. metrics_gauge_task( - 'commcare.repeaters.process_repeaters.all_ready_count', - Repeater.objects.all_ready_count, + 'commcare.repeaters.process_repeaters.count_all_ready', + RepeatRecord.objects.count_all_ready, run_every=crontab(minute='*/5'), # every five minutes multiprocess_mode=MPM_MAX ) diff --git a/corehq/motech/repeaters/tests/test_models.py b/corehq/motech/repeaters/tests/test_models.py index 7edfca7bacca..c1a05a845f36 100644 --- a/corehq/motech/repeaters/tests/test_models.py +++ b/corehq/motech/repeaters/tests/test_models.py @@ -210,14 +210,17 @@ def test_all_ready_ids(self): {self.repeater.domain: [self.repeater.repeater_id]} ) - def test_all_ready_count_distinct(self): + def test_distinct(self): with ( make_repeat_record(self.repeater, RECORD_PENDING_STATE), make_repeat_record(self.repeater, RECORD_PENDING_STATE), make_repeat_record(self.repeater, RECORD_PENDING_STATE), ): - count = Repeater.objects.all_ready_count() - self.assertEqual(count, 1) + repeater_ids = Repeater.objects.get_all_ready_ids_by_domain() + self.assertEqual( + dict(repeater_ids), + {self.repeater.domain: [self.repeater.repeater_id]} + ) @contextmanager @@ -807,6 +810,15 @@ def test_get_repeat_record_ids_with_all_filters(self): ) self.assertCountEqual([actual.id], ids) + def test_count(self): + with ( + make_repeat_record(self.repeater, RECORD_PENDING_STATE), + make_repeat_record(self.repeater, RECORD_PENDING_STATE), + make_repeat_record(self.repeater, RECORD_PENDING_STATE), + ): + count = RepeatRecord.objects.count_all_ready() + self.assertEqual(count, 3) + def new_record(self, next_check=before_now, state=State.Pending, domain=DOMAIN, payload_id="c0ffee"): return self.new_record_for_repeater( self.repeater, next_check=next_check, state=state, domain=domain, payload_id=payload_id @@ -1001,8 +1013,3 @@ def test_status_404_response(self): def test_none_response(self): self.assertFalse(is_success_response(None)) - - -def test_repeater_all_ready_union_all_sql(): - sql_str = str(Repeater.objects.get_all_ready_ids_by_domain().query) - assert_in('UNION ALL', sql_str) From 11c2b766ac33681dbc12a7645450d1bcf1a23fad Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 8 Jan 2025 16:14:16 +0000 Subject: [PATCH 41/56] Drop Datadog tag --- corehq/motech/repeaters/tasks.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 800d63e30441..1180fd2433fa 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -494,10 +494,7 @@ def _metrics_wait_duration(repeat_record): bucket_tag='duration', buckets=buckets, bucket_unit='s', - tags={ - 'domain': repeat_record.domain, - 'repeater': f'{repeat_record.domain}: {repeat_record.repeater.name}', - }, + tags={'domain': repeat_record.domain}, documentation=cleandoc(_metrics_wait_duration.__doc__) ) From 6e84a230ecbe93cc0c26a1ec6c1b2f0926b30cd7 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 8 Jan 2025 19:07:49 +0000 Subject: [PATCH 42/56] Tweak metrics --- corehq/motech/repeaters/tasks.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 1180fd2433fa..6b6c78e54e0b 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -353,6 +353,11 @@ def iter_ready_repeater_ids_forever(): if lock.acquire(blocking=False, token=lock_token): yielded = True yield domain, repeater_id, lock_token + else: + metrics_counter( + 'commcare.repeaters.process_repeaters.repeater_locked', + tags={'domain': domain}, + ) if not yielded: # No repeaters are ready, or their domains can't forward or @@ -528,10 +533,7 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): # All the payloads that were sent failed. Try again later. metrics_counter( 'commcare.repeaters.process_repeaters.repeater_backoff', - tags={ - 'domain': repeater.domain, - 'repeater': f'{repeater.domain}: {repeater.name}', - }, + tags={'domain': repeater.domain}, ) repeater.set_backoff() finally: From 96dd85b8b0cb494a8ba9d3fe543e61e493f951a8 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 9 Jan 2025 22:24:19 +0000 Subject: [PATCH 43/56] Set much shorter timeout on repeater lock --- corehq/motech/repeaters/tasks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 6b6c78e54e0b..d72fc306a0af 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -395,8 +395,9 @@ def iter_ready_repeater_ids_once(): def get_repeater_lock(repeater_id): name = f'process_repeater_{repeater_id}' - three_hours = 3 * 60 * 60 - return get_redis_lock(key=name, name=name, timeout=three_hours) + # Requests will time out after 5 minutes. Set timeout to double to be safe. + ten_minutes = 10 * 60 + return get_redis_lock(key=name, name=name, timeout=ten_minutes) def get_repeater_ids_by_domain(): From 55ad75ff6e0ce34d6cb8d0b7157297cb28ad641a Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 9 Jan 2025 22:33:20 +0000 Subject: [PATCH 44/56] Fix comments --- corehq/motech/repeaters/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/models.py b/corehq/motech/repeaters/models.py index de4f1e7885f3..822084534672 100644 --- a/corehq/motech/repeaters/models.py +++ b/corehq/motech/repeaters/models.py @@ -253,7 +253,7 @@ def get_all_ready_ids_by_domain(self): return results def get_all_ready_next_attempt_null(self): - # See `_all_ready_next_attempt_now()`. Splitting the queries + # See `get_all_ready_next_attempt_now()`. Splitting the queries # speeds up total query time by using different indexes. # NOTE: Returns one row per RepeatRecord, not per Repeater. return ( @@ -264,7 +264,7 @@ def get_all_ready_next_attempt_null(self): ) def get_all_ready_next_attempt_now(self): - # See `_all_ready_next_attempt_null()`. Splitting the queries + # See `get_all_ready_next_attempt_null()`. Splitting the queries # speeds up total query time by using different indexes. # NOTE: Returns one row per RepeatRecord, not per Repeater. return ( From 55846d3d481b3e301ef801f103b368a63b1b44d1 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 9 Jan 2025 22:47:07 +0000 Subject: [PATCH 45/56] Nit --- corehq/motech/repeaters/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index d72fc306a0af..1a003cedb064 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -50,7 +50,7 @@ round-robin fashion, cycling through the domains. It does this so that: * Domains and repeaters are not rate-limited unnecessarily. -* Remote APIs are not unintentionally DDoS-attacked by CommCare HQ. +* Remote APIs are not unintentionally DoS-attacked by CommCare HQ. * No domain has to wait while another domain consumes all the repeat record queue workers. From 0e5c5ec2a385c05f724e236953e659759fea54ca Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 9 Jan 2025 23:44:51 +0000 Subject: [PATCH 46/56] Fix test --- .../dimagi/utils/couch/tests/test_redis_lock.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py b/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py index c5a8bd2d38f5..696d6af86312 100644 --- a/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py +++ b/corehq/ex-submodules/dimagi/utils/couch/tests/test_redis_lock.py @@ -4,7 +4,7 @@ from dimagi.utils.couch import get_redis_lock -from corehq.tests.noseplugins.redislocks import TestLock +from corehq.tests.pytest_plugins.redislocks import TestLock from corehq.util.metrics.lockmeter import MeteredLock @@ -12,8 +12,7 @@ def test_get_redis_lock_with_token(): lock_name = 'test-1' metered_lock = get_redis_lock(key=lock_name, name=lock_name, timeout=1) assert isinstance(metered_lock, MeteredLock) - # metered_lock.lock is a TestLock instance because of - # corehq.tests.noseplugins.redislocks.RedisLockTimeoutPlugin + # metered_lock.lock is a TestLock instance test_lock = metered_lock.lock assert isinstance(test_lock, TestLock) redis_lock = test_lock.lock From 79377f85d9256bb6bf2b261b460bf0d4299b0981 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 10 Jan 2025 17:02:14 +0000 Subject: [PATCH 47/56] Don't spawn a subtask for `process_repeater()` --- corehq/motech/repeaters/tasks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 1a003cedb064..9e47aae5a162 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -325,7 +325,7 @@ def process_repeaters(): metrics_counter('commcare.repeaters.process_repeaters.start') try: for domain, repeater_id, lock_token in iter_ready_repeater_ids_forever(): - process_repeater.delay(domain, repeater_id, lock_token) + process_repeater(domain, repeater_id, lock_token) finally: process_repeater_lock.release() @@ -414,7 +414,6 @@ def get_repeater_ids_by_domain(): } -@task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) def process_repeater(domain, repeater_id, lock_token): def get_task_signature(repeat_record): From 9604d8ae1cb1e5e2e74fb4db089706acec1a9f23 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 10 Jan 2025 19:29:29 +0000 Subject: [PATCH 48/56] Management command to expire lock --- .../commands/expire_process_repeaters_lock.py | 16 +++++++++++++++ corehq/motech/repeaters/tasks.py | 20 +++++++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) create mode 100644 corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py diff --git a/corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py b/corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py new file mode 100644 index 000000000000..2b659e80936e --- /dev/null +++ b/corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py @@ -0,0 +1,16 @@ +from django.core.management.base import BaseCommand + +from dimagi.utils.couch.cache.cache_core import get_redis_client + +from corehq.motech.repeaters.const import PROCESS_REPEATERS_KEY + + +class Command(BaseCommand): + help = """ + If the `process_repeaters()` task was killed and the lock was not + released, this command expires the lock and allows the task to start. + """ + + def handle(self, *args, **options): + client = get_redis_client() + client.expire(PROCESS_REPEATERS_KEY, timeout=0) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 9e47aae5a162..1152cfb107ac 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -306,20 +306,20 @@ def process_repeaters(): Processes repeaters, instead of processing repeat records independently the way that ``check_repeaters()`` does. """ - process_repeater_lock = get_redis_lock( + + # NOTE: If `process_repeaters()` was killed and `process_repeaters_lock` + # was not released, expire the lock to allow `process_repeaters()` + # to start: + # + # $ ./manage.py expire_process_repeaters_lock + # + + process_repeaters_lock = get_redis_lock( PROCESS_REPEATERS_KEY, timeout=None, # Iterating repeaters forever is fine name=PROCESS_REPEATERS_KEY, ) - # How to recover from a crash: If `process_repeaters()` needs to be - # restarted and `process_repeater_lock` was not released, expire the - # lock to allow `process_repeaters()` to start: - # - # >>> from dimagi.utils.couch.cache.cache_core import get_redis_client - # >>> from corehq.motech.repeaters.const import PROCESS_REPEATERS_KEY - # >>> client = get_redis_client() - # >>> client.expire(PROCESS_REPEATERS_KEY, timeout=0) - if not process_repeater_lock.acquire(blocking=False): + if not process_repeaters_lock.acquire(blocking=False): return metrics_counter('commcare.repeaters.process_repeaters.start') From 964ae323e58d1ff8fedcbf802cfe0ecb11e57d9f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 10 Jan 2025 19:30:25 +0000 Subject: [PATCH 49/56] An alternate approach to iterating until complete --- corehq/motech/repeaters/tasks.py | 115 +++++-------- corehq/motech/repeaters/tests/test_tasks.py | 178 ++------------------ 2 files changed, 57 insertions(+), 236 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 1152cfb107ac..142057a51f67 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -72,13 +72,12 @@ """ import random -import uuid from datetime import datetime, timedelta from inspect import cleandoc from django.conf import settings -from celery import chord +from celery import chord, group from celery.schedules import crontab from celery.utils.log import get_task_logger @@ -322,50 +321,35 @@ def process_repeaters(): if not process_repeaters_lock.acquire(blocking=False): return - metrics_counter('commcare.repeaters.process_repeaters.start') try: - for domain, repeater_id, lock_token in iter_ready_repeater_ids_forever(): - process_repeater(domain, repeater_id, lock_token) - finally: - process_repeater_lock.release() - - -def iter_ready_repeater_ids_forever(): - """ - Cycles through repeaters (repeatedly ;) ) until there are no more - repeat records ready to be sent. - """ - while True: - yielded = False - for domain, repeater_id in iter_ready_repeater_ids_once(): - if not domain_can_forward_now(domain): - continue - if rate_limit_repeater(domain, repeater_id): - continue + metrics_counter('commcare.repeaters.process_repeaters.start') + while True: + metrics_counter('commcare.repeaters.process_repeaters.iter_once') + tasks = [] + for domain, repeater_id in iter_ready_repeater_ids(): + if not domain_can_forward_now(domain): + continue + if rate_limit_repeater(domain, repeater_id): + continue + tasks.append(process_repeater_chord(domain, repeater_id)) + + if tasks: + result = group(*tasks).apply_async() + # Waiting for the result of a subtask is discouraged + # https://docs.celeryq.dev/en/stable/userguide/tasks.html#avoid-launching-synchronous-subtasks + # but in this situation it is safe because the + # `process_repeater()` tasks use a different queue. + result.get(disable_sync_subtasks=False) - lock = get_repeater_lock(repeater_id) - # Generate a lock token using `uuid1()` the same way that - # `redis.lock.Lock` does. The `Lock` class uses the token to - # determine ownership, so that one process can acquire a - # lock and a different process can release it. This lock - # will be released by the `update_repeater()` task. - lock_token = uuid.uuid1().hex - if lock.acquire(blocking=False, token=lock_token): - yielded = True - yield domain, repeater_id, lock_token else: - metrics_counter( - 'commcare.repeaters.process_repeaters.repeater_locked', - tags={'domain': domain}, - ) - - if not yielded: - # No repeaters are ready, or their domains can't forward or - # are paused. - return + break + + finally: + process_repeaters_lock.release() + metrics_counter('commcare.repeaters.process_repeaters.complete') -def iter_ready_repeater_ids_once(): +def iter_ready_repeater_ids(): """ Yields domain-repeater_id tuples in a round-robin fashion. @@ -378,7 +362,6 @@ def iter_ready_repeater_ids_once(): ... """ - metrics_counter('commcare.repeaters.process_repeaters.iter_once') repeater_ids_by_domain = get_repeater_ids_by_domain() while True: if not repeater_ids_by_domain: @@ -393,13 +376,6 @@ def iter_ready_repeater_ids_once(): yield domain, repeater_id -def get_repeater_lock(repeater_id): - name = f'process_repeater_{repeater_id}' - # Requests will time out after 5 minutes. Set timeout to double to be safe. - ten_minutes = 10 * 60 - return get_redis_lock(key=name, name=name, timeout=ten_minutes) - - def get_repeater_ids_by_domain(): repeater_ids_by_domain = Repeater.objects.get_all_ready_ids_by_domain() always_enabled_domains = set(toggles.PROCESS_REPEATERS.get_enabled_domains()) @@ -414,7 +390,7 @@ def get_repeater_ids_by_domain(): } -def process_repeater(domain, repeater_id, lock_token): +def process_repeater_chord(domain, repeater_id): def get_task_signature(repeat_record): task_ = { @@ -426,7 +402,7 @@ def get_task_signature(repeat_record): repeater = Repeater.objects.get(domain=domain, id=repeater_id) repeat_records = repeater.repeat_records_ready[:repeater.num_workers] header_tasks = [get_task_signature(rr) for rr in repeat_records] - chord(header_tasks)(update_repeater.s(repeater_id, lock_token)) + return chord(header_tasks, update_repeater.s(repeater_id)) @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) @@ -515,31 +491,26 @@ def _get_wait_duration_seconds(repeat_record): @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) -def update_repeater(repeat_record_states, repeater_id, lock_token): +def update_repeater(repeat_record_states, repeater_id): """ Determines whether the repeater should back off, based on the results of ``_process_repeat_record()`` tasks. """ - try: - if all(s in (State.Empty, None) for s in repeat_record_states): - # We can't tell anything about the remote endpoint. - return - success_or_invalid = (State.Success, State.InvalidPayload) - repeater = Repeater.objects.get(id=repeater_id) - if any(s in success_or_invalid for s in repeat_record_states): - # The remote endpoint appears to be healthy. - repeater.reset_backoff() - else: - # All the payloads that were sent failed. Try again later. - metrics_counter( - 'commcare.repeaters.process_repeaters.repeater_backoff', - tags={'domain': repeater.domain}, - ) - repeater.set_backoff() - finally: - lock = get_repeater_lock(repeater_id) - lock.local.token = lock_token - lock.release() + if all(s in (State.Empty, None) for s in repeat_record_states): + # We can't tell anything about the remote endpoint. + return + success_or_invalid = (State.Success, State.InvalidPayload) + repeater = Repeater.objects.get(id=repeater_id) + if any(s in success_or_invalid for s in repeat_record_states): + # The remote endpoint appears to be healthy. + repeater.reset_backoff() + else: + # All the payloads that were sent failed. Try again later. + metrics_counter( + 'commcare.repeaters.process_repeaters.repeater_backoff', + tags={'domain': repeater.domain}, + ) + repeater.set_backoff() metrics_gauge_task( diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 17eb0561ebda..75f13e5e9886 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -21,8 +21,7 @@ _process_repeat_record, delete_old_request_logs, get_repeater_ids_by_domain, - iter_ready_repeater_ids_forever, - iter_ready_repeater_ids_once, + iter_ready_repeater_ids, process_repeater, update_repeater, ) @@ -257,139 +256,6 @@ def patch(self): self.addCleanup(patch_domain_can_forward.stop) -class TestIterReadyRepeaterIDsForever(SimpleTestCase): - - @staticmethod - def all_ready_ids_by_domain(): - return [ - { - # See test_iter_ready_repeater_ids_once() - 'domain1': ['repeater_id1', 'repeater_id2', 'repeater_id3'], - 'domain2': ['repeater_id4', 'repeater_id5'], - 'domain3': ['repeater_id6'], - }, - { - 'domain1': ['repeater_id1', 'repeater_id2'], - 'domain2': ['repeater_id4'] - }, - {}, - ] - - def test_no_ready_repeaters(self): - with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - return_value={}), # <-- - patch('corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain1', 'domain2', 'domain3']), - ): - self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) - - def test_domain_cant_forward_now(self): - with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - side_effect=self.all_ready_ids_by_domain()), - patch('corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=False), # <-- - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain1', 'domain2', 'domain3']), - ): - self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) - - def test_process_repeaters_not_enabled(self): - with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - side_effect=self.all_ready_ids_by_domain()), - patch('corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=[]), # <-- - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - return_value=False), # <-- - ): - self.assertFalse(next(iter_ready_repeater_ids_forever(), False)) - - def test_successive_loops(self): - with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - side_effect=self.all_ready_ids_by_domain()), - patch('corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain1', 'domain2', 'domain3']), - patch('corehq.motech.repeaters.tasks.rate_limit_repeater', - return_value=False), - patch('corehq.motech.repeaters.tasks.get_repeater_lock'), - ): - repeaters = list(iter_ready_repeater_ids_forever()) - self.assertEqual(len(repeaters), 9) - repeater_ids = [(r[0], r[1]) for r in repeaters] - self.assertEqual(repeater_ids, [ - # First loop - ('domain1', 'repeater_id3'), - ('domain2', 'repeater_id5'), - ('domain3', 'repeater_id6'), - ('domain1', 'repeater_id2'), - ('domain2', 'repeater_id4'), - ('domain1', 'repeater_id1'), - - # Second loop - ('domain1', 'repeater_id2'), - ('domain2', 'repeater_id4'), - ('domain1', 'repeater_id1'), - ]) - - def test_rate_limit(self): - with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - side_effect=self.all_ready_ids_by_domain()), - patch('corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain1', 'domain2', 'domain3']), - patch('corehq.motech.repeaters.tasks.rate_limit_repeater', - side_effect=lambda dom, rep: dom == 'domain2' and rep == 'repeater_id4'), - patch('corehq.motech.repeaters.tasks.get_repeater_lock'), - ): - repeaters = list(iter_ready_repeater_ids_forever()) - self.assertEqual(len(repeaters), 7) - repeater_ids = [(r[0], r[1]) for r in repeaters] - self.assertEqual(repeater_ids, [ - ('domain1', 'repeater_id3'), - ('domain2', 'repeater_id5'), - ('domain3', 'repeater_id6'), - ('domain1', 'repeater_id2'), - ('domain1', 'repeater_id1'), - ('domain1', 'repeater_id2'), - ('domain1', 'repeater_id1'), - ]) - - def test_disabled_domains_excluded(self): - with ( - patch('corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', - side_effect=self.all_ready_ids_by_domain()), - patch('corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=True), - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.get_enabled_domains', - return_value=['domain2']), # <-- - patch('corehq.motech.repeaters.tasks.toggles.PROCESS_REPEATERS.enabled', - side_effect=lambda dom, __: dom == 'domain3'), # <-- - patch('corehq.motech.repeaters.tasks.rate_limit_repeater', - return_value=False), - patch('corehq.motech.repeaters.tasks.get_repeater_lock'), - ): - repeaters = list(iter_ready_repeater_ids_forever()) - self.assertEqual(len(repeaters), 4) - repeater_ids = [(r[0], r[1]) for r in repeaters] - self.assertEqual(repeater_ids, [ - ('domain2', 'repeater_id5'), - ('domain3', 'repeater_id6'), - ('domain2', 'repeater_id4'), - ('domain2', 'repeater_id4'), - ]) - - def test_iter_ready_repeater_ids_once(): with ( patch( @@ -405,7 +271,7 @@ def test_iter_ready_repeater_ids_once(): return_value=['domain1', 'domain2', 'domain3'], ), ): - pairs = list(iter_ready_repeater_ids_once()) + pairs = list(iter_ready_repeater_ids()) assert_equal(pairs, [ # First round of domains ('domain1', 'repeater_id3'), @@ -482,10 +348,7 @@ def test_process_repeater_sends_repeat_record(self): ) self.repeater.register(payload) - with ( - patch('corehq.motech.repeaters.models.simple_request') as request_mock, - patch('corehq.motech.repeaters.tasks.get_repeater_lock') - ): + with patch('corehq.motech.repeaters.models.simple_request') as request_mock: request_mock.return_value = ResponseMock(status_code=200, reason='OK') process_repeater(DOMAIN, self.repeater.repeater_id, 'token') @@ -500,10 +363,7 @@ def test_process_repeater_updates_repeater(self): ) self.repeater.register(payload) - with ( - patch('corehq.motech.repeaters.models.simple_request') as request_mock, - patch('corehq.motech.repeaters.tasks.get_repeater_lock') - ): + with patch('corehq.motech.repeaters.models.simple_request') as request_mock: request_mock.return_value = ResponseMock( status_code=429, reason='Too Many Requests', @@ -516,62 +376,52 @@ def test_process_repeater_updates_repeater(self): @flag_enabled('PROCESS_REPEATERS') class TestUpdateRepeater(SimpleTestCase): - @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): + def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater): repeat_record_states = [State.Success, State.Fail, State.Empty, None] - mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() - @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater, __): + def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater): repeat_record_states = [State.InvalidPayload, State.Fail, State.Empty, None] - mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() - @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): + def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater): repeat_record_states = [State.Fail, State.Empty, None] - mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1) mock_repeater.set_backoff.assert_called_once() mock_repeater.reset_backoff.assert_not_called() - @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): + def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater): repeat_record_states = [State.Empty] - mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() - @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): + def test_update_repeater_does_nothing_on_none(self, mock_get_repeater): repeat_record_states = [None] - mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() From b801159fdfedd765fadf530f3ae79c308da02c3b Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 10 Jan 2025 23:11:59 +0000 Subject: [PATCH 50/56] Clean up tests --- corehq/motech/repeaters/tests/test_tasks.py | 111 +++----------------- 1 file changed, 13 insertions(+), 98 deletions(-) diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 75f13e5e9886..bcfba408eacd 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -1,37 +1,26 @@ from collections import namedtuple -from contextlib import contextmanager from datetime import datetime, timedelta from unittest.mock import MagicMock, patch -from uuid import uuid4 from django.test import SimpleTestCase, TestCase -from nose.tools import assert_equal +from freezegun import freeze_time -from corehq.apps.receiverwrapper.util import submit_form_locally -from corehq.form_processor.models import XFormInstance -from corehq.form_processor.utils.xform import ( - FormSubmissionBuilder, - TestFormMetadata, -) from corehq.motech.models import ConnectionSettings, RequestLog -from corehq.motech.repeaters.models import FormRepeater, Repeater, RepeatRecord -from corehq.motech.repeaters.tasks import ( +from corehq.util.test_utils import flag_enabled + +from ..const import State +from ..models import FormRepeater, Repeater, RepeatRecord +from ..tasks import ( _get_wait_duration_seconds, _process_repeat_record, delete_old_request_logs, get_repeater_ids_by_domain, iter_ready_repeater_ids, - process_repeater, update_repeater, ) -from corehq.util.test_utils import _create_case, flag_enabled - -from ..const import State -DOMAIN = 'gaidhlig' -PAYLOAD_IDS = ['aon', 'dha', 'trì', 'ceithir', 'coig', 'sia', 'seachd', 'ochd', - 'naoi', 'deich'] +DOMAIN = 'test-tasks' ResponseMock = namedtuple('ResponseMock', 'status_code reason') @@ -79,20 +68,6 @@ def test_num_queries_chunked(self): self.assertEqual(count, 0) -@contextmanager -def form_context(form_ids): - for form_id in form_ids: - builder = FormSubmissionBuilder( - form_id=form_id, - metadata=TestFormMetadata(domain=DOMAIN), - ) - submit_form_locally(builder.as_xml_string(), DOMAIN) - try: - yield - finally: - XFormInstance.objects.hard_delete_forms(DOMAIN, form_ids) - - class TestProcessRepeatRecord(TestCase): def test_returns_if_record_is_cancelled(self): @@ -256,7 +231,7 @@ def patch(self): self.addCleanup(patch_domain_can_forward.stop) -def test_iter_ready_repeater_ids_once(): +def test_iter_ready_repeater_ids(): with ( patch( 'corehq.motech.repeaters.tasks.Repeater.objects.get_all_ready_ids_by_domain', @@ -272,7 +247,7 @@ def test_iter_ready_repeater_ids_once(): ), ): pairs = list(iter_ready_repeater_ids()) - assert_equal(pairs, [ + assert pairs == [ # First round of domains ('domain1', 'repeater_id3'), ('domain2', 'repeater_id5'), @@ -284,7 +259,7 @@ def test_iter_ready_repeater_ids_once(): # Third round ('domain1', 'repeater_id1'), - ]) + ] def test_get_repeater_ids_by_domain(): @@ -306,71 +281,10 @@ def test_get_repeater_ids_by_domain(): side_effect=lambda dom, __: dom == 'domain3'), ): repeater_ids_by_domain = get_repeater_ids_by_domain() - assert_equal(repeater_ids_by_domain, { + assert repeater_ids_by_domain == { 'domain2': ['repeater_id4', 'repeater_id5'], 'domain3': ['repeater_id6'], - }) - - -@flag_enabled('PROCESS_REPEATERS') -class TestProcessRepeater(TestCase): - - @classmethod - def setUpClass(cls): - super().setUpClass() - - can_forward_now_patch = patch( - 'corehq.motech.repeaters.tasks.domain_can_forward_now', - return_value=True, - ) - can_forward_now_patch = can_forward_now_patch.start() - cls.addClassCleanup(can_forward_now_patch.stop) - - cls.set_backoff_patch = patch.object(FormRepeater, 'set_backoff') - cls.set_backoff_patch.start() - cls.addClassCleanup(cls.set_backoff_patch.stop) - - connection_settings = ConnectionSettings.objects.create( - domain=DOMAIN, - url='http://www.example.com/api/' - ) - cls.repeater = FormRepeater.objects.create( - domain=DOMAIN, - connection_settings=connection_settings, - ) - - def test_process_repeater_sends_repeat_record(self): - payload, __ = _create_case( - domain=DOMAIN, - case_id=str(uuid4()), - case_type='case', - owner_id='abc123' - ) - self.repeater.register(payload) - - with patch('corehq.motech.repeaters.models.simple_request') as request_mock: - request_mock.return_value = ResponseMock(status_code=200, reason='OK') - process_repeater(DOMAIN, self.repeater.repeater_id, 'token') - - request_mock.assert_called_once() - - def test_process_repeater_updates_repeater(self): - payload, __ = _create_case( - domain=DOMAIN, - case_id=str(uuid4()), - case_type='case', - owner_id='abc123' - ) - self.repeater.register(payload) - - with patch('corehq.motech.repeaters.models.simple_request') as request_mock: - request_mock.return_value = ResponseMock( - status_code=429, - reason='Too Many Requests', - ) - process_repeater(DOMAIN, self.repeater.repeater_id, 'token') - - self.repeater.set_backoff.assert_called_once() + } @flag_enabled('PROCESS_REPEATERS') @@ -427,6 +341,7 @@ def test_update_repeater_does_nothing_on_none(self, mock_get_repeater): mock_repeater.reset_backoff.assert_not_called() +@freeze_time('2025-01-01') class TestGetWaitDurationSeconds(TestCase): @classmethod From 90bf28144bff7308e8d68bbad9562c9fe6a0d987 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Sat, 11 Jan 2025 16:56:12 +0000 Subject: [PATCH 51/56] Update docstrings --- corehq/motech/repeaters/tasks.py | 55 +++++++++++++++++--------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index 142057a51f67..bd6c0dda6e80 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -17,13 +17,8 @@ ID. (``partition = RepeatRecord.id % num_partitions``.) For each repeat record, ``check_repeaters_in_partition()`` calls -``RepeatRecord.attempt_forward_now(is_retry=True)``. (``is_retry`` is -set to ``True`` because when a repeat record is registered, -``RepeatRecord.attempt_forward_now()`` is called immediately, and the -repeat record is only enqueued if sending fails.) - -Execution ends up back in the ``tasks`` module when -``RepeatRecord.attempt_forward_now()`` calls +``RepeatRecord.attempt_forward_now()``. Execution ends up back in the +``tasks`` module when ``RepeatRecord.attempt_forward_now()`` calls ``_process_repeat_record()``. It runs a battery of checks, and if they all succeed, ``RepeatRecord.fire()`` is called. @@ -35,6 +30,12 @@ * It attempts to forward all the repeat records of a repeater, even if every previous repeat record has failed. +* We don't have a way to send repeat records in chronological order. + +* Controlling the rate at which repeat records are sent is difficult. + (e.g. Currently we use a separate queue to reduce the rate of sending + the payloads of data source repeaters.) + process_repeaters() ------------------- @@ -45,30 +46,29 @@ This process is enabled using the ``PROCESS_REPEATERS`` feature flag. -The ``iter_ready_repeater_ids_once()`` generator yields the IDs of -repeaters that have repeat records ready to be sent. It does so in a -round-robin fashion, cycling through the domains. It does this so that: +The ``iter_ready_repeater_ids()`` generator yields the IDs of repeaters +that have repeat records ready to be sent. It does so in a round-robin +fashion, cycling through their domains. It does this so that: * Domains and repeaters are not rate-limited unnecessarily. -* Remote APIs are not unintentionally DoS-attacked by CommCare HQ. +* CommCare HQ does not DoS-attack remote APIs by unintentionally sending + repeat records in large chunks. * No domain has to wait while another domain consumes all the repeat record queue workers. -As long as there are repeat records ready to be sent, the -``iter_ready_repeater_ids_forever()`` generator will continue to yield -from ``iter_ready_repeater_ids_once()``. The ``process_repeaters()`` -task iterates these repeater IDs, and passes each one to the -``process_repeater()`` task. +``process_repeaters()`` creates a group of Celery tasks where each task +is a Celery chord for one repeater. The chord sends the next batch of +the repeater's repeat records, and then updates the repeater with the +results of those send attempts. The batch size defaults to 7, and can +be configured per repeater. Setting the batch size to 1 will send +repeat records in chronological order. If the remote endpoint is +offline or consistently returns errors, then the repeater will be +updated to back off. If any send attempts succeeded and a backoff had +been set, then the backoff is reset. -``process_repeater()`` fetches a batch of repeat records (the number is -set per repeater) and spawns tasks to send them in parallel. The results -of all the send attempts are passed to ``update_repeater()``. If all the -send attempts failed, the **repeater** (not the repeat record) is backed -off. If any send attempts succeeded, the backoff is reset. - -The intention of this process is not only to share repeat record queue -workers fairly across domains, but also to optimise workers by not -trying to send repeat records that are unlikely to succeed. +``process_repeaters()`` runs the group of tasks in parallel. When they +have completed, ``process_repeaters()`` loops through the repeaters +again, until there are no more repeat records ready to be sent. """ import random @@ -391,6 +391,9 @@ def get_repeater_ids_by_domain(): def process_repeater_chord(domain, repeater_id): + """ + Returns a Celery chord to process a repeater. + """ def get_task_signature(repeat_record): task_ = { @@ -494,7 +497,7 @@ def _get_wait_duration_seconds(repeat_record): def update_repeater(repeat_record_states, repeater_id): """ Determines whether the repeater should back off, based on the - results of ``_process_repeat_record()`` tasks. + results of ``process_ready_repeat_record()``. """ if all(s in (State.Empty, None) for s in repeat_record_states): # We can't tell anything about the remote endpoint. From d0f13a449194fed2250d4f57977e3bf4cdbbb6da Mon Sep 17 00:00:00 2001 From: Norman Hooper <708421+kaapstorm@users.noreply.github.com> Date: Tue, 14 Jan 2025 19:11:35 +0000 Subject: [PATCH 52/56] Improve wording Co-authored-by: Daniel Miller --- corehq/motech/repeaters/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index bd6c0dda6e80..d3ea81a0c148 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -51,8 +51,8 @@ fashion, cycling through their domains. It does this so that: * Domains and repeaters are not rate-limited unnecessarily. -* CommCare HQ does not DoS-attack remote APIs by unintentionally sending - repeat records in large chunks. +* CommCare HQ tries to avoid flooding remote APIs by distributing the + load among all active repeaters. * No domain has to wait while another domain consumes all the repeat record queue workers. From 03197dc94622d1b201452b090353780b4c7c69a3 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 15 Jan 2025 11:06:43 +0000 Subject: [PATCH 53/56] Wait for random task instead of all tasks --- corehq/motech/repeaters/tasks.py | 81 +++++++++++++++------ corehq/motech/repeaters/tests/test_tasks.py | 25 ++++--- 2 files changed, 72 insertions(+), 34 deletions(-) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index d3ea81a0c148..a6b81c9fe804 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -72,6 +72,7 @@ """ import random +import uuid from datetime import datetime, timedelta from inspect import cleandoc @@ -331,14 +332,34 @@ def process_repeaters(): continue if rate_limit_repeater(domain, repeater_id): continue - tasks.append(process_repeater_chord(domain, repeater_id)) + + lock = get_repeater_lock(repeater_id) + # Generate a lock token using `uuid1()` the same way that + # `redis.lock.Lock` does. The `Lock` class uses the token to + # determine ownership, so that one process can acquire a + # lock and a different process can release it. This lock + # will be released by the `update_repeater()` task. + lock_token = uuid.uuid1().hex + if lock.acquire(blocking=False, token=lock_token): + tasks.append(process_repeater_chord( + domain, + repeater_id, + lock_token, + )) if tasks: - result = group(*tasks).apply_async() - # Waiting for the result of a subtask is discouraged - # https://docs.celeryq.dev/en/stable/userguide/tasks.html#avoid-launching-synchronous-subtasks - # but in this situation it is safe because the - # `process_repeater()` tasks use a different queue. + random_task_num = random.randrange(len(tasks)) + random_task = tasks.pop(random_task_num) + if tasks: + group(*tasks).apply_async() + result = random_task.apply_async() + # Wait for at least one task to complete so that + # `iter_ready_repeater_ids()` will return a different + # set of results. Waiting for the result of a subtask is + # discouraged, but in this situation it is safe because + # the tasks for processing a repeater use a different + # queue. + # cf. https://docs.celeryq.dev/en/stable/userguide/tasks.html#avoid-launching-synchronous-subtasks result.get(disable_sync_subtasks=False) else: @@ -376,6 +397,13 @@ def iter_ready_repeater_ids(): yield domain, repeater_id +def get_repeater_lock(repeater_id): + name = f'process_repeater_{repeater_id}' + # Requests will time out after 5 minutes. Set timeout to double to be safe. + ten_minutes = 10 * 60 + return get_redis_lock(key=name, name=name, timeout=ten_minutes) + + def get_repeater_ids_by_domain(): repeater_ids_by_domain = Repeater.objects.get_all_ready_ids_by_domain() always_enabled_domains = set(toggles.PROCESS_REPEATERS.get_enabled_domains()) @@ -390,7 +418,7 @@ def get_repeater_ids_by_domain(): } -def process_repeater_chord(domain, repeater_id): +def process_repeater_chord(domain, repeater_id, lock_token): """ Returns a Celery chord to process a repeater. """ @@ -405,7 +433,7 @@ def get_task_signature(repeat_record): repeater = Repeater.objects.get(domain=domain, id=repeater_id) repeat_records = repeater.repeat_records_ready[:repeater.num_workers] header_tasks = [get_task_signature(rr) for rr in repeat_records] - return chord(header_tasks, update_repeater.s(repeater_id)) + return chord(header_tasks, update_repeater.s(repeater_id, lock_token)) @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) @@ -494,26 +522,31 @@ def _get_wait_duration_seconds(repeat_record): @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) -def update_repeater(repeat_record_states, repeater_id): +def update_repeater(repeat_record_states, repeater_id, lock_token): """ Determines whether the repeater should back off, based on the results of ``process_ready_repeat_record()``. """ - if all(s in (State.Empty, None) for s in repeat_record_states): - # We can't tell anything about the remote endpoint. - return - success_or_invalid = (State.Success, State.InvalidPayload) - repeater = Repeater.objects.get(id=repeater_id) - if any(s in success_or_invalid for s in repeat_record_states): - # The remote endpoint appears to be healthy. - repeater.reset_backoff() - else: - # All the payloads that were sent failed. Try again later. - metrics_counter( - 'commcare.repeaters.process_repeaters.repeater_backoff', - tags={'domain': repeater.domain}, - ) - repeater.set_backoff() + try: + if all(s in (State.Empty, None) for s in repeat_record_states): + # We can't tell anything about the remote endpoint. + return + success_or_invalid = (State.Success, State.InvalidPayload) + repeater = Repeater.objects.get(id=repeater_id) + if any(s in success_or_invalid for s in repeat_record_states): + # The remote endpoint appears to be healthy. + repeater.reset_backoff() + else: + # All the payloads that were sent failed. Try again later. + metrics_counter( + 'commcare.repeaters.process_repeaters.repeater_backoff', + tags={'domain': repeater.domain}, + ) + repeater.set_backoff() + finally: + lock = get_repeater_lock(repeater_id) + lock.local.token = lock_token + lock.release() metrics_gauge_task( diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index bcfba408eacd..79ee59613eef 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -290,52 +290,57 @@ def test_get_repeater_ids_by_domain(): @flag_enabled('PROCESS_REPEATERS') class TestUpdateRepeater(SimpleTestCase): + @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater): + def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): repeat_record_states = [State.Success, State.Fail, State.Empty, None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1) + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() + @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater): + def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater, __): repeat_record_states = [State.InvalidPayload, State.Fail, State.Empty, None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1) + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() + @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater): + def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): repeat_record_states = [State.Fail, State.Empty, None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1) + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_called_once() mock_repeater.reset_backoff.assert_not_called() + @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater): + def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): repeat_record_states = [State.Empty] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1) + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() + @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') - def test_update_repeater_does_nothing_on_none(self, mock_get_repeater): + def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): repeat_record_states = [None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1) + update_repeater(repeat_record_states, 1, 'token') mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() From e95e2cc381f16599b9d8ad69276ce6730368e6d0 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Fri, 17 Jan 2025 23:09:58 +0000 Subject: [PATCH 54/56] Lint --- corehq/motech/repeaters/tests/test_tasks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 79ee59613eef..06853d425e9a 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -290,7 +290,7 @@ def test_get_repeater_ids_by_domain(): @flag_enabled('PROCESS_REPEATERS') class TestUpdateRepeater(SimpleTestCase): - @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): repeat_record_states = [State.Success, State.Fail, State.Empty, None] @@ -301,7 +301,7 @@ def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() - @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater, __): repeat_record_states = [State.InvalidPayload, State.Fail, State.Empty, None] @@ -312,7 +312,7 @@ def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater, __): mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() - @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): repeat_record_states = [State.Fail, State.Empty, None] @@ -323,7 +323,7 @@ def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): mock_repeater.set_backoff.assert_called_once() mock_repeater.reset_backoff.assert_not_called() - @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): repeat_record_states = [State.Empty] @@ -334,7 +334,7 @@ def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() - @ patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): repeat_record_states = [None] From 983ebf63ec74e8b589fa88e881d2cfbcb5f0872b Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 21 Jan 2025 16:48:51 -0500 Subject: [PATCH 55/56] `MeteredLock` supports `reacquire()` --- corehq/util/metrics/lockmeter.py | 20 ++++++++++++++++++++ corehq/util/metrics/tests/test_lockmeter.py | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/corehq/util/metrics/lockmeter.py b/corehq/util/metrics/lockmeter.py index 27b77dbc113f..0c306195ac4b 100644 --- a/corehq/util/metrics/lockmeter.py +++ b/corehq/util/metrics/lockmeter.py @@ -46,6 +46,26 @@ def acquire(self, *args, **kw): self.lock_trace.set_tags({"key": self.key, "name": self.name}) return acquired + def reacquire(self): + buckets = self.timing_buckets + with ( + metrics_histogram_timer("commcare.lock.reacquire_time", buckets), + tracer.trace("commcare.lock.reacquire", resource=self.key) as span + ): + acquired = self.lock.reacquire() + span.set_tags({ + "key": self.key, + "name": self.name, + "acquired": ("true" if acquired else "false"), + }) + if acquired: + self.end_time = time.time() + self.lock.timeout + self.lock_timer.start() + if self.track_unreleased: + self.lock_trace = tracer.trace("commcare.lock.locked", resource=self.key) + self.lock_trace.set_tags({"key": self.key, "name": self.name}) + return acquired + def release(self): self.lock.release() if self.lock_timer.is_started(): diff --git a/corehq/util/metrics/tests/test_lockmeter.py b/corehq/util/metrics/tests/test_lockmeter.py index e658b2423463..54ed174e9e40 100644 --- a/corehq/util/metrics/tests/test_lockmeter.py +++ b/corehq/util/metrics/tests/test_lockmeter.py @@ -142,6 +142,22 @@ def test_acquire_untracked(self): call.trace().__exit__(None, None, None), ]) + def test_reacquire_untracked(self): + fake = FakeLock(timeout=-1) + lock = MeteredLock(fake, "test", track_unreleased=False) + with patch("corehq.util.metrics.lockmeter.tracer") as tracer: + lock.reacquire() + self.assertListEqual(tracer.mock_calls, [ + call.trace("commcare.lock.reacquire", resource="key"), + call.trace().__enter__(), + call.trace().__enter__().set_tags({ + "key": "key", + "name": "test", + "acquired": "true", + }), + call.trace().__exit__(None, None, None), + ]) + def test_release_untracked(self): fake = FakeLock() lock = MeteredLock(fake, "test", track_unreleased=False) @@ -185,5 +201,8 @@ def acquire(self, blocking=True): self.locked = True return blocking + def reacquire(self): + return True + def release(self): self.locked = False From 24a7acf3f108a1ad65fc0590aa813519b73af6c8 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 21 Jan 2025 17:08:31 -0500 Subject: [PATCH 56/56] `update_repeater()` calls `process_repeater()` --- corehq/motech/repeaters/const.py | 3 +- .../commands/expire_process_repeaters_lock.py | 16 --- corehq/motech/repeaters/tasks.py | 115 +++++++----------- corehq/motech/repeaters/tests/test_tasks.py | 44 ++++++- 4 files changed, 81 insertions(+), 97 deletions(-) delete mode 100644 corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py diff --git a/corehq/motech/repeaters/const.py b/corehq/motech/repeaters/const.py index d81cdae76337..365925f0df2e 100644 --- a/corehq/motech/repeaters/const.py +++ b/corehq/motech/repeaters/const.py @@ -14,8 +14,7 @@ CHECK_REPEATERS_INTERVAL = timedelta(minutes=5) CHECK_REPEATERS_PARTITION_COUNT = settings.CHECK_REPEATERS_PARTITION_COUNT CHECK_REPEATERS_KEY = 'check-repeaters-key' -PROCESS_REPEATERS_INTERVAL = timedelta(minutes=1) -PROCESS_REPEATERS_KEY = 'process-repeaters-key' +PROCESS_REPEATERS_INTERVAL = timedelta(minutes=5) ENDPOINT_TIMER = 'endpoint_timer' # Number of attempts to an online endpoint before cancelling payload MAX_ATTEMPTS = 3 diff --git a/corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py b/corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py deleted file mode 100644 index 2b659e80936e..000000000000 --- a/corehq/motech/repeaters/management/commands/expire_process_repeaters_lock.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.core.management.base import BaseCommand - -from dimagi.utils.couch.cache.cache_core import get_redis_client - -from corehq.motech.repeaters.const import PROCESS_REPEATERS_KEY - - -class Command(BaseCommand): - help = """ - If the `process_repeaters()` task was killed and the lock was not - released, this command expires the lock and allows the task to start. - """ - - def handle(self, *args, **options): - client = get_redis_client() - client.expire(PROCESS_REPEATERS_KEY, timeout=0) diff --git a/corehq/motech/repeaters/tasks.py b/corehq/motech/repeaters/tasks.py index a6b81c9fe804..b89cd26c1b12 100644 --- a/corehq/motech/repeaters/tasks.py +++ b/corehq/motech/repeaters/tasks.py @@ -78,7 +78,7 @@ from django.conf import settings -from celery import chord, group +from celery import chord from celery.schedules import crontab from celery.utils.log import get_task_logger @@ -109,7 +109,6 @@ ENDPOINT_TIMER, MAX_RETRY_WAIT, PROCESS_REPEATERS_INTERVAL, - PROCESS_REPEATERS_KEY, RATE_LIMITER_DELAY_RANGE, State, ) @@ -306,68 +305,23 @@ def process_repeaters(): Processes repeaters, instead of processing repeat records independently the way that ``check_repeaters()`` does. """ + metrics_counter('commcare.repeaters.process_repeaters.start') + for domain, repeater_id in iter_ready_repeater_ids(): + if not domain_can_forward_now(domain): + continue + if rate_limit_repeater(domain, repeater_id): + continue - # NOTE: If `process_repeaters()` was killed and `process_repeaters_lock` - # was not released, expire the lock to allow `process_repeaters()` - # to start: - # - # $ ./manage.py expire_process_repeaters_lock - # - - process_repeaters_lock = get_redis_lock( - PROCESS_REPEATERS_KEY, - timeout=None, # Iterating repeaters forever is fine - name=PROCESS_REPEATERS_KEY, - ) - if not process_repeaters_lock.acquire(blocking=False): - return - - try: - metrics_counter('commcare.repeaters.process_repeaters.start') - while True: - metrics_counter('commcare.repeaters.process_repeaters.iter_once') - tasks = [] - for domain, repeater_id in iter_ready_repeater_ids(): - if not domain_can_forward_now(domain): - continue - if rate_limit_repeater(domain, repeater_id): - continue - - lock = get_repeater_lock(repeater_id) - # Generate a lock token using `uuid1()` the same way that - # `redis.lock.Lock` does. The `Lock` class uses the token to - # determine ownership, so that one process can acquire a - # lock and a different process can release it. This lock - # will be released by the `update_repeater()` task. - lock_token = uuid.uuid1().hex - if lock.acquire(blocking=False, token=lock_token): - tasks.append(process_repeater_chord( - domain, - repeater_id, - lock_token, - )) - - if tasks: - random_task_num = random.randrange(len(tasks)) - random_task = tasks.pop(random_task_num) - if tasks: - group(*tasks).apply_async() - result = random_task.apply_async() - # Wait for at least one task to complete so that - # `iter_ready_repeater_ids()` will return a different - # set of results. Waiting for the result of a subtask is - # discouraged, but in this situation it is safe because - # the tasks for processing a repeater use a different - # queue. - # cf. https://docs.celeryq.dev/en/stable/userguide/tasks.html#avoid-launching-synchronous-subtasks - result.get(disable_sync_subtasks=False) - - else: - break - - finally: - process_repeaters_lock.release() - metrics_counter('commcare.repeaters.process_repeaters.complete') + lock = get_repeater_lock(repeater_id) + # Generate a lock token using `uuid1()` the same way that + # `redis.lock.Lock` does. The `Lock` class uses the token to + # determine ownership, so that one process can acquire a + # lock and a different process can release it. This lock + # will be released by the `update_repeater()` task. + lock_token = uuid.uuid1().hex + if lock.acquire(blocking=False, token=lock_token): + repeater = Repeater.objects.get(domain=domain, id=repeater_id) + process_repeater(repeater, lock_token) def iter_ready_repeater_ids(): @@ -399,9 +353,8 @@ def iter_ready_repeater_ids(): def get_repeater_lock(repeater_id): name = f'process_repeater_{repeater_id}' - # Requests will time out after 5 minutes. Set timeout to double to be safe. - ten_minutes = 10 * 60 - return get_redis_lock(key=name, name=name, timeout=ten_minutes) + half_an_hour = 30 * 60 + return get_redis_lock(key=name, name=name, timeout=half_an_hour) def get_repeater_ids_by_domain(): @@ -418,9 +371,9 @@ def get_repeater_ids_by_domain(): } -def process_repeater_chord(domain, repeater_id, lock_token): +def process_repeater(repeater, lock_token): """ - Returns a Celery chord to process a repeater. + Initiates a Celery chord to process a repeater. """ def get_task_signature(repeat_record): @@ -430,10 +383,15 @@ def get_task_signature(repeat_record): }[repeat_record.state] return task_.s(repeat_record.id, repeat_record.domain) - repeater = Repeater.objects.get(domain=domain, id=repeater_id) - repeat_records = repeater.repeat_records_ready[:repeater.num_workers] + # Fetch an extra row to determine whether there are more repeat + # records to send after this batch + repeat_records = repeater.repeat_records_ready[:repeater.num_workers + 1] + more = len(repeat_records) > repeater.num_workers + + repeat_records = repeat_records[:repeater.num_workers] header_tasks = [get_task_signature(rr) for rr in repeat_records] - return chord(header_tasks, update_repeater.s(repeater_id, lock_token)) + callback = update_repeater.s(repeater.repeater_id, lock_token, more) + chord(header_tasks, callback)() @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) @@ -522,17 +480,22 @@ def _get_wait_duration_seconds(repeat_record): @task(queue=settings.CELERY_REPEAT_RECORD_QUEUE) -def update_repeater(repeat_record_states, repeater_id, lock_token): +def update_repeater(repeat_record_states, repeater_id, lock_token, more): """ Determines whether the repeater should back off, based on the results of ``process_ready_repeat_record()``. + + If ``more`` is ``True``, there are more repeat records ready to be + sent. Initiates another Celery chord to continue processing the + repeater. If ``more`` is ``False``, releases the lock for the + repeater. """ + repeater = Repeater.objects.get(id=repeater_id) try: if all(s in (State.Empty, None) for s in repeat_record_states): # We can't tell anything about the remote endpoint. return success_or_invalid = (State.Success, State.InvalidPayload) - repeater = Repeater.objects.get(id=repeater_id) if any(s in success_or_invalid for s in repeat_record_states): # The remote endpoint appears to be healthy. repeater.reset_backoff() @@ -546,7 +509,11 @@ def update_repeater(repeat_record_states, repeater_id, lock_token): finally: lock = get_repeater_lock(repeater_id) lock.local.token = lock_token - lock.release() + if more: + lock.reacquire() + process_repeater(repeater, lock_token) + else: + lock.release() metrics_gauge_task( diff --git a/corehq/motech/repeaters/tests/test_tasks.py b/corehq/motech/repeaters/tests/test_tasks.py index 06853d425e9a..b18113a0b5cf 100644 --- a/corehq/motech/repeaters/tests/test_tasks.py +++ b/corehq/motech/repeaters/tests/test_tasks.py @@ -296,7 +296,7 @@ def test_update_repeater_resets_backoff_on_success(self, mock_get_repeater, __): repeat_record_states = [State.Success, State.Fail, State.Empty, None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1, 'token', False) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() @@ -307,7 +307,7 @@ def test_update_repeater_resets_backoff_on_invalid(self, mock_get_repeater, __): repeat_record_states = [State.InvalidPayload, State.Fail, State.Empty, None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1, 'token', False) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_called_once() @@ -318,7 +318,7 @@ def test_update_repeater_sets_backoff_on_failure(self, mock_get_repeater, __): repeat_record_states = [State.Fail, State.Empty, None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1, 'token', False) mock_repeater.set_backoff.assert_called_once() mock_repeater.reset_backoff.assert_not_called() @@ -329,7 +329,7 @@ def test_update_repeater_does_nothing_on_empty(self, mock_get_repeater, __): repeat_record_states = [State.Empty] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1, 'token', False) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() @@ -340,11 +340,45 @@ def test_update_repeater_does_nothing_on_none(self, mock_get_repeater, __): repeat_record_states = [None] mock_repeater = MagicMock() mock_get_repeater.return_value = mock_repeater - update_repeater(repeat_record_states, 1, 'token') + update_repeater(repeat_record_states, 1, 'token', False) mock_repeater.set_backoff.assert_not_called() mock_repeater.reset_backoff.assert_not_called() + @patch('corehq.motech.repeaters.tasks.process_repeater') + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_calls_process_repeater_on_more( + self, + mock_get_repeater, + mock_get_repeater_lock, + mock_process_repeater, + ): + repeat_record_states = [None] + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + mock_lock = MagicMock() + mock_get_repeater_lock.return_value = mock_lock + update_repeater(repeat_record_states, 1, 'token', more=True) + + mock_process_repeater.assert_called_once_with(mock_repeater, 'token') + + @patch('corehq.motech.repeaters.tasks.get_repeater_lock') + @patch('corehq.motech.repeaters.tasks.Repeater.objects.get') + def test_update_repeater_releases_lock_on_no_more( + self, + mock_get_repeater, + mock_get_repeater_lock, + ): + repeat_record_states = [None] + mock_repeater = MagicMock() + mock_get_repeater.return_value = mock_repeater + mock_lock = MagicMock() + mock_get_repeater_lock.return_value = mock_lock + update_repeater(repeat_record_states, 1, 'token', more=False) + + mock_lock.release.assert_called_once() + @freeze_time('2025-01-01') class TestGetWaitDurationSeconds(TestCase):