Skip to content

Commit

Permalink
Fix: NN assignment race condition for nodes created in admin UI (#444)
Browse files Browse the repository at this point in the history
* Fix: NN assignment race condition for nodes created in admin UI

* Reduce lock scope to minimum

* Add test for race condition on .save() call

* Add missing transaction.atomic() call

* Remove duplicate test (thanks git)

* Lint fix
  • Loading branch information
Andrew-Dickinson authored Aug 11, 2024
1 parent 632db02 commit 1b62e8a
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 13 deletions.
12 changes: 8 additions & 4 deletions src/meshapi/models/node.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import TYPE_CHECKING, Any

from django.core.validators import MaxValueValidator
from django.db import models
from django.db import models, transaction

from meshapi.util.django_pglocks import advisory_lock
from meshapi.util.network_number import NETWORK_NUMBER_MAX, get_next_available_network_number

if TYPE_CHECKING:
Expand Down Expand Up @@ -91,9 +92,12 @@ class NodeType(models.TextChoices):

def save(self, *args: Any, **kwargs: Any) -> None:
if not self.network_number:
self.network_number = get_next_available_network_number()

super().save(*args, **kwargs)
with transaction.atomic():
with advisory_lock("nn_assignment_lock"):
self.network_number = get_next_available_network_number()
super().save(*args, **kwargs)
else:
super().save(*args, **kwargs)

def __str__(self) -> str:
if self.name:
Expand Down
78 changes: 69 additions & 9 deletions src/meshapi/tests/test_nn.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import json
import time
from functools import partial
from unittest import mock

from django.conf import os
Expand All @@ -9,7 +10,6 @@

from meshapi.models import Building, Install, Member, Node

from ..views import get_next_available_network_number
from .group_helpers import create_groups
from .sample_data import sample_building, sample_install, sample_member
from .util import TestThread
Expand Down Expand Up @@ -308,12 +308,6 @@ def test_nn_search_for_new_number(self):
self.assertIsNotNone(Building.objects.filter(primary_node_id=131)[0].id)


def mocked_slow_nn_lookup():
answer = get_next_available_network_number()
time.sleep(1)
return answer


class TestNNRaceCondition(TransactionTestCase):
admin_c = Client()

Expand Down Expand Up @@ -366,7 +360,7 @@ def test_different_installs_race_condition(self):

def invoke_nn_form(install_num: int, outputs_dict: dict):
# Slow down the call which looks up the NN to force the race condition
with mock.patch("meshapi.views.forms.get_next_available_network_number", mocked_slow_nn_lookup):
with mock.patch("meshapi.util.network_number.no_op", partial(time.sleep, 1)):
result = self.admin_c.post(
"/api/v1/nn-assign/",
{"install_number": install_num, "password": os.environ.get("NN_ASSIGN_PSK")},
Expand Down Expand Up @@ -411,12 +405,78 @@ def invoke_nn_form(install_num: int, outputs_dict: dict):
f"status code incorrect for test_nn_valid_install_number. Should be {code}, but got {response2.status_code}",
)

def test_save_and_form_race_condition(self):
outputs_dict = {}

def invoke_nn_form(install_num: int, outputs_dict: dict):
# Slow down the call which looks up the NN to force the race condition
with mock.patch("meshapi.util.network_number.no_op", partial(time.sleep, 1)):
result = self.admin_c.post(
"/api/v1/nn-assign/",
{"install_number": install_num, "password": os.environ.get("NN_ASSIGN_PSK")},
content_type="application/json",
)
outputs_dict[install_num] = result

def invoke_nn_lookup_from_save(outputs_dict: dict):
try:
# Slow down the call which looks up the NN to force the race condition
with mock.patch("meshapi.util.network_number.no_op", partial(time.sleep, 1)):
node = Node(
status=Node.NodeStatus.PLANNED,
type=Node.NodeType.STANDARD,
latitude=0,
longitude=0,
)
# This save() should cause a call to get_next_available_network_number()
# in order to determine the primary key prior to DB write, but since we slowed it
# down, a race condition will occur unless the save() func properly protects the call
node.save()
node.refresh_from_db()
outputs_dict["save_call"] = node.network_number
except Exception as e:
outputs_dict["save_call"] = e

t1 = TestThread(target=invoke_nn_form, args=(self.install_number1, outputs_dict))
t2 = TestThread(target=invoke_nn_lookup_from_save, args=(outputs_dict,))

t1.start()
t2.start()

t1.join()
t2.join()

response1 = outputs_dict[self.install_number1]

# Re-raise the exception from the .save() call to fail the test if needed
# (.join() does not do this propagation automatically)
if isinstance(outputs_dict["save_call"], Exception):
raise ValueError() from outputs_dict[outputs_dict["save_call"]]

code = 201
self.assertEqual(
code,
response1.status_code,
f"status code incorrect for test_nn_valid_install_number. Should be {code}, but got {response1.status_code}",
)

resp_nns = {
json.loads(response1.content.decode("utf-8"))["network_number"],
outputs_dict["save_call"],
}
expected_nns = {101, 102}
self.assertEqual(
expected_nns,
resp_nns,
f"NNs incorrect for test_nn_valid_install_number. Should be {expected_nns}, but got {resp_nns}",
)

def test_same_install_race_condition(self):
outputs = []

def invoke_nn_form(install_num: int, outputs: list):
# Slow down the call which looks up the NN to force the race condition
with mock.patch("meshapi.views.forms.get_next_available_network_number", mocked_slow_nn_lookup):
with mock.patch("meshapi.util.network_number.no_op", partial(time.sleep, 1)):
result = self.admin_c.post(
"/api/v1/nn-assign/",
{"install_number": install_num, "password": os.environ.get("NN_ASSIGN_PSK")},
Expand Down
9 changes: 9 additions & 0 deletions src/meshapi/util/network_number.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
NETWORK_NUMBER_MAX = 8192


def no_op() -> None:
"""Literally does nothing, exists to provide a hook to the testing
harness for race condition analysis"""
pass


def get_next_available_network_number() -> int:
"""
This function finds, and marks as re-assigned, the next install whose number can be re-assigned
Expand All @@ -27,6 +33,9 @@ def get_next_available_network_number() -> int:
# Find the first valid NN that isn't in use
free_nn = next(i for i in range(NETWORK_NUMBER_MIN, NETWORK_NUMBER_MAX + 1) if i not in defined_nns)

# At testing time this turns into a time.sleep() call to help expose race conditions
no_op()

# Sanity check to make sure we don't assign something crazy. This is done by the query above,
# but we want to be super sure we don't violate these constraints so we check it here
if free_nn < NETWORK_NUMBER_MIN or free_nn > NETWORK_NUMBER_MAX:
Expand Down

0 comments on commit 1b62e8a

Please sign in to comment.