Skip to content

Commit

Permalink
Merge "Clean up allocations left by evacuation when deleting service"…
Browse files Browse the repository at this point in the history
… into stable/wallaby
  • Loading branch information
Zuul authored and openstack-gerrit committed Feb 12, 2024
2 parents 860d79f + 783598e commit 05848fb
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 90 deletions.
32 changes: 26 additions & 6 deletions nova/scheduler/client/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from nova import exception
from nova.i18n import _
from nova import objects
from nova.objects import fields
from nova import utils


Expand Down Expand Up @@ -2218,6 +2219,22 @@ def get_allocations_for_provider_tree(self, context, nodename):
return {consumer: self.get_allocs_for_consumer(context, consumer)
for consumer in consumers}

def _remove_allocations_for_evacuated_instances(self, context,
compute_node):
filters = {
'source_compute': compute_node.host,
'status': ['done'],
'migration_type': fields.MigrationType.EVACUATION,
}
evacuations = objects.MigrationList.get_by_filters(context, filters)

for evacuation in evacuations:
if not self.remove_provider_tree_from_instance_allocation(
context, evacuation.instance_uuid, compute_node.uuid):
LOG.error("Failed to clean allocation of evacuated "
"instance on the source node %s",
compute_node.uuid, instance=evacuation.instance)

def delete_resource_provider(self, context, compute_node, cascade=False):
"""Deletes the ResourceProvider record for the compute_node.
Expand All @@ -2236,16 +2253,19 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
# Delete any allocations for this resource provider.
# Since allocations are by consumer, we get the consumers on this
# host, which are its instances.
# NOTE(mriedem): This assumes the only allocations on this node
# are instances, but there could be migration consumers if the
# node is deleted during a migration or allocations from an
# evacuated host (bug 1829479). Obviously an admin shouldn't
# do that but...you know. I guess the provider deletion should fail
# in that case which is what we'd want to happen.
instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
context, host, nodename)
for instance_uuid in instance_uuids:
self.delete_allocation_for_instance(context, instance_uuid)

# When an instance is evacuated, its allocation remains in
# the source compute node until the node recovers again.
# If the broken compute never recovered but instead it is
# decommissioned, then we should delete the allocations of
# successfully evacuated instances during service delete.
self._remove_allocations_for_evacuated_instances(context,
compute_node)

# Ensure to delete resource provider in tree by top-down
# traversable order.
rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
Expand Down
65 changes: 0 additions & 65 deletions nova/tests/functional/test_nova_manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1573,71 +1573,6 @@ def test_audit_orphaned_allocations_from_confirmed_resize(self):
self.assertIn('Processed 1 allocation.', output)
self.assertEqual(4, ret)

# TODO(sbauza): Remove this test once bug #1829479 is fixed
def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
"""Evacuate a server and the delete the source node so that it will
leave a source allocation that the audit command will find.
"""

source_hostname = self.compute1.host
dest_hostname = self.compute2.host

source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

server = self._boot_and_check_allocations(self.flavor, source_hostname)

# Stop the service and fake it down
self.compute1.stop()
source_service_id = self.admin_api.get_services(
host=source_hostname, binary='nova-compute')[0]['id']
self.admin_api.put_service(source_service_id, {'forced_down': 'true'})

# evacuate the instance to the target
self._evacuate_server(
server, {'host': dest_hostname}, expected_host=dest_hostname)

# Now the instance is gone, we can delete the compute service
self.admin_api.api_delete('/os-services/%s' % source_service_id)

# Since the compute is deleted, we should have in theory a single
# allocation against the destination resource provider, but evacuated
# instances are not having their allocations deleted. See bug #1829479.
# We have two allocations for the same consumer, source and destination
self._check_allocation_during_evacuate(
self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)

# Now, run the audit command that will find this orphaned allocation
ret = self.cli.audit(verbose=True)
output = self.output.getvalue()
self.assertIn(
'Allocations for consumer UUID %(consumer_uuid)s on '
'Resource Provider %(rp_uuid)s can be deleted' %
{'consumer_uuid': server['id'],
'rp_uuid': source_rp_uuid},
output)
self.assertIn('Processed 1 allocation.', output)
self.assertEqual(3, ret)

# Now we want to delete the orphaned allocation that is duplicate
ret = self.cli.audit(delete=True, verbose=True)

# We finally should only have the target allocations
self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
self.assertRequestMatchesUsage({'VCPU': 0,
'MEMORY_MB': 0,
'DISK_GB': 0}, source_rp_uuid)

output = self.output.getvalue()
self.assertIn(
'Deleted allocations for consumer UUID %(consumer_uuid)s on '
'Resource Provider %(rp_uuid)s' %
{'consumer_uuid': server['id'],
'rp_uuid': source_rp_uuid},
output)
self.assertIn('Processed 1 allocation.', output)
self.assertEqual(4, ret)


class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
"""Functional tests for the "nova-manage db archive_deleted_rows" CLI."""
Expand Down
25 changes: 8 additions & 17 deletions nova/tests/functional/wsgi/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,7 @@ def test_evacuate_then_delete_compute_service(self):
"""Tests a scenario where a server is created on a host, the host
goes down, the server is evacuated to another host, and then the
source host compute service is deleted. After that the deleted
compute service is restarted. Related placement resources are checked
throughout.
compute service is restarted and starts successfully.
"""
# Create our source host that we will evacuate *from* later.
host1 = self._start_compute('host1')
Expand Down Expand Up @@ -152,23 +151,15 @@ def test_evacuate_then_delete_compute_service(self):
services = self.admin_api.get_services(
binary='nova-compute', host='host1')
self.assertEqual(0, len(services), services)
# FIXME(mriedem): This is bug 1829479 where the compute service is
# deleted but the resource provider is not because there are still
# allocations against the provider from the evacuated server.
# Then the resource provider is also deleted.
resp = self.placement.get('/resource_providers/%s' % rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(rp_uuid, flavor)
# Try to restart the host1 compute service to create a new resource
# provider.
self.assertEqual(404, resp.status)
# Try to restart the host1 compute service to create a new service
# and a new resource provider.
self.restart_compute_service(host1)
# FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
# compute service attempts to create a new resource provider with a
# new uuid but the same name which results in a conflict. The service
# does not die, however, because _update_available_resource_for_node
# catches and logs but does not re-raise the error.
log_output = self.stdlog.logger.output
self.assertIn('Error updating resources for node host1.', log_output)
self.assertIn('Failed to create resource provider host1', log_output)
# Make sure the compute service record for host1 is recreated.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]

def test_migrate_confirm_after_deleted_source_compute(self):
"""Tests a scenario where a server is cold migrated and while in
Expand Down
34 changes: 32 additions & 2 deletions nova/tests/unit/scheduler/client/test_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3184,15 +3184,41 @@ def test_refresh_associations_disabled(self):

class TestAllocations(SchedulerReportClientTestCase):

@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"remove_provider_tree_from_instance_allocation")
@mock.patch('nova.objects.MigrationList.get_by_filters')
def test_remove_allocations_for_evacuated_instances(self,
mock_get_migrations, mock_rm_pr_tree):
cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
hypervisor_hostname="fake_hostname", )
migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
mock.Mock(instance_uuid=uuids.inst2, status='done')]
mock_get_migrations.return_value = migrations
mock_rm_pr_tree.return_value = True
self.client._remove_allocations_for_evacuated_instances(self.context,
cn)
mock_get_migrations.assert_called_once_with(
self.context,
{'source_compute': cn.host, 'status': ['done'],
'migration_type': 'evacuation'})
mock_rm_pr_tree.assert_has_calls(
[mock.call(self.context, uuids.inst1, cn.uuid),
mock.call(self.context, uuids.inst2, cn.uuid)])
# status of migrations should be kept
for migration in migrations:
self.assertEqual('done', migration.status)

@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_providers_in_tree')
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"_remove_allocations_for_evacuated_instances")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete_allocation_for_instance")
@mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
def test_delete_resource_provider_cascade(self, mock_by_host,
mock_del_alloc, mock_delete, mock_get_rpt):
mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
hypervisor_hostname="fake_hostname", )
mock_by_host.return_value = [uuids.inst1, uuids.inst2]
Expand All @@ -3208,6 +3234,7 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
mock_by_host.assert_called_once_with(
self.context, cn.host, cn.hypervisor_hostname)
self.assertEqual(2, mock_del_alloc.call_count)
mock_evacuate.assert_called_once_with(self.context, cn)
exp_url = "/resource_providers/%s" % uuids.cn
mock_delete.assert_called_once_with(
exp_url, global_request_id=self.context.global_id)
Expand All @@ -3217,11 +3244,13 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
'get_providers_in_tree')
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"_remove_allocations_for_evacuated_instances")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete_allocation_for_instance")
@mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
def test_delete_resource_provider_no_cascade(self, mock_by_host,
mock_del_alloc, mock_delete, mock_get_rpt):
mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
self.client._association_refresh_time[uuids.cn] = mock.Mock()
cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
hypervisor_hostname="fake_hostname", )
Expand All @@ -3236,6 +3265,7 @@ def test_delete_resource_provider_no_cascade(self, mock_by_host,
mock_delete.return_value = resp_mock
self.client.delete_resource_provider(self.context, cn)
mock_del_alloc.assert_not_called()
mock_evacuate.assert_not_called()
exp_url = "/resource_providers/%s" % uuids.cn
mock_delete.assert_called_once_with(
exp_url, global_request_id=self.context.global_id)
Expand Down
8 changes: 8 additions & 0 deletions releasenotes/notes/bug-1829479-cd2db21526965e6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
fixes:
- |
`Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
deleting a nova-compute service removes allocations of successfully
evacuated instances. This allows the associated resource provider to be
deleted automatically even if the nova-compute service cannot recover after
all instances on the node have been successfully evacuated.

0 comments on commit 05848fb

Please sign in to comment.