Skip to content

Commit

Permalink
Add soft_blocked_items to mlbf data store
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMind committed Nov 1, 2024
1 parent f298bdf commit 7e12e50
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 59 deletions.
97 changes: 49 additions & 48 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,41 +53,29 @@ def generate_mlbf(stats, blocked, not_blocked):
return cascade


# Extends the BlockType enum to include versions that have no block of any type
class MLBFDataType(Enum):
HARD_BLOCKED = 'blocked'
# SOFT_BLOCKED = 'soft_blocked'
BLOCKED = BlockType.BLOCKED.label
SOFT_BLOCKED = BlockType.SOFT_BLOCKED.label
NOT_BLOCKED = 'not_blocked'


def fetch_blocked_from_db():
qs = BlockVersion.objects.filter(
version__file__is_signed=True, block_type=BlockType.BLOCKED
).values_list('block__guid', 'version__version', 'version_id', named=True)
return set(qs)


def fetch_all_versions_from_db(excluding_version_ids=None):
from olympia.versions.models import Version

qs = Version.unfiltered.exclude(id__in=excluding_version_ids or ()).values_list(
'addon__addonguid__guid', 'version'
)
return set(qs)


class BaseMLBFLoader:
def __init__(self, storage: SafeStorage):
self.storage = storage

def data_type_key(self, key: MLBFDataType) -> str:
return f'{key.name.lower()}_items'

@cached_property
def _raw(self):
"""
raw serializable data for the given MLBFLoader.
"""
return {key.value: self[key] for key in MLBFDataType}
return {self.data_type_key(key): self[key] for key in MLBFDataType}

def __getitem__(self, key: MLBFDataType) -> List[str]:
return getattr(self, f'{key.value}_items')
return getattr(self, self.data_type_key(key))

@cached_property
def _cache_path(self):
Expand All @@ -97,6 +85,10 @@ def _cache_path(self):
def blocked_items(self) -> List[str]:
raise NotImplementedError

@cached_property
def soft_blocked_items(self) -> List[str]:
raise NotImplementedError

@cached_property
def not_blocked_items(self) -> List[str]:
raise NotImplementedError
Expand All @@ -110,53 +102,62 @@ def __init__(self, storage: SafeStorage):

@cached_property
def blocked_items(self) -> List[str]:
return self._data.get(MLBFDataType.HARD_BLOCKED.value)
return self._data.get(self.data_type_key(MLBFDataType.BLOCKED))

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._data.get(self.data_type_key(MLBFDataType.SOFT_BLOCKED))

@cached_property
def not_blocked_items(self) -> List[str]:
return self._data.get(MLBFDataType.NOT_BLOCKED.value)
return self._data.get(self.data_type_key(MLBFDataType.NOT_BLOCKED))


class MLBFDataBaseLoader(BaseMLBFLoader):
def __init__(self, storage: SafeStorage):
super().__init__(storage)
self._version_excludes = []

# TODO: there is an edge case where you create a new filter from
# a previously used time stamp. THis could lead to invalid files
# a filter using the DB should either clear the storage files
# or raise to not allow reusing the same time stamp.
# it is possibly debatable whether you should be able to
# determine the created_at time as an argument at all

# Save the raw data to storage to be used by later instances
# of this filter.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
with self.storage.open(self._cache_path, 'w') as f:
json.dump(self._raw, f)

@cached_property
def blocked_items(self) -> List[str]:
blocked = []
def _all_blocks(self):
return BlockVersion.objects.filter(version__file__is_signed=True).values_list(
'block__guid', 'version__version', 'version_id', 'block_type', named=True
)

for blocked_version in fetch_blocked_from_db():
blocked.append(
(blocked_version.block__guid, blocked_version.version__version)
)
self._version_excludes.append(blocked_version.version_id)
def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
for version in self._all_blocks
if version.block_type == block_type
]
)

return MLBF.hash_filter_inputs(blocked)
@cached_property
def blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.BLOCKED)

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.SOFT_BLOCKED)

@cached_property
def not_blocked_items(self) -> List[str]:
# see blocked_items - we need self._version_excludes populated
blocked_items = self.blocked_items
from olympia.versions.models import Version

all_blocks_ids = [version.version_id for version in self._all_blocks]
not_blocked_items = MLBF.hash_filter_inputs(
fetch_all_versions_from_db(self._version_excludes)
Version.unfiltered.exclude(id__in=all_blocks_ids or ()).values_list(
'addon__addonguid__guid', 'version'
)
)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items doesn't contain any blocked_items.
return list(set(not_blocked_items) - set(blocked_items))
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return list(
set(not_blocked_items) - set(self.blocked_items + self.soft_blocked_items)
)


class MLBF:
Expand Down
61 changes: 50 additions & 11 deletions src/olympia/blocklist/tests/test_mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ class TestStaticLoader(BaseMLBFLoader):
def blocked_items(self):
return ['blocked:version']

@cached_property
def soft_blocked_items(self):
return ['softblocked:version']

@cached_property
def not_blocked_items(self):
return ['notblocked:version']
Expand Down Expand Up @@ -76,8 +80,9 @@ def notblocked_items(self):
def test_raw_contains_correct_data(self):
loader = self.TestStaticLoader(self.storage)
assert loader._raw == {
'blocked': ['blocked:version'],
'not_blocked': ['notblocked:version'],
'blocked_items': ['blocked:version'],
'soft_blocked_items': ['softblocked:version'],
'not_blocked_items': ['notblocked:version'],
}

def test_invalid_key_access_raises(self):
Expand All @@ -86,32 +91,33 @@ def test_invalid_key_access_raises(self):
with self.assertRaises(AttributeError):
loader['invalid']
with self.assertRaises(AttributeError):
loader[BlockType.BLOCKED]
loader[BlockType.BANANA]

def test_valid_key_access_returns_expected_data(self):
loader = self.TestStaticLoader(self.storage)
assert loader[MLBFDataType.HARD_BLOCKED] == ['blocked:version']
assert loader[MLBFDataType.BLOCKED] == ['blocked:version']
assert loader[MLBFDataType.NOT_BLOCKED] == ['notblocked:version']

def test_cache_raw_data(self):
loader = self.TestStaticLoader(self.storage)

for data_type in MLBFDataType:
assert loader[data_type] == loader._raw[data_type.value]
assert loader[data_type] == loader._raw[loader.data_type_key(data_type)]

# The source of truth should ultimately be the named cached properties
# Even though _raw is cached, it should still return
# the reference to the named property
loader.blocked_items = []
assert loader[MLBFDataType.HARD_BLOCKED] == []
assert loader[MLBFDataType.BLOCKED] == []


class TestMLBFStorageLoader(_MLBFBase):
def setUp(self):
super().setUp()
self._data = {
'blocked': ['blocked:version'],
'not_blocked': ['notblocked:version'],
'blocked_items': ['blocked:version'],
'not_blocked_items': ['notblocked:version'],
'soft_blocked_items': ['softblocked:version'],
}

def test_raises_missing_file(self):
Expand Down Expand Up @@ -140,7 +146,7 @@ def test_load_returns_expected_data(self):
)

mlbf_data = MLBFDataBaseLoader(self.storage)
assert mlbf_data[MLBFDataType.HARD_BLOCKED] == MLBF.hash_filter_inputs(
assert mlbf_data[MLBFDataType.BLOCKED] == MLBF.hash_filter_inputs(
[(block_version.block.guid, block_version.version.version)]
)
assert mlbf_data[MLBFDataType.NOT_BLOCKED] == MLBF.hash_filter_inputs(
Expand All @@ -158,7 +164,12 @@ def test_blocked_items_caches_excluded_version_ids(self):
)
with self.assertNumQueries(2):
mlbf_data = MLBFDataBaseLoader(self.storage)
assert mlbf_data._version_excludes == [block_version.version.id]
assert (
MLBF.hash_filter_inputs(
[(block_version.block.guid, block_version.version.version)]
)
not in mlbf_data.blocked_items
)

def test_hash_filter_inputs_returns_set_of_hashed_strings(self):
"""
Expand Down Expand Up @@ -387,10 +398,38 @@ def test_duplicate_guid_is_blocked(self):
reused_addon.update(guid=GUID_REUSE_FORMAT.format(addon.id))
reused_addon.addonguid.update(guid=addon.guid)

self._block_version(block, self._version(addon), soft=False)
self._block_version(block, self._version(addon), block_type=BlockType.BLOCKED)

mlbf = MLBF.generate_from_db('test')

(block_version,) = MLBF.hash_filter_inputs([(addon.guid, version)])
assert block_version in mlbf.data.blocked_items
assert block_version not in mlbf.data.not_blocked_items

def test_no_soft_blocked_versions_empty_array(self):
addon, block = self._blocked_addon()
self._block_version(block, self._version(addon), block_type=BlockType.BLOCKED)
mlbf = MLBF.generate_from_db('test')
assert (
BlockVersion.objects.filter(
block_type=BlockType.SOFT_BLOCKED,
version__file__is_signed=True,
).count()
== 0
)
assert mlbf.data.soft_blocked_items == []

def test_no_hard_blocked_versions_empty_array(self):
addon, block = self._blocked_addon()
self._block_version(
block, self._version(addon), block_type=BlockType.SOFT_BLOCKED
)
mlbf = MLBF.generate_from_db('test')
assert (
BlockVersion.objects.filter(
block_type=BlockType.BLOCKED,
version__file__is_signed=True,
).count()
== 0
)
assert mlbf.data.blocked_items == []

0 comments on commit 7e12e50

Please sign in to comment.