Skip to content

Commit

Permalink
feat: enhance add_filter to support the max matched lines (#4303)
Browse files Browse the repository at this point in the history
- added a new keyword argument "max_match=MAX_MATCH" to `add_filter`.
  By declaring the max_match, at most `max_match` number of lines that
  contain the patterns will be filtered out in the collection.
  For redundant declarations the maximum `max_match` will be kept as the
  final scanning number.  If no `max_match` is declared when `add_filter`,
  the filters.MAX_MATCH (=10000) will be taken as the default value.
- added a new keyword argument "with_matches=False" to the `get_filters`.
  When "with_matches=True" is specified, the return value of the
  `get_filters` will be dict in which the max scanning numbers for each
  filter pattern are included as the dict value.
- update the exiting tests
- RHINENG-14669

Signed-off-by: Xiangce Liu <[email protected]>
  • Loading branch information
xiangce authored Jan 3, 2025
1 parent dbc93e3 commit 9c64871
Show file tree
Hide file tree
Showing 24 changed files with 437 additions and 199 deletions.
73 changes: 45 additions & 28 deletions insights/core/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,12 @@
from insights.util import parse_bool

_CACHE = {}
FILTERS = defaultdict(set)
FILTERS = defaultdict(dict)
ENABLED = parse_bool(os.environ.get("INSIGHTS_FILTERS_ENABLED"), default=True)
MATCH_COUNT = 10000
MAX_MATCH = 10000


# TODO:
# - support specifying the max match number of filtered lines
# add_filter(Messages, "Such an Error", 10)
# def add_filter(component, patterns, match_count=MATCH_COUNT):
def add_filter(component, patterns):
def add_filter(component, patterns, max_match=MAX_MATCH):
"""
Add a filter or list of filters to a component. When the component is
a datasource, the filter will be directly added to that datasouce.
Expand All @@ -71,8 +67,26 @@ def add_filter(component, patterns):
parser or combiner.
patterns (str, [str]): A string, list of strings, or set of strings to
add to the datasource's filters.
max_match (int): A int, the maximum matched lines to filter out.
MAX_MATCH by default.
"""

def get_dependency_datasources(comp):
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
return dep_ds
for dep in dr.get_dependencies(comp):
dep_ds.update(get_dependency_datasources(dep))
return dep_ds

def none_max(a, b):
return a if b is None else b if a is None else max(a, b)

def max_matchs(da, db):
return dict((k, none_max(da.get(k), db.get(k))) for k in set(da.keys()).union(db.keys()))

def inner(comp, patterns):
if comp in _CACHE:
del _CACHE[comp]
Expand All @@ -82,25 +96,21 @@ def inner(comp, patterns):
raise TypeError("Filter patterns must be of type string, list, or set.")

if isinstance(patterns, six.string_types):
patterns = set([patterns])
patterns = {patterns: max_match}
elif isinstance(patterns, list):
patterns = set(patterns)
patterns = dict((pt, max_match) for pt in patterns)
# here patterns is a dict

for pat in patterns:
if not pat:
raise Exception("Filter patterns must not be empty.")

FILTERS[comp] |= patterns
FILTERS[comp].update(max_matchs(FILTERS[comp], patterns))

def get_dependency_datasources(comp):
"""Get (all) the first depended datasource"""
dep_ds = set()
if plugins.is_datasource(comp):
dep_ds.add(comp)
return dep_ds
for dep in dr.get_dependencies(comp):
dep_ds.update(get_dependency_datasources(dep))
return dep_ds
if max_match is None or type(max_match) is not int or max_match <= 0:
raise Exception(
"Invalid argument: {0}. It can only be a positive integer.".format(max_match)
)

if not plugins.is_datasource(component):
deps = get_dependency_datasources(component)
Expand All @@ -127,7 +137,7 @@ def get_dependency_datasources(comp):
_add_filter = add_filter


def get_filters(component):
def get_filters(component, with_matches=False):
"""
Get the set of filters for the given datasource.
Expand All @@ -143,13 +153,19 @@ def get_filters(component):
Args:
component (a datasource): The target datasource
with_matches (boolean): Needs the max matches being returned? False by
default.
Returns:
set: The set of filters defined for the datasource
(set or dict): when `with_matches=False`, returns the set of filters
defined for the datasource only.
when `with_matches=True`, returns filters defined for
the datasource with the max match count specified by
`add_filter`.
"""

def inner(c, filters=None):
filters = filters or set()
filters = filters or dict()

if hasattr(c, 'filterable') and c.filterable is False:
return filters
Expand All @@ -161,20 +177,21 @@ def inner(c, filters=None):
return filters

if c in FILTERS:
filters |= FILTERS[c]
filters.update(FILTERS[c])

for d in dr.get_dependents(c):
filters |= inner(d, filters)
filters.update(inner(d, filters))

return filters

if not component:
# No filters for nothing
return set()
return dict() if with_matches else set()

if component not in _CACHE:
_CACHE[component] = inner(component)

return _CACHE[component]
return _CACHE[component] if with_matches else set(_CACHE[component].keys())


def apply_filters(target, lines):
Expand Down Expand Up @@ -202,7 +219,7 @@ def loads(string):
"""Loads the filters dictionary given a string."""
d = _loads(string)
for k, v in d.items():
FILTERS[dr.get_component(k) or k] = set(v)
FILTERS[dr.get_component(k) or k] = v


def load(stream=None):
Expand All @@ -222,7 +239,7 @@ def dumps():
"""Returns a string representation of the sorted FILTERS dictionary."""
d = {}
for k, v in FILTERS.items():
d[dr.get_name(k)] = sorted(v)
d[dr.get_name(k)] = dict(sorted(v.items()))
return _dumps(d)


Expand Down
4 changes: 3 additions & 1 deletion insights/core/spec_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
Obfuscate the IP or Hostname appears in the spec content according to the
specs native requirement and user configuration.
- Filtering
Filter line as per the allow list got from the "filters.yaml"
"""

import logging
Expand Down Expand Up @@ -360,7 +362,7 @@ def _filter_line_per_allowlist(self, line, allow_info):
for a_key in list(allow_info.keys()):
# keep line when any filter match
# FIXME:
# Considering performance, din't handle multiple filters in one same line
# Considering performance, didn't handle multiple filters in one same line
if a_key in line:
allow_info[a_key] -= 1
# stop checking it when enough lines contain the key were found
Expand Down
8 changes: 4 additions & 4 deletions insights/core/spec_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def __init__(self):
self._content = None
self._exception = None
self._filterable = False
self._filters = set()
self._filters = dict()

def load(self):
raise NotImplementedError()
Expand Down Expand Up @@ -95,7 +95,7 @@ def _clean_content(self):
allowlist = None
if self._filterable:
cleans.append("Filter")
allowlist = dict((f, filters.MATCH_COUNT) for f in self._filters)
allowlist = self._filters
# Cleaning - Entry
if cleans:
log.debug("Cleaning (%s) %s", "/".join(cleans), self.relative_path)
Expand Down Expand Up @@ -208,7 +208,7 @@ def __init__(self, relative_path, root="/", save_as=None, ds=None, ctx=None, cle
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()
self._filters = filters.get_filters(self.ds, True) if self.ds else set()

self.validate()

Expand Down Expand Up @@ -361,7 +361,7 @@ def __init__(
if self.ds and filters.ENABLED
else False
)
self._filters = filters.get_filters(self.ds) if self.ds else set()
self._filters = filters.get_filters(self.ds, True) if self.ds else set()

self.validate()

Expand Down
73 changes: 45 additions & 28 deletions insights/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,14 @@

def _intercept_add_filter(func):
@wraps(func)
def inner(component, pattern):
ret = add_filter(component, pattern)
def inner(component, pattern, max_match=filters.MAX_MATCH):
ret = add_filter(component, pattern, max_match=max_match)
calling_module = inspect.stack()[1][0].f_globals.get("__name__")
ADDED_FILTERS[calling_module] |= set(r for r in dr.get_registry_points(component) if r.filterable)
ADDED_FILTERS[calling_module] |= set(
r for r in dr.get_registry_points(component) if r.filterable
)
return ret

return inner


Expand All @@ -51,6 +54,7 @@ def inner(ds, pattern):
calling_module = inspect.stack()[1][0].f_globals.get("__name__")
ADDED_FILTERS[calling_module].add(ds)
return ret

return inner


Expand Down Expand Up @@ -92,8 +96,11 @@ def _beautify_deep_compare_diff(result, expected):
diff.append('\tkey "{0}" not in Result;'.format(k))
for k in common_keys:
if not eq(result[k], expected[k]):
diff.append('\tkey "{0}" unequal values:\n\t\tExpected: {1}\n\t\tResult : {2}'.format(
k, expected[k], result[k]))
diff.append(
'\tkey "{0}" unequal values:\n\t\tExpected: {1}\n\t\tResult : {2}'.format(
k, expected[k], result[k]
)
)
if not diff:
diff.append('\tUnrecognized unequal values in result layer one;')

Expand All @@ -118,7 +125,7 @@ def deep_compare(result, expected):
# This case ensures that when rules return a make_none() response, all of the older
# CI tests that are looking for None instead of make_none() will still pass
if result is None or (isinstance(result, dict) and result.get("type") == "none"):
assert (expected is None or expected == MAKE_NONE_RESULT), result
assert expected is None or expected == MAKE_NONE_RESULT, result
return

if isinstance(result, dict) and expected is None:
Expand Down Expand Up @@ -150,12 +157,11 @@ def run_input_data(component, input_data, store_skips=False):
'CloudInstance': ['insights.parsers.subscription_manager.SubscriptionManagerFacts'],
'CloudProvider': ['insights.parsers.rhsm_conf.RHSMConf'],
'OSRelease': ['insights.parsers.dmesg.DmesgLineList'],
'Sap': ['insights.parsers.saphostctrl.SAPHostCtrlInstances']
'Sap': ['insights.parsers.saphostctrl.SAPHostCtrlInstances'],
}


def run_test(component, input_data,
expected=_UNDEFINED, return_make_none=False, do_filter=True):
def run_test(component, input_data, expected=_UNDEFINED, return_make_none=False, do_filter=True):
"""
Arguments:
component: The insights component need to test.
Expand All @@ -165,6 +171,7 @@ def run_test(component, input_data,
do_filter: Does need to check dependency spec filter warning?
- it's not required to check the filters for sosreport
"""

def get_filtered_specs(module):
filtered = set()
mods = dir(importlib.import_module(module))
Expand All @@ -183,7 +190,9 @@ def get_filtered_specs(module):
rps = dr.get_registry_points(component)
filtered = get_filtered_specs(mod)
filterable = set(d for d in rps if dr.get_delegate(d).filterable) - filtered
missing_filters = filterable - ADDED_FILTERS.get(mod, set()) - ADDED_FILTERS.get(sup_mod, set())
missing_filters = (
filterable - ADDED_FILTERS.get(mod, set()) - ADDED_FILTERS.get(sup_mod, set())
)
if missing_filters:
names = [dr.get_name(m) for m in missing_filters]
msg = "%s must add filters to %s"
Expand All @@ -204,16 +213,18 @@ def integrate(input_data, component):
return run_test(component, input_data)


def context_wrap(lines,
path="path",
hostname=DEFAULT_HOSTNAME,
release=DEFAULT_RELEASE,
version="-1.-1",
machine_id="machine_id",
strip=True,
split=True,
filtered_spec=None,
**kwargs):
def context_wrap(
lines,
path="path",
hostname=DEFAULT_HOSTNAME,
release=DEFAULT_RELEASE,
version="-1.-1",
machine_id="machine_id",
strip=True,
split=True,
filtered_spec=None,
**kwargs
):
if isinstance(lines, six.string_types):
if strip:
lines = lines.strip()
Expand All @@ -223,10 +234,16 @@ def context_wrap(lines,
if filtered_spec is not None and filtered_spec in filters.FILTERS:
lines = [l for l in lines if any([f in l for f in filters.FILTERS[filtered_spec]])]

return Context(content=lines,
path=path, hostname=hostname,
release=release, version=version.split("."),
machine_id=machine_id, relative_path=path, **kwargs)
return Context(
content=lines,
path=path,
hostname=hostname,
release=release,
version=version.split("."),
machine_id=machine_id,
relative_path=path,
**kwargs
)


input_data_cache = {}
Expand All @@ -236,10 +253,7 @@ def context_wrap(lines,


def create_metadata(system_id, product):
ctx_metadata = {
"system_id": system_id,
"links": []
}
ctx_metadata = {"system_id": system_id, "links": []}
ctx_metadata["type"] = product.role
ctx_metadata["product"] = product.__class__.__name__
return json.dumps(ctx_metadata)
Expand All @@ -265,6 +279,7 @@ class InputData(object):
contain the specified value in the context.path field. This is useful for
testing pattern-like file parsers.
"""

def __init__(self, name=None, hostname=None):
cnt = input_data_cache.get(name, 0)
self.name = "{0}-{1:0>5}".format(name, cnt)
Expand Down Expand Up @@ -421,6 +436,7 @@ def archive_provider(component, test_func=deep_compare, stride=1):
[1] insights.tests.deep_compare()
"""

def _wrap(func):
@six.wraps(func)
def __wrap(stride=stride):
Expand All @@ -430,4 +446,5 @@ def __wrap(stride=stride):
__wrap.stride = stride
ARCHIVE_GENERATORS.append(__wrap)
return __wrap

return _wrap
Loading

0 comments on commit 9c64871

Please sign in to comment.