From e1a5589cf15306764d03627e5fb2b39d9db87087 Mon Sep 17 00:00:00 2001 From: Alex Ioannidis Date: Mon, 26 Aug 2024 18:27:45 +0200 Subject: [PATCH] processors: allow filtering out robots/machines --- invenio_stats/processors.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/invenio_stats/processors.py b/invenio_stats/processors.py index 8174a5b..3958557 100644 --- a/invenio_stats/processors.py +++ b/invenio_stats/processors.py @@ -11,6 +11,7 @@ import hashlib from datetime import datetime +from functools import partial from time import mktime from counter_robots import is_machine, is_robot @@ -88,8 +89,8 @@ def anonymize_user(doc): return doc -def flag_robots(doc): - """Flag events which are created by robots. +def flag_robots(doc, exclude=False): + """Flag and filter events which are created by robots. The list of robots is defined by the `COUNTER-robots Python package `_ , which follows the @@ -99,11 +100,17 @@ def flag_robots(doc): `_. """ doc["is_robot"] = "user_agent" in doc and is_robot(doc["user_agent"]) + if exclude and doc["is_robot"]: + return None return doc -def flag_machines(doc): - """Flag events which are created by machines. +filter_robots = partial(flag_robots, exclude=True) +"""Filter out robot events.""" + + +def flag_machines(doc, exclude=False): + """Flag and filter events which are created by machines. The list of machines is defined by the `COUNTER-robots Python package `_ , which follows the @@ -114,9 +121,15 @@ def flag_machines(doc): """ doc["is_machine"] = "user_agent" in doc and is_machine(doc["user_agent"]) + if exclude and doc["is_machine"]: + return None return doc +filter_machines = partial(flag_machines, exclude=True) +"""Filter out machine events.""" + + def hash_id(iso_timestamp, msg): """Generate event id, optimized for the search engine.""" return "{0}-{1}".format(