Skip to content

Commit

Permalink
regex-based POC
Browse files Browse the repository at this point in the history
  • Loading branch information
masklinn committed Jul 15, 2024
1 parent 4d988a0 commit c652b60
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 7 deletions.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ version = "1.0.0a1"
readme = "README.rst"
requires-python = ">=3.8"
dependencies = []
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }

license = {text = "Apache 2.0"}
urls = {repository = "https://github.com/ua-parser/uap-python"}
Expand Down Expand Up @@ -39,10 +38,16 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy"
]

[project.optional-dependencies]
yaml = ["PyYaml"]
re2 = ["google-re2"]
regex = ["ua-parser-rs"]

[tool.ruff.lint]
select = ["F", "E", "W", "I", "RET", "RUF", "PT"]
ignore = [
Expand Down
17 changes: 11 additions & 6 deletions src/ua_parser/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@
from .caching import Cache, Local
from .loaders import load_builtins, load_yaml
from .re2 import Resolver as Re2Resolver
from .regex import Resolver as RegexResolver
from .user_agent_parser import Parse

# One boolean flag per base parser name ("basic", "re2", "regex", "legacy").
# NOTE(review): presumably tells whether that parser may be wrapped in a
# cache when benchmarking -- confirm against the code that reads it.
CACHEABLE = {
    "basic": True,
    "re2": True,
    "regex": True,
    "legacy": False,
}

Expand Down Expand Up @@ -178,6 +180,8 @@ def get_parser(
r = BasicResolver(rules)
elif parser == "re2":
r = Re2Resolver(rules)
elif parser == "regex":
r = RegexResolver(rules)
else:
sys.exit(f"unknown parser {parser!r}")

Expand Down Expand Up @@ -327,6 +331,7 @@ def run_threaded(args: argparse.Namespace) -> None:
("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))),
("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))),
("re2", Re2Resolver(load_builtins())),
("regex", RegexResolver(load_builtins())),
]
for name, resolver in resolvers:
print(f"{name:11}: ", end="", flush=True)
Expand Down Expand Up @@ -436,14 +441,14 @@ def __call__(
bench.add_argument(
"--bases",
nargs="+",
choices=["basic", "re2", "legacy"],
default=["basic", "re2", "legacy"],
choices=["basic", "re2", "regex", "legacy"],
default=["basic", "re2", "regex", "legacy"],
help="""Base resolvers to benchmark. `basic` is a linear search
through the regexes file, `re2` is a prefiltered regex set
implemented in C++, `legacy` is the legacy API (essentially a
basic resolver with a clearing cache of fixed 200 entries, but
less layered so usually slightly faster than an equivalent
basic-based resolver).""",
implemented in C++, `regex` is a prefiltered regex set implemented
in Rust, `legacy` is the legacy API (essentially a basic resolver
with a clearing cache of fixed 200 entries, but less layered so
usually slightly faster than an equivalent basic-based resolver).""",
)
bench.add_argument(
"--caches",
Expand Down
76 changes: 76 additions & 0 deletions src/ua_parser/regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
__all__ = ["Resolver"]

from operator import attrgetter

import ua_parser_rs # type: ignore

from .core import (
Device,
Domain,
Matchers,
OS,
PartialResult,
UserAgent,
)


class Resolver:
    """Resolver backed by the ``ua_parser_rs`` extractors.

    One extractor is built per domain (user agent, OS, device) from the
    matchers given at construction. Calling the resolver only queries
    the extractors of the domains that were requested.
    """

    ua: ua_parser_rs.UserAgentExtractor
    os: ua_parser_rs.OSExtractor
    de: ua_parser_rs.DeviceExtractor

    def __init__(self, matchers: Matchers) -> None:
        """Build the three extractors from ``matchers``.

        ``matchers`` is unpacked as ``(user agent, os, device)`` matcher
        collections; each matcher is flattened to a tuple of attributes
        before being handed to the matching extractor.
        """
        ua_matchers, os_matchers, device_matchers = matchers

        # User agent and OS matchers are projected onto the same attributes.
        versioned_fields = attrgetter(
            "regex", "family", "major", "minor", "patch", "patch_minor"
        )
        device_fields = attrgetter(
            "regex", "regex_flag", "family", "brand", "model"
        )

        self.ua = ua_parser_rs.UserAgentExtractor(
            map(versioned_fields, ua_matchers)
        )
        self.os = ua_parser_rs.OSExtractor(map(versioned_fields, os_matchers))
        self.de = ua_parser_rs.DeviceExtractor(
            map(device_fields, device_matchers)
        )

    def __call__(self, ua: str, domains: Domain, /) -> PartialResult:
        """Resolve ``ua`` for the requested ``domains``.

        Returns a ``PartialResult`` whose ``user_agent``, ``os`` and
        ``device`` entries are ``None`` when the domain was not requested
        or when its extractor returned a falsy result.
        """
        found_user_agent = None
        found_os = None
        found_device = None

        if Domain.USER_AGENT in domains:
            hit = self.ua.extract(ua)
            if hit:
                found_user_agent = UserAgent(
                    hit.family,
                    hit.major,
                    hit.minor,
                    hit.patch,
                    hit.patch_minor,
                )

        if Domain.OS in domains:
            hit = self.os.extract(ua)
            if hit:
                # The OS result's first field is read from its `os` attribute.
                found_os = OS(
                    hit.os,
                    hit.major,
                    hit.minor,
                    hit.patch,
                    hit.patch_minor,
                )

        if Domain.DEVICE in domains:
            hit = self.de.extract(ua)
            if hit:
                found_device = Device(hit.family, hit.brand, hit.model)

        return PartialResult(
            domains=domains,
            string=ua,
            user_agent=found_user_agent,
            os=found_os,
            device=found_device,
        )
13 changes: 13 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,19 @@
else:
PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2"))

# Register the regex-based parser in PARSERS. When `ua_parser.regex` cannot
# be imported, a skipped placeholder is registered under the same id instead,
# so the "regex" case still appears in the parametrization.
try:
    from ua_parser import regex
except ImportError:
    PARSERS.append(
        pytest.param(
            None,
            id="regex",
            marks=pytest.mark.skip(reason="regex parser not available"),
        )
    )
else:
    # Import succeeded: build a parser from the builtin rules.
    PARSERS.append(pytest.param(Parser(regex.Resolver(load_builtins())), id="regex"))

UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}


Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ deps =
pytest
pyyaml
google-re2
ua-parser-rs
commands =
pytest -Werror --doctest-glob="*.rst" {posargs}

Expand Down

0 comments on commit c652b60

Please sign in to comment.