diff --git a/README.rst b/README.rst index d115197..3248940 100644 --- a/README.rst +++ b/README.rst @@ -68,6 +68,27 @@ and submit a pull request. This process is beyond the scope of this documentati `GitHub's documentation `_. +Installation "Use" Flags +------------------------ + +Several `extras_require` dependencies are declared, for bundled installation of tools required for additional features +that are not required for basic usage. To utilize these flags, on any reference to the project or on-disk project +location when executing `pip install`, add the flags comma-separated within square brackets after the name or path: + + pip install -U -e '.[development,geo]' + +Quoting will be required in most shells, as square brackets would ordinarily be "expanded". + +* `development` — Install a standard suite of development-time support packages, testing framework, and testing components. + +* `ecdsa` — Require an efficient ECDSA implementation for use of Elliptic Curve signing operations. + +* `geo` — This project utilizes IP2Location LITE data available from http://www.ip2location.com to blacklist users by + country of origin. Enabling this flag will install the official `IP2Location` library; however, the actual dataset + must be downloaded separately. + + + Version History =============== @@ -78,6 +99,9 @@ Version 3.0 * **Removed Python 2 support and version specific code.** The project has been updated to modern Python packaging standards, including modern namespace use. Modern namespaces are wholly incompatible with the previous namespacing mechanism; this project can not be simultaneously installed with any Marrow project that is Python 2 compatible. +* **Added Web Application Firewall extension.** Protects your application against passive scanning attempts, requests for tools written in programming languages that are absolutely not present (e.g. PHP, ColdFusion, Adobe Flex, …), and malicious probes, and can even restrict access by geographic location. 
+ + Version 2.0 ----------- diff --git a/setup.py b/setup.py index a259685..615798c 100755 --- a/setup.py +++ b/setup.py @@ -80,9 +80,10 @@ ], extras_require = dict( - development = tests_require + ['pre-commit'], - ecdsa = ['ecdsa'], - fastecdsa = ['fastecdsa>=1.0.3'], + development = tests_require + ['pre-commit', 'bandit', 'e', 'pudb', 'ptipython'], + ecdsa = ['fastecdsa>=1.0.3'], + fastecdsa = ['fastecdsa>=1.0.3'], # Deprecated reference. + geo = ['IP2Location'], ), tests_require = tests_require, @@ -101,5 +102,13 @@ 'matches = web.security.predicate:ContextMatch', 'contains = web.security.predicate:ContextContains', ], + 'web.security.heuristic': [ + 'dns = web.security.waf:ClientDNSHeuristic', + 'path = web.security.waf:PathHeuristic', + 'php = web.security.waf:PHPHeuristic', + 'wordpress = web.security.waf:WordpressHeuristic', + 'hosting = web.security.waf:HostingCombinedHeuristic', + 'country = web.security.waf:GeoCountryHeuristic', + ] }, ) diff --git a/web/ext/acl.py b/web/ext/acl.py index ab2ef46..b09886b 100644 --- a/web/ext/acl.py +++ b/web/ext/acl.py @@ -249,7 +249,7 @@ def __init__(self, *_policy, default=None, policy=None): def prepare(self, context): """Called to prepare the request context by adding an `acl` attribute.""" - if __debug__: log.debug("Populating request context with ACL.", extra=dict(request=id(context))) + if __debug__: log.trace("Populating request context with ACL.", extra=context.extra) context.acl = ACL(context=context, policy=self.policy) @@ -262,24 +262,24 @@ def dispatch(self, context, crumb): acl = getattr(crumb.handler, '__acl__', ()) inherit = getattr(crumb.handler, '__acl_inherit__', True) - if __debug__: log.debug(f"Handling dispatch event: {crumb.handler!r} {acl!r}", extra=dict( - request = id(context), - consumed = crumb.path, - handler = safe_name(crumb.handler), - endpoint = crumb.endpoint, - acl = [repr(i) for i in acl], - inherit = inherit, - )) + if __debug__: log.trace(f"Handling dispatch event: {crumb.handler!r} 
{acl!r}", extra={ + 'consumed': crumb.path, + 'handler': safe_name(crumb.handler), + 'endpoint': crumb.endpoint, + 'acl': [repr(i) for i in acl], + 'inherit': inherit, + **context.extra + }) if not inherit: - if __debug__: log.info("Clearing collected access control list.") + if __debug__: log.warn("Clearing collected access control list.") del context.acl[:] context.acl.extend((Path(context.request.path), i, handler) for i in acl) def collect(self, context, handler, args, kw): if not context.acl: - if __debug__: log.debug("Skipping validation of empty ACL.", extra=dict(request=id(context))) + if __debug__: log.debug("Skipping validation of empty ACL.", extra=context.extra) return grant = context.acl.is_authorized diff --git a/web/ext/waf.py b/web/ext/waf.py index 650cb63..e6081c8 100644 --- a/web/ext/waf.py +++ b/web/ext/waf.py @@ -10,13 +10,19 @@ * https://www.cloudflare.com/en-ca/waf/ """ +from abc import ABCMeta, abstractmethod from html import escape +from pathlib import Path from re import compile as re +from socket import inet_aton from typeguard import check_argument_types from uri import URI +from webob import Request +from webob.exc import HTTPBadRequest -from web.core.typing import Any, Dict, Union, Callable, ClassVar, Path, Set, Pattern, Iterable, MutableSet, Optional +from web.core.typing import Any, Union, Callable, ClassVar, Generator, Iterable, Optional +from web.core.typing import Dict, Path, Set, Pattern, MutableSet from web.core.typing import Context, WSGI, WSGIEnvironment, WSGIStartResponse, Request, Response, Tags from web.core.context import Context from web.security.waf import WAFHeuristic @@ -26,15 +32,55 @@ log = __import__('logging').getLogger(__name__) # A standard logger object. 
-ClientSet = MutableSet[str] +ClientSet = MutableSet[bytes] +class PersistentClientSet(ClientSet, metaclass=ABCMeta): + """An ABC describing a mutable set that exposes methods for persisting and restoring its contents.""" + + @abstractmethod + def persist(self, context:Context) -> None: + """Persist the state of the set. + + It is up to the individual implementation to decide how to do this. Typically this would involve serialization + on-disk or the use of some form of data store, such as SQLite, PostgreSQL, or MongoDB. + """ + + raise NotImplementedError() + + @abstractmethod + def restore(self, context:Context) -> None: + """Restore the state of the set. + + It is up to the individual implementation to decide how to do this. Typically this involves deserialization + from disk or the use of some form of data store, such as SQLite, PostgreSQL, or MongoDB. + """ + + raise NotImplementedError() -class WebApplicationFirewallExtension: - """A basic rules-based Web Application Firewall implementation. + +class LineSerializedSet(set, PersistentClientSet): + location:Path # The target path to read and write data from/to. + + def __init__(self, *args, location:Union[str,Path]): + self.location = Path(location) - WIP. - """ + def persist(self, context:Context) -> None: + with self.location.open('w') as fh: + for element in sorted(self): + fh.write(str(element) + "\n") + def restore(self, context:Context) -> None: + self.clear() + + with self.location.open('r') as fh: + for line in fh.readlines(): + self.add(int(line.strip())) + + +class WebApplicationFirewallExtension: + """A basic rules-based Web Application Firewall implementation.""" + + uses:ClassVar[Tags] = {'timing.prefix'} # We want our execution time to be counted. provides:ClassVar[Tags] = {'waf'} # A set of keywords usable in `uses` and `needs` declarations. first:ClassVar[bool] = True # Always try to be first: if truthy, become a dependency for all non-first extensions. 
extensions:ClassVar[Tags] = {'waf.rule'} # A set of entry_point namespaces to search for related plugin registrations. @@ -56,8 +102,12 @@ def __init__(self, *heuristics, blacklist:Optional[ClientSet]=None, exempt:Optio super().__init__() self.heuristics = heuristics - self.blacklist = set() if blacklist is None else blacklist # Permit custom backing stores to be passed in. - self.exempt = set() if exempt is None else exempt # Permit custom backing stores to be passed in. + + # Permit custom backing stores to be passed in; we optimize by storing packed binary values, not strings. + self.blacklist = set() if blacklist is None else blacklist.__class__(inet_aton(i) for i in blacklist) + + # Permit custom backing stores to be passed in for the exemptions, as well. + self.exempt = set() if exempt is None else exempt def __call__(self, context:Context, app:WSGI) -> WSGI: """Wrap the WSGI application callable in our 'web application firewall'.""" @@ -65,10 +115,13 @@ def __call__(self, context:Context, app:WSGI) -> WSGI: assert check_argument_types() def inner(environ:WSGIEnvironment, start_response:WSGIStartResponse): - # Identify the remote user. + try: + request: Request = Request(environ) # This will be remembered and re-used as a singleton later. + uri: URI = URI(request.url) + request.GET # As will this "attempt to access query string parameters", malformation detection. - request: Request = Request(environ) - uri: URI = URI(request.url) + except Exception as e: # Protect against de-serialization errors. + return HTTPBadRequest(f"Encountered error de-serializing the request: {e!r}")(environ, start_response) # https://docs.pylonsproject.org/projects/webob/en/stable/api/request.html#webob.request.BaseRequest.client_addr # Ref: https://www.nginx.com/resources/wiki/start/topics/examples/forwarded/ @@ -76,25 +129,25 @@ def inner(environ:WSGIEnvironment, start_response:WSGIStartResponse): try: # Immediately reject known bad actors. 
- if request.client_addr in self.blacklist: + if inet_aton(request.client_addr) in self.blacklist: return HTTPClose()(environ, start_response) # No need to re-blacklist. # Validate the heuristic rules. for heuristic in self.heuristics: try: - heuristic(environ, uri) + heuristic(environ, uri, client) except HTTPClose as e: log.error(f"{heuristic} {e.args[0].lower()}") raise # Invoke the wrapped application if everything seems OK. Note that this pattern of wrapping permits - # your application to raise HTTPClose if wishing to blacklist the active connection. + # your application to raise HTTPClose if wishing to blacklist the active connection for any reason. return app(environ, start_response) except HTTPClose as e: if request.client_addr not in self.exempt: log.warning(f"Blacklisting: {request.client_addr}") - self.blacklist.add(request.client_addr) + self.blacklist.add(inet_aton(request.client_addr)) if not __debug__: e = HTTPClose() # Do not disclose the reason in production environments. elif ': ' in e.args[0]: # XXX: Not currently effective. @@ -112,32 +165,41 @@ def start(self, context: Context) -> None: Any of the actions you wanted to perform during `__init__` you should do here. """ - ... - + + # Permit the storage objects to resume from a saved state. + if hasattr(self.blacklist, 'restore'): self.blacklist.restore(context) + if hasattr(self.exempt, 'restore'): self.exempt.restore(context) + def stop(self, context: Context) -> None: """Executed during application shutdown after the last request has been served. The first argument is the global context class, not request-local context instance. """ - ... + + # As per startup, permit the storage objects to persist their state. 
+ if hasattr(self.blacklist, 'persist'): self.blacklist.persist(context) + if hasattr(self.exempt, 'persist'): self.exempt.persist(context) - def graceful(self, context: Context, **config) -> None: + def graceful(self, context: Context) -> None: """Called when a SIGHUP is sent to the application. The first argument is the global context class, not request-local context instance. Allows your code to re-load configuration and your code should close then re-open sockets and files. """ - ... + + # Ask the storage object to persist its state, if able. + if hasattr(self.blacklist, 'persist'): self.blacklist.persist(context) + if hasattr(self.exempt, 'persist'): self.exempt.persist(context) - def status(self, context: Context) -> None: + def status(self, context: Context) -> Generator[str, None, None]: """Report on the current status of the Web Application Firewall.""" def plural(quantity, single, plural): return single if quantity == 1 else plural c = len(self.heuristics) - yield f"**Rules:** {c} {plural(c, 'entry', 'entries')}" + yield f"Rules: {c} {plural(c, 'entry', 'entries')}" c = len(self.blacklist) - yield f"**Blacklist:** {c} {plural(c, 'entry', 'entries')}" + yield f"Blacklist: {c} {plural(c, 'entry', 'entries')}" diff --git a/web/security/exc.py b/web/security/exc.py index 1606ab0..7611349 100644 --- a/web/security/exc.py +++ b/web/security/exc.py @@ -4,6 +4,6 @@ class HTTPClose(HTTPClientError): """Indicate to the front-end load balancer (FELB) that it should hang up on the client.""" - code = 499 - title = "Client Closed Request" + code = 444 + title = "Connection Closed Without Response" explanation = "The server did not accept your request." 
diff --git a/web/security/waf.py b/web/security/waf.py index c3134ab..88ddf00 100644 --- a/web/security/waf.py +++ b/web/security/waf.py @@ -11,9 +11,14 @@ from .util import DNS from .exc import HTTPClose +try: + from IP2Location import IP2Location +except ImportError: + IP2Location = None + class WAFHeuristic: - def __call__(self, environ:WSGIEnvironment, uri:URI) -> Optional[bool]: + def __call__(self, environ:WSGIEnvironment, uri:URI, client:str) -> Optional[bool]: """Perform the heuristic check. May return True to indicate processing should stop, raise an HTTPException to propagate to the client, or may @@ -78,7 +83,7 @@ def __repr__(self, *extra:str) -> str: *extra ) - def __call__(self, environ:WSGIEnvironment, uri:URI) -> Optional[bool]: + def __call__(self, environ:WSGIEnvironment, uri:URI, client:str) -> Optional[bool]: assert check_argument_types() addr:str = environ.get(self.origin, '') # Attempt to retrieve the client IP from the WSGI environment. @@ -139,7 +144,7 @@ class PathHeuristic(WAFHeuristic): One can also deny any request targeting a PHP script: - PathHeuristic(re.compile(r'\.phps?($|/)')) + PathHeuristic(re.compile(r'\\.phps?($|/)')) It's important to note that regular expression flags (such as case insensitivity) will be ignored; the search is always case sensitive. (phpMyAdmin != phpmyadmin; these are legitimately separate resources.) @@ -170,7 +175,7 @@ def __repr__(self, *extra:str) -> str: *extra ) - def __call__(self, environ:dict, uri:URI) -> None: + def __call__(self, environ:dict, uri:URI, client:str) -> None: assert check_argument_types() if self.forbidden & set(uri.path.parts): # This is ~a third faster than the simplest regex use. @@ -209,12 +214,30 @@ def __init__(self) -> None: class HostingCombinedHeuristic(PathHeuristic): """A combined set of suspicious URI fragments and general patterns matching commonly exploited tools. 
- This is the result of casually browsing through around ten years of error logs on an active hosting service. + This is the result of casually browsing through around ten years of error logs on an active hosting service and + combines a number of the other PathHeuristic rules into one for convenience. (The WAF already optimizes these down + into a single regex for runtime checking; this is an import optimization.) + + Several filename extensions which ought to be delivered by a front-end load balancer are included in this list; + DO NOT INCLUDE THIS HEURISTIC AT DEVELOPMENT TIME if you are delivering static content via an endpoint within your + application. A critical message will be emitted if used at development time. """ - def __init__(self) -> None: + def __init__(self, *extensions:str) -> None: + """Prepare a 'combined hosting experience' heuristic. + + You can pass in additional extensions to block beyond the basic set included as stringy regular expression + fragments via positional arguments. + """ + + if __debug__: + log.critical("Use of this heuristic if delivering statics from the application at development time will" \ + "likely blacklist you.") + + extensions = set(extensions) | {'html?', 'phps?', 'py', 'js', 'css', 'swf', 'txt', 'md'} + super().__init__( - re(r'\.(html?|swf|phps?)($|/)'), # Bare HTML files, Adobe Flash, or PHP. + re(r'\.(' + '|'.join(sorted(extensions)) + r')($|/)'), # Forbidden filename extensions. re(r'((web)?mail)|(round|cube|roundcube)((web)?mail)?2?(-[0-9\.]+)?'), # Webmail service, in general. 'wm', 'rc', 'rms', 'mss', 'mss2', # More common webmail containers. 'FlexDataServices', 'amfphp', 'soapCaller.bs', # Adobe Flex AMF and RPC services. @@ -222,3 +245,42 @@ def __init__(self) -> None: 'admin', 'mysql', 'phpMyAdmin', 'pma', 'dbadmin', 'MyAdmin', 'phppgadmin', # Common administrative access. 'crossdomain.xml', 'README', 'LICENSE', 'webdav', re(r'w00tw00t'), # Generic probes. 
) + + +class GeoCountryHeuristic(WAFHeuristic): + """A rule which preemptively blocks attempted access from specific countries of origin. + + Example usage: + + GeoCountryHeuristic( + 'cn', 'kp', # China, take that, "Great Firewall", and North Korea. + 'ae', 'ir', 'iq', 'sa', 'tr', # Middle-eastern nations. + 'by', 'ru', 'ua', # Russia and nearby former bloc states. + 'am', 'az', 'ee', 'ge', 'kg', 'kz', 'lt', 'lv', 'md', 'tj', 'tm', 'uz', # Additional former states. + 'af', 'mr', 'ng', 'ph', 'pl', 'sd', 'ye', # LGBTQ and human rights violators, others included above. + ) + """ + + countries: Set[str] # The set of blocked ISO 3166 country codes. + resolver: IP2Location + + def __init__(self, *countries:str, db:str='IP2LOCATION-LITE-DB1.IPV6.BIN') -> None: + """Initialize the country heuristic's geographic database and blacklist.""" + + assert check_argument_types() + + if IP2Location is None: + raise ImportError("You must have the IP2Location library installed.") + + self.countries = {i.upper() for i in countries} + self.resolver = IP2Location(db) + + def __repr__(self, *extra:str) -> str: + countries = "'" + "', '".join(sorted(self.countries)) + "'" + return super().__repr__(countries, *extra) + + def __call__(self, environ:dict, uri:URI, client:str) -> None: + assert check_argument_types() + + if (short := self.resolver.get_country_short(client)) in self.countries: + raise HTTPClose(f"Access from {short} ({self.resolver.get_country_long(client)}) forbidden.")