diff --git a/.gitignore b/.gitignore index 69ee2d9c..62a30b15 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,5 @@ __pycache__ scripts/pulse-results/*.json .DS_Store venv +build/ +dist/ diff --git a/gatherers/__init__.py b/domain_scan/__init__.py similarity index 100% rename from gatherers/__init__.py rename to domain_scan/__init__.py diff --git a/runner/__init__.py b/domain_scan/gatherers/__init__.py similarity index 100% rename from runner/__init__.py rename to domain_scan/gatherers/__init__.py diff --git a/gatherers/censys.py b/domain_scan/gatherers/censys.py similarity index 98% rename from gatherers/censys.py rename to domain_scan/gatherers/censys.py index c13dc106..0521cc78 100644 --- a/gatherers/censys.py +++ b/domain_scan/gatherers/censys.py @@ -8,8 +8,8 @@ from google.oauth2 import service_account import google.api_core.exceptions -from gatherers.gathererabc import Gatherer -from utils import utils +from domain_scan.gatherers.gathererabc import Gatherer +from domain_scan.utils import utils # Options: # diff --git a/gatherers/gathererabc.py b/domain_scan/gatherers/gathererabc.py similarity index 100% rename from gatherers/gathererabc.py rename to domain_scan/gatherers/gathererabc.py diff --git a/gatherers/rdns.py b/domain_scan/gatherers/rdns.py similarity index 97% rename from gatherers/rdns.py rename to domain_scan/gatherers/rdns.py index 6a20f812..da51fdcd 100644 --- a/gatherers/rdns.py +++ b/domain_scan/gatherers/rdns.py @@ -3,7 +3,7 @@ import re from typing import Generator, List, Pattern -from gatherers.gathererabc import Gatherer +from domain_scan.gatherers.gathererabc import Gatherer # Reverse DNS # diff --git a/gatherers/url.py b/domain_scan/gatherers/url.py similarity index 91% rename from gatherers/url.py rename to domain_scan/gatherers/url.py index cebcef34..46cca50c 100644 --- a/gatherers/url.py +++ b/domain_scan/gatherers/url.py @@ -3,8 +3,8 @@ import requests -from gatherers.gathererabc import Gatherer -from utils import utils +from 
domain_scan.gatherers.gathererabc import Gatherer +from domain_scan.utils import utils class Gatherer(Gatherer): diff --git a/scanners/__init__.py b/domain_scan/runner/__init__.py similarity index 100% rename from scanners/__init__.py rename to domain_scan/runner/__init__.py diff --git a/runner/runner.py b/domain_scan/runner/runner.py similarity index 97% rename from runner/runner.py rename to domain_scan/runner/runner.py index fde4f894..371f93dc 100644 --- a/runner/runner.py +++ b/domain_scan/runner/runner.py @@ -1,4 +1,4 @@ -from utils import utils +from domain_scan.utils import utils def write_rows(rows, domain, base_domain, scanner, csv_writer, meta=None): diff --git a/utils/__init__.py b/domain_scan/scanners/__init__.py similarity index 100% rename from utils/__init__.py rename to domain_scan/scanners/__init__.py diff --git a/scanners/a11y.py b/domain_scan/scanners/a11y.py similarity index 99% rename from scanners/a11y.py rename to domain_scan/scanners/a11y.py index 031da53a..8db073f5 100644 --- a/scanners/a11y.py +++ b/domain_scan/scanners/a11y.py @@ -4,7 +4,7 @@ import requests import yaml -from utils import utils +from domain_scan.utils import utils workers = 3 diff --git a/scanners/analytics.py b/domain_scan/scanners/analytics.py similarity index 97% rename from scanners/analytics.py rename to domain_scan/scanners/analytics.py index 1a032ac2..ceeaa7df 100644 --- a/scanners/analytics.py +++ b/domain_scan/scanners/analytics.py @@ -1,7 +1,7 @@ import logging import os -from utils import utils +from domain_scan.utils import utils # Check whether a domain is present in a CSV, set in --analytics. 
diff --git a/scanners/csp.py b/domain_scan/scanners/csp.py similarity index 98% rename from scanners/csp.py rename to domain_scan/scanners/csp.py index 607d7c5a..2fdf4c52 100644 --- a/scanners/csp.py +++ b/domain_scan/scanners/csp.py @@ -1,6 +1,6 @@ import logging import requests -from scanners import utils +from domain_scan.scanners import utils ### # CSP Scanner - check the presence of CSP headers diff --git a/domain_scan/scanners/headless/__init__.py b/domain_scan/scanners/headless/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scanners/headless/base.js b/domain_scan/scanners/headless/base.js similarity index 100% rename from scanners/headless/base.js rename to domain_scan/scanners/headless/base.js diff --git a/scanners/headless/local_bridge.js b/domain_scan/scanners/headless/local_bridge.js similarity index 100% rename from scanners/headless/local_bridge.js rename to domain_scan/scanners/headless/local_bridge.js diff --git a/scanners/headless/local_bridge.py b/domain_scan/scanners/headless/local_bridge.py similarity index 96% rename from scanners/headless/local_bridge.py rename to domain_scan/scanners/headless/local_bridge.py index b9e701a1..f81927cc 100644 --- a/scanners/headless/local_bridge.py +++ b/domain_scan/scanners/headless/local_bridge.py @@ -1,7 +1,7 @@ import logging import json -from utils import utils +from domain_scan.utils import utils ### # Local Python bridge to the JS bridge to the JS scanner. 
diff --git a/scanners/noop.py b/domain_scan/scanners/noop.py similarity index 100% rename from scanners/noop.py rename to domain_scan/scanners/noop.py diff --git a/scanners/pshtt.py b/domain_scan/scanners/pshtt.py similarity index 99% rename from scanners/pshtt.py rename to domain_scan/scanners/pshtt.py index 993c5e6f..788582b2 100644 --- a/scanners/pshtt.py +++ b/domain_scan/scanners/pshtt.py @@ -4,7 +4,7 @@ import re from pshtt import pshtt -from utils import utils +from domain_scan.utils import utils ### # Measure a site's HTTP behavior using DHS NCATS' pshtt tool. diff --git a/scanners/sslyze.py b/domain_scan/scanners/sslyze.py similarity index 99% rename from scanners/sslyze.py rename to domain_scan/scanners/sslyze.py index e9321277..0877bc0c 100644 --- a/scanners/sslyze.py +++ b/domain_scan/scanners/sslyze.py @@ -27,7 +27,7 @@ from cryptography.hazmat.primitives.serialization import Encoding from cryptography.hazmat.primitives.asymmetric import ec, dsa, rsa -from utils import utils +from domain_scan.utils import utils # Number of seconds to wait during sslyze connection check. # Not much patience here, and very willing to move on. diff --git a/scanners/third_parties.js b/domain_scan/scanners/third_parties.js similarity index 100% rename from scanners/third_parties.js rename to domain_scan/scanners/third_parties.js diff --git a/scanners/third_parties.py b/domain_scan/scanners/third_parties.py similarity index 98% rename from scanners/third_parties.py rename to domain_scan/scanners/third_parties.py index 6f76a7aa..7856ac81 100644 --- a/scanners/third_parties.py +++ b/domain_scan/scanners/third_parties.py @@ -1,6 +1,6 @@ import logging -from utils import utils +from domain_scan.utils import utils # Evaluate third party service usage using Chrome headless. 
diff --git a/scanners/trustymail.py b/domain_scan/scanners/trustymail.py similarity index 100% rename from scanners/trustymail.py rename to domain_scan/scanners/trustymail.py diff --git a/domain_scan/utils/__init__.py b/domain_scan/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/known_services.json b/domain_scan/utils/known_services.json similarity index 100% rename from utils/known_services.json rename to domain_scan/utils/known_services.json diff --git a/utils/utils.py b/domain_scan/utils/utils.py similarity index 100% rename from utils/utils.py rename to domain_scan/utils/utils.py diff --git a/gather b/gather index a60fce92..6727b80e 100755 --- a/gather +++ b/gather @@ -9,7 +9,7 @@ import requests import logging import importlib -from utils import utils +from domain_scan.utils import utils # some metadata about the scan itself start_time = utils.local_now() @@ -55,14 +55,14 @@ def run(options=None, cache_dir="./cache", results_dir="./results"): try: gatherer_module = importlib.import_module( - "gatherers.%s" % source) + "domain_scan.gatherers.%s" % source) gatherer = gatherer_module.Gatherer(suffixes, options, extra) except ImportError: # If it's not a registered module, allow it to be "hot registered" # as long as the user gave us a flag with that name that can be # used as the --url option to the URL module. if options.get(source): - gatherer_module = importlib.import_module("gatherers.url") + gatherer_module = importlib.import_module("domain_scan.gatherers.url") extra['name'] = source gatherer = gatherer_module.Gatherer(suffixes, options, extra) else: diff --git a/lambda/lambda_handler.py b/lambda/lambda_handler.py index c09502b4..5982af23 100644 --- a/lambda/lambda_handler.py +++ b/lambda/lambda_handler.py @@ -2,7 +2,7 @@ import sys import logging -from utils import utils +from domain_scan.utils import utils # Central handler for all Lambda events. 
def handler(event, context): @@ -19,7 +19,7 @@ def handler(event, context): # Might be acceptable to let this crash the module, in Lambda. try: - scanner = importlib.import_module("scanners.%s" % name) + scanner = importlib.import_module("domain_scan.scanners.%s" % name) except ImportError: exc_type, exc_value, exc_traceback = sys.exc_info() logging.error("[%s] Scanner not found, or had an error during loading.\n\tERROR: %s\n\t%s" % (name, exc_type, exc_value)) @@ -49,4 +49,3 @@ def handler(event, context): # date transform functions in one place, before Amazon's built-in # JSON serialization prepares the data for transport. return utils.from_json(utils.json_for(response)) - diff --git a/scan b/scan index 6580de59..da322d78 100755 --- a/scan +++ b/scan @@ -16,9 +16,9 @@ import boto3 import botocore from concurrent.futures import ThreadPoolExecutor -from scanners.headless.local_bridge import headless_scan -from utils import utils -from runner import runner +from domain_scan.scanners.headless.local_bridge import headless_scan +from domain_scan.utils import utils +from domain_scan.runner import runner # Default and maximum for local workers (threads) per-scanner. 
@@ -130,7 +130,7 @@ def run(options=None): for name in options.get("scan").split(","): try: - scanner = importlib.import_module("scanners.%s" % name) + scanner = importlib.import_module("domain_scan.scanners.%s" % name) except ImportError: exc_type, exc_value, exc_traceback = sys.exc_info() logging.error("[%s] Scanner not found, or had an error during loading.\n\tERROR: %s\n\t%s" % (name, exc_type, exc_value)) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..0b2360c1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal = true diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..8b996223 --- /dev/null +++ b/setup.py @@ -0,0 +1,84 @@ +""" +setup module for domain-scan + +Based on: + +- https://packaging.python.org/distributing/ +- https://github.com/pypa/sampleproject/blob/master/setup.py +- https://github.com/dhs-ncats/pshtt/blob/master/setup.py +""" + +from setuptools import setup, find_packages + +setup( + name='domain-scan', + + # Versions should comply with PEP440 + version='0.1.0-dev1', + description='lightweight scan pipeline for orchestrating third party tools, at scale and (optionally) using serverless infrastructure', + + # 18F "homepage" + url='https://18f.gsa.gov', + # The project's main homepage + download_url='https://github.com/18F/domain-scan', + + # Author details + author='GSA 18F', + author_email='pulse@cio.gov', + + license='License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication', + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # How mature is this project? Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + 'Development Status :: 4 - Beta', + + # Indicate who your project is intended for + 'Intended Audience :: Developers', + + # Pick your license as you wish (should match "license" above) + 'License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication', + + # Specify the Python versions you support here. 
In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + ], + + # What does your project relate to? + keywords='https best practices web-crawling domain scanning', + + packages=find_packages(), + + install_requires=[ + 'strict-rfc3339', + 'publicsuffix', + 'boto3', + 'ipython', + 'sslyze>=1.3.4,<1.4.0', + 'cryptography', + 'pyyaml', + 'requests', + 'google-cloud-bigquery', + 'google-auth-oauthlib' + ], + + extras_require={ + 'test': [ + 'pytest' + ], + }, + + # Conveniently allows one to run the CLI scripts + scripts=[ + 'gather', + 'scan', + ] +) diff --git a/tests/test_utils.py b/tests/test_utils.py index c4e9f1b2..bc0b47c1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ import sys import pytest from .context import utils # noqa -from utils import utils as subutils +from domain_scan.utils import utils as subutils def get_default_false_values(parser):