diff --git a/CHANGELOG b/CHANGELOG
index 07d1d2e74..7e633a575 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,4 @@
 0.1.0 - Initial Public Release
 0.1.1 - Simplfied load of default settings, including wiki demo
 0.2.0 - Complete re-write of HTTP Cookie parsing
+0.2.1 - Support for MITMProxy v0.11 + minor bugfixes
diff --git a/VERSION b/VERSION
index 0ea3a944b..0c62199f1 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.2.0
+0.2.1
diff --git a/automation/DeployBrowsers/deploy_firefox.py b/automation/DeployBrowsers/deploy_firefox.py
index 942b183fa..58dc0d3a0 100755
--- a/automation/DeployBrowsers/deploy_firefox.py
+++ b/automation/DeployBrowsers/deploy_firefox.py
@@ -118,6 +118,24 @@ def deploy_firefox(browser_params, crash_recovery):
     if browser_params['disable_flash']:
         fp.set_preference('plugin.state.flash', 0)

+    # Disable health reports
+    fp.set_preference('datareporting.healthreport.uploadEnabled', False)
+    fp.set_preference('toolkit.telemetry.enabled', False)
+
+    fp.set_preference('extensions.checkCompatibility.nightly', False)
+    fp.set_preference('browser.search.update', False)
+    # Disable know your rights banner
+    fp.set_preference('browser.rights.3.shown', True)
+    fp.set_preference('browser.shell.checkDefaultBrowser', False)
+    fp.set_preference('security.OCSP.enabled', "0")
+    fp.set_preference('browser.safebrowsing.enabled', False)
+    fp.set_preference('devtools.profiler.enabled', False)
+    fp.set_preference('network.seer.enabled', False)  # predictive actions
+    fp.set_preference('network.dns.disablePrefetch', True)  # no need to prefetch
+    fp.set_preference('network.prefetch-next', False)  # no need to prefetch
+    # Disable page thumbnails
+    fp.set_preference('browser.pagethumbnails.capturing_disabled', True)
+
     driver = webdriver.Firefox(firefox_profile=fp)

     # set window size
diff --git a/automation/PostProcessing/build_cookie_table.py b/automation/PostProcessing/build_cookie_table.py
index 845ff12f5..41aded7c6 100644
--- a/automation/PostProcessing/build_cookie_table.py
+++ b/automation/PostProcessing/build_cookie_table.py
@@ -112,7 +112,9 @@ def build_http_cookie_table(database, verbose=False):

     # Parse http request cookies
     commit = 0
-    for req_id, crawl_id, header_str, time_stamp in cur1.execute("SELECT id, crawl_id, headers, time_stamp FROM http_requests"):
+    cur1.execute("SELECT id, crawl_id, headers, time_stamp FROM http_requests \
+                  WHERE id NOT IN (SELECT header_id FROM http_cookies)")
+    for req_id, crawl_id, header_str, time_stamp in cur1.fetchall():
         header = mimetools.Message(StringIO(header_str))
         if header.has_key('Cookie'):
             queries = parse_cookies(header['Cookie'], verbose, http_type = 'request')
@@ -132,7 +134,9 @@ def build_http_cookie_table(database, verbose=False):

     # Parse http response cookies
     commit = 0
-    for resp_id, crawl_id, req_url, header_str, time_stamp in cur1.execute("SELECT id, crawl_id, url, headers, time_stamp FROM http_responses"):
+    cur1.execute("SELECT id, crawl_id, url, headers, time_stamp FROM http_responses \
+                  WHERE id NOT IN (SELECT header_id FROM http_cookies)")
+    for resp_id, crawl_id, req_url, header_str, time_stamp in cur1.fetchall():
         header = mimetools.Message(StringIO(header_str))
         for cookie_str in header.getallmatchingheaders('Set-Cookie'):
             queries = parse_cookies(cookie_str, verbose, url = req_url, http_type = 'response')
diff --git a/automation/Proxy/MITMProxy.py b/automation/Proxy/MITMProxy.py
index d9184f970..13b85393a 100644
--- a/automation/Proxy/MITMProxy.py
+++ b/automation/Proxy/MITMProxy.py
@@ -30,28 +30,28 @@ def __init__(self, server, crawl_id, url_queue, db_socket_address):
         controller.Master.__init__(self, server)

-    def load_process_message(self, q):
+    def load_process_message(self, q, timeout):
         """ Tries to read and process a message from the proxy queue, returns True iff this succeeds """
         try:
-            msg = q.get(timeout=0.01)
+            msg = q.get(timeout=timeout)
             controller.Master.handle(self, *msg)
             return True
         except Queue.Empty:
             return False

-    def tick(self, q):
+    def tick(self, q, timeout=0.01):
         """ new tick function used to label first-party domains and avoid race conditions when doing so """
         if self.curr_top_url is None:  # proxy is fresh, need to get first-party domain right away
             self.curr_top_url = self.url_queue.get()
         elif not self.url_queue.empty():  # new FP has been visited
             # drains the queue to get rid of stale messages from previous site
-            while self.load_process_message(q):
+            while self.load_process_message(q, timeout):
                 pass
             self.prev_requests, self.curr_requests = self.curr_requests, set()
             self.prev_top_url, self.curr_top_url = self.curr_top_url, self.url_queue.get()
-        self.load_process_message(q)
+        self.load_process_message(q, timeout)

     def run(self):
         """ Light wrapper around run with error printing """
@@ -70,7 +70,7 @@ def run(self):
     def handle_request(self, msg):
         """ Receives HTTP request, and sends it to logging function """
         msg.reply()
-        self.curr_requests.add(msg)
+        self.curr_requests.add(msg.request)
         mitm_commands.process_general_mitm_request(self.db_socket, self.crawl_id, self.curr_top_url, msg)

     # Record data from HTTP responses
diff --git a/automation/Proxy/mitmproxy.crt b/automation/Proxy/cert/mitmproxy-ca.crt
similarity index 100%
rename from automation/Proxy/mitmproxy.crt
rename to automation/Proxy/cert/mitmproxy-ca.crt
diff --git a/automation/Proxy/mitmproxy.key b/automation/Proxy/cert/mitmproxy-ca.key
similarity index 100%
rename from automation/Proxy/mitmproxy.key
rename to automation/Proxy/cert/mitmproxy-ca.key
diff --git a/automation/Proxy/mitmproxy.pem b/automation/Proxy/cert/mitmproxy-ca.pem
similarity index 100%
rename from automation/Proxy/mitmproxy.pem
rename to automation/Proxy/cert/mitmproxy-ca.pem
diff --git a/automation/Proxy/cert/mitmproxy-dhparam.pem b/automation/Proxy/cert/mitmproxy-dhparam.pem
new file mode 100644
index 000000000..92ff96a9a
--- /dev/null
+++ b/automation/Proxy/cert/mitmproxy-dhparam.pem
@@ -0,0 +1,5 @@
+-----BEGIN DH PARAMETERS-----
+MIGHAoGBAOdPzMbYgoYfO3YBYauCLRlE8X1XypTiAjoeCFD0qWRx8YUsZ6Sj20W5
+zsfQxlZfKovo3f2MftjkDkbI/C/tDgxoe0ZPbjy5CjdOhkzxn0oTbKTs16Rw8DyK
+1LjTR65sQJkJEdgsX8TSi/cicCftJZl9CaZEaObF2bdgSgGK+PezAgEC
+-----END DH PARAMETERS-----
\ No newline at end of file
diff --git a/automation/Proxy/deploy_mitm_proxy.py b/automation/Proxy/deploy_mitm_proxy.py
index e7138839d..14a336a92 100644
--- a/automation/Proxy/deploy_mitm_proxy.py
+++ b/automation/Proxy/deploy_mitm_proxy.py
@@ -5,6 +5,7 @@
 import MITMProxy

 from libmproxy import proxy
+from libmproxy.proxy.server import ProxyServer


 def init_proxy(db_socket_address, crawl_id):
@@ -22,8 +23,8 @@ def init_proxy(db_socket_address, crawl_id):
     proxy_port = sock.getsockname()[1]
     sock.close()

-    config = proxy.ProxyConfig(cacert=os.path.join(os.path.dirname(__file__), 'mitmproxy.pem'),)
-    server = proxy.ProxyServer(config, proxy_port)
+    config = proxy.ProxyConfig(cadir=os.path.join(os.path.dirname(__file__), 'cert'), port=proxy_port)
+    server = ProxyServer(config)

     print 'Intercepting Proxy listening on ' + str(proxy_port)
     m = MITMProxy.InterceptingMaster(server, crawl_id, proxy_site_queue, db_socket_address)
     thread = threading.Thread(target=m.run, args=())
diff --git a/automation/Proxy/mitm_commands.py b/automation/Proxy/mitm_commands.py
index 459458b32..b93f1799d 100644
--- a/automation/Proxy/mitm_commands.py
+++ b/automation/Proxy/mitm_commands.py
@@ -5,19 +5,19 @@

 def process_general_mitm_request(db_socket, crawl_id, top_url, msg):
     """ Logs a HTTP request object """
-    referrer = msg.headers['referer'][0] if len(msg.headers['referer']) > 0 else ''
-
-    data = (crawl_id, msg.get_url(), msg.method, referrer, str(msg.headers), top_url, str(datetime.datetime.now()))
+    referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else ''
+
+    data = (crawl_id, msg.request.url, msg.request.method, referrer, str(msg.request.headers), top_url, str(datetime.datetime.now()))

     db_socket.send(("INSERT INTO http_requests (crawl_id, url, method, referrer, headers, "
                     "top_url, time_stamp) VALUES (?,?,?,?,?,?,?)", data))


 def process_general_mitm_response(db_socket, crawl_id, top_url, msg):
     """ Logs a HTTP response object and, if necessary, """
-    referrer = msg.headers['referer'][0] if len(msg.headers['referer']) > 0 else ''
-    location = msg.headers['location'][0] if len(msg.headers['location']) > 0 else ''
-
-    data = (crawl_id, msg.request.get_url(), msg.request.method, referrer, msg.code, msg.msg, str(msg.headers),
+    referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else ''
+    location = msg.response.headers['location'][0] if len(msg.response.headers['location']) > 0 else ''
+
+    data = (crawl_id, msg.request.url, msg.request.method, referrer, msg.response.code, msg.response.msg, str(msg.response.headers),
             location, top_url, str(datetime.datetime.now()))

     db_socket.send(("INSERT INTO http_responses (crawl_id, url, method, referrer, response_status, "
                     "response_status_text, headers, location, top_url, time_stamp) VALUES (?,?,?,?,?,?,?,?,?,?)", data))
diff --git a/automation/TaskManager.py b/automation/TaskManager.py
index 5caa6fc5c..271b81890 100644
--- a/automation/TaskManager.py
+++ b/automation/TaskManager.py
@@ -158,7 +158,7 @@ def close(self):

     # CRAWLER COMMAND CODE

-    def distribute_command(self, command, index=None, timeout=None):
+    def distribute_command(self, command, index=None, timeout=None, reset=False):
         """
         parses command type and issues command(s) to the proper browser
         <index> specifies the type of command this is:
@@ -173,7 +173,7 @@ def distribute_command(self, command, index=None, timeout=None):
             while True:
                 for browser in self.browsers:
                     if browser.ready():
-                        self.start_thread(browser, command, timeout)
+                        self.start_thread(browser, command, timeout, reset)
                         command_executed = True
                         break
                 if command_executed:
@@ -184,7 +184,7 @@ def distribute_command(self, command, index=None, timeout=None):
             #send the command to this specific browser
             while True:
                 if self.browsers[index].ready():
-                    self.start_thread(self.browsers[index], command, timeout)
+                    self.start_thread(self.browsers[index], command, timeout, reset)
                     break
                 time.sleep(SLEEP_CONS)
         elif index == '*':
@@ -193,7 +193,7 @@ def distribute_command(self, command, index=None, timeout=None):
             while False in command_executed:
                 for i in xrange(len(self.browsers)):
                     if self.browsers[i].ready() and not command_executed[i]:
-                        self.start_thread(self.browsers[i], command, timeout)
+                        self.start_thread(self.browsers[i], command, timeout, reset)
                         command_executed[i] = True
                 time.sleep(SLEEP_CONS)
         elif index == '**':
@@ -203,24 +203,23 @@ def distribute_command(self, command, index=None, timeout=None):
             while False in command_executed:
                 for i in xrange(len(self.browsers)):
                     if self.browsers[i].ready() and not command_executed[i]:
-                        self.start_thread(self.browsers[i], command, timeout, condition)
+                        self.start_thread(self.browsers[i], command, timeout, reset, condition)
                         command_executed[i] = True
                 time.sleep(SLEEP_CONS)
             with condition:
                 condition.notifyAll()  # All browsers loaded, tell them to start
         else:
-            #not a supported command
             print "Command index type is not supported or out of range"

-    def start_thread(self, browser, command, timeout, condition=None):
+    def start_thread(self, browser, command, timeout, reset, condition=None):
         """  starts the command execution thread """
-        args = (browser, command, timeout, condition)
+        args = (browser, command, timeout, reset, condition)
         thread = threading.Thread(target=self.issue_command, args=args)
         browser.command_thread = thread
         thread.daemon = True
         thread.start()

-    def issue_command(self, browser, command, timeout=None, condition=None):
+    def issue_command(self, browser, command, timeout, reset, condition=None):
         """
         sends command tuple to the BrowserManager
         <timeout> gives the option to override default timeout
@@ -241,7 +240,6 @@ def issue_command(self, browser, command, timeout=None, condition=None):
         try:
             status = browser.status_queue.get(True, timeout)
             if status == "OK":
-                #print str(browser.crawl_id) + " " + "got OK"
                 command_succeeded = True
             else:
                 print("Received failure status while executing command: " + command[0])
@@ -253,14 +251,16 @@ def issue_command(self, browser, command, timeout=None, condition=None):
                                " VALUES (?,?,?,?)",
                                (browser.crawl_id, command[0], command_arguments, command_succeeded)))

-        if not command_succeeded:
+        if reset:
+            browser.reset()
+        elif not command_succeeded:
             browser.restart_browser_manager()

     # DEFINITIONS OF HIGH LEVEL COMMANDS

-    def get(self, url, index=None, overwrite_timeout=None):
+    def get(self, url, index=None, overwrite_timeout=None, reset=False):
         """ goes to a url """
-        self.distribute_command(('GET', url), index, overwrite_timeout)
+        self.distribute_command(('GET', url), index, overwrite_timeout, reset)

     def dump_storage_vectors(self, url, start_time, index=None, overwrite_timeout=None):
         """ dumps the local storage vectors (flash, localStorage, cookies) to db """
diff --git a/run_simple_crawl.py b/run_simple_crawl.py
deleted file mode 100644
index 2523c701f..000000000
--- a/run_simple_crawl.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from automation import TaskManager
-import sys
-import json
-import os
-
-# Runs a basic crawl which simply runs through a list of websites
-
-
-def load_sites(site_path):
-    """ loads a list of websites from a text file """
-    sites = []
-
-    f = open(site_path)
-    for site in f:
-        cleaned_site = site.strip() if site.strip().startswith("http") else "http://" + site.strip()
-        sites.append(cleaned_site)
-    f.close()
-
-    return sites
-
-
-def run_site_crawl(db_path, sites, preferences, dump_location):
-    """
-    runs the crawl itself
-    <db_path> is the absolute path of crawl database
-    <preferences> is a dictionary of preferences to initialize the crawler
-    """
-    manager = TaskManager.TaskManager(db_path, preferences, 1)
-
-    for site in sites:
-        manager.get(site)
-
-    if dump_location:
-        manager.dump_profile(dump_location,True)
-
-    manager.close()
-
-
-def print_help_message():
-    """ prints out the help message in the case that too few arguments are mentioned """
-    print "\nMust call simple crawl script with at least one arguments: \n" \
-          "The absolute directory path of the new crawl DB\n" \
-          "Other command line argument flags are:\n" \
"-browser: specifies type of browser to use (firefox or chrome)\n" \ - "-donottrack: True/False value as to whether to use the Do Not Track flag\n" \ - "-tp_cookies: string designating third-party cookie preferences: always, never or just_visted\n" \ - "-proxy: True/False value as to whether to use proxy-based instrumentation\n" \ - "-headless: True/False value as to whether to run browser in headless mode\n" \ - "-timeout: timeout (in seconds) for the TaskManager to default time out loads\n" \ - "-profile_tar: absolute path of folder in which to load tar-zipped user profile\n" \ - "-dump_location: absolute path of folder in which to dump tar-zipped user profile\n" \ - "-bot_mitigation: True/False value as to whether to enable bot-mitigation measures" - - -def main(argv): - """ main helper function, reads command-line arguments and launches crawl """ - - # filters out bad arguments - if len(argv) < 3 or len(argv) % 2 == 0: - print_help_message() - return - - db_path = argv[1] # absolute path for the database - site_file = argv[2] # absolute path of the file that contains the list of sites to visit - sites = load_sites(site_file) - - # loads up the default preference dictionary - fp = open(os.path.join(os.path.dirname(__file__), 'automation/default_settings.json')) - preferences = json.load(fp) - fp.close() - - dump_location = None - # overwrites the default preferences based on command-line inputs - for i in xrange(3, len(argv), 2): - if argv[i] == "-browser": - preferences["browser"] = "chrome" if argv[i+1].lower() == "chrome" else "firefox" - elif argv[i] == "-donottrack": - preferences["donottrack"] = True if argv[i+1].lower() == "true" else False - elif argv[i] == "-tp_cookies": - preferences["tp_cookies"] = argv[i+1].lower() - elif argv[i] == "-proxy": - preferences["proxy"] = True if argv[i+1].lower() == "true" else False - elif argv[i] == "-headless": - preferences["headless"] = True if argv[i+1].lower() == "true" else False - elif argv[i] == "-bot_mitigation": - preferences["bot_mitigation"] = True if argv[i+1].lower() == "true" else False - elif argv[i] == "-timeout": - preferences["timeout"] = float(argv[i+1]) if float(argv[i]) > 0 else 30.0 - elif argv[i] == "-profile_tar": - preferences["profile_tar"] = argv[i+1] - elif argv[i] == "-disable_flash": - preferences["disable_flash"] = True if argv[i+1].lower() == "true" else False - elif argv[i] == "-dump_location": - dump_location = argv[i+1] - - # launches the crawl with the updated preferences - run_site_crawl(db_path, sites, preferences, dump_location) - -if __name__ == "__main__": - main(sys.argv) diff --git a/test_sites.txt b/test_sites.txt deleted file mode 100644 index aa41ad127..000000000 --- a/test_sites.txt +++ /dev/null @@ -1,5 +0,0 @@ -google.com -yahoo.com -buzzfeed.com -youtube.com -bbc.com