Merge remote-tracking branch 'origin' into release
Conflicts:
	VERSION
	demo.py
englehardt committed Dec 31, 2014
2 parents 3673992 + 5fccd99 commit 1fe205e
Showing 14 changed files with 60 additions and 136 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
@@ -1,3 +1,4 @@
0.1.0 - Initial Public Release
0.1.1 - Simplified load of default settings, including wiki demo
0.2.0 - Complete re-write of HTTP Cookie parsing
0.2.1 - Support for MITMProxy v0.11 + minor bugfixes
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.2.0
0.2.1
18 changes: 18 additions & 0 deletions automation/DeployBrowsers/deploy_firefox.py
@@ -118,6 +118,24 @@ def deploy_firefox(browser_params, crash_recovery):
if browser_params['disable_flash']:
fp.set_preference('plugin.state.flash', 0)

# Disable health reports
fp.set_preference('datareporting.healthreport.uploadEnabled', False)
fp.set_preference('toolkit.telemetry.enabled', False)

fp.set_preference('extensions.checkCompatibility.nightly', False)
fp.set_preference('browser.search.update', False)
# Disable know your rights banner
fp.set_preference('browser.rights.3.shown', True)
fp.set_preference('browser.shell.checkDefaultBrowser', False)
fp.set_preference('security.OCSP.enabled', "0")
fp.set_preference('browser.safebrowsing.enabled', False)
fp.set_preference('devtools.profiler.enabled', False)
fp.set_preference('network.seer.enabled', False) # predictive actions
fp.set_preference('network.dns.disablePrefetch', True) # no need to prefetch
fp.set_preference('network.prefetch-next', False) # no need to prefetch
# Disable page thumbnails
fp.set_preference('browser.pagethumbnails.capturing_disabled', True)

driver = webdriver.Firefox(firefox_profile=fp)

# set window size
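
For context, a minimal standalone sketch (not part of the commit) of how a profile with preferences like these is handed to Selenium. The example URL is an assumption; the preference names mirror the diff above.

# Minimal sketch, not from the commit: build a FirefoxProfile with the same
# style of noise-reducing preferences and hand it to Selenium, as deploy_firefox does.
from selenium import webdriver

fp = webdriver.FirefoxProfile()
fp.set_preference('datareporting.healthreport.uploadEnabled', False)  # no health reports
fp.set_preference('toolkit.telemetry.enabled', False)                 # no telemetry
fp.set_preference('network.prefetch-next', False)                     # no link prefetch
driver = webdriver.Firefox(firefox_profile=fp)
driver.get('http://example.com')
driver.quit()
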
8 changes: 6 additions & 2 deletions automation/PostProcessing/build_cookie_table.py
@@ -112,7 +112,9 @@ def build_http_cookie_table(database, verbose=False):

# Parse http request cookies
commit = 0
for req_id, crawl_id, header_str, time_stamp in cur1.execute("SELECT id, crawl_id, headers, time_stamp FROM http_requests"):
cur1.execute("SELECT id, crawl_id, headers, time_stamp FROM http_requests \
WHERE id NOT IN (SELECT header_id FROM http_cookies)")
for req_id, crawl_id, header_str, time_stamp in cur1.fetchall():
header = mimetools.Message(StringIO(header_str))
if header.has_key('Cookie'):
queries = parse_cookies(header['Cookie'], verbose, http_type = 'request')
@@ -132,7 +134,9 @@ def build_http_cookie_table(database, verbose=False):

# Parse http response cookies
commit = 0
for resp_id, crawl_id, req_url, header_str, time_stamp in cur1.execute("SELECT id, crawl_id, url, headers, time_stamp FROM http_responses"):
cur1.execute("SELECT id, crawl_id, url, headers, time_stamp FROM http_responses \
WHERE id NOT IN (SELECT header_id FROM http_cookies)")
for resp_id, crawl_id, req_url, header_str, time_stamp in cur1.fetchall():
header = mimetools.Message(StringIO(header_str))
for cookie_str in header.getallmatchingheaders('Set-Cookie'):
queries = parse_cookies(cookie_str, verbose, url = req_url, http_type = 'response')
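
The NOT IN subquery plus fetchall() makes the post-processing script safe to re-run: only request and response rows that have not already produced entries in http_cookies are parsed again. A standalone sketch of that pattern follows; the in-memory tables and sample rows are assumptions for illustration only.

# Standalone sketch of the incremental-processing pattern used above.
# Table names mirror the diff; the data is made up.
import sqlite3

db = sqlite3.connect(':memory:')
db.execute("CREATE TABLE http_requests (id INTEGER PRIMARY KEY, headers TEXT)")
db.execute("CREATE TABLE http_cookies (id INTEGER PRIMARY KEY, header_id INTEGER)")
db.execute("INSERT INTO http_requests (id, headers) VALUES (1, 'Cookie: a=1')")
db.execute("INSERT INTO http_requests (id, headers) VALUES (2, 'Cookie: b=2')")
db.execute("INSERT INTO http_cookies (header_id) VALUES (1)")  # request 1 already parsed

cur = db.cursor()
cur.execute("SELECT id, headers FROM http_requests "
            "WHERE id NOT IN (SELECT header_id FROM http_cookies)")
for req_id, header_str in cur.fetchall():  # fetchall() reads the batch up front
    db.execute("INSERT INTO http_cookies (header_id) VALUES (?)", (req_id,))
print(db.execute("SELECT header_id FROM http_cookies").fetchall())  # [(1,), (2,)]
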
12 changes: 6 additions & 6 deletions automation/Proxy/MITMProxy.py
@@ -30,28 +30,28 @@ def __init__(self, server, crawl_id, url_queue, db_socket_address):

controller.Master.__init__(self, server)

def load_process_message(self, q):
def load_process_message(self, q, timeout):
""" Tries to read and process a message from the proxy queue, returns True iff this succeeds """
try:
msg = q.get(timeout=0.01)
msg = q.get(timeout=timeout)
controller.Master.handle(self, *msg)
return True
except Queue.Empty:
return False

def tick(self, q):
def tick(self, q, timeout=0.01):
""" new tick function used to label first-party domains and avoid race conditions when doing so """
if self.curr_top_url is None: # proxy is fresh, need to get first-party domain right away
self.curr_top_url = self.url_queue.get()
elif not self.url_queue.empty(): # new FP has been visited
# drains the queue to get rid of stale messages from previous site
while self.load_process_message(q):
while self.load_process_message(q, timeout):
pass

self.prev_requests, self.curr_requests = self.curr_requests, set()
self.prev_top_url, self.curr_top_url = self.curr_top_url, self.url_queue.get()

self.load_process_message(q)
self.load_process_message(q, timeout)

def run(self):
""" Light wrapper around run with error printing """
@@ -70,7 +70,7 @@ def run(self):
def handle_request(self, msg):
""" Receives HTTP request, and sends it to logging function """
msg.reply()
self.curr_requests.add(msg)
self.curr_requests.add(msg.request)
mitm_commands.process_general_mitm_request(self.db_socket, self.crawl_id, self.curr_top_url, msg)

# Record data from HTTP responses
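
The new timeout argument lets tick() drain stale messages from the proxy queue quickly when a new first-party site is visited. Below is a standalone sketch of the drain-with-timeout pattern, written in Python 2 style to match the module; the print is a stand-in for controller.Master.handle().

# Minimal sketch of the pattern behind load_process_message()/tick().
# Python 2 style to match the module (the Queue module is named queue in Python 3).
import Queue

def load_process_message(q, timeout):
    """Handle one queued message; return True iff one was available."""
    try:
        msg = q.get(timeout=timeout)
        print('handled: ' + str(msg))  # stand-in for controller.Master.handle()
        return True
    except Queue.Empty:
        return False

q = Queue.Queue()
for i in range(3):
    q.put(('message', i))
while load_process_message(q, timeout=0.01):  # drain everything that is waiting
    pass
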
File renamed without changes.
File renamed without changes.
File renamed without changes.
5 changes: 5 additions & 0 deletions automation/Proxy/cert/mitmproxy-dhparam.pem
@@ -0,0 +1,5 @@
-----BEGIN DH PARAMETERS-----
MIGHAoGBAOdPzMbYgoYfO3YBYauCLRlE8X1XypTiAjoeCFD0qWRx8YUsZ6Sj20W5
zsfQxlZfKovo3f2MftjkDkbI/C/tDgxoe0ZPbjy5CjdOhkzxn0oTbKTs16Rw8DyK
1LjTR65sQJkJEdgsX8TSi/cicCftJZl9CaZEaObF2bdgSgGK+PezAgEC
-----END DH PARAMETERS-----
5 changes: 3 additions & 2 deletions automation/Proxy/deploy_mitm_proxy.py
@@ -5,6 +5,7 @@

import MITMProxy
from libmproxy import proxy
from libmproxy.proxy.server import ProxyServer


def init_proxy(db_socket_address, crawl_id):
@@ -22,8 +23,8 @@ def init_proxy(db_socket_address, crawl_id):
proxy_port = sock.getsockname()[1]
sock.close()

config = proxy.ProxyConfig(cacert=os.path.join(os.path.dirname(__file__), 'mitmproxy.pem'),)
server = proxy.ProxyServer(config, proxy_port)
config = proxy.ProxyConfig(cadir=os.path.join(os.path.dirname(__file__), 'cert'),port=proxy_port)
server = ProxyServer(config)
print 'Intercepting Proxy listening on ' + str(proxy_port)
m = MITMProxy.InterceptingMaster(server, crawl_id, proxy_site_queue, db_socket_address)
thread = threading.Thread(target=m.run, args=())
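
For reference, a minimal sketch of the mitmproxy 0.11 startup calls used above: the listening port moves onto ProxyConfig (together with a certificate directory, cadir, in place of the old single cacert file), and ProxyServer now takes only the config. This assumes mitmproxy 0.11 is installed; the './cert' path is an assumption for illustration.

# Sketch: pick a free ephemeral port, then start a libmproxy 0.11 server,
# mirroring the calls in the diff above.
import socket
from libmproxy import proxy
from libmproxy.proxy.server import ProxyServer

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(('', 0))                   # ask the OS for any free port
proxy_port = sock.getsockname()[1]
sock.close()

config = proxy.ProxyConfig(cadir='./cert', port=proxy_port)
server = ProxyServer(config)         # 0.11: the server takes only the config
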
14 changes: 7 additions & 7 deletions automation/Proxy/mitm_commands.py
@@ -5,19 +5,19 @@

def process_general_mitm_request(db_socket, crawl_id, top_url, msg):
""" Logs a HTTP request object """
referrer = msg.headers['referer'][0] if len(msg.headers['referer']) > 0 else ''

data = (crawl_id, msg.get_url(), msg.method, referrer, str(msg.headers), top_url, str(datetime.datetime.now()))
referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else ''
data = (crawl_id, msg.request.url, msg.request.method, referrer, str(msg.request.headers), top_url, str(datetime.datetime.now()))
db_socket.send(("INSERT INTO http_requests (crawl_id, url, method, referrer, headers, "
"top_url, time_stamp) VALUES (?,?,?,?,?,?,?)", data))


def process_general_mitm_response(db_socket, crawl_id, top_url, msg):
""" Logs a HTTP response object and, if necessary, """
referrer = msg.headers['referer'][0] if len(msg.headers['referer']) > 0 else ''
location = msg.headers['location'][0] if len(msg.headers['location']) > 0 else ''

data = (crawl_id, msg.request.get_url(), msg.request.method, referrer, msg.code, msg.msg, str(msg.headers),
referrer = msg.request.headers['referer'][0] if len(msg.request.headers['referer']) > 0 else ''
location = msg.response.headers['location'][0] if len(msg.response.headers['location']) > 0 else ''
data = (crawl_id, msg.request.url, msg.request.method, referrer, msg.response.code, msg.response.msg, str(msg.response.headers),
location, top_url, str(datetime.datetime.now()))
db_socket.send(("INSERT INTO http_responses (crawl_id, url, method, referrer, response_status, "
"response_status_text, headers, location, top_url, time_stamp) VALUES (?,?,?,?,?,?,?,?,?,?)", data))
26 changes: 13 additions & 13 deletions automation/TaskManager.py
@@ -158,7 +158,7 @@ def close(self):

# CRAWLER COMMAND CODE

def distribute_command(self, command, index=None, timeout=None):
def distribute_command(self, command, index=None, timeout=None, reset=False):
"""
parses command type and issues command(s) to the proper browser
<index> specifies the type of command this is:
@@ -173,7 +173,7 @@ def distribute_command(self, command, index=None, timeout=None):
while True:
for browser in self.browsers:
if browser.ready():
self.start_thread(browser, command, timeout)
self.start_thread(browser, command, timeout, reset)
command_executed = True
break
if command_executed:
@@ -184,7 +184,7 @@ def distribute_command(self, command, index=None, timeout=None):
#send the command to this specific browser
while True:
if self.browsers[index].ready():
self.start_thread(self.browsers[index], command, timeout)
self.start_thread(self.browsers[index], command, timeout, reset)
break
time.sleep(SLEEP_CONS)
elif index == '*':
@@ -193,7 +193,7 @@ def distribute_command(self, command, index=None, timeout=None):
while False in command_executed:
for i in xrange(len(self.browsers)):
if self.browsers[i].ready() and not command_executed[i]:
self.start_thread(self.browsers[i], command, timeout)
self.start_thread(self.browsers[i], command, timeout, reset)
command_executed[i] = True
time.sleep(SLEEP_CONS)
elif index == '**':
@@ -203,24 +203,23 @@ def distribute_command(self, command, index=None, timeout=None):
while False in command_executed:
for i in xrange(len(self.browsers)):
if self.browsers[i].ready() and not command_executed[i]:
self.start_thread(self.browsers[i], command, timeout, condition)
self.start_thread(self.browsers[i], command, timeout, reset, condition)
command_executed[i] = True
time.sleep(SLEEP_CONS)
with condition:
condition.notifyAll() # All browsers loaded, tell them to start
else:
#not a supported command
print "Command index type is not supported or out of range"

def start_thread(self, browser, command, timeout, condition=None):
def start_thread(self, browser, command, timeout, reset, condition=None):
""" starts the command execution thread """
args = (browser, command, timeout, condition)
args = (browser, command, timeout, reset, condition)
thread = threading.Thread(target=self.issue_command, args=args)
browser.command_thread = thread
thread.daemon = True
thread.start()

def issue_command(self, browser, command, timeout=None, condition=None):
def issue_command(self, browser, command, timeout, reset, condition=None):
"""
sends command tuple to the BrowserManager
<timeout> gives the option to override default timeout
@@ -241,7 +240,6 @@ def issue_command(self, browser, command, timeout=None, condition=None):
try:
status = browser.status_queue.get(True, timeout)
if status == "OK":
#print str(browser.crawl_id) + " " + "got OK"
command_succeeded = True
else:
print("Received failure status while executing command: " + command[0])
@@ -253,14 +251,16 @@ def issue_command(self, browser, command, timeout=None, condition=None):
" VALUES (?,?,?,?)",
(browser.crawl_id, command[0], command_arguments, command_succeeded)))

if not command_succeeded:
if reset:
browser.reset()
elif not command_succeeded:
browser.restart_browser_manager()

# DEFINITIONS OF HIGH LEVEL COMMANDS

def get(self, url, index=None, overwrite_timeout=None):
def get(self, url, index=None, overwrite_timeout=None, reset=False):
""" goes to a url """
self.distribute_command(('GET', url), index, overwrite_timeout)
self.distribute_command(('GET', url), index, overwrite_timeout, reset)

def dump_storage_vectors(self, url, start_time, index=None, overwrite_timeout=None):
""" dumps the local storage vectors (flash, localStorage, cookies) to db """
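
The new reset flag is threaded from get() through distribute_command(), start_thread(), and issue_command(); once a command finishes, reset takes priority over the failure-triggered restart. A self-contained sketch of that final branch follows; the Browser class below is a stand-in for illustration, not the project's BrowserManager.

# Standalone sketch of the post-command branch added to issue_command().
class Browser(object):
    def reset(self):
        print('reset requested')
    def restart_browser_manager(self):
        print('restart after failed command')

def finish_command(browser, command_succeeded, reset):
    if reset:
        browser.reset()                    # reset whenever the caller asked for it
    elif not command_succeeded:
        browser.restart_browser_manager()  # otherwise restart only on failure

finish_command(Browser(), command_succeeded=True, reset=True)    # -> reset requested
finish_command(Browser(), command_succeeded=False, reset=False)  # -> restart after failed command
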
100 changes: 0 additions & 100 deletions run_simple_crawl.py

This file was deleted.

5 changes: 0 additions & 5 deletions test_sites.txt

This file was deleted.
