diff --git a/classes2/__init__.py b/classes2/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/classes2/discovery.py b/classes2/discovery.py
new file mode 100644
index 0000000..db6725b
--- /dev/null
+++ b/classes2/discovery.py
@@ -0,0 +1,742 @@
+import re
+import socket
+import urlparse
+from collections import Counter, defaultdict
+
+from HTMLParser import HTMLParser
+
+
+class DiscoverAllCMS(object):
+    # match all fingerprints against all responses
+    # this might generate false positives
+
+    def __init__(self, data):
+        self.cache = data['cache']
+        self.results = data['results']
+        self.matcher = data['matcher']
+        self.fps = data['fingerprints']
+        self.printer = data['printer']
+
+        # only used for pretty printing of debugging info
+        self.tmp_set = set()
+
+    def run(self):
+        self.printer.print_debug_line('Checking for more matches in cache (option -a) ...', 1)
+
+        # find matches for all the responses in the cache
+        for fp_category in ['cms', 'platform']:
+            for fp_type in self.fps.data[fp_category]:
+                fps = self.fps.data[fp_category][fp_type]['fps']
+
+                for response in self.cache.get_responses():
+                    matches = self.matcher.get_result(fps, response)
+                    for fp in matches:
+                        self.results.add(fp_category, fp['name'], fp['output'], fp)
+
+                        if (fp['name'], fp['output']) not in self.tmp_set:
+                            self.printer.print_debug_line('- Found match: %s %s' % (fp['name'], fp['output']), 2)
+
+                        self.tmp_set.add((fp['name'], fp['output']))
+
+
+class DiscoverCMS(object):
+
+    def __init__(self, options, data):
+        self.printer = data['printer']
+        self.matcher = data['matcher']
+        self.requester = data['requester']
+        self.result = data['results']
+
+        self.batch_size = options['batch_size']
+        self.num_cms_to_find = options['stop_after']
+        self.find_all_cms = options['run_all']
+
+        # only used for pretty printing of debugging info
+        self.tmp_set = set()
+
+        self.queue = defaultdict(list)
+        for fp_type in data['fingerprints'].data['cms']:
+            for fp in data['fingerprints'].data['cms'][fp_type]['fps']:
+                self.queue[fp['url']].append(fp)
+
+    def get_queue(self, cms=None):
+        queue = []
+        if cms is None:
+            for i in range(self.batch_size):
+                try:
+                    url, fp_list = self.queue.popitem()
+                    queue.append(fp_list)
+                except KeyError:
+                    break
+        else:
+            # the following procedure is *not* optimal:
+            # the self.queue dict is completely destroyed and
+            # rebuilt each time this procedure is called :(
+
+            # create a temp queue dict
+            tmp_queue = defaultdict(list)
+
+            # remove elements from the dict until it is empty
+            while len(self.queue) > 0:
+                url, fp_list = self.queue.popitem()
+
+                # remove all the elements of a queue entry's list
+                # one-by-one and check if the fingerprints
+                # belong to the specified 'cms'
+                tmp_list = []
+                out_list = []
+
+                while len(fp_list) > 0:
+                    # remove the fingerprint
+                    fp = fp_list.pop()
+
+                    # if the fingerprint matches the cms, add it to the
+                    # out_list for the current url,
+                    # otherwise add it to the tmp_list
+                    if fp['name'] == cms:
+                        out_list.append(fp)
+                    else:
+                        tmp_list.append(fp)
+
+                # if there are elements in tmp_list (the new list of fps
+                # that do *not* match the 'cms'), add it to the tmp_queue's
+                # entry for the current url
+                if len(tmp_list) > 0:
+                    tmp_queue[url].extend(tmp_list)
+
+                # if matches for the specified cms have been found, add the list
+                # to the fingerprint queue for the requester
+                if len(out_list) > 0:
+                    queue.append(out_list)
+
+            # replace the queue with the tmp queue
+            self.queue = tmp_queue
+
+        return queue
+
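+    # run() works through the queue in batches of 'batch_size' until
+    # 'stop_after' CMS matches have been found (or the queue is empty);
+    # every CMS hit triggers a second pass with that CMS's version fingerprints.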
+    def run(self):
+        batch_no = 0
+        self.printer.print_debug_line('Determining CMS type ...', 1)
+
+        detected_cms = []
+        stop_searching = len(detected_cms) >= self.num_cms_to_find
+
+        while (not stop_searching or self.find_all_cms) and (not len(self.queue) == 0):
+            self.printer.print_debug_line('Checking fingerprint group no. %s ...' % (batch_no, ), 3)
+
+            # set the requester queue
+            results = self.requester.run('CMS', self.get_queue())
+
+            # search for CMS matches
+            cms_matches = []
+            while not results.empty():
+                fingerprints, response = results.get()
+
+                for fp in self.matcher.get_result(fingerprints, response):
+                    self.result.add('cms', fp['name'], fp['output'], fp)
+                    cms_matches.append(fp['name'])
+
+            # search for the found CMS versions
+            for cms in cms_matches:
+
+                # skip checking the cms if it has already been detected
+                if cms in detected_cms: continue
+
+                if cms not in self.tmp_set:
+                    self.tmp_set.add(cms)
+                    self.printer.print_debug_line('- Found CMS match: %s' % (cms, ), 2)
+
+                # set the requester queue with only fingerprints for the cms
+                results = self.requester.run('CMS_version', self.get_queue(cms))
+
+                # find the results
+                self.printer.print_debug_line('Determining CMS version ...', 1)
+                while results.qsize() > 0:
+                    res_fps, response = results.get()
+                    for fp in self.matcher.get_result(res_fps, response):
+                        self.result.add('cms', fp['name'], fp['output'], fp)
+
+                        if (fp['name'], fp['output']) not in self.tmp_set:
+                            self.tmp_set.add((fp['name'], fp['output']))
+                            self.printer.print_debug_line('- Found version: %s %s' % (fp['name'], fp['output']), 2)
+
+                # update the stop criteria
+                detected_cms.append(cms)
+
+            stop_searching = (len(detected_cms) >= self.num_cms_to_find) or len(self.queue) == 0
+            batch_no += 1
+
+
+class DiscoverCookies(object):
+
+    def __init__(self, data):
+        self.data = data
+        self.printer = data['printer']
+
+    def run(self):
+        self.printer.print_debug_line('Checking for cookies ...', 1)
+
+        cookies = set()
+        for r in self.data['cache'].get_responses():
+            try:
+                c = r.headers['set-cookie'].strip().split('=')[0]
+                if c not in cookies:
+                    self.printer.print_debug_line('- Found cookie: %s' % (c,), 2)
+
+                cookies.add(c)
+
+            except:
+                pass
+
+        self.data['results'].site_info['cookies'] = cookies
+
+
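+# Tries to resolve each subdomain from the 'subdomains' fingerprint list
+# against the target's domain and returns the (name, IP) pairs that resolve.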
+class DiscoverSubdomains:
+
+    def __init__(self, url, data):
+        self.results = data['results']
+        self.subdomains = data['fingerprints'].data['subdomains']['fps']
+        self.url = url
+
+    def run(self):
+        domain = urlparse.urlparse(self.url).netloc
+        domain = domain.split(':')[0]
+
+        valid = set()
+        for subdomain in self.subdomains:
+            d = subdomain + '.' + domain
+            try:
+                valid.add((d, socket.gethostbyname(d)))
+            except:
+                continue
+
+        return valid
+
+
+class DiscoverErrorPage:
+    # find error pages on the site
+    # the requester has a built-in list of items and patterns
+    # to remove before calculating a checksum of pages that
+    # should not exist
+
+    def __init__(self, options, data):
+        self.host = options['url']
+        self.fps = data['fingerprints'].data['error_pages']['fps']
+        self.requester = data['requester']
+        self.printer = data['printer']
+
+    def run(self):
+        self.requester.find_404s = True
+
+        self.printer.print_debug_line('Error page detection ...', 1)
+
+        queue = [[fp] for fp in self.fps]
+        results = self.requester.run('ErrorPages', queue)
+
+        error_pages = set()
+        while results.qsize() > 0:
+            fp, response = results.get()
+            if response is not None:
+                error_pages.add(response.md5_404)
+                error_pages.add(response.md5_404_text)
+                self.printer.print_debug_line('- Error page fingerprint: %s, %s - %s' % (response.md5_404, response.md5_404_text, fp[0]['url']), 2)
+
+        self.requester.find_404s = False
+
+        return error_pages
+
+
+class DiscoverInteresting(object):
+    def __init__(self, options, data):
+        self.url = options['url']
+        self.printer = data['printer']
+        self.requester = data['requester']
+        self.matcher = data['matcher']
+        self.result = data['results']
+        self.error_pages = data['error_pages']
+        self.cache = data['cache']
+        self.category = "interesting"
+
+        # add the fingerprints to the queue, ensuring that
+        # all fps with the same url are collected in a list
+        self.queue = defaultdict(list)
+        for fp in data['fingerprints'].data['interesting']['fps']:
+            self.queue[fp['url']].append(fp)
+
+    def run(self):
+        self.printer.print_debug_line('Detecting interesting files ...', 1)
+
+        # process the results
+        results = self.requester.run('Interesting', list(self.queue.values()))
+
+        while results.qsize() > 0:
+            fps, response = results.get()
+
+            # if the response includes a 404 md5, check if the response
+            # is a redirection to a known error page
+            # this is a fix for https://github.com/jekyc/wig/issues/7
+            if response is not None:
+                redirected = response.md5_404 in self.error_pages
+                redirected = redirected or (response.md5_404_text in self.error_pages)
+                redirected = redirected or (response.md5_404_text == self.cache[self.url].md5_404_text)
+
+                # if it is an error page, skip it
+                if redirected: continue
+
+            # if the response does not have a 404 md5, something must have
+            # gone wrong - skip checking the page
+            else:
+                continue
+
+            for fp in self.matcher.get_result(fps, response):
+                self.result.add(self.category, None, None, fp, weight=1)
+                try:
+                    self.printer.print_debug_line('- Found file: %s (%s)' % (fp['url'], fp['note']), 2)
+                except:
+                    pass
+
+
+class DiscoverIP(object):
+
+    def __init__(self, path):
+        self.path = path
+
+    def run(self):
+        try:
+            hostname = self.path.split('//')[1]
+            hostname = hostname.split('/')[0]
+            ip = socket.gethostbyname(hostname)
+        except Exception as e:
+            #print(e)
+            ip = 'Unknown'
+
+        return ip
+
+
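+# Matches the JavaScript fingerprints against cached responses that look
+# like JavaScript (content-type header or a '.js' extension).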
+class DiscoverJavaScript(object):
+    def __init__(self, options, data):
+        self.printer = data['printer']
+        self.cache = data['cache']
+        self.matcher = data['matcher']
+        self.result = data['results']
+
+        self.fingerprints = []
+        for fp_type in data['fingerprints'].data['js']:
+            self.fingerprints.extend(data['fingerprints'].data['js'][fp_type]['fps'])
+
+    def run(self):
+        self.printer.print_debug_line('Detecting Javascript ...', 1)
+        for response in self.cache.get_responses():
+
+            # match only if the response is JavaScript:
+            # check the content type
+            content_type = response.headers['content-type'] if 'content-type' in response.headers else ''
+            # and the extension
+            is_js = 'javascript' in content_type or '.js' in response.url.split('.')[-1]
+
+            # if the response is JavaScript, try to match it to the known fingerprints
+            if is_js:
+                matches = self.matcher.get_result(self.fingerprints, response)
+                for fp in matches:
+                    self.result.add('js', fp['name'], fp['output'], fingerprint=fp, weight=1)
+
+                    self.printer.print_debug_line('- Found JavaScript: %s %s' % (fp['name'], fp['output']), 2)
+
+
+# Used by the DiscoverMore crawler
+class LinkExtractor(HTMLParser):
+    def __init__(self, strict):
+        HTMLParser.__init__(self)
+        self.results = set()
+
+    def get_results(self):
+        return self.results
+
+    def handle_starttag(self, tag, attrs):
+        try:
+            url = ''
+            if tag == 'script' or tag == 'img':
+                for attr in attrs:
+                    if attr[0] == 'src':
+                        self.results.add(attr[1])
+            if tag == 'link':
+                for attr in attrs:
+                    if attr[0] == 'href':
+                        self.results.add(attr[1])
+        except:
+            pass
+
+
+class DiscoverMore(object):
+
+    def __init__(self, options, data):
+        self.host = options['url']
+        self.threads = options['threads']
+        self.printer = data['printer']
+        self.cache = data['cache']
+        self.result = data['results']
+        self.matcher = data['matcher']
+        self.requester = data['requester']
+        self.fingerprints = data['fingerprints']
+
+    def _get_urls(self, response):
+        # only get urls from elements that use 'src' to avoid
+        # fetching resources provided by <a>-tags, as this could
+        # lead to the crawling of the whole application
+        regexes = ['src="(.+?)"', "src='(.+?)'"]
+
+        urls = set()
+        for regex in regexes:
+            for match in re.findall(regex, response.body):
+                urls.add(match)
+
+        return urls
+
+    def run(self):
+        self.printer.print_debug_line('Detecting links ...', 1)
+        resources = set()
+        parser = LinkExtractor(strict=False)
+
+        for req in self.cache.get_responses():
+            # skip pages that do not set 'content-type'
+            # these might be binaries
+            if not 'content-type' in req.headers:
+                continue
+
+            # skip responses that have been discovered
+            # with 'DiscoverMore'
+            if req.crawled_response:
+                continue
+
+            # only scrape pages that can contain links/references
+            if 'text/html' in req.headers['content-type']:
+                tmp = self._get_urls(req)
+
+                parser.feed(req.body)
+                tmp = tmp.union(parser.get_results())
+
+                for i in tmp:
+                    url_data = urlparse.urlparse(i)
+
+                    # skip data urls
+                    if url_data.path.startswith('data:'): continue
+
+                    resources.add(i)
+
+        # the items in the resource set should mimic a list of fingerprints:
+        # a fingerprint is a dict with at least a 'url' key
+        self.printer.print_debug_line('- Discovered %s new resources' % (len(resources), ), 2)
+
+        # prepare the urls
+        queue = defaultdict(list)
+        for url in resources:
+            queue[url].append({'url': url})
+
+        # fetch'em
+        results = self.requester.run('DiscoverMore', list(queue.values()))
+
+
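+# Guesses the operating system by mapping detected package/version pairs
+# (mainly from the 'Server' header) to OS fingerprints; packages tied to an
+# OS family already seen in the headers are weighted 100 times higher.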
+class DiscoverOS:
+    def __init__(self, options, data):
+        self.printer = data['printer']
+        self.cache = data['cache']
+        self.results = data['results']
+        self.fingerprints = data['fingerprints'].data['os']['fps']
+
+        self.os = Counter()
+        self.os_family_list = Counter()
+        self.matched_packages = set()
+
+    def search_and_prioritize_os(self, pkg_name, pkg_version):
+        for fp in self.fingerprints:
+            if fp['pkg_name'] == pkg_name and fp['pkg_version'] == pkg_version:
+                weight = 1 if not 'weight' in fp else fp['weight']
+
+                if not type(fp['os_version']) == type([]):
+                    fp['os_version'] = [fp['os_version']]
+
+                for os_version in fp['os_version']:
+                    if fp['os_name'].lower() in self.os_family_list:
+                        self.printer.print_debug_line('- Prioritizing fingerprints for OS: %s' % (fp['os_name'], ), 7)
+                        self.os[(fp['os_name'], os_version)] += weight * 100
+                    else:
+                        self.os[(fp['os_name'], os_version)] += weight
+
+    def find_match_in_headers(self, response):
+        headers = response.headers
+        if 'server' in headers:
+            line = headers['server']
+
+            if "(" in line:
+                os = line[line.find('(')+1:line.find(')')]
+
+                # hack for RHEL
+                if os == 'Red Hat':
+                    os = 'Red Hat Enterprise Linux'
+
+                line = line[:line.find('(')-1] + line[line.find(')')+1:]
+            else:
+                os = None
+
+            if os is not None:
+                self.os_family_list[os.lower()] += 1
+
+            for part in line.split(" "):
+                try:
+                    pkg, version = list(map(str.lower, part.split('/')))
+                    self.search_and_prioritize_os(pkg, version)
+
+                except Exception as e:
+                    continue
+
+    def find_match_in_results(self):
+        platforms = self.results.scores['platform']
+
+        for pkg in platforms:
+            for version in platforms[pkg]:
+                # hack for asp.net
+                if pkg == 'ASP.NET':
+                    version = version[:3] if not version.startswith("4.5") else version[:5]
+
+                self.search_and_prioritize_os(pkg, version)
+
+    def finalize(self):
+        # add OS to results: self.os: {(os, version): weight, ...}
+        results = []
+        for p in self.os:
+            results.append({'version': p[1], 'os': p[0], 'count': self.os[p]})
+
+        if len(results) == 0: return
+
+        prio = sorted(results, key=lambda x: x['count'], reverse=True)
+        max_count = prio[0]['count']
+        for i in prio:
+            if i['count'] == max_count:
+                self.results.add('os', i['os'], i['version'], weight=i['count'])
+                self.printer.print_debug_line('- Found OS: %s %s' % (i['os'], i['version']), 2)
+            else:
+                break
+
+    def run(self):
+        self.printer.print_debug_line('Detecting OS ...', 1)
+        headers = set()
+        responses = self.cache.get_responses()
+
+        # find matches in the headers
+        for response in responses:
+            self.find_match_in_headers(response)
+
+        # find matches in the current results
+        self.find_match_in_results()
+
+        # do some housekeeping
+        self.finalize()
+
+
+class DiscoverPlatform:
+
+    def __init__(self, options, data):
+        self.printer = data['printer']
+        self.requester = data['requester']
+        self.matcher = data['matcher']
+        self.result = data['results']
+
+        self.threads = options['threads']
+        self.batch_size = options['batch_size']
+
+        self.queue = defaultdict(list)
+        for fp_type in data['fingerprints'].data['platform']:
+            for fp in data['fingerprints'].data['platform'][fp_type]['fps']:
+                self.queue[fp['url']].append(fp)
+
+        # only used for pretty printing of debugging info
+        self.tmp_set = set()
+
+    def run(self):
+        self.printer.print_debug_line('Detecting platform ...', 1)
+
+        while len(self.queue) > 0:
+            queue = []
+            for i in range(self.batch_size):
+                try:
+                    url, fp_list = self.queue.popitem()
+                    queue.append(fp_list)
+                except KeyError:
+                    break
+
+            results = self.requester.run('Platform', queue)
+
+            # search for platform matches
+            while not results.empty():
+                fingerprints, response = results.get()
+                matches = self.matcher.get_result(fingerprints, response)
+                for fp in matches:
+                    self.result.add('platform', fp['name'], fp['output'], fp)
+
+                    if (fp['name'], fp['output']) not in self.tmp_set:
+                        self.printer.print_debug_line('- Found platform %s %s' % (fp['name'], fp['output']), 2)
+
+                    self.tmp_set.add((fp['name'], fp['output']))
+
+
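+# Fetches the front page and extracts the contents of its <title> tag.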
+class DiscoverTitle:
+
+    def __init__(self, options, data):
+        self.data = data
+        self.url = options['url']
+        self.printer = data['printer']
+
+    def run(self):
+        self.printer.print_debug_line('Getting title ...', 1)
+
+        r = self.data['requester'].run('Title', [[{'url': '/'}]])
+
+        front_page = self.data['cache'][self.url]
+
+        try:
+            title = re.findall('<title>\s*(.*)\s*</title>', front_page.body)[0]
+            title = title.strip()
+        except:
+            title = ''
+
+        try:
+            self.printer.print_debug_line('- Found title: %s' % (title, ), 2)
+        except:
+            pass
+
+        return title
+
+
+class DiscoverTools:
+    def __init__(self, data):
+        self.fps = data['fingerprints']
+        self.results = data['results']
+        self.printer = data['printer']
+
+    def run(self):
+        self.printer.print_debug_line('Searching for tools ...', 1)
+        cms_results = self.results.get_versions()
+
+        # loop over the cms' in the results
+        for cms, _ in cms_results:
+            # loop over all the translations
+            for fn in self.fps.translator:
+                # check if the translated name is the same as the cms
+                if self.fps.translator[fn]['name'] == cms and 'tool' in self.fps.translator[fn]:
+                    for tool in self.fps.translator[fn]['tool']:
+                        self.results.add_tool(cms, tool['name'], tool['link'])
+                        self.printer.print_debug_line('- Found tool: %s (%s)' % (tool['name'], tool['link']), 2)
+
+
+class DiscoverUrlLess(object):
+    def __init__(self, options, data):
+        self.printer = data['printer']
+        self.cache = data['cache']
+        self.results = data['results']
+        self.matcher = data['matcher']
+        self.fingerprints = data['fingerprints']
+
+    def run(self):
+        self.printer.print_debug_line('Matching url-less fingerprints ...', 1)
+
+        # only used for pretty printing of debugging info
+        tmp_set = set()
+
+        for fp_category in ['cms', 'platform']:
+            for fp_type in self.fingerprints.data[fp_category]:
+                fps = self.fingerprints.data[fp_category][fp_type]['fps']
+                fps = [fp for fp in fps if fp['url'] == '']
+
+                # find matches for all the responses in the cache
+                for response in self.cache.get_responses():
+                    matches = self.matcher.get_result(fps, response)
+                    for fp in matches:
+
+                        url_data = urlparse.urlparse(response.get_url())
+                        fp['url'] = url_data.path
+
+                        show_all_detections = True
+                        if 'show_all_detections' in fp:
+                            show_all_detections = fp['show_all_detections']
+
+                        if (fp['name'], fp['output']) in tmp_set:
+                            if show_all_detections:
+                                self.results.add(fp_category, fp['name'], fp['output'], fingerprint=fp, weight=1)
+
+                        else:
+                            self.printer.print_debug_line('- Found fingerprint: %s %s' % (fp['name'], fp['output']), 2)
+                            self.results.add(fp_category, fp['name'], fp['output'], fingerprint=fp, weight=1)
+
+                        tmp_set.add((fp['name'], fp['output']))
+
+
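+# Looks up the number of known vulnerabilities for each detected CMS
+# version and adds it to the results.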
+class DiscoverVulnerabilities:
+    def __init__(self, data):
+        self.printer = data['printer']
+        self.results = data['results']
+        self.fps = []
+
+        vuln_sources = data['fingerprints'].data['vulnerabilities']
+
+        for source in vuln_sources:
+            self.fps.extend(data['fingerprints'].data['vulnerabilities'][source]['fps'])
+
+    def run(self):
+        self.printer.print_debug_line('Searching for vulnerabilities ...', 1)
+
+        cms_results = self.results.get_versions()
+
+        vendors = Counter()
+        for r in cms_results: vendors[r[0]] += 1
+
+        # if there are more than 5 results for a CMS,
+        # skip displaying the vulnerability count, as the
+        # results are unreliable
+        for cms, version in cms_results:
+            if vendors[cms] > 5: continue
+
+            try:
+                for fp in self.fps:
+                    if fp['name'] == cms and fp['version'] == version:
+                        self.results.add_vulnerabilities(cms, version, fp['num_vulns'], fp['link'])
+                        self.printer.print_debug_line('- Found vulnerability: %s %s: %s' % (cms, version, fp['num_vulns']), 2)
+
+            except Exception as e:
+                print(e)
+                pass
diff --git a/classes2/request2.py b/classes2/request2.py
new file mode 100644
index 0000000..65ec621
--- /dev/null
+++ b/classes2/request2.py
@@ -0,0 +1,334 @@
+import sys
+try:
+    import concurrent.futures
+except:
+    print 'Install futures: sudo pip install futures'
+    sys.exit(1)
+import hashlib
+import re
+import string
+import random
+import urlparse
+import urllib2
+from HTMLParser import HTMLParser
+
+
+class HTMLStripper(HTMLParser):
+    def __init__(self):
+        self.reset()
+        self.strict = False
+        self.convert_charrefs = True
+        self.tagtext = []
+
+    def handle_data(self, d):
+        self.tagtext.append(d)
+
+    def get_tagtext(self):
+        return ''.join(self.tagtext)
+
+
+def _clean_page(page):
+    # this is the same method nmap's http.lua uses for error page detection
+    # nselib/http.lua: clean_404
+    # remove information from the page that might not be static
+
+    # time
+    page = re.sub(b'(\d?\d:?){2,3}', b'', page)
+    page = re.sub(b'AM', b'', page, flags=re.IGNORECASE)
+    page = re.sub(b'PM', b'', page, flags=re.IGNORECASE)
+    page = re.sub(b'(\d){13}', b'', page)  # timestamp
+
+    # date with 4 digit year
+    page = re.sub(b'(\d){8}', b'', page)
+    page = re.sub(b'\d{4}-\d{2}-\d{2}', b'', page)
+    page = re.sub(b'\d{4}/\d{2}/\d{2}', b'', page)
+    page = re.sub(b'\d{2}-\d{2}-\d{4}', b'', page)
+    page = re.sub(b'\d{2}/\d{2}/\d{4}', b'', page)
+
+    # date with 2 digit year
+    page = re.sub(b'(\d){6}', b'', page)
+    page = re.sub(b'\d{2}-\d{2}-\d{2}', b'', page)
+    page = re.sub(b'\d{2}/\d{2}/\d{2}', b'', page)
+
+    # links and paths
+    page = re.sub(b'/[^ ]+', b'', page)
+    page = re.sub(b'[a-zA-Z]:\\\\[^ ]+', b'', page)
+
+    # return the fingerprint of the stripped page
+    return hashlib.md5(page).hexdigest().lower()
+
+
+def _create_response(response):
+    R = Response()
+
+    url = response.geturl()
+    response_info = urlparse.urlparse(url)
+    body = response.read()
+
+    # get the page text only
+    parser = HTMLStripper()
+    parser.feed(body.decode('utf-8', 'ignore'))
+    page_text = parser.get_tagtext()
+
+    R.set_body(body)
+    R.protocol = response_info.scheme
+    R.host = response_info.netloc
+    R.url = url
+    R.status = {'code': response.code, 'text': response.msg}
+    R.headers = {pair[0].lower(): pair[1] for pair in response.headers.items()}
+    R.md5 = hashlib.md5(body).hexdigest().lower()
+    R.md5_404 = _clean_page(body)
+    R.md5_404_text = _clean_page(page_text.encode('utf-8', 'ignore'))
+
+    return(R)
+
+
+#######################################################################
+#
+# Override urllib2 handler classes
+#
+#######################################################################
+
+class OutOfScopeException(Exception):
+    def __init__(self, org_url, new_url):
+        self.original_netloc = org_url.netloc
+        self.new_netloc = new_url.netloc
+
+    def __str__(self):
+        return repr("%s is not in scope %s" % (self.new_netloc, self.original_netloc))
+
+
+class UnknownHostName(Exception):
+    def __init__(self, url):
+        self.url = url
+
+    def __str__(self):
+        return "Unknown host: %s" % (self.url,)
+
+
+class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
+    def http_error_default(self, req, fp, code, msg, hdrs):
+        return(fp)
+
+
+class RedirectHandler(urllib2.HTTPRedirectHandler):
+    """
+    This currently only checks whether the redirection netloc is
+    the same as the netloc of the original request.
+
+    NOTE: this is very strict, as it will not allow redirections
+    from 'example.com' to 'www.example.com'
+    """
+
+    def http_error_302(self, req, fp, code, msg, headers):
+        if 'location' in headers:
+            org_url = urlparse.urlparse(req.get_full_url())
+            new_url = urlparse.urlparse(headers['location'])
+
+            # if the location starts with '/' the path is relative
+            if headers['location'].startswith('/'):
+                new_url = new_url._replace(scheme=org_url.scheme, netloc=org_url.netloc)
+
+            if not new_url.netloc == org_url.netloc:
+                raise OutOfScopeException(org_url, new_url)
+
+        # call python's built-in redirection handler
+        return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
+
+    http_error_301 = http_error_303 = http_error_307 = http_error_302
+
+
+#######################################################################
+#
+# Custom request and response classes
+#
+#######################################################################
+
+class Response:
+    """
+    This object is used to store response information.
+
+    The normal http.client.HTTPResponse cannot be pickled,
+    which is needed by the caching process.
+    """
+
+    def __init__(self):
+        self.url = ''
+        self.protocol = ''
+        self.host = ''
+        self.status = {}
+        self.headers = {}
+        self.body = ''
+
+        self.md5 = None
+        self.md5_404 = None
+        self.should_be_error_page = False
+
+        self.crawled_response = False
+
+        chars = string.ascii_uppercase + string.digits
+        self.id = ''.join(random.choice(chars) for _ in range(16))
+
+    def get_url(self):
+        url_data = urlparse.urlparse(self.url)
+
+        if url_data.scheme == '': url_data = url_data._replace(scheme=self.protocol)
+        if url_data.netloc == '': url_data = url_data._replace(netloc=self.host)
+
+        return url_data.geturl()
+
+    def set_body(self, body):
+        # check if the encoding is specified in the http header
+        content_type = 'Content-Type'.lower()
+
+        if content_type not in self.headers:
+            self.body = str(body).decode(errors='replace')
+
+        else:
+            # find content-type definitions
+            content_types = {'text': False, 'charset': None}
+
+            for item in self.headers[content_type].split(';'):
+                if 'text' in item:
+                    content_types['text'] = True
+
+                if 'charset' in item:
+                    content_types['charset'] = item.split('=')[1]
+
+            # set the encoding to use
+            if content_types['charset'] is not None:
+                self.body = str(body).decode(content_types['charset'], errors='replace')
+            elif content_types['text']:
+                self.body = str(body).decode('ISO-8859-1', errors='replace')
+            else:
+                self.body = str(body).decode(errors='replace')
+
+    def __repr__(self):
+        def get_string(r):
+            string = r.url + '\n'
+            string += '%s %s\n' % (r.status['code'], r.status['text'])
+            string += '\n'.join([header + ': ' + r.headers[header] for header in r.headers])
+            string += '\n\n'
+            string += 'MD5: ' + self.md5 + '\n'
+            string += 'MD5 Error page: ' + self.md5_404 + '\n'
+            return string
+
+        return get_string(self)
+
+
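+# Fetches fingerprint URLs through a thread pool, caches every response,
+# and detects redirections away from the target host.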
+class Requester:
+    def __init__(self, options, data):
+        self.threads = options['threads']
+        self.proxy = options['proxy']
+        self.user_agent = options['user_agent']
+
+        self.data = data
+        self.cache = data['cache']
+        self.requested = data['requested']
+        self.printer = data['printer']
+
+        self.is_redirected = False
+        self.find_404s = False
+        self.fingerprintQueue = None
+
+        self.url_data = urlparse.urlparse(options['url'])
+        if options['prefix']:
+            self.url_data = self.url_data._replace(path=options['prefix'] + self.url_data.path)
+        self.url = urlparse.urlunparse(self.url_data)
+
+    def _create_fetcher(self, redirect_handler=True):
+        args = [ErrorHandler]
+        if self.proxy is None:
+            args.append(urllib2.ProxyHandler({}))
+        elif self.proxy is not False:
+            protocol = self.url_data.scheme
+            args.append(urllib2.ProxyHandler({protocol: self.proxy}))
+
+        if redirect_handler:
+            args.append(RedirectHandler)
+
+        opener = urllib2.build_opener(*args)
+        opener.addheaders = [('User-agent', self.user_agent)]
+        return opener
+
+    def detect_redirect(self):
+        parse = urlparse.urlparse
+
+        # the original url
+        org_url = self.url_data
+
+        # get an opener that follows redirections
+        try:
+            opener = self._create_fetcher(redirect_handler=False)
+            response = opener.open(self.url)
+        except:
+            raise UnknownHostName(self.url)
+
+        # the new url
+        new_url = parse(response.geturl())
+
+        # detect a redirection
+        new_loc = new_url.scheme + '://' + new_url.netloc
+        org_loc = org_url.scheme + '://' + org_url.netloc
+
+        self.is_redirected = not(new_loc == org_loc)
+
+        if self.is_redirected:
+            self.printer.print_debug_line('%s redirects to %s' % (org_loc, new_loc), 2)
+        else:
+            self.printer.print_debug_line('%s does not redirect' % (org_loc, ), 2)
+
+        # create a response object and add it to the cache
+        R = _create_response(response)
+        self.cache[new_loc] = R
+        self.cache[self.url] = R
+
+        return (self.is_redirected, new_loc)
+
+    def request(self, fp_list, run_type):
+        url = fp_list[0]['url']
+        complete_url = urlparse.urljoin(self.url, url)
+
+        R = None
+
+        # check if the url is out of scope
+        url_data = urlparse.urlparse(complete_url)
+        host_data = urlparse.urlparse(self.url)
+
+        if not url_data.netloc == host_data.netloc:
+            pass
+
+        elif not complete_url in self.cache:
+            try:
+                opener = self._create_fetcher()
+                request = urllib2.Request(complete_url)
+                response = opener.open(request)
+                R = _create_response(response)
+
+                if run_type == 'DiscoverMore':
+                    R.crawled_response = True
+
+                self.cache[complete_url] = R
+                self.cache[response.geturl()] = R
+            except Exception as e:
+                pass
+        else:
+            R = self.cache[complete_url]
+
+        return (fp_list, R)
+
+    def run(self, run_type=None, fp_lists=[]):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.threads) as executor:
+            future_list = []
+
+            for fp_list in fp_lists:
+                future_list.append(executor.submit(self.request, fp_list, run_type))
+
+            for future in concurrent.futures.as_completed(future_list):
+                self.requested.put(future.result())
+
+        return self.requested
\ No newline at end of file
diff --git a/wig.py b/wig.py
index 4d2beab..62309a6 100755
--- a/wig.py
+++ b/wig.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/python
 """
 wig - WebApp Information Gatherer
@@ -26,17 +26,22 @@
 """
-import time, queue, sys, argparse
+import time, sys, argparse
 from classes.cache import Cache
 from classes.results import Results
 from classes.fingerprints import Fingerprints
-from classes.discovery import *
 from classes.headers import ExtractHeaders
 from classes.matcher import Match
 from classes.printer import Printer
 from classes.output import OutputPrinter, OutputJSON
-from classes.request2 import Requester, UnknownHostName
-
+if sys.version_info.major == 3:
+    import queue
+    from classes.discovery import *
+    from classes.request2 import Requester, UnknownHostName
+elif sys.version_info.major == 2:
+    import Queue as queue
+    from classes2.discovery import *
+    from classes2.request2 import Requester, UnknownHostName
 class Wig(object):