From da5ab0deba2c9e6a0ef3152eb8bec578319600a0 Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 17:46:50 -0700 Subject: [PATCH 01/15] Cleaning up readme --- README | 6 ------ README.md | 10 ++++++++++ 2 files changed, 10 insertions(+), 6 deletions(-) delete mode 100644 README create mode 100644 README.md diff --git a/README b/README deleted file mode 100644 index 00f67b9..0000000 --- a/README +++ /dev/null @@ -1,6 +0,0 @@ -Python 2.5 runtime is deprecated since March 8, 2013. -http://googleappengine.blogspot.com/2013/03/python-25-thanks-for-good-times.html - -I fork the original version in order to support Python 2.7 runtime. You will not get any warning or error while deploying. - -Enjoy! \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..9c71ec1 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +Google App Engine app that Mirrors the content of URLs you supply. Rewrites the fetched page to mirror all content, including images, Flash, Javascript, CSS, and even favicons. You stay within the cache when you follow links. Useful for pulling load off of slashdotted servers. Also can be used to anonymize web access. + +For instructions on how to setup go here. + +[http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/) + + +For POST support and other features, see mirrorrr-plus: + +[https://code.google.com/p/mirrorrr-plus/](https://code.google.com/p/mirrorrr-plus/) From 8998df0c09c81df0772ee5627ee293b817b59965 Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 18:28:41 -0700 Subject: [PATCH 02/15] Updated to work with Python 2.7, modules, concurrent requests --- .gitignore | 3 +- app.yaml | 17 ++++- index.yaml | 11 ---- main.html | 18 +---- mirror.py | 188 +++++++++++++++-------------------------------------- 5 files changed, 70 insertions(+), 167 deletions(-) delete mode 100644 index.yaml diff --git a/.gitignore b/.gitignore index c7f5a23..da75fc6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .svn -*.pyc \ No newline at end of file +*.pyc +.DS_Store diff --git a/app.yaml b/app.yaml index c0afb9f..4a2886a 100644 --- a/app.yaml +++ b/app.yaml @@ -1,8 +1,19 @@ -application: yourappid -version: secureable +application: your-app-id-here +version: first +module: default runtime: python27 api_version: 1 -threadsafe: true +threadsafe: yes + +inbound_services: +- warmup + +instance_class: F1 +automatic_scaling: + max_idle_instances: automatic + min_pending_latency: automatic + max_pending_latency: 500ms + max_concurrent_requests: 30 handlers: diff --git a/index.yaml b/index.yaml deleted file mode 100644 index a3b9e05..0000000 --- a/index.yaml +++ /dev/null @@ -1,11 +0,0 @@ -indexes: - -# AUTOGENERATED - -# This index.yaml is automatically updated whenever the dev_appserver -# detects that a new type of query is run. If you want to manage the -# index.yaml file manually, remove the above marker line (the line -# saying "# AUTOGENERATED"). If you want to manage some indexes -# manually, move them above the marker line. The index.yaml file is -# automatically uploaded to the admin console when you next deploy -# your application using appcfg.py. diff --git a/main.html b/main.html index b7754e9..9d28ccb 100644 --- a/main.html +++ b/main.html @@ -10,14 +10,14 @@ var activeColor = "#000000"; var disabledColor = "#959595"; var defaultValue = "type url here..."; - + function loadBox() { var box = document.getElementById("url_entry"); if (box.value != defaultValue) { box.style.color = activeColor; } } - + function focusBox() { var box = document.getElementById("url_entry"); if (box.value == defaultValue) { @@ -25,7 +25,7 @@ box.style.color = activeColor; } } - + function blurBox() { var box = document.getElementById("url_entry"); if (box.value == "") { @@ -56,18 +56,6 @@ -
-
recent
-
- {% for entry in latest_urls %} - - {% endfor %} -
-
-
{% if secure_url %}{% endif %} {% if secure_url %}not secure{% else %}secure{% endif %}
diff --git a/mirror.py b/mirror.py index b96f593..3812300 100644 --- a/mirror.py +++ b/mirror.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -# Copyright 2008 Brett Slatkin -# +# Copyright 2008-2014 Brett Slatkin +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,33 +33,30 @@ import transform_content -################################################################################ +############################################################################### DEBUG = False EXPIRATION_DELTA_SECONDS = 3600 -EXPIRATION_RECENT_URLS_SECONDS = 90 -## DEBUG = True -## EXPIRATION_DELTA_SECONDS = 10 -## EXPIRATION_RECENT_URLS_SECONDS = 1 +# DEBUG = True +# EXPIRATION_DELTA_SECONDS = 10 HTTP_PREFIX = "http://" -HTTPS_PREFIX = "http://" IGNORE_HEADERS = frozenset([ - 'set-cookie', - 'expires', - 'cache-control', + "set-cookie", + "expires", + "cache-control", # Ignore hop-by-hop headers - 'connection', - 'keep-alive', - 'proxy-authenticate', - 'proxy-authorization', - 'te', - 'trailers', - 'transfer-encoding', - 'upgrade', + "connection", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "te", + "trailers", + "transfer-encoding", + "upgrade", ]) TRANSFORMED_CONTENT_TYPES = frozenset([ @@ -67,34 +64,16 @@ "text/css", ]) -MIRROR_HOSTS = frozenset([ - 'mirrorr.com', - 'mirrorrr.com', - 'www.mirrorr.com', - 'www.mirrorrr.com', - 'www1.mirrorrr.com', - 'www2.mirrorrr.com', - 'www3.mirrorrr.com', -]) - MAX_CONTENT_SIZE = 10 ** 6 -MAX_URL_DISPLAY_LENGTH = 50 - -################################################################################ +############################################################################### def get_url_key_name(url): url_hash = hashlib.sha256() url_hash.update(url) return "hash_" + url_hash.hexdigest() -################################################################################ - -class EntryPoint(db.Model): - translated_address = db.TextProperty(required=True) - last_updated = db.DateTimeProperty(auto_now=True) - display_address = db.TextProperty() - +############################################################################### class MirroredContent(object): def __init__(self, original_address, translated_address, @@ -113,24 +92,18 @@ def get_by_key_name(key_name): @staticmethod def fetch_and_store(key_name, base_url, translated_address, mirrored_url): """Fetch and cache a page. - + Args: key_name: Hash to use to store the cached page. base_url: The hostname of the page that's being mirrored. translated_address: The URL of the mirrored page on this site. mirrored_url: The URL of the original page. Hostname should match the base_url. - + Returns: A new MirroredContent object, if the page was successfully retrieved. None if any errors occurred or the content could not be retrieved. """ - # Check for the X-Mirrorrr header to ignore potential loops. - if base_url in MIRROR_HOSTS: - logging.warning('Encountered recursive request for "%s"; ignoring', - mirrored_url) - return None - logging.debug("Fetching '%s'", mirrored_url) try: response = urlfetch.fetch(mirrored_url) @@ -147,7 +120,7 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url): content = response.content page_content_type = adjusted_headers.get("content-type", "") for content_type in TRANSFORMED_CONTENT_TYPES: - # Startswith() because there could be a 'charset=UTF-8' in the header. + # startswith() because there could be a 'charset=UTF-8' in the header. if page_content_type.startswith(content_type): content = transform_content.TransformContent(base_url, mirrored_url, content) @@ -155,7 +128,7 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url): # If the transformed content is over 1MB, truncate it (yikes!) if len(content) > MAX_CONTENT_SIZE: - logging.warning('Content is over 1MB; truncating') + logging.warning("Content is over 1MB; truncating") content = content[:MAX_CONTENT_SIZE] new_content = MirroredContent( @@ -168,10 +141,15 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url): if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS): logging.error('memcache.add failed: key_name = "%s", ' 'original_url = "%s"', key_name, mirrored_url) - + return new_content -################################################################################ +############################################################################### + +class WarmupHandler(webapp.RequestHandler): + def get(self): + pass + class BaseHandler(webapp2.RequestHandler): def get_relative_url(self): @@ -180,9 +158,19 @@ def get_relative_url(self): return "/" return self.request.url[slash:] + def is_recursive_request(self): + if "AppEngine-Google" in self.request.headers.get("User-Agent", ""): + logging.warning("Ignoring recursive request by user-agent=%r; ignoring") + self.error(404) + return True + return False + class HomeHandler(BaseHandler): def get(self): + if self.is_recursive_request(): + return + # Handle the input form to redirect the user to a relative url form_url = self.request.get("url") if form_url: @@ -192,29 +180,12 @@ def get(self): inputted_url = inputted_url[len(HTTP_PREFIX):] return self.redirect("/" + inputted_url) - latest_urls = memcache.get('latest_urls') - if latest_urls is None: - latest_urls = EntryPoint.gql("ORDER BY last_updated DESC").fetch(25) - - # Generate a display address that truncates the URL, adds an ellipsis. - # This is never actually saved in the Datastore. - for entry_point in latest_urls: - entry_point.display_address = \ - entry_point.translated_address[:MAX_URL_DISPLAY_LENGTH] - if len(entry_point.display_address) == MAX_URL_DISPLAY_LENGTH: - entry_point.display_address += '...' - - if not memcache.add('latest_urls', latest_urls, - time=EXPIRATION_RECENT_URLS_SECONDS): - logging.error('memcache.add failed: latest_urls') - # Do this dictionary construction here, to decouple presentation from # how we store data. secure_url = None if self.request.scheme == "http": - secure_url = "https://mirrorrr.appspot.com" + secure_url = "https://%s%s" % (self.request.host, self.request.path_qs) context = { - "latest_urls": latest_urls, "secure_url": secure_url, } self.response.out.write(template.render("main.html", context)) @@ -222,8 +193,11 @@ def get(self): class MirrorHandler(BaseHandler): def get(self, base_url): + if self.is_recursive_request(): + return + assert base_url - + # Log the user-agent and referrer, to see who is linking to us. logging.debug('User-Agent = "%s", Referrer = "%s"', self.request.user_agent, @@ -249,79 +223,19 @@ def get(self, base_url): if content is None: return self.error(404) - # Store the entry point down here, once we know the request is good and - # there has been a cache miss (i.e., the page expired). If the referrer - # wasn't local, or it was '/', then this is an entry point. - if (cache_miss and - 'Googlebot' not in self.request.user_agent and - 'Slurp' not in self.request.user_agent and - (not self.request.referer.startswith(self.request.host_url) or - self.request.referer == self.request.host_url + "/")): - # Ignore favicons as entry points; they're a common browser fetch on - # every request for a new site that we need to special case them here. - if not self.request.url.endswith("favicon.ico"): - logging.info("Inserting new entry point") - entry_point = EntryPoint( - key_name=key_name, - translated_address=translated_address) - try: - entry_point.put() - except (db.Error, apiproxy_errors.Error): - logging.exception("Could not insert EntryPoint") - for key, value in content.headers.iteritems(): self.response.headers[key] = value if not DEBUG: - self.response.headers['cache-control'] = \ - 'max-age=%d' % EXPIRATION_DELTA_SECONDS + self.response.headers["cache-control"] = \ + "max-age=%d" % EXPIRATION_DELTA_SECONDS self.response.out.write(content.data) - -class AdminHandler(webapp2.RequestHandler): - def get(self): - self.response.headers['content-type'] = 'text/plain' - self.response.out.write(str(memcache.get_stats())) - - -class KaboomHandler(webapp2.RequestHandler): - def get(self): - self.response.headers['content-type'] = 'text/plain' - self.response.out.write('Flush successful: %s' % memcache.flush_all()) - - -class CleanupHandler(webapp2.RequestHandler): - """Cleans up EntryPoint records.""" - - def get(self): - keep_cleaning = True - try: - content_list = EntryPoint.gql('ORDER BY last_updated').fetch(25) - keep_cleaning = (len(content_list) > 0) - db.delete(content_list) - - if content_list: - message = "Deleted %d entities" % len(content_list) - else: - keep_cleaning = False - message = "Done" - except (db.Error, apiproxy_errors.Error), e: - keep_cleaning = True - message = "%s: %s" % (e.__class__, e) - - context = { - 'keep_cleaning': keep_cleaning, - 'message': message, - } - self.response.out.write(template.render('cleanup.html', context)) - -################################################################################ +############################################################################### app = webapp2.WSGIApplication([ (r"/", HomeHandler), (r"/main", HomeHandler), - (r"/kaboom", KaboomHandler), - (r"/admin", AdminHandler), - (r"/cleanup", CleanupHandler), - (r"/([^/]+).*", MirrorHandler) + (r"/([^/]+).*", MirrorHandler), + (r"/warmup", WarmupHandler), ], debug=DEBUG) From c0842bf84cccd52f26eb0e967f93cd20ef2057ad Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 18:32:22 -0700 Subject: [PATCH 03/15] Fixing bad merge with upstream --- mirror.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mirror.py b/mirror.py index 3812300..4fe1a5b 100644 --- a/mirror.py +++ b/mirror.py @@ -146,7 +146,7 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url): ############################################################################### -class WarmupHandler(webapp.RequestHandler): +class WarmupHandler(webapp2.RequestHandler): def get(self): pass From 148e50009d0ba8cbbc9d102a1aa022d4c7be75f0 Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 18:32:46 -0700 Subject: [PATCH 04/15] Small readme change --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c71ec1..21d2e0b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ Google App Engine app that Mirrors the content of URLs you supply. Rewrites the fetched page to mirror all content, including images, Flash, Javascript, CSS, and even favicons. You stay within the cache when you follow links. Useful for pulling load off of slashdotted servers. Also can be used to anonymize web access. -For instructions on how to setup go here. +For instructions on how to setup go here: [http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/) From d52619597720700ed2ed71ae708995fcf63f6fd4 Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 18:33:15 -0700 Subject: [PATCH 05/15] Fixing copyright --- transform_content.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/transform_content.py b/transform_content.py index 313bf73..c5bb1dd 100644 --- a/transform_content.py +++ b/transform_content.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -# Copyright 2008 Brett Slatkin -# +# Copyright 2008-2014 Brett Slatkin +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -78,7 +78,7 @@ (CSS_URL_START + SAME_DIR_URL_REGEX, "url(\g%(accessed_dir)s\g"), - + (CSS_URL_START + TRAVERSAL_URL_REGEX, "url(\g%(accessed_dir)s/\g/\g"), From 525e5d36045c9e0aa55437e233dc9a0d54e3b03b Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 18:39:18 -0700 Subject: [PATCH 06/15] Pointing to live instance --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 21d2e0b..59edac2 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,12 @@ Google App Engine app that Mirrors the content of URLs you supply. Rewrites the fetched page to mirror all content, including images, Flash, Javascript, CSS, and even favicons. You stay within the cache when you follow links. Useful for pulling load off of slashdotted servers. Also can be used to anonymize web access. -For instructions on how to setup go here: +Example live version: -[http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/) +[https://mirrorrr.appspot.com](https://mirrorrr.appspot.com) + +Instructions on how to setup your own proxy: +[http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/) For POST support and other features, see mirrorrr-plus: From 7aba09498a455ce871def362944978444a75a740 Mon Sep 17 00:00:00 2001 From: Brett Slatkin Date: Sun, 20 Apr 2014 18:53:27 -0700 Subject: [PATCH 07/15] Tuning params --- app.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/app.yaml b/app.yaml index 4a2886a..e119598 100644 --- a/app.yaml +++ b/app.yaml @@ -10,10 +10,9 @@ inbound_services: instance_class: F1 automatic_scaling: - max_idle_instances: automatic - min_pending_latency: automatic - max_pending_latency: 500ms - max_concurrent_requests: 30 + min_idle_instances: 1 + max_idle_instances: 1 + max_concurrent_requests: 40 handlers: From ff4c1e70c26f7e8573894a51007c82af9af5f719 Mon Sep 17 00:00:00 2001 From: solsTiCe d'Hiver Date: Mon, 26 Jan 2015 09:43:45 +0100 Subject: [PATCH 08/15] Clean up * Remove unused url * Fix warmup url * Remove base.#.css * Remove cleanup.html --- app.yaml | 19 +------------------ cleanup.html | 11 ----------- main.html | 2 +- mirror.py | 2 +- 4 files changed, 3 insertions(+), 31 deletions(-) delete mode 100644 cleanup.html diff --git a/app.yaml b/app.yaml index e119598..c95755e 100644 --- a/app.yaml +++ b/app.yaml @@ -15,7 +15,6 @@ automatic_scaling: max_concurrent_requests: 40 handlers: - - url: /robots\.txt static_files: static/robots.txt upload: static/robots\.txt @@ -25,27 +24,11 @@ handlers: upload: static/favicon\.ico secure: optional -- url: /static/base(\.[0-9])\.css - static_files: static/base.css - upload: static/base\.css - secure: optional - - url: /static static_dir: static secure: optional -- url: /admin - login: admin - script: mirror.app - secure: optional - -- url: /cleanup - login: admin - script: mirror.app - secure: optional - -- url: /kaboom - login: admin +- url: /_ah/warmup script: mirror.app secure: optional diff --git a/cleanup.html b/cleanup.html deleted file mode 100644 index be9560b..0000000 --- a/cleanup.html +++ /dev/null @@ -1,11 +0,0 @@ - - - Cleanup - {% if keep_cleaning %} - - {% endif %} - - -{{message}} - - \ No newline at end of file diff --git a/main.html b/main.html index 9d28ccb..87346cd 100644 --- a/main.html +++ b/main.html @@ -4,7 +4,7 @@ mirror - ɹoɹɹıɯ - + + + - -
-
+ - +
+

mıɾɾoɾ - ɹoɹɹıɯ

+
+ +
-
+
- http:// + http:// +
- -
- Fair use: All content belongs to the original copyright holders, respectively. -
-
- {% if secure_url %}{% endif %} {% if secure_url %}not secure{% else %}secure{% endif %} +
+Fair use: All content belongs to the original copyright holders, respectively.
-
-
+ - - + diff --git a/static/base.css b/static/base.css index caa8c74..be70848 100644 --- a/static/base.css +++ b/static/base.css @@ -11,7 +11,7 @@ body { color: #000000; } -#header { +header { text-align: center; font-size: 20px; letter-spacing: 10px; @@ -19,16 +19,6 @@ body { margin-top: 40px; } -#wrapper { - width: 100%; -} - -#container { - margin-left: auto; - margin-right: auto; - width: 600px; -} - #form_wrapper { margin-top: 10px; width: 100%; @@ -66,58 +56,9 @@ body { margin-top: 20px; letter-spacing: 3px; color: #7f7f7f; + text-align: center; } -#recent { - margin-top: 50px; -} - -#recent_entries { - margin-left: 20px; -} - -.url_container { - margin-top: 3px; - padding-left: 5px; -} - -.info { - font-family: sans-serif; - font-size: 10px; - color: #606060; - margin-left: 5px; -} - -.url { - font-size: 12px; -} - -.url1 { color: #000000 !important; } -.url2 { color: #0a0a0a !important; } -.url3 { color: #151515 !important; } -.url4 { color: #1f1f1f !important; } -.url5 { color: #2a2a2a !important; } -.url6 { color: #353535 !important; } -.url7 { color: #3f3f3f !important; } -.url8 { color: #4a4a4a !important; } -.url9 { color: #555555 !important; } -.url10 { color: #5f5f5f !important; } -.url11 { color: #6a6a6a !important; } -.url12 { color: #757575 !important; } -.url13 { color: #7f7f7f !important; } -.url14 { color: #8a8a8a !important; } -.url15 { color: #959595 !important; } -.url16 { color: #9f9f9f !important; } -.url17 { color: #aaaaaa !important; } -.url18 { color: #b5b5b5 !important; } -.url19 { color: #bfbfbf !important; } -.url20 { color: #cacaca !important; } -.url21 { color: #d5d5d5 !important; } -.url22 { color: #dfdfdf !important; } -.url23 { color: #eaeaea !important; } -.url24 { color: #f5f5f5 !important; } -.url25 { color: #ffffff !important; } - /* secure link */ .secure { text-align: center; From 409a103fa1e33abebaa7ef392f1affbd0b6f1922 Mon Sep 17 00:00:00 2001 From: Gantron Date: Thu, 30 Mar 2017 09:40:51 -0700 Subject: [PATCH 14/15] Remove application and module --- app.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/app.yaml b/app.yaml index c95755e..615ddb3 100644 --- a/app.yaml +++ b/app.yaml @@ -1,6 +1,4 @@ -application: your-app-id-here version: first -module: default runtime: python27 api_version: 1 threadsafe: yes From 154fe6d1936042d17311ac1b3db55f61dc5f7adc Mon Sep 17 00:00:00 2001 From: Gantron Date: Thu, 30 Mar 2017 09:43:20 -0700 Subject: [PATCH 15/15] Update app.yaml --- app.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/app.yaml b/app.yaml index 615ddb3..dcdbd8d 100644 --- a/app.yaml +++ b/app.yaml @@ -1,4 +1,3 @@ -version: first runtime: python27 api_version: 1 threadsafe: yes