diff --git a/.gitignore b/.gitignore index c7f5a23..da75fc6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .svn -*.pyc \ No newline at end of file +*.pyc +.DS_Store diff --git a/README b/README deleted file mode 100644 index 00f67b9..0000000 --- a/README +++ /dev/null @@ -1,6 +0,0 @@ -Python 2.5 runtime is deprecated since March 8, 2013. -http://googleappengine.blogspot.com/2013/03/python-25-thanks-for-good-times.html - -I fork the original version in order to support Python 2.7 runtime. You will not get any warning or error while deploying. - -Enjoy! \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..59edac2 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +Google App Engine app that Mirrors the content of URLs you supply. Rewrites the fetched page to mirror all content, including images, Flash, Javascript, CSS, and even favicons. You stay within the cache when you follow links. Useful for pulling load off of slashdotted servers. Also can be used to anonymize web access. + +Example live version: + +[https://mirrorrr.appspot.com](https://mirrorrr.appspot.com) + +Instructions on how to setup your own proxy: + +[http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/) + +For POST support and other features, see mirrorrr-plus: + +[https://code.google.com/p/mirrorrr-plus/](https://code.google.com/p/mirrorrr-plus/) diff --git a/app.yaml b/app.yaml index c0afb9f..dcdbd8d 100644 --- a/app.yaml +++ b/app.yaml @@ -1,11 +1,17 @@ -application: yourappid -version: secureable runtime: python27 api_version: 1 -threadsafe: true +threadsafe: yes -handlers: +inbound_services: +- warmup + +instance_class: F1 +automatic_scaling: + min_idle_instances: 1 + max_idle_instances: 1 + max_concurrent_requests: 40 +handlers: - url: /robots\.txt static_files: static/robots.txt upload: static/robots\.txt @@ -15,27 +21,11 @@ handlers: upload: static/favicon\.ico secure: optional -- url: /static/base(\.[0-9])\.css - static_files: static/base.css - upload: static/base\.css - secure: optional - - url: /static static_dir: static secure: optional -- url: /admin - login: admin - script: mirror.app - secure: optional - -- url: /cleanup - login: admin - script: mirror.app - secure: optional - -- url: /kaboom - login: admin +- url: /_ah/warmup script: mirror.app secure: optional diff --git a/cleanup.html b/cleanup.html deleted file mode 100644 index be9560b..0000000 --- a/cleanup.html +++ /dev/null @@ -1,11 +0,0 @@ - - - Cleanup - {% if keep_cleaning %} - - {% endif %} - - -{{message}} - - \ No newline at end of file diff --git a/index.yaml b/index.yaml deleted file mode 100644 index a3b9e05..0000000 --- a/index.yaml +++ /dev/null @@ -1,11 +0,0 @@ -indexes: - -# AUTOGENERATED - -# This index.yaml is automatically updated whenever the dev_appserver -# detects that a new type of query is run. If you want to manage the -# index.yaml file manually, remove the above marker line (the line -# saying "# AUTOGENERATED"). If you want to manage some indexes -# manually, move them above the marker line. The index.yaml file is -# automatically uploaded to the admin console when you next deploy -# your application using appcfg.py. diff --git a/main.html b/main.html index b7754e9..e3fe8e8 100644 --- a/main.html +++ b/main.html @@ -1,89 +1,44 @@ - + + mirror - ɹoɹɹıɯ - - - + + - -
-
+ - +
+

mıɾɾoɾ - ɹoɹɹıɯ

+
+ +
-
+
- http:// + http:// +
- -
- Fair use: All content belongs to the original copyright holders, respectively. -
-
-
recent
-
- {% for entry in latest_urls %} - - {% endfor %} -
+
+Fair use: All content belongs to the original copyright holders, respectively.
-
- {% if secure_url %}{% endif %} {% if secure_url %}not secure{% else %}secure{% endif %} -
- -
-
+ - - + diff --git a/mirror.py b/mirror.py index b96f593..a28b76a 100644 --- a/mirror.py +++ b/mirror.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -# Copyright 2008 Brett Slatkin -# +# Copyright 2008-2014 Brett Slatkin +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,7 +18,6 @@ import datetime import hashlib import logging -import pickle import re import time import urllib @@ -26,40 +25,36 @@ from google.appengine.api import memcache from google.appengine.api import urlfetch -from google.appengine.ext import db import webapp2 from google.appengine.ext.webapp import template from google.appengine.runtime import apiproxy_errors import transform_content -################################################################################ +############################################################################### DEBUG = False EXPIRATION_DELTA_SECONDS = 3600 -EXPIRATION_RECENT_URLS_SECONDS = 90 -## DEBUG = True -## EXPIRATION_DELTA_SECONDS = 10 -## EXPIRATION_RECENT_URLS_SECONDS = 1 +# DEBUG = True +# EXPIRATION_DELTA_SECONDS = 10 HTTP_PREFIX = "http://" -HTTPS_PREFIX = "http://" IGNORE_HEADERS = frozenset([ - 'set-cookie', - 'expires', - 'cache-control', + "set-cookie", + "expires", + "cache-control", # Ignore hop-by-hop headers - 'connection', - 'keep-alive', - 'proxy-authenticate', - 'proxy-authorization', - 'te', - 'trailers', - 'transfer-encoding', - 'upgrade', + "connection", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "te", + "trailers", + "transfer-encoding", + "upgrade", ]) TRANSFORMED_CONTENT_TYPES = frozenset([ @@ -67,34 +62,16 @@ "text/css", ]) -MIRROR_HOSTS = frozenset([ - 'mirrorr.com', - 'mirrorrr.com', - 'www.mirrorr.com', - 'www.mirrorrr.com', - 'www1.mirrorrr.com', - 'www2.mirrorrr.com', - 'www3.mirrorrr.com', -]) - -MAX_CONTENT_SIZE = 10 ** 6 - -MAX_URL_DISPLAY_LENGTH = 50 +MAX_CONTENT_SIZE = 10 ** 6 - 600 -################################################################################ +############################################################################### def get_url_key_name(url): url_hash = hashlib.sha256() url_hash.update(url) return "hash_" + url_hash.hexdigest() -################################################################################ - -class EntryPoint(db.Model): - translated_address = db.TextProperty(required=True) - last_updated = db.DateTimeProperty(auto_now=True) - display_address = db.TextProperty() - +############################################################################### class MirroredContent(object): def __init__(self, original_address, translated_address, @@ -113,24 +90,18 @@ def get_by_key_name(key_name): @staticmethod def fetch_and_store(key_name, base_url, translated_address, mirrored_url): """Fetch and cache a page. - + Args: key_name: Hash to use to store the cached page. base_url: The hostname of the page that's being mirrored. translated_address: The URL of the mirrored page on this site. mirrored_url: The URL of the original page. Hostname should match the base_url. - + Returns: A new MirroredContent object, if the page was successfully retrieved. None if any errors occurred or the content could not be retrieved. """ - # Check for the X-Mirrorrr header to ignore potential loops. - if base_url in MIRROR_HOSTS: - logging.warning('Encountered recursive request for "%s"; ignoring', - mirrored_url) - return None - logging.debug("Fetching '%s'", mirrored_url) try: response = urlfetch.fetch(mirrored_url) @@ -147,17 +118,12 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url): content = response.content page_content_type = adjusted_headers.get("content-type", "") for content_type in TRANSFORMED_CONTENT_TYPES: - # Startswith() because there could be a 'charset=UTF-8' in the header. + # startswith() because there could be a 'charset=UTF-8' in the header. if page_content_type.startswith(content_type): content = transform_content.TransformContent(base_url, mirrored_url, content) break - # If the transformed content is over 1MB, truncate it (yikes!) - if len(content) > MAX_CONTENT_SIZE: - logging.warning('Content is over 1MB; truncating') - content = content[:MAX_CONTENT_SIZE] - new_content = MirroredContent( base_url=base_url, original_address=mirrored_url, @@ -165,13 +131,23 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url): status=response.status_code, headers=adjusted_headers, data=content) - if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS): - logging.error('memcache.add failed: key_name = "%s", ' - 'original_url = "%s"', key_name, mirrored_url) - + + # Do not memcache content over 1MB + if len(content) < MAX_CONTENT_SIZE: + if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS): + logging.error('memcache.add failed: key_name = "%s", ' + 'original_url = "%s"', key_name, mirrored_url) + else: + logging.warning("Content is over 1MB; not memcached") + return new_content -################################################################################ +############################################################################### + +class WarmupHandler(webapp2.RequestHandler): + def get(self): + pass + class BaseHandler(webapp2.RequestHandler): def get_relative_url(self): @@ -180,9 +156,19 @@ def get_relative_url(self): return "/" return self.request.url[slash:] + def is_recursive_request(self): + if "AppEngine-Google" in self.request.headers.get("User-Agent", ""): + logging.warning("Ignoring recursive request by user-agent=%r; ignoring") + self.error(404) + return True + return False + class HomeHandler(BaseHandler): def get(self): + if self.is_recursive_request(): + return + # Handle the input form to redirect the user to a relative url form_url = self.request.get("url") if form_url: @@ -192,29 +178,12 @@ def get(self): inputted_url = inputted_url[len(HTTP_PREFIX):] return self.redirect("/" + inputted_url) - latest_urls = memcache.get('latest_urls') - if latest_urls is None: - latest_urls = EntryPoint.gql("ORDER BY last_updated DESC").fetch(25) - - # Generate a display address that truncates the URL, adds an ellipsis. - # This is never actually saved in the Datastore. - for entry_point in latest_urls: - entry_point.display_address = \ - entry_point.translated_address[:MAX_URL_DISPLAY_LENGTH] - if len(entry_point.display_address) == MAX_URL_DISPLAY_LENGTH: - entry_point.display_address += '...' - - if not memcache.add('latest_urls', latest_urls, - time=EXPIRATION_RECENT_URLS_SECONDS): - logging.error('memcache.add failed: latest_urls') - # Do this dictionary construction here, to decouple presentation from # how we store data. secure_url = None if self.request.scheme == "http": - secure_url = "https://mirrorrr.appspot.com" + secure_url = "https://%s%s" % (self.request.host, self.request.path_qs) context = { - "latest_urls": latest_urls, "secure_url": secure_url, } self.response.out.write(template.render("main.html", context)) @@ -222,8 +191,11 @@ def get(self): class MirrorHandler(BaseHandler): def get(self, base_url): + if self.is_recursive_request(): + return + assert base_url - + # Log the user-agent and referrer, to see who is linking to us. logging.debug('User-Agent = "%s", Referrer = "%s"', self.request.user_agent, @@ -249,79 +221,19 @@ def get(self, base_url): if content is None: return self.error(404) - # Store the entry point down here, once we know the request is good and - # there has been a cache miss (i.e., the page expired). If the referrer - # wasn't local, or it was '/', then this is an entry point. - if (cache_miss and - 'Googlebot' not in self.request.user_agent and - 'Slurp' not in self.request.user_agent and - (not self.request.referer.startswith(self.request.host_url) or - self.request.referer == self.request.host_url + "/")): - # Ignore favicons as entry points; they're a common browser fetch on - # every request for a new site that we need to special case them here. - if not self.request.url.endswith("favicon.ico"): - logging.info("Inserting new entry point") - entry_point = EntryPoint( - key_name=key_name, - translated_address=translated_address) - try: - entry_point.put() - except (db.Error, apiproxy_errors.Error): - logging.exception("Could not insert EntryPoint") - for key, value in content.headers.iteritems(): self.response.headers[key] = value if not DEBUG: - self.response.headers['cache-control'] = \ - 'max-age=%d' % EXPIRATION_DELTA_SECONDS + self.response.headers["cache-control"] = \ + "max-age=%d" % EXPIRATION_DELTA_SECONDS self.response.out.write(content.data) - -class AdminHandler(webapp2.RequestHandler): - def get(self): - self.response.headers['content-type'] = 'text/plain' - self.response.out.write(str(memcache.get_stats())) - - -class KaboomHandler(webapp2.RequestHandler): - def get(self): - self.response.headers['content-type'] = 'text/plain' - self.response.out.write('Flush successful: %s' % memcache.flush_all()) - - -class CleanupHandler(webapp2.RequestHandler): - """Cleans up EntryPoint records.""" - - def get(self): - keep_cleaning = True - try: - content_list = EntryPoint.gql('ORDER BY last_updated').fetch(25) - keep_cleaning = (len(content_list) > 0) - db.delete(content_list) - - if content_list: - message = "Deleted %d entities" % len(content_list) - else: - keep_cleaning = False - message = "Done" - except (db.Error, apiproxy_errors.Error), e: - keep_cleaning = True - message = "%s: %s" % (e.__class__, e) - - context = { - 'keep_cleaning': keep_cleaning, - 'message': message, - } - self.response.out.write(template.render('cleanup.html', context)) - -################################################################################ +############################################################################### app = webapp2.WSGIApplication([ (r"/", HomeHandler), (r"/main", HomeHandler), - (r"/kaboom", KaboomHandler), - (r"/admin", AdminHandler), - (r"/cleanup", CleanupHandler), - (r"/([^/]+).*", MirrorHandler) + (r"/_ah/warmup", WarmupHandler), + (r"/([^/]+).*", MirrorHandler), ], debug=DEBUG) diff --git a/static/base.css b/static/base.css index b62820e..be70848 100644 --- a/static/base.css +++ b/static/base.css @@ -1,137 +1,78 @@ * { - margin: 0; - padding: 0; + margin: 0; + padding: 0; } body { - background-color: #ffffff; - font-family: sans-serif; - margin: 0px; - padding: 8px; - color: #000000; + background-color: #ffffff; + font-family: sans-serif; + margin: 0px; + padding: 8px; + color: #000000; } -#header { - text-align: center; - font-size: 20px; - letter-spacing: 10px; - margin-bottom: 40px; - margin-top: 40px; -} - -#wrapper { - width: 100%; -} - -#container { - margin-left: auto; - margin-right: auto; - width: 600px; +header { + text-align: center; + font-size: 20px; + letter-spacing: 10px; + margin-bottom: 40px; + margin-top: 40px; } #form_wrapper { - margin-top: 10px; - width: 100%; + margin-top: 10px; + width: 100%; } #input_wrapper { - width: 500px; - margin-left: auto; - margin-right: auto; + width: 500px; + margin-left: auto; + margin-right: auto; } #http_prefix { - vertical-align: middle; + vertical-align: middle; } #go_button { - width: 50px; - vertical-align: middle; + width: 50px; + vertical-align: middle; } #url_entry { - width: 375px; - border: 1px dashed black; - padding: 3px; - margin-left: 3px; - margin-right: 10px; - font-family: arial,helvetica,sans-serif; - font-weight: normal; - color: #959595; - font-size: 14px; + width: 375px; + border: 1px dashed black; + padding: 3px; + margin-left: 3px; + margin-right: 10px; + font-family: arial,helvetica,sans-serif; + font-weight: normal; + color: #959595; + font-size: 14px; } #warning { - font-size: 10px; - margin-top: 20px; - letter-spacing: 3px; - color: #7f7f7f; -} - -#recent { - margin-top: 50px; -} - -#recent_entries { - margin-left: 20px; + font-size: 10px; + margin-top: 20px; + letter-spacing: 3px; + color: #7f7f7f; + text-align: center; } -.url_container { - margin-top: 3px; - padding-left: 5px; -} - -.info { - font-family: sans-serif; - font-size: 10px; - color: #606060; - margin-left: 5px; -} - -.url { - font-size: 12px; -} - -.url1 { color: #000000 !important; } -.url2 { color: #0a0a0a !important; } -.url3 { color: #151515 !important; } -.url4 { color: #1f1f1f !important; } -.url5 { color: #2a2a2a !important; } -.url6 { color: #353535 !important; } -.url7 { color: #3f3f3f !important; } -.url8 { color: #4a4a4a !important; } -.url9 { color: #555555 !important; } -.url10 { color: #5f5f5f !important; } -.url11 { color: #6a6a6a !important; } -.url12 { color: #757575 !important; } -.url13 { color: #7f7f7f !important; } -.url14 { color: #8a8a8a !important; } -.url15 { color: #959595 !important; } -.url16 { color: #9f9f9f !important; } -.url17 { color: #aaaaaa !important; } -.url18 { color: #b5b5b5 !important; } -.url19 { color: #bfbfbf !important; } -.url20 { color: #cacaca !important; } -.url21 { color: #d5d5d5 !important; } -.url22 { color: #dfdfdf !important; } -.url23 { color: #eaeaea !important; } -.url24 { color: #f5f5f5 !important; } -.url25 { color: #ffffff !important; } - /* secure link */ .secure { - text-align: center; - margin-top: 50px; - vertical-align: middle; - font-size: 12px; + text-align: center; + margin-top: 50px; + vertical-align: middle; + font-size: 12px; } .secure img { - margin-bottom: -0.3em; - border: 0; + margin-bottom: -0.3em; + border: 0; } .secure a, .secure a:hover, .secure a:visited, .secure a:active { - color: #000; - text-decoration: none; + color: #000; + text-decoration: none; } diff --git a/static/lock.png b/static/lock.png index afd64f2..f2646f5 100644 Binary files a/static/lock.png and b/static/lock.png differ diff --git a/static/nolock.png b/static/nolock.png index 6a3cd91..ca13401 100644 Binary files a/static/nolock.png and b/static/nolock.png differ diff --git a/transform_content.py b/transform_content.py index 313bf73..c5bb1dd 100644 --- a/transform_content.py +++ b/transform_content.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -# Copyright 2008 Brett Slatkin -# +# Copyright 2008-2014 Brett Slatkin +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -78,7 +78,7 @@ (CSS_URL_START + SAME_DIR_URL_REGEX, "url(\g%(accessed_dir)s\g"), - + (CSS_URL_START + TRAVERSAL_URL_REGEX, "url(\g%(accessed_dir)s/\g/\g"),