diff --git a/.gitignore b/.gitignore
index c7f5a23..da75fc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
.svn
-*.pyc
\ No newline at end of file
+*.pyc
+.DS_Store
diff --git a/README b/README
deleted file mode 100644
index 00f67b9..0000000
--- a/README
+++ /dev/null
@@ -1,6 +0,0 @@
-Python 2.5 runtime is deprecated since March 8, 2013.
-http://googleappengine.blogspot.com/2013/03/python-25-thanks-for-good-times.html
-
-I fork the original version in order to support Python 2.7 runtime. You will not get any warning or error while deploying.
-
-Enjoy!
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..59edac2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+Google App Engine app that mirrors the content of URLs you supply. It rewrites the fetched page to mirror all content, including images, Flash, JavaScript, CSS, and even favicons. You stay within the cache when you follow links. Useful for pulling load off of slashdotted servers. It can also be used to anonymize web access.
+
+Example live version:
+
+[https://mirrorrr.appspot.com](https://mirrorrr.appspot.com)
+
+Instructions on how to set up your own proxy:
+
+[http://www.hongkiat.com/blog/proxy-with-google-app-engine/](http://www.hongkiat.com/blog/proxy-with-google-app-engine/)
+
+For POST support and other features, see mirrorrr-plus:
+
+[https://code.google.com/p/mirrorrr-plus/](https://code.google.com/p/mirrorrr-plus/)
diff --git a/app.yaml b/app.yaml
index c0afb9f..dcdbd8d 100644
--- a/app.yaml
+++ b/app.yaml
@@ -1,11 +1,17 @@
-application: yourappid
-version: secureable
runtime: python27
api_version: 1
-threadsafe: true
+threadsafe: yes
-handlers:
+inbound_services:
+- warmup
+
+instance_class: F1
+automatic_scaling:
+ min_idle_instances: 1
+ max_idle_instances: 1
+ max_concurrent_requests: 40
+handlers:
- url: /robots\.txt
static_files: static/robots.txt
upload: static/robots\.txt
@@ -15,27 +21,11 @@ handlers:
upload: static/favicon\.ico
secure: optional
-- url: /static/base(\.[0-9])\.css
- static_files: static/base.css
- upload: static/base\.css
- secure: optional
-
- url: /static
static_dir: static
secure: optional
-- url: /admin
- login: admin
- script: mirror.app
- secure: optional
-
-- url: /cleanup
- login: admin
- script: mirror.app
- secure: optional
-
-- url: /kaboom
- login: admin
+- url: /_ah/warmup
script: mirror.app
secure: optional
diff --git a/cleanup.html b/cleanup.html
deleted file mode 100644
index be9560b..0000000
--- a/cleanup.html
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
+
-
+
+
+
-
-
-
- {% for entry in latest_urls %}
-
- {% endfor %}
-
+
+Fair use: All content belongs to the original copyright holders, respectively.
-
-
-
-
+
-
-
+
diff --git a/mirror.py b/mirror.py
index b96f593..a28b76a 100644
--- a/mirror.py
+++ b/mirror.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python
-# Copyright 2008 Brett Slatkin
-#
+# Copyright 2008-2014 Brett Slatkin
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,7 +18,6 @@
import datetime
import hashlib
import logging
-import pickle
import re
import time
import urllib
@@ -26,40 +25,36 @@
from google.appengine.api import memcache
from google.appengine.api import urlfetch
-from google.appengine.ext import db
import webapp2
from google.appengine.ext.webapp import template
from google.appengine.runtime import apiproxy_errors
import transform_content
-################################################################################
+###############################################################################
DEBUG = False
EXPIRATION_DELTA_SECONDS = 3600
-EXPIRATION_RECENT_URLS_SECONDS = 90
-## DEBUG = True
-## EXPIRATION_DELTA_SECONDS = 10
-## EXPIRATION_RECENT_URLS_SECONDS = 1
+# DEBUG = True
+# EXPIRATION_DELTA_SECONDS = 10
HTTP_PREFIX = "http://"
-HTTPS_PREFIX = "http://"
IGNORE_HEADERS = frozenset([
- 'set-cookie',
- 'expires',
- 'cache-control',
+ "set-cookie",
+ "expires",
+ "cache-control",
# Ignore hop-by-hop headers
- 'connection',
- 'keep-alive',
- 'proxy-authenticate',
- 'proxy-authorization',
- 'te',
- 'trailers',
- 'transfer-encoding',
- 'upgrade',
+ "connection",
+ "keep-alive",
+ "proxy-authenticate",
+ "proxy-authorization",
+ "te",
+ "trailers",
+ "transfer-encoding",
+ "upgrade",
])
TRANSFORMED_CONTENT_TYPES = frozenset([
@@ -67,34 +62,16 @@
"text/css",
])
-MIRROR_HOSTS = frozenset([
- 'mirrorr.com',
- 'mirrorrr.com',
- 'www.mirrorr.com',
- 'www.mirrorrr.com',
- 'www1.mirrorrr.com',
- 'www2.mirrorrr.com',
- 'www3.mirrorrr.com',
-])
-
-MAX_CONTENT_SIZE = 10 ** 6
-
-MAX_URL_DISPLAY_LENGTH = 50
+MAX_CONTENT_SIZE = 10 ** 6 - 600
-################################################################################
+###############################################################################
def get_url_key_name(url):
url_hash = hashlib.sha256()
url_hash.update(url)
return "hash_" + url_hash.hexdigest()
-################################################################################
-
-class EntryPoint(db.Model):
- translated_address = db.TextProperty(required=True)
- last_updated = db.DateTimeProperty(auto_now=True)
- display_address = db.TextProperty()
-
+###############################################################################
class MirroredContent(object):
def __init__(self, original_address, translated_address,
@@ -113,24 +90,18 @@ def get_by_key_name(key_name):
@staticmethod
def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
"""Fetch and cache a page.
-
+
Args:
key_name: Hash to use to store the cached page.
base_url: The hostname of the page that's being mirrored.
translated_address: The URL of the mirrored page on this site.
mirrored_url: The URL of the original page. Hostname should match
the base_url.
-
+
Returns:
A new MirroredContent object, if the page was successfully retrieved.
None if any errors occurred or the content could not be retrieved.
"""
- # Check for the X-Mirrorrr header to ignore potential loops.
- if base_url in MIRROR_HOSTS:
- logging.warning('Encountered recursive request for "%s"; ignoring',
- mirrored_url)
- return None
-
logging.debug("Fetching '%s'", mirrored_url)
try:
response = urlfetch.fetch(mirrored_url)
@@ -147,17 +118,12 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
content = response.content
page_content_type = adjusted_headers.get("content-type", "")
for content_type in TRANSFORMED_CONTENT_TYPES:
- # Startswith() because there could be a 'charset=UTF-8' in the header.
+ # startswith() because there could be a 'charset=UTF-8' in the header.
if page_content_type.startswith(content_type):
content = transform_content.TransformContent(base_url, mirrored_url,
content)
break
- # If the transformed content is over 1MB, truncate it (yikes!)
- if len(content) > MAX_CONTENT_SIZE:
- logging.warning('Content is over 1MB; truncating')
- content = content[:MAX_CONTENT_SIZE]
-
new_content = MirroredContent(
base_url=base_url,
original_address=mirrored_url,
@@ -165,13 +131,23 @@ def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
status=response.status_code,
headers=adjusted_headers,
data=content)
- if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
- logging.error('memcache.add failed: key_name = "%s", '
- 'original_url = "%s"', key_name, mirrored_url)
-
+
+ # Do not memcache content over 1MB
+ if len(content) < MAX_CONTENT_SIZE:
+ if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
+ logging.error('memcache.add failed: key_name = "%s", '
+ 'original_url = "%s"', key_name, mirrored_url)
+ else:
+ logging.warning("Content is over 1MB; not memcached")
+
return new_content
-################################################################################
+###############################################################################
+
+class WarmupHandler(webapp2.RequestHandler):
+  """Handler for the "/_ah/warmup" route; it deliberately does no work."""
+
+  def get(self):
+    # Nothing is read, fetched, or written for this request.
+    return None
+
class BaseHandler(webapp2.RequestHandler):
def get_relative_url(self):
@@ -180,9 +156,19 @@ def get_relative_url(self):
return "/"
return self.request.url[slash:]
+  def is_recursive_request(self):
+    """Reject requests whose User-Agent contains "AppEngine-Google".
+
+    Such requests are treated as recursive (presumably they originate from
+    App Engine's own URL fetcher, i.e. a mirror fetching a mirror -- TODO
+    confirm). When one is detected, a warning is logged and the response
+    status is set to 404 via self.error().
+
+    Returns:
+      True if the request was rejected, in which case the caller should
+      return without doing further work; False otherwise.
+    """
+    user_agent = self.request.headers.get("User-Agent", "")
+    if "AppEngine-Google" not in user_agent:
+      return False
+
+    # The user agent must be passed as a logging argument; without it the
+    # %r placeholder is written to the log literally.
+    logging.warning("Ignoring recursive request by user-agent=%r", user_agent)
+    self.error(404)
+    return True
+
class HomeHandler(BaseHandler):
def get(self):
+ if self.is_recursive_request():
+ return
+
# Handle the input form to redirect the user to a relative url
form_url = self.request.get("url")
if form_url:
@@ -192,29 +178,12 @@ def get(self):
inputted_url = inputted_url[len(HTTP_PREFIX):]
return self.redirect("/" + inputted_url)
- latest_urls = memcache.get('latest_urls')
- if latest_urls is None:
- latest_urls = EntryPoint.gql("ORDER BY last_updated DESC").fetch(25)
-
- # Generate a display address that truncates the URL, adds an ellipsis.
- # This is never actually saved in the Datastore.
- for entry_point in latest_urls:
- entry_point.display_address = \
- entry_point.translated_address[:MAX_URL_DISPLAY_LENGTH]
- if len(entry_point.display_address) == MAX_URL_DISPLAY_LENGTH:
- entry_point.display_address += '...'
-
- if not memcache.add('latest_urls', latest_urls,
- time=EXPIRATION_RECENT_URLS_SECONDS):
- logging.error('memcache.add failed: latest_urls')
-
# Do this dictionary construction here, to decouple presentation from
# how we store data.
secure_url = None
if self.request.scheme == "http":
- secure_url = "https://mirrorrr.appspot.com"
+ secure_url = "https://%s%s" % (self.request.host, self.request.path_qs)
context = {
- "latest_urls": latest_urls,
"secure_url": secure_url,
}
self.response.out.write(template.render("main.html", context))
@@ -222,8 +191,11 @@ def get(self):
class MirrorHandler(BaseHandler):
def get(self, base_url):
+ if self.is_recursive_request():
+ return
+
assert base_url
-
+
# Log the user-agent and referrer, to see who is linking to us.
logging.debug('User-Agent = "%s", Referrer = "%s"',
self.request.user_agent,
@@ -249,79 +221,19 @@ def get(self, base_url):
if content is None:
return self.error(404)
- # Store the entry point down here, once we know the request is good and
- # there has been a cache miss (i.e., the page expired). If the referrer
- # wasn't local, or it was '/', then this is an entry point.
- if (cache_miss and
- 'Googlebot' not in self.request.user_agent and
- 'Slurp' not in self.request.user_agent and
- (not self.request.referer.startswith(self.request.host_url) or
- self.request.referer == self.request.host_url + "/")):
- # Ignore favicons as entry points; they're a common browser fetch on
- # every request for a new site that we need to special case them here.
- if not self.request.url.endswith("favicon.ico"):
- logging.info("Inserting new entry point")
- entry_point = EntryPoint(
- key_name=key_name,
- translated_address=translated_address)
- try:
- entry_point.put()
- except (db.Error, apiproxy_errors.Error):
- logging.exception("Could not insert EntryPoint")
-
for key, value in content.headers.iteritems():
self.response.headers[key] = value
if not DEBUG:
- self.response.headers['cache-control'] = \
- 'max-age=%d' % EXPIRATION_DELTA_SECONDS
+ self.response.headers["cache-control"] = \
+ "max-age=%d" % EXPIRATION_DELTA_SECONDS
self.response.out.write(content.data)
-
-class AdminHandler(webapp2.RequestHandler):
- def get(self):
- self.response.headers['content-type'] = 'text/plain'
- self.response.out.write(str(memcache.get_stats()))
-
-
-class KaboomHandler(webapp2.RequestHandler):
- def get(self):
- self.response.headers['content-type'] = 'text/plain'
- self.response.out.write('Flush successful: %s' % memcache.flush_all())
-
-
-class CleanupHandler(webapp2.RequestHandler):
- """Cleans up EntryPoint records."""
-
- def get(self):
- keep_cleaning = True
- try:
- content_list = EntryPoint.gql('ORDER BY last_updated').fetch(25)
- keep_cleaning = (len(content_list) > 0)
- db.delete(content_list)
-
- if content_list:
- message = "Deleted %d entities" % len(content_list)
- else:
- keep_cleaning = False
- message = "Done"
- except (db.Error, apiproxy_errors.Error), e:
- keep_cleaning = True
- message = "%s: %s" % (e.__class__, e)
-
- context = {
- 'keep_cleaning': keep_cleaning,
- 'message': message,
- }
- self.response.out.write(template.render('cleanup.html', context))
-
-################################################################################
+###############################################################################
app = webapp2.WSGIApplication([
(r"/", HomeHandler),
(r"/main", HomeHandler),
- (r"/kaboom", KaboomHandler),
- (r"/admin", AdminHandler),
- (r"/cleanup", CleanupHandler),
- (r"/([^/]+).*", MirrorHandler)
+ (r"/_ah/warmup", WarmupHandler),
+ (r"/([^/]+).*", MirrorHandler),
], debug=DEBUG)
diff --git a/static/base.css b/static/base.css
index b62820e..be70848 100644
--- a/static/base.css
+++ b/static/base.css
@@ -1,137 +1,78 @@
* {
- margin: 0;
- padding: 0;
+ margin: 0;
+ padding: 0;
}
body {
- background-color: #ffffff;
- font-family: sans-serif;
- margin: 0px;
- padding: 8px;
- color: #000000;
+ background-color: #ffffff;
+ font-family: sans-serif;
+ margin: 0px;
+ padding: 8px;
+ color: #000000;
}
-#header {
- text-align: center;
- font-size: 20px;
- letter-spacing: 10px;
- margin-bottom: 40px;
- margin-top: 40px;
-}
-
-#wrapper {
- width: 100%;
-}
-
-#container {
- margin-left: auto;
- margin-right: auto;
- width: 600px;
+header {
+ text-align: center;
+ font-size: 20px;
+ letter-spacing: 10px;
+ margin-bottom: 40px;
+ margin-top: 40px;
}
#form_wrapper {
- margin-top: 10px;
- width: 100%;
+ margin-top: 10px;
+ width: 100%;
}
#input_wrapper {
- width: 500px;
- margin-left: auto;
- margin-right: auto;
+ width: 500px;
+ margin-left: auto;
+ margin-right: auto;
}
#http_prefix {
- vertical-align: middle;
+ vertical-align: middle;
}
#go_button {
- width: 50px;
- vertical-align: middle;
+ width: 50px;
+ vertical-align: middle;
}
#url_entry {
- width: 375px;
- border: 1px dashed black;
- padding: 3px;
- margin-left: 3px;
- margin-right: 10px;
- font-family: arial,helvetica,sans-serif;
- font-weight: normal;
- color: #959595;
- font-size: 14px;
+ width: 375px;
+ border: 1px dashed black;
+ padding: 3px;
+ margin-left: 3px;
+ margin-right: 10px;
+ font-family: arial,helvetica,sans-serif;
+ font-weight: normal;
+ color: #959595;
+ font-size: 14px;
}
#warning {
- font-size: 10px;
- margin-top: 20px;
- letter-spacing: 3px;
- color: #7f7f7f;
-}
-
-#recent {
- margin-top: 50px;
-}
-
-#recent_entries {
- margin-left: 20px;
+ font-size: 10px;
+ margin-top: 20px;
+ letter-spacing: 3px;
+ color: #7f7f7f;
+ text-align: center;
}
-.url_container {
- margin-top: 3px;
- padding-left: 5px;
-}
-
-.info {
- font-family: sans-serif;
- font-size: 10px;
- color: #606060;
- margin-left: 5px;
-}
-
-.url {
- font-size: 12px;
-}
-
-.url1 { color: #000000 !important; }
-.url2 { color: #0a0a0a !important; }
-.url3 { color: #151515 !important; }
-.url4 { color: #1f1f1f !important; }
-.url5 { color: #2a2a2a !important; }
-.url6 { color: #353535 !important; }
-.url7 { color: #3f3f3f !important; }
-.url8 { color: #4a4a4a !important; }
-.url9 { color: #555555 !important; }
-.url10 { color: #5f5f5f !important; }
-.url11 { color: #6a6a6a !important; }
-.url12 { color: #757575 !important; }
-.url13 { color: #7f7f7f !important; }
-.url14 { color: #8a8a8a !important; }
-.url15 { color: #959595 !important; }
-.url16 { color: #9f9f9f !important; }
-.url17 { color: #aaaaaa !important; }
-.url18 { color: #b5b5b5 !important; }
-.url19 { color: #bfbfbf !important; }
-.url20 { color: #cacaca !important; }
-.url21 { color: #d5d5d5 !important; }
-.url22 { color: #dfdfdf !important; }
-.url23 { color: #eaeaea !important; }
-.url24 { color: #f5f5f5 !important; }
-.url25 { color: #ffffff !important; }
-
/* secure link */
.secure {
- text-align: center;
- margin-top: 50px;
- vertical-align: middle;
- font-size: 12px;
+ text-align: center;
+ margin-top: 50px;
+ vertical-align: middle;
+ font-size: 12px;
}
.secure img {
- margin-bottom: -0.3em;
- border: 0;
+ margin-bottom: -0.3em;
+ border: 0;
}
.secure a, .secure a:hover, .secure a:visited, .secure a:active {
- color: #000;
- text-decoration: none;
+ color: #000;
+ text-decoration: none;
}
diff --git a/static/lock.png b/static/lock.png
index afd64f2..f2646f5 100644
Binary files a/static/lock.png and b/static/lock.png differ
diff --git a/static/nolock.png b/static/nolock.png
index 6a3cd91..ca13401 100644
Binary files a/static/nolock.png and b/static/nolock.png differ
diff --git a/transform_content.py b/transform_content.py
index 313bf73..c5bb1dd 100644
--- a/transform_content.py
+++ b/transform_content.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python
-# Copyright 2008 Brett Slatkin
-#
+# Copyright 2008-2014 Brett Slatkin
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -78,7 +78,7 @@
(CSS_URL_START + SAME_DIR_URL_REGEX,
"url(\g
%(accessed_dir)s\g"),
-
+
(CSS_URL_START + TRAVERSAL_URL_REGEX,
"url(\g%(accessed_dir)s/\g/\g"),