diff --git a/README.md b/README.md index 1822ce2..d274f49 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,159 @@ Django Sitemap Generate ======================= -Background sitemap generation for Django +Background sitemap generation for Django. + +[![Build Status](https://github.com/just-work/django-sitemap-generate/workflows/build/badge.svg?branch=master&event=push)](https://github.com/just-work/django-sitemap-generate/actions?query=event%3Apush+branch%3Amaster+workflow%3Abuild) +[![codecov](https://codecov.io/gh/just-work/django-sitemap-generate/branch/master/graph/badge.svg)](https://codecov.io/gh/just-work/django-sitemap-generate) +[![PyPI version](https://badge.fury.io/py/django-sitemap-generate.svg)](https://badge.fury.io/py/django-sitemap-generate) + +Use case +-------- + +Almost every content site has a sitemap. Django provides an application serving +sitemap views, and it's OK if your website is small. If you have complicated +logic in sitemap generation or if you have millions of items in sitemap - you'll +have massive load spikes when Google and other search engines come with +thousands of their indexer bots. These bots will request the same sitemap pages in +parallel and those requests couldn't be cached because of large index interval +and small hit rate. + +The solution is to re-generate sitemap files periodically, once per day and not +once per search engine indexer. These files could be served as static files +which will not affect backend performance at all. + +Prerequisites +------------- + +This project uses index sitemap view and per-model sitemap views to generate +sitemap xml files. To provide it you will need the following. + +1. Add `django.contrib.sitemaps` to installed apps + ```python + INSTALLED_APPS.append('django.contrib.sitemaps') + ``` +2. 
Configure at least one sitemap + ```python + from django.contrib.sitemaps import Sitemap + + from testproject.testapp import models + + + class VideoSitemap(Sitemap): + name = 'video' + changefreq = 'daily' + limit = 50000 + + def items(self): + return models.Video.objects.order_by('id') + ``` + + Note that the `changefreq` parameter is a hint for search engine indexers; it + does not affect sitemap files generation period. + +3. Configure sitemap serving + ```python + from django.contrib.sitemaps import views + from django.urls import path + + from testproject.testapp.sitemaps import VideoSitemap, ArticleSitemap + + sitemaps = { + VideoSitemap.name: VideoSitemap, + ArticleSitemap.name: ArticleSitemap + } + + urlpatterns = [ + path('sitemap.xml', views.index, {'sitemaps': sitemaps}, + name='sitemap-index'), + path('sitemap-<section>.xml', views.sitemap, {'sitemaps': sitemaps}, + name='django.contrib.sitemaps.views.sitemap'), + ] + ``` + +Now your website supports sitemap views. + +Installation +------------ + +```shell script +pip install django-sitemap-generate +``` + +Working example is in `testproject.testapp`. + +1. Add `sitemap_generate` application to installed apps in django settings: + ```python + INSTALLED_APPS.append('sitemap_generate') + ``` +2. Add a reference to sitemap mapping to django settings: + ```python + SITEMAP_MAPPING = 'testproject.testapp.urls.sitemaps' + ``` +3. You may need to override default sitemap index url name + ```python + SITEMAP_INDEX_URL = 'sitemap-index' + ``` +4. Also you may need to set up forwarded protocol handling in django settings: + ```python + SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') + ``` +5. Note that django paginates sitemap with `p` query parameter, but + corresponding sitemap files are named `sitemap-video.xml`, + `sitemap-video2.xml` and so on. You'll need to configure some "rewrites". + +Usage +----- + +When you request sitemap over http, django substitutes website domain name from +request to links in sitemap xml. In background, you'll need some environment +variables. By default links are generated for `localhost` over HTTPS. + +```shell script +export \ + SITEMAP_PROTO=https \ + SITEMAP_HOST=github.com \ + SITEMAP_PORT=443 + +# generate all sitemaps +python manage.py generate_sitemap + +# generate sitemap for single model +python manage.py generate_sitemap video +``` + +You may run sitemap generation from crontab: + +``` +0 0 * * * python manage.py generate_sitemap +``` + +You may run sitemap generation from celery: + +```python +@celery.task +def generate_sitemap(): + generator = SitemapGenerator(sitemaps={'video': VideoSitemap}) + generator.generate() +``` + +And you will need to configure xml files static responses, e.g. 
in nginx: + +``` +location ~* /sitemaps/(?<fn>sitemap(-(article|video))?).xml { + try_files /media/sitemaps/$fn$arg_p.xml @backend; +} + +location /media/ { + alias /app/media/; +} + +location @backend { + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + set $app http://app:8000; + proxy_pass $app; +} +``` \ No newline at end of file diff --git a/sitemap_generate/defaults.py b/sitemap_generate/defaults.py new file mode 100644 index 0000000..7e9fb5b --- /dev/null +++ b/sitemap_generate/defaults.py @@ -0,0 +1,8 @@ +from os import getenv as e + +# Hostname used in sitemap links +SITEMAP_HOST = e('SITEMAP_HOST', 'localhost') +# Port used in sitemap links +SITEMAP_PORT = int(e('SITEMAP_PORT', 443)) +# Protocol used in sitemap links +SITEMAP_PROTO = e('SITEMAP_PROTO', 'https') diff --git a/sitemap_generate/generator.py b/sitemap_generate/generator.py new file mode 100644 index 0000000..268014e --- /dev/null +++ b/sitemap_generate/generator.py @@ -0,0 +1,134 @@ +import os +from logging import getLogger +from typing import Dict, Optional, Callable, Type +from urllib.parse import ParseResult, urlparse + +from django.contrib.sitemaps import Sitemap +from django.core.files.base import ContentFile +from django.core.files.storage import default_storage, Storage +from django.core.servers import basehttp +from django.http import HttpResponse +from django.urls import reverse + +from sitemap_generate import defaults + + +class SitemapError(Exception): + """ Sitemap generation error.""" + + def __init__(self, status_code, content): + super().__init__(status_code, content) + self.status_code = status_code + self.content = content + + +StartResponseFunc = Callable[[str, dict], None] +WSGIFunc = Callable[[dict, StartResponseFunc], HttpResponse] + + +class ResponseRecorder: + """ Helper for fetching sitemaps over WSGI request.""" + + def __init__(self, 
wsgi: WSGIFunc): + """ + + :param wsgi: Django wsgi application + """ + self.wsgi = wsgi + self.status: Optional[str] = None + + def record(self, url: str) -> bytes: + """ + Fetches an url over WSGI request and returns response content. + + :param url: request url + :returns: response content + :raises SitemapError: if response status code is not 200. + """ + url: ParseResult = urlparse(url) + + environ = { + 'REQUEST_METHOD': 'GET', + 'wsgi.input': '', + 'SERVER_NAME': defaults.SITEMAP_HOST, + 'SERVER_PORT': defaults.SITEMAP_PORT, + 'PATH_INFO': url.path, + 'QUERY_STRING': url.query, + 'HTTP_X_FORWARDED_PROTO': defaults.SITEMAP_PROTO + } + response = self.wsgi(environ, self._start_response) + if self.status != "200 OK": + raise SitemapError(self.status, response.content) + return response.content + + def _start_response(self, status, _): + """ WSGI headers callback func.""" + self.status = status + + +class SitemapGenerator: + """ Sitemap XML files generator.""" + + def __init__(self, + media_path: str = 'sitemaps', + storage: Storage = default_storage, + index_url_name: str = 'sitemap-index', + sitemaps: Optional[Dict[str, Type[Sitemap]]] = None): + """ + + :param media_path: relative path on file storage + :param storage: file storage implementation used for sitemaps + :param index_url_name: name of view serving sitemap index xml file + :param sitemaps: mapping: sitemap name -> sitemap implementation + """ + cls = self.__class__ + self.logger = getLogger(f'{cls.__module__}.{cls.__name__}') + self.sitemap_root = media_path + self.storage = storage + self.index_url_name = index_url_name + self.sitemaps = sitemaps or {} + self.recorder = ResponseRecorder( + basehttp.get_internal_wsgi_application()) + + def fetch_content(self, url: str) -> bytes: + """ Fetch sitemap xml content with wsgi request recorder.""" + self.logger.debug(f"Fetching {url}...") + return self.recorder.record(url) + + def store_sitemap(self, filename: str, content: bytes): + """ Save sitemap 
content to file storage.""" + path = os.path.join(self.sitemap_root, filename) + if self.storage.exists(path): + self.storage.delete(path) + self.storage.save(path, ContentFile(content)) + + def generate(self, sitemap=None): + """ Generate all sitemap files.""" + self.logger.debug("Start sitemap generation.") + url = reverse(self.index_url_name) + + index_content = self.fetch_content(url) + self.store_sitemap('sitemap.xml', index_content) + + for name, sitemap_class in self.sitemaps.items(): + if sitemap and sitemap != name: + continue + self.logger.debug("Generating sitemap for %s", name) + self.generate_pages(name, sitemap_class()) + + self.logger.debug("Finish sitemap generation.") + + def generate_pages(self, section: str, sitemap: Sitemap): + """ Generate sitemap section pages.""" + url = reverse('django.contrib.sitemaps.views.sitemap', + kwargs={'section': section}) + for page in sitemap.paginator.page_range: + if page > 1: + page_url = f'{url}?p={page}' + filename = f'sitemap-{section}{page}.xml' + else: + page_url = url + filename = f'sitemap-{section}.xml' + + page_content = self.fetch_content(page_url) + self.store_sitemap(filename, page_content) diff --git a/sitemap_generate/management/__init__.py b/sitemap_generate/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sitemap_generate/management/commands/__init__.py b/sitemap_generate/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sitemap_generate/management/commands/generate_sitemap.py b/sitemap_generate/management/commands/generate_sitemap.py new file mode 100644 index 0000000..13a7995 --- /dev/null +++ b/sitemap_generate/management/commands/generate_sitemap.py @@ -0,0 +1,19 @@ +from django.conf import settings +from django.core.management import BaseCommand +from django.utils.module_loading import import_string + +from sitemap_generate.generator import SitemapGenerator + + +class Command(BaseCommand): + help = "generate sitemap xml 
files" + + sitemaps = import_string(settings.SITEMAP_MAPPING) + + def add_arguments(self, parser): + super().add_arguments(parser) + parser.add_argument('sitemap', type=str, nargs='?') + + def handle(self, *args, **options): + generator = SitemapGenerator(sitemaps=self.sitemaps) + generator.generate(options.get('sitemap')) diff --git a/testproject/settings.py b/testproject/settings.py index 49595c1..b1bf9fd 100644 --- a/testproject/settings.py +++ b/testproject/settings.py @@ -25,7 +25,7 @@ # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True -ALLOWED_HOSTS = [] +ALLOWED_HOSTS = ['localhost'] # Application definition @@ -35,6 +35,7 @@ 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', + 'django.contrib.sitemaps', 'django.contrib.messages', 'django.contrib.staticfiles', 'sitemap_generate', @@ -122,4 +123,10 @@ STATIC_URL = '/static/' # Mocked file storage for tests -DEFAULT_FILE_STORAGE = 'inmemorystorage.InMemoryStorage' \ No newline at end of file +DEFAULT_FILE_STORAGE = 'inmemorystorage.InMemoryStorage' + +# Sitemap-Generate +SITEMAP_MAPPING = 'testproject.testapp.urls.sitemaps' + +# Proxy-pass +SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') diff --git a/testproject/testapp/migrations/0001_initial.py b/testproject/testapp/migrations/0001_initial.py new file mode 100644 index 0000000..1eb78e5 --- /dev/null +++ b/testproject/testapp/migrations/0001_initial.py @@ -0,0 +1,26 @@ +# Generated by Django 3.0.5 on 2020-04-03 13:05 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Article', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ], + ), + migrations.CreateModel( + name='Video', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ], + ), 
+ ] diff --git a/testproject/testapp/sitemaps.py b/testproject/testapp/sitemaps.py new file mode 100644 index 0000000..9c6ed18 --- /dev/null +++ b/testproject/testapp/sitemaps.py @@ -0,0 +1,21 @@ +from django.contrib.sitemaps import Sitemap + +from testproject.testapp import models + + +class VideoSitemap(Sitemap): + name = 'video' + changefreq = 'daily' + limit = 1 + + def items(self): + return models.Video.objects.order_by('id') + + +class ArticleSitemap(Sitemap): + name = 'articles' + changefreq = 'daily' + limit = 50000 + + def items(self): + return models.Article.objects.order_by('id') diff --git a/testproject/testapp/tests.py b/testproject/testapp/tests.py index e69de29..a4eb756 100644 --- a/testproject/testapp/tests.py +++ b/testproject/testapp/tests.py @@ -0,0 +1,71 @@ +import os +from typing import cast + +from django.core.files.storage import default_storage, Storage +from django.core.management import call_command +from django.test import TestCase + +from sitemap_generate import defaults +from testproject.testapp import models + + +class GenerateSitemapCommandTestCase(TestCase): + header = ['', + ''] + footer = [''] + + empty = header + [''] + footer + + @classmethod + def setUpTestData(cls): + cls.videos = [models.Video.objects.create() for _ in range(2)] + cls.storage = cast(Storage, default_storage) + + def tearDown(self) -> None: + super().tearDown() + _, files = self.storage.listdir('sitemaps') + for path in files: + self.storage.delete(os.path.join('sitemaps', path)) + + def test_generate_sitemap(self): + """ Checks sitemap xml files generation.""" + call_command('generate_sitemap') + self.assertTrue(self.storage.exists('sitemaps/sitemap.xml')) + self.assertTrue(self.storage.exists('sitemaps/sitemap-video.xml')) + self.assertTrue(self.storage.exists('sitemaps/sitemap-video2.xml')) + self.assertTrue(self.storage.exists('sitemaps/sitemap-articles.xml')) + + with self.storage.open('sitemaps/sitemap.xml') as f: + content = f.read().decode('utf-8') + 
links = [ + 'sitemaps/sitemap-video.xml', + 'sitemaps/sitemap-video.xml?p=2', + 'sitemaps/sitemap-articles.xml', + ] + for link in links: + self.assertIn(link, content) + + with self.storage.open('sitemaps/sitemap-articles.xml') as f: + content = f.read().decode('utf-8').splitlines() + self.assertListEqual(content, self.empty) + + with self.storage.open('sitemaps/sitemap-video.xml') as f: + content = f.read().decode('utf-8').splitlines() + scheme = defaults.SITEMAP_PROTO + host = defaults.SITEMAP_HOST + port = defaults.SITEMAP_PORT + if (scheme, port) in [('https', 443), ('http', 80)]: + netloc = host + else: + netloc = f'{host}:{port}' + url = f'{scheme}://{netloc}/videos/{self.videos[0].pk}/' + url = f'{url}daily' + expected = self.header + [url] + self.footer + self.assertListEqual(content, expected) + + def test_generate_single_sitemap(self): + """ Management command allows to pass sitemap name.""" + call_command('generate_sitemap', sitemap='video') + self.assertTrue(self.storage.exists('sitemaps/sitemap.xml')) + self.assertTrue(self.storage.exists('sitemaps/sitemap-video.xml')) + self.assertFalse(self.storage.exists('sitemaps/sitemap-articles.xml')) diff --git a/testproject/testapp/urls.py b/testproject/testapp/urls.py new file mode 100644 index 0000000..24986fd --- /dev/null +++ b/testproject/testapp/urls.py @@ -0,0 +1,16 @@ +from django.contrib.sitemaps import views +from django.urls import path + +from testproject.testapp.sitemaps import VideoSitemap, ArticleSitemap + +sitemaps = { + VideoSitemap.name: VideoSitemap, + ArticleSitemap.name: ArticleSitemap +} + +urlpatterns = [ + path('sitemap.xml', views.index, {'sitemaps': sitemaps}, + name='sitemap-index'), + path('sitemap-
.xml', views.sitemap, {'sitemaps': sitemaps}, + name='django.contrib.sitemaps.views.sitemap'), +] diff --git a/testproject/urls.py b/testproject/urls.py index 7843628..b0fafde 100644 --- a/testproject/urls.py +++ b/testproject/urls.py @@ -14,8 +14,9 @@ 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ from django.contrib import admin -from django.urls import path +from django.urls import path, include urlpatterns = [ path('admin/', admin.site.urls), + path('sitemaps/', include('testproject.testapp.urls')) ]