# aggregate.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Aggregates URLs from a set of XML sitemaps listed under the entry path.

This script processes the XML file at the given path, opens all sitemaps
listed inside, and prints all URLs found in those sitemaps to stdout.
It should support most sitemaps that comply with the spec at
https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd.

It was tested with sitemaps generated by the following WP plugins:
 - [Google XML Sitemaps](https://wordpress.org/plugins/google-sitemap-generator/)
 - [XML Sitemap & Google News feeds](https://wordpress.org/plugins/xml-sitemap-feed/)
 - [Yoast SEO](https://wordpress.org/plugins/wordpress-seo/)

Issues & Documentation: https://github.com/jsphpl/redirect-mapper
"""

import argparse
import ssl
from pprint import pprint
from urllib2 import urlopen
from xml.etree import ElementTree

# Shared SSL context passed to every urlopen() call in this script.
# Hostname checking and certificate verification are both switched off so
# that hosts with self-signed certificates can still be fetched. As a
# consequence the identity of the remote server is NOT verified.
# NOTE: check_hostname is cleared before verify_mode is set; the ssl module
# documents that CERT_NONE cannot be selected while hostname checking is
# still enabled, so keep the two assignments in this order.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# Namespace-qualified element names ("{namespace}tag") used when searching
# the parsed sitemap trees for <sitemap> and <loc> elements.
TAGS = dict(
    SITEMAP='{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap',
    LOCATION='{http://www.sitemaps.org/schemas/sitemap/0.9}loc',
)

def main(args):
    """Print every URL found in the sitemaps listed by the entry sitemap.

    `args` is the parsed command-line namespace; only `args.entry` is read.
    """
    # aggregate() is a generator, so child sitemaps are handed to process()
    # one at a time as their locations are yielded.
    for sitemap_url in aggregate(args.entry):
        process(sitemap_url)

def aggregate(entry):
    """Yield the location of every sitemap listed in an entry sitemap.

    The document at `entry` is fetched with `urlopen` (using the module's
    `ssl_context`) and parsed as XML. For each `<sitemap>` child of the root
    element, the text of its `<loc>` child is yielded with surrounding
    whitespace removed. Entries without a `<loc>`, or with an empty one,
    are skipped.

    Exceptions raised by `urlopen` or by the XML parser propagate to the
    caller when the generator is first advanced.
    """
    response = urlopen(entry, context=ssl_context)
    # try/finally rather than `with`: urllib2 response objects are not
    # context managers on Python 2.
    try:
        # parse() reads the whole document, so the connection can be
        # released right away instead of staying open while the generator
        # is suspended (or forever, if the consumer never exhausts it).
        tree = ElementTree.parse(response)
    finally:
        response.close()

    for sitemap in tree.getroot().findall(TAGS['SITEMAP']):
        location = sitemap.find(TAGS['LOCATION'])
        if location is None:
            continue
        # Pretty-printed sitemaps may pad the URL with whitespace, and an
        # empty <loc/> has text None; neither is usable as a URL as-is.
        url = (location.text or '').strip()
        if url:
            yield url

def process(source):
    """Print every URL listed in the sitemap at `source` to stdout.

    The document at `source` is fetched with `urlopen` (using the module's
    `ssl_context`) and parsed as XML. For each child of the root element,
    the text of its `<loc>` child is printed on its own line with
    surrounding whitespace removed. Children without a `<loc>`, or with an
    empty one, are skipped.

    Exceptions raised by `urlopen` or by the XML parser propagate to the
    caller.
    """
    response = urlopen(source, context=ssl_context)
    # try/finally rather than `with`: urllib2 response objects are not
    # context managers on Python 2.
    try:
        # parse() reads the whole document, so the connection is released
        # here even if parsing or the printing below raises.
        tree = ElementTree.parse(response)
    finally:
        response.close()

    for item in tree.getroot():
        location = item.find(TAGS['LOCATION'])
        if location is None:
            continue
        # An empty <loc/> has text None (which would print the literal
        # "None"); padded text would emit stray whitespace around the URL.
        url = (location.text or '').strip()
        if url:
            print(url)

if __name__ == '__main__':
    # The module docstring doubles as the --help description; the raw text
    # formatter is used so its line breaks are kept as written.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        'entry',
        type=str,
        metavar='URL/PATH',
        help='Path or URL of the root sitemap.',
    )

    cli_args = parser.parse_args()
    main(cli_args)