-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathaggregate.py
executable file
·68 lines (52 loc) · 2.16 KB
/
aggregate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Aggregates URLs from a set of XML sitemaps listed under the entry path.
This script processes the XML file at given path, opens all sitemaps
listed inside, and prints all URLs inside those maps to stdout.
It should support most sitemaps that comply with the spec at
https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd.
It was tested with sitemaps generated by the following WP plugins:
- (Google XML Sitemaps)[https://wordpress.org/plugins/google-sitemap-generator/]
- (XML Sitemap & Google News feeds)[https://wordpress.org/plugins/xml-sitemap-feed/]
- (Yoast SEO)[https://wordpress.org/plugins/wordpress-seo/]
Issues & Documentation: https://github.com/jsphpl/redirect-mapper
"""
import argparse
import ssl
from pprint import pprint
from urllib2 import urlopen
from xml.etree import ElementTree
# Shared SSL context passed to every urlopen() call in this script.
# It is configured to accept self-signed certificates, but note that the
# settings below turn off certificate verification AND hostname checking for
# every host, not only for self-signed ones.
# NOTE: the ssl module requires check_hostname to be disabled before
# verify_mode may be set to CERT_NONE, so keep the two assignments in order.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
# ElementTree spells a namespaced tag as '{namespace-uri}localname'; both
# tags used by this script live in the sitemap 0.9 namespace.
_SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'

# Fully-qualified tag names looked up while walking sitemap documents.
TAGS = {
    'SITEMAP': _SITEMAP_NS + 'sitemap',
    'LOCATION': _SITEMAP_NS + 'loc',
}
def main(args):
    """Print the URLs of every sitemap listed in the entry sitemap.

    Args:
        args: Parsed command-line arguments; only ``args.entry`` is read.
    """
    # aggregate() is a generator, so each sitemap location is handed to
    # process() before the next one is requested.
    for sitemap_url in aggregate(args.entry):
        process(sitemap_url)
def aggregate(entry):
    """Yield the locations of all sitemaps listed in one entry XML sitemap.

    Opens ``entry`` with ``urlopen`` (using the module-level ``ssl_context``),
    parses the response as XML and inspects every direct child of the root
    element whose tag is ``TAGS['SITEMAP']``.

    This is a generator: nothing is fetched until iteration starts.

    Args:
        entry: Location of the entry sitemap, passed unchanged to ``urlopen``.

    Yields:
        The whitespace-stripped text of each sitemap's ``TAGS['LOCATION']``
        child. Sitemap elements with a missing or empty location are skipped.
    """
    response = urlopen(entry, context=ssl_context)
    try:
        # parse() loads the whole document into the tree, so the response can
        # be released right away instead of staying open while the consumer
        # works (or leaking when the consumer stops iterating early).
        tree = ElementTree.parse(response)
    finally:
        response.close()

    for sitemap in tree.getroot().findall(TAGS['SITEMAP']):
        location = sitemap.find(TAGS['LOCATION'])
        if location is None or not location.text:
            continue
        # Pretty-printed sitemaps may pad the URL with whitespace/newlines.
        url = location.text.strip()
        if url:
            yield url
def process(source):
    """Print every URL found in a single XML sitemap.

    Opens ``source`` with ``urlopen`` (using the module-level ``ssl_context``),
    parses the response as XML and, for each direct child of the root element,
    prints the text of its ``TAGS['LOCATION']`` child to stdout.

    Args:
        source: Location of the sitemap, passed unchanged to ``urlopen``.
    """
    response = urlopen(source, context=ssl_context)
    try:
        tree = ElementTree.parse(response)
    finally:
        # Release the response even when the document fails to parse.
        response.close()

    for item in tree.getroot():
        location = item.find(TAGS['LOCATION'])
        if location is None or not location.text:
            # No usable URL here; avoid printing a literal "None".
            continue
        # Pretty-printed sitemaps may pad the URL with whitespace/newlines.
        url = location.text.strip()
        if url:
            print(url)
if __name__ == '__main__':
    # The module docstring doubles as the --help description; the raw
    # formatter keeps its line breaks as written.
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    cli.add_argument(
        'entry',
        type=str,
        metavar='URL/PATH',
        help='Path or URL of the root sitemap.',
    )
    main(cli.parse_args())