11import requests
22import xml .etree .ElementTree as ET
3+ from xml .dom import minidom
34from tqdm import tqdm
45from urllib .parse import quote
56
@@ -34,12 +35,14 @@ def fetch_sitemap(url):
3435 response .raise_for_status ()
3536 return response .text
3637
def serialize_xml(element):
    """Return *element* serialized as UTF-8 bytes with an XML declaration.

    The tree is emitted exactly as ElementTree produces it: no indentation
    or extra line breaks are inserted between elements.
    """
    xml_bytes = ET.tostring(element, xml_declaration=True, encoding='utf-8')
    return xml_bytes
def prettify_xml(element):
    """Return *element* as an indented XML string whose declaration names UTF-8.

    The tree is serialized with ElementTree, re-parsed with minidom and
    pretty-printed. Passing ``encoding`` to ``toprettyxml`` is what puts the
    encoding attribute into the XML declaration; it also makes the result
    bytes, so the output is decoded back to ``str`` for text-mode writing.
    """
    document = minidom.parseString(ET.tostring(element, encoding='utf-8'))
    pretty_bytes = document.toprettyxml(indent=" ", encoding="UTF-8")
    return pretty_bytes.decode('UTF-8')
4346
4447def encode_url (url ):
4548 """Encode the URL to make it XML-safe and RFC-compliant."""
@@ -54,7 +57,14 @@ def add_static_urls_without_translations(root, urls):
5457 root .append (url_element )
5558
5659def add_translated_urls (url_element , original_url ):
57- """Add translated URLs with language codes appended to the path."""
60+ """Add translated URLs with language codes appended to the path, including x-default."""
61+ # Add x-default hreflang pointing to the original URL
62+ alt_link_default = ET .SubElement (url_element , '{http://www.w3.org/1999/xhtml}link' )
63+ alt_link_default .set ('rel' , 'alternate' )
64+ alt_link_default .set ('hreflang' , 'x-default' )
65+ alt_link_default .set ('href' , encode_url (original_url ))
66+
67+ # Add hreflang links for each language
5868 for hreflang , lang_code in languages .items ():
5969 # Add the language code to the path
6070 path_parts = original_url .split ('/' , 3 )
@@ -75,11 +85,13 @@ def main():
7585 cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
7686
7787 # Fetch both sitemaps
88+ print ("Fetching sitemaps..." )
7889 book_sitemap_data = fetch_sitemap (book_sitemap_url )
7990 cloud_sitemap_data = fetch_sitemap (cloud_sitemap_url )
8091
8192 # Parse XML
8293 ns = {'ns' : 'http://www.sitemaps.org/schemas/sitemap/0.9' }
94+ print ("Parsing sitemaps..." )
8395 book_root = ET .fromstring (book_sitemap_data )
8496 cloud_root = ET .fromstring (cloud_sitemap_data )
8597
@@ -106,9 +118,11 @@ def main():
106118 "https://training.hacktricks.xyz/terms" ,
107119 "https://training.hacktricks.xyz/privacy" ,
108120 ]
121+ print ("Adding static URLs without translations..." )
109122 add_static_urls_without_translations (new_root , static_training_urls )
110123
111124 # Process main URLs from book and cloud hacktricks sitemaps
125+ print ("Processing main URLs with translations..." )
112126 for url_element in tqdm (all_urls , desc = "Processing URLs" ):
113127 loc = url_element .find ('ns:loc' , ns )
114128 if loc is None :
@@ -135,30 +149,18 @@ def main():
135149 lastmod_el = ET .SubElement (url_entry , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
136150 lastmod_el .text = lastmod .text
137151
138- # Add translations
152+ # Add translations and x-default
139153 add_translated_urls (url_entry , loc_text )
140154
141155 new_root .append (url_entry )
142156
143- # Serialize XML to bytes with XML declaration, no pretty formatting
144- serialized_xml = serialize_xml (new_root )
145-
146- # Convert bytes to string and replace single quotes with double quotes in XML declaration
147- serialized_xml_str = serialized_xml .decode ('utf-8' )
148- if serialized_xml_str .startswith ("<?xml" ):
149- # Replace single quotes with double quotes in the XML declaration only
150- xml_declaration_end = serialized_xml_str .find ("?>" ) + 2
151- xml_declaration = serialized_xml_str [:xml_declaration_end ]
152- xml_declaration = xml_declaration .replace ("'" , '"' )
153- rest_of_xml = serialized_xml_str [xml_declaration_end :]
154- serialized_xml_str = xml_declaration + rest_of_xml
155-
156- # Remove any newline or carriage return characters to ensure single-line XML
157- serialized_xml_str = serialized_xml_str .replace ('\n ' , '' ).replace ('\r ' , '' )
158-
159- # Write the serialized XML to file as text
157+ # Save prettified XML to file
158+ print ("Generating prettified XML sitemap..." )
159+ beautified_xml = prettify_xml (new_root )
160160 with open ("sitemap.xml" , "w" , encoding = "utf-8" ) as f :
161- f .write (serialized_xml_str )
161+ f .write (beautified_xml )
162+
163+ print ("sitemap.xml has been successfully generated." )
162164
163165if __name__ == "__main__" :
164166 main ()
0 commit comments