tldlist/fetchroot.py at master · jophy/tldlist · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/python
# -*- coding:utf-8 -*-
__author__ = 'Jophy'
import urllib2
import re
import time
from xml.dom import minidom

try:
    from bs4 import BeautifulSoup
except ImportError:
    print 'This python program need import beautifulsoup \n' \
          'Lear more :http://www.crummy.com/software/BeautifulSoup/'


def tld_list():
    root_zone_url = 'http://www.iana.org/domains/root/db'
    try:
        content = urllib2.urlopen(root_zone_url).read()
    except Exception as e:
        print str(e)
    soup = BeautifulSoup(content)
    tr = soup.find_all(attrs={'class': re.compile(r'iana-group-\d')})
    for group in tr:
        f = open('tldlistnew.txt', 'ab+')
        link = 'http://www.iana.org' + group.a['href'].encode('utf-8')
        tld = group.a.get_text().encode('utf-8')
        idn = ''
        type = group.contents[3].get_text().encode('utf-8')
        if 'xn' in group.a['href']:
            idn = group.a['href'].rstrip('.html').lstrip('/domains/root/db/').encode('utf-8')
        f.write(('%s -- %s -- %s -- %s\n' % (tld, idn, type, link)))
        f.close()


def tld_parser():
    f = open('tldlist.txt', 'r')
    for line in f:
        i = open('tldall.txt', 'ab+')
        line = line.strip()
        tld_url = line.split(' -- ')[-1].strip()
        content = urllib2.urlopen(tld_url).read()
        re_nic = re.compile(r'<b>URL for registration services:</b> <a href="(.*)">.*</a><br/>')
        re_whois = re.compile(r'<b>WHOIS Server:</b>\s*(\S*)')
        nic_result = re_nic.findall(content)
        whois_result = re_whois.findall(content)
        print nic_result
        print whois_result

        if len(nic_result) > 0:
            i.write(line + ' -- ' + nic_result[0])
        else:
            i.write(line + ' -- ' + '')

        if len(whois_result) > 0:
            i.write(' -- ' + whois_result[0] + '\n')
        else:
            i.write(' -- ' + '' + '\n')
        i.close()
    f.close()


def adddom(tld_dic):
    #tld node
    tld = doc.createElement('tld')
    tld.setAttribute('id', str(tld_dic['id']))
    tldlist.appendChild(tld)

    #domain node
    domain = doc.createElement('domain')
    domain.appendChild(doc.createTextNode(tld_dic['domain']))
    tld.appendChild(domain)

    #idn node
    idn = doc.createElement('idn')
    idn.appendChild(doc.createTextNode(tld_dic['idn']))
    tld.appendChild(idn)

    #type node
    type = doc.createElement('type')
    type.appendChild(doc.createTextNode(tld_dic['type']))
    tld.appendChild(type)

    #nic node
    nic = doc.createElement('nic')
    nic.appendChild(doc.createTextNode(tld_dic['nic']))
    tld.appendChild(nic)

    #whois node
    whois = doc.createElement('whois')
    whois.appendChild(doc.createTextNode(tld_dic['whois']))
    tld.appendChild(whois)


def create_xml():
    doc.appendChild(doc.createComment('A tld xml including the whole gTLDs & ccTLDs . '))
    doc.appendChild(doc.createComment('Create time :' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
    doc.appendChild(doc.createComment('Author: Jophy (https://github.com/jophy)'))
    #tldlist = doc.createElement('tldlist')
    doc.appendChild(tldlist)
    all = open('tldall.txt', 'r')
    i = 1
    for line in all:
        split = line.strip('\n').split(' -- ')
        print split
        domain = split[0]
        idn = split[1]
        type = split[2]
        nic = split[4]
        whois = split[5]
        iana = split[3]
        adddom({
            'id': str(i),
            'domain': domain,
            'idn': idn,
            'type': type,
            'nic': nic,
            'whois': whois,
            'iana': iana
        })
        i += 1
    all.close()
    xml = open('tldlist.xml', 'w')
    xml.write(doc.toprettyxml())
    xml.close()

if __name__ == '__main__':
    '''

    Due to some unpredictable errors (Mostly Network Errors),

    I recommend you to run ONE function each time.

    You could also use my results directly .

    '''
    #First step : run tld_list() function , get tld list in brief
    #create tldlist.txt
    tld_list()

    #second step : run tld_parser() function , get tld information in details
    #create tldall.txt
    tld_parser()

    #third step : run following codes , get a xml file.
    #create tldlist.xml
    doc = minidom.Document()
    tldlist = doc.createElement('tldlist')
    create_xml()