# WebCrawler/webcrawler.py (from the f23y/WebCrawler repository on GitHub)

import sys
import multiprocessing
import re
import os

# Pick the urllib flavour that provides urlopen() on the running interpreter.
# `lib` is the module used for fetching pages; `python3` records which branch
# was taken so callers know whether fetched data needs decoding.
try:
    # Python 3
    import urllib.request as lib
    python3=True
except ImportError:
    # Python 2: urllib.request does not exist there, urllib itself has urlopen.
    import urllib as lib
    python3=False
def craw_links(url,depth,keywords,processed):
    '''Fetch one page, save it if it matches the keywords, then follow its links.

       url:the url to craw; urls that do not start with http:// or https://
           are ignored
       depth:the current depth to craw; links are followed only while depth>0
       keywords:the tuple of keywords to focus, joined with '|' into a single
           regular expression; when it is empty every fetched page is saved
       processed:list of urls already crawed; it is updated in place so that
           no url is fetched twice

       Always returns None. Pages are saved under the 'craw' directory, named
       after the url with ':' and '/' replaced by '_'. Errors raised while
       fetching a page or writing a file propagate to the caller.
    '''
    if not url.startswith(('http://','https://')):
        return
    if url in processed:
        # avoid processing the same url again
        return
    # mark this url as processed
    processed.append(url)

    # urljoin lives in different modules on Python 3 and Python 2.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    print('Crawing'+url+'...')
    fp=lib.urlopen(url)
    try:
        contents=fp.read()
    finally:
        # close the response even when read() fails
        fp.close()
    if python3:
        # Python3 returns bytes, so need to decode. Undecodable bytes are
        # replaced instead of raising, so a page that is not UTF-8 does not
        # abort the whole crawl.
        contents_decoded=contents.decode('UTF-8','replace')
    else:
        # Python2 returns str, does not need this decode.
        contents_decoded=contents

    pattern='|'.join(keywords)
    # if the keywords to filter is not given, save current page.
    flag=not pattern
    # searched is always bound, so the print below cannot raise NameError
    # when no keywords are given.
    searched=re.search(pattern, contents_decoded) if pattern else None
    print(flag,searched)
    # if this page contains certain keywords, save it to a file.
    if flag or searched:
        if not os.path.isdir('craw'):
            os.makedirs('craw')
        filename=url.replace(':','_').replace('/','_')
        # os.path.join keeps the file inside 'craw' on every platform; the
        # raw bytes are written so the saved copy matches what was fetched.
        with open(os.path.join('craw',filename), 'wb') as out:
            out.write(contents)

    if depth<=0:
        # nothing below this level is followed
        return
    # find all the links in the current page
    links=re.findall('href="(.*?)"', contents_decoded)
    # craw all links in the current page
    for link in links:
        # consider the relative path; absolute links come back unchanged
        link=urljoin(url,link)
        if link.endswith(('.htm','.html')):
            craw_links(link,depth-1,keywords,processed)

if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, then crawl
    # the start page and the .htm/.html pages it links to (depth 1).
    output_dir='craw'
    # isdir() is False both for a missing path and for a non-directory.
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    focus_keywords=('date','score')
    seen_urls=[]
    craw_links(r'http://www.xxxx.edu.cn/', 1, focus_keywords, seen_urls)