-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebcrawler.py
More file actions
73 lines (71 loc) · 2.42 KB
/
webcrawler.py
File metadata and controls
73 lines (71 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import sys
import multiprocessing
import re
import os
try:
# Python 3
import urllib.request as lib
python3=True
except Exception:
# Python 2
import urllib as lib
python3=False
def craw_links(url, depth, keywords, processed):
    '''Craw url, save it when it matches, then follow its .htm/.html links.

    url: the url to craw; urls that do not start with http:// or https://
        are ignored
    depth: the current depth to craw; links found in the page are only
        followed while depth is greater than 0
    keywords: the tuple of keywords to focus; they are joined with '|' and
        searched as a regular expression, and when the joined pattern is
        empty every page is saved
    processed: list of urls already visited; it is updated in place so the
        same url is never fetched twice

    Matching pages are written to the 'craw' directory, named after the url
    with ':' and '/' replaced by '_'. A page that cannot be fetched is
    reported and skipped. Always returns None.
    '''
    # urljoin lives in different modules on Python 3 and Python 2.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    if not url.startswith(('http://', 'https://')):
        return
    if url in processed:
        # avoid processing the same url again
        return
    # mark this url as processed
    processed.append(url)

    print('Crawing'+url+'...')
    try:
        fp = lib.urlopen(url)
        try:
            contents = fp.read()
        finally:
            # close the connection even when reading fails
            fp.close()
    except IOError as err:
        # One unreachable page must not abort the whole crawl.
        print('Failed to fetch '+url+': '+str(err))
        return

    if python3:
        # Python3 returns bytes, so need to decode. Undecodable bytes are
        # replaced instead of raising, because not every page is UTF-8.
        contents_decoded = contents.decode('UTF-8', 'replace')
    else:
        # Python2 returns str, does not need this decode.
        contents_decoded = contents

    # if this page contains certain keywords, save it to a file.
    pattern = '|'.join(keywords)
    if pattern:
        flag = False
        searched = re.search(pattern, contents_decoded)
    else:
        # if the keywords to filter is not given, save current page.
        flag = True
        searched = None
    # diagnostic output kept from the original implementation
    print(flag, searched)
    if flag or searched:
        if not os.path.isdir('craw'):
            os.makedirs('craw')
        filename = url.replace(':', '_').replace('/', '_')
        # os.path.join picks the right separator on every platform;
        # the raw response body is what gets stored.
        with open(os.path.join('craw', filename),
                  'wb' if python3 else 'w') as out:
            out.write(contents)

    if depth <= 0:
        # nothing below this page should be followed
        return
    # craw all links in the current page
    for link in re.findall('href="(.*?)"', contents_decoded):
        # consider the relative path
        if not link.startswith(('http://', 'https://')):
            link = urljoin(url, link)
        if link.endswith(('.htm', '.html')):
            craw_links(link, depth-1, keywords, processed)
if __name__ == "__main__":
    # Script entry point: make sure the output directory is there, then
    # craw one level deep from the start page, keeping the pages that
    # mention any of the keywords.
    output_dir = 'craw'
    if not (os.path.exists(output_dir) and os.path.isdir(output_dir)):
        os.mkdir(output_dir)
    focus = ('date', 'score')
    visited = []
    craw_links(r'http://www.xxxx.edu.cn/', 1, focus, visited)