forked from liuhuanyong/BaikeInfoExtraction
sogoubaike.py
#!/usr/bin/env python3
# coding: utf-8
# File: sogoubaike.py
# Author: lhy
# Date: 18-3-8
from urllib import parse, request

from lxml import etree


class SougouBaike:
    def __init__(self):
        pass

    def get_html(self, url):
        # Download a page and strip the &nbsp; entities that clutter extracted values.
        return request.urlopen(url).read().decode('utf-8').replace('&nbsp;', '')
    def find_sogouid(self, word):
        # Search Sogou Baike for the word and return the URL of its entry page.
        url = "http://baike.sogou.com/Search.e?sp=S%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        page_id = selector.xpath('//h2/a/@href')[0].split(';')[0]
        info_url = "http://baike.sogou.com/%s" % page_id
        return info_url
    def info_extract_sogou(self, word):  # Sogou Baike
        # Collect the infobox of the main entry plus those of any other senses of the word.
        info_url = self.find_sogouid(word)
        selector = etree.HTML(self.get_html(info_url))
        info_list = list()
        # extract_sogou already fills 'current_semantic' for the current page
        info_data = self.extract_sogou(selector)
        info_list.append(info_data)
        polysemantics = self.checksogou_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        # keep only entries that carry infobox attributes beyond 'tags' and 'current_semantic'
        infos = [info for info in info_list if len(info) > 2]
        return infos
    def extract_sogou(self, selector):
        # Parse one entry page: collect its tags, the current sense and the infobox table.
        info_data = {}
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//div[@class="relevant_wrap"]/a/text()')]
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace(' ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        tables = selector.xpath('//table[@class="abstract_list"]')
        for table in tables:
            # each infobox row pairs an attribute name (th) with its value (td)
            attributes = table.xpath('./tbody/tr/th/text()')
            values = [td.xpath('string(.)') for td in table.xpath('./tbody/tr/td')]
            for item in zip(attributes, values):
                info_data[item[0].replace(' ', '').replace('\xa0', '')] = item[1].replace(' ', '')
        return info_data
    def checksogou_polysemantic(self, selector):
        # Follow the links in the "other senses" list and extract each of their infoboxes.
        semantics = ['http://baike.sogou.com' + sem.split('?')[0] for sem in selector.xpath("//ol[@class='semantic_item_list']/li/a/@href")]
        names = selector.xpath("//ol[@class='semantic_item_list']/li/a/text()")
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_sogou(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list
'''Testing'''
'''
if __name__ == "__main__":
    baikeinfo = SougouBaike()
    while True:
        word = input('enter a word: ')
        baikeinfo.info_extract_sogou(word)
'''
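# A minimal usage sketch (an illustration only: it assumes network access to
# baike.sogou.com and that the page markup targeted above is unchanged; the
# query word is an arbitrary example):
#
# if __name__ == "__main__":
#     extractor = SougouBaike()
#     for info in extractor.info_extract_sogou('人工智能'):
#         # each item is a dict with 'tags', 'current_semantic' and the
#         # attribute/value pairs scraped from the entry's infobox table
#         print(info)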