DoubanAnalyse.py
# coding:utf-8
import warnings
warnings.filterwarnings("ignore")
import re
import jieba                     # Chinese word segmentation
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from urllib import request
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud  # word-cloud rendering
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
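# The original does not pin any versions; a plausible environment (an
# assumption, not taken from the source) can be set up with:
#   pip install jieba pandas matplotlib beautifulsoup4 wordcloud
# The code below also assumes "simhei.ttf" (a CJK font) and "stopwords.txt"
# are present in the working directory.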
# Parse one page of the Douban Top 250 list (25 movies per page).
def getTopMovie_list(start):
    # '%d' % ... converts the int offset to a string
    resp = request.urlopen('https://movie.douban.com/top250?start=' + '%d' % (start * 25))
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('ol', class_='grid_view')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li')
    nowplaying_list = []
    pattern = re.compile(r'\d+')  # compiled regex: pull the numeric subject id out of the URL
    for item in nowplaying_movie_list:
        movie_info = item.find_all('div', class_='hd')
        movie_atag = movie_info[0].find_all('a')
        movie_url = movie_atag[0]['href']
        nowplaying_dict = {}
        nowplaying_dict['id'] = pattern.findall(movie_url)[0]
        span_list = movie_atag[0].find_all('span')
        nowplaying_dict['name'] = span_list[0].string  # .string extracts the tag's text
        nowplaying_list.append(nowplaying_dict)
    print(nowplaying_list)
    return nowplaying_list
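# A minimal usage sketch; the sample output is illustrative, not scraped:
#   movies = getTopMovie_list(0)   # first page, offset 0
#   print(movies[0])               # e.g. {'id': '1292052', 'name': '肖申克的救赎'}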
# Fetch one page (20 items) of short comments for a given movie id.
def getCommentsById(movieId, pageNum):
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20
    else:
        return eachCommentList  # invalid page number: nothing to fetch
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    resp = request.urlopen(requrl)
    if resp.status != 200:
        return eachCommentList
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comment_div_list = soup.find_all('div', class_='comment')
    for item in comment_div_list:
        if item.find_all('p')[0].string is not None:
            eachCommentList.append(item.find_all('p')[0].string)
    return eachCommentList
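# Douban may reject the default urllib User-Agent. If urlopen starts failing
# with HTTP errors, a browser-like header can be supplied instead (the UA
# string below is a placeholder, not from the original):
#   req = request.Request(requrl, headers={'User-Agent': 'Mozilla/5.0'})
#   resp = request.urlopen(req)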
def main():
    # 10 pages x 25 movies = all 250 titles
    for top in range(10):
        NowPlayingMovie_list = getTopMovie_list(top)
        # iterate over the 25 movies on this page
        for index in range(25):
            print('%d' % top + " " + '%d' % index)
            flag = 0
            # hard-coded skip of two entries on the first page (no reason
            # given in the original)
            if top == 0 and (index == 22 or index == 23):
                continue
            commentList = []
            name = NowPlayingMovie_list[index]['name']
            for i in range(10):
                num = i + 1
                commentList_temp = getCommentsById(NowPlayingMovie_list[index]['id'], num)  # the movie's id
                if not commentList_temp:
                    print("Error: Page " + '%d' % (top + 1) + " Item " + '%d' % (index + 1))
                    flag = 1
                    break
                commentList.append(commentList_temp)
            if flag == 1:
                continue
            # flatten the collected pages into one string
            comments = ''
            for k in range(len(commentList)):
                comments = comments + str(commentList[k]).strip()
            # keep only Chinese characters (this drops punctuation, Latin
            # letters and digits in one pass)
            pattern = re.compile(r'[\u4e00-\u9fa5]+')
            filterdata = re.findall(pattern, comments)
            cleaned_comments = ''.join(filterdata)
            # Chinese word segmentation with jieba
            segment = jieba.lcut(cleaned_comments)
            words_df = pd.DataFrame({'segment': segment})
            # drop stop words (quoting=3 disables quote handling entirely)
            stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                                    names=['stopword'], encoding='utf-8')
            words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
            # word-frequency statistics
            words_stat = words_df.groupby('segment').size().reset_index(name='count')
            words_stat = words_stat.sort_values(by='count', ascending=False)
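            # Optional sanity check, left commented so the batch run stays quiet:
            # print(words_stat.head(10))  # the ten most frequent tokens for this movie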
            # render the word cloud (simhei.ttf supplies the CJK glyphs)
            wordcloud = WordCloud(font_path="simhei.ttf", width=800, height=400,
                                  background_color="white", max_font_size=120)
            word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
            wordcloud.fit_words(word_frequence)
            # plt.imshow(wordcloud, interpolation="bilinear")
            wordcloud.to_file(name + ".jpg")
            print(name + " get")
# entry point
if __name__ == '__main__':
    main()