Skip to content

Commit

Permalink
Update eroom_finder.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
linpingta authored Jul 13, 2024
1 parent e0624e4 commit f43b2a7
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions eroom_finder.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,8 +25,8 @@ def re_match(re_pattern, string, errif=None):
return errif


def get_house_info(start_url, sess):
html = sess.get(start_url).text
def get_house_info(start_url, sess, headers):
html = sess.get(start_url, headers=headers, allow_redirects=True, timeout=3).text
house_num = re.findall('共找到<span> (.*?) </span>套.*二手房', html)[0].strip()
return house_num

Expand Down Expand Up @@ -58,20 +58,20 @@ def get_info_dic(info, area, city_name):
return info_dic


def crawl_data(sess, real_dict, city_name):
def crawl_data(sess, real_dict, city_name, headers):
total_num = 0
err_num = 0
data_info_list = []
url = 'https://%s.lianjia.com/ershoufang/{}/pg{}/' % city_name

for key_, value_ in real_dict.items():
start_url = ('https://%s.lianjia.com/ershoufang/{}/' % city_name).format(value_)
house_num = get_house_info(start_url, sess)
house_num = get_house_info(start_url, sess, headers)
print('{}: 二手房源共计「{}」套'.format(key_, house_num))
time.sleep(2)
total_page = int(math.ceil(min(3000, int(house_num)) / 30.0))
for i in tqdm(range(total_page), desc=key_):
html = sess.get(url.format(value_, i+1)).text
html = sess.get(url.format(value_, i+1), headers=headers, allow_redirects=True, timeout=3).text
soup = BeautifulSoup(html, 'lxml')
info_collect = soup.find_all(class_="info clear")
for info in info_collect:
Expand Down Expand Up @@ -143,7 +143,7 @@ def main():
if args.area_name == 'small':
real_dict = area_dic_small

data_info_list = crawl_data(sess, real_dict, city_name)
data_info_list = crawl_data(sess, real_dict, city_name, headers)
data = pd.DataFrame(data_info_list)
data.to_csv("eroom_time__%s_detail__%s__area_%s.csv" % (datetime.datetime.now().strftime('%Y%m%d'), int(time.time()), len(area_dic.values())), encoding='utf-8-sig')

Expand Down

0 comments on commit f43b2a7

Please sign in to comment.