05-mongodb-lagou.py
# -*- coding:utf-8 -*-
import time
import uuid

import requests
from pymongo import MongoClient
from fake_useragent import UserAgent
# Task 1: Set up the MongoDB connection (local mongod on the default port)
client = MongoClient()
db = client.lagou
my_set = db.set
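
# Optional sanity check (a sketch, not part of the original flow): MongoClient() above
# connects lazily, so uncommenting the standard 'ping' admin command below fails fast
# with ServerSelectionTimeoutError if no local mongod is reachable.
# client.admin.command('ping')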
# Task 2: Get the first page of data from lagou.com
# Opening the URL below directly in a browser only returns:
# {
#     "success": false,
#     "msg": "您操作太频繁,请稍后再访问",
#     "clientIp": "71.198.44.51"
# }
# (The "msg" means: "You are operating too frequently, please try again later.")
# Later I figured out that the request needs User-Agent and Referer headers.
url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%88%90%E9%83%BD&needAddtionalResult=false"
# Note: the Referer URL contains parameters (city, search keyword) you might want to customize.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_Python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=',
}
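
# Side note (a sketch, not used below): the city in the URL and Referer above is the
# percent-encoded UTF-8 for "成都" (Chengdu). To target another city, the query string
# could be rebuilt with the standard library, e.g.:
# from urllib.parse import quote
# url = ("https://www.lagou.com/jobs/positionAjax.json?city=" + quote("北京")
#        + "&needAddtionalResult=false")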
payload = {
    'first': 'true',
    'pn': '1',  # Page number; totalCount in the JSON tells how many pages there are (see sketch below).
    'kd': 'Python',  # Keyword
}
response = requests.post(url, headers=headers, data=payload)
# my_set.insert_one(response.json()['content']['positionResult'])
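
# The page math referred to above, as a sketch (assumes the first request succeeded and
# that lagou returns 15 results per page, which the loop below also assumes):
# import math
# total_count = response.json()['content']['positionResult']['totalCount']
# total_pages = math.ceil(total_count / 15)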

# Task 3: Pull all pages and save them into MongoDB
def get_uuid():
    return str(uuid.uuid4())
i = 1
remain = True
while remain:
    # Tip: rotating the User-Agent with fake_useragent helps avoid the
    # "您操作太频繁,请稍后再访问" ("operating too frequently") response.
    # Tip: hitting the same URL too many times can also raise:
    #   requests.exceptions.SSLError: HTTPSConnectionPool(host='www.lagou.com', port=443): Max retries exceeded with url: /jobs/positionAjax.json?city=%E6%88%90%E9%83%BD&needAddtionalResult=false (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)')))
    # Even with the fake cookie + random User-Agent, skipping time.sleep(5) still triggers the
    # "operating too frequently" block. With both I hit it at page 29; with only fake_useragent, at page 5.
    time.sleep(5)
    ua = UserAgent()
    headers['User-Agent'] = ua.random
    cookie = "JSESSIONID=" + get_uuid() + ";" \
             "user_trace_token=" + get_uuid() + "; LGUID=" + get_uuid() + ";" \
             "SEARCH_ID=" + get_uuid() + '; _gid=GA1.2.717841549.1514043316; ' \
             '_ga=GA1.2.952298646.1514043316; ' \
             'LGSID=' + get_uuid() + "; " \
             "LGRID=" + get_uuid() + "; "
    headers['cookie'] = cookie
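
    # An equivalent, arguably more readable way to build the same fake cookie
    # (a sketch using the same keys as above, not a different technique):
    # cookie_parts = {
    #     'JSESSIONID': get_uuid(), 'user_trace_token': get_uuid(), 'LGUID': get_uuid(),
    #     'SEARCH_ID': get_uuid(), '_gid': 'GA1.2.717841549.1514043316',
    #     '_ga': 'GA1.2.952298646.1514043316', 'LGSID': get_uuid(), 'LGRID': get_uuid(),
    # }
    # headers['cookie'] = '; '.join(k + '=' + v for k, v in cookie_parts.items())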
    print('UA:' + str(headers['User-Agent']))
    response = requests.post(url, headers=headers, data=payload)
    if response.status_code != 200 or not response.json()['success']:
        print(response.json())
        print("Something went wrong!")
        break
    print("Total: " + str(response.json()['content']['positionResult']['totalCount']))
    print("Left: " + str(response.json()['content']['positionResult']['totalCount'] - i * 15))
print("============================== This is page: [" + str(i) + "] ============================== ")
print("============================== This is page: [" + str(i) + "] ============================== ")
print("============================== This is page: [" + str(i) + "] ============================== ")
    # print(response.json())
    count_results = len(response.json()['content']['positionResult']['result'])
    # If we got results, save them.
    if count_results > 0:
        my_set.insert_one(response.json()['content']['positionResult'])
    # If there are still pages left, move on to the next one.
    if response.json()['content']['positionResult']['totalCount'] - i * 15 > 0:
        i = i + 1
        payload = {
            'first': 'true',
            'pn': i,  # Page number, derived from totalCount as described above.
            'kd': 'Python',  # Keyword
        }
    else:
        remain = False
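
# A small follow-up check (assumption: pymongo >= 3.7, which provides count_documents):
# it only reports how many page documents the loop above stored in lagou.set.
print("Pages saved: " + str(my_set.count_documents({})))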