-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhypeauditor_scraper.py
89 lines (66 loc) · 3.22 KB
/
hypeauditor_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm
import pandas as pd
import time
class HypeAuditorScraper:
    """Collect Instagram usernames from a HypeAuditor "top Instagram" ranking.

    The scraper drives a Chrome browser through Selenium: it signs in to a
    Google account, uses that account to log in to HypeAuditor, then walks
    the ranking pages and gathers the text of every username element.

    NOTE(review): every locator below is a hard-coded XPath / class name and
    every wait is a fixed sleep, so the scraper depends on the current page
    layouts and on the pages loading within those delays.
    """

    # Column name of the DataFrame returned by scrape().
    _USERNAME_COLUMN = "IG Username"

    # public:
    def scrape(self, gmail, password, hongkong, pages=20):
        """Log in, walk the ranking pages and return the usernames found.

        Approach: initialize browser -> login google account ->
        login hypeauditor -> (parse HTML -> next page) * ``pages`` ->
        store everything in one pandas DataFrame.

        Args:
            gmail: Google account e-mail typed into the Google sign-in form.
            password: Password of that Google account.
            hongkong: Truthy to scrape the Hong Kong ranking, falsy to scrape
                the India ranking.
            pages: Number of ranking pages to read. Defaults to 20, which the
                original author notes yields 1000 results (50 per page).

        Returns:
            A DataFrame with a single "IG Username" column holding the text
            of each username element, in page order, with a fresh 0..n-1
            index.

        Raises:
            Whatever Selenium raises when an element cannot be located or the
            browser fails. The browser is closed before the error propagates.
        """
        # change -india to any country you wanna test out e.g. -united-states
        if hongkong:
            url = "https://hypeauditor.com/top-instagram-all-hong-kong/"
        else:
            url = "https://hypeauditor.com/top-instagram-all-india/"

        self._initializeDriver()
        frames = []
        try:
            self._googleLogin(gmail, password)
            self._hypeauditorLogin()
            self.driver.get(url)
            # The implicit wait is a session-wide setting, so setting it once
            # here covers every later element lookup on the ranking pages.
            self.driver.implicitly_wait(1)
            for page in tqdm(range(pages)):
                frames.append(self._fetchData())
                if page < pages - 1:  # no "next" click after the last page
                    self._nextPage()
        finally:
            # Always release the browser, including when login or paging
            # fails part-way through; otherwise the Chrome process leaks.
            self.driver.quit()

        if not frames:
            # pd.concat rejects an empty list; keep the result shape stable.
            return pd.DataFrame(columns=[self._USERNAME_COLUMN])
        # Concatenate once at the end instead of re-copying on every page.
        return pd.concat(frames, axis=0, ignore_index=True)

    # private:
    def _initializeDriver(self):
        """Start Chrome with image loading disabled and store it as self.driver."""
        chrome_options = webdriver.ChromeOptions()
        # disable the images to speed up
        chrome_options.add_argument('--blink-settings=imagesEnabled=false')
        self.driver = webdriver.Chrome(options=chrome_options)

    def _googleLogin(self, gmail, password):
        """Sign in to Google by typing the e-mail, then the password.

        Args:
            gmail: E-mail address sent to the first ``input`` element of the
                page, followed by ENTER.
            password: Password sent to the password field, followed by ENTER.
        """
        self.driver.get("https://www.gmail.com")
        email_element = self.driver.find_element(By.TAG_NAME, "input")
        email_element.send_keys(gmail, Keys.ENTER)  # input gmail
        # Fixed pause before looking for the password field
        # (presumably to let the password step render - TODO confirm).
        time.sleep(5)
        password_element = self.driver.find_element(
            By.XPATH, '//*[@id="password"]/div[1]/div/div[1]/input')
        password_element.send_keys(password, Keys.ENTER)  # input password

    def _hypeauditorLogin(self):
        """Log in to HypeAuditor through its "use google login" flow.

        Clicks three elements in turn, pausing between them: the Google
        login link, the account entry to log in with, and the "continue"
        button.
        """
        self.driver.get("https://hypeauditor.com/login/")
        time.sleep(1)
        self.driver.find_element(
            By.XPATH,
            '//*[@id="login-form-wrap"]/form[1]/div[1]/a').click()  # press "use google login"
        time.sleep(2)
        self.driver.find_element(
            By.XPATH,
            '//*[@id="view_container"]/div/div/div[2]/div/div[1]/div/form/span/section/div/div/div/div/ul/li[1]/div').click()  # press which account to login
        time.sleep(1)
        self.driver.find_element(
            By.XPATH,
            '//*[@id="yDmH0d"]/c-wiz/div/div[2]/div/div[2]/div/div/div[2]/div/div/button').click()  # press "continue"

    def _fetchData(self):  # 50 results per page
        """Return the usernames on the current page as a one-column DataFrame."""
        elements = self.driver.find_elements(
            By.CLASS_NAME, "contributor__name-content")
        return self._elementsToFrame(elements, self._USERNAME_COLUMN)

    @staticmethod
    def _elementsToFrame(elements, col_name):
        """Build a one-column DataFrame from the ``.text`` of each element.

        Args:
            elements: Iterable of objects exposing a ``text`` attribute, such
                as the list returned by ``find_elements``.
            col_name: Name of the resulting column.

        Returns:
            A DataFrame with one row per element, in the given order.
        """
        return pd.DataFrame(
            data=[element.text for element in elements], columns=[col_name])

    def _nextPage(self):
        """Scroll down to the "next page" button and click it."""
        time.sleep(1)
        actions = ActionChains(self.driver)
        actions.send_keys(Keys.HOME)  # ensure the page is on the topmost
        for _ in range(6):
            # page down 6 times to locate the "next page" button
            actions.send_keys(Keys.PAGE_DOWN)
        actions.perform()
        time.sleep(2)
        button = self.driver.find_element(
            By.XPATH,
            '//*[@id="__layout"]/div/div/div[2]/div/div[2]/div[1]/div[3]/button[2]/i')  # press next page
        button.click()