-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproxies_scrapy.py
87 lines (62 loc) · 2.42 KB
/
proxies_scrapy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import time
import logging
import os
import json
import requests
import re
from urllib.parse import urljoin
from environs import Env
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from tqdm import tqdm
# Root-logger setup: prefix every record with the PID and level; INFO and up.
logging.basicConfig(format="%(process)d %(levelname)s %(message)s", level=logging.INFO)

# Load configuration from the environment / a .env file.
# The script reads PROXY_FILE and PATH_TO_DRIVER below.
env = Env()
env.read_env()

# Browser-like request headers sent with the initial requests.get call.
# NOTE(review): presumably to avoid being served a bot page by spys.one —
# the headers mimic Chrome 61 on Windows 10; confirm they are still needed.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}
def get_countries_url(response, proxies_urls):
    """Parse the country-list page and map country names to country page URLs.

    Parameters
    ----------
    response : str
        HTML of the country-list page (``response.text`` from requests).
    proxies_urls : str
        Base URL the per-country path segments are joined onto.

    Returns
    -------
    dict
        Country name (first word of the link text) -> country page URL
        ending with ``/``. Links with no country code are skipped.
    """
    soup = bs(response, 'lxml')
    links_tag = soup.find_all(href=re.compile('proxys'))
    country_links = {}
    for link_tag in links_tag:
        # BUG FIX: the original used href.strip('/proxys/'), which strips any
        # of the CHARACTERS {'/', 'p', 'r', 'o', 'x', 'y', 's'} from both ends
        # (str.strip takes a character set, not a prefix), corrupting codes
        # whose edge characters fall in that set. Extract the path segment
        # after 'proxys/' explicitly instead.
        match = re.search(r'proxys/([^/]+)', link_tag['href'])
        country_code = match.group(1) if match else ''
        full_path = urljoin(proxies_urls, country_code)
        country = (link_tag.text).split(' ')[0]
        if country_code != '':
            country_links[country] = f'{full_path}/'
    return country_links
def get_proxies(countries_proxies_url, driver):
    """Scrape proxy addresses from each country page via a Selenium driver.

    Parameters
    ----------
    countries_proxies_url : dict
        Country name -> country page URL (as built by get_countries_url).
    driver : selenium webdriver
        An already-constructed driver; it is quit before returning.

    Returns
    -------
    dict
        Country name -> list of proxy strings read from the page table.
        Countries whose page fails to scrape are logged and omitted.
    """
    proxies_countries = {}
    for country in tqdm(countries_proxies_url):
        driver.get(countries_proxies_url[country])
        try:
            # 'xpp' is the results-per-page <select>; value '5' selects the
            # largest page size — TODO confirm against the live site markup.
            select = Select(driver.find_element_by_name('xpp'))
            select.select_by_value('5')
            # Crude fixed wait for the page to re-render after the select
            # change; there is no explicit-wait condition here.
            time.sleep(3)
            # First cell of each highlighted table row holds the proxy text.
            # NOTE(review): find_element(s)_by_* is the Selenium 3 API and was
            # removed in Selenium 4 — migrate to find_element(By.NAME, ...)
            # if the selenium dependency is upgraded.
            proxies = driver.find_elements_by_xpath('//tr[@onmouseover]/td[1]')
            proxies_countries[country] = [proxy.text for proxy in proxies]
        except Exception as err:
            # Best-effort: log the failure for this country and continue.
            logging.exception(err)
    driver.quit()
    return proxies_countries
def save_to_json(filename, proxies):
    """Serialize *proxies* to *filename* as JSON.

    Parameters
    ----------
    filename : str
        Destination path; overwritten if it already exists.
    proxies : dict
        Country name -> list of proxy strings.
    """
    # Explicit UTF-8 so output does not depend on the platform locale;
    # ensure_ascii=False keeps non-ASCII country names readable in the file.
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(proxies, file, ensure_ascii=False)
    # Use the module's configured logger instead of a bare print for
    # consistent, timestamp/PID-prefixed output.
    logging.info('Done...')
def main():
    """Entry point: fetch the country list, scrape proxies, save them to JSON.

    Reads PROXY_FILE (output path) and PATH_TO_DRIVER (chromedriver path)
    from the environment via `env`. Raises requests.HTTPError if the initial
    country-list request fails.
    """
    countries = 'http://spys.one/proxys/'
    free_proxies_url = 'http://spys.one/free-proxy-list/'
    proxies_filename = env('PROXY_FILE')
    response = requests.get(countries, headers=HEADERS)
    response.raise_for_status()
    # BUG FIX: the original passed `free_proxies_urls` (trailing 's'), an
    # undefined name — the script crashed with NameError on every run.
    countries_urls = get_countries_url(response.text, free_proxies_url)
    # get_proxies() quits the driver itself before returning.
    driver = webdriver.Chrome(executable_path=env('PATH_TO_DRIVER'))
    proxies = get_proxies(countries_urls, driver)
    save_to_json(proxies_filename, proxies)


if __name__ == "__main__":
    main()