From aa23e0d88c59ecceaf6c5ff25d5ea8b2c4a32825 Mon Sep 17 00:00:00 2001 From: "j.yao.SUSE" Date: Tue, 2 Feb 2021 11:51:32 +0800 Subject: [PATCH 01/11] Create ip89.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit www.89ip.cn 免费代理 --- proxypool/crawlers/public/ip89.py | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 proxypool/crawlers/public/ip89.py diff --git a/proxypool/crawlers/public/ip89.py b/proxypool/crawlers/public/ip89.py new file mode 100644 index 00000000..691f6781 --- /dev/null +++ b/proxypool/crawlers/public/ip89.py @@ -0,0 +1,34 @@ +from pyquery import PyQuery as pq +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re + +MAX_NUM = 9999 +BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM) + + +class Daili66Crawler(BaseCrawler): + """ + 89ip crawler, http://api.89ip.cn + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + ip_address = re.compile('([\d:\.]*)
') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = Daili66Crawler() + for proxy in crawler.crawl(): + print(proxy) From 5a18288233b0f62c78b3cf78e8b1d80817b730d3 Mon Sep 17 00:00:00 2001 From: "j.yao.SUSE" Date: Tue, 2 Feb 2021 11:54:37 +0800 Subject: [PATCH 02/11] Update ip89.py update Class name --- proxypool/crawlers/public/ip89.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/proxypool/crawlers/public/ip89.py b/proxypool/crawlers/public/ip89.py index 691f6781..f67c3870 100644 --- a/proxypool/crawlers/public/ip89.py +++ b/proxypool/crawlers/public/ip89.py @@ -1,4 +1,3 @@ -from pyquery import PyQuery as pq from proxypool.schemas.proxy import Proxy from proxypool.crawlers.base import BaseCrawler import re @@ -7,7 +6,7 @@ BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM) -class Daili66Crawler(BaseCrawler): +class Ip89Crawler(BaseCrawler): """ 89ip crawler, http://api.89ip.cn """ @@ -29,6 +28,6 @@ def parse(self, html): if __name__ == '__main__': - crawler = Daili66Crawler() + crawler = Ip89Crawler() for proxy in crawler.crawl(): print(proxy) From 380c9d4acf584df21f6d8f0d5af2f87c3d0a4e47 Mon Sep 17 00:00:00 2001 From: "j.yao.SUSE" Date: Tue, 2 Feb 2021 16:13:47 +0800 Subject: [PATCH 03/11] Create fatezero_proxylist.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加 http://proxylist.fatezero.org/ 代理 --- .../crawlers/public/fatezero_proxylist.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 proxypool/crawlers/public/fatezero_proxylist.py diff --git a/proxypool/crawlers/public/fatezero_proxylist.py b/proxypool/crawlers/public/fatezero_proxylist.py new file mode 100644 index 00000000..8a7d6e27 --- /dev/null +++ b/proxypool/crawlers/public/fatezero_proxylist.py @@ -0,0 +1,32 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +import json +BASE_URL = 'http://proxylist.fatezero.org/proxy.list' + + +class FatezeroCrawler(BaseCrawler): + """ + Fatezero crawler,http://proxylist.fatezero.org + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + hosts_ports = html.split('\n') + for addr in hosts_ports: + ip_address = json.loads(addr) + if(True): + host = ip_address['host'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = FatezeroCrawler() + for proxy in crawler.crawl(): + print(proxy) From f56e74778ca1bef138673075fde9b7f3b90e8ee9 Mon Sep 17 00:00:00 2001 From: "j.yao.SUSE" Date: Wed, 3 Feb 2021 12:57:03 +0800 Subject: [PATCH 04/11] Create ihuan.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i幻 代理 --- proxypool/crawlers/public/ihuan.py | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 proxypool/crawlers/public/ihuan.py diff --git a/proxypool/crawlers/public/ihuan.py b/proxypool/crawlers/public/ihuan.py new file mode 100644 index 00000000..7386b705 --- /dev/null +++ b/proxypool/crawlers/public/ihuan.py @@ -0,0 +1,34 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'https://ip.ihuan.me/today/{path}.html' + + +class IhuanCrawler(BaseCrawler): + """ + ip ihuan crawler, https://ip.ihuan.me + """ + urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + # doc = pq(html)('.text-left') + ip_address = re.compile('([\d:\.]*).*?
') + hosts_ports = ip_address.findall(html) + for addr in hosts_ports: + addr_split = addr.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = IhuanCrawler() + for proxy in crawler.crawl(): + print(proxy) From afc3cb73edbdd6466646cdbbef72a2ac97b114e2 Mon Sep 17 00:00:00 2001 From: jy Date: Thu, 4 Feb 2021 14:48:18 +0800 Subject: [PATCH 05/11] update example usage2 --- Dockerfile | 2 +- docker-compose.yml | 2 +- examples/usage2.py | 93 ++++++++++++++++++++++++++++++++++ proxypool/processors/server.py | 15 ++++++ proxypool/setting.py | 10 ++-- 5 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 examples/usage2.py diff --git a/Dockerfile b/Dockerfile index dab1227e..2e5448a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.6 WORKDIR /app COPY . . -RUN pip install -r requirements.txt +RUN pip install -r requirements.txt -i https://pypi.douban.com/simple VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/docker-compose.yml b/docker-compose.yml index c39ded37..d9615cd8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ services: container_name: redis4proxypool command: redis-server ports: - - "6379:6379" + - "6378:6379" # restart: always proxypool: build: . diff --git a/examples/usage2.py b/examples/usage2.py new file mode 100644 index 00000000..b81902a1 --- /dev/null +++ b/examples/usage2.py @@ -0,0 +1,93 @@ +# -*- coding: UTF-8 -*- + +''' +''' +import requests +import time +import threading +import urllib3 +from fake_headers import Headers +import uuid +from geolite2 import geolite2 +ips = [] + +# 爬数据的线程类 + +def getChinaIP(ip='127.0.0.1'): + reader = geolite2.reader() + ip_info = reader.get(ip) + geolite2.close() + print(ip_info) + return True if ip_info['country']['iso_code'] == 'CN' else False + + + +class CrawlThread(threading.Thread): + def __init__(self, proxyip): + super(CrawlThread, self).__init__() + self.proxyip = proxyip + + def run(self): + # 开始计时 + + # 验证IP归属 + if not getChinaIP(self.proxyip.split(':')[0]): + raise ValueError('不是有效IP') + # + start = time.time() + # 消除关闭证书验证的警告 + urllib3.disable_warnings() + headers = Headers(headers=True).generate() + headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' + headers['Pragma'] = 'no-cache' + headers['Host'] = 'bb.cf08tp.cn' + headers['Cookie'] = 'PHPSESSID={}'.format( + ''.join(str(uuid.uuid1()).split('-'))) + print(headers) + html = requests.get(headers=headers, url=targetUrl, proxies={ + "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=10).content.decode() + # 结束计时 + end = time.time() + # 输出内容 + print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) + + "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************") + +# 获取代理IP的线程类 + + +class GetIpThread(threading.Thread): + def __init__(self, fetchSecond): + super(GetIpThread, self).__init__() + self.fetchSecond = fetchSecond + + def run(self): + global ips + while True: + # 获取IP列表 + res = requests.get(apiUrl).content.decode() + # 按照\n分割获取到的IP + ips = res.split('\n') + # 利用每一个IP + for proxyip in ips: + if proxyip.strip(): + # 开启一个线程 + # CrawlThread(proxyip).start() + try: + CrawlThread(proxyip).run() + time.sleep(3) + except Exception as e: + print(e) + # 休眠 + time.sleep(self.fetchSecond) + + +if __name__ == '__main__': + # 获取IP的API接口 + # apiUrl = "http://127.0.0.1:5556/all" + apiUrl = "http://127.0.0.1:5555/random" + # 要抓取的目标网站地址 + targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp=' + fetchSecond = 5 + # 开始自动获取IP + GetIpThread(fetchSecond).start() diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index e87f82f5..d3edd70d 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -37,6 +37,21 @@ def get_proxy(): return conn.random().string() +@app.route('/all') +def get_proxy_all(): + """ + get a random proxy + :return: get a random proxy + """ + conn = get_conn() + proxies = conn.all() + proxies_string = '' + for proxy in proxies: + proxies_string += str(proxy) + '\n' + + return proxies_string + + @app.route('/count') def get_count(): """ diff --git a/proxypool/setting.py b/proxypool/setting.py index 7c3008b3..8f3093cd 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -26,7 +26,7 @@ # redis host REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1') # redis port -REDIS_PORT = env.int('REDIS_PORT', 6379) +REDIS_PORT = env.int('REDIS_PORT', 6378) # redis password, if no password, set it to None REDIS_PASSWORD = env.str('REDIS_PASSWORD', None) # redis db, if no choice, set it to 0 @@ -42,7 +42,7 @@ # definition of proxy scores PROXY_SCORE_MAX = 100 -PROXY_SCORE_MIN = 0 +PROXY_SCORE_MIN = 15 PROXY_SCORE_INIT = 10 # definition of proxy number @@ -57,8 +57,8 @@ # definition of tester TEST_URL = env.str('TEST_URL', 'http://www.baidu.com') -TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10) -TEST_BATCH = env.int('TEST_BATCH', 20) +TEST_TIMEOUT = env.int('TEST_TIMEOUT', 8) +TEST_BATCH = env.int('TEST_BATCH', 200) # only save anonymous proxy TEST_ANONYMOUS = True # TEST_HEADERS = env.json('TEST_HEADERS', { @@ -68,7 +68,7 @@ # definition of api API_HOST = env.str('API_HOST', '0.0.0.0') -API_PORT = env.int('API_PORT', 5555) +API_PORT = env.int('API_PORT', 5556) API_THREADED = env.bool('API_THREADED', True) # flags of enable From e144f1afd56bedfa01dbab2a0d32b2f64e05ff20 Mon Sep 17 00:00:00 2001 From: jy Date: Thu, 4 Feb 2021 15:27:10 +0800 Subject: [PATCH 06/11] update requirements.txt --- examples/usage2.py | 2 +- proxypool/setting.py | 2 +- requirements.txt | 22 ++++++++++++---------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/usage2.py b/examples/usage2.py index b81902a1..5f611005 100644 --- a/examples/usage2.py +++ b/examples/usage2.py @@ -45,7 +45,7 @@ def run(self): ''.join(str(uuid.uuid1()).split('-'))) print(headers) html = requests.get(headers=headers, url=targetUrl, proxies={ - "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=10).content.decode() + "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=8).content.decode() # 结束计时 end = time.time() # 输出内容 diff --git a/proxypool/setting.py b/proxypool/setting.py index 8f3093cd..39773597 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -42,7 +42,7 @@ # definition of proxy scores PROXY_SCORE_MAX = 100 -PROXY_SCORE_MIN = 15 +PROXY_SCORE_MIN = 10 PROXY_SCORE_INIT = 10 # definition of proxy number diff --git a/requirements.txt b/requirements.txt index cc0b6111..a3e728e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,13 @@ -environs==7.2.0 -Flask==1.0.3 -attrs==19.1.0 +environs==9.3.0 +Flask==1.1.2 +attrs==20.3.0 retrying==1.3.3 -aiohttp==3.6.2 -requests==2.22.0 -loguru==0.3.2 -pyquery==1.4.0 -supervisor==4.1.0 -redis==2.10.6 -lxml==4.3.3 \ No newline at end of file +aiohttp==3.7.3 +requests==2.25.1 +loguru==0.5.3 +pyquery==1.4.3 +supervisor==4.2.1 +redis==3.5.3 +lxml==4.6.2 +fake_headers==1.0.2 +maxminddb_geolite2==2018.703 From 3aea38c649f97d6a3bf868420359807b1c1471b1 Mon Sep 17 00:00:00 2001 From: jy Date: Sat, 6 Feb 2021 00:02:43 +0800 Subject: [PATCH 07/11] =?UTF-8?q?=E4=BC=98=E5=8C=96=20public=20crawlers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.yml | 2 +- examples/usage2.py | 14 +++--- proxypool/crawlers/base.py | 9 ++-- proxypool/crawlers/public/daili66.py | 2 +- .../crawlers/public/fatezero_proxylist.py | 4 +- proxypool/crawlers/public/goubanjia.py | 44 +++++++++++++++++++ proxypool/crawlers/public/ip3366.py | 6 +-- proxypool/crawlers/public/kuaidaili.py | 6 +-- proxypool/crawlers/public/zhandaye.py | 2 +- proxypool/setting.py | 4 +- proxypool/storages/redis.py | 4 +- 11 files changed, 73 insertions(+), 24 deletions(-) create mode 100644 proxypool/crawlers/public/goubanjia.py diff --git a/docker-compose.yml b/docker-compose.yml index d9615cd8..20d881ab 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: command: redis-server ports: - "6378:6379" - # restart: always + restart: always proxypool: build: . image: 'germey/proxypool' diff --git a/examples/usage2.py b/examples/usage2.py index 5f611005..918c5eb2 100644 --- a/examples/usage2.py +++ b/examples/usage2.py @@ -29,9 +29,10 @@ def __init__(self, proxyip): def run(self): # 开始计时 - + pure_ip_address = self.proxyip.split(':')[0] # 验证IP归属 - if not getChinaIP(self.proxyip.split(':')[0]): + if not getChinaIP(pure_ip_address): + # pass raise ValueError('不是有效IP') # start = time.time() @@ -41,11 +42,12 @@ def run(self): headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' headers['Pragma'] = 'no-cache' headers['Host'] = 'bb.cf08tp.cn' + headers['x-forward-for'] = pure_ip_address headers['Cookie'] = 'PHPSESSID={}'.format( ''.join(str(uuid.uuid1()).split('-'))) print(headers) html = requests.get(headers=headers, url=targetUrl, proxies={ - "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=8).content.decode() + "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode() # 结束计时 end = time.time() # 输出内容 @@ -74,16 +76,16 @@ def run(self): # CrawlThread(proxyip).start() try: CrawlThread(proxyip).run() - time.sleep(3) + time.sleep(1.5) except Exception as e: print(e) # 休眠 - time.sleep(self.fetchSecond) + time.sleep(len(ips) /self.fetchSecond ) if __name__ == '__main__': # 获取IP的API接口 - # apiUrl = "http://127.0.0.1:5556/all" + # apiUrl = "http://127.0.0.1:5555/all" apiUrl = "http://127.0.0.1:5555/random" # 要抓取的目标网站地址 targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" diff --git a/proxypool/crawlers/base.py b/proxypool/crawlers/base.py index aa35430e..563d49bb 100644 --- a/proxypool/crawlers/base.py +++ b/proxypool/crawlers/base.py @@ -2,17 +2,19 @@ import requests from loguru import logger from proxypool.setting import GET_TIMEOUT - - +from fake_headers import Headers +import time class BaseCrawler(object): urls = [] @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000) def fetch(self, url, **kwargs): try: + headers = Headers(headers=True).generate() kwargs.setdefault('timeout', GET_TIMEOUT) kwargs.setdefault('verify', False) - response = requests.get(url, **kwargs) + kwargs.setdefault('headers', headers) + response = requests.get(url ,**kwargs) if response.status_code == 200: response.encoding = 'utf-8' return response.text @@ -27,6 +29,7 @@ def crawl(self): for url in self.urls: logger.info(f'fetching {url}') html = self.fetch(url) + time.sleep(.5) for proxy in self.parse(html): logger.info(f'fetched proxy {proxy.string()} from {url}') yield proxy diff --git a/proxypool/crawlers/public/daili66.py b/proxypool/crawlers/public/daili66.py index 09a3ee45..6f8c7e61 100644 --- a/proxypool/crawlers/public/daili66.py +++ b/proxypool/crawlers/public/daili66.py @@ -4,7 +4,7 @@ BASE_URL = 'http://www.66ip.cn/{page}.html' -MAX_PAGE = 5 +MAX_PAGE = 500 class Daili66Crawler(BaseCrawler): diff --git a/proxypool/crawlers/public/fatezero_proxylist.py b/proxypool/crawlers/public/fatezero_proxylist.py index 8a7d6e27..3b99dea9 100644 --- a/proxypool/crawlers/public/fatezero_proxylist.py +++ b/proxypool/crawlers/public/fatezero_proxylist.py @@ -19,8 +19,8 @@ def parse(self, html): hosts_ports = html.split('\n') for addr in hosts_ports: - ip_address = json.loads(addr) - if(True): + if(addr): + ip_address = json.loads(addr) host = ip_address['host'] port = ip_address['port'] yield Proxy(host=host, port=port) diff --git a/proxypool/crawlers/public/goubanjia.py b/proxypool/crawlers/public/goubanjia.py new file mode 100644 index 00000000..57157858 --- /dev/null +++ b/proxypool/crawlers/public/goubanjia.py @@ -0,0 +1,44 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +from pyquery import PyQuery as pq +import time +BASE_URL = 'http://www.goubanjia.com/' + + +class GoubanjiaCrawler(BaseCrawler): + """ + ip Goubanjia crawler, http://www.goubanjia.com/ + """ + urls = [BASE_URL] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html)('.ip').items() + # ''.join([*filter(lambda x: x != '',re.compile('\>([\d:\.]*)\<').findall(td.html()))]) + for td in doc: + trs = td.children() + ip_str = '' + for tr in trs: + attrib = tr.attrib + if 'style' in attrib and 'none' in tr.attrib['style']: + continue + ip_str+= '' if not tr.text else tr.text + addr_split = ip_str.split(':') + if(len(addr_split) == 2): + host = addr_split[0] + port = addr_split[1] + yield Proxy(host=host, port=port) + else: + port = trs[-1].text + host = ip_str.replace(port,'') + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = GoubanjiaCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/ip3366.py b/proxypool/crawlers/public/ip3366.py index 78d29447..474a4f77 100644 --- a/proxypool/crawlers/public/ip3366.py +++ b/proxypool/crawlers/public/ip3366.py @@ -3,15 +3,15 @@ import re -MAX_PAGE = 5 -BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}' +MAX_PAGE = 8 +BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}' class IP3366Crawler(BaseCrawler): """ ip3366 crawler, http://www.ip3366.net/ """ - urls = [BASE_URL.format(page=i) for i in range(1, 8)] + urls = [BASE_URL.format(stype=stype,page=i) for stype in range(1,3) for i in range(1, 8)] def parse(self, html): """ diff --git a/proxypool/crawlers/public/kuaidaili.py b/proxypool/crawlers/public/kuaidaili.py index f3fa6437..71ab1717 100644 --- a/proxypool/crawlers/public/kuaidaili.py +++ b/proxypool/crawlers/public/kuaidaili.py @@ -4,15 +4,15 @@ from pyquery import PyQuery as pq -BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/' -MAX_PAGE = 5 +BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/' +MAX_PAGE = 300 class KuaidailiCrawler(BaseCrawler): """ kuaidaili crawler, https://www.kuaidaili.com/ """ - urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + urls = [BASE_URL.format(type=type,page=page) for type in ('intr','inha') for page in range(1, MAX_PAGE + 1)] def parse(self, html): """ diff --git a/proxypool/crawlers/public/zhandaye.py b/proxypool/crawlers/public/zhandaye.py index b6278a28..83af04b6 100755 --- a/proxypool/crawlers/public/zhandaye.py +++ b/proxypool/crawlers/public/zhandaye.py @@ -6,7 +6,7 @@ BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html' -MAX_PAGE = 5 +MAX_PAGE = 5 * 2 class ZhandayeCrawler(BaseCrawler): """ diff --git a/proxypool/setting.py b/proxypool/setting.py index 39773597..31bb48c6 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -26,7 +26,7 @@ # redis host REDIS_HOST = env.str('REDIS_HOST', '127.0.0.1') # redis port -REDIS_PORT = env.int('REDIS_PORT', 6378) +REDIS_PORT = env.int('REDIS_PORT', 6379) # redis password, if no password, set it to None REDIS_PASSWORD = env.str('REDIS_PASSWORD', None) # redis db, if no choice, set it to 0 @@ -68,7 +68,7 @@ # definition of api API_HOST = env.str('API_HOST', '0.0.0.0') -API_PORT = env.int('API_PORT', 5556) +API_PORT = env.int('API_PORT', 5555) API_THREADED = env.bool('API_THREADED', True) # flags of enable diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 60a03e9f..0ebbccc2 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -51,11 +51,11 @@ def random(self) -> Proxy: :return: proxy, like 8.8.8.8:8 """ # try to get proxy with max score - proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX) + proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else get proxy by rank - proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX) + proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX) if len(proxies): return convert_proxy_or_proxies(choice(proxies)) # else raise error From 15a76051db22026bebacec811da9474d0b652e12 Mon Sep 17 00:00:00 2001 From: jy Date: Sat, 6 Feb 2021 10:32:49 +0800 Subject: [PATCH 08/11] add proxy jiangxianli --- proxypool/crawlers/public/daili66.py | 2 +- proxypool/crawlers/public/ihuan.py | 6 ++-- proxypool/crawlers/public/jiangxianli.py | 35 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 proxypool/crawlers/public/jiangxianli.py diff --git a/proxypool/crawlers/public/daili66.py b/proxypool/crawlers/public/daili66.py index 6f8c7e61..7b3bf7c2 100644 --- a/proxypool/crawlers/public/daili66.py +++ b/proxypool/crawlers/public/daili66.py @@ -4,7 +4,7 @@ BASE_URL = 'http://www.66ip.cn/{page}.html' -MAX_PAGE = 500 +MAX_PAGE = 50 class Daili66Crawler(BaseCrawler): diff --git a/proxypool/crawlers/public/ihuan.py b/proxypool/crawlers/public/ihuan.py index 7386b705..a27c998e 100644 --- a/proxypool/crawlers/public/ihuan.py +++ b/proxypool/crawlers/public/ihuan.py @@ -10,8 +10,10 @@ class IhuanCrawler(BaseCrawler): """ ip ihuan crawler, https://ip.ihuan.me """ - urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))] - + path = time.strftime("%Y/%m/%d/%H", time.localtime()) + urls = [BASE_URL.format(path=path)] + ignore = True + def parse(self, html): """ parse html file to get proxies diff --git a/proxypool/crawlers/public/jiangxianli.py b/proxypool/crawlers/public/jiangxianli.py new file mode 100644 index 00000000..14fc46cc --- /dev/null +++ b/proxypool/crawlers/public/jiangxianli.py @@ -0,0 +1,35 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +import re +import json +BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}' + +MAX_PAGE = 10 +class JiangxianliCrawler(BaseCrawler): + """ + jiangxianli crawler,https://ip.jiangxianli.com/ + """ + urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + + result =json.loads(html) + if result['code'] != 0: + return + MAX_PAGE = int(result['data']['last_page']) + hosts_ports = result['data']['data'] + for ip_address in hosts_ports: + if(ip_address): + host = ip_address['ip'] + port = ip_address['port'] + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = JiangxianliCrawler() + for proxy in crawler.crawl(): + print(proxy) From 5df18abef6e39123d5988f47ecc1d0318c196611 Mon Sep 17 00:00:00 2001 From: jy Date: Sun, 7 Feb 2021 16:19:12 +0800 Subject: [PATCH 09/11] =?UTF-8?q?tester=20=E5=A2=9E=E5=8A=A0=E5=8D=95?= =?UTF-8?q?=E4=B8=AAproxy=E6=B5=8B=E8=AF=95=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proxypool/processors/tester.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index e0812110..f002056a 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -84,7 +84,14 @@ def run(self): if not cursor: break +def run_tester(): + host = '96.113.165.182' + port = '3128' + tasks = [tester.test(Proxy(host=host, port=port))] + tester.loop.run_until_complete(asyncio.wait(tasks)) if __name__ == '__main__': tester = Tester() tester.run() + # run_tester() + From 19d79da778586715d813dbeadea073eca63bef92 Mon Sep 17 00:00:00 2001 From: jy Date: Sun, 7 Feb 2021 16:33:45 +0800 Subject: [PATCH 10/11] reset setting Dockerfile docker-compose to default --- Dockerfile | 3 ++- docker-compose.yml | 4 ++-- proxypool/setting.py | 11 +++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2e5448a5..4b8f9c0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM python:3.6 WORKDIR /app COPY . . -RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +RUN pip install -r requirements.txt -i VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/docker-compose.yml b/docker-compose.yml index 20d881ab..03d85de7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,8 +5,8 @@ services: container_name: redis4proxypool command: redis-server ports: - - "6378:6379" - restart: always + - "6379:6379" + # restart: always proxypool: build: . image: 'germey/proxypool' diff --git a/proxypool/setting.py b/proxypool/setting.py index 31bb48c6..908d2803 100644 --- a/proxypool/setting.py +++ b/proxypool/setting.py @@ -42,7 +42,7 @@ # definition of proxy scores PROXY_SCORE_MAX = 100 -PROXY_SCORE_MIN = 10 +PROXY_SCORE_MIN = 0 PROXY_SCORE_INIT = 10 # definition of proxy number @@ -57,8 +57,8 @@ # definition of tester TEST_URL = env.str('TEST_URL', 'http://www.baidu.com') -TEST_TIMEOUT = env.int('TEST_TIMEOUT', 8) -TEST_BATCH = env.int('TEST_BATCH', 200) +TEST_TIMEOUT = env.int('TEST_TIMEOUT', 10) +TEST_BATCH = env.int('TEST_BATCH', 20) # only save anonymous proxy TEST_ANONYMOUS = True # TEST_HEADERS = env.json('TEST_HEADERS', { @@ -76,6 +76,5 @@ ENABLE_GETTER = env.bool('ENABLE_GETTER', True) ENABLE_SERVER = env.bool('ENABLE_SERVER', True) -logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') -logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') - +# logger.add(env.str('LOG_RUNTIME_FILE', join(LOG_DIR, 'runtime.log')), level='DEBUG', rotation='1 week', retention='20 days') +# logger.add(env.str('LOG_ERROR_FILE', join(LOG_DIR, 'error.log')), level='ERROR', rotation='1 week') \ No newline at end of file From 556b320a28546f31dc52ab2abaf3e9632bd9b708 Mon Sep 17 00:00:00 2001 From: jy Date: Fri, 8 Oct 2021 11:30:39 +0800 Subject: [PATCH 11/11] =?UTF-8?q?1.=20=20tester=E4=BC=98=E5=8C=96=E6=9F=A5?= =?UTF-8?q?=E8=AF=A2=202.=20=E8=BF=94=E5=9B=9Eall=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E4=BC=98=E5=8C=96=203.=20=E5=A2=9E=E5=8A=A0taiyangdaili=203.?= =?UTF-8?q?=20usage=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 4 +- docker-compose.yml | 2 +- examples/usage2.py | 20 ++--- examples/usage3.py | 95 +++++++++++++++++++++++ proxypool/crawlers/public/fanqieip.py | 31 ++++++++ proxypool/crawlers/public/taiyangdaili.py | 2 +- proxypool/processors/server.py | 11 +-- proxypool/processors/tester.py | 8 +- proxypool/schemas/proxy.py | 6 ++ proxypool/storages/redis.py | 4 +- 10 files changed, 158 insertions(+), 25 deletions(-) create mode 100644 examples/usage3.py create mode 100644 proxypool/crawlers/public/fanqieip.py diff --git a/Dockerfile b/Dockerfile index 4b8f9c0e..8a4437d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM python:3.6 WORKDIR /app COPY . . -# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple -RUN pip install -r requirements.txt -i +RUN pip install -r requirements.txt -i https://pypi.douban.com/simple +# RUN pip install -r requirements.txt -i VOLUME ["/app/proxypool/crawlers/private"] CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/docker-compose.yml b/docker-compose.yml index 03d85de7..1671b978 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ services: container_name: redis4proxypool command: redis-server ports: - - "6379:6379" + - "6378:6379" # restart: always proxypool: build: . diff --git a/examples/usage2.py b/examples/usage2.py index 918c5eb2..487aefef 100644 --- a/examples/usage2.py +++ b/examples/usage2.py @@ -17,7 +17,7 @@ def getChinaIP(ip='127.0.0.1'): reader = geolite2.reader() ip_info = reader.get(ip) geolite2.close() - print(ip_info) + # print(ip_info) return True if ip_info['country']['iso_code'] == 'CN' else False @@ -32,22 +32,22 @@ def run(self): pure_ip_address = self.proxyip.split(':')[0] # 验证IP归属 if not getChinaIP(pure_ip_address): - # pass - raise ValueError('不是有效IP') + pass + # raise ValueError('不是有效IP') # start = time.time() # 消除关闭证书验证的警告 urllib3.disable_warnings() headers = Headers(headers=True).generate() - headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' + # headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676' headers['Pragma'] = 'no-cache' - headers['Host'] = 'bb.cf08tp.cn' - headers['x-forward-for'] = pure_ip_address + # headers['Host'] = 'bb.cf08tp.cn' + # headers['x-forward-for'] = pure_ip_address headers['Cookie'] = 'PHPSESSID={}'.format( ''.join(str(uuid.uuid1()).split('-'))) - print(headers) + # print(headers) html = requests.get(headers=headers, url=targetUrl, proxies={ - "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode() + "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode() # 结束计时 end = time.time() # 输出内容 @@ -88,8 +88,8 @@ def run(self): # apiUrl = "http://127.0.0.1:5555/all" apiUrl = "http://127.0.0.1:5555/random" # 要抓取的目标网站地址 - targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" - # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp=' + # targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + targetUrl = 'http://www.so.com' fetchSecond = 5 # 开始自动获取IP GetIpThread(fetchSecond).start() diff --git a/examples/usage3.py b/examples/usage3.py new file mode 100644 index 00000000..67b636f7 --- /dev/null +++ b/examples/usage3.py @@ -0,0 +1,95 @@ +# -*- coding: UTF-8 -*- + +''' +''' +import requests +import time +import threading +import urllib3 +from fake_headers import Headers +import uuid +from geolite2 import geolite2 +ips = [] + +# 爬数据的线程类 + +def getChinaIP(ip='127.0.0.1'): + reader = geolite2.reader() + ip_info = reader.get(ip) + geolite2.close() + # print(ip_info) + return True if ip_info['country']['iso_code'] == 'CN' else False + + + +class CrawlThread(threading.Thread): + def __init__(self, proxyip): + super(CrawlThread, self).__init__() + self.proxyip = proxyip + + def run(self): + # 开始计时 + pure_ip_address = self.proxyip.split(':')[0] + # 验证IP归属 + if not getChinaIP(pure_ip_address): + pass + # raise ValueError('不是有效IP') + # + start = time.time() + # 消除关闭证书验证的警告 + urllib3.disable_warnings() + headers = Headers(headers=True).generate() + headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html' + headers['Pragma'] = 'no-cache' + # headers['Host'] = 'ga.314300.cn' + # headers['x-forward-for'] = pure_ip_address + headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL' + # print(headers) + headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/5.3' + html = requests.get(headers=headers, url=targetUrl, proxies={ + "http": 'http://' + self.proxyip}, verify=False, timeout=12).content.decode() + # 结束计时 + end = time.time() + # 输出内容 + print(threading.current_thread().getName() + "使用代理IP, 耗时 " + str(end - start) + + "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************") + +# 获取代理IP的线程类 + + +class GetIpThread(threading.Thread): + def __init__(self, fetchSecond): + super(GetIpThread, self).__init__() + self.fetchSecond = fetchSecond + + def run(self): + global ips + while True: + # 获取IP列表 + res = requests.get(apiUrl).content.decode() + # 按照\n分割获取到的IP + ips = res.split('\n') + # 利用每一个IP + for proxyip in ips: + if proxyip.strip(): + # 开启一个线程 + # CrawlThread(proxyip).start() + try: + CrawlThread(proxyip).run() + time.sleep(1.5) + except Exception as e: + print(e) + # 休眠 + time.sleep(len(ips) /self.fetchSecond ) + + +if __name__ == '__main__': + # 获取IP的API接口 + # apiUrl = "http://127.0.0.1:5555/all" + apiUrl = "http://127.0.0.1:5555/random" + # 要抓取的目标网站地址 + # targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp=" + targetUrl = 'http://ga.314300.cn/toupiao/json/?id=40&s=tp' + fetchSecond = 5 + # 开始自动获取IP + GetIpThread(fetchSecond).start() diff --git a/proxypool/crawlers/public/fanqieip.py b/proxypool/crawlers/public/fanqieip.py new file mode 100644 index 00000000..01e7b7a9 --- /dev/null +++ b/proxypool/crawlers/public/fanqieip.py @@ -0,0 +1,31 @@ +from proxypool.schemas.proxy import Proxy +from proxypool.crawlers.base import BaseCrawler +from pyquery import PyQuery as pq + +BaseUrl = 'https://www.fanqieip.com/free/{num}' +MAX_PAGE = 5 * 100 + + +class FanqieIPCrawler(BaseCrawler): + """ + FanqieIP crawler, https://www.fanqieip.com + """ + urls = [BaseUrl.format(num=i) for i in range(1, MAX_PAGE)] + + def parse(self, html): + """ + parse html file to get proxies + :return: + """ + doc = pq(html) + trs = doc('.layui-table tbody tr ').items() + for tr in trs: + host = tr.find('td div')[0].text + port = tr.find('td div')[1].text + yield Proxy(host=host, port=port) + + +if __name__ == '__main__': + crawler = FanqieIPCrawler() + for proxy in crawler.crawl(): + print(proxy) diff --git a/proxypool/crawlers/public/taiyangdaili.py b/proxypool/crawlers/public/taiyangdaili.py index 7a48cb43..bff823d0 100644 --- a/proxypool/crawlers/public/taiyangdaili.py +++ b/proxypool/crawlers/public/taiyangdaili.py @@ -3,7 +3,7 @@ from pyquery import PyQuery as pq BaseUrl = 'http://www.taiyanghttp.com/free/page{num}' -MAX_PAGE = 5 +MAX_PAGE = 5 * 2 class TaiyangdailiCrawler(BaseCrawler): diff --git a/proxypool/processors/server.py b/proxypool/processors/server.py index d3edd70d..785bbf4f 100644 --- a/proxypool/processors/server.py +++ b/proxypool/processors/server.py @@ -1,6 +1,6 @@ -from flask import Flask, g +from flask import Flask, g , request , jsonify from proxypool.storages.redis import RedisClient -from proxypool.setting import API_HOST, API_PORT, API_THREADED +from proxypool.setting import API_HOST, API_PORT, API_THREADED,PROXY_SCORE_MIN, PROXY_SCORE_MAX __all__ = ['app'] @@ -40,11 +40,12 @@ def get_proxy(): @app.route('/all') def get_proxy_all(): """ - get a random proxy - :return: get a random proxy + get proxy by min_score to max_score + :return: proxies list """ + args = request.args conn = get_conn() - proxies = conn.all() + proxies = conn.all(args.get('min_score',PROXY_SCORE_MIN),args.get('max_score',PROXY_SCORE_MAX)) proxies_string = '' for proxy in proxies: proxies_string += str(proxy) + '\n' diff --git a/proxypool/processors/tester.py b/proxypool/processors/tester.py index f002056a..69ce4cb3 100644 --- a/proxypool/processors/tester.py +++ b/proxypool/processors/tester.py @@ -6,7 +6,7 @@ from proxypool.setting import TEST_TIMEOUT, TEST_BATCH, TEST_URL, TEST_VALID_STATUS, TEST_ANONYMOUS from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError from asyncio import TimeoutError - +import requests EXCEPTIONS = ( ClientProxyConnectionError, @@ -43,7 +43,7 @@ async def test(self, proxy: Proxy): # if TEST_ANONYMOUS is True, make sure that # the proxy has the effect of hiding the real IP if TEST_ANONYMOUS: - url = 'https://httpbin.org/ip' + url = 'http://www.nghttp2.org/httpbin/ip' async with session.get(url, timeout=TEST_TIMEOUT) as response: resp_json = await response.json() origin_ip = resp_json['origin'] @@ -85,8 +85,8 @@ def run(self): break def run_tester(): - host = '96.113.165.182' - port = '3128' + host = '111.246.42.52' + port = '8888' tasks = [tester.test(Proxy(host=host, port=port))] tester.loop.run_until_complete(asyncio.wait(tasks)) diff --git a/proxypool/schemas/proxy.py b/proxypool/schemas/proxy.py index 8be3fb34..84323e81 100644 --- a/proxypool/schemas/proxy.py +++ b/proxypool/schemas/proxy.py @@ -8,6 +8,12 @@ class Proxy(object): """ host = attr(type=str, default=None) port = attr(type=int, default=None) + location = attr(type=str, default=None) + isp = attr(type=str, default=None) + country = attr(type=str, default=None) + anonymous = attr(type=bool, default=None) + protocol = attr(type=str, default=None) + alive_time = attr(type=int, default=None) def __str__(self): """ diff --git a/proxypool/storages/redis.py b/proxypool/storages/redis.py index 0ebbccc2..3570cc1e 100644 --- a/proxypool/storages/redis.py +++ b/proxypool/storages/redis.py @@ -103,12 +103,12 @@ def count(self) -> int: """ return self.db.zcard(REDIS_KEY) - def all(self) -> List[Proxy]: + def all(self,min_score=PROXY_SCORE_MIN,max_score=PROXY_SCORE_MAX) -> List[Proxy]: """ get all proxies :return: list of proxies """ - return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)) + return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, min_score,max_score)) def batch(self, cursor, count) -> List[Proxy]: """