Commit f2190d2

Merge pull request #11 from bitmakerla/add-proxy-rotation

Add proxy rotation module

2 parents 6714f6a + c47c635 commit f2190d2

5 files changed: +454 -1 lines changed
pyproject.toml

Lines changed: 3 additions & 0 deletions

```diff
@@ -32,3 +32,6 @@ ps-helper = "ps_helper.cli.main:main"
 
 [tool.setuptools.packages.find]
 where = ["src"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
```
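With `pythonpath = ["src"]`, pytest (7.0+) prepends `src/` to `sys.path` during collection, so the test suite imports the package straight from the source tree without an editable install. For instance, an import like this in a test module now resolves:

```python
# Resolves under pytest because pythonpath = ["src"] puts src/ on sys.path
from ps_helper.middlewares.proxy_rotator import SequentialProxyRotatorMiddleware
```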

src/ps_helper/middlewares/README.md

Lines changed: 87 additions & 0 deletions
@@ -16,3 +16,90 @@ DOWNLOADER_CLIENTCONTEXTFACTORY = "ps_helper.middlewares.LegacyConnectContextFac
Scrapy will then use the LegacyConnectContextFactory for all HTTPS connections.

--------------------------------------

# Proxy Rotator Middlewares

This module provides two Scrapy downloader middlewares for rotating HTTP proxies, with optional smart banning logic and statistics tracking.

---

## 🧩 Middlewares

### **1. SequentialProxyRotatorMiddleware**

A simple **round-robin** proxy rotation strategy that cycles through proxies sequentially.

#### Enable in `settings.py`
```python
DOWNLOADER_MIDDLEWARES = {
    "ps_helper.middlewares.proxy_rotator.SequentialProxyRotatorMiddleware": 620,
}
```

#### Required Setting
```python
PROXY_PROVIDERS = {
    "provider1": {"url": "proxy1.com", "port": 8080},
    "provider2": {"url": "proxy2.com", "port": 8080, "user": "user", "password": "pass"},
}
```

#### Behavior
- Rotates proxies in order.
- Logs total requests, successes, failures, and success rate for each proxy when the spider closes (see the sketch below).
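To see the rotation in action, here is a minimal sketch of a spider that logs which proxy the middleware attached to each request (the spider name and test URL are illustrative, not part of this module):

```python
import scrapy


class ProxyCheckSpider(scrapy.Spider):
    """Hypothetical spider: issues a few identical requests so the
    round-robin rotation is visible in the logs."""

    name = "proxy_check"

    def start_requests(self):
        # dont_filter=True keeps Scrapy from deduplicating the repeated URL
        for _ in range(4):
            yield scrapy.Request("https://httpbin.org/ip", dont_filter=True)

    def parse(self, response):
        # The middleware stored its choice in request.meta["proxy"]
        self.logger.info("Fetched via %s", response.request.meta.get("proxy"))
```

With the two providers configured above, successive requests should alternate between the two proxies.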
---

### **2. SmartProxyRotatorMiddleware**

A more advanced rotation system that temporarily bans failing proxies and supports two rotation modes (`random` or `round_robin`).

#### Enable in `settings.py`
```python
DOWNLOADER_MIDDLEWARES = {
    "ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware": 620,
}
```

#### Available Settings
```python
PROXY_PROVIDERS = {
    "proxy1": {"url": "proxy1.com", "port": 8080},
    "proxy2": {"url": "proxy2.com", "port": 8080, "user": "user", "password": "pass"},
}

PROXY_BAN_THRESHOLD = 3  # Number of failures before banning a proxy
PROXY_COOLDOWN = 300     # Cooldown duration in seconds for banned proxies
PROXY_ROTATION_MODE = "random"  # 'random' or 'round_robin'
```

#### Features
- Automatically bans proxies that fail too many times.
- Supports **cooldown** (temporary ban); see the sketch below.
- Chooses proxies randomly or sequentially while skipping banned ones.
- Displays a detailed summary when the spider closes.
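For intuition, the ban bookkeeping is plain wall-clock arithmetic. A minimal sketch of the same threshold/cooldown logic, using the example setting values above:

```python
import time

BAN_THRESHOLD = 3  # mirrors PROXY_BAN_THRESHOLD
COOLDOWN = 300     # mirrors PROXY_COOLDOWN (seconds)

stats = {"fails": 0, "banned_until": 0.0}


def register_failure(stats):
    # Each failure bumps the counter; crossing the threshold bans the
    # proxy until the cooldown expires, then the counter restarts.
    stats["fails"] += 1
    if stats["fails"] >= BAN_THRESHOLD:
        stats["banned_until"] = time.time() + COOLDOWN
        stats["fails"] = 0


def is_available(stats):
    # A proxy re-enters rotation once its ban timestamp is in the past
    return stats["banned_until"] < time.time()
```

So with the defaults, a proxy that fails three times sits out for five minutes and then automatically re-enters the rotation.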
---

## 🧠 Summary Logs Example

When a spider finishes, a summary like this appears in the logs:

```
============================================================
PROXY USAGE SUMMARY
============================================================
Proxy: http://proxy1.com:8080
  Total requests: 120
  Successes: 110
  Failures: 10
  Success rate: 91.7%
  Banned: NO
--------------------------------------------------
Proxy: http://proxy2.com:8080
  Total requests: 50
  Successes: 25
  Failures: 25
  Success rate: 50.0%
  Banned: YES
============================================================
```

---
src/ps_helper/middlewares/proxy_rotator.py

Lines changed: 241 additions & 0 deletions

@@ -0,0 +1,241 @@
```python
import random
import logging
import time
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class BaseProxyRotator:
    """Base class with shared proxy setup logic and stats."""

    def __init__(self, proxy_providers):
        self.proxy_providers = proxy_providers
        self.proxies = self._build_proxy_list(proxy_providers)
        self.proxy_stats = {
            proxy: {"requests": 0, "success": 0, "fails": 0, "banned_until": 0}
            for proxy in self.proxies
        }
        logger.info(f"ProxyRotator initialized with {len(self.proxies)} proxies")

    def _build_proxy_list(self, providers_dict):
        proxies = []
        for provider, data in providers_dict.items():
            user = data.get("user")
            password = data.get("password")
            url = data.get("url")
            port = data.get("port")
            if user and password:
                proxy = f"http://{user}:{password}@{url}:{port}"
            else:
                proxy = f"http://{url}:{port}"
            proxies.append(proxy)
        return proxies

    def _record_success(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["success"] += 1

    def _record_failure(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["fails"] += 1

    def _record_request(self, proxy):
        if proxy in self.proxy_stats:
            self.proxy_stats[proxy]["requests"] += 1

    def log_summary(self, spider):
        logger.info("=" * 60)
        logger.info("PROXY USAGE SUMMARY")
        logger.info("=" * 60)
        for proxy, stats in self.proxy_stats.items():
            total = stats["requests"]
            fails = stats["fails"]
            success = stats["success"]
            rate = (success / total * 100) if total else 0
            banned = "YES" if stats.get("banned_until", 0) > time.time() else "NO"
            spider.logger.info(
                f"Proxy: {proxy}\n"
                f"  Total requests: {total}\n"
                f"  Successes: {success}\n"
                f"  Failures: {fails}\n"
                f"  Success rate: {rate:.1f}%\n"
                f"  Banned: {banned}\n"
                f"{'-' * 50}"
            )
        logger.info("=" * 60)


class SequentialProxyRotatorMiddleware(BaseProxyRotator):
    """
    Simple sequential rotation (round-robin) with stats.

    To enable it, add it to the DOWNLOADER_MIDDLEWARES option::

        DOWNLOADER_MIDDLEWARES = {
            # ...
            'ps_helper.middlewares.proxy_rotator.SequentialProxyRotatorMiddleware': 620,
            # ...
        }

    Settings:
    * ``PROXY_PROVIDERS`` - a mapping of proxy providers to choose from
    """

    def __init__(self, proxy_providers):
        super().__init__(proxy_providers)
        self.current_index = 0

    @classmethod
    def from_crawler(cls, crawler):
        providers = crawler.settings.get("PROXY_PROVIDERS")
        if not providers:
            raise NotConfigured("PROXY_PROVIDERS not configured")

        middleware = cls(providers)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def get_next_proxy(self):
        proxy = self.proxies[self.current_index]
        self.current_index = (self.current_index + 1) % len(self.proxies)
        return proxy

    def process_request(self, request, spider):
        proxy = self.get_next_proxy()
        request.meta["proxy"] = proxy
        self._record_request(proxy)
        logger.debug(f"[Sequential] Using proxy: {proxy}")
        return None

    def process_response(self, request, response, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            if response.status < 400:
                self._record_success(proxy)
            else:
                self._record_failure(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            self._record_failure(proxy)
            logger.warning(f"[Sequential] Proxy {proxy} exception: {exception}")
        return None

    def spider_closed(self, spider):
        self.log_summary(spider)


class SmartProxyRotatorMiddleware(BaseProxyRotator):
    """
    Advanced rotation with failure tracking, cooldown bans, and stats.

    To enable it, add it to the DOWNLOADER_MIDDLEWARES option::

        DOWNLOADER_MIDDLEWARES = {
            # ...
            'ps_helper.middlewares.proxy_rotator.SmartProxyRotatorMiddleware': 620,
            # ...
        }

    Settings:
    * ``PROXY_PROVIDERS`` - a mapping of proxy providers to choose from
    * ``PROXY_BAN_THRESHOLD`` - number of failures before the proxy is banned
    * ``PROXY_COOLDOWN`` - seconds that a banned proxy stays deactivated
    * ``PROXY_ROTATION_MODE`` - 'random' or 'round_robin'
    """

    def __init__(
        self,
        proxy_providers,
        ban_threshold=3,
        cooldown_time=300,
        rotation_mode="random",
    ):
        super().__init__(proxy_providers)
        self.ban_threshold = ban_threshold
        self.cooldown_time = cooldown_time
        self.rotation_mode = rotation_mode
        self.current_index = 0  # for round robin

    @classmethod
    def from_crawler(cls, crawler):
        providers = crawler.settings.get("PROXY_PROVIDERS")
        if not providers:
            raise NotConfigured("PROXY_PROVIDERS not configured")

        ban_threshold = crawler.settings.getint("PROXY_BAN_THRESHOLD", 3)
        cooldown_time = crawler.settings.getint("PROXY_COOLDOWN", 300)
        rotation_mode = crawler.settings.get("PROXY_ROTATION_MODE", "random")

        middleware = cls(providers, ban_threshold, cooldown_time, rotation_mode)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def get_available_proxies(self):
        """Return only proxies not currently banned."""
        now = time.time()
        return [p for p, s in self.proxy_stats.items() if s["banned_until"] < now]

    def get_next_proxy(self):
        available = self.get_available_proxies()

        if not available:
            logger.warning("[Smart] All proxies are banned! Resetting bans.")
            for s in self.proxy_stats.values():
                s["banned_until"] = 0
            available = self.proxies

        if self.rotation_mode == "round_robin":
            # Skip banned ones but keep round-robin order
            for _ in range(len(self.proxies)):
                proxy = self.proxies[self.current_index]
                self.current_index = (self.current_index + 1) % len(self.proxies)
                if proxy in available:
                    return proxy
            # fallback if somehow none available
            return random.choice(available)
        else:
            return random.choice(available)

    def _ban_proxy(self, proxy):
        stats = self.proxy_stats[proxy]
        stats["banned_until"] = time.time() + self.cooldown_time
        logger.info(f"[Smart] Proxy temporarily banned: {proxy}")

    def register_failure(self, proxy):
        # Records the failure and bans the proxy once the threshold is crossed
        stats = self.proxy_stats[proxy]
        stats["fails"] += 1
        if stats["fails"] >= self.ban_threshold:
            self._ban_proxy(proxy)
            stats["fails"] = 0  # reset after ban

    def process_request(self, request, spider):
        proxy = self.get_next_proxy()
        request.meta["proxy"] = proxy
        self._record_request(proxy)
        logger.debug(f"[Smart] Using proxy: {proxy}")

    def process_response(self, request, response, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            if response.status >= 400:
                # register_failure already increments the failure counter;
                # calling _record_failure here too would double-count
                self.register_failure(proxy)
                logger.warning(f"[Smart] Proxy {proxy} failed (HTTP {response.status})")
            else:
                self._record_success(proxy)
        return response

    def process_exception(self, request, exception, spider):
        proxy = request.meta.get("proxy")
        if proxy:
            self.register_failure(proxy)
            logger.warning(f"[Smart] Proxy {proxy} raised exception: {exception}")
        return None

    def spider_closed(self, spider):
        self.log_summary(spider)
```
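As a quick illustration of what `_build_proxy_list` produces, here is a sketch using the sample providers from the README (the import path assumes the module location above; output shown as a comment):

```python
from ps_helper.middlewares.proxy_rotator import BaseProxyRotator

providers = {
    "provider1": {"url": "proxy1.com", "port": 8080},
    "provider2": {"url": "proxy2.com", "port": 8080, "user": "user", "password": "pass"},
}

rotator = BaseProxyRotator(providers)
print(rotator.proxies)
# ['http://proxy1.com:8080', 'http://user:pass@proxy2.com:8080']
```

Credentials are embedded directly in the proxy URL, which is the form Scrapy's built-in HttpProxyMiddleware accepts via `request.meta["proxy"]`.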

tests/pdf_analyzer_test.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,5 +1,5 @@
 import os
-from ps_helper.pdf_analyzer import PDFAnalyzer
+from ps_helper.pdf.pdf_analyzer import PDFAnalyzer
 
 LOCAL_PDF_PATH = "test_files/scansmpl.pdf"
```