
Commit 6714f6a

Merge pull request #10 from bitmakerla/update_success_rate
Update success rate
2 parents d7683aa + 415a764

File tree: 2 files changed (+77, -17 lines)


src/ps_helper/extensions/metrics_extension.py

Lines changed: 45 additions & 12 deletions
@@ -12,7 +12,7 @@


 class MetricsExtension:
-    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
+    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30, items_expected=None):
         """
         Scrapy Metrics Extension.

@@ -39,14 +39,23 @@ def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
         self.schema = schema
         self.unique_field = unique_field

+        self.items_expected = items_expected
+
     @classmethod
     def from_crawler(cls, crawler):
         schema = getattr(crawler.spidercls, "schema", None)
         unique_field = getattr(crawler.spidercls, "unique_field", None)

         max_buckets = crawler.settings.getint("METRICS_TIMELINE_BUCKETS", 30)
-
-        ext = cls(crawler.stats, schema=schema, unique_field=unique_field, max_buckets=max_buckets)
+        items_expected = getattr(crawler.spidercls, "ITEMS_EXPECTED", None)
+
+        ext = cls(
+            crawler.stats,
+            schema=schema,
+            unique_field=unique_field,
+            max_buckets=max_buckets,
+            items_expected=items_expected
+        )

         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
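
For context, a minimal sketch of how a spider might opt in to the goal-based score. The spider itself is hypothetical; only the ITEMS_EXPECTED class attribute (read via getattr in from_crawler above) comes from this commit:

    import scrapy

    class QuotesSpider(scrapy.Spider):
        # Hypothetical spider for illustration only.
        name = "quotes"
        start_urls = ["https://quotes.toscrape.com"]

        # Picked up via getattr(crawler.spidercls, "ITEMS_EXPECTED", None);
        # spiders that omit it keep the HTTP-only success rate.
        ITEMS_EXPECTED = 500

        def parse(self, response):
            for quote in response.css("div.quote"):
                yield {"text": quote.css("span.text::text").get()}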
@@ -108,14 +117,39 @@ def spider_closed(self, spider, reason):
         interval_size = max(1, math.ceil(total_minutes / self.max_buckets))

         # Success rate
-        successful_requests = self.stats.get_value("downloader/response_count", 0)
-        total_requests = self.stats.get_value("downloader/request_count", 0)
+        items = self.stats.get_value("custom/items_scraped", 0)
+        pages = self.stats.get_value("custom/pages_processed", 0)
+        total_requests = self.stats.get_value("downloader/response_count", 0)
         retries_total = self.stats.get_value("retry/count", 0)

-        adjusted_successful = max(successful_requests - retries_total, 0)
-        adjusted_total = max(total_requests, 1)
-
-        success_rate = (adjusted_successful / adjusted_total) * 100
+        status_200 = self.http_status_counter.get(200, 0)
+        http_success_rate = (status_200 / total_requests * 100) if total_requests > 0 else 0
+
+        # Efficiency
+        requests_per_item_obtained = total_requests / items if items > 0 else float('inf')
+
+        # Inefficiency penalty
+        if requests_per_item_obtained <= 3:
+            efficiency_factor = 1.0  # No penalty
+        elif requests_per_item_obtained <= 4:
+            efficiency_factor = 0.95  # 5% penalty
+        elif requests_per_item_obtained <= 5:
+            efficiency_factor = 0.90  # 10% penalty
+        elif requests_per_item_obtained <= 7:
+            efficiency_factor = 0.80  # 20% penalty
+        else:
+            efficiency_factor = 0.65  # 35% penalty (very inefficient)
+
+        if self.items_expected:
+            goal_achievement = (items / self.items_expected * 100) if self.items_expected > 0 else 0
+
+            success_rate = (
+                (goal_achievement * 0.7 + http_success_rate * 0.3) * efficiency_factor
+            )
+            success_rate = min(100, max(0, success_rate))
+        else:
+            success_rate = http_success_rate * efficiency_factor
+            success_rate = min(100, max(0, success_rate))

         # Group timeline
         aggregated = defaultdict(int)
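
As a worked example of the new blend, here is a standalone sketch of the same logic with invented sample numbers; the blended_success_rate function name is ours, not part of the extension:

    def blended_success_rate(items, total_requests, status_200, items_expected=None):
        http_success_rate = (status_200 / total_requests * 100) if total_requests > 0 else 0

        # Same efficiency bands as the diff above.
        requests_per_item = total_requests / items if items > 0 else float("inf")
        if requests_per_item <= 3:
            efficiency_factor = 1.0
        elif requests_per_item <= 4:
            efficiency_factor = 0.95
        elif requests_per_item <= 5:
            efficiency_factor = 0.90
        elif requests_per_item <= 7:
            efficiency_factor = 0.80
        else:
            efficiency_factor = 0.65

        if items_expected:
            goal_achievement = items / items_expected * 100
            rate = (goal_achievement * 0.7 + http_success_rate * 0.3) * efficiency_factor
        else:
            rate = http_success_rate * efficiency_factor
        return min(100, max(0, rate))

    # 400 of 500 expected items, 1200 requests, 1100 of them HTTP 200:
    # goal_achievement = 80.0, http_success_rate ~ 91.67, 3 requests/item
    # -> efficiency_factor 1.0 -> (80*0.7 + 91.67*0.3) * 1.0 = 83.5
    print(round(blended_success_rate(400, 1200, 1100, items_expected=500), 2))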
@@ -134,9 +168,6 @@ def spider_closed(self, spider, reason):
             )
         ]

-        items = self.stats.get_value("custom/items_scraped", 0)
-        pages = self.stats.get_value("custom/pages_processed", 0)
-
         # Speed
         items_per_min = items / (elapsed / 60) if elapsed > 0 else 0
         pages_per_min = pages / (elapsed / 60) if elapsed > 0 else 0
@@ -174,6 +205,8 @@ def spider_closed(self, spider, reason):
             "pages_per_minute": round(pages_per_min, 2),
             "time_per_page_seconds": round(time_per_page, 2),
             "success_rate": round(success_rate, 2),
+            "http_success_rate": round(http_success_rate, 2),
+            "goal_achievement": round(goal_achievement, 2) if self.items_expected else None,
             "schema_coverage": {
                 "percentage": round(schema_coverage_percentage, 2),
                 "valid": self.valid_items,

src/ps_helper/scripts/generate_report.py

Lines changed: 32 additions & 5 deletions
@@ -106,6 +106,8 @@ def load_scrapy_stats(json_path):
         "duration": format_duration(data.get("elapsed_time_seconds", 0)),
         "items_per_minute": round(data.get("items_per_minute", 0), 1),
         "pages_per_minute": round(data.get("pages_per_minute", 0), 2),
+        "http_success_rate": data.get("http_success_rate", 0),
+        "goal_achievement": data.get("goal_achievement", 0),
     }

     http_errors = data.get("http_errors", {})
@@ -253,14 +255,18 @@ def _generate_retry_reasons_html(data):
     )

     # Success rate
-    if scrapy_stats["success_rate"] >= 95:
+    if scrapy_stats["success_rate"] >= 90:
         status_class = "success"
         status_text = "Successful"
         icon = "✅"
-    elif scrapy_stats["success_rate"] >= 80:
+    elif scrapy_stats["success_rate"] >= 70:
         status_class = "warning"
         status_text = "With Warnings"
         icon = "⚠️"
+    elif scrapy_stats["success_rate"] >= 50:
+        status_class = "warning-orange"
+        status_text = "Below Target"
+        icon = "⚠️"
     else:
         status_class = "error"
         status_text = "Critical Error"
@@ -320,7 +326,18 @@ def _generate_retry_reasons_html(data):
                 labels=df_errors["Error"],
                 values=df_errors["Count"],
                 marker=dict(
-                    colors=["#FF5733", "#F8623D", "#C67448", "#838E56", "#00BF71"][
+                    colors=[
+                        "#FF5733",  # Red-orange (original)
+                        "#FF6B3D",  # Bright orange
+                        "#FF8047",  # Medium orange
+                        "#FF9551",  # Light orange
+                        "#FFAA5C",  # Yellowish orange
+                        "#D4B85E",  # Greenish yellow
+                        "#A8C560",  # Lime
+                        "#7CB862",  # Lime green
+                        "#50AA64",  # Medium green
+                        "#00BF71"   # Emerald green (original)
+                    ][
                         : len(df_errors)
                     ]
                 ),
@@ -562,6 +579,11 @@ def _generate_retry_reasons_html(data):
             border-color: #F8623D;
         }}

+        .status-banner.warning-orange {{
+            background: linear-gradient(135deg, #fff4e6 0%, #ffecd1 100%);
+            border-color: #FF8047;
+        }}
+
         .status-banner.error {{
             background: linear-gradient(135deg, #ffe8e6 0%, #ffd6d1 100%);
             border-color: #FF5733;
@@ -594,6 +616,7 @@ def _generate_retry_reasons_html(data):

         .status-text p.success {{ color: #059669; }}
         .status-text p.warning {{ color: #d97706; }}
+        .status-text p.warning-orange {{ color: #ea580c; }}
         .status-text p.error {{ color: #dc2626; }}

         .status-metrics {{
@@ -822,8 +845,12 @@ def _generate_retry_reasons_html(data):
                 <div class="metric-label">Items Scraped</div>
             </div>
             <div class="metric-item">
-                <div class="metric-value">{scrapy_stats['success_rate']}%</div>
-                <div class="metric-label">Success Rate</div>
+                <div class="metric-value">{scrapy_stats['http_success_rate']}%</div>
+                <div class="metric-label">Http Success Rate</div>
+            </div>
+            <div class="metric-item">
+                <div class="metric-value">{scrapy_stats['goal_achievement']}%</div>
+                <div class="metric-label">Goal Achievement</div>
             </div>
             <div class="metric-item">
                 <div class="metric-value">{scrapy_stats['duration']}</div>
