 
 
 class MetricsExtension:
-    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
+    def __init__(self, stats, schema=None, unique_field=None, max_buckets=30, items_expected=None):
         """
         Scrapy Metrics Extension.
 
@@ -39,14 +39,23 @@ def __init__(self, stats, schema=None, unique_field=None, max_buckets=30):
         self.schema = schema
         self.unique_field = unique_field
 
+        self.items_expected = items_expected
+
     @classmethod
     def from_crawler(cls, crawler):
         schema = getattr(crawler.spidercls, "schema", None)
         unique_field = getattr(crawler.spidercls, "unique_field", None)
 
         max_buckets = crawler.settings.getint("METRICS_TIMELINE_BUCKETS", 30)
-
-        ext = cls(crawler.stats, schema=schema, unique_field=unique_field, max_buckets=max_buckets)
+        items_expected = getattr(crawler.spidercls, "ITEMS_EXPECTED", None)
+
+        ext = cls(
+            crawler.stats,
+            schema=schema,
+            unique_field=unique_field,
+            max_buckets=max_buckets,
+            items_expected=items_expected
+        )
 
         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
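
As a usage note, the goal-based metrics only apply when the spider class defines the attribute read above via getattr; a minimal sketch, where the spider name and the target of 1000 items are assumptions:

    import scrapy

    class BooksSpider(scrapy.Spider):
        name = "books"            # hypothetical spider, for illustration only
        ITEMS_EXPECTED = 1000     # target item count picked up by from_crawler

        def parse(self, response):
            ...

Spiders that do not define ITEMS_EXPECTED fall back to the HTTP-only success rate computed in the hunk below.
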
@@ -108,14 +117,39 @@ def spider_closed(self, spider, reason):
         interval_size = max(1, math.ceil(total_minutes / self.max_buckets))
 
         # Success rate
-        successful_requests = self.stats.get_value("downloader/response_count", 0)
-        total_requests = self.stats.get_value("downloader/request_count", 0)
+        items = self.stats.get_value("custom/items_scraped", 0)
+        pages = self.stats.get_value("custom/pages_processed", 0)
+        total_requests = self.stats.get_value("downloader/response_count", 0)
         retries_total = self.stats.get_value("retry/count", 0)
 
-        adjusted_successful = max(successful_requests - retries_total, 0)
-        adjusted_total = max(total_requests, 1)
-
-        success_rate = (adjusted_successful / adjusted_total) * 100
+        status_200 = self.http_status_counter.get(200, 0)
+        http_success_rate = (status_200 / total_requests * 100) if total_requests > 0 else 0
+
+        # Efficiency
+        requests_per_item_obtained = total_requests / items if items > 0 else float('inf')
+
+        # Inefficiency penalty
+        if requests_per_item_obtained <= 3:
+            efficiency_factor = 1.0  # No penalty
+        elif requests_per_item_obtained <= 4:
+            efficiency_factor = 0.95  # 5% penalty
+        elif requests_per_item_obtained <= 5:
+            efficiency_factor = 0.90  # 10% penalty
+        elif requests_per_item_obtained <= 7:
+            efficiency_factor = 0.80  # 20% penalty
+        else:
+            efficiency_factor = 0.65  # 35% penalty (very inefficient)
+
+        if self.items_expected:
+            goal_achievement = (items / self.items_expected * 100) if self.items_expected > 0 else 0
+
+            success_rate = (
+                (goal_achievement * 0.7 + http_success_rate * 0.3) * efficiency_factor
+            )
+            success_rate = min(100, max(0, success_rate))
+        else:
+            success_rate = http_success_rate * efficiency_factor
+            success_rate = min(100, max(0, success_rate))
 
         # Group timeline
         aggregated = defaultdict(int)
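
For clarity, here is a quick worked example of the new composite formula under assumed numbers (800 items scraped against an ITEMS_EXPECTED of 1000, 2400 responses of which 2280 returned HTTP 200); the values are illustrative only:

    # Assumed stats, for illustration of the weighting only
    items, items_expected = 800, 1000
    total_requests, status_200 = 2400, 2280

    goal_achievement = items / items_expected * 100         # 80.0
    http_success_rate = status_200 / total_requests * 100   # 95.0
    requests_per_item = total_requests / items               # 3.0 -> efficiency_factor = 1.0

    success_rate = (goal_achievement * 0.7 + http_success_rate * 0.3) * 1.0
    print(round(success_rate, 2))                             # 84.5

Goal achievement dominates the blend (70/30), so a crawl with near-perfect HTTP responses but a missed item target still lands well below 100.
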
@@ -134,9 +168,6 @@ def spider_closed(self, spider, reason):
             )
         ]
 
-        items = self.stats.get_value("custom/items_scraped", 0)
-        pages = self.stats.get_value("custom/pages_processed", 0)
-
         # Speed
         items_per_min = items / (elapsed / 60) if elapsed > 0 else 0
         pages_per_min = pages / (elapsed / 60) if elapsed > 0 else 0
@@ -174,6 +205,8 @@ def spider_closed(self, spider, reason):
174205 "pages_per_minute" : round (pages_per_min , 2 ),
175206 "time_per_page_seconds" : round (time_per_page , 2 ),
176207 "success_rate" : round (success_rate , 2 ),
208+ "http_success_rate" : round (http_success_rate , 2 ),
209+ "goal_achievement" : round (goal_achievement , 2 ) if self .items_expected else None ,
177210 "schema_coverage" : {
178211 "percentage" : round (schema_coverage_percentage , 2 ),
179212 "valid" : self .valid_items ,
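
For reference, after this change the metrics dictionary assembled in spider_closed carries the two new keys next to the existing ones; a sketch with illustrative values, showing only the keys visible in this diff:

    {
        "pages_per_minute": 12.4,
        "time_per_page_seconds": 4.83,
        "success_rate": 84.5,
        "http_success_rate": 95.0,
        "goal_achievement": 80.0,  # None when the spider defines no ITEMS_EXPECTED
        "schema_coverage": {
            "percentage": 97.5,
            "valid": 780,
        },
    }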