diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 3203b7293..3a62ed353 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -51,6 +51,7 @@ class EvaluationResult(Prediction):
     - score: An float value (e.g., 67.30) representing the overall performance
     - results: a list of (example, prediction, score) tuples for each example in devset
     """
+
     def __init__(self, score: float, results: list[tuple["dspy.Example", "dspy.Example", Any]]):
         super().__init__(score=score, results=results)
 
@@ -126,9 +127,9 @@ def __call__(
 
         Returns:
             The evaluation results are returned as a dspy.EvaluationResult object containing the following attributes:
-
+
             - score: A float percentage score (e.g., 67.30) representing overall performance
-
+
             - results: a list of (example, prediction, score) tuples for each example in devset
         """
         metric = metric if metric is not None else self.metric
@@ -145,11 +146,7 @@ def __call__(
         executor = ParallelExecutor(
             num_threads=num_threads,
             disable_progress_bar=not display_progress,
-            max_errors=(
-                self.max_errors
-                if self.max_errors is not None
-                else dspy.settings.max_errors
-            ),
+            max_errors=(self.max_errors if self.max_errors is not None else dspy.settings.max_errors),
             provide_traceback=self.provide_traceback,
             compare_results=True,
         )
@@ -157,13 +154,6 @@ def __call__(
         def process_item(example):
             prediction = program(**example.inputs())
             score = metric(example, prediction)
-
-            # Increment assert and suggest failures to program's attributes
-            if hasattr(program, "_assert_failures"):
-                program._assert_failures += dspy.settings.get("assert_failures")
-            if hasattr(program, "_suggest_failures"):
-                program._suggest_failures += dspy.settings.get("suggest_failures")
-
             return prediction, score
 
         results = executor.execute(process_item, devset)
@@ -191,7 +181,6 @@ def process_item(example):
             results=results,
         )
 
-
     def _construct_result_table(
         self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
     ) -> "pd.DataFrame":
diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py
index bc75d3efe..2b2fd8ec2 100644
--- a/dspy/teleprompt/bootstrap.py
+++ b/dspy/teleprompt/bootstrap.py
@@ -85,10 +85,6 @@ def compile(self, student, *, teacher=None, trainset):
         self.student = self._train()
         self.student._compiled = True
 
-        # set assert_failures and suggest_failures as attributes of student w/ value 0
-        self.student._assert_failures = 0
-        self.student._suggest_failures = 0
-
         return self.student
 
     def _prepare_student_and_teacher(self, student, teacher):
@@ -111,7 +107,9 @@ def _prepare_predictor_mappings(self):
             teacher.predictors(),
         ), "Student and teacher must have the same number of predictors."
 
-        for (name1, predictor1), (name2, predictor2) in zip(student.named_predictors(), teacher.named_predictors(), strict=False):
+        for (name1, predictor1), (name2, predictor2) in zip(
+            student.named_predictors(), teacher.named_predictors(), strict=False
+        ):
             assert name1 == name2, "Student and teacher must have the same program structure."
             if hasattr(predictor1.signature, "equals"):
                 assert predictor1.signature.equals(
@@ -210,11 +208,7 @@ def _bootstrap_one_example(self, example, round_idx=0):
                 with self.error_lock:
                     self.error_count += 1
                     current_error_count = self.error_count
-                effective_max_errors = (
-                    self.max_errors
-                    if self.max_errors is not None
-                    else dspy.settings.max_errors
-                )
+                effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors
                 if current_error_count >= effective_max_errors:
                     raise e
                 logger.error(f"Failed to run or to evaluate example {example} with {self.metric} due to {e}.")
@@ -244,7 +238,6 @@ def _bootstrap_one_example(self, example, round_idx=0):
 
         # Update the traces
         for name, demos in name2traces.items():
-
             # If there are multiple traces for the same predictor in the sample example,
             # sample 50/50 from the first N-1 traces or the last trace.
             if len(demos) > 1:
diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index e189b07bd..c6447e833 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -58,11 +58,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
         self.trainset = trainset
         self.valset = valset or trainset  # TODO: FIXME: Note this choice.
 
-        effective_max_errors = (
-            self.max_errors
-            if self.max_errors is not None
-            else dspy.settings.max_errors
-        )
+        effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors
 
         scores = []
         all_subscores = []
@@ -129,13 +125,6 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
 
             all_subscores.append(subscores)
 
-            ############ Assertion-aware Optimization ############
-            if hasattr(program, "_suggest_failures"):
-                score = score - program._suggest_failures * 0.2
-            if hasattr(program, "_assert_failures"):
-                score = 0 if program._assert_failures > 0 else score
-            ######################################################
-
             if len(scores) == 0 or score > max(scores):
                 print("New best score:", score, "for seed", seed)
                 best_program = program
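For reference, a minimal sketch of how the evaluation surface touched by this patch is used. The metric, devset, program, and LM below are hypothetical stand-ins and not part of the diff; only `dspy.Evaluate` returning a `dspy.EvaluationResult` with `score` and `results` reflects the code above.

```python
import dspy

# Hypothetical LM configuration; any configured LM works here.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

# Hypothetical metric: exact match on an `answer` field.
def exact_match(example, prediction, trace=None):
    return example.answer == prediction.answer

# Hypothetical devset and program for illustration only.
devset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What is the capital of France?", answer="Paris").with_inputs("question"),
]
program = dspy.Predict("question -> answer")

evaluate = dspy.Evaluate(devset=devset, metric=exact_match, num_threads=2, display_progress=True)
result = evaluate(program)  # dspy.EvaluationResult, per the docstring in this patch

print(result.score)  # float percentage score, e.g. 67.30
for example, prediction, score in result.results:
    print(example.question, prediction.answer, score)  # per-example (example, prediction, score) tuples
```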