openui_eval/main.py at main · anxkhn/openui_eval · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/env python3
"""
Multimodal LLM Benchmark System - Main CLI Interface
A comprehensive benchmarking system for evaluating multimodal LLMs on HTML generation tasks
with iterative improvement and structured evaluation.
"""
import argparse
import sys
from pathlib import Path
from typing import List, Optional

from src.core.config import Config
from src.core.logger import setup_logger
from src.pipeline.benchmark_pipeline import BenchmarkPipeline


def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command line arguments for the benchmark CLI.

    Args:
        argv: Argument strings to parse (without the program name). When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so the
            zero-argument call used by ``main`` behaves as before.

    Returns:
        The populated ``argparse.Namespace``. ``iterations`` and
        ``output_dir`` are ``None`` unless given explicitly, so that values
        coming from a configuration file are not overwritten by CLI defaults.
    """
    parser = argparse.ArgumentParser(
        description="Multimodal LLM Benchmark System",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with default configuration
  python main.py
  # Run specific models and tasks
  python main.py --models gemma3n:e2b gemma3:4b qwen2.5vl:7b --tasks calculator portfolio
  # Run with custom iterations and judges
  python main.py --iterations 5 --judges gemma3n:e2b gemma3:4b llama3.2-vision:11b
  # Resume from a specific checkpoint
  python main.py --resume-from results/benchmark_20241201_143022
  # Run in evaluation-only mode
  python main.py --mode evaluation --resume-from results/benchmark_20241201_143022
  # Use custom configuration file
  python main.py --config custom_config.yaml
        """,
    )
    # Model configuration
    parser.add_argument(
        "--models",
        nargs="+",
        help="List of models to benchmark (default: all supported models)",
    )
    # Task configuration
    parser.add_argument(
        "--tasks",
        nargs="+",
        help="List of tasks to run (default: all predefined tasks)",
    )
    # Generation configuration
    # No argparse default: a default here would be indistinguishable from an
    # explicit flag and would always shadow the value from --config.
    parser.add_argument(
        "--iterations",
        type=int,
        default=None,
        help="Number of iterative improvement iterations "
        "(default: taken from the configuration)",
    )
    # Evaluation configuration
    parser.add_argument(
        "--judges",
        nargs="+",
        help="List of models to use as judges (default: all models)",
    )
    # Execution mode
    parser.add_argument(
        "--mode",
        choices=["full", "generation", "evaluation"],
        default="full",
        help="Execution mode: full (generation + evaluation), generation only, or evaluation only",
    )
    # Resume functionality
    parser.add_argument(
        "--resume-from",
        type=str,
        help="Resume from a previous benchmark run (provide results directory path)",
    )
    # Configuration file
    parser.add_argument(
        "--config", type=str, help="Path to configuration file (YAML or JSON)"
    )
    # Output directory
    # Same reasoning as --iterations: only override the configuration when the
    # user actually passes the flag.
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory for results (default: taken from the configuration)",
    )
    # Logging level
    parser.add_argument(
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="Logging level (default: INFO)",
    )
    # Parallel execution
    parser.add_argument(
        "--parallel",
        action="store_true",
        help="Enable parallel task execution where possible",
    )
    # Dry run mode
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Perform a dry run without actually executing tasks",
    )
    return parser.parse_args(argv)


def create_config_from_args(args: argparse.Namespace) -> Config:
    """Build the benchmark configuration from parsed command line arguments.

    The base configuration is loaded from ``args.config`` when one is given,
    otherwise a default ``Config()`` is created. Values supplied on the
    command line are then layered on top of it.

    Args:
        args: Namespace holding the parsed CLI options (``config``, ``models``,
            ``tasks``, ``iterations``, ``judges``, ``output_dir``,
            ``resume_from`` and ``mode``).

    Returns:
        The ``Config`` with the CLI overrides applied and ``mode`` translated
        from the CLI vocabulary to the config vocabulary.
    """
    # Load base configuration from file if provided.
    # NOTE(review): --config is advertised as "YAML or JSON", but only
    # Config.from_yaml is called here — confirm it also accepts JSON files.
    if args.config:
        config = Config.from_yaml(args.config)
    else:
        config = Config()
    # Store the original judges list before potentially overriding models
    original_judges = config.judges[:]
    # Override with command line arguments
    if args.models:
        from src.core.config import ModelConfig

        config.models = [ModelConfig(name=model) for model in args.models]
    if args.tasks:
        # Keep only the requested tasks that have a known definition.
        from src.core.config import TaskConfig
        from src.tasks.task_definitions import get_task_by_name

        filtered_tasks = []
        unknown_tasks = []
        for task_name in args.tasks:
            task_def = get_task_by_name(task_name)
            if not task_def:
                unknown_tasks.append(task_name)
                continue
            filtered_tasks.append(
                TaskConfig(
                    name=task_def.name,
                    description=task_def.description,
                    prompt_template=task_def.prompt,
                )
            )
        # The logger is not set up yet when this runs, so report on stderr
        # instead of dropping unrecognised names without a trace.
        if unknown_tasks:
            print(
                f"Warning: ignoring unknown task(s): {', '.join(unknown_tasks)}",
                file=sys.stderr,
            )
        if filtered_tasks:
            config.tasks = filtered_tasks
        else:
            print(
                "Warning: none of the requested tasks were recognised; "
                "keeping the configured task list",
                file=sys.stderr,
            )
    # Compare against None so an explicit value is never mistaken for
    # "option not provided" (a plain truthiness test would discard 0).
    if args.iterations is not None:
        config.iterations = args.iterations
    # If judges are explicitly specified via CLI, use those
    # Otherwise, preserve the original judges setting (which may be "all" meaning all available models)
    if args.judges:
        config.judges = args.judges
    else:
        # Keep the original judges list - this allows "all" to work with all available models,
        # not just the models specified for generation
        config.judges = original_judges
    if args.output_dir:
        config.output_dir = args.output_dir
    # Set resume configuration
    if args.resume_from:
        config.resume_from = args.resume_from
    # Set execution mode (map CLI values to config values); unmapped values
    # are passed through unchanged.
    mode_mapping = {
        "full": "full-pipeline",
        "generation": "generation-only",
        "evaluation": "judging-only",
    }
    config.mode = mode_mapping.get(args.mode, args.mode)
    return config


def main():
    """Run the benchmark CLI from argument parsing to the final summary.

    Steps: parse the CLI arguments, build the configuration, set up the
    logger, validate the configuration, construct the pipeline and run the
    phase selected by ``config.mode``. With ``--dry-run`` the function returns
    after the pipeline has been constructed. A keyboard interrupt or any
    exception is reported (through the logger once it exists, otherwise via
    ``print``) and the process exits with status 1.
    """
    logger = None
    try:
        args = parse_arguments()
        config = create_config_from_args(args)

        # The logger is created only after the configuration has been built.
        logger = setup_logger(__name__, level=args.log_level)
        logger.info("Starting Multimodal LLM Benchmark System")
        logger.info(f"Configuration: {config.to_dict()}")

        config.validate()
        pipeline = BenchmarkPipeline(config)

        if args.dry_run:
            logger.info("Dry run mode - validating configuration and setup")
            # pipeline.validate_setup()  # TODO: Implement validate_setup method
            logger.info("Dry run completed successfully")
            return

        # Pick the pipeline entry point by name; any mode other than the two
        # partial ones runs the full pipeline.
        phase_methods = {
            "generation-only": "run_generation_phase",
            "judging-only": "run_evaluation_phase",
        }
        method_name = phase_methods.get(config.mode, "run_full_pipeline")
        results = getattr(pipeline, method_name)()

        logger.info("Benchmark completed successfully")
        logger.info(f"Results saved to: {config.output_dir}")

        # Human-readable summary on stdout
        banner = "=" * 80
        print("\n" + banner)
        print("BENCHMARK SUMMARY")
        print(banner)
        print(f"Total models tested: {len(config.models)}")
        print(f"Total tasks completed: {len(config.tasks)}")
        print(f"Execution mode: {config.mode}")
        print(f"Iterations per task: {config.iterations}")
        print(f"Results directory: {config.output_dir}")
        print(banner)
    except KeyboardInterrupt:
        if logger:
            logger.warning("Benchmark interrupted by user")
        else:
            print("Benchmark interrupted by user")
        sys.exit(1)
    except RuntimeError as err:
        # Runtime errors are reported without a traceback.
        if logger:
            logger.error(f"Benchmark error: {err}")
        else:
            print(f"Benchmark error: {err}")
        sys.exit(1)
    except Exception as err:
        # Anything else is unexpected: include the traceback when logging.
        if logger:
            logger.error(f"Unexpected error: {err}", exc_info=True)
        else:
            print(f"Unexpected error: {err}")
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()