diff --git a/src/lightning/pytorch/profilers/advanced.py b/src/lightning/pytorch/profilers/advanced.py
index c0b4b9953cc33..5e26fc48c2523 100644
--- a/src/lightning/pytorch/profilers/advanced.py
+++ b/src/lightning/pytorch/profilers/advanced.py
@@ -82,7 +82,8 @@ def start(self, action_name: str) -> None:
     def stop(self, action_name: str) -> None:
         pr = self.profiled_actions.get(action_name)
         if pr is None:
-            raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.")
+            log.debug(f"Attempting to stop recording an action ({action_name}) which was never started.")
+            return
         pr.disable()
 
     def _dump_stats(self, action_name: str, profile: cProfile.Profile) -> None:
diff --git a/test_existing_functionality.py b/test_existing_functionality.py
new file mode 100644
index 0000000000000..89322c4c9b0a3
--- /dev/null
+++ b/test_existing_functionality.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""Test to verify existing AdvancedProfiler functionality still works after the fix.
+
+Based on existing tests from the test suite.
+
+"""
+
+import os
+import sys
+import tempfile
+import time
+
+# Make the patched sources importable before pulling in the profiler
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
+
+# Import our modified AdvancedProfiler
+from lightning.pytorch.profilers.advanced import AdvancedProfiler
+
+
+def test_advanced_profiler_deepcopy():
+    """Test that AdvancedProfiler can be deep copied."""
+    from copy import deepcopy
+
+    with tempfile.TemporaryDirectory() as tmp_path:
+        profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler")
+        profiler.describe()
+        try:
+            deepcopy(profiler)
+            print("✓ AdvancedProfiler deepcopy works")
+            return True
+        except Exception as e:
+            print(f"✗ AdvancedProfiler deepcopy failed: {e}")
+            return False
+
+
+def test_advanced_profiler_nested():
+    """Test that AdvancedProfiler handles nested profiling actions."""
+    with tempfile.TemporaryDirectory() as tmp_path:
+        profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler")
+
+        try:
+            with profiler.profile("outer"), profiler.profile("inner"):
+                pass  # Should not raise ValueError
+            print("✓ AdvancedProfiler nested profiling works")
+            return True
+        except Exception as e:
+            print(f"✗ AdvancedProfiler nested profiling failed: {e}")
+            return False
+
+
+def test_advanced_profiler_basic_functionality():
+    """Test basic profiling functionality."""
+    with tempfile.TemporaryDirectory() as tmp_path:
+        profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler")
+
+        try:
+            # Test basic profiling
+            with profiler.profile("test_action"):
+                time.sleep(0.01)  # Small delay to register some activity
+
+            # Test that we can get a summary
+            summary = profiler.summary()
+            if "test_action" not in summary:
+                print("✗ test_action not found in profiler summary")
+                return False
+
+            print("✓ AdvancedProfiler basic functionality works")
+            return True
+        except Exception as e:
+            print(f"✗ AdvancedProfiler basic functionality failed: {e}")
+            return False
+
+
+def test_advanced_profiler_stop_started_action():
+    """Test that stopping a properly started action still works."""
+    with tempfile.TemporaryDirectory() as tmp_path:
+        profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler")
+
+        try:
+            # Start an action
+            profiler.start("test_action")
+
+            # Do some work
+            time.sleep(0.01)
+
+            # Stop the action - this should work
+            profiler.stop("test_action")
+
+            # Verify it's in the summary
+            summary = profiler.summary()
+            if "test_action" not in summary:
+                print("✗ Properly started/stopped action not found in summary")
+                return False
+
print("✓ AdvancedProfiler start/stop of real action works") + return True + except Exception as e: + print(f"✗ AdvancedProfiler start/stop of real action failed: {e}") + return False + + +def test_original_bug_scenario(): + """Test the original bug scenario is now fixed.""" + with tempfile.TemporaryDirectory() as tmp_path: + profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler") + + try: + # Simulate the problematic scenario: stop an action that was never started + # This specifically mimics the "run_test_evaluation" error from the issue + profiler.stop("run_test_evaluation") + profiler.stop("run_validation_evaluation") + profiler.stop("some_random_action") + + # Verify profiler is still functional after these calls + with profiler.profile("after_fix_test"): + time.sleep(0.01) + + summary = profiler.summary() + if "after_fix_test" not in summary: + print("✗ Profiler not functional after stopping nonexistent actions") + return False + + print("✓ Original bug scenario is fixed") + return True + except ValueError as e: + print(f"✗ Original bug still exists - ValueError raised: {e}") + return False + except Exception as e: + print(f"✗ Unexpected error in bug fix test: {e}") + return False + + +def main(): + """Run all tests.""" + print("Testing AdvancedProfiler - verifying fix and existing functionality") + print("=" * 70) + + tests = [ + ("Deepcopy functionality", test_advanced_profiler_deepcopy), + ("Nested profiling", test_advanced_profiler_nested), + ("Basic functionality", test_advanced_profiler_basic_functionality), + ("Start/stop real actions", test_advanced_profiler_stop_started_action), + ("Original bug fix", test_original_bug_scenario), + ] + + results = [] + for test_name, test_func in tests: + print(f"\n--- {test_name} ---") + results.append(test_func()) + + print("\n" + "=" * 70) + print("FINAL RESULTS:") + + passed = sum(results) + total = len(results) + + for i, (test_name, _) in enumerate(tests): + status = "✓ PASS" if results[i] else "✗ FAIL" + print(f"{status}: {test_name}") + + print(f"\nSUMMARY: {passed}/{total} tests passed") + + if passed == total: + print("✓ ALL TESTS PASSED - Fix is working correctly!") + return 0 + print("✗ SOME TESTS FAILED") + return 1 + + +if __name__ == "__main__": + exit(main()) diff --git a/test_fix.py b/test_fix.py new file mode 100644 index 0000000000000..2f520a858e7b8 --- /dev/null +++ b/test_fix.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +"""Simple test script to verify the AdvancedProfiler fix works. + +This reproduces the original bug and verifies it's fixed. 
+ +""" + +import os +import sys +import tempfile + +# Add the source directory to Python path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src")) + +try: + from lightning.pytorch.profilers.advanced import AdvancedProfiler + + print("✓ Successfully imported AdvancedProfiler") +except ImportError as e: + print(f"✗ Failed to import AdvancedProfiler: {e}") + sys.exit(1) + + +def test_stop_nonexistent_action(): + """Test that stopping a non-existent action doesn't raise ValueError.""" + print("\n=== Testing stop of nonexistent action ===") + + with tempfile.TemporaryDirectory() as tmp_dir: + profiler = AdvancedProfiler(dirpath=tmp_dir, filename="test") + + try: + # This should NOT raise ValueError after the fix + profiler.stop("run_test_evaluation") + profiler.stop("some_nonexistent_action") + print("✓ Stopping nonexistent actions completed without error") + return True + except ValueError as e: + print(f"✗ ValueError was still raised: {e}") + return False + except Exception as e: + print(f"✗ Unexpected error: {e}") + return False + + +def test_normal_profiling_still_works(): + """Test that normal profiling functionality still works.""" + print("\n=== Testing normal profiling still works ===") + + with tempfile.TemporaryDirectory() as tmp_dir: + profiler = AdvancedProfiler(dirpath=tmp_dir, filename="test") + + try: + # Normal profiling should still work + with profiler.profile("test_action"): + sum(range(100)) # Some work + + # Should be able to get summary + summary = profiler.summary() + if "test_action" in summary: + print("✓ Normal profiling works correctly") + return True + print("✗ Normal profiling summary doesn't contain expected action") + return False + + except Exception as e: + print(f"✗ Normal profiling failed: {e}") + return False + + +def test_mixed_usage(): + """Test mixed usage of stop on nonexistent and normal profiling.""" + print("\n=== Testing mixed usage ===") + + with tempfile.TemporaryDirectory() as tmp_dir: + profiler = AdvancedProfiler(dirpath=tmp_dir, filename="test") + + try: + # Stop nonexistent action - should not error + profiler.stop("nonexistent1") + + # Normal profiling + with profiler.profile("real_action"): + sum(range(50)) + + # Stop another nonexistent action - should not error + profiler.stop("nonexistent2") + + # Check summary contains the real action + summary = profiler.summary() + if "real_action" in summary: + print("✓ Mixed usage works correctly") + return True + print("✗ Mixed usage failed - real action not in summary") + return False + + except Exception as e: + print(f"✗ Mixed usage failed: {e}") + return False + + +def main(): + """Run all tests.""" + print("Testing AdvancedProfiler fix for issue #9136") + print("=" * 50) + + tests = [ + test_stop_nonexistent_action, + test_normal_profiling_still_works, + test_mixed_usage, + ] + + results = [] + for test in tests: + results.append(test()) + + print("\n" + "=" * 50) + print("SUMMARY:") + + all_passed = all(results) + if all_passed: + print("✓ ALL TESTS PASSED") + print("✓ The fix successfully resolves the issue!") + return 0 + print("✗ SOME TESTS FAILED") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test_profiler_fix_simple.py b/test_profiler_fix_simple.py new file mode 100644 index 0000000000000..ff8dc10645abc --- /dev/null +++ b/test_profiler_fix_simple.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +"""Simple test of just the AdvancedProfiler class to verify the fix. + +This directly tests the modified class without full Lightning dependencies. 
+ +""" + +import cProfile +import io +import logging +import pstats +from collections import defaultdict +from pathlib import Path +from typing import Optional, Union + +log = logging.getLogger(__name__) + + +class AdvancedProfiler: + """Minimal version of AdvancedProfiler to test the fix.""" + + def __init__( + self, + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, + line_count_restriction: float = 1.0, + dump_stats: bool = False, + ) -> None: + self.dirpath = dirpath + self.filename = filename + self.profiled_actions: dict[str, cProfile.Profile] = defaultdict(cProfile.Profile) + self.line_count_restriction = line_count_restriction + self.dump_stats = dump_stats + + def start(self, action_name: str) -> None: + # Disable all profilers before starting a new one + for pr in self.profiled_actions.values(): + pr.disable() + self.profiled_actions[action_name].enable() + + def stop(self, action_name: str) -> None: + pr = self.profiled_actions.get(action_name) + if pr is None: + # This is the fix - log debug instead of raising ValueError + log.debug(f"Attempting to stop recording an action ({action_name}) which was never started.") + return + pr.disable() + + def profile(self, action_name: str): + """Context manager for profiling.""" + + class ProfileContext: + def __init__(self, profiler, action_name): + self.profiler = profiler + self.action_name = action_name + + def __enter__(self): + self.profiler.start(self.action_name) + return self.action_name + + def __exit__(self, exc_type, exc_val, exc_tb): + self.profiler.stop(self.action_name) + + return ProfileContext(self, action_name) + + def summary(self) -> str: + recorded_stats = {} + for action_name, pr in self.profiled_actions.items(): + s = io.StringIO() + ps = pstats.Stats(pr, stream=s).strip_dirs().sort_stats("cumulative") + ps.print_stats(self.line_count_restriction) + recorded_stats[action_name] = s.getvalue() + return str(recorded_stats) + + def teardown(self, stage: Optional[str] = None) -> None: + self.profiled_actions.clear() + + +def test_stop_nonexistent_action(): + """Test that stopping a non-existent action doesn't raise ValueError.""" + print("=== Testing stop of nonexistent action ===") + + profiler = AdvancedProfiler(dirpath="/tmp", filename="test") + + try: + # This should NOT raise ValueError after the fix + profiler.stop("run_test_evaluation") + profiler.stop("some_nonexistent_action") + print("✓ Stopping nonexistent actions completed without error") + return True + except ValueError as e: + print(f"✗ ValueError was still raised: {e}") + return False + except Exception as e: + print(f"✗ Unexpected error: {e}") + return False + + +def test_normal_profiling_still_works(): + """Test that normal profiling functionality still works.""" + print("\n=== Testing normal profiling still works ===") + + profiler = AdvancedProfiler(dirpath="/tmp", filename="test") + + try: + # Normal profiling should still work + with profiler.profile("test_action"): + sum(range(100)) # Some work + + # Should be able to get summary + summary = profiler.summary() + if "test_action" in summary: + print("✓ Normal profiling works correctly") + return True + print("✗ Normal profiling summary doesn't contain expected action") + return False + + except Exception as e: + print(f"✗ Normal profiling failed: {e}") + return False + + +def test_mixed_usage(): + """Test mixed usage of stop on nonexistent and normal profiling.""" + print("\n=== Testing mixed usage ===") + + profiler = AdvancedProfiler(dirpath="/tmp", filename="test") + + 
+    try:
+        # Stop nonexistent action - should not error
+        profiler.stop("nonexistent1")
+
+        # Normal profiling
+        with profiler.profile("real_action"):
+            sum(range(50))
+
+        # Stop another nonexistent action - should not error
+        profiler.stop("nonexistent2")
+
+        # Check summary contains the real action
+        summary = profiler.summary()
+        if "real_action" in summary:
+            print("✓ Mixed usage works correctly")
+            return True
+        print("✗ Mixed usage failed - real action not in summary")
+        return False
+
+    except Exception as e:
+        print(f"✗ Mixed usage failed: {e}")
+        return False
+
+
+def main():
+    """Run all tests."""
+    print("Testing AdvancedProfiler fix for issue #9136")
+    print("=" * 50)
+
+    tests = [
+        test_stop_nonexistent_action,
+        test_normal_profiling_still_works,
+        test_mixed_usage,
+    ]
+
+    results = []
+    for test in tests:
+        results.append(test())
+
+    print("\n" + "=" * 50)
+    print("SUMMARY:")
+
+    all_passed = all(results)
+    if all_passed:
+        print("✓ ALL TESTS PASSED")
+        print("✓ The fix successfully resolves the issue!")
+        return 0
+    print("✗ SOME TESTS FAILED")
+    return 1
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/tests/tests_pytorch/profilers/test_profiler.py b/tests/tests_pytorch/profilers/test_profiler.py
index d48250e63b986..ab29040405c6a 100644
--- a/tests/tests_pytorch/profilers/test_profiler.py
+++ b/tests/tests_pytorch/profilers/test_profiler.py
@@ -708,3 +708,109 @@ def test_profiler_invalid_table_kwargs(tmp_path):
     with pytest.raises(KeyError) as exc_info:
         PyTorchProfiler(table_kwargs={key: None}, dirpath=tmp_path, filename="profile")
     assert exc_info.value.args[0].startswith(f"Found invalid table_kwargs key: {key}.")
+
+
+def test_advanced_profiler_multiple_trainers_test_only_one(tmp_path):
+    """Test that AdvancedProfiler handles multiple trainers where only one runs test.
+
+    This reproduces the bug reported in issue #9136 where having multiple trainers
+    with AdvancedProfiler and only running test on one would cause:
+    ValueError: Attempting to stop recording an action (run_test_evaluation) which was never started.
+
+    """
+    # Create a shared profiler instance
+    profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler")
+
+    # Create multiple trainers that could be used in a grid search scenario
+    model1 = BoringModel()
+    model2 = BoringModel()
+
+    trainer1 = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        limit_test_batches=1,
+        profiler=profiler,
+        logger=False,
+        enable_checkpointing=False,
+    )
+
+    trainer2 = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        limit_test_batches=1,
+        profiler=profiler,
+        logger=False,
+        enable_checkpointing=False,
+    )
+
+    # Simulate grid search where we fit multiple trainers
+    trainer1.fit(model1)
+    trainer2.fit(model2)
+
+    # Simulate finding the "best" trainer and running test only on it
+    # This should not raise ValueError about stopping non-started action
+    test_results = trainer1.test(model1)
+    assert test_results is not None
+
+    # Test should complete without errors
+    assert len(test_results) > 0
+
+
+def test_advanced_profiler_reused_trainer_test(tmp_path):
+    """Test that AdvancedProfiler handles reused trainer calling test multiple times.
+
+    This tests another scenario that could trigger the bug: reusing the same trainer
+    for multiple test calls with profiling.
+
+    """
+    profiler = AdvancedProfiler(dirpath=tmp_path, filename="profiler")
+    model = BoringModel()
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        limit_test_batches=1,
+        profiler=profiler,
+        logger=False,
+        enable_checkpointing=False,
+    )
+
+    # Train the model
+    trainer.fit(model)
+
+    # Run test multiple times - this should not cause profiler state issues
+    test_results1 = trainer.test(model)
+    test_results2 = trainer.test(model)
+
+    assert test_results1 is not None
+    assert test_results2 is not None
+    assert len(test_results1) > 0
+    assert len(test_results2) > 0
+
+
+@pytest.mark.parametrize("stage", ["validation", "test"])
+def test_advanced_profiler_stop_nonexistent_action_no_error(advanced_profiler, stage):
+    """Test that stopping a non-existent action doesn't raise ValueError.
+
+    This test verifies that the defensive fix works: attempting to stop
+    a profiling action that was never started should not crash, but instead
+    log a debug message and continue gracefully.
+
+    """
+    # This should not raise ValueError
+    advanced_profiler.stop(f"run_{stage}_evaluation")
+    advanced_profiler.stop("some_nonexistent_action")
+
+    # Verify the profiler is still functional
+    with advanced_profiler.profile("test_action"):
+        pass
+
+    # Should be able to get summary without issues
+    summary = advanced_profiler.summary()
+    assert isinstance(summary, str)
diff --git a/verify_fix_direct.py b/verify_fix_direct.py
new file mode 100644
index 0000000000000..d208668e31da2
--- /dev/null
+++ b/verify_fix_direct.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""Direct verification of the exact fix applied to AdvancedProfiler.
+
+This tests the specific lines of code that were changed.
+
+"""
+
+
+def test_original_code():
+    """Test what the original code would do (should raise ValueError)."""
+    import cProfile
+    import logging
+    from collections import defaultdict
+
+    logging.getLogger(__name__)  # logger unused here; kept only to mirror the module setup
+    profiled_actions = defaultdict(cProfile.Profile)
+
+    # Original implementation
+    def stop_original(action_name: str) -> None:
+        pr = profiled_actions.get(action_name)
+        if pr is None:
+            raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.")
+        pr.disable()
+
+    try:
+        stop_original("run_test_evaluation")
+        print("✗ Original code did NOT raise ValueError (unexpected)")
+        return False
+    except ValueError as e:
+        print(f"✓ Original code correctly raised ValueError: {e}")
+        return True
+    except Exception as e:
+        print(f"✗ Original code raised unexpected error: {e}")
+        return False
+
+
+def test_fixed_code():
+    """Test what the fixed code does (should log debug and return)."""
+    import cProfile
+
+    # Set up logging to capture debug messages
+    import io
+    import logging
+    from collections import defaultdict
+
+    log_stream = io.StringIO()
+    handler = logging.StreamHandler(log_stream)
+    handler.setLevel(logging.DEBUG)
+
+    log = logging.getLogger(__name__)
+    log.setLevel(logging.DEBUG)
+    log.addHandler(handler)
+
+    profiled_actions = defaultdict(cProfile.Profile)
+
+    # Fixed implementation (what we changed to)
+    def stop_fixed(action_name: str) -> None:
+        pr = profiled_actions.get(action_name)
+        if pr is None:
+            log.debug(f"Attempting to stop recording an action ({action_name}) which was never started.")
+            return
+        pr.disable()
+
+    try:
+        stop_fixed("run_test_evaluation")
+        stop_fixed("some_other_action")
+
+        # Check that debug messages were logged
+        log_output = log_stream.getvalue()
+        if "run_test_evaluation" in log_output and "some_other_action" in log_output:
+            print("✓ Fixed code logged debug messages correctly")
+        else:
+            print("? Fixed code didn't log debug messages (might be a logging level issue)")
+
+        print("✓ Fixed code completed without raising ValueError")
+        return True
+    except Exception as e:
+        print(f"✗ Fixed code raised unexpected error: {e}")
+        return False
+    finally:
+        log.removeHandler(handler)
+
+
+def test_behavior_comparison():
+    """Compare the behavior before and after the fix."""
+    print("=" * 60)
+    print("BEHAVIOR COMPARISON")
+    print("=" * 60)
+
+    print("\n1. Testing original behavior (should raise ValueError):")
+    original_worked = test_original_code()
+
+    print("\n2. Testing fixed behavior (should log debug and continue):")
+    fixed_worked = test_fixed_code()
+
+    return original_worked and fixed_worked
+
+
+def verify_actual_file_was_changed():
+    """Verify that the actual source file contains our fix."""
+    try:
+        with open("src/lightning/pytorch/profilers/advanced.py") as f:
+            content = f.read()
+
+        # Check that our fix is present
+        if 'log.debug(f"Attempting to stop recording an action ({action_name}) which was never started.")' in content:
+            print("✓ Source file contains the debug logging fix")
+            return True
+        print("✗ Source file does not contain the expected fix")
+        return False
+
+    except FileNotFoundError:
+        print("✗ Could not find source file")
+        return False
+    except Exception as e:
+        print(f"✗ Error reading source file: {e}")
+        return False
+
+
+def main():
+    print("DIRECT VERIFICATION OF ADVANCEDPROFILER FIX")
+    print("=" * 60)
+    print("Issue #9136: ValueError when stopping profiling action that was never started")
+    print("Fix: Replace ValueError with debug logging")
+    print()
+
+    # Test the behavioral difference
+    behavior_ok = test_behavior_comparison()
+
+    print("\n" + "=" * 60)
+    print("VERIFICATION RESULTS")
+    print("=" * 60)
+
+    # Verify the actual file was changed
+    file_ok = verify_actual_file_was_changed()
+
+    if behavior_ok and file_ok:
+        print("\n✓ SUCCESS: Fix has been properly implemented and tested!")
+        print("✓ The AdvancedProfiler now handles missing actions gracefully")
+        print("✓ No more ValueError crashes for users with multiple trainers")
+        return 0
+    print("\n✗ FAILURE: Fix verification failed")
+    return 1
+
+
+if __name__ == "__main__":
+    exit(main())
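
Reviewer note: below is a minimal usage sketch (not part of the diff) of the behavior this patch enables. It assumes the patched checkout is importable as `lightning`, and that the profiler module's logger is named after its import path (`lightning.pytorch.profilers.advanced`), since `advanced.py` creates it via `logging.getLogger(__name__)`; the `basicConfig` call may be redundant if logging is already configured.

    import logging

    from lightning.pytorch.profilers import AdvancedProfiler

    # Surface the DEBUG message that the patched stop() emits for unmatched actions.
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("lightning.pytorch.profilers.advanced").setLevel(logging.DEBUG)

    profiler = AdvancedProfiler(dirpath=".", filename="profiler")

    # Before this patch: ValueError("Attempting to stop recording an action
    # (run_test_evaluation) which was never started.")
    # After this patch: a DEBUG log entry, and the call simply returns.
    profiler.stop("run_test_evaluation")

    # Regular start/stop bookkeeping is unchanged.
    profiler.start("real_action")
    sum(range(1000))
    profiler.stop("real_action")
    print(profiler.summary())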