# training.py

import os
import tensorflow as tf
import numpy as np
from pathlib import Path
from typing import Tuple, Dict, Generator
from tensorflow.keras.utils import Progbar
import datetime
from typing import Tuple, List, Dict
import re


class FogDataset:
    """
    Dataset handler for fog removal GAN training.

    Builds batched ``tf.data.Dataset`` objects of (input, target) image
    pairs, resized to ``image_size`` and scaled to [-1, 1].

    Expects directory structure:
    dataset/
        train/
            input/
                image1.png
                image2.png
            target/
                image1.png
                image2.png
        val/
            input/
                ...
            target/
                ...
        test/
            input/
                ...
            target/
                ...

    Input and target files are paired by file stem after removing an
    optional ``-input``/``-inputs`` or ``-target``/``-targets`` marker.
    """

    # File extensions (compared lower-cased) that are treated as images.
    _IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg')

    def __init__(self, 
                 root_dir: str,
                 batch_size: int = 1,
                 image_size: Tuple[int, int] = (256, 256),
                 buffer_size: int = 1000):
        """
        Args:
            root_dir: Directory containing one sub-directory per mode.
            batch_size: Number of image pairs per batch.
            image_size: Size every image is resized to.
            buffer_size: Upper bound for the shuffle buffer used in 'train' mode.
        """
        self.root_dir = Path(root_dir)
        self.batch_size = batch_size
        self.image_size = image_size
        self.buffer_size = buffer_size

    def _list_images(self, directory: Path) -> List[Path]:
        """Return the image files directly inside ``directory``, sorted by path."""
        return sorted(
            entry for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() in self._IMAGE_SUFFIXES
        )

    @staticmethod
    def _pair_key(stem: str, role: str) -> str:
        """Strip the ``-<role>s`` / ``-<role>`` marker from a file stem."""
        # The plural marker must be removed first: stripping '-input' from
        # 'x-inputs' would leave a stray 's' that the plural pattern can
        # never match afterwards.
        return stem.replace(f'-{role}s', '').replace(f'-{role}', '')

    def _find_matching_pairs(self, input_dir: Path, target_dir: Path) -> List[Tuple[Path, Path]]:
        """
        Find matching pairs between input and target directories.

        Args:
            input_dir: Directory holding the input images.
            target_dir: Directory holding the target images.

        Returns:
            (input_path, target_path) tuples ordered by input path. Inputs
            without a matching target are reported on stdout and omitted.
        """
        input_files = self._list_images(input_dir)
        target_files = self._list_images(target_dir)
        
        print(f"\nFound {len(input_files)} input files and {len(target_files)} target files")
        
        # Map each target's base name to its path; if two targets share a
        # base name, the later one in sort order wins.
        target_map = {
            self._pair_key(target_path.stem, 'target'): target_path
            for target_path in target_files
        }
        
        # Match input files with their targets
        pairs = []
        unmatched = []
        for input_path in input_files:
            match = target_map.get(self._pair_key(input_path.stem, 'input'))
            if match is None:
                unmatched.append(input_path)
            else:
                pairs.append((input_path, match))
        
        print(f"Successfully paired {len(pairs)} images")
        if unmatched:
            print(f"Warning: {len(unmatched)} input images could not be matched:")
            for path in unmatched[:5]:  # Show first 5 unmatched
                print(f"  {path.name}")
            if len(unmatched) > 5:
                print(f"  ... and {len(unmatched) - 5} more")
        
        return pairs

    def _load_and_preprocess(self, input_path: str, target_path: str) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Load and preprocess an image pair.

        Args:
            input_path: Path of the input image.
            target_path: Path of the target image.

        Returns:
            (input, target) float32 images resized to ``self.image_size``
            and scaled to [-1, 1].
        """
        def load_image(path: str) -> tf.Tensor:
            raw = tf.io.read_file(path)
            # decode_image detects the format from the file contents, so the
            # decoder no longer depends on how the extension is spelled.
            # expand_animations=False keeps the result 3-D.
            img = tf.io.decode_image(raw, channels=3, expand_animations=False)
            img = tf.cast(img, tf.float32)
            img = tf.image.resize(img, self.image_size, method='bicubic')
            # Bicubic interpolation can overshoot the source range; clip so
            # the normalized result really stays inside [-1, 1].
            img = tf.clip_by_value(img, 0.0, 255.0)
            return (img / 127.5) - 1  # Normalize to [-1, 1]

        return load_image(input_path), load_image(target_path)

    def get_dataset(self, mode: str = 'train') -> tf.data.Dataset:
        """
        Create tensorflow dataset for specified mode.

        Args:
            mode: Name of the sub-directory of ``root_dir`` to read
                (e.g. 'train', 'val', 'test'). Only 'train' is shuffled.

        Returns:
            A batched, prefetched dataset of (input, target) image tensors.

        Raises:
            ValueError: If the input or target directory is missing, or no
                image pairs could be matched.
        """
        mode_dir = self.root_dir / mode
        input_dir = mode_dir / 'input'
        target_dir = mode_dir / 'target'
        
        print(f"\nCreating dataset for mode: {mode}")
        print(f"Input directory: {input_dir}")
        print(f"Target directory: {target_dir}")
        
        # Verify directories exist
        if not (input_dir.exists() and target_dir.exists()):
            raise ValueError(f"Missing directory for {mode} mode")
        
        # Find matching pairs
        pairs = self._find_matching_pairs(input_dir, target_dir)
        
        if not pairs:
            raise ValueError(f"No matching image pairs found for {mode} mode")
        
        # Create dataset from pairs
        input_paths = [str(input_path) for input_path, _ in pairs]
        target_paths = [str(target_path) for _, target_path in pairs]
        
        dataset = tf.data.Dataset.from_tensor_slices((input_paths, target_paths))
        
        # Shuffle the path strings rather than the decoded images: the
        # shuffle buffer then holds short strings instead of float tensors.
        if mode == 'train':
            dataset = dataset.shuffle(min(len(pairs), self.buffer_size))
        
        image_shape = [*self.image_size, 3]

        def load_pair(input_path, target_path):
            input_img, target_img = tf.py_function(
                self._load_and_preprocess,
                [input_path, target_path],
                [tf.float32, tf.float32]
            )
            # py_function outputs carry no static shape; set it explicitly
            return (
                tf.ensure_shape(input_img, image_shape),
                tf.ensure_shape(target_img, image_shape)
            )
        
        dataset = dataset.map(load_pair, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.batch(self.batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        
        # batch() keeps the final partial batch, so round the step count up
        steps_per_epoch = (len(pairs) + self.batch_size - 1) // self.batch_size
        
        # Print dataset info
        print(f"\nDataset created for {mode}:")
        print(f"Number of image pairs: {len(pairs)}")
        print(f"Batch size: {self.batch_size}")
        print(f"Steps per epoch: {steps_per_epoch}")
        
        return dataset

    def verify_dataset(self, mode: str = 'train'):
        """Verify dataset by printing shapes and value ranges of the first batch."""
        dataset = self.get_dataset(mode)
        print(f"\nVerifying {mode} dataset:")
        
        for i, (input_batch, target_batch) in enumerate(dataset.take(1)):
            print(f"\nBatch {i + 1}:")
            print(f"Input shape: {input_batch.shape}")
            print(f"Target shape: {target_batch.shape}")
            print(f"Input range: [{tf.reduce_min(input_batch)}, {tf.reduce_max(input_batch)}]")
            print(f"Target range: [{tf.reduce_min(target_batch)}, {tf.reduce_max(target_batch)}]")

class GANTrainer:
    """
    Handler for GAN training process.

    Drives the epoch loop, periodic validation (PSNR / SSIM), TensorBoard
    logging and best-model checkpointing for a model exposing
    ``generator``, ``discriminator``, ``generator_optimizer``,
    ``discriminator_optimizer`` and ``train_step``.
    """
    def __init__(self,
                 model: tf.keras.Model,
                 dataset: FogDataset,
                 args: Dict):
        """
        Args:
            model: GAN model providing the attributes listed on the class.
            dataset: Source of the 'train' and 'val' datasets.
            args: Run configuration. NOTE: despite the ``Dict`` annotation
                it is read by attribute (``args.epochs``,
                ``args.val_frequency``, ``args.model_name``,
                ``args.restore``), so pass a namespace-like object.
        """
        self.model = model
        self.dataset = dataset
        self.args = args
        
        # Training parameters
        self.epochs = args.epochs
        self.val_frequency = args.val_frequency
        
        # Setup paths
        self.model_dir = Path(args.model_name)
        self.checkpoint_dir = self.model_dir / 'checkpoints'
        self.sample_dir = self.model_dir / 'samples'
        self.log_dir = self.model_dir / 'logs'
        
        # Create directories
        for directory in (self.checkpoint_dir, self.sample_dir, self.log_dir):
            directory.mkdir(parents=True, exist_ok=True)
        
        # Setup checkpointing
        self.checkpoint = tf.train.Checkpoint(
            generator_optimizer=model.generator_optimizer,
            discriminator_optimizer=model.discriminator_optimizer,
            generator=model.generator,
            discriminator=model.discriminator
        )
        self.checkpoint_manager = tf.train.CheckpointManager(
            self.checkpoint, 
            str(self.checkpoint_dir),
            max_to_keep=3
        )
        
        # Setup metrics (running means, reset at the start of every epoch)
        self.train_metrics = {
            'gen_total_loss': tf.keras.metrics.Mean(),
            'gen_gan_loss': tf.keras.metrics.Mean(),
            'gen_l1_loss': tf.keras.metrics.Mean(),
            'gen_vgg_loss': tf.keras.metrics.Mean(),
            'disc_loss': tf.keras.metrics.Mean()
        }
        self.val_metrics = {
            'val_psnr': tf.keras.metrics.Mean(),
            'val_ssim': tf.keras.metrics.Mean()
        }
        
        # Setup summary writers
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.train_summary_writer = tf.summary.create_file_writer(
            str(self.log_dir / f'train_{current_time}')
        )
        self.val_summary_writer = tf.summary.create_file_writer(
            str(self.log_dir / f'val_{current_time}')
        )

    def restore_checkpoint(self):
        """
        Restore from latest checkpoint if available.

        Returns:
            True if a checkpoint was found and restored, False otherwise.
        """
        latest = self.checkpoint_manager.latest_checkpoint
        if not latest:
            return False
        self.checkpoint.restore(latest)
        print(f"Restored from checkpoint: {latest}")
        return True

    def _reset_metrics(self):
        """Reset all metrics at epoch start"""
        for metrics in (self.train_metrics, self.val_metrics):
            for metric in metrics.values():
                metric.reset_state()

    def _update_metrics(self, results: Dict[str, tf.Tensor]):
        """
        Update metrics with latest batch results.

        Args:
            results: Mapping from a ``train_metrics`` key to the batch value.

        Raises:
            KeyError: If ``results`` contains a name that is not a key of
                ``train_metrics``.
        """
        for name, value in results.items():
            self.train_metrics[name].update_state(value)

    def validate(self, val_dataset: tf.data.Dataset) -> Dict[str, float]:
        """
        Run validation loop.

        Args:
            val_dataset: Batches of (input, target) images.

        Returns:
            Mean PSNR and SSIM over ``val_dataset`` as plain floats, keyed
            'val_psnr' and 'val_ssim'.
        """
        # Start from a clean slate so the result covers only this pass,
        # regardless of whether the caller reset the metrics beforehand.
        for metric in self.val_metrics.values():
            metric.reset_state()

        for input_image, target_image in val_dataset:
            generated_image = self.model.generator(input_image, training=False)
            
            # Map both images from [-1, 1] to [0, 255] before comparing
            reference = (target_image + 1) * 127.5
            candidate = (generated_image + 1) * 127.5
            
            self.val_metrics['val_psnr'].update_state(
                tf.image.psnr(reference, candidate, max_val=255)
            )
            self.val_metrics['val_ssim'].update_state(
                tf.image.ssim(reference, candidate, max_val=255)
            )
        
        # Convert to Python floats to honour the annotated return type
        return {name: float(metric.result()) for name, metric in self.val_metrics.items()}

    def train(self):
        """
        Main training loop.

        Trains for ``self.epochs`` epochs, validates every
        ``self.val_frequency`` epochs and saves a checkpoint whenever the
        validation PSNR improves.
        """
        # Setup datasets
        train_dataset = self.dataset.get_dataset('train')
        val_dataset = self.dataset.get_dataset('val')
        
        # Restore checkpoint if available
        if self.args.restore:
            self.restore_checkpoint()
        
        # -inf guarantees the first validation result is always saved
        best_psnr = float('-inf')
        steps_per_epoch = len(train_dataset)
        
        for epoch in range(self.epochs):
            print(f"\nEpoch {epoch + 1}/{self.epochs}")
            self._reset_metrics()
            
            # Progress bar
            progress_bar = Progbar(steps_per_epoch, stateful_metrics=[
                'gen_total_loss', 'disc_loss', 'gen_gan_loss', 'gen_l1_loss', 'gen_vgg_loss'
            ])
            
            # Training loop
            for step, batch in enumerate(train_dataset):
                results = self.model.train_step(batch)
                self._update_metrics(results)
                
                # Update progress bar
                progress_bar.update(
                    step + 1,
                    [(name, metric.result()) for name, metric in self.train_metrics.items()]
                )
                
                # Log training metrics. The generator optimizer's iteration
                # counter is used as the global step: it is the optimizer
                # this trainer checkpoints, so the step survives a restore.
                global_step = self.model.generator_optimizer.iterations
                with self.train_summary_writer.as_default():
                    for name, metric in self.train_metrics.items():
                        tf.summary.scalar(name, metric.result(), step=global_step)
            
            # Validation
            if (epoch + 1) % self.val_frequency != 0:
                continue

            val_results = self.validate(val_dataset)
            print("\nValidation Results:")
            for name, value in val_results.items():
                print(f"{name}: {value:.4f}")
            
            # Log validation metrics
            with self.val_summary_writer.as_default():
                for name, value in val_results.items():
                    tf.summary.scalar(name, value, step=epoch)
            
            # Save best model
            if val_results['val_psnr'] > best_psnr:
                best_psnr = val_results['val_psnr']
                self.checkpoint_manager.save()
                print(f"Saved new best model with PSNR: {best_psnr:.4f}")

class GANInference:
    """
    Handler for GAN inference.

    Restores the generator weights from the latest checkpoint in a
    directory and runs the generator on single images or on a directory
    of ``*-input.png`` files.
    """
    def __init__(self,
                 model: tf.keras.Model,
                 checkpoint_dir: str,
                 image_size: Tuple[int, int] = (256, 256)):
        """
        Args:
            model: Model exposing a ``generator`` attribute.
            checkpoint_dir: Directory searched for the latest checkpoint.
            image_size: Size images are resized to before being fed to the
                generator. Defaults to (256, 256).

        Raises:
            FileNotFoundError: If ``checkpoint_dir`` holds no checkpoint.
        """
        self.model = model
        self.image_size = image_size
        self.checkpoint = tf.train.Checkpoint(
            generator=model.generator
        )
        latest = tf.train.latest_checkpoint(checkpoint_dir)
        # latest_checkpoint returns None when nothing is found, and
        # restoring None is a silent no-op; fail loudly rather than run
        # inference on untrained weights.
        if latest is None:
            raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir}")
        # Only the generator is restored, so the other objects stored in
        # the checkpoint are expected to stay unmatched.
        self.checkpoint.restore(latest).expect_partial()
    
    def process_image(self, image_path: str, output_path: str):
        """
        Process a single image and write the result as PNG.

        Args:
            image_path: Path of the image to process.
            output_path: Path the PNG-encoded generator output is written to.
        """
        # Load and preprocess image
        raw = tf.io.read_file(image_path)
        # decode_image detects the format from the file contents, so inputs
        # are not limited to PNG; expand_animations=False keeps the result 3-D.
        img = tf.io.decode_image(raw, channels=3, expand_animations=False)
        img = tf.cast(img, tf.float32)
        # Bicubic is the same interpolation FogDataset uses, which keeps
        # inference preprocessing in line with training.
        img = tf.image.resize(img, self.image_size, method='bicubic')
        # Bicubic interpolation can overshoot; keep values within [0, 255]
        img = tf.clip_by_value(img, 0.0, 255.0)
        img = (img / 127.5) - 1
        batch = tf.expand_dims(img, 0)
        
        # Generate output and map it from [-1, 1] back to [0, 255]
        generated = self.model.generator(batch, training=False)
        pixels = tf.clip_by_value((generated[0] + 1) * 127.5, 0, 255)
        pixels = tf.cast(pixels, tf.uint8)
        
        # Save output
        tf.io.write_file(output_path, tf.image.encode_png(pixels))
    
    def process_directory(self, input_dir: str, output_dir: str):
        """
        Process all ``*-input.png`` images in a directory.

        Each result is written to ``output_dir`` with the ``-input.png``
        ending replaced by ``-output.png``. ``output_dir`` is created if it
        does not exist.

        Args:
            input_dir: Directory scanned (non-recursively) for inputs.
            output_dir: Directory receiving the generated images.
        """
        source_dir = Path(input_dir)
        destination_dir = Path(output_dir)
        destination_dir.mkdir(parents=True, exist_ok=True)
        
        for img_path in sorted(source_dir.glob('*-input.png')):
            output_name = img_path.name.replace('-input.png', '-output.png')
            self.process_image(str(img_path), str(destination_dir / output_name))
            print(f"Processed {img_path.name}")