# Source: Upscale-Videos/upscale_manager.py (repository masterface77/Upscale-Videos, branch main)
"""
Video Upscale Manager - CodeFormer based Video Enhancement
Otimizado para talking heads (recepcionista virtual, vídeos com rostos)

Autor: Baseado em CodeFormer (sczhou/CodeFormer)
Uso: python upscale_manager.py --input video.mp4 --output resultado.mp4 --fidelity 0.7
"""

import os
import sys
import cv2
import argparse
import subprocess
import tempfile
import shutil
from pathlib import Path
from tqdm import tqdm
import torch
from torchvision.transforms.functional import normalize

# Importações do CodeFormer
from basicsr.utils import imwrite, img2tensor, tensor2img
from basicsr.utils.download_util import load_file_from_url
from basicsr.utils.misc import gpu_is_available, get_device
from basicsr.utils.registry import ARCH_REGISTRY
from facelib.utils.face_restoration_helper import FaceRestoreHelper


class VideoUpscaleManager:
    """
    Video upscaling manager built on CodeFormer.

    Restores faces in every frame with CodeFormer, upscales the background
    with Real-ESRGAN, and re-attaches the original audio track with FFmpeg.
    """

    # Frame rate used when the container reports an unusable FPS (0 or NaN);
    # cv2.VideoWriter cannot produce a valid file with such a value.
    FALLBACK_FPS = 30.0

    def __init__(self, fidelity=0.7, upscale=2, device=None):
        """
        Store the configuration; models are loaded later by the setup_* methods.

        Args:
            fidelity (float): Fidelity weight (0.0-1.0) passed to CodeFormer as
                ``w``. Lower = more restoration, higher = closer to the input.
            upscale (int): Upscale factor (default: 2).
            device: PyTorch device; auto-detected via ``get_device()`` if None.
        """
        self.fidelity = fidelity
        self.upscale = upscale
        self.device = device if device else get_device()

        print("🚀 Iniciando Video Upscale Manager")
        print(f"📱 Device: {self.device}")
        print(f"🎯 Fidelity: {self.fidelity}")
        print(f"📈 Upscale: {self.upscale}x")

        # URLs of the pre-trained checkpoints
        self.pretrain_model_url = {
            'restoration': 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth',
        }

        # Components are created lazily by the setup_* methods
        self.net = None
        self.face_helper = None
        self.bg_upsampler = None

    def setup_codeformer(self):
        """Build the CodeFormer network and load its pre-trained weights into ``self.net``."""
        print("\n🔧 Configurando CodeFormer...")

        # Instantiate the architecture registered under the name 'CodeFormer'
        self.net = ARCH_REGISTRY.get('CodeFormer')(
            dim_embd=512,
            codebook_size=1024,
            n_head=8,
            n_layers=9,
            connect_list=['32', '64', '128', '256']
        ).to(self.device)

        # Download (if needed) and load the checkpoint
        ckpt_path = load_file_from_url(
            url=self.pretrain_model_url['restoration'],
            model_dir='weights/CodeFormer',
            progress=True,
            file_name=None
        )

        # The weights live under the 'params_ema' key of the checkpoint dict
        checkpoint = torch.load(ckpt_path, map_location=self.device)['params_ema']
        self.net.load_state_dict(checkpoint)
        self.net.eval()

        print("✅ CodeFormer carregado com sucesso!")

    def setup_realesrgan(self):
        """Create the Real-ESRGAN x2 background upsampler in ``self.bg_upsampler``."""
        print("🔧 Configurando Real-ESRGAN para background...")

        from basicsr.archs.rrdbnet_arch import RRDBNet
        from basicsr.utils.realesrgan_utils import RealESRGANer

        # Half precision is enabled only with CUDA, and disabled for GPU
        # models listed below.
        use_half = False
        if torch.cuda.is_available():
            no_half_gpu_list = ['1650', '1660']
            gpu_name = torch.cuda.get_device_name(0)
            if not any(gpu in gpu_name for gpu in no_half_gpu_list):
                use_half = True

        # RRDBNet backbone
        model = RRDBNet(
            num_in_ch=3,
            num_out_ch=3,
            num_feat=64,
            num_block=23,
            num_grow_ch=32,
            scale=2,
        )

        # Tiled upsampler wrapping the backbone
        self.bg_upsampler = RealESRGANer(
            scale=2,
            model_path="https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/RealESRGAN_x2plus.pth",
            model=model,
            tile=400,
            tile_pad=40,
            pre_pad=0,
            half=use_half
        )

        print("✅ Real-ESRGAN configurado!")

    def setup_face_helper(self):
        """Create the face detection/alignment helper in ``self.face_helper``."""
        print("🔧 Configurando Face Helper...")

        self.face_helper = FaceRestoreHelper(
            self.upscale,
            face_size=512,
            crop_ratio=(1, 1),
            det_model='retinaface_resnet50',
            save_ext='png',
            use_parse=True,
            device=self.device
        )

        print("✅ Face Helper configurado!")

    def extract_audio(self, video_path, audio_output):
        """
        Extract the audio track of a video with FFmpeg.

        A lossless stream copy is tried first. Because ``audio_output`` is
        typically an ``.aac`` file, which can only hold AAC streams, a copy of
        any other codec fails; in that case the track is transcoded to AAC
        instead of being silently dropped.

        Args:
            video_path (str): Input video path.
            audio_output (str): Where to write the extracted audio.

        Returns:
            bool: True if an audio file was written, False if the video has no
            audio, FFmpeg is missing, or extraction failed.
        """
        print("\n🎵 Extraindo áudio do vídeo original...")

        for audio_codec in ('copy', 'aac'):
            cmd = [
                'ffmpeg',
                '-i', video_path,
                '-vn',  # drop the video stream
                '-acodec', audio_codec,
                '-y',  # overwrite if it exists
                audio_output
            ]

            try:
                # errors='replace' keeps undecodable bytes in FFmpeg's output
                # from raising and being mistaken for an extraction failure.
                result = subprocess.run(
                    cmd, capture_output=True, text=True, errors='replace'
                )
            except FileNotFoundError:
                print("❌ FFmpeg não encontrado! Instale: conda install -c conda-forge ffmpeg")
                return False
            except Exception as e:
                print(f"❌ Erro ao extrair áudio: {e}")
                return False

            if result.returncode == 0:
                print("✅ Áudio extraído com sucesso!")
                return True

        print("⚠️  Vídeo não possui áudio ou erro na extração")
        return False

    def merge_audio_video(self, video_path, audio_path, output_path):
        """
        Mux the processed video with the original audio using FFmpeg.

        Args:
            video_path (str): Processed video (no audio).
            audio_path (str): Original audio.
            output_path (str): Final video with audio.

        Returns:
            bool: True if FFmpeg exited successfully, False otherwise.
        """
        print("\n🎬 Unindo vídeo processado com áudio original...")

        try:
            cmd = [
                'ffmpeg',
                '-i', video_path,
                '-i', audio_path,
                '-c:v', 'copy',  # copy video without re-encoding
                '-c:a', 'aac',   # encode audio as AAC
                '-strict', 'experimental',
                '-y',
                output_path
            ]

            # errors='replace': see extract_audio
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors='replace'
            )

            if result.returncode == 0:
                print("✅ Áudio e vídeo unidos com sucesso!")
                return True
            else:
                print(f"❌ Erro ao unir: {result.stderr}")
                return False

        except Exception as e:
            print(f"❌ Erro no merge: {e}")
            return False

    def _upscale_background(self, frame):
        """Return ``frame`` upscaled by the background upsampler, or None if there is none."""
        if self.bg_upsampler is None:
            return None
        return self.bg_upsampler.enhance(frame, outscale=self.upscale)[0]

    def _resize_to_output_scale(self, frame):
        """Plain resize of ``frame`` by ``self.upscale`` (used when no upsampler is set)."""
        if self.upscale == 1:
            return frame
        height, width = frame.shape[:2]
        target_size = (int(width * self.upscale), int(height * self.upscale))
        return cv2.resize(frame, target_size, interpolation=cv2.INTER_LINEAR)

    def process_frame(self, frame):
        """
        Restore a single frame with CodeFormer.

        Args:
            frame (numpy.ndarray): Frame in BGR format (OpenCV).

        Returns:
            numpy.ndarray: Processed frame.
        """
        # Drop state left over from the previous frame
        self.face_helper.clean_all()

        # Detect faces and their 5-point landmarks
        self.face_helper.read_image(frame)
        num_faces = self.face_helper.get_face_landmarks_5(
            only_center_face=False,
            resize=640,
            eye_dist_threshold=5
        )

        # No faces: only the background path applies
        if num_faces == 0:
            upscaled = self._upscale_background(frame)
            if upscaled is not None:
                return upscaled
            # Without an upsampler, still resize so that face-less frames have
            # the same size as frames that go through the face path.
            # NOTE(review): assumes paste_faces_to_input_image outputs
            # input size * upscale — confirm against facelib.
            return self._resize_to_output_scale(frame)

        # Crop and align the detected faces
        self.face_helper.align_warp_face()

        # Restore each detected face
        for cropped_face in self.face_helper.cropped_faces:
            # BGR uint8 image -> RGB float tensor normalized to [-1, 1]
            cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
            normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
            cropped_face_t = cropped_face_t.unsqueeze(0).to(self.device)

            try:
                with torch.no_grad():
                    output = self.net(cropped_face_t, w=self.fidelity, adain=True)[0]
                    restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
                del output
                torch.cuda.empty_cache()  # release cached CUDA memory
            except Exception as error:
                # Best effort: fall back to the unrestored crop for this face
                print(f"⚠️  Erro na inferência: {error}")
                restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))

            restored_face = restored_face.astype('uint8')
            self.face_helper.add_restored_face(restored_face, cropped_face)

        # Upscale the background (None when no upsampler is configured)
        bg_img = self._upscale_background(frame)

        # Paste the restored faces back into the frame
        self.face_helper.get_inverse_affine(None)
        restored_img = self.face_helper.paste_faces_to_input_image(
            upsample_img=bg_img,
            draw_box=False
        )

        return restored_img

    def _render_silent_video(self, input_path, video_output):
        """
        Process every frame of ``input_path`` and stream it to ``video_output``.

        Frames are written as soon as they are processed, so memory use does
        not grow with the length of the video.

        Returns:
            int: Number of frames written (0 if the input yielded no frames).

        Raises:
            RuntimeError: If the output video writer cannot be opened.
        """
        capture = cv2.VideoCapture(input_path)
        writer = None
        frame_size = None
        frames_written = 0

        try:
            fps = capture.get(cv2.CAP_PROP_FPS)
            total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

            print("\n📊 Informações do vídeo:")
            print(f"   - FPS: {fps}")
            print(f"   - Total de frames: {total_frames}")

            print("\n🎨 Processando frames...")

            # The reported frame count can be missing; let tqdm run untotaled then
            progress_total = total_frames if total_frames > 0 else None
            with tqdm(total=progress_total, desc="Upscaling", unit="frame") as pbar:
                while True:
                    ret, frame = capture.read()
                    if not ret:
                        break

                    processed_frame = self.process_frame(frame)

                    if writer is None:
                        # The output size is only known after the first frame
                        height, width = processed_frame.shape[:2]
                        frame_size = (width, height)

                        # `not fps > 0` is also true for NaN
                        if not fps > 0:
                            print(f"⚠️  FPS inválido ({fps}); usando {self.FALLBACK_FPS}")
                            fps = self.FALLBACK_FPS

                        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                        writer = cv2.VideoWriter(video_output, fourcc, fps, frame_size)
                        if not writer.isOpened():
                            raise RuntimeError(
                                f"Não foi possível criar o vídeo temporário: {video_output}"
                            )
                    elif (processed_frame.shape[1], processed_frame.shape[0]) != frame_size:
                        # Guard against a frame whose size differs from the
                        # size the writer was opened with.
                        processed_frame = cv2.resize(processed_frame, frame_size)

                    writer.write(processed_frame)

                    # Release cached CUDA memory periodically (every 30 frames)
                    if frames_written % 30 == 0:
                        torch.cuda.empty_cache()

                    frames_written += 1
                    pbar.update(1)
        finally:
            # Always release the native handles, even on error
            capture.release()
            if writer is not None:
                writer.release()

        return frames_written

    def process_video(self, input_path, output_path):
        """
        Process a whole video: restore/upscale every frame and keep the audio.

        Args:
            input_path (str): Input video path.
            output_path (str): Output video path.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            Exception: Any processing error is printed and re-raised.
        """
        # Validate input
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Vídeo não encontrado: {input_path}")

        print(f"\n📹 Processando vídeo: {input_path}")

        # Load models and helpers
        self.setup_codeformer()
        self.setup_realesrgan()
        self.setup_face_helper()

        # Scratch directory for the intermediate audio and silent video
        temp_dir = tempfile.mkdtemp(prefix='upscale_')
        temp_audio = os.path.join(temp_dir, 'audio.aac')
        temp_video = os.path.join(temp_dir, 'video_no_audio.mp4')

        try:
            has_audio = self.extract_audio(input_path, temp_audio)

            frames_written = self._render_silent_video(input_path, temp_video)

            print("\n💾 Salvando vídeo processado...")

            if frames_written > 0:
                # Make sure the destination directory exists before writing
                output_dir = os.path.dirname(os.path.abspath(output_path))
                os.makedirs(output_dir, exist_ok=True)

                if has_audio and os.path.exists(temp_audio):
                    if not self.merge_audio_video(temp_video, temp_audio, output_path):
                        # Do not lose the processed video because muxing failed
                        shutil.copy(temp_video, output_path)
                        print("⚠️  Falha ao unir o áudio; vídeo salvo sem áudio")
                else:
                    # Copy the video without audio
                    shutil.copy(temp_video, output_path)
                    print("⚠️  Vídeo salvo sem áudio (original não tinha áudio)")

                print(f"\n✅ CONCLUÍDO! Vídeo salvo em: {output_path}")
            else:
                print("❌ Nenhum frame foi processado!")

        except Exception as e:
            print(f"\n❌ Erro no processamento: {e}")
            raise
        finally:
            # Remove temporary files
            print("\n🧹 Limpando arquivos temporários...")
            shutil.rmtree(temp_dir, ignore_errors=True)
            torch.cuda.empty_cache()


def main():
    """Command-line entry point: parse the options and run the upscale pipeline."""
    usage_examples = """
Exemplos de uso:
  python upscale_manager.py --input video.mp4 --output resultado.mp4
  python upscale_manager.py --input video.mp4 --output resultado.mp4 --fidelity 0.9
  python upscale_manager.py --input video.mp4 --output resultado.mp4 --fidelity 0.5 --upscale 2
        """

    cli = argparse.ArgumentParser(
        description='Video Upscale Manager - CodeFormer based video enhancement',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Both path options are mandatory strings and differ only in flag and help text
    required_paths = (
        ('--input', 'Caminho do vídeo de entrada (mp4, avi, mov)'),
        ('--output', 'Caminho do vídeo de saída'),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    cli.add_argument(
        '--fidelity',
        type=float,
        default=0.7,
        help='Peso de fidelidade (0.0-1.0). Menor = maior qualidade IA, Maior = mais fiel ao original. Default: 0.7',
    )
    cli.add_argument(
        '--upscale',
        type=int,
        default=2,
        choices=[1, 2],
        help='Fator de upscale. Default: 2',
    )

    options = cli.parse_args()

    # Chained comparison on purpose: it is also False for NaN, so NaN is rejected
    fidelity_in_range = 0.0 <= options.fidelity <= 1.0
    if not fidelity_in_range:
        cli.error("--fidelity deve estar entre 0.0 e 1.0")

    # Build the manager and process the requested video
    upscaler = VideoUpscaleManager(
        fidelity=options.fidelity,
        upscale=options.upscale,
    )
    upscaler.process_video(options.input, options.output)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()