From e2d582ff316c6073aac6e16b3abb79765660b157 Mon Sep 17 00:00:00 2001
From: FlyingFathead <flyingfathead@protonmail.com>
Date: Wed, 9 Oct 2024 18:03:22 +0300
Subject: [PATCH] `v0.1707` - more robust media site handling

---
 Dockerfile                   |   5 +-
 config/config.ini            |   8 +-
 src/transcription_handler.py | 147 +----------------------------------
 3 files changed, 14 insertions(+), 146 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b114fa3..6dac21b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:slim-bookworm
+FROM python:3.12-slim
 
 # Install dependencies & clean up after to reduce Docker file size
 RUN apt-get update && apt-get install -y \
@@ -13,6 +13,9 @@ WORKDIR /app
 # Copy the requirements file first to leverage Docker cache
 COPY requirements.txt .
 
+# Upgrade pip, setuptools and wheel
+RUN pip install --upgrade pip setuptools wheel
+
 # Install Python dependencies
 RUN pip3 install --no-cache-dir -r requirements.txt
 
diff --git a/config/config.ini b/config/config.ini
index d79afbd..afaa4c5 100644
--- a/config/config.ini
+++ b/config/config.ini
@@ -78,4 +78,10 @@ download_original_video_domains = rumble.com
 # this is usually recommended, because we will only need the _audio_ for transcription.
 # adding a high-quality video will cause massive file size increases.
 # however, in some cases you might want to turn this off
-use_worst_video_quality = true
\ No newline at end of file
+use_worst_video_quality = true
+
+[VideoDescriptionSettings]
+# Set to True to use only a snippet of the video description
+use_snippet_for_description = False
+# Maximum number of lines to include in the description snippet (only applies when use_snippet_for_description is enabled)
+description_max_lines = 30
diff --git a/src/transcription_handler.py b/src/transcription_handler.py
index ea9e2b9..c82ea6f 100644
--- a/src/transcription_handler.py
+++ b/src/transcription_handler.py
@@ -33,17 +33,10 @@
 from config_loader import ConfigLoader
 config = ConfigLoader.get_config()
 
-# # Load config
-# config = configparser.ConfigParser()
-# config.read('config/config.ini')
-# send_as_files = config.getboolean('TranscriptionSettings', 'sendasfiles', fallback=True)
-# send_as_messages = config.getboolean('TranscriptionSettings', 'sendasmessages', fallback=False)
-
 # Toggle this to use the full description or a snippet.
-USE_SNIPPET_FOR_DESCRIPTION = False
-
+USE_SNIPPET_FOR_DESCRIPTION = config.getboolean('VideoDescriptionSettings', 'use_snippet_for_description', fallback=False)
 # If we're using a snippet of the description, maximum number of lines to include
-DESCRIPTION_MAX_LINES = 30
+DESCRIPTION_MAX_LINES = config.getint('VideoDescriptionSettings', 'description_max_lines', fallback=30)
 
 # Output directory for transcriptions; create if doesn't exist
 output_dir = "transcriptions"
@@ -273,7 +266,7 @@ def get_format_sort_key(fmt):
 
         command = [
             "yt-dlp",
-            "--verbose",
+            # "--verbose",  # uncomment to enable verbose yt-dlp output
             "--format", selected_format_id,
             "--output", video_output_template,
             url
@@ -366,140 +359,6 @@ async def read_stream(stream, lines, log_func):
             raise Exception(f"Failed to download audio: {audio_path}")
         logger.info(f"Audio downloaded successfully: {audio_path}")
 
-# // audio download (OLD method)
-# async def download_audio(url, audio_path):
-#     config = ConfigLoader.get_config()
-#     ytdlp_settings = ConfigLoader.get_ytdlp_domain_settings()
-#     use_cookies = config.getboolean('YTDLPSettings', 'use_cookies', fallback=False)
-#     cookies_file = config.get('YTDLPSettings', 'cookies_file', fallback='config/cookies.txt')
-    
-#     parsed_url = urlparse(url)
-#     domain = parsed_url.netloc.lower()
-#     if domain.startswith('www.'):
-#         domain = domain[4:]  # Remove 'www.'
-
-#     should_download_video = ytdlp_settings['active'] and domain in ytdlp_settings['domains']
-
-#     command = [
-#         "yt-dlp",
-#         "--cache-dir", ".cache",
-#     ]
-
-#     if use_cookies and os.path.exists(cookies_file):
-#         command.extend(["--cookies", cookies_file])
-
-#     if should_download_video:
-#         # Use a separate base path without the .mp3 extension
-#         base_output_path = audio_path.replace('.mp3', '')  # e.g., audio/12345_1618033988
-#         video_output_template = f"{base_output_path}.%(ext)s"  # e.g., audio/12345_1618033988.mp4
-#         command.extend([
-#             "--verbose",  # Add verbose flag            
-#             "--external-downloader", "aria2c",
-#             # "--cache-dir", ".cache",
-#             # things you could try with problematic sites, may not always work:
-#             "--external-downloader-args", "split=1,min-split-size=1M", # 1M splits
-#             # "--format", 'b[height<=480]'  # Prefer the lowest quality format
-#             # "--format", 'worstvideo',  # Select the lowest video quality
-#             # "--concurrent-fragments", "1",  # Reduce concurrent fragment downloads
-#             "--fragment-retries", "10",  # Set retries for failed fragments
-#             "--sleep-interval", "5",  # Wait 5 seconds between retries
-#             # "--merge-output-format", "mp4",  # Ensure merged output format                        
-#             "-o", video_output_template,
-#             url
-#         ])
-#         logger.info("Downloading full video...")
-#     else:
-#         # Download audio-only as mp3
-#         command.extend([
-#             "--extract-audio",
-#             "--audio-format", "mp3",
-#             "-o", audio_path,
-#             url
-#         ])
-#         logger.info("Downloading audio-only...")
-
-#     # Start the subprocess
-#     process = await asyncio.create_subprocess_exec(
-#         *command,
-#         stdout=asyncio.subprocess.PIPE,
-#         stderr=asyncio.subprocess.PIPE
-#     )
-
-#     stdout_lines = []
-#     stderr_lines = []
-
-#     async def read_stream(stream, lines, log_func):
-#         while True:
-#             line = await stream.readline()
-#             if line:
-#                 decoded_line = line.decode().rstrip()
-#                 lines.append(decoded_line)
-#                 log_func(decoded_line)
-#             else:
-#                 break
-
-#     await asyncio.gather(
-#         read_stream(process.stdout, stdout_lines, logger.info),
-#         read_stream(process.stderr, stderr_lines, logger.error)
-#     )
-
-#     await process.wait()
-
-#     if process.returncode != 0:
-#         stderr_output = '\n'.join(stderr_lines)
-#         if any(keyword in stderr_output for keyword in [
-#             "separator is not found",
-#             "chunk exceed the limit",
-#             "Sign in to confirm you're not a bot",
-#             "unable to extract initial player response",
-#             "This video is unavailable",
-#             "ERROR:"
-#         ]):
-#             custom_error_message = (
-#                 "Failed to download audio due to YouTube's anti-bot measures or video restrictions. "
-#                 "Possible reasons include age restrictions, region locks, or the video requiring sign-in. "
-#                 "Please try a different video, or if you're the administrator, consider using cookies with `yt-dlp`."
-#             )
-#             logger.error(f"Error: {custom_error_message}")
-#             raise Exception(custom_error_message)
-#         else:
-#             logger.error(f"yt-dlp failed with error:\n{stderr_output}")
-#             raise Exception(f"Failed to download audio: {stderr_output}")
-
-#     if should_download_video:
-#         video_extensions = ['mp4', 'webm', 'mkv', 'avi', 'mov', 'flv', 'wmv', 'mpg', 'mpeg']
-#         video_file = None
-#         for ext in video_extensions:
-#             potential_video = f"{base_output_path}.{ext}"
-#             if os.path.exists(potential_video):
-#                 video_file = potential_video
-#                 break
-
-#         if not video_file:
-#             logger.error("Failed to locate the downloaded video file.")
-#             raise Exception("Failed to locate the downloaded video file.")
-
-#         logger.info(f"Extracted video file: {video_file}")
-
-#         try:
-#             audio = AudioSegment.from_file(video_file)
-#             audio.export(audio_path, format="mp3")
-#             logger.info(f"Extracted audio to: {audio_path}")
-#         except Exception as e:
-#             logger.error(f"Failed to extract audio from video: {e}")
-#             raise Exception(f"Failed to extract audio from video: {e}")
-
-#         try:
-#             os.remove(video_file)
-#             logger.info(f"Removed temporary video file: {video_file}")
-#         except Exception as e:
-#             logger.warning(f"Failed to remove temporary video file {video_file}: {e}")
-
-#     else:
-#         if not os.path.exists(audio_path):
-#             raise Exception(f"Failed to download audio: {audio_path}")
-#         logger.info(f"Audio downloaded successfully: {audio_path}")
-
 # Read from stream line by line until EOF, call callback on each line.
 async def read_stream(stream, callback):
     while True: