From e2d582ff316c6073aac6e16b3abb79765660b157 Mon Sep 17 00:00:00 2001 From: FlyingFathead Date: Wed, 9 Oct 2024 18:03:22 +0300 Subject: [PATCH] `v0.1707` - more robust media site handling --- Dockerfile | 5 +- config/config.ini | 8 +- src/transcription_handler.py | 147 +---------------------------------- 3 files changed, 14 insertions(+), 146 deletions(-) diff --git a/Dockerfile b/Dockerfile index b114fa3..6dac21b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:slim-bookworm +FROM python:3.12-slim # Install dependencies & clean up after to reduce Docker file size RUN apt-get update && apt-get install -y \ @@ -13,6 +13,9 @@ WORKDIR /app # Copy the requirements file first to leverage Docker cache COPY requirements.txt . +# Upgrade pip, setuptools, and wheel +RUN pip install --upgrade pip setuptools wheel + # Install Python dependencies RUN pip3 install --no-cache-dir -r requirements.txt diff --git a/config/config.ini b/config/config.ini index d79afbd..afaa4c5 100644 --- a/config/config.ini +++ b/config/config.ini @@ -78,4 +78,10 @@ download_original_video_domains = rumble.com # this is usually recommended, because we will only need the _audio_ for transcription. # adding a high-quality video will cause massive file size increases.
# however, in some cases you might want to turn this off -use_worst_video_quality = true \ No newline at end of file +use_worst_video_quality = true + +[VideoDescriptionSettings] +# Set to True to use only a snippet of the video description +use_snippet_for_description = False +# Maximum number of lines to include in the description snippet +description_max_lines = 30 diff --git a/src/transcription_handler.py b/src/transcription_handler.py index ea9e2b9..c82ea6f 100644 --- a/src/transcription_handler.py +++ b/src/transcription_handler.py @@ -33,17 +33,10 @@ from config_loader import ConfigLoader config = ConfigLoader.get_config() -# # Load config -# config = configparser.ConfigParser() -# config.read('config/config.ini') -# send_as_files = config.getboolean('TranscriptionSettings', 'sendasfiles', fallback=True) -# send_as_messages = config.getboolean('TranscriptionSettings', 'sendasmessages', fallback=False) - # Toggle this to use the full description or a snippet. -USE_SNIPPET_FOR_DESCRIPTION = False - +USE_SNIPPET_FOR_DESCRIPTION = config.getboolean('VideoDescriptionSettings', 'use_snippet_for_description', fallback=False) # If we're using a snippet of the description, maximum number of lines to include -DESCRIPTION_MAX_LINES = 30 +DESCRIPTION_MAX_LINES = config.getint('VideoDescriptionSettings', 'description_max_lines', fallback=30) # Output directory for transcriptions; create if doesn't exist output_dir = "transcriptions" @@ -273,7 +266,7 @@ def get_format_sort_key(fmt): command = [ "yt-dlp", - "--verbose", + # "--verbose", # uncomment to set verbose "--format", selected_format_id, "--output", video_output_template, url @@ -366,140 +359,6 @@ async def read_stream(stream, lines, log_func): raise Exception(f"Failed to download audio: {audio_path}") logger.info(f"Audio downloaded successfully: {audio_path}") -# // audio download (OLD method) -# async def download_audio(url, audio_path): -# config = ConfigLoader.get_config() -# ytdlp_settings = 
ConfigLoader.get_ytdlp_domain_settings() -# use_cookies = config.getboolean('YTDLPSettings', 'use_cookies', fallback=False) -# cookies_file = config.get('YTDLPSettings', 'cookies_file', fallback='config/cookies.txt') - -# parsed_url = urlparse(url) -# domain = parsed_url.netloc.lower() -# if domain.startswith('www.'): -# domain = domain[4:] # Remove 'www.' - -# should_download_video = ytdlp_settings['active'] and domain in ytdlp_settings['domains'] - -# command = [ -# "yt-dlp", -# "--cache-dir", ".cache", -# ] - -# if use_cookies and os.path.exists(cookies_file): -# command.extend(["--cookies", cookies_file]) - -# if should_download_video: -# # Use a separate base path without the .mp3 extension -# base_output_path = audio_path.replace('.mp3', '') # e.g., audio/12345_1618033988 -# video_output_template = f"{base_output_path}.%(ext)s" # e.g., audio/12345_1618033988.mp4 -# command.extend([ -# "--verbose", # Add verbose flag -# "--external-downloader", "aria2c", -# # "--cache-dir", ".cache", -# # things you could try with problematic sites, may not always work: -# "--external-downloader-args", "split=1,min-split-size=1M", # 1M splits -# # "--format", 'b[height<=480]' # Prefer the lowest quality format -# # "--format", 'worstvideo', # Select the lowest video quality -# # "--concurrent-fragments", "1", # Reduce concurrent fragment downloads -# "--fragment-retries", "10", # Set retries for failed fragments -# "--sleep-interval", "5", # Wait 5 seconds between retries -# # "--merge-output-format", "mp4", # Ensure merged output format -# "-o", video_output_template, -# url -# ]) -# logger.info("Downloading full video...") -# else: -# # Download audio-only as mp3 -# command.extend([ -# "--extract-audio", -# "--audio-format", "mp3", -# "-o", audio_path, -# url -# ]) -# logger.info("Downloading audio-only...") - -# # Start the subprocess -# process = await asyncio.create_subprocess_exec( -# *command, -# stdout=asyncio.subprocess.PIPE, -# stderr=asyncio.subprocess.PIPE -# ) - 
-# stdout_lines = [] -# stderr_lines = [] - -# async def read_stream(stream, lines, log_func): -# while True: -# line = await stream.readline() -# if line: -# decoded_line = line.decode().rstrip() -# lines.append(decoded_line) -# log_func(decoded_line) -# else: -# break - -# await asyncio.gather( -# read_stream(process.stdout, stdout_lines, logger.info), -# read_stream(process.stderr, stderr_lines, logger.error) -# ) - -# await process.wait() - -# if process.returncode != 0: -# stderr_output = '\n'.join(stderr_lines) -# if any(keyword in stderr_output for keyword in [ -# "separator is not found", -# "chunk exceed the limit", -# "Sign in to confirm you're not a bot", -# "unable to extract initial player response", -# "This video is unavailable", -# "ERROR:" -# ]): -# custom_error_message = ( -# "Failed to download audio due to YouTube's anti-bot measures or video restrictions. " -# "Possible reasons include age restrictions, region locks, or the video requiring sign-in. " -# "Please try a different video, or if you're the administrator, consider using cookies with `yt-dlp`." 
-# ) -# logger.error(f"Error: {custom_error_message}") -# raise Exception(custom_error_message) -# else: -# logger.error(f"yt-dlp failed with error:\n{stderr_output}") -# raise Exception(f"Failed to download audio: {stderr_output}") - -# if should_download_video: -# video_extensions = ['mp4', 'webm', 'mkv', 'avi', 'mov', 'flv', 'wmv', 'mpg', 'mpeg'] -# video_file = None -# for ext in video_extensions: -# potential_video = f"{base_output_path}.{ext}" -# if os.path.exists(potential_video): -# video_file = potential_video -# break - -# if not video_file: -# logger.error("Failed to locate the downloaded video file.") -# raise Exception("Failed to locate the downloaded video file.") - -# logger.info(f"Extracted video file: {video_file}") - -# try: -# audio = AudioSegment.from_file(video_file) -# audio.export(audio_path, format="mp3") -# logger.info(f"Extracted audio to: {audio_path}") -# except Exception as e: -# logger.error(f"Failed to extract audio from video: {e}") -# raise Exception(f"Failed to extract audio from video: {e}") - -# try: -# os.remove(video_file) -# logger.info(f"Removed temporary video file: {video_file}") -# except Exception as e: -# logger.warning(f"Failed to remove temporary video file {video_file}: {e}") - -# else: -# if not os.path.exists(audio_path): -# raise Exception(f"Failed to download audio: {audio_path}") -# logger.info(f"Audio downloaded successfully: {audio_path}") - # Read from stream line by line until EOF, call callback on each line. async def read_stream(stream, callback): while True: