support prompting during subtitle generation by transcription
baxtree committed Dec 30, 2024
1 parent c084ff6 commit 59bdd89
Showing 7 changed files with 144 additions and 18 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -103,6 +103,8 @@ $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/sub
# Generate subtitles by transcribing audiovisual files
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
$ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
$ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
```
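The two new invocations map onto the library-level API introduced further down in this commit (`subaligner/transcriber.py`). Below is a minimal sketch of the equivalent calls, assuming the `Transcriber` signatures shown in that file; the media paths and prompt text are placeholders.

```python
from subaligner.transcriber import Transcriber

# Build a transcriber with the Whisper recipe; "turbo" is one of the flavours
# accepted by the updated -mf choices list.
transcriber = Transcriber(recipe="whisper", flavour="turbo")

# Equivalent of `-ip "your initial prompt"`: the prompt seeds Whisper's decoding context.
subtitle, frame_rate = transcriber.transcribe(
    video_file_path="video.mp4",
    language_code="eng",
    initial_prompt="your initial prompt",
)

# Equivalent of passing `-s subtitle.srt` in transcribe mode: each existing cue is
# re-transcribed individually with its own text supplied as the prompt.
subtitle, frame_rate = transcriber.transcribe_with_subtitle_as_prompts(
    video_file_path="video.mp4",
    subtitle_file_path="subtitle.srt",
    language_code="eng",
)
```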
```
# Alignment on segmented plain texts (double newlines as the delimiter)
2 changes: 2 additions & 0 deletions site/source/usage.rst
@@ -26,6 +26,8 @@ Make sure you have got the virtual environment activated upfront.

(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
(.venv) $ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
(.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt

**Alignment on segmented plain texts (double newlines as the delimiter)**::

31 changes: 22 additions & 9 deletions subaligner/__main__.py
@@ -4,15 +4,15 @@
[-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS]
[-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}] [-tr {helsinki-nlp,whisper,facebook-mbart}] [-tf TRANSLATION_FLAVOUR]
[-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-lgs] [-d] [-q] [-ver]
[-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}] [-ip INITIAL_PROMPT] [-tr {helsinki-nlp,whisper,facebook-mbart}]
[-tf TRANSLATION_FLAVOUR] [-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-lgs] [-d] [-q] [-ver]
Subaligner command line interface (v0.3.7)
Subaligner command line interface
options:
optional arguments:
-h, --help show this help message and exit
-s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...]
File path or URL to the subtitle file (Extensions of supported subtitles: .scc, .tmp, .sami, .stl, .ttml, .dfxp, .srt, .ssa, .ass, .sub, .sbv, .xml, .ytt, .smi, .txt, .vtt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
File path or URL to the subtitle file (Extensions of supported subtitles: .ass, .smi, .scc, .vtt, .stl, .txt, .sbv, .ssa, .sub, .ttml, .xml, .srt, .ytt, .dfxp, .sami, .tmp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
-l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS
Max global log loss for alignment
-so, --stretch_on Switch on stretch on subtitles
@@ -32,8 +32,10 @@
Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]
-mr {whisper}, --transcription_recipe {whisper}
LLM recipe used for transcribing video files
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large}
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}
Flavour variation for a specific LLM recipe supporting transcription
-ip INITIAL_PROMPT, --initial_prompt INITIAL_PROMPT
Optional text to provide transcription context or specific phrases
-tr {helsinki-nlp,whisper,facebook-mbart}, --translation_recipe {helsinki-nlp,whisper,facebook-mbart}
LLM recipe used for translating subtitles
-tf TRANSLATION_FLAVOUR, --translation_flavour TRANSLATION_FLAVOUR
@@ -178,6 +180,13 @@ def main():
choices=[wf.value for wf in WhisperFlavour],
help="Flavour variation for a specific LLM recipe supporting transcription"
)
parser.add_argument(
"-ip",
"--initial_prompt",
type=str,
default=None,
help="Optional text to provide the transcribing context or specific phrases"
)
from subaligner.llm import TranslationRecipe
from subaligner.llm import HelsinkiNLPFlavour
parser.add_argument(
@@ -233,7 +242,8 @@ def main():
parser.print_usage()
sys.exit(21)
elif FLAGS.mode == "transcribe":
FLAGS.subtitle_path = ["{}.srt".format(tempfile.mkstemp()[1])]
if not FLAGS.subtitle_path:
FLAGS.subtitle_path = [tempfile.mkstemp(suffix="_transcribe_temp.srt")[1]]
if FLAGS.mode in ["single", "dual", "script", "transcribe"]:
for subtitle_path in FLAGS.subtitle_path:
if FLAGS.video_path == "":
@@ -345,7 +355,10 @@ def main():
elif FLAGS.mode == "transcribe":
from subaligner.transcriber import Transcriber
transcriber = Transcriber(recipe=FLAGS.transcription_recipe, flavour=FLAGS.transcription_flavour)
subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang)
if "_transcribe_temp" in local_subtitle_path:
subtitle, frame_rate = transcriber.transcribe(video_file_path=local_video_path, language_code=stretch_in_lang, initial_prompt=FLAGS.initial_prompt)
else:
subtitle, frame_rate = transcriber.transcribe_with_subtitle_as_prompts(video_file_path=local_video_path, subtitle_file_path=local_subtitle_path, language_code=stretch_in_lang)
aligned_subs = subtitle.subs
else:
print("ERROR: Unknown mode {}".format(FLAGS.mode))
@@ -422,7 +435,7 @@ def _remove_tmp_files(video_path, subtitle_path, local_video_path, local_subtitl
os.remove(local_video_path)
if subtitle_path.lower().startswith("http") and os.path.exists(local_subtitle_path):
os.remove(local_subtitle_path)
if mode == "transcribe" and os.path.exists(local_subtitle_path):
if mode == "transcribe" and os.path.exists(local_subtitle_path) and "_transcribe_temp" in local_subtitle_path:
os.remove(local_subtitle_path)


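The dispatch above hinges on a filename marker rather than a separate flag: when no `-s` is given in transcribe mode, a throwaway SRT path carrying `_transcribe_temp` is created, and that same marker later selects plain transcription and triggers cleanup in `_remove_tmp_files`. A minimal re-statement of that flow, with hypothetical helper names for illustration only:

```python
import tempfile

def ensure_transcribe_subtitle_path(subtitle_paths):
    # No -s supplied: fabricate a throwaway SRT whose name carries the marker.
    if not subtitle_paths:
        subtitle_paths = [tempfile.mkstemp(suffix="_transcribe_temp.srt")[1]]
    return subtitle_paths

def is_throwaway(subtitle_path):
    # The marker decides between transcribe() and transcribe_with_subtitle_as_prompts(),
    # and whether the file is deleted after alignment.
    return "_transcribe_temp" in subtitle_path

paths = ensure_transcribe_subtitle_path([])
assert is_throwaway(paths[0])
```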
72 changes: 67 additions & 5 deletions subaligner/transcriber.py
@@ -1,13 +1,12 @@
import os
import whisper
import torch
from typing import Tuple, Optional
from pysrt import SubRipTime
from whisper.tokenizer import LANGUAGES
from .translator import Translator
from .subtitle import Subtitle
from .media_helper import MediaHelper
from .llm import TranscriptionRecipe, WhisperFlavour
from .singleton import Singleton
from .logger import Logger
from .utils import Utils
from .exception import NoFrameRateException, TranscriptionException
@@ -30,18 +29,23 @@ def __init__(self, recipe: str = TranscriptionRecipe.WHISPER.value, flavour: str
if recipe == TranscriptionRecipe.WHISPER.value:
if flavour not in [f.value for f in WhisperFlavour]:
raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}")
self.__model = whisper.load_model(flavour)
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
self.__model = whisper.load_model(flavour, device=device)
self.__recipe = recipe
self.__flavour = flavour
self.__media_helper = MediaHelper()
self.__LOGGER = Logger().get_logger(__name__)

def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle, Optional[float]]:
def transcribe(self, video_file_path: str, language_code: str, initial_prompt: Optional[str] = None) -> Tuple[Subtitle, Optional[float]]:
"""Transcribe an audiovisual file and generate subtitles.
Arguments:
video_file_path {string} -- The input video file path.
language_code {string} -- An alpha 3 language code derived from ISO 639-3.
initial_prompt {string} -- Optional text to provide transcription context or specific phrases.
Returns:
tuple: Generated subtitle after transcription and the detected frame rate
@@ -59,7 +63,8 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle
audio = whisper.load_audio(audio_file_path)
self.__LOGGER.info("Start transcribing the audio...")
verbose = False if Logger.VERBOSE and not Logger.QUIET else None
result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose)
self.__LOGGER.debug("Prompting with: '%s'" % initial_prompt)
result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose, initial_prompt=initial_prompt)
self.__LOGGER.info("Finished transcribing the audio")
srt_str = ""
for i, segment in enumerate(result["segments"], start=1):
@@ -77,6 +82,63 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle
else:
raise NotImplementedError(f"{self.__recipe} ({self.__flavour}) is not supported")

def transcribe_with_subtitle_as_prompts(self, video_file_path: str, subtitle_file_path: str, language_code: str) -> Tuple[Subtitle, Optional[float]]:
"""Transcribe an audiovisual file and generate subtitles using the original subtitle as prompts.
Arguments:
video_file_path {string} -- The input video file path.
subtitle_file_path {string} -- The input subtitle file path to provide prompts.
language_code {string} -- An alpha 3 language code derived from ISO 639-3.
Returns:
tuple: Generated subtitle after transcription and the detected frame rate
Raises:
TranscriptionException: Thrown when transcription fails.
NotImplementedError: Thrown when the LLM recipe is not supported.
"""
if self.__recipe == "whisper":
lang = Utils.get_iso_639_alpha_2(language_code)
if lang not in LANGUAGES:
raise TranscriptionException(
f'"{language_code}" is not supported by {self.__recipe} ({self.__flavour})')
audio_file_path = self.__media_helper.extract_audio(video_file_path, True, 16000)
subtitle = Subtitle.load(subtitle_file_path)
segment_paths = []
try:
srt_str = ""
srt_idx = 1
self.__LOGGER.info("Start transcribing the audio...")
verbose = False if Logger.VERBOSE and not Logger.QUIET else None
for sub in subtitle.subs:
segment_path, _ = self.__media_helper.extract_audio_from_start_to_end(audio_file_path, str(sub.start), str(sub.end))
segment_paths.append(segment_path)
audio = whisper.load_audio(segment_path)
result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose, initial_prompt=sub.text)
original_start_in_secs = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000.0
original_end_in_secs = sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds + sub.end.milliseconds / 1000.0
for segment in result["segments"]:
if segment["end"] <= segment["start"]:
continue
srt_str += f"{srt_idx}\n" \
f"{Utils.format_timestamp(original_start_in_secs + segment['start'])} --> {Utils.format_timestamp(min(original_start_in_secs + segment['end'], original_end_in_secs))}\n" \
f"{segment['text'].strip().replace('-->', '->')}\n" \
"\n"
srt_idx += 1
self.__LOGGER.info("Finished transcribing the audio")
subtitle = Subtitle.load_subrip_str(srt_str)
subtitle, frame_rate = self.__on_frame_timecodes(subtitle, video_file_path)
self.__LOGGER.debug("Generated the raw subtitle")
return subtitle, frame_rate
finally:
if os.path.exists(audio_file_path):
os.remove(audio_file_path)
for segment_path in segment_paths:
if os.path.exists(segment_path):
os.remove(segment_path)
else:
raise NotImplementedError(f"{self.__recipe} ({self.__flavour}) is not supported")

def __on_frame_timecodes(self, subtitle: Subtitle, video_file_path: str) -> Tuple[Subtitle, Optional[float]]:
frame_rate = None
try:
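The core of `transcribe_with_subtitle_as_prompts` is the timecode arithmetic: each cue's audio is cut out and re-transcribed with the cue text as `initial_prompt`, and the resulting segment times (relative to the cut) are shifted back by the cue's original start and clamped to its original end. A minimal sketch of just that remapping, with illustrative names:

```python
from typing import Dict, List, Tuple

def remap_segment_times(segments: List[Dict[str, float]],
                        cue_start_secs: float,
                        cue_end_secs: float) -> List[Tuple[float, float]]:
    """Shift Whisper segment times from the cut-out clip back onto the full
    timeline and clamp them to the original cue window."""
    remapped = []
    for segment in segments:
        if segment["end"] <= segment["start"]:
            # Degenerate segments are dropped, as in the method above.
            continue
        start = cue_start_secs + segment["start"]
        end = min(cue_start_secs + segment["end"], cue_end_secs)
        remapped.append((start, end))
    return remapped

# e.g. a cue spanning 10.0s-13.5s whose clip produced a segment at 0.2s-4.0s
# is remapped to (10.2, 13.5).
print(remap_segment_times([{"start": 0.2, "end": 4.0}], 10.0, 13.5))
```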
17 changes: 13 additions & 4 deletions tests/integration/feature/subaligner.feature
@@ -236,16 +236,25 @@ Feature: Subaligner CLI
| subaligner_1pass | <NULL> | "test.srt" | eng,fra | "test_aligned.srt" |
| subaligner_2pass | <NULL> | "test.srt" | eng,deu | "test_aligned.srt" |

@transcription
@transcription @custom-prompt
Scenario Outline: Test transcription on audiovisual input and subtitle generation
Given I have a video file <video-in>
When I run the alignment with <aligner> on them with <mode> stage with <language> language, <recipe> recipe and <flavour> flavour and prompt <prompt>
Then a new subtitle file <subtitle-out> is generated
Examples:
| video-in | aligner | mode | language | recipe | flavour | prompt | subtitle-out |
| "test.mp4" | subaligner | transcribe | eng | whisper | tiny | <NULL> | "test_aligned.srt" |
| "test.wav" | subaligner | transcribe | eng | whisper | tiny | test_prompt | "test_aligned.srt" |

@transcription @subtitle-prompt
Scenario Outline: Test transcription on audiovisual input with original subtitle as prompts and subtitle generation
Given I have a video file <video-in>
And I have a subtitle file <subtitle-in>
When I run the alignment with <aligner> on them with <mode> stage with <language> language, <recipe> recipe and <flavour> flavour
Then a new subtitle file <subtitle-out> is generated
Examples:
| video-in | aligner | mode | subtitle-in | language | recipe | flavour | subtitle-out |
| "test.mp4" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" |
| "test.wav" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" |
| video-in | aligner | mode | subtitle-in | language | recipe | flavour | subtitle-out |
| "test.mp4" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" |

@batch
Scenario Outline: Test batch alignment
27 changes: 27 additions & 0 deletions tests/integration/radish/step.py
@@ -125,12 +125,39 @@ def run_subaligner_with_translation(step, aligner, mode, language_pair):
step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS)


@when('I run the alignment with {aligner:S} on them with {mode:S} stage with {language:S} language, {recipe:S} recipe and {flavour:S} flavour and prompt {prompt:S}')
def run_subaligner_with_transcription(step, aligner, mode, language, recipe, flavour, prompt):
if prompt == "<NULL>":
process = subprocess.Popen([
os.path.join(PWD, "..", "..", "..", "bin", aligner),
"-m", mode,
"-v", step.context.video_file_path,
"-ml", language,
"-mr", recipe,
"-mf", flavour,
"-o", os.path.join(PWD, "..", "..", "subaligner", "resource", "test_aligned.srt"),
"-q"], shell=False)
else:
process = subprocess.Popen([
os.path.join(PWD, "..", "..", "..", "bin", aligner),
"-m", mode,
"-v", step.context.video_file_path,
"-ml", language,
"-mr", recipe,
"-mf", flavour,
"-ip", prompt,
"-o", os.path.join(PWD, "..", "..", "subaligner", "resource", "test_aligned.srt"),
"-q"], shell=False)
step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS)


@when('I run the alignment with {aligner:S} on them with {mode:S} stage with {language:S} language, {recipe:S} recipe and {flavour:S} flavour')
def run_subaligner_with_transcription(step, aligner, mode, language, recipe, flavour):
process = subprocess.Popen([
os.path.join(PWD, "..", "..", "..", "bin", aligner),
"-m", mode,
"-v", step.context.video_file_path,
"-s", step.context.subtitle_path_or_selector,
"-ml", language,
"-mr", recipe,
"-mf", flavour,
11 changes: 11 additions & 0 deletions tests/subaligner/test_transcriber.py
@@ -9,13 +9,24 @@ class TranscriberTest(unittest.TestCase):

def setUp(self) -> None:
self.video_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource", "test.mp4")
self.srt_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource/test.srt")
self.undertest = Undertest(recipe=TranscriptionRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value)

def test_transcribe(self):
subtitle, frame_rate = self.undertest.transcribe(self.video_file_path, "eng")
assert len(subtitle.subs) > 0
assert frame_rate == 24

def test_transcribe_with_initial_prompt(self):
subtitle, frame_rate = self.undertest.transcribe(self.video_file_path, "eng", initial_prompt="This is a testing prompt")
assert len(subtitle.subs) > 0
assert frame_rate == 24

def test_transcribe_with_subtitle_as_prompts(self):
subtitle, frame_rate = self.undertest.transcribe_with_subtitle_as_prompts(self.video_file_path, self.srt_file_path, "eng")
assert len(subtitle.subs) > 0
assert frame_rate == 24

def test_throw_exception_on_unknown_recipe(self):
try:
Undertest(recipe="unknown")
