support prompting during subtitle generation by transcription
baxtree committed Dec 30, 2024
1 parent c084ff6 commit 59bdd89
Showing 7 changed files with 144 additions and 18 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -103,6 +103,8 @@ $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/sub
# Generate subtitles by transcribing audiovisual files
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
$ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
$ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
```
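The two new invocations map onto the library-level API introduced further down in this commit (`subaligner/transcriber.py`). Below is a minimal sketch of the equivalent calls, assuming the `Transcriber` signatures shown in that file; the media paths and prompt text are placeholders.

```python
from subaligner.transcriber import Transcriber

# Build a transcriber with the Whisper recipe; "turbo" is one of the flavours
# accepted by the updated -mf choices list.
transcriber = Transcriber(recipe="whisper", flavour="turbo")

# Equivalent of `-ip "your initial prompt"`: the prompt seeds Whisper's decoding context.
subtitle, frame_rate = transcriber.transcribe(
    video_file_path="video.mp4",
    language_code="eng",
    initial_prompt="your initial prompt",
)

# Equivalent of passing `-s subtitle.srt` in transcribe mode: each existing cue is
# re-transcribed individually with its own text supplied as the prompt.
subtitle, frame_rate = transcriber.transcribe_with_subtitle_as_prompts(
    video_file_path="video.mp4",
    subtitle_file_path="subtitle.srt",
    language_code="eng",
)
```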
```
# Alignment on segmented plain texts (double newlines as the delimiter)
2 changes: 2 additions & 0 deletions site/source/usage.rst
@@ -26,6 +26,8 @@ Make sure you have got the virtual environment activated upfront.

(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
(.venv) $ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
(.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt

**Alignment on segmented plain texts (double newlines as the delimiter)**::

31 changes: 22 additions & 9 deletions subaligner/__main__.py
@@ -4,15 +4,15 @@
[-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS]
[-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}] [-tr {helsinki-nlp,whisper,facebook-mbart}] [-tf TRANSLATION_FLAVOUR]
[-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-lgs] [-d] [-q] [-ver]
[-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}] [-ip INITIAL_PROMPT] [-tr {helsinki-nlp,whisper,facebook-mbart}]
[-tf TRANSLATION_FLAVOUR] [-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-lgs] [-d] [-q] [-ver]
Subaligner command line interface (v0.3.7)
Subaligner command line interface
options:
optional arguments:
-h, --help show this help message and exit
-s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...]
File path or URL to the subtitle file (Extensions of supported subtitles: .scc, .tmp, .sami, .stl, .ttml, .dfxp, .srt, .ssa, .ass, .sub, .sbv, .xml, .ytt, .smi, .txt, .vtt) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
File path or URL to the subtitle file (Extensions of supported subtitles: .ass, .smi, .scc, .vtt, .stl, .txt, .sbv, .ssa, .sub, .ttml, .xml, .srt, .ytt, .dfxp, .sami, .tmp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
-l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS
Max global log loss for alignment
-so, --stretch_on Switch on stretch on subtitles
@@ -32,8 +32,10 @@
Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]
-mr {whisper}, --transcription_recipe {whisper}
LLM recipe used for transcribing video files
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large}
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}
Flavour variation for a specific LLM recipe supporting transcription
-ip INITIAL_PROMPT, --initial_prompt INITIAL_PROMPT
Optional text to provide transcription context or specific phrases
-tr {helsinki-nlp,whisper,facebook-mbart}, --translation_recipe {helsinki-nlp,whisper,facebook-mbart}
LLM recipe used for translating subtitles
-tf TRANSLATION_FLAVOUR, --translation_flavour TRANSLATION_FLAVOUR
@@ -178,6 +180,13 @@ def main():
choices=[wf.value for wf in WhisperFlavour],
help="Flavour variation for a specific LLM recipe supporting transcription"
)
parser.add_argument(
"-ip",
"--initial_prompt",
type=str,
default=None,
help="Optional text to provide the transcribing context or specific phrases"
)
from subaligner.llm import TranslationRecipe
from subaligner.llm import HelsinkiNLPFlavour
parser.add_argument(
@@ -233,7 +242,8 @@ def main():
parser.print_usage()
sys.exit(21)
elif FLAGS.mode == "transcribe":
FLAGS.subtitle_path = ["{}.srt".format(tempfile.mkstemp()[1])]
if not FLAGS.subtitle_path:
FLAGS.subtitle_path = [tempfile.mkstemp(suffix="_transcribe_temp.srt")[1]]
if FLAGS.mode in ["single", "dual", "script", "transcribe"]:
for subtitle_path in FLAGS.subtitle_path:
if FLAGS.video_path == "":
@@ -345,7 +355,10 @@ def main():
elif FLAGS.mode == "transcribe":
from subaligner.transcriber import Transcriber
transcriber = Transcriber(recipe=FLAGS.transcription_recipe, flavour=FLAGS.transcription_flavour)
subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang)
if "_transcribe_temp" in local_subtitle_path:
subtitle, frame_rate = transcriber.transcribe(video_file_path=local_video_path, language_code=stretch_in_lang, initial_prompt=FLAGS.initial_prompt)
else:
subtitle, frame_rate = transcriber.transcribe_with_subtitle_as_prompts(video_file_path=local_video_path, subtitle_file_path=local_subtitle_path, language_code=stretch_in_lang)
aligned_subs = subtitle.subs
else:
print("ERROR: Unknown mode {}".format(FLAGS.mode))
@@ -422,7 +435,7 @@ def _remove_tmp_files(video_path, subtitle_path, local_video_path, local_subtitl
os.remove(local_video_path)
if subtitle_path.lower().startswith("http") and os.path.exists(local_subtitle_path):
os.remove(local_subtitle_path)
if mode == "transcribe" and os.path.exists(local_subtitle_path):
if mode == "transcribe" and os.path.exists(local_subtitle_path) and "_transcribe_temp" in local_subtitle_path:
os.remove(local_subtitle_path)


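The dispatch above hinges on a filename marker rather than a separate flag: when no `-s` is given in transcribe mode, a throwaway SRT path carrying `_transcribe_temp` is created, and that same marker later selects plain transcription and triggers cleanup in `_remove_tmp_files`. A minimal re-statement of that flow, with hypothetical helper names for illustration only:

```python
import tempfile

def ensure_transcribe_subtitle_path(subtitle_paths):
    # No -s supplied: fabricate a throwaway SRT whose name carries the marker.
    if not subtitle_paths:
        subtitle_paths = [tempfile.mkstemp(suffix="_transcribe_temp.srt")[1]]
    return subtitle_paths

def is_throwaway(subtitle_path):
    # The marker decides between transcribe() and transcribe_with_subtitle_as_prompts(),
    # and whether the file is deleted after alignment.
    return "_transcribe_temp" in subtitle_path

paths = ensure_transcribe_subtitle_path([])
assert is_throwaway(paths[0])
```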
72 changes: 67 additions & 5 deletions subaligner/transcriber.py
@@ -1,13 +1,12 @@
import os
import whisper
import torch
from typing import Tuple, Optional
from pysrt import SubRipTime
from whisper.tokenizer import LANGUAGES
from .translator import Translator
from .subtitle import Subtitle
from .media_helper import MediaHelper
from .llm import TranscriptionRecipe, WhisperFlavour
from .singleton import Singleton
from .logger import Logger
from .utils import Utils
from .exception import NoFrameRateException, TranscriptionException
@@ -30,18 +29,23 @@ def __init__(self, recipe: str = TranscriptionRecipe.WHISPER.value, flavour: str
if recipe == TranscriptionRecipe.WHISPER.value:
if flavour not in [f.value for f in WhisperFlavour]:
raise NotImplementedError(f"Unknown {recipe} flavour: {flavour}")
self.__model = whisper.load_model(flavour)
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
self.__model = whisper.load_model(flavour, device=device)
self.__recipe = recipe
self.__flavour = flavour
self.__media_helper = MediaHelper()
self.__LOGGER = Logger().get_logger(__name__)

def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle, Optional[float]]:
def transcribe(self, video_file_path: str, language_code: str, initial_prompt: Optional[str] = None) -> Tuple[Subtitle, Optional[float]]:
"""Transcribe an audiovisual file and generate subtitles.
Arguments:
video_file_path {string} -- The input video file path.
language_code {string} -- An alpha 3 language code derived from ISO 639-3.
initial_prompt {string} -- Optional text to provide transcription context or specific phrases.
Returns:
tuple: Generated subtitle after transcription and the detected frame rate
@@ -59,7 +63,8 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle
audio = whisper.load_audio(audio_file_path)
self.__LOGGER.info("Start transcribing the audio...")
verbose = False if Logger.VERBOSE and not Logger.QUIET else None
result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose)
self.__LOGGER.debug("Prompting with: '%s'" % initial_prompt)
result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose, initial_prompt=initial_prompt)
self.__LOGGER.info("Finished transcribing the audio")
srt_str = ""
for i, segment in enumerate(result["segments"], start=1):
@@ -77,6 +82,63 @@ def transcribe(self, video_file_path: str, language_code: str) -> Tuple[Subtitle
else:
raise NotImplementedError(f"{self.__recipe} ({self.__flavour}) is not supported")

def transcribe_with_subtitle_as_prompts(self, video_file_path: str, subtitle_file_path: str, language_code: str) -> Tuple[Subtitle, Optional[float]]:
"""Transcribe an audiovisual file and generate subtitles using the original subtitle as prompts.
Arguments:
video_file_path {string} -- The input video file path.
subtitle_file_path {string} -- The input subtitle file path to provide prompts.
language_code {string} -- An alpha 3 language code derived from ISO 639-3.
Returns:
tuple: Generated subtitle after transcription and the detected frame rate
Raises:
TranscriptionException: Thrown when transcription fails.
NotImplementedError: Thrown when the LLM recipe is not supported.
"""
if self.__recipe == "whisper":
lang = Utils.get_iso_639_alpha_2(language_code)
if lang not in LANGUAGES:
raise TranscriptionException(
f'"{language_code}" is not supported by {self.__recipe} ({self.__flavour})')
audio_file_path = self.__media_helper.extract_audio(video_file_path, True, 16000)
subtitle = Subtitle.load(subtitle_file_path)
segment_paths = []
try:
srt_str = ""
srt_idx = 1
self.__LOGGER.info("Start transcribing the audio...")
verbose = False if Logger.VERBOSE and not Logger.QUIET else None
for sub in subtitle.subs:
segment_path, _ = self.__media_helper.extract_audio_from_start_to_end(audio_file_path, str(sub.start), str(sub.end))
segment_paths.append(segment_path)
audio = whisper.load_audio(segment_path)
result = self.__model.transcribe(audio, task="transcribe", language=LANGUAGES[lang], verbose=verbose, initial_prompt=sub.text)
original_start_in_secs = sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds + sub.start.milliseconds / 1000.0
original_end_in_secs = sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds + sub.end.milliseconds / 1000.0
for segment in result["segments"]:
if segment["end"] <= segment["start"]:
continue
srt_str += f"{srt_idx}\n" \
f"{Utils.format_timestamp(original_start_in_secs + segment['start'])} --> {Utils.format_timestamp(min(original_start_in_secs + segment['end'], original_end_in_secs))}\n" \
f"{segment['text'].strip().replace('-->', '->')}\n" \
"\n"
srt_idx += 1
self.__LOGGER.info("Finished transcribing the audio")
subtitle = Subtitle.load_subrip_str(srt_str)
subtitle, frame_rate = self.__on_frame_timecodes(subtitle, video_file_path)
self.__LOGGER.debug("Generated the raw subtitle")
return subtitle, frame_rate
finally:
if os.path.exists(audio_file_path):
os.remove(audio_file_path)
for segment_path in segment_paths:
if os.path.exists(segment_path):
os.remove(segment_path)
else:
raise NotImplementedError(f"{self.__recipe} ({self.__flavour}) is not supported")

def __on_frame_timecodes(self, subtitle: Subtitle, video_file_path: str) -> Tuple[Subtitle, Optional[float]]:
frame_rate = None
try:
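The core of `transcribe_with_subtitle_as_prompts` is the timecode arithmetic: each cue's audio is cut out and re-transcribed with the cue text as `initial_prompt`, and the resulting segment times (relative to the cut) are shifted back by the cue's original start and clamped to its original end. A minimal sketch of just that remapping, with illustrative names:

```python
from typing import Dict, List, Tuple

def remap_segment_times(segments: List[Dict[str, float]],
                        cue_start_secs: float,
                        cue_end_secs: float) -> List[Tuple[float, float]]:
    """Shift Whisper segment times from the cut-out clip back onto the full
    timeline and clamp them to the original cue window."""
    remapped = []
    for segment in segments:
        if segment["end"] <= segment["start"]:
            # Degenerate segments are dropped, as in the method above.
            continue
        start = cue_start_secs + segment["start"]
        end = min(cue_start_secs + segment["end"], cue_end_secs)
        remapped.append((start, end))
    return remapped

# e.g. a cue spanning 10.0s-13.5s whose clip produced a segment at 0.2s-4.0s
# is remapped to (10.2, 13.5).
print(remap_segment_times([{"start": 0.2, "end": 4.0}], 10.0, 13.5))
```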
17 changes: 13 additions & 4 deletions tests/integration/feature/subaligner.feature
@@ -236,16 +236,25 @@ Feature: Subaligner CLI
| subaligner_1pass | <NULL> | "test.srt" | eng,fra | "test_aligned.srt" |
| subaligner_2pass | <NULL> | "test.srt" | eng,deu | "test_aligned.srt" |

@transcription
@transcription @custom-prompt
Scenario Outline: Test transcription on audiovisual input and subtitle generation
Given I have a video file <video-in>
When I run the alignment with <aligner> on them with <mode> stage with <language> language, <recipe> recipe and <flavour> flavour and prompt <prompt>
Then a new subtitle file <subtitle-out> is generated
Examples:
| video-in | aligner | mode | language | recipe | flavour | prompt | subtitle-out |
| "test.mp4" | subaligner | transcribe | eng | whisper | tiny | <NULL> | "test_aligned.srt" |
| "test.wav" | subaligner | transcribe | eng | whisper | tiny | test_prompt | "test_aligned.srt" |

@transcription @subtitle-prompt
Scenario Outline: Test transcription on audiovisual input with original subtitle as prompts and subtitle generation
Given I have a video file <video-in>
And I have a subtitle file <subtitle-in>
When I run the alignment with <aligner> on them with <mode> stage with <language> language, <recipe> recipe and <flavour> flavour
Then a new subtitle file <subtitle-out> is generated
Examples:
| video-in | aligner | mode | subtitle-in | language | recipe | flavour | subtitle-out |
| "test.mp4" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" |
| "test.wav" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" |
| video-in | aligner | mode | subtitle-in | language | recipe | flavour | subtitle-out |
| "test.mp4" | subaligner | transcribe | "test.srt" | eng | whisper | tiny | "test_aligned.srt" |

@batch
Scenario Outline: Test batch alignment
27 changes: 27 additions & 0 deletions tests/integration/radish/step.py
@@ -125,12 +125,39 @@ def run_subaligner_with_translation(step, aligner, mode, language_pair):
step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS)


@when('I run the alignment with {aligner:S} on them with {mode:S} stage with {language:S} language, {recipe:S} recipe and {flavour:S} flavour and prompt {prompt:S}')
def run_subaligner_with_transcription(step, aligner, mode, language, recipe, flavour, prompt):
if prompt == "<NULL>":
process = subprocess.Popen([
os.path.join(PWD, "..", "..", "..", "bin", aligner),
"-m", mode,
"-v", step.context.video_file_path,
"-ml", language,
"-mr", recipe,
"-mf", flavour,
"-o", os.path.join(PWD, "..", "..", "subaligner", "resource", "test_aligned.srt"),
"-q"], shell=False)
else:
process = subprocess.Popen([
os.path.join(PWD, "..", "..", "..", "bin", aligner),
"-m", mode,
"-v", step.context.video_file_path,
"-ml", language,
"-mr", recipe,
"-mf", flavour,
"-ip", prompt,
"-o", os.path.join(PWD, "..", "..", "subaligner", "resource", "test_aligned.srt"),
"-q"], shell=False)
step.context.exit_code = process.wait(timeout=WAIT_TIMEOUT_IN_SECONDS)


@when('I run the alignment with {aligner:S} on them with {mode:S} stage with {language:S} language, {recipe:S} recipe and {flavour:S} flavour')
def run_subaligner_with_transcription(step, aligner, mode, language, recipe, flavour):
process = subprocess.Popen([
os.path.join(PWD, "..", "..", "..", "bin", aligner),
"-m", mode,
"-v", step.context.video_file_path,
"-s", step.context.subtitle_path_or_selector,
"-ml", language,
"-mr", recipe,
"-mf", flavour,
11 changes: 11 additions & 0 deletions tests/subaligner/test_transcriber.py
@@ -9,13 +9,24 @@ class TranscriberTest(unittest.TestCase):

def setUp(self) -> None:
self.video_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource", "test.mp4")
self.srt_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource/test.srt")
self.undertest = Undertest(recipe=TranscriptionRecipe.WHISPER.value, flavour=WhisperFlavour.TINY.value)

def test_transcribe(self):
subtitle, frame_rate = self.undertest.transcribe(self.video_file_path, "eng")
assert len(subtitle.subs) > 0
assert frame_rate == 24

def test_transcribe_with_initial_prompt(self):
subtitle, frame_rate = self.undertest.transcribe(self.video_file_path, "eng", initial_prompt="This is a testing prompt")
assert len(subtitle.subs) > 0
assert frame_rate == 24

def test_transcribe_with_subtitle_as_prompts(self):
subtitle, frame_rate = self.undertest.transcribe_with_subtitle_as_prompts(self.video_file_path, self.srt_file_path, "eng")
assert len(subtitle.subs) > 0
assert frame_rate == 24

def test_throw_exception_on_unknown_recipe(self):
try:
Undertest(recipe="unknown")
