mat3ra · pranabdas · May 16, 2026 · May 14, 2026 · May 15, 2026 · May 16, 2026
diff --git a/INTERNAL.md b/INTERNAL.md
@@ -20,6 +20,14 @@ Follow the below instructions to upload/update a tutorial video:
    ```
    whereby `PATH_TO_SAVE_AUDIO` and `PATH_TO_SAVE_NEW_VIDEO` should have the file extension `.mp3` and `.mp4`, respectively.
 
+   Voiceover audio alone (without an accompanying video file) can be generated
+   by omitting the `--file` and `--output` options.
+
+   The Google Cloud Text-to-Speech API has a limit of about 4 minutes of audio
+   per request. Audio longer than this must be split into multiple requests
+   using the `--skip` and `--until` options. Use the exact end time of one
+   segment as its `--until` value and as the `--skip` value of the next segment.
+
 5. Retry step 4 with adjusted `youTubeCaptions` data until the optimal outcome is achieved.
 
 6. Before uploading, make sure that the timings of the `youTubeCaptions` sentences in the metadata file match exactly the duration of their pronunciations in the voiceover. This ensures that the subtitles will be synced correctly to the voice in the final online video version.

diff --git a/scripts/video-manager.py b/scripts/video-manager.py
@@ -180,21 +180,40 @@ def insert_caption(youtube_, youtube_id_, name, content):
     return request.execute()
 
 
-def create_SSML_text(metadata_):
+def create_SSML_text(metadata_, skip=None, until=None):
     """
     Creates SSML text from metadata.
 
     See https://cloud.google.com/text-to-speech/docs/ssml for more information.
+
+    The optional `skip` and `until` arguments allow generating the voiceover
+    in parts to stay within the Google TextToSpeech API's per-request limits.
+    Captions whose `startTime` is before `skip` are dropped, and iteration
+    stops once a caption's `startTime` reaches `until`. Leading silence on
+    the resulting audio is shortened to the gap between `skip` and the first
+    kept caption, so each part can be aligned back to the original timeline
+    with `ffmpeg -itsoffset <skip>`. Use the exact end time of one part as
+    its `until` value and as the `skip` value of the following part.
+
     Args:
         metadata_ (dict): video metadata.
+        skip (str|None): caption timestamp (`HH:MM:SS.MS`) to start from.
+        until (str|None): caption timestamp (`HH:MM:SS.MS`) to stop at.
 
     Returns:
         str
     """
+    skip_ms = caption_time_to_milliseconds(skip) if skip else 0
+    until_ms = caption_time_to_milliseconds(until) if until else None
     text = ""
-    previous_end = 0
+    previous_end = skip_ms
     for caption in metadata_["youTubeCaptions"]:
-        silence = caption_time_to_milliseconds(caption["startTime"]) - previous_end
+        start_ms = caption_time_to_milliseconds(caption["startTime"])
+        if start_ms < skip_ms:
+            continue
+        if until_ms is not None and start_ms >= until_ms:
+            break
+        silence = start_ms - previous_end
         text = "".join((text, f"<break time='{silence}ms'/>", caption["text"]))
         previous_end = caption_time_to_milliseconds(caption["endTime"])
     return "".join(("<speak>", text, "</speak>"))
@@ -230,15 +249,24 @@ def convert_text_to_speech(ssml_text, speech_path):
     update.add_argument('--privacyStatus', default="unlisted", help='video privacy status')
 
     voiceover = subparsers.add_parser('voiceover')
-    voiceover.add_argument('--file', required=True, help='video file path')
+    voiceover.add_argument('--file', help='video file path (required only when --output is set)')
     voiceover.add_argument('--metadata', required=True, help='video metadata file path')
-    voiceover.add_argument('--audio', help='path to store audio file')
+    voiceover.add_argument('--audio', required=True, help='path to store audio file')
     voiceover.add_argument('--output', help='path to store voiceover video file')
     voiceover.add_argument('--privacyStatus', default="unlisted", help='video privacy status')
+    voiceover.add_argument(
+        '--skip', default=None,
+        help='skip captions before this timestamp, e.g. 00:05:30.500 (HH:MM:SS.MS)',
+    )
+    voiceover.add_argument(
+        '--until', default=None,
+        help='stop at captions at/after this timestamp, e.g. 00:10:00.000 (HH:MM:SS.MS)',
+    )
 
     args = argparser.parse_args()
 
-    if not os.path.exists(args.file):
+    video_file = getattr(args, "file", None)
+    if video_file and not os.path.exists(video_file):
         exit("video file does not exist!")
     if not os.path.exists(args.metadata):
         exit("metadata file does not exist!")
@@ -268,6 +296,9 @@ def convert_text_to_speech(ssml_text, speech_path):
         update_metadata(args.metadata, {"youTubeId": youtube_id})
 
     if args.command == "voiceover":
-        ssml_text = create_SSML_text(metadata)
+        ssml_text = create_SSML_text(metadata, skip=args.skip, until=args.until)
         convert_text_to_speech(ssml_text, args.audio)
-        os.system(FFMPEG_COMMAND_TMPL.format(args.file, args.audio, args.output))
+        if args.output:
+            if not args.file:
+                exit("--file is required when --output is set")
+            os.system(FFMPEG_COMMAND_TMPL.format(args.file, args.audio, args.output))