Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions INTERNAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ Follow the below instructions to upload/update a tutorial video:
```
where `PATH_TO_SAVE_AUDIO` and `PATH_TO_SAVE_NEW_VIDEO` must have the file extensions `.mp3` and `.mp4`, respectively.

Voiceover audio alone (without an accompanying video file) can be generated
by omitting the `--file` and `--output` options.

The Google Cloud Text-to-Speech API has a limit of about 4 minutes of audio
per request. Audio longer than this must be split into multiple requests
using the `--skip` and `--until` options. Use the exact `endTime` of the last
caption in a part as that part's `--until` value and as the next part's
`--skip` value.

5. Retry step 4 with adjusted `youTubeCaptions` data until the optimal outcome is achieved.

6. Before uploading, make sure that the timings of the `youTubeCaptions` sentences in the metadata file match exactly the duration of their pronunciations in the voiceover. This ensures that the subtitles will be synced correctly to the voice in the final online video version.
Expand Down
47 changes: 39 additions & 8 deletions scripts/video-manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,21 +180,40 @@ def insert_caption(youtube_, youtube_id_, name, content):
return request.execute()


def create_SSML_text(metadata_, skip=None, until=None):
    """
    Creates SSML text from metadata.

    See https://cloud.google.com/text-to-speech/docs/ssml for more information.

    Every kept caption is emitted as a `<break>` element covering the pause
    since the previous caption ended, followed by the caption text. The
    whole sequence is wrapped in a single `<speak>` element.

    The optional `skip` and `until` arguments allow generating the voiceover
    in parts to stay within the Google TextToSpeech API's per-request limits.
    Captions whose `startTime` is before `skip` are dropped, and iteration
    stops once a caption's `startTime` reaches `until`. Leading silence on
    the resulting audio is shortened to the gap between `skip` and the first
    kept caption, so each part can be aligned back to the original timeline
    with `ffmpeg -itsoffset <skip>`. Please use exact end times of a segment
    as `skip` and `until` values.

    Args:
        metadata_ (dict): video metadata; its "youTubeCaptions" entry is
            iterated, and each caption's "startTime", "endTime" and "text"
            keys are read.
        skip (str|None): caption timestamp (`HH:MM:SS.MS`) to start from.
            A falsy value means "start from the beginning".
        until (str|None): caption timestamp (`HH:MM:SS.MS`) to stop at.
            A falsy value means "no upper bound".

    Returns:
        str
    """
    skip_ms = caption_time_to_milliseconds(skip) if skip else 0
    until_ms = caption_time_to_milliseconds(until) if until else None
    # Collect the fragments and join once at the end instead of rebuilding
    # the whole string on every iteration.
    parts = []
    # Starting from `skip_ms` (rather than 0) makes the first break equal to
    # the gap between `skip` and the first kept caption.
    previous_end = skip_ms
    for caption in metadata_["youTubeCaptions"]:
        start_ms = caption_time_to_milliseconds(caption["startTime"])
        if start_ms < skip_ms:
            continue
        # NOTE(review): stopping at the first caption at/after `until`
        # presumably relies on captions being ordered by startTime - confirm.
        if until_ms is not None and start_ms >= until_ms:
            break
        silence = start_ms - previous_end
        parts.append(f"<break time='{silence}ms'/>")
        parts.append(caption["text"])
        previous_end = caption_time_to_milliseconds(caption["endTime"])
    return "".join(("<speak>", *parts, "</speak>"))
Expand Down Expand Up @@ -230,15 +249,24 @@ def convert_text_to_speech(ssml_text, speech_path):
update.add_argument('--privacyStatus', default="unlisted", help='video privacy status')

voiceover = subparsers.add_parser('voiceover')
voiceover.add_argument('--file', required=True, help='video file path')
voiceover.add_argument('--file', help='video file path (required only when --output is set)')
voiceover.add_argument('--metadata', required=True, help='video metadata file path')
voiceover.add_argument('--audio', help='path to store audio file')
voiceover.add_argument('--audio', required=True, help='path to store audio file')
voiceover.add_argument('--output', help='path to store voiceover video file')
voiceover.add_argument('--privacyStatus', default="unlisted", help='video privacy status')
voiceover.add_argument(
'--skip', default=None,
help='skip captions before this timestamp, e.g. 00:05:30.500 (HH:MM:SS.MS)',
)
voiceover.add_argument(
'--until', default=None,
help='stop at captions at/after this timestamp, e.g. 00:10:00.000 (HH:MM:SS.MS)',
)

args = argparser.parse_args()

if not os.path.exists(args.file):
video_file = getattr(args, "file", None)
if video_file and not os.path.exists(video_file):
exit("video file does not exist!")
if not os.path.exists(args.metadata):
exit("metadata file does not exist!")
Expand Down Expand Up @@ -268,6 +296,9 @@ def convert_text_to_speech(ssml_text, speech_path):
update_metadata(args.metadata, {"youTubeId": youtube_id})

if args.command == "voiceover":
    # Validate the option combination up front, so that a missing --file is
    # reported before convert_text_to_speech runs and the audio is written.
    if args.output and not args.file:
        exit("--file is required when --output is set")
    ssml_text = create_SSML_text(metadata, skip=args.skip, until=args.until)
    convert_text_to_speech(ssml_text, args.audio)
    # Without --output only the audio file is produced; otherwise the ffmpeg
    # command template is run with the video, audio and output paths.
    if args.output:
        os.system(FFMPEG_COMMAND_TMPL.format(args.file, args.audio, args.output))