video frame rendering using ffmpeg (#997)
* feat: 🎥 add video tasks and frame extraction support

* feat: ✨ add support for custom options in frame extraction

* feat: ✨ add options for frame extraction in CLI

* feat: 🎥 add video frame extraction and tracing support

* feat: ✨ add video transcript generation script

* refactor: ♻️ remove unused imports in ffmpeg module

* refactor: ♻️ rename and update video processing functions

* feat: ✨ add video segmentation and caching utils

* adding caching

* feat: ✨ add SRT/VTT rendering to transcription results

* feat: 📝 update transcript processing for srt format

* feat: ✨ add videoProbe function for video metadata extraction

* feat: ✨ add video support with ffmpeg utilities

* feat: 🎥 add transcript integration for video frame extraction

* feat: ✨ add transcript-based frame extraction example

* feat: ✨ add video alt text generation guide and script

* feat: ✨ add new cards for Speech To Text, Images, Videos

* docs: 📝 fix broken links in transcription and videos docs
pelikhan authored Jan 10, 2025
1 parent 2a0ad04 commit 7a74f59
Showing 27 changed files with 853 additions and 140 deletions.
70 changes: 70 additions & 0 deletions docs/src/content/docs/guides/video-alt-text.genai.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
title: Video Alt Text
sidebar:
order: 50
description: Learn how to generate alt text for videos
keywords: Video
---

import { Code } from "@astrojs/starlight/components"
import src from "../../../../../packages/sample/genaisrc/video-alt-text.genai.mjs?raw"

GenAIScript supports [speech transcription](/genaiscript/reference/scripts/transcription)
and [video frame extraction](/genaiscript/reference/scripts/videos) which can be combined to analyze videos.

## Video Alt Text

The HTML `video` element does not have an `alt` attribute, but you can still attach an accessible description using the `aria-label` attribute.
We will build a script that generates that description from the transcript and video frames.
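For example, the generated description can be attached like this (hypothetical file name and description text):

```html
<!-- hedged example: the aria-label value is what the script will generate -->
<video src="demo.mp4" aria-label="A presenter demonstrates the CLI in a terminal." controls></video>
```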

## Transcript

We use the `transcribe` function to generate the transcript. It relies on the `transcription` model alias to compute the transcription.
Transcripts help reduce LLM hallucinations when analyzing images, and they also provide
good timestamp candidates for screenshotting the video stream.

```js
const file = env.files[0]
const transcript = await transcribe(file) // OpenAI whisper
```

## Video Frames

The next step is to use the transcript to screenshot the video stream. GenAIScript uses [ffmpeg](https://ffmpeg.org/) to render the frames
so make sure you have it installed and configured.

```js
const frames = await parsers.videoFrames(file, {
transcript,
})
```

## Context

Both the transcript and the frames are added to the prompt context. Since some videos may be silent, we ignore empty transcripts.
We also use low detail for the frames to improve performance.

```js
def("TRANSCRIPT", transcript?.srt, { ignoreEmpty: true }) // ignore silent videos
defImages(frames, { detail: "low" }) // low detail for better performance
```

## Prompting it together

Finally, we give the task to the LLM to generate the alt text.

```js
$`You are an expert in assistive technology.
You will analyze the video and generate a description alt text for the video.
`
```

Using this script, you can automatically generate high quality alt text for videos.

```sh
npx --yes genaiscript run video-alt-text path_to_video.mp4
```

## Full source

<Code code={src} wrap={true} lang="js" title="video-alt-text.genai.mjs" />
21 changes: 5 additions & 16 deletions docs/src/content/docs/guides/zod-schema.mdx
@@ -1,16 +1,15 @@
---
title: Zod Schema
sidebar:
order: 80
order: 80
description: Learn how to define and convert TypeScript-first Zod schemas to JSON schema
keywords: TypeScript, Zod schema, JSON schema, schema validation, type inference

---

[zod](https://zod.dev/) is a TypeScript-first schema validation with static type inference.
[zod](https://zod.dev/) is a TypeScript-first schema validation with static type inference.

```ts
import { z } from "zod"
import { z } from "genaiscript/runtime"
// city array schema
const CitySchema = z.array(
z.object({
@@ -21,20 +20,10 @@ const CitySchema = z.array(
)
```

[zod-to-json-schema](https://www.npmjs.com/package/zod-to-json-schema) is a library that converts zod schemas to JSON schema.

```ts
import { zodToJsonSchema } from "zod-to-json-schema"
// convert to JSON schema
const CitySchemaJSON = zodToJsonSchema(CitySchema, "citySchema").definitions[
"citySchema"
] as JSONSchemaArray
```

The JSON schemas can be used in `defSchema` to constrain the output of the tool.
The zod schemas can be used in `defSchema` to constrain the output of the tool.

```ts
// JSON schema to constrain the output of the tool.
const schema = defSchema("CITY_SCHEMA", CitySchemaJSON)
const schema = defSchema("CITY_SCHEMA", CitySchema)
...
```
32 changes: 32 additions & 0 deletions docs/src/content/docs/index.mdx
@@ -247,6 +247,38 @@ defData("ROWS", rows, { sliceHead: 100 })

</Card>

<Card title="Speech To Text" icon="seti:microphone">

Automatically transcribe audio or videos using [OpenAI](https://openai.com/) or others.

```js
const transcript = await transcribe("path/to/audio.mp3")
const { srt, vtt, segments } = transcript
```

</Card>

<Card title="Images" icon="seti:image">

Include images in prompts; we'll crop and resize them for you.

```js
defImages(images, { autoCrop: true, detail: "low" })
```

</Card>

<Card title="Videos" icon="seti:video">

Extract frames from videos using timestamps or even transcripts.

```js
const frames = await parsers.videoFrames("...", { count: 10 })
defImages(frames, { detail: "low" })
```

</Card>

<Card title="Generate Files" icon="document">

Extract files and diff from the LLM output.
34 changes: 26 additions & 8 deletions docs/src/content/docs/reference/cli/commands.md
@@ -291,25 +291,26 @@ Options:
-h, --help display help for command
```

## `audio`
## `video`

```
Usage: genaiscript audio [options] [command]
Usage: genaiscript video [options] [command]
Audio tasks
Video tasks
Options:
-h, --help display help for command
-h, --help display help for command
Commands:
transcode <file> Transcode video/audio file
help [command] display help for command
extract-audio <file> Transcode video/audio file
extract-frames [options] <file> Extract video frames
help [command] display help for command
```

### `audio transcode`
### `video extract-audio`

```
Usage: genaiscript audio transcode [options] <file>
Usage: genaiscript video extract-audio [options] <file>
Transcode video/audio file
@@ -320,6 +321,23 @@ Options:
-h, --help display help for command
```

### `video extract-frames`

```
Usage: genaiscript video extract-frames [options] <file>
Extract video frames
Arguments:
file Audio or video file to transcode
Options:
-c, --count <number> maximum number of frames to extract
-o, --out <string> output folder
-s, --size <string> size of the output frames wxh
-h, --help display help for command
```

## `retrieval`

```
11 changes: 10 additions & 1 deletion docs/src/content/docs/reference/scripts/transcription.mdx
@@ -25,7 +25,7 @@ Otherwise, it will attempt to call ffmpeg directly

## model

By default, the API uses the `transcription` [model alias](/genaiscript/reference/model-aliases) to transcribe the audio.
By default, the API uses the `transcription` [model alias](/genaiscript/reference/scripts/model-aliases) to transcribe the audio.
You can also specify a different model alias using the `model` option.

```js "openai:whisper-1"
@@ -44,6 +44,15 @@ for (const segment of segments) {
}
```

## SRT and VTT

GenAIScript renders the segments to [SRT](https://en.wikipedia.org/wiki/SubRip)
and [WebVTT](https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API) formats as well.

```js
const { srt, vtt } = await transcribe("...")
```
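For reference, SRT is a plain-text list of numbered, time-coded blocks. A minimal sketch of how segments map to it (illustrative only — GenAIScript renders `srt` and `vtt` for you):

```javascript
// Illustrative sketch of the SRT format — not GenAIScript's internal renderer.
// Maps { start, end, text } segments (times in seconds) to SRT blocks.
function toSrt(segments) {
    const pad = (n, w) => String(n).padStart(w, "0")
    const fmt = (seconds) => {
        const ms = Math.round(seconds * 1000)
        return (
            `${pad(Math.floor(ms / 3600000), 2)}:` +
            `${pad(Math.floor(ms / 60000) % 60, 2)}:` +
            `${pad(Math.floor(ms / 1000) % 60, 2)},` +
            pad(ms % 1000, 3)
        )
    }
    return segments
        .map(
            (seg, i) =>
                `${i + 1}\n${fmt(seg.start)} --> ${fmt(seg.end)}\n${seg.text}\n`
        )
        .join("\n")
}
```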

## Translation

Some models also support transcribing and translating to English in one pass. For this case,
81 changes: 81 additions & 0 deletions docs/src/content/docs/reference/scripts/videos.mdx
@@ -0,0 +1,81 @@
---
title: Videos
description: How to use videos in scripts
sidebar:
order: 10.01
---

While most LLMs do not support videos natively, videos can be integrated into scripts by rendering frames
and adding them as images to the prompt. This can be tedious, so GenAIScript provides efficient helpers
to streamline the process.

## ffmpeg and ffprobe

The functionalities to render and analyze videos rely on [ffmpeg](https://ffmpeg.org/)
and [ffprobe](https://ffmpeg.org/ffprobe.html).

Make sure these tools are installed locally and available in your PATH,
or configure the `FFMPEG_PATH` / `FFPROBE_PATH` environment variables to point to the `ffmpeg`/`ffprobe` executable.
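For example, in a POSIX shell (the paths below are illustrative):

```shell
# Point GenAIScript at specific ffmpeg/ffprobe binaries (example paths).
export FFMPEG_PATH=/usr/local/bin/ffmpeg
export FFPROBE_PATH=/usr/local/bin/ffprobe
```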

### ffmpeg output caching

Since video processing can be slow, GenAIScript caches the results in subfolders under `.genaiscript/videos/...`
where the subfolder name is a hash from the video file content and the options used to render the video.
This way, you can re-run the script without having to re-render the video.

You can review the `ffmpeg` console log in the `log.txt` file in the cache folder.

## Extracting frames

As mentioned above, multi-modal LLMs typically ingest videos as a sequence
of image frames (or screenshots).

The `parsers.videoFrames` function renders frames from a video file or URL
and returns them as an array of file paths. You can pass the result to `defImages` directly.

```js
const frames = await parsers.videoFrames("path_url_to_video")
def("FRAMES", frames)
```

- specify a number of frames using `count`

```js "count: 10"
const frames = await parsers.videoFrames("...", { count: 10 })
```

- specify timestamps in seconds or percentages of the video duration using `timestamps` (or `times`)

```js "timestamps"
const frames = await parsers.videoFrames("...", { timestamps: ["0%", "50%"] })
```

- specify the transcript computed by the [transcribe](/genaiscript/reference/scripts/transcription) function. GenAIScript
will extract a frame at the start of each segment.

```js "timestamps"
const transcript = await transcribe("...")
const frames = await parsers.videoFrames("...", { transcript })
```
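Percentage timestamps are resolved against the video duration. A minimal sketch of that conversion (illustrative, assuming the duration is known, e.g. from `parsers.videoProbe`):

```javascript
// Illustrative helper: resolve "50%"-style timestamps to absolute seconds.
function resolveTimestamp(timestamp, durationSeconds) {
    if (typeof timestamp === "string" && timestamp.endsWith("%"))
        return (parseFloat(timestamp) / 100) * durationSeconds
    return Number(timestamp) // already in seconds
}
```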

## Extracting audio

The `parsers.videoAudio` function extracts the audio track from a video file or URL
as a `.wav` file.

```js
const audio = await parsers.videoAudio("path_url_to_video")
```

The conversion to audio happens automatically
for videos when using [transcribe](/genaiscript/reference/scripts/transcription).

## Probing videos

You can extract metadata from a video file or URL using `parsers.videoProbe`.

```js
const info = await parsers.videoProbe("path_url_to_video")
const { duration } = info.streams[0]
console.log(`video duration: ${duration} seconds`)
```
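Note that `streams[0]` may be an audio stream depending on the container. A hedged helper for picking the video stream from ffprobe-style results (`codec_type` is standard ffprobe output; this helper is not a GenAIScript API):

```javascript
// ffprobe-style results list several streams; pick the one tagged "video",
// falling back to the first stream if none is tagged.
function findVideoStream(info) {
    return info.streams.find((s) => s.codec_type === "video") ?? info.streams[0]
}
```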
11 changes: 0 additions & 11 deletions packages/cli/src/audio.ts

This file was deleted.

16 changes: 12 additions & 4 deletions packages/cli/src/cli.ts
@@ -49,7 +49,7 @@ import { CORE_VERSION, GITHUB_REPO } from "../../core/src/version" // Core versi
import { logVerbose } from "../../core/src/util" // Utility logging
import { semverSatisfies } from "../../core/src/semver" // Semantic version checking
import { convertFiles } from "./convert"
import { transcodeFile } from "./audio"
import { extractAudio, extractVideoFrames } from "./video"

/**
* Main function to initialize and run the CLI.
@@ -330,12 +330,20 @@ export async function cli() {
.argument("[name]", "Name of the cache, tests")
.action(cacheClear) // Action to clear cache

const audio = program.command("audio").description("Audio tasks")
const audio = program.command("video").description("Video tasks")
audio
.command("transcode")
.command("extract-audio")
.description("Transcode video/audio file")
.argument("<file>", "Audio or video file to transcode")
.action(transcodeFile)
.action(extractAudio)
audio
.command("extract-frames")
.description("Extract video frames")
.argument("<file>", "Audio or video file to transcode")
.option("-c, --count <number>", "maximum number of frames to extract")
.option("-o, --out <string>", "output folder")
.option("-s, --size <string>", "size of the output frames wxh")
.action(extractVideoFrames)

// Define 'retrieval' command group for RAG support
const retrieval = program
10 changes: 5 additions & 5 deletions packages/cli/src/nodehost.ts
@@ -323,14 +323,14 @@ export class NodeHost implements RuntimeHost {
return undefined
}
}
async readFile(name: string): Promise<Uint8Array> {
async readFile(filepath: string): Promise<Uint8Array> {
const wksrx = /^workspace:\/\//i
if (wksrx.test(name))
name = join(this.projectFolder(), name.replace(wksrx, ""))
if (wksrx.test(filepath))
filepath = join(this.projectFolder(), filepath.replace(wksrx, ""))
// check if file exists
if (!(await exists(name))) return undefined
if (!(await exists(filepath))) return undefined
// read file
const res = await readFile(name)
const res = await readFile(filepath)
return res ? new Uint8Array(res) : new Uint8Array()
}
async findFiles(