diff --git a/docs/src/content/docs/guides/video-alt-text.genai.mdx b/docs/src/content/docs/guides/video-alt-text.genai.mdx
new file mode 100644
index 0000000000..f1ffc3dd5f
--- /dev/null
+++ b/docs/src/content/docs/guides/video-alt-text.genai.mdx
@@ -0,0 +1,70 @@
+---
+title: Video Alt Text
+sidebar:
+    order: 50
+description: Learn how to generate alt text for videos
+keywords: Video
+---
+
+import { Code } from "@astrojs/starlight/components"
+import src from "../../../../../packages/sample/genaisrc/video-alt-text.genai.mjs?raw"
+
+GenAIScript supports [speech transcription](/genaiscript/reference/scripts/transcription)
+and [video frame extraction](/genaiscript/reference/scripts/videos), which can be combined to analyze videos.
+
+## Video Alt Text
+
+The HTML `video` element does not have an `alt` attribute, but you can still attach an accessible description using the `aria-label` attribute.
+We will build a script that generates the description from the transcript and video frames.
+
+## Transcript
+
+We use the `transcribe` function to generate the transcript. It uses the `transcription` model alias to compute a transcription.
+Transcriptions are useful to reduce hallucinations of LLMs when analyzing images, and they also provide
+good timestamp candidates for screenshotting the video stream.
+
+```js
+const file = env.files[0]
+const transcript = await transcribe(file) // OpenAI whisper
+```
+
+## Video Frames
+
+The next step is to use the transcript to screenshot the video stream. GenAIScript uses [ffmpeg](https://ffmpeg.org/) to render the frames,
+so make sure you have it installed and configured.
+
+```js
+const frames = await parsers.videoFrames(file, {
+    transcript,
+})
+```
+
+## Context
+
+Both the transcript and the frames are added to the prompt context. Since some videos may be silent, we ignore empty transcripts.
+We also use low detail for the frames to improve performance.
+ +```js +def("TRANSCRIPT", transcript?.srt, { ignoreEmpty: true }) // ignore silent videos +defImages(frames, { detail: "low" }) // low detail for better performance +``` + +## Prompting it together + +Finally, we give the task to the LLM to generate the alt text. + +```js +$`You are an expert in assistive technology. +You will analyze the video and generate a description alt text for the video. +` +``` + +Using this script, you can automatically generate high quality alt text for videos. + +```sh +npx --yes genaiscript run video-alt-text path_to_video.mp4 +``` + +## Full source + + diff --git a/docs/src/content/docs/guides/zod-schema.mdx b/docs/src/content/docs/guides/zod-schema.mdx index 218a85ffa5..4c3ca895ce 100644 --- a/docs/src/content/docs/guides/zod-schema.mdx +++ b/docs/src/content/docs/guides/zod-schema.mdx @@ -1,16 +1,15 @@ --- title: Zod Schema sidebar: - order: 80 + order: 80 description: Learn how to define and convert TypeScript-first Zod schemas to JSON schema keywords: TypeScript, Zod schema, JSON schema, schema validation, type inference - --- -[zod](https://zod.dev/) is a TypeScript-first schema validation with static type inference. +[zod](https://zod.dev/) is a TypeScript-first schema validation with static type inference. ```ts -import { z } from "zod" +import { z } from "genaiscript/runtime" // city array schema const CitySchema = z.array( z.object({ @@ -21,20 +20,10 @@ const CitySchema = z.array( ) ``` -[zod-to-json-schema](https://www.npmjs.com/package/zod-to-json-schema) is a library that converts zod schemas to JSON schema. - -```ts -import { zodToJsonSchema } from "zod-to-json-schema" -// convert to JSON schema -const CitySchemaJSON = zodToJsonSchema(CitySchema, "citySchema").definitions[ - "citySchema" -] as JSONSchemaArray -``` - -The JSON schemas can be used in `defSchema` to constrain the output of the tool. +The zod schemas can be used in `defSchema` to constrain the output of the tool. 
 ```ts
 // JSON schema to constrain the output of the tool.
-const schema = defSchema("CITY_SCHEMA", CitySchemaJSON)
+const schema = defSchema("CITY_SCHEMA", CitySchema)
 ...
 ```
diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx
index 21b98c5fd9..69e8f0911a 100644
--- a/docs/src/content/docs/index.mdx
+++ b/docs/src/content/docs/index.mdx
@@ -247,6 +247,38 @@ defData("ROWS", rows, { sliceHead: 100 })
+
+
+Automatically transcribe audio or videos using [OpenAI](https://openai.com/) or others.
+
+```js
+const transcript = await transcribe("path/to/audio.mp3")
+const { srt, vtt, segments } = transcript
+```
+
+
+
+
+Include images in prompts; we'll crop and resize them for you.
+
+```js
+defImages(images, { autoCrop: true, detail: "low" })
+```
+
+
+
+
+Extract frames from videos using timestamps or even transcripts.
+
+```js
+const frames = await parsers.videoFrames("...", { count: 10 })
+defImages(frames, { detail: "low" })
+```
+
+
 
 Extract files and diff from the LLM output.
diff --git a/docs/src/content/docs/reference/cli/commands.md b/docs/src/content/docs/reference/cli/commands.md
index b59d36de12..e1c2d3f204 100644
--- a/docs/src/content/docs/reference/cli/commands.md
+++ b/docs/src/content/docs/reference/cli/commands.md
@@ -291,25 +291,26 @@ Options:
   -h, --help display help for command
 ```
 
-## `audio`
+## `video`
 
 ```
-Usage: genaiscript audio [options] [command]
+Usage: genaiscript video [options] [command]
 
-Audio tasks
+Video tasks
 
 Options:
-  -h, --help display help for command
+  -h, --help display help for command
 
 Commands:
-  transcode <file> Transcode video/audio file
-  help [command] display help for command
+  extract-audio <file> Transcode video/audio file
+  extract-frames [options] <file> Extract video frames
+  help [command] display help for command
 ```
 
-### `audio transcode`
+### `video extract-audio`
 
 ```
-Usage: genaiscript audio transcode [options] <file>
+Usage: genaiscript video extract-audio [options] <file>
 
 Transcode video/audio file
 
@@ -320,6 +321,23 @@ Options:
   -h, --help display help for command
 ```
 
+### `video extract-frames`
+
+```
+Usage: genaiscript video extract-frames [options] <file>
+
+Extract video frames
+
+Arguments:
+  file Audio or video file to transcode
+
+Options:
+  -c, --count <count> maximum number of frames to extract
+  -o, --out <folder> output folder
+  -s, --size <size> size of the output frames wxh
+  -h, --help display help for command
+```
+
 ## `retrieval`
 
 ```
diff --git a/docs/src/content/docs/reference/scripts/transcription.mdx b/docs/src/content/docs/reference/scripts/transcription.mdx
index 4791b7985f..53cd269c52 100644
--- a/docs/src/content/docs/reference/scripts/transcription.mdx
+++ b/docs/src/content/docs/reference/scripts/transcription.mdx
@@ -25,7 +25,7 @@ Otherwise, it will attempt to call ffmpeg directly
 
 ## model
 
-By default, the API uses the `transcription` [model alias](/genaiscript/reference/model-aliases) to transcribe the audio.
+By default, the API uses the `transcription` [model alias](/genaiscript/reference/scripts/model-aliases) to transcribe the audio.
 You can also specify a different model alias using the `model` option.
 
 ```js "openai:whisper-1"
@@ -44,6 +44,15 @@ for (const segment of segments) {
 }
 ```
 
+## SRT and VTT
+
+GenAIScript renders the segments to the [SRT](https://en.wikipedia.org/wiki/SubRip)
+and [WebVTT](https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API) formats as well.
+
+```js
+const { srt, vtt } = await transcribe("...")
+```
+
 ## Translation
 
 Some models also support transcribing and translating to English in one pass. For this case,
diff --git a/docs/src/content/docs/reference/scripts/videos.mdx b/docs/src/content/docs/reference/scripts/videos.mdx
new file mode 100644
index 0000000000..398c1f4a0f
--- /dev/null
+++ b/docs/src/content/docs/reference/scripts/videos.mdx
@@ -0,0 +1,81 @@
+---
+title: Videos
+description: How to use videos in scripts
+sidebar:
+    order: 10.01
+---
+
+While most LLMs do not support videos natively, they can be integrated into scripts by rendering frames
+and adding them as images to the prompt. This can be tedious, and GenAIScript provides efficient helpers
+to streamline this process.
+
+## ffmpeg and ffprobe
+
+The functionality to render and analyze videos relies on [ffmpeg](https://ffmpeg.org/)
+and [ffprobe](https://ffmpeg.org/ffprobe.html).
+
+Make sure these tools are installed locally and available in your PATH,
+or configure the `FFMPEG_PATH` / `FFPROBE_PATH` environment variables to point to the `ffmpeg`/`ffprobe` executables.
+
+### ffmpeg output caching
+
+Since video processing can be slow, GenAIScript caches the results in subfolders under `.genaiscript/videos/...`,
+where the subfolder name is a hash of the video file content and the options used to render the video.
+This way, you can re-run the script without having to re-render the video.
+
+You can review the `ffmpeg` console log in the `log.txt` file in the cache folder.
+
+## Extracting frames
+
+As mentioned above, multimodal LLMs typically consume videos as a sequence
+of image frames (or screenshots).
+
+The `parsers.videoFrames` function renders frames from a video file or URL
+and returns them as an array of file paths. You can pass the result to `defImages` directly.
+
+```js
+const frames = await parsers.videoFrames("path_url_to_video")
+defImages(frames)
+```
+
+- specify the number of frames using `count`
+
+```js "count: 10"
+const frames = await parsers.videoFrames("...", { count: 10 })
+```
+
+- specify timestamps in seconds or percentages of the video duration using `timestamps` (or `times`)
+
+```js "timestamps"
+const frames = await parsers.videoFrames("...", { timestamps: ["0%", "50%"] })
+```
+
+- specify the transcript computed by the [transcribe](/genaiscript/reference/scripts/transcription) function. GenAIScript
+  will extract a frame at the start of each segment.
+
+```js "transcript"
+const transcript = await transcribe("...")
+const frames = await parsers.videoFrames("...", { transcript })
+```
+
+## Extracting audio
+
+The `parsers.videoAudio` function extracts the audio from a video file or URL
+as a `.wav` file.
+
+```js
+const audio = await parsers.videoAudio("path_url_to_video")
+```
+
+The conversion to audio happens automatically
+for videos when using [transcribe](/genaiscript/reference/scripts/transcription).
+
+## Probing videos
+
+You can extract metadata from a video file or URL using `parsers.videoProbe`.
+ +```js +const info = await parsers.videoProbe("path_url_to_video") +const { duration } = info.streams[0] +console.log(`video duration: ${duration} seconds`) +``` diff --git a/packages/cli/src/audio.ts b/packages/cli/src/audio.ts deleted file mode 100644 index fd70a3baaa..0000000000 --- a/packages/cli/src/audio.ts +++ /dev/null @@ -1,11 +0,0 @@ -import { writeFile } from "node:fs/promises" -import { convertToAudioBlob } from "../../core/src/ffmpeg" - -export async function transcodeFile(file: string, options: { force: boolean }) { - const { force } = options || {} - const res = await convertToAudioBlob(file, { forceConversion: force }) - - const fn = file + ".wav" - console.log(`transcoded file to ${fn}`) - await writeFile(fn, Buffer.from(await res.arrayBuffer())) -} diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index 919f252204..4abdb52c6d 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -49,7 +49,7 @@ import { CORE_VERSION, GITHUB_REPO } from "../../core/src/version" // Core versi import { logVerbose } from "../../core/src/util" // Utility logging import { semverSatisfies } from "../../core/src/semver" // Semantic version checking import { convertFiles } from "./convert" -import { transcodeFile } from "./audio" +import { extractAudio, extractVideoFrames } from "./video" /** * Main function to initialize and run the CLI. 
@@ -330,12 +330,20 @@ export async function cli() {
         .argument("[name]", "Name of the cache, tests")
         .action(cacheClear) // Action to clear cache
 
-    const audio = program.command("audio").description("Audio tasks")
+    const audio = program.command("video").description("Video tasks")
     audio
-        .command("transcode")
+        .command("extract-audio")
         .description("Transcode video/audio file")
         .argument("<file>", "Audio or video file to transcode")
-        .action(transcodeFile)
+        .action(extractAudio)
+    audio
+        .command("extract-frames")
+        .description("Extract video frames")
+        .argument("<file>", "Audio or video file to transcode")
+        .option("-c, --count <count>", "maximum number of frames to extract")
+        .option("-o, --out <folder>", "output folder")
+        .option("-s, --size <size>", "size of the output frames wxh")
+        .action(extractVideoFrames)
 
     // Define 'retrieval' command group for RAG support
     const retrieval = program
diff --git a/packages/cli/src/nodehost.ts b/packages/cli/src/nodehost.ts
index c5596a589c..050bfd8709 100644
--- a/packages/cli/src/nodehost.ts
+++ b/packages/cli/src/nodehost.ts
@@ -323,14 +323,14 @@ export class NodeHost implements RuntimeHost {
             return undefined
         }
     }
-    async readFile(name: string): Promise<Uint8Array> {
+    async readFile(filepath: string): Promise<Uint8Array> {
         const wksrx = /^workspace:\/\//i
-        if (wksrx.test(name))
-            name = join(this.projectFolder(), name.replace(wksrx, ""))
+        if (wksrx.test(filepath))
+            filepath = join(this.projectFolder(), filepath.replace(wksrx, ""))
         // check if file exists
-        if (!(await exists(name))) return undefined
+        if (!(await exists(filepath))) return undefined
         // read file
-        const res = await readFile(name)
+        const res = await readFile(filepath)
        return res ?
new Uint8Array(res) : new Uint8Array() } async findFiles( diff --git a/packages/cli/src/video.ts b/packages/cli/src/video.ts new file mode 100644 index 0000000000..0047cc47c6 --- /dev/null +++ b/packages/cli/src/video.ts @@ -0,0 +1,28 @@ +import { videoExtractAudio, videoExtractFrames } from "../../core/src/ffmpeg" + +export async function extractAudio(file: string, options: { force: boolean }) { + const { force } = options || {} + const fn = await videoExtractAudio(file, { forceConversion: force }) + console.log(`transcoded file to ${fn}`) +} + +export async function extractVideoFrames( + file: string, + options: { + timestamps?: number[] + count?: number + out?: string + size?: string + } +) { + const { out, ...rest } = options || {} + if (!rest.count && !rest.timestamps?.length) rest.count = 3 + const frames = await videoExtractFrames(file, { + folder: out, + ...rest, + }) + for (let i = 0; i < frames.length; i++) { + const fn = frames[i] + console.log(` ${fn}`) + } +} diff --git a/packages/core/src/bufferlike.ts b/packages/core/src/bufferlike.ts index f29521c96c..1a9d7a327f 100644 --- a/packages/core/src/bufferlike.ts +++ b/packages/core/src/bufferlike.ts @@ -1,5 +1,6 @@ import { resolveFileBytes } from "./file" import { TraceOptions } from "./trace" +import { fileTypeFromBuffer } from "file-type" export async function resolveBufferLike( bufferLike: BufferLike, @@ -25,3 +26,10 @@ export async function resolveBufferLike( ) throw new Error("Unsupported buffer-like object") } + +export async function BufferToBlob(buffer: Buffer | Uint8Array) { + const mime = await fileTypeFromBuffer(buffer) + return new Blob([buffer], { + type: mime?.mime || "application/octet-stream", + }) +} diff --git a/packages/core/src/chat.ts b/packages/core/src/chat.ts index 0deb9f5348..a82513ec07 100644 --- a/packages/core/src/chat.ts +++ b/packages/core/src/chat.ts @@ -156,7 +156,7 @@ export interface LanguageModel { completer: ChatCompletionHandler listModels?: ListModelsFunction 
pullModel?: PullModelFunction - transcribe?: TranscribeFunction + transcriber?: TranscribeFunction } async function runToolCalls( diff --git a/packages/core/src/constants.ts b/packages/core/src/constants.ts index 8bdd407443..1f9e3c0852 100644 --- a/packages/core/src/constants.ts +++ b/packages/core/src/constants.ts @@ -307,3 +307,8 @@ export const IMAGE_DETAIL_LOW_HEIGHT = 512 export const MIN_LINE_NUMBER_LENGTH = 10 export const VSCODE_SERVER_MAX_RETRIES = 5 + +export const VIDEO_HASH_LENGTH = 12 +export const VIDEO_FRAMES_DIR_NAME = "frames" +export const VIDEO_AUDIO_DIR_NAME = "audio" +export const VIDEO_PROBE_DIR_NAME = "probe" diff --git a/packages/core/src/crypto.test.ts b/packages/core/src/crypto.test.ts index 7fe5df16e4..628c8ac961 100644 --- a/packages/core/src/crypto.test.ts +++ b/packages/core/src/crypto.test.ts @@ -47,7 +47,7 @@ describe("hash function", () => { test("should generate a hash with a specified algorithm", async () => { const value = "test" - const hashedValue = await hash(value, { algorithm: "sha-1" }) + const hashedValue = await hash(value, { algorithm: "sha-256" }) }) test("should generate a hash with a specified length", async () => { diff --git a/packages/core/src/crypto.ts b/packages/core/src/crypto.ts index e943a40bb2..6120213e3d 100644 --- a/packages/core/src/crypto.ts +++ b/packages/core/src/crypto.ts @@ -3,6 +3,8 @@ import { getRandomValues as cryptoGetRandomValues } from "crypto" // Importing the toHex function from the util module to convert byte arrays to hexadecimal strings import { concatBuffers, toHex, utf8Encode } from "./util" +import { createReadStream } from "fs" +import { createHash } from "crypto" import { CORE_VERSION } from "./version" function getRandomValues(bytes: Uint8Array) { @@ -41,7 +43,14 @@ export function randomHex(size: number) { } export async function hash(value: any, options?: HashOptions) { - const { algorithm = "sha-256", version, length, ...rest } = options || {} + const { + algorithm = "sha-256", + 
version,
+        length,
+        salt,
+        readWorkspaceFiles,
+        ...rest
+    } = options || {}
 
     const sep = utf8Encode("|")
     const un = utf8Encode("undefined")
@@ -67,16 +76,34 @@ export async function hash(value: any, options?: HashOptions) {
         else if (v instanceof ArrayBuffer) h.push(new Uint8Array(v))
         else if (v instanceof Blob)
             h.push(new Uint8Array(await v.arrayBuffer()))
-        else if (typeof v === "object")
+        else if (typeof v === "object") {
             for (const c of Object.keys(v).sort()) {
                 h.push(sep)
                 h.push(utf8Encode(c))
                 h.push(sep)
                 await append(v[c])
             }
-        else if (typeof v === "function") h.push(utf8Encode(v.toString()))
+            if (
+                readWorkspaceFiles &&
+                typeof v.filename === "string" &&
+                v.content === undefined &&
+                !/^https?:\/\//i.test(v.filename)
+            ) {
+                try {
+                    const h = await hashFile(v.filename)
+                    await append(sep)
+                    await append(h)
+                } catch {}
+            }
+        } else if (typeof v === "function") h.push(utf8Encode(v.toString()))
         else h.push(utf8Encode(JSON.stringify(v)))
     }
+
+    if (salt) {
+        await append(salt)
+        await append(sep)
+    }
+
     if (version) {
         await append(CORE_VERSION)
         await append(sep)
@@ -90,3 +117,28 @@ export async function hash(value: any, options?: HashOptions) {
     if (length) res = res.slice(0, length)
     return res
 }
+
+/**
+ * Streaming file hashing
+ */
+export async function hashFile(
+    filePath: string,
+    algorithm: string = "sha-256"
+): Promise<string> {
+    return new Promise<string>((resolve, reject) => {
+        const hash = createHash(algorithm)
+        const stream = createReadStream(filePath)
+
+        stream.on("data", (chunk) => {
+            hash.update(chunk)
+        })
+
+        stream.on("end", () => {
+            resolve(hash.digest("hex"))
+        })
+
+        stream.on("error", (err) => {
+            reject(err)
+        })
+    })
+}
diff --git a/packages/core/src/ffmpeg.ts b/packages/core/src/ffmpeg.ts
index 27e9ef0b0f..23c75df64d 100644
--- a/packages/core/src/ffmpeg.ts
+++ b/packages/core/src/ffmpeg.ts
@@ -1,79 +1,203 @@
-import { fileTypeFromBuffer } from "file-type"
-import { PassThrough } from "stream"
-import { logError, logVerbose } from "./util"
+import { dotGenaiscriptPath, logError, logVerbose } from "./util" import { TraceOptions } from "./trace" import { lookupMime } from "./mime" -import { host } from "./host" +import pLimit from "p-limit" +import { join, basename } from "node:path" +import { ensureDir } from "fs-extra" +import type { FfmpegCommand } from "fluent-ffmpeg" +import { hash } from "./crypto" +import { + VIDEO_AUDIO_DIR_NAME, + VIDEO_FRAMES_DIR_NAME, + VIDEO_HASH_LENGTH, + VIDEO_PROBE_DIR_NAME, +} from "./constants" +import { writeFile, readFile } from "fs/promises" +import { errorMessage, serializeError } from "./error" -async function ffmpeg() { +const ffmpegLimit = pLimit(1) + +async function ffmpeg(options?: TraceOptions) { const m = await import("fluent-ffmpeg") const cmd = m.default - return cmd() - .on("start", (commandLine) => logVerbose(commandLine)) - .on("progress", () => process.stderr.write(".")) - .on("stderr", (s) => logVerbose(s)) + return cmd({ logger: console, timeout: 1000000 }) } -export async function convertToAudioBlob( - file: string, - options: { forceConversion?: boolean } & TraceOptions -): Promise { - const { forceConversion } = options - if (!forceConversion) { - const mime = lookupMime(file) - if (/^audio/.test(mime)) { - const buffer = await host.readFile(file) - return new Blob([buffer], { type: mime }) - } - } +async function computeHashFolder( + filename: string, + folderid: string, + options: { folder?: string } & TraceOptions +) { + const { trace, ...rest } = options + const h = await hash([{ filename }, rest], { + readWorkspaceFiles: true, + version: true, + length: VIDEO_HASH_LENGTH, + }) + options.folder = dotGenaiscriptPath("video", folderid, h) +} - logVerbose(`ffmpeg: extracting audio from video...`) - // ffmpeg -i helloworld.mp4 -q:a 0 -map a output.mp3 - return new Promise(async (resolve, reject) => { - const outputStream = new PassThrough() - const chunks: Buffer[] = [] - outputStream.on("data", (chunk) => chunks.push(chunk)) - 
outputStream.on("end", async () => {
-            const buffer = Buffer.concat(chunks)
-            if (!buffer.length) reject(new Error("conversion failed"))
-            const mime = await fileTypeFromBuffer(buffer)
-            resolve(new Blob([buffer], { type: mime.mime }))
-        })
-        outputStream.on("error", (e) => {
-            logError(e)
-            reject(e)
+export async function runFfmpeg<T>(
+    renderer: (cmd: FfmpegCommand) => Awaitable<T>,
+    options: TraceOptions & { folder?: string }
+): Promise<T> {
+    const { trace, folder } = options
+
+    return ffmpegLimit(async () => {
+        const cmd = await ffmpeg({ trace })
+        cmd.on("start", (commandLine) => {
+            logVerbose(commandLine)
         })
-        const cmd = await ffmpeg()
-        cmd.input(file)
-            .noVideo()
-            .input(file)
-            .toFormat("wav")
-            .on("error", reject)
-            .pipe(outputStream, { end: true })
+        if (process.env.FFMPEG_DEBUG) cmd.on("stderr", (s) => logVerbose(s))
+
+        const resFilename = options.folder
+            ? join(options.folder, "res.json")
+            : undefined
+        // try cache hit
+        if (resFilename) {
+            try {
+                const res = JSON.parse(
+                    await readFile(resFilename, {
+                        encoding: "utf-8",
+                    })
+                )
+                logVerbose(`video: cache hit at ${options.folder}`)
+                return res as T
+            } catch {}
+        }
+
+        if (folder) {
+            await ensureDir(folder)
+            let log: string[] = []
+            const writeLog = async () => {
+                const logFilename = join(folder, "log.txt")
+                logVerbose(`ffmpeg log: ${logFilename}`)
+                await writeFile(logFilename, log.join("\n"), {
+                    encoding: "utf-8",
+                })
+            }
+            cmd.on("stderr", (s) => log.push(s))
+            cmd.on("end", writeLog)
+            cmd.on("error", async (err) => {
+                log.push(`error: ${errorMessage(err)}\n${serializeError(err)}`)
+                await writeLog()
+            })
+        }
+
+        const res = await renderer(cmd)
+        if (resFilename) {
+            logVerbose(`ffmpeg: cache result at ${resFilename}`)
+            await writeFile(resFilename, JSON.stringify(res, null, 2))
+        }
+        return res
     })
 }
-
-export async function renderVideoFrames(file: string, timestamps: number[]) {
-    const frames: Buffer[] = []
-    return new Promise<Buffer[]>(async (resolve, reject) => {
-        const cmd = await
ffmpeg()
-        cmd.input(file)
-            .outputOptions([
-                "-f image2pipe",
-                "-pix_fmt rgb24",
-                "-vcodec rawvideo",
-                "-vf select='eq(pict_type,I)'",
-                `-ss ${timestamps.join(",")}`,
-            ])
-            .on("error", reject)
-            .on("end", () => resolve(frames))
-            .pipe(
-                new PassThrough({
-                    transform: (chunk, _, callback) => {
-                        frames.push(chunk)
-                        callback()
-                    },
+export async function videoExtractAudio(
+    filename: string,
+    options: { forceConversion?: boolean; folder?: string } & TraceOptions
+): Promise<string> {
+    const { trace, forceConversion } = options
+    if (!forceConversion) {
+        const mime = lookupMime(filename)
+        if (/^audio/.test(mime)) return filename
+    }
+    if (!options.folder)
+        await computeHashFolder(filename, VIDEO_AUDIO_DIR_NAME, options)
+    const output = join(options.folder, basename(filename) + ".wav")
+    return await runFfmpeg(
+        async (cmd) =>
+            new Promise<string>(async (resolve, reject) => {
+                /*
+                const outputStream = new PassThrough()
+                const chunks: Buffer[] = []
+                outputStream.on("data", (chunk) => chunks.push(chunk))
+                outputStream.on("end", async () => {
+                    await ffmpeg(options) // keep this; it "unplugs" the output stream so that the error is not raised.
+                    const buffer = Buffer.concat(chunks)
+                    if (!buffer.length) reject(new Error("conversion failed"))
+                    resolve(buffer)
+                })
+                outputStream.on("error", (e) => {
+                    logError(e)
+                    reject(e)
+                })
+                */
+
+                cmd.input(filename)
+                    .noVideo()
+                    .toFormat("wav")
+                    .save(output)
+                    .on("end", () => resolve(output))
+                    .on("error", (err) => reject(err))
+            }),
+        options
+    )
+}
+
+export async function videoExtractFrames(
+    filename: string,
+    options: {
+        timestamps?: number[] | string[]
+        filename?: string
+        count?: number
+        size?: string
+        transcript?: TranscriptionResult
+        folder?: string
+    } & TraceOptions
+): Promise<string[]> {
+    const { trace, transcript, ...screenshotsOptions } = options
+    if (!screenshotsOptions.filename) screenshotsOptions.filename = "%b_%i.png"
+    if (transcript?.segments?.length) {
+        screenshotsOptions.timestamps = transcript.segments.map((s) => s.start)
+    }
+    if (!screenshotsOptions.count && !screenshotsOptions.timestamps)
+        screenshotsOptions.count = 5
+    if (!screenshotsOptions.folder)
+        await computeHashFolder(
+            filename,
+            VIDEO_FRAMES_DIR_NAME,
+            screenshotsOptions
+        )
+
+    return await runFfmpeg(
+        async (cmd) =>
+            new Promise<string[]>(async (resolve, reject) => {
+                let filenames: string[]
+                cmd.input(filename)
+                    .screenshots(screenshotsOptions)
+                    .on("error", (err: Error) => {
+                        logError(err)
+                        reject(err)
+                    })
+                    .on(
+                        "filenames",
+                        (fns: string[]) =>
+                            (filenames = fns.map((fn) =>
+                                join(screenshotsOptions.folder, fn)
+                            ))
+                    )
+                    .on("end", async () => resolve(filenames))
+            }),
+        options
+    )
+}
+
+export async function videoProbe(
+    filename: string,
+    options: { folder?: string } & TraceOptions = {}
+): Promise<VideoProbeResult> {
+    const { trace } = options
+    if (!options.folder)
+        await computeHashFolder(filename, VIDEO_PROBE_DIR_NAME, options)
+    return await runFfmpeg(
+        async (cmd) =>
+            new Promise<VideoProbeResult>((resolve, reject) => {
+                cmd.input(filename).ffprobe((err, data) => {
+                    if (err) reject(err)
+                    else resolve(data as any)
+                })
+            }),
+        options
+    )
+}
diff --git 
a/packages/core/src/openai.ts b/packages/core/src/openai.ts index 8027845ee8..88d40f5a61 100644 --- a/packages/core/src/openai.ts +++ b/packages/core/src/openai.ts @@ -491,7 +491,7 @@ const pullModel: PullModelFunction = async (modelId, options) => { } } -const transcribe = async ( +const transcriber = async ( req: CreateTranscriptionRequest, cfg: LanguageModelConfiguration, options: TraceOptions & CancellationOptions @@ -539,7 +539,7 @@ export function LocalOpenAICompatibleModel( id: providerId, listModels: options?.listModels ? listModels : undefined, pullModel: options?.pullModel ? pullModel : undefined, - transcribe: options?.transcribe ? transcribe : undefined, + transcriber: options?.transcribe ? transcriber : undefined, }) ) } diff --git a/packages/core/src/parsers.ts b/packages/core/src/parsers.ts index 624ba73de2..b83540640d 100644 --- a/packages/core/src/parsers.ts +++ b/packages/core/src/parsers.ts @@ -1,5 +1,9 @@ import { CSVTryParse } from "./csv" -import { filenameOrFileToContent, unfence } from "./unwrappers" +import { + filenameOrFileToContent, + filenameOrFileToFilename, + unfence, +} from "./unwrappers" import { JSON5TryParse } from "./json5" import { estimateTokens } from "./tokens" import { TOMLTryParse } from "./toml" @@ -29,6 +33,7 @@ import { createDiff, llmifyDiff } from "./diff" import { tidyData } from "./tidy" import { hash } from "./crypto" import { GROQEvaluate } from "./groq" +import { videoExtractAudio, videoExtractFrames, videoProbe } from "./ffmpeg" export async function createParsers(options: { trace: MarkdownTrace @@ -124,5 +129,14 @@ export async function createParsers(options: { hash: async (text, options) => await hash(text, options), unfence: unfence, GROQ: GROQEvaluate, + videoFrames: async (file, options) => + await videoExtractFrames(filenameOrFileToFilename(file), { + ...(options || {}), + trace, + }), + videoAudio: async (file) => + await videoExtractAudio(filenameOrFileToFilename(file), { trace }), + videoProbe: async 
(file) =>
+            await videoProbe(filenameOrFileToFilename(file), { trace }),
     })
 }
diff --git a/packages/core/src/runpromptcontext.ts b/packages/core/src/runpromptcontext.ts
index 9bc6323321..89d359a976 100644
--- a/packages/core/src/runpromptcontext.ts
+++ b/packages/core/src/runpromptcontext.ts
@@ -81,11 +81,13 @@ import { agentAddMemory, agentQueryMemory } from "./agent"
 import { YAMLStringify } from "./yaml"
 import { Project } from "./server/messages"
 import { parametersToVars } from "./vars"
-import { resolveBufferLike } from "./bufferlike"
-import { fileTypeFromBuffer } from "file-type"
 import prettyBytes from "pretty-bytes"
 import { JSONLineCache } from "./cache"
-import { convertToAudioBlob } from "./ffmpeg"
+import { videoExtractAudio } from "./ffmpeg"
+import { BufferToBlob } from "./bufferlike"
+import { host } from "./host"
+import { srtVttRender } from "./transcription"
+import { filenameOrFileToFilename } from "./unwrappers"
 
 export function createChatTurnGenerationContext(
     options: GenerationOptions,
@@ -633,7 +635,7 @@ export function createChatGenerationContext(
     }
 
     const transcribe = async (
-        audio: string,
+        audio: string | WorkspaceFile,
         options?: TranscriptionOptions
     ): Promise<TranscriptionResult> => {
         const { cache, ...rest } = options || {}
@@ -659,19 +661,23 @@
             })
             if (!ok) throw new Error(`failed to pull model ${conn}`)
             checkCancelled(cancellationToken)
-            const { transcribe } = await resolveLanguageModel(
+            const { transcriber } = await resolveLanguageModel(
                 configuration.provider
             )
-            if (!transcribe)
+            if (!transcriber)
                 throw new Error("model driver not found for " + info.model)
-            const file = await convertToAudioBlob(audio, {
-                trace: transcriptionTrace,
-            })
+            const audioFile = await videoExtractAudio(
+                filenameOrFileToFilename(audio),
+                {
+                    trace: transcriptionTrace,
+                }
+            )
+            const file = await BufferToBlob(await host.readFile(audioFile))
             const update: () => Promise<TranscriptionResult> = async () => {
                trace.itemValue(`model`,
configuration.model) trace.itemValue(`file size`, prettyBytes(file.size)) trace.itemValue(`file type`, file.type) - const res = await transcribe( + const res = await transcriber( { file, model: configuration.model, @@ -684,6 +690,7 @@ export function createChatGenerationContext( cancellationToken, } ) + srtVttRender(res) return res } diff --git a/packages/core/src/transcription.ts b/packages/core/src/transcription.ts new file mode 100644 index 0000000000..4ae2582ded --- /dev/null +++ b/packages/core/src/transcription.ts @@ -0,0 +1,49 @@ +export function srtVttRender(transcription: TranscriptionResult) { + const segments = transcription.segments + if (!segments) return transcription + + const srt = segments + .map((segment, index) => { + const start = formatSRTTime(segment.start) + const end = formatSRTTime(segment.end) + return `${index + 1}\n${start} --> ${end}\n${segment.text.trim()}\n` + }) + .join("\n") + transcription.srt = srt + + const vtt = + "WEBVTT\n\n" + + segments + .map((segment, index) => { + const start = formatVRTTime(segment.start) + const end = formatVRTTime(segment.end) + return `${start} --> ${end}\n${segment.text.trim()}\n` + }) + .join("\n") + transcription.vtt = vtt + + return transcription + + function formatSRTTime(seconds: number): string { + const date = new Date(0) + date.setSeconds(seconds) + const hours = String(date.getUTCHours()).padStart(2, "0") + const minutes = String(date.getUTCMinutes()).padStart(2, "0") + const secondsPart = String(date.getUTCSeconds()).padStart(2, "0") + const milliseconds = String(date.getUTCMilliseconds()).padStart(3, "0") + const time = `${hours}:${minutes}:${secondsPart},${milliseconds}` + return time.replace(".", ",") + } + + function formatVRTTime(seconds: number): string { + const date = new Date(0) + date.setSeconds(seconds) + const hours = String(date.getUTCHours()).padStart(2, "0") + const minutes = String(date.getUTCMinutes()).padStart(2, "0") + const secondsPart = 
String(date.getUTCSeconds()).padStart(2, "0") + const milliseconds = String(date.getUTCMilliseconds()).padStart(3, "0") + let time = `${minutes}:${secondsPart}.${milliseconds}` + if (hours !== "00") time = hours + ":" + time + return time + } +} diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index 64e819443e..3fbee686ae 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -557,7 +557,7 @@ interface WorkspaceFile { /** * Encoding of the content */ - encoding?: 'base64' + encoding?: "base64" /** * Content of the file. @@ -1396,9 +1396,129 @@ interface Tokenizers { } interface HashOptions { - algorithm?: "sha-1" | "sha-256" + /** + * Algorithm used for hashing + */ + algorithm?: "sha-256" + /** + * Trim hash to this number of character + */ length?: number + /** + * Include genaiscript version in the hash + */ version?: boolean + /** + * Optional salting of the hash + */ + salt?: string + /** + * Read the content of workspace files object into the hash + */ + readWorkspaceFiles?: boolean +} + +interface VideoExtractFramesOptions { + timestamps?: number[] | string[] + count?: number + size?: string + transcript?: TranscriptionResult +} + +interface VideoProbeResult { + streams: { + index: number + codec_name: string + codec_long_name: string + profile: string + codec_type: string + codec_tag_string: string + codec_tag: string + width?: number + height?: number + coded_width?: number + coded_height?: number + closed_captions?: number + film_grain?: number + has_b_frames?: number + sample_aspect_ratio?: string + display_aspect_ratio?: string + pix_fmt?: string + level?: number + color_range?: string + color_space?: string + color_transfer?: string + color_primaries?: string + chroma_location?: string + field_order?: string + refs?: number + is_avc?: string + nal_length_size?: number + id: string + r_frame_rate: string + avg_frame_rate: string + time_base: 
string + start_pts: number + start_time: number + duration_ts: number + duration: number + bit_rate: number + max_bit_rate: string + bits_per_raw_sample: number | string + nb_frames: number | string + nb_read_frames?: string + nb_read_packets?: string + extradata_size?: number + tags?: { + creation_time: string + language?: string + handler_name: string + vendor_id?: string + encoder?: string + } + disposition?: { + default: number + dub: number + original: number + comment: number + lyrics: number + karaoke: number + forced: number + hearing_impaired: number + visual_impaired: number + clean_effects: number + attached_pic: number + timed_thumbnails: number + captions: number + descriptions: number + metadata: number + dependent: number + still_image: number + } + sample_fmt?: string + sample_rate?: number + channels?: number + channel_layout?: string + bits_per_sample?: number | string + }[] + format: { + filename: string + nb_streams: number + nb_programs: number + format_name: string + format_long_name: string + start_time: number + duration: number + size: number + bit_rate: number + probe_score: number + tags: { + major_brand: string + minor_version: string + compatible_brands: string + creation_time: string + } + } } interface Parsers { @@ -1626,6 +1746,28 @@ * @param language */ unfence(text: string, language: string): string + + /** + * Extracts metadata information from a video file using ffprobe + * @param filename + */ + videoProbe(filename: string | WorkspaceFile): Promise<VideoProbeResult> + + /** + * Extracts frames from a video file + * @param videoPath + * @param options + */ + videoFrames( + videoPath: string | WorkspaceFile, + options?: VideoExtractFramesOptions + ): Promise<string[]> + + /** + * Extract the audio track from a video + * @param videoPath + */ + videoAudio(videoPath: string | WorkspaceFile): Promise<string> } interface AICIGenOptions { @@ -1860,7 +2002,7 @@ interface Git { * @param options various clone options * @returns the path to the cloned
repository */ - async shallowClone( + shallowClone( repository: string, options?: { /** @@ -2681,7 +2823,13 @@ type McpServersConfig = Record> type ZodTypeLike = { _def: any; safeParse: any; refine: any } -type BufferLike = string | WorkspaceFile | Buffer | Blob | ArrayBuffer | ReadableStream +type BufferLike = + | string + | WorkspaceFile + | Buffer + | Blob + | ArrayBuffer + | ReadableStream interface TranscriptionOptions { /** @@ -2701,7 +2849,7 @@ interface TranscriptionOptions { language?: string /** - * The sampling temperature, between 0 and 1. + * The sampling temperature, between 0 and 1. * Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. */ temperature?: number @@ -2721,12 +2869,23 @@ interface TranscriptionResult { * Error if any */ error?: SerializedError + + /** + * SubRip subtitle string from segments + */ + srt?: string + + /** + * WebVTT subtitle string from segments + */ + vtt?: string + /** * Individual segments */ segments?: { /** - * The start time of the segment + * The start time of the segment */ start: number /** @@ -2798,7 +2957,10 @@ interface ChatGenerationContext extends ChatTurnGenerationContext { ): RunPromptResultPromiseWithOptions defFileMerge(fn: FileMergeHandler): void defOutputProcessor(fn: PromptOutputProcessorHandler): void - transcribe(audio: string, options?: TranscriptionOptions): Promise<TranscriptionResult> + transcribe( + audio: string | WorkspaceFile, + options?: TranscriptionOptions + ): Promise<TranscriptionResult> } interface GenerationOutput { @@ -3033,10 +3195,12 @@ interface BrowseSessionOptions extends BrowserOptions, TimeoutOptions { /** * Enable recording video for all pages. Implies incognito mode.
*/ - recordVideo?: boolean | { - width: number - height: number - } + recordVideo?: + | boolean + | { + width: number + height: number + } } interface TimeoutOptions { diff --git a/packages/core/src/types/prompt_type.d.ts b/packages/core/src/types/prompt_type.d.ts index ed36f48cda..bf8ce93fc3 100644 --- a/packages/core/src/types/prompt_type.d.ts +++ b/packages/core/src/types/prompt_type.d.ts @@ -333,6 +333,6 @@ declare function defChatParticipant( * @param options */ declare function transcribe( - audio: string, + audio: string | WorkspaceFile, options?: TranscriptionOptions ): Promise<TranscriptionResult> diff --git a/packages/core/src/unwrappers.ts b/packages/core/src/unwrappers.ts index 195ab0f54d..3a3f1b44fd 100644 --- a/packages/core/src/unwrappers.ts +++ b/packages/core/src/unwrappers.ts @@ -37,6 +37,14 @@ export function filenameOrFileToContent( : fileOrContent?.content } +export function filenameOrFileToFilename( + fileOrContent: string | WorkspaceFile +): string { + return typeof fileOrContent === "string" + ?
fileOrContent + : fileOrContent?.filename +} + export function trimNewlines(s: string) { return s?.replace(/^\n*/, "").replace(/\n*$/, "") } diff --git a/packages/sample/genaisrc/transcribe.genai.mjs b/packages/sample/genaisrc/transcribe.genai.mjs index 8336a258fd..9fd878d174 100644 --- a/packages/sample/genaisrc/transcribe.genai.mjs +++ b/packages/sample/genaisrc/transcribe.genai.mjs @@ -14,3 +14,6 @@ console.log(res2) const resv = await transcribe("src/audio/helloworld.mp4") console.log(resv) + +const resv2 = await transcribe("src/audio/helloworld.mp4") +console.log(resv2) diff --git a/packages/sample/genaisrc/video-alt-text.genai.mjs b/packages/sample/genaisrc/video-alt-text.genai.mjs new file mode 100644 index 0000000000..c3c13f70c5 --- /dev/null +++ b/packages/sample/genaisrc/video-alt-text.genai.mjs @@ -0,0 +1,27 @@ +script({ + description: "Generate a description alt text for a video", + accept: ".mp4,.webm", + system: [ + "system.output_plaintext", + "system.safety_jailbreak", + "system.safety_harmful_content", + "system.safety_validate_harmful_content", + ], +}) + +const file = env.files[0] +const transcript = await transcribe(file) // OpenAI whisper +const frames = await parsers.videoFrames(file, { + transcript, +}) // ffmpeg to extract frames + +def("TRANSCRIPT", transcript?.srt, { ignoreEmpty: true }) // ignore silent videos +defImages(frames, { detail: "low" }) // low detail for better performance + +$`You are an expert in assistive technology. +You will analyze the video and generate a description alt text for the video. + +- The video is included as a set of images and the transcript. +- Do not include alt text in the description. +- Keep it short but descriptive.
+- Do not generate the [ character.` diff --git a/packages/sample/genaisrc/video-transcript.genai.mjs b/packages/sample/genaisrc/video-transcript.genai.mjs new file mode 100644 index 0000000000..901e35574e --- /dev/null +++ b/packages/sample/genaisrc/video-transcript.genai.mjs @@ -0,0 +1,13 @@ +const transcript = await transcribe("src/audio/helloworld.mp4") +console.log(transcript.srt) +console.log(transcript.vtt) +console.log(JSON.stringify(transcript, null, 2)) +def("TRANSCRIPT", transcript.srt, { + language: "srt", + ignoreEmpty: true, +}) +const frames = await parsers.videoFrames("src/audio/helloworld.mp4", { + transcript, +}) +defImages(frames) +$`Describe the video using the screenshots and the transcript.` diff --git a/packages/sample/genaisrc/video.genai.mjs b/packages/sample/genaisrc/video.genai.mjs new file mode 100644 index 0000000000..1645326d47 --- /dev/null +++ b/packages/sample/genaisrc/video.genai.mjs @@ -0,0 +1,15 @@ +const info = await parsers.videoProbe("src/audio/helloworld.mp4") +console.log(JSON.stringify(info, null, 2)) +const { duration, width, height } = info.streams[0] +console.log({ duration, width, height }) +const frames = await parsers.videoFrames("src/audio/helloworld.mp4") +console.log(frames) +defImages(frames) + +const more = await parsers.videoFrames( + "https://github.com/microsoft/jacdac-docs/raw/refs/heads/main/static/videos/addbutton.webm" +) +console.log(more) +defImages(more) + +$`Describe the images.`