diff --git a/docs/src/content/docs/guides/video-alt-text.genai.mdx b/docs/src/content/docs/guides/video-alt-text.genai.mdx
new file mode 100644
index 0000000000..f1ffc3dd5f
--- /dev/null
+++ b/docs/src/content/docs/guides/video-alt-text.genai.mdx
@@ -0,0 +1,70 @@
+---
+title: Video Alt Text
+sidebar:
+ order: 50
+description: Learn how to generate alt text for videos
+keywords: Video
+---
+
+import { Code } from "@astrojs/starlight/components"
+import src from "../../../../../packages/sample/genaisrc/video-alt-text.genai.mjs?raw"
+
+GenAIScript supports [speech transcription](/genaiscript/reference/scripts/transcription)
+and [video frame extraction](/genaiscript/reference/scripts/videos) which can be combined to analyze videos.
+
+## Video Alt Text
+
+The HTML video element does not have an `alt` attribute, but you can still attach an accessible description using the `aria-label` attribute.
+We will build a script that generates the description using the transcript and video frames.
+
+## Transcript
+
+We use the `transcribe` function to generate the transcript. It will use the `transcription` model alias to compute a transcription.
+Transcriptions are useful to reduce hallucinations of LLMs when analyzing images and also provide
+good timestamp candidates to screenshot the video stream.
+
+```js
+const file = env.files[0]
+const transcript = await transcribe(file) // OpenAI whisper
+```
+
+## Video Frames
+
+The next step is to use the transcript to screenshot the video stream. GenAIScript uses [ffmpeg](https://ffmpeg.org/) to render the frames
+so make sure you have it installed and configured.
+
+```js
+const frames = await parsers.videoFrames(file, {
+ transcript,
+})
+```
+
+## Context
+
+Both the transcript and the frames are added to the prompt context. Since some videos may be silent, we ignore empty transcripts.
+We also use low detail for the frames to improve performance.
+
+```js
+def("TRANSCRIPT", transcript?.srt, { ignoreEmpty: true }) // ignore silent videos
+defImages(frames, { detail: "low" }) // low detail for better performance
+```
+
+## Prompting it together
+
+Finally, we give the task to the LLM to generate the alt text.
+
+```js
+$`You are an expert in assistive technology.
+You will analyze the video and generate a description alt text for the video.
+`
+```
+
+Using this script, you can automatically generate high quality alt text for videos.
+
+```sh
+npx --yes genaiscript run video-alt-text path_to_video.mp4
+```
+
+## Full source
+
+
diff --git a/docs/src/content/docs/guides/zod-schema.mdx b/docs/src/content/docs/guides/zod-schema.mdx
index 218a85ffa5..4c3ca895ce 100644
--- a/docs/src/content/docs/guides/zod-schema.mdx
+++ b/docs/src/content/docs/guides/zod-schema.mdx
@@ -1,16 +1,15 @@
---
title: Zod Schema
sidebar:
- order: 80
+ order: 80
description: Learn how to define and convert TypeScript-first Zod schemas to JSON schema
keywords: TypeScript, Zod schema, JSON schema, schema validation, type inference
-
---
-[zod](https://zod.dev/) is a TypeScript-first schema validation with static type inference.
+[zod](https://zod.dev/) is a TypeScript-first schema validation with static type inference.
```ts
-import { z } from "zod"
+import { z } from "genaiscript/runtime"
// city array schema
const CitySchema = z.array(
z.object({
@@ -21,20 +20,10 @@ const CitySchema = z.array(
)
```
-[zod-to-json-schema](https://www.npmjs.com/package/zod-to-json-schema) is a library that converts zod schemas to JSON schema.
-
-```ts
-import { zodToJsonSchema } from "zod-to-json-schema"
-// convert to JSON schema
-const CitySchemaJSON = zodToJsonSchema(CitySchema, "citySchema").definitions[
- "citySchema"
-] as JSONSchemaArray
-```
-
-The JSON schemas can be used in `defSchema` to constrain the output of the tool.
+The zod schemas can be used in `defSchema` to constrain the output of the tool.
```ts
// JSON schema to constrain the output of the tool.
-const schema = defSchema("CITY_SCHEMA", CitySchemaJSON)
+const schema = defSchema("CITY_SCHEMA", CitySchema)
...
```
diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx
index 21b98c5fd9..69e8f0911a 100644
--- a/docs/src/content/docs/index.mdx
+++ b/docs/src/content/docs/index.mdx
@@ -247,6 +247,38 @@ defData("ROWS", rows, { sliceHead: 100 })
+
+
+Automatically transcribe audio or videos using [OpenAI](https://openai.com/) or others.
+
+```js
+const transcript = await transcribe("path/to/audio.mp3")
+const { srt, vtt, segments } = transcript
+```
+
+
+
+
+
+Include images in prompts; we'll crop/resize them for you.
+
+```js
+defImages(images, { autoCrop: true, detail: "low" })
+```
+
+
+
+
+
+Extract frames from videos using timestamps or even transcripts.
+
+```js
+const frames = await parsers.videoFrames("...", { count: 10 })
+defImages(frames, { detail: "low" })
+```
+
+
+
Extract files and diff from the LLM output.
diff --git a/docs/src/content/docs/reference/cli/commands.md b/docs/src/content/docs/reference/cli/commands.md
index b59d36de12..e1c2d3f204 100644
--- a/docs/src/content/docs/reference/cli/commands.md
+++ b/docs/src/content/docs/reference/cli/commands.md
@@ -291,25 +291,26 @@ Options:
-h, --help display help for command
```
-## `audio`
+## `video`
```
-Usage: genaiscript audio [options] [command]
+Usage: genaiscript video [options] [command]
-Audio tasks
+Video tasks
Options:
- -h, --help display help for command
+ -h, --help display help for command
Commands:
- transcode Transcode video/audio file
- help [command] display help for command
+ extract-audio Transcode video/audio file
+ extract-frames [options] Extract video frames
+ help [command] display help for command
```
-### `audio transcode`
+### `video extract-audio`
```
-Usage: genaiscript audio transcode [options]
+Usage: genaiscript video extract-audio [options]
Transcode video/audio file
@@ -320,6 +321,23 @@ Options:
-h, --help display help for command
```
+### `video extract-frames`
+
+```
+Usage: genaiscript video extract-frames [options]
+
+Extract video frames
+
+Arguments:
+ file Audio or video file to transcode
+
+Options:
+ -c, --count maximum number of frames to extract
+ -o, --out output folder
+ -s, --size size of the output frames wxh
+ -h, --help display help for command
+```
+
## `retrieval`
```
diff --git a/docs/src/content/docs/reference/scripts/transcription.mdx b/docs/src/content/docs/reference/scripts/transcription.mdx
index 4791b7985f..53cd269c52 100644
--- a/docs/src/content/docs/reference/scripts/transcription.mdx
+++ b/docs/src/content/docs/reference/scripts/transcription.mdx
@@ -25,7 +25,7 @@ Otherwise, it will attempt to call ffmpeg directly
## model
-By default, the API uses the `transcription` [model alias](/genaiscript/reference/model-aliases) to transcribe the audio.
+By default, the API uses the `transcription` [model alias](/genaiscript/reference/scripts/model-aliases) to transcribe the audio.
You can also specify a different model alias using the `model` option.
```js "openai:whisper-1"
@@ -44,6 +44,15 @@ for (const segment of segments) {
}
```
+## SRT and VTT
+
+GenAIScript renders the segments to [SRT](https://en.wikipedia.org/wiki/SubRip)
+and [WebVTT](https://developer.mozilla.org/en-US/docs/Web/API/WebVTT_API) formats as well.
+
+```js
+const { srt, vtt } = await transcribe("...")
+```
+
## Translation
Some models also support transcribing and translating to English in one pass. For this case,
diff --git a/docs/src/content/docs/reference/scripts/videos.mdx b/docs/src/content/docs/reference/scripts/videos.mdx
new file mode 100644
index 0000000000..398c1f4a0f
--- /dev/null
+++ b/docs/src/content/docs/reference/scripts/videos.mdx
@@ -0,0 +1,81 @@
+---
+title: Videos
+description: How to use the Video in scripts
+sidebar:
+ order: 10.01
+---
+
+While most LLMs do not support videos natively, they can be integrated in scripts by rendering frames
+and adding them as images to the prompt. This can be tedious and GenAIScript provides efficient helpers
+to streamline this process.
+
+## ffmpeg and ffprobe
+
+The functionalities to render and analyze videos rely on [ffmpeg](https://ffmpeg.org/)
+and [ffprobe](https://ffmpeg.org/ffprobe.html).
+
+Make sure these tools are installed locally and available in your PATH,
+or configure the `FFMPEG_PATH` / `FFPROBE_PATH` environment variables to point to the `ffmpeg`/`ffprobe` executable.
+
+### ffmpeg output caching
+
+Since video processing can be slow, GenAIScript caches the results in subfolders under `.genaiscript/videos/...`
+where the subfolder name is a hash from the video file content and the options used to render the video.
+This way, you can re-run the script without having to re-render the video.
+
+You can review the `ffmpeg` console log in the `log.txt` file in the cache folder.
+
+## Extracting frames
+
+As mentioned above, multi-modal LLMs typically support images as a sequence
+of frames (or screenshots).
+
+The `parsers.videoFrames` will render frames from a video file or url
+and return them as an array of file paths. You can use the result with `defImages` directly.
+
+```js
+const frames = await parsers.videoFrames("path_url_to_video")
+def("FRAMES", frames)
+```
+
+- specify a number of frames using `count`
+
+```js "count: 10"
+const frames = await parsers.videoFrames("...", { count: 10 })
+```
+
+- specify timestamps in seconds or percentages of the video duration using `timestamps` (or `times`)
+
+```js "timestamps"
+const frames = await parsers.videoFrames("...", { timestamps: ["0%", "50%"] })
+```
+
+- specify the transcript computed by the [transcribe](/genaiscript/reference/scripts/transcription) function. GenAIScript
+ will extract a frame at the start of each segment.
+
+```js "timestamps"
+const transcript = await transcribe("...")
+const frames = await parsers.videoFrames("...", { transcript })
+```
+
+## Extracting audio
+
+The `parsers.videoAudio` will extract the audio from a video file or url
+as a `.wav` file.
+
+```js
+const audio = await parsers.videoAudio("path_url_to_video")
+```
+
+The conversion to audio happens automatically
+for videos when using [transcribe](/genaiscript/reference/scripts/transcription).
+
+## Probing videos
+
+You can extract metadata from a video file or url using `parsers.videoProbe`.
+
+```js
+const info = await parsers.videoProbe("path_url_to_video")
+const { duration } = info.streams[0]
+console.log(`video duration: ${duration} seconds`)
+```
diff --git a/packages/cli/src/audio.ts b/packages/cli/src/audio.ts
deleted file mode 100644
index fd70a3baaa..0000000000
--- a/packages/cli/src/audio.ts
+++ /dev/null
@@ -1,11 +0,0 @@
-import { writeFile } from "node:fs/promises"
-import { convertToAudioBlob } from "../../core/src/ffmpeg"
-
-export async function transcodeFile(file: string, options: { force: boolean }) {
- const { force } = options || {}
- const res = await convertToAudioBlob(file, { forceConversion: force })
-
- const fn = file + ".wav"
- console.log(`transcoded file to ${fn}`)
- await writeFile(fn, Buffer.from(await res.arrayBuffer()))
-}
diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts
index 919f252204..4abdb52c6d 100644
--- a/packages/cli/src/cli.ts
+++ b/packages/cli/src/cli.ts
@@ -49,7 +49,7 @@ import { CORE_VERSION, GITHUB_REPO } from "../../core/src/version" // Core versi
import { logVerbose } from "../../core/src/util" // Utility logging
import { semverSatisfies } from "../../core/src/semver" // Semantic version checking
import { convertFiles } from "./convert"
-import { transcodeFile } from "./audio"
+import { extractAudio, extractVideoFrames } from "./video"
/**
* Main function to initialize and run the CLI.
@@ -330,12 +330,20 @@ export async function cli() {
.argument("[name]", "Name of the cache, tests")
.action(cacheClear) // Action to clear cache
- const audio = program.command("audio").description("Audio tasks")
+ const audio = program.command("video").description("Video tasks")
audio
- .command("transcode")
+ .command("extract-audio")
.description("Transcode video/audio file")
.argument("", "Audio or video file to transcode")
- .action(transcodeFile)
+ .action(extractAudio)
+ audio
+ .command("extract-frames")
+ .description("Extract video frames")
+ .argument("", "Audio or video file to transcode")
+ .option("-c, --count ", "maximum number of frames to extract")
+ .option("-o, --out ", "output folder")
+ .option("-s, --size ", "size of the output frames wxh")
+ .action(extractVideoFrames)
// Define 'retrieval' command group for RAG support
const retrieval = program
diff --git a/packages/cli/src/nodehost.ts b/packages/cli/src/nodehost.ts
index c5596a589c..050bfd8709 100644
--- a/packages/cli/src/nodehost.ts
+++ b/packages/cli/src/nodehost.ts
@@ -323,14 +323,14 @@ export class NodeHost implements RuntimeHost {
return undefined
}
}
- async readFile(name: string): Promise {
+ async readFile(filepath: string): Promise {
const wksrx = /^workspace:\/\//i
- if (wksrx.test(name))
- name = join(this.projectFolder(), name.replace(wksrx, ""))
+ if (wksrx.test(filepath))
+ filepath = join(this.projectFolder(), filepath.replace(wksrx, ""))
// check if file exists
- if (!(await exists(name))) return undefined
+ if (!(await exists(filepath))) return undefined
// read file
- const res = await readFile(name)
+ const res = await readFile(filepath)
return res ? new Uint8Array(res) : new Uint8Array()
}
async findFiles(
diff --git a/packages/cli/src/video.ts b/packages/cli/src/video.ts
new file mode 100644
index 0000000000..0047cc47c6
--- /dev/null
+++ b/packages/cli/src/video.ts
@@ -0,0 +1,28 @@
+import { videoExtractAudio, videoExtractFrames } from "../../core/src/ffmpeg"
+
+export async function extractAudio(file: string, options: { force: boolean }) {
+ const { force } = options || {}
+ const fn = await videoExtractAudio(file, { forceConversion: force })
+ console.log(`transcoded file to ${fn}`)
+}
+
+export async function extractVideoFrames(
+ file: string,
+ options: {
+ timestamps?: number[]
+ count?: number
+ out?: string
+ size?: string
+ }
+) {
+ const { out, ...rest } = options || {}
+ if (!rest.count && !rest.timestamps?.length) rest.count = 3
+ const frames = await videoExtractFrames(file, {
+ folder: out,
+ ...rest,
+ })
+ for (let i = 0; i < frames.length; i++) {
+ const fn = frames[i]
+ console.log(` ${fn}`)
+ }
+}
diff --git a/packages/core/src/bufferlike.ts b/packages/core/src/bufferlike.ts
index f29521c96c..1a9d7a327f 100644
--- a/packages/core/src/bufferlike.ts
+++ b/packages/core/src/bufferlike.ts
@@ -1,5 +1,6 @@
import { resolveFileBytes } from "./file"
import { TraceOptions } from "./trace"
+import { fileTypeFromBuffer } from "file-type"
export async function resolveBufferLike(
bufferLike: BufferLike,
@@ -25,3 +26,10 @@ export async function resolveBufferLike(
)
throw new Error("Unsupported buffer-like object")
}
+
+export async function BufferToBlob(buffer: Buffer | Uint8Array) {
+ const mime = await fileTypeFromBuffer(buffer)
+ return new Blob([buffer], {
+ type: mime?.mime || "application/octet-stream",
+ })
+}
diff --git a/packages/core/src/chat.ts b/packages/core/src/chat.ts
index 0deb9f5348..a82513ec07 100644
--- a/packages/core/src/chat.ts
+++ b/packages/core/src/chat.ts
@@ -156,7 +156,7 @@ export interface LanguageModel {
completer: ChatCompletionHandler
listModels?: ListModelsFunction
pullModel?: PullModelFunction
- transcribe?: TranscribeFunction
+ transcriber?: TranscribeFunction
}
async function runToolCalls(
diff --git a/packages/core/src/constants.ts b/packages/core/src/constants.ts
index 8bdd407443..1f9e3c0852 100644
--- a/packages/core/src/constants.ts
+++ b/packages/core/src/constants.ts
@@ -307,3 +307,8 @@ export const IMAGE_DETAIL_LOW_HEIGHT = 512
export const MIN_LINE_NUMBER_LENGTH = 10
export const VSCODE_SERVER_MAX_RETRIES = 5
+
+export const VIDEO_HASH_LENGTH = 12
+export const VIDEO_FRAMES_DIR_NAME = "frames"
+export const VIDEO_AUDIO_DIR_NAME = "audio"
+export const VIDEO_PROBE_DIR_NAME = "probe"
diff --git a/packages/core/src/crypto.test.ts b/packages/core/src/crypto.test.ts
index 7fe5df16e4..628c8ac961 100644
--- a/packages/core/src/crypto.test.ts
+++ b/packages/core/src/crypto.test.ts
@@ -47,7 +47,7 @@ describe("hash function", () => {
test("should generate a hash with a specified algorithm", async () => {
const value = "test"
- const hashedValue = await hash(value, { algorithm: "sha-1" })
+ const hashedValue = await hash(value, { algorithm: "sha-256" })
})
test("should generate a hash with a specified length", async () => {
diff --git a/packages/core/src/crypto.ts b/packages/core/src/crypto.ts
index e943a40bb2..6120213e3d 100644
--- a/packages/core/src/crypto.ts
+++ b/packages/core/src/crypto.ts
@@ -3,6 +3,8 @@ import { getRandomValues as cryptoGetRandomValues } from "crypto"
// Importing the toHex function from the util module to convert byte arrays to hexadecimal strings
import { concatBuffers, toHex, utf8Encode } from "./util"
+import { createReadStream } from "fs"
+import { createHash } from "crypto"
import { CORE_VERSION } from "./version"
function getRandomValues(bytes: Uint8Array) {
@@ -41,7 +43,14 @@ export function randomHex(size: number) {
}
export async function hash(value: any, options?: HashOptions) {
- const { algorithm = "sha-256", version, length, ...rest } = options || {}
+ const {
+ algorithm = "sha-256",
+ version,
+ length,
+ salt,
+ readWorkspaceFiles,
+ ...rest
+ } = options || {}
const sep = utf8Encode("|")
const un = utf8Encode("undefined")
@@ -67,16 +76,34 @@ export async function hash(value: any, options?: HashOptions) {
else if (v instanceof ArrayBuffer) h.push(new Uint8Array(v))
else if (v instanceof Blob)
h.push(new Uint8Array(await v.arrayBuffer()))
- else if (typeof v === "object")
+ else if (typeof v === "object") {
for (const c of Object.keys(v).sort()) {
h.push(sep)
h.push(utf8Encode(c))
h.push(sep)
await append(v[c])
}
- else if (typeof v === "function") h.push(utf8Encode(v.toString()))
+ if (
+ readWorkspaceFiles &&
+ typeof v.filename === "string" &&
+ v.content === undefined &&
+ !/^https?:\/\//i.test(v.filename)
+ ) {
+ try {
+ const h = await hashFile(v.filename)
+ await append(sep)
+ await append(h)
+ } catch {}
+ }
+ } else if (typeof v === "function") h.push(utf8Encode(v.toString()))
else h.push(utf8Encode(JSON.stringify(v)))
}
+
+ if (salt) {
+ await append(salt)
+ await append(sep)
+ }
+
if (version) {
await append(CORE_VERSION)
await append(sep)
@@ -90,3 +117,28 @@ export async function hash(value: any, options?: HashOptions) {
if (length) res = res.slice(0, length)
return res
}
+
+/**
+ * Streaming file hashing
+ */
+export async function hashFile(
+ filePath: string,
+ algorithm: string = "sha-256"
+): Promise {
+ return new Promise((resolve, reject) => {
+ const hash = createHash(algorithm)
+ const stream = createReadStream(filePath)
+
+ stream.on("data", (chunk) => {
+ hash.update(chunk)
+ })
+
+ stream.on("end", () => {
+ resolve(hash.digest("hex"))
+ })
+
+ stream.on("error", (err) => {
+ reject(err)
+ })
+ })
+}
diff --git a/packages/core/src/ffmpeg.ts b/packages/core/src/ffmpeg.ts
index 27e9ef0b0f..23c75df64d 100644
--- a/packages/core/src/ffmpeg.ts
+++ b/packages/core/src/ffmpeg.ts
@@ -1,79 +1,203 @@
-import { fileTypeFromBuffer } from "file-type"
-import { PassThrough } from "stream"
-import { logError, logVerbose } from "./util"
+import { dotGenaiscriptPath, logError, logVerbose } from "./util"
import { TraceOptions } from "./trace"
import { lookupMime } from "./mime"
-import { host } from "./host"
+import pLimit from "p-limit"
+import { join, basename } from "node:path"
+import { ensureDir } from "fs-extra"
+import type { FfmpegCommand } from "fluent-ffmpeg"
+import { hash } from "./crypto"
+import {
+ VIDEO_AUDIO_DIR_NAME,
+ VIDEO_FRAMES_DIR_NAME,
+ VIDEO_HASH_LENGTH,
+ VIDEO_PROBE_DIR_NAME,
+} from "./constants"
+import { writeFile, readFile } from "fs/promises"
+import { errorMessage, serializeError } from "./error"
-async function ffmpeg() {
+const ffmpegLimit = pLimit(1)
+
+async function ffmpeg(options?: TraceOptions) {
const m = await import("fluent-ffmpeg")
const cmd = m.default
- return cmd()
- .on("start", (commandLine) => logVerbose(commandLine))
- .on("progress", () => process.stderr.write("."))
- .on("stderr", (s) => logVerbose(s))
+ return cmd({ logger: console, timeout: 1000000 })
}
-export async function convertToAudioBlob(
- file: string,
- options: { forceConversion?: boolean } & TraceOptions
-): Promise {
- const { forceConversion } = options
- if (!forceConversion) {
- const mime = lookupMime(file)
- if (/^audio/.test(mime)) {
- const buffer = await host.readFile(file)
- return new Blob([buffer], { type: mime })
- }
- }
+async function computeHashFolder(
+ filename: string,
+ folderid: string,
+ options: { folder?: string } & TraceOptions
+) {
+ const { trace, ...rest } = options
+ const h = await hash([{ filename }, rest], {
+ readWorkspaceFiles: true,
+ version: true,
+ length: VIDEO_HASH_LENGTH,
+ })
+ options.folder = dotGenaiscriptPath("video", folderid, h)
+}
- logVerbose(`ffmpeg: extracting audio from video...`)
- // ffmpeg -i helloworld.mp4 -q:a 0 -map a output.mp3
- return new Promise(async (resolve, reject) => {
- const outputStream = new PassThrough()
- const chunks: Buffer[] = []
- outputStream.on("data", (chunk) => chunks.push(chunk))
- outputStream.on("end", async () => {
- const buffer = Buffer.concat(chunks)
- if (!buffer.length) reject(new Error("conversion failed"))
- const mime = await fileTypeFromBuffer(buffer)
- resolve(new Blob([buffer], { type: mime.mime }))
- })
- outputStream.on("error", (e) => {
- logError(e)
- reject(e)
+export async function runFfmpeg(
+ renderer: (cmd: FfmpegCommand) => Awaitable,
+ options: TraceOptions & { folder?: string }
+): Promise {
+ const { trace, folder } = options
+
+ return ffmpegLimit(async () => {
+ const cmd = await ffmpeg({ trace })
+ cmd.on("start", (commandLine) => {
+ logVerbose(commandLine)
})
- const cmd = await ffmpeg()
- cmd.input(file)
- .noVideo()
- .input(file)
- .toFormat("wav")
- .on("error", reject)
- .pipe(outputStream, { end: true })
+ if (process.env.FFMPEG_DEBUG) cmd.on("stderr", (s) => logVerbose(s))
+
+ const resFilename = options.folder
+ ? join(options.folder, "res.json")
+ : undefined
+ // try cache hit
+ if (resFilename) {
+ try {
+ const res = JSON.parse(
+ await readFile(resFilename, {
+ encoding: "utf-8",
+ })
+ )
+ logVerbose(`video: cache hit at ${options.folder}`)
+ return res as T
+ } catch {}
+ }
+
+ if (folder) {
+ await ensureDir(folder)
+ let log: string[] = []
+ const writeLog = async () => {
+ const logFilename = join(folder, "log.txt")
+ logVerbose(`ffmpeg log: ${logFilename}`)
+ await writeFile(logFilename, log.join("\n"), {
+ encoding: "utf-8",
+ })
+ }
+ cmd.on("stderr", (s) => log.push(s))
+ cmd.on("end", writeLog)
+ cmd.on("error", async (err) => {
+ log.push(`error: ${errorMessage(err)}\n${serializeError(err)}`)
+ await writeLog()
+ })
+ }
+
+ const res = await renderer(cmd)
+ if (resFilename) {
+ logVerbose(`ffmpeg: cache result at ${resFilename}`)
+ await writeFile(resFilename, JSON.stringify(res, null, 2))
+ }
+ return res
})
}
-export async function renderVideoFrames(file: string, timestamps: number[]) {
- const frames: Buffer[] = []
- return new Promise(async (resolve, reject) => {
- const cmd = await ffmpeg()
- cmd.input(file)
- .outputOptions([
- "-f image2pipe",
- "-pix_fmt rgb24",
- "-vcodec rawvideo",
- "-vf select='eq(pict_type,I)'",
- `-ss ${timestamps.join(",")}`,
- ])
- .on("error", reject)
- .on("end", () => resolve(frames))
- .pipe(
- new PassThrough({
- transform: (chunk, _, callback) => {
- frames.push(chunk)
- callback()
- },
+export async function videoExtractAudio(
+ filename: string,
+ options: { forceConversion?: boolean; folder?: string } & TraceOptions
+): Promise {
+ const { trace, forceConversion } = options
+ if (!forceConversion) {
+ const mime = lookupMime(filename)
+ if (/^audio/.test(mime)) return filename
+ }
+ if (!options.folder)
+ await computeHashFolder(filename, VIDEO_AUDIO_DIR_NAME, options)
+ const output = join(options.folder, basename(filename) + ".wav")
+ return await runFfmpeg(
+ async (cmd) =>
+ new Promise(async (resolve, reject) => {
+ /*
+ const outputStream = new PassThrough()
+ const chunks: Buffer[] = []
+ outputStream.on("data", (chunk) => chunks.push(chunk))
+ outputStream.on("end", async () => {
+ await ffmpeg(options) // keep this; it "unplugs" the output stream so that the error is not raised.
+ const buffer = Buffer.concat(chunks)
+ if (!buffer.length) reject(new Error("conversion failed"))
+ resolve(buffer)
})
- )
- })
+ outputStream.on("error", (e) => {
+ logError(e)
+ reject(e)
+ })
+ */
+
+ cmd.input(filename)
+ .noVideo()
+ .toFormat("wav")
+ .save(output)
+ .on("end", () => resolve(output))
+ .on("error", (err) => reject(err))
+ }),
+ options
+ )
+}
+
+export async function videoExtractFrames(
+ filename: string,
+ options: {
+ timestamps?: number[] | string[]
+ filename?: string
+ count?: number
+ size?: string
+ transcript?: TranscriptionResult
+ folder?: string
+ } & TraceOptions
+): Promise {
+ const { trace, transcript, ...screenshotsOptions } = options
+ if (!screenshotsOptions.filename) screenshotsOptions.filename = "%b_%i.png"
+ if (transcript?.segments?.length) {
+ screenshotsOptions.timestamps = transcript.segments.map((s) => s.start)
+ }
+ if (!screenshotsOptions.count && !screenshotsOptions.timestamps)
+ screenshotsOptions.count = 5
+ if (!screenshotsOptions.folder)
+ await computeHashFolder(
+ filename,
+ VIDEO_FRAMES_DIR_NAME,
+ screenshotsOptions
+ )
+
+ return await runFfmpeg(
+ async (cmd) =>
+ new Promise(async (resolve, reject) => {
+ let filenames: string[]
+ cmd.input(filename)
+ .screenshots(screenshotsOptions)
+ .on("error", (err: Error) => {
+ logError(err)
+ reject(err)
+ })
+ .on(
+ "filenames",
+ (fns: string[]) =>
+ (filenames = fns.map((fn) =>
+ join(screenshotsOptions.folder, fn)
+ ))
+ )
+ .on("end", async () => resolve(filenames))
+ }),
+ options
+ )
+}
+
+export async function videoProbe(
+ filename: string,
+ options?: { folder?: string } & TraceOptions
+): Promise {
+ const { trace } = options
+ if (!options.folder)
+ await computeHashFolder(filename, VIDEO_PROBE_DIR_NAME, options)
+ return await runFfmpeg(
+ async (cmd) =>
+ new Promise((resolve, reject) => {
+ cmd.input(filename).ffprobe((err, data) => {
+ if (err) reject(err)
+ else resolve(data as any)
+ })
+ }),
+ options
+ )
}
diff --git a/packages/core/src/openai.ts b/packages/core/src/openai.ts
index 8027845ee8..88d40f5a61 100644
--- a/packages/core/src/openai.ts
+++ b/packages/core/src/openai.ts
@@ -491,7 +491,7 @@ const pullModel: PullModelFunction = async (modelId, options) => {
}
}
-const transcribe = async (
+const transcriber = async (
req: CreateTranscriptionRequest,
cfg: LanguageModelConfiguration,
options: TraceOptions & CancellationOptions
@@ -539,7 +539,7 @@ export function LocalOpenAICompatibleModel(
id: providerId,
listModels: options?.listModels ? listModels : undefined,
pullModel: options?.pullModel ? pullModel : undefined,
- transcribe: options?.transcribe ? transcribe : undefined,
+ transcriber: options?.transcribe ? transcriber : undefined,
})
)
}
diff --git a/packages/core/src/parsers.ts b/packages/core/src/parsers.ts
index 624ba73de2..b83540640d 100644
--- a/packages/core/src/parsers.ts
+++ b/packages/core/src/parsers.ts
@@ -1,5 +1,9 @@
import { CSVTryParse } from "./csv"
-import { filenameOrFileToContent, unfence } from "./unwrappers"
+import {
+ filenameOrFileToContent,
+ filenameOrFileToFilename,
+ unfence,
+} from "./unwrappers"
import { JSON5TryParse } from "./json5"
import { estimateTokens } from "./tokens"
import { TOMLTryParse } from "./toml"
@@ -29,6 +33,7 @@ import { createDiff, llmifyDiff } from "./diff"
import { tidyData } from "./tidy"
import { hash } from "./crypto"
import { GROQEvaluate } from "./groq"
+import { videoExtractAudio, videoExtractFrames, videoProbe } from "./ffmpeg"
export async function createParsers(options: {
trace: MarkdownTrace
@@ -124,5 +129,14 @@ export async function createParsers(options: {
hash: async (text, options) => await hash(text, options),
unfence: unfence,
GROQ: GROQEvaluate,
+ videoFrames: async (file, options) =>
+ await videoExtractFrames(filenameOrFileToFilename(file), {
+ ...(options || {}),
+ trace,
+ }),
+ videoAudio: async (file) =>
+ await videoExtractAudio(filenameOrFileToFilename(file), { trace }),
+ videoProbe: async (file) =>
+ await videoProbe(filenameOrFileToFilename(file), { trace }),
})
}
diff --git a/packages/core/src/runpromptcontext.ts b/packages/core/src/runpromptcontext.ts
index 9bc6323321..89d359a976 100644
--- a/packages/core/src/runpromptcontext.ts
+++ b/packages/core/src/runpromptcontext.ts
@@ -81,11 +81,13 @@ import { agentAddMemory, agentQueryMemory } from "./agent"
import { YAMLStringify } from "./yaml"
import { Project } from "./server/messages"
import { parametersToVars } from "./vars"
-import { resolveBufferLike } from "./bufferlike"
-import { fileTypeFromBuffer } from "file-type"
import prettyBytes from "pretty-bytes"
import { JSONLineCache } from "./cache"
-import { convertToAudioBlob } from "./ffmpeg"
+import { videoExtractAudio } from "./ffmpeg"
+import { BufferToBlob } from "./bufferlike"
+import { host } from "./host"
+import { srtVttRender } from "./transcription"
+import { filenameOrFileToFilename } from "./unwrappers"
export function createChatTurnGenerationContext(
options: GenerationOptions,
@@ -633,7 +635,7 @@ export function createChatGenerationContext(
}
const transcribe = async (
- audio: string,
+ audio: string | WorkspaceFile,
options?: TranscriptionOptions
): Promise => {
const { cache, ...rest } = options || {}
@@ -659,19 +661,23 @@ export function createChatGenerationContext(
})
if (!ok) throw new Error(`failed to pull model ${conn}`)
checkCancelled(cancellationToken)
- const { transcribe } = await resolveLanguageModel(
+ const { transcriber } = await resolveLanguageModel(
configuration.provider
)
- if (!transcribe)
+ if (!transcriber)
throw new Error("model driver not found for " + info.model)
- const file = await convertToAudioBlob(audio, {
- trace: transcriptionTrace,
- })
+ const audioFile = await videoExtractAudio(
+ filenameOrFileToFilename(audio),
+ {
+ trace: transcriptionTrace,
+ }
+ )
+ const file = await BufferToBlob(await host.readFile(audioFile))
const update: () => Promise = async () => {
trace.itemValue(`model`, configuration.model)
trace.itemValue(`file size`, prettyBytes(file.size))
trace.itemValue(`file type`, file.type)
- const res = await transcribe(
+ const res = await transcriber(
{
file,
model: configuration.model,
@@ -684,6 +690,7 @@ export function createChatGenerationContext(
cancellationToken,
}
)
+ srtVttRender(res)
return res
}
diff --git a/packages/core/src/transcription.ts b/packages/core/src/transcription.ts
new file mode 100644
index 0000000000..4ae2582ded
--- /dev/null
+++ b/packages/core/src/transcription.ts
@@ -0,0 +1,49 @@
+export function srtVttRender(transcription: TranscriptionResult) {
+ const segments = transcription.segments
+ if (!segments) return transcription
+
+ const srt = segments
+ .map((segment, index) => {
+ const start = formatSRTTime(segment.start)
+ const end = formatSRTTime(segment.end)
+ return `${index + 1}\n${start} --> ${end}\n${segment.text.trim()}\n`
+ })
+ .join("\n")
+ transcription.srt = srt
+
+ const vtt =
+ "WEBVTT\n\n" +
+ segments
+ .map((segment, index) => {
+ const start = formatVRTTime(segment.start)
+ const end = formatVRTTime(segment.end)
+ return `${start} --> ${end}\n${segment.text.trim()}\n`
+ })
+ .join("\n")
+ transcription.vtt = vtt
+
+ return transcription
+
+ function formatSRTTime(seconds: number): string {
+ const date = new Date(0)
+ date.setSeconds(seconds)
+ const hours = String(date.getUTCHours()).padStart(2, "0")
+ const minutes = String(date.getUTCMinutes()).padStart(2, "0")
+ const secondsPart = String(date.getUTCSeconds()).padStart(2, "0")
+ const milliseconds = String(date.getUTCMilliseconds()).padStart(3, "0")
+ const time = `${hours}:${minutes}:${secondsPart},${milliseconds}`
+ return time.replace(".", ",")
+ }
+
+ function formatVRTTime(seconds: number): string {
+ const date = new Date(0)
+ date.setSeconds(seconds)
+ const hours = String(date.getUTCHours()).padStart(2, "0")
+ const minutes = String(date.getUTCMinutes()).padStart(2, "0")
+ const secondsPart = String(date.getUTCSeconds()).padStart(2, "0")
+ const milliseconds = String(date.getUTCMilliseconds()).padStart(3, "0")
+ let time = `${minutes}:${secondsPart}.${milliseconds}`
+ if (hours !== "00") time = hours + ":" + time
+ return time
+ }
+}
diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts
index 64e819443e..3fbee686ae 100644
--- a/packages/core/src/types/prompt_template.d.ts
+++ b/packages/core/src/types/prompt_template.d.ts
@@ -557,7 +557,7 @@ interface WorkspaceFile {
/**
* Encoding of the content
*/
- encoding?: 'base64'
+ encoding?: "base64"
/**
* Content of the file.
@@ -1396,9 +1396,129 @@ interface Tokenizers {
}
interface HashOptions {
- algorithm?: "sha-1" | "sha-256"
+ /**
+ * Algorithm used for hashing
+ */
+ algorithm?: "sha-256"
+ /**
+ * Trim hash to this number of character
+ */
length?: number
+ /**
+ * Include genaiscript version in the hash
+ */
version?: boolean
+ /**
+ * Optional salting of the hash
+ */
+ salt?: string
+ /**
+ * Read the content of workspace files object into the hash
+ */
+ readWorkspaceFiles?: boolean
+}
+
+interface VideoExtractFramesOptions {
+ timestamps?: number[] | string[]
+ count?: number
+ size?: string
+ transcript?: TranscriptionResult
+}
+
+interface VideoProbeResult {
+ streams: {
+ index: number
+ codec_name: string
+ codec_long_name: string
+ profile: string
+ codec_type: string
+ codec_tag_string: string
+ codec_tag: string
+ width?: number
+ height?: number
+ coded_width?: number
+ coded_height?: number
+ closed_captions?: number
+ film_grain?: number
+ has_b_frames?: number
+ sample_aspect_ratio?: string
+ display_aspect_ratio?: string
+ pix_fmt?: string
+ level?: number
+ color_range?: string
+ color_space?: string
+ color_transfer?: string
+ color_primaries?: string
+ chroma_location?: string
+ field_order?: string
+ refs?: number
+ is_avc?: string
+ nal_length_size?: number
+ id: string
+ r_frame_rate: string
+ avg_frame_rate: string
+ time_base: string
+ start_pts: number
+ start_time: number
+ duration_ts: number
+ duration: number
+ bit_rate: number
+ max_bit_rate: string
+ bits_per_raw_sample: number | string
+ nb_frames: number | string
+ nb_read_frames?: string
+ nb_read_packets?: string
+ extradata_size?: number
+ tags?: {
+ creation_time: string
+ language?: string
+ handler_name: string
+ vendor_id?: string
+ encoder?: string
+ }
+ disposition?: {
+ default: number
+ dub: number
+ original: number
+ comment: number
+ lyrics: number
+ karaoke: number
+ forced: number
+ hearing_impaired: number
+ visual_impaired: number
+ clean_effects: number
+ attached_pic: number
+ timed_thumbnails: number
+ captions: number
+ descriptions: number
+ metadata: number
+ dependent: number
+ still_image: number
+ }
+ sample_fmt?: string
+ sample_rate?: number
+ channels?: number
+ channel_layout?: string
+ bits_per_sample?: number | string
+ }[]
+ format: {
+ filename: string
+ nb_streams: number
+ nb_programs: number
+ format_name: string
+ format_long_name: string
+ start_time: number
+ duration: number
+ size: number
+ bit_rate: number
+ probe_score: number
+ tags: {
+ major_brand: string
+ minor_version: string
+ compatible_brands: string
+ creation_time: string
+ }
+ }
}
interface Parsers {
@@ -1626,6 +1746,28 @@ interface Parsers {
* @param language
*/
unfence(text: string, language: string): string
+
+ /**
+ * Extracts metadata information from a video file using ffprobe
+ * @param filename
+ */
+ videoProbe(filename: string | WorkspaceFile): Promise<VideoProbeResult>
+
+ /**
+ * Extracts frames from a video file
+ * @param videoPath
+ * @param options
+ */
+ videoFrames(
+ videoPath: string | WorkspaceFile,
+ options?: VideoExtractFramesOptions
+ ): Promise<string[]>
+
+ /**
+ * Extract the audio track from a video
+ * @param videoPath
+ */
+ videoAudio(videoPath: string | WorkspaceFile): Promise<string>
}
interface AICIGenOptions {
@@ -1860,7 +2002,7 @@ interface Git {
* @param options various clone options
* @returns the path to the cloned repository
*/
- async shallowClone(
+ shallowClone(
repository: string,
options?: {
/**
@@ -2681,7 +2823,13 @@ type McpServersConfig = Record<string, McpServerConfig>
type ZodTypeLike = { _def: any; safeParse: any; refine: any }
-type BufferLike = string | WorkspaceFile | Buffer | Blob | ArrayBuffer | ReadableStream
+type BufferLike =
+ | string
+ | WorkspaceFile
+ | Buffer
+ | Blob
+ | ArrayBuffer
+ | ReadableStream
interface TranscriptionOptions {
/**
@@ -2701,7 +2849,7 @@ interface TranscriptionOptions {
language?: string
/**
- * The sampling temperature, between 0 and 1.
+ * The sampling temperature, between 0 and 1.
* Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
*/
temperature?: number
@@ -2721,12 +2869,23 @@ interface TranscriptionResult {
* Error if any
*/
error?: SerializedError
+
+ /**
+ * SubRip subtitle string from segments
+ */
+ srt?: string
+
+ /**
+ * WebVTT subtitle string from segments
+ */
+ vtt?: string
+
/**
* Individual segments
*/
segments?: {
/**
- * The start time of the segment
+ * The start time of the segment
*/
start: number
/**
@@ -2798,7 +2957,10 @@ interface ChatGenerationContext extends ChatTurnGenerationContext {
): RunPromptResultPromiseWithOptions
defFileMerge(fn: FileMergeHandler): void
defOutputProcessor(fn: PromptOutputProcessorHandler): void
- transcribe(audio: string, options?: TranscriptionOptions): Promise<TranscriptionResult>
+ transcribe(
+ audio: string | WorkspaceFile,
+ options?: TranscriptionOptions
+ ): Promise<TranscriptionResult>
}
interface GenerationOutput {
@@ -3033,10 +3195,12 @@ interface BrowseSessionOptions extends BrowserOptions, TimeoutOptions {
/**
* Enable recording video for all pages. Implies incognito mode.
*/
- recordVideo?: boolean | {
- width: number
- height: number
- }
+ recordVideo?:
+ | boolean
+ | {
+ width: number
+ height: number
+ }
}
interface TimeoutOptions {
diff --git a/packages/core/src/types/prompt_type.d.ts b/packages/core/src/types/prompt_type.d.ts
index ed36f48cda..bf8ce93fc3 100644
--- a/packages/core/src/types/prompt_type.d.ts
+++ b/packages/core/src/types/prompt_type.d.ts
@@ -333,6 +333,6 @@ declare function defChatParticipant(
* @param options
*/
declare function transcribe(
- audio: string,
+ audio: string | WorkspaceFile,
options?: TranscriptionOptions
): Promise<TranscriptionResult>
diff --git a/packages/core/src/unwrappers.ts b/packages/core/src/unwrappers.ts
index 195ab0f54d..3a3f1b44fd 100644
--- a/packages/core/src/unwrappers.ts
+++ b/packages/core/src/unwrappers.ts
@@ -37,6 +37,14 @@ export function filenameOrFileToContent(
: fileOrContent?.content
}
+export function filenameOrFileToFilename(
+ fileOrContent: string | WorkspaceFile
+): string {
+ return typeof fileOrContent === "string"
+ ? fileOrContent
+ : fileOrContent?.filename
+}
+
export function trimNewlines(s: string) {
return s?.replace(/^\n*/, "").replace(/\n*$/, "")
}
diff --git a/packages/sample/genaisrc/transcribe.genai.mjs b/packages/sample/genaisrc/transcribe.genai.mjs
index 8336a258fd..9fd878d174 100644
--- a/packages/sample/genaisrc/transcribe.genai.mjs
+++ b/packages/sample/genaisrc/transcribe.genai.mjs
@@ -14,3 +14,6 @@ console.log(res2)
const resv = await transcribe("src/audio/helloworld.mp4")
console.log(resv)
+
+const resv2 = await transcribe({ filename: "src/audio/helloworld.mp4" })
+console.log(resv2)
diff --git a/packages/sample/genaisrc/video-alt-text.genai.mjs b/packages/sample/genaisrc/video-alt-text.genai.mjs
new file mode 100644
index 0000000000..c3c13f70c5
--- /dev/null
+++ b/packages/sample/genaisrc/video-alt-text.genai.mjs
@@ -0,0 +1,27 @@
+script({
+ description: "Generate a description alt text for a video",
+ accept: ".mp4,.webm",
+ system: [
+ "system.output_plaintext",
+ "system.safety_jailbreak",
+ "system.safety_harmful_content",
+ "system.safety_validate_harmful_content",
+ ],
+})
+
+const file = env.files[0]
+const transcript = await transcribe(file) // OpenAI whisper
+const frames = await parsers.videoFrames(file, {
+ transcript,
+}) // ffmpeg to extract frames
+
+def("TRANSCRIPT", transcript?.srt, { ignoreEmpty: true }) // ignore silent videos
+defImages(frames, { detail: "low" }) // low detail for better performance
+
+$`You are an expert in assistive technology.
+You will analyze the video and generate a description alt text for the video.
+
+- The video is included as a set of images and the transcript.
+- Do not include alt text in the description.
+- Keep it short but descriptive.
+- Do not generate the [ character.`
diff --git a/packages/sample/genaisrc/video-transcript.genai.mjs b/packages/sample/genaisrc/video-transcript.genai.mjs
new file mode 100644
index 0000000000..901e35574e
--- /dev/null
+++ b/packages/sample/genaisrc/video-transcript.genai.mjs
@@ -0,0 +1,13 @@
+const transcript = await transcribe("src/audio/helloworld.mp4")
+console.log(transcript.srt)
+console.log(transcript.vtt)
+console.log(JSON.stringify(transcript, null, 2))
+def("TRANSCRIPT", transcript.srt, {
+ language: "srt",
+ ignoreEmpty: true,
+})
+const frames = await parsers.videoFrames("src/audio/helloworld.mp4", {
+ transcript,
+})
+defImages(frames)
+$`Describe the video using the screenshots and the transcript.`
diff --git a/packages/sample/genaisrc/video.genai.mjs b/packages/sample/genaisrc/video.genai.mjs
new file mode 100644
index 0000000000..1645326d47
--- /dev/null
+++ b/packages/sample/genaisrc/video.genai.mjs
@@ -0,0 +1,15 @@
+const info = await parsers.videoProbe("src/audio/helloworld.mp4")
+console.log(JSON.stringify(info, null, 2))
+const { duration, width, height } = info.streams[0]
+console.log({ duration, width, height })
+const frames = await parsers.videoFrames("src/audio/helloworld.mp4")
+console.log(frames)
+defImages(frames)
+
+const more = await parsers.videoFrames(
+ "https://github.com/microsoft/jacdac-docs/raw/refs/heads/main/static/videos/addbutton.webm"
+)
+console.log(more)
+defImages(more)
+
+$`Describe the images.`