diff --git a/.env.example b/.env.example index 58cd00de..59eebe9f 100644 --- a/.env.example +++ b/.env.example @@ -4,3 +4,4 @@ WHISPER_VERBOSE=true PORT=3123 DEV=true # local development mode DATA_DIR_PATH= # only for docker, otherwise leave empty +MINIMAX_API_KEY= # optional, enables MiniMax cloud TTS provider. Get your key at https://platform.minimax.io/ diff --git a/src/config.ts b/src/config.ts index 73966d98..755b3263 100644 --- a/src/config.ts +++ b/src/config.ts @@ -46,6 +46,7 @@ export class Config { public whisperVersion: string = whisperVersion; public whisperModel: whisperModels = defaultWhisperModel; public kokoroModelPrecision: kokoroModelPrecision = "fp32"; + public minimaxApiKey: string | null = null; // docker-specific, performance-related settings to prevent memory issues public concurrency?: number; @@ -89,6 +90,8 @@ export class Config { .KOKORO_MODEL_PRECISION as kokoroModelPrecision; } + this.minimaxApiKey = process.env.MINIMAX_API_KEY || null; + this.concurrency = process.env.CONCURRENCY ? parseInt(process.env.CONCURRENCY) : undefined; diff --git a/src/index.ts b/src/index.ts index 1e13b73d..c5b3b620 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,6 +3,7 @@ import path from "path"; import fs from "fs-extra"; import { Kokoro } from "./short-creator/libraries/Kokoro"; +import { MiniMax } from "./short-creator/libraries/MiniMax"; import { Remotion } from "./short-creator/libraries/Remotion"; import { Whisper } from "./short-creator/libraries/Whisper"; import { FFMpeg } from "./short-creator/libraries/FFmpeg"; @@ -41,6 +42,13 @@ async function main() { const ffmpeg = await FFMpeg.init(); const pexelsApi = new PexelsAPI(config.pexelsApiKey); + const minimax = config.minimaxApiKey + ? 
MiniMax.fromApiKey(config.minimaxApiKey) + : undefined; + if (minimax) { + logger.info("MiniMax TTS provider enabled"); + } + logger.debug("initializing the short creator"); const shortCreator = new ShortCreator( config, @@ -50,6 +58,7 @@ async function main() { ffmpeg, pexelsApi, musicManager, + minimax, ); if (!config.runningInDocker) { diff --git a/src/short-creator/ShortCreator.ts b/src/short-creator/ShortCreator.ts index 28497b99..ea5dd4b4 100644 --- a/src/short-creator/ShortCreator.ts +++ b/src/short-creator/ShortCreator.ts @@ -1,4 +1,4 @@ -import { OrientationEnum } from "./../types/shorts"; +import { OrientationEnum, MiniMaxVoiceEnum } from "./../types/shorts"; /* eslint-disable @remotion/deterministic-randomness */ import fs from "fs-extra"; import cuid from "cuid"; @@ -7,6 +7,7 @@ import https from "https"; import http from "http"; import { Kokoro } from "./libraries/Kokoro"; +import { MiniMax } from "./libraries/MiniMax"; import { Remotion } from "./libraries/Remotion"; import { Whisper } from "./libraries/Whisper"; import { FFMpeg } from "./libraries/FFmpeg"; @@ -38,6 +39,7 @@ export class ShortCreator { private ffmpeg: FFMpeg, private pexelsApi: PexelsAPI, private musicManager: MusicManager, + private minimax?: MiniMax, ) {} public status(id: string): VideoStatus { @@ -106,12 +108,29 @@ export class ShortCreator { const orientation: OrientationEnum = config.orientation || OrientationEnum.portrait; + const MINIMAX_VOICES = new Set(Object.values(MiniMaxVoiceEnum) as string[]); + let index = 0; for (const scene of inputScenes) { - const audio = await this.kokoro.generate( - scene.text, - config.voice ?? 
"af_heart", - ); + const isMinimaxVoice = + config.voice && MINIMAX_VOICES.has(config.voice); + let audio: { audio: ArrayBuffer; audioLength: number }; + if (isMinimaxVoice) { + if (!this.minimax) { + throw new Error( + "MINIMAX_API_KEY environment variable is required to use MiniMax TTS voices", + ); + } + audio = await this.minimax.generate( + scene.text, + config.voice as `${MiniMaxVoiceEnum}`, + ); + } else { + audio = await this.kokoro.generate( + scene.text, + config.voice ?? "af_heart", + ); + } let { audioLength } = audio; const { audio: audioStream } = audio; @@ -292,6 +311,8 @@ export class ShortCreator { } public ListAvailableVoices(): string[] { - return this.kokoro.listAvailableVoices(); + const kokoroVoices = this.kokoro.listAvailableVoices(); + const minimaxVoices = this.minimax ? this.minimax.listAvailableVoices() : []; + return [...kokoroVoices, ...minimaxVoices]; } } diff --git a/src/short-creator/libraries/MiniMax.test.ts b/src/short-creator/libraries/MiniMax.test.ts new file mode 100644 index 00000000..d44f2241 --- /dev/null +++ b/src/short-creator/libraries/MiniMax.test.ts @@ -0,0 +1,176 @@ +process.env.LOG_LEVEL = "debug"; + +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { MiniMax } from "./MiniMax"; +import { MiniMaxVoiceEnum } from "../../types/shorts"; + +const DEFAULT_VOICE = MiniMaxVoiceEnum.English_Graceful_Lady; + +describe("MiniMax TTS", () => { + let mockFetch: ReturnType<typeof vi.fn>; + + beforeEach(() => { + mockFetch = vi.fn(); + vi.stubGlobal("fetch", mockFetch); + }); + + afterEach(() => { + vi.unstubAllGlobals(); + }); + + describe("listAvailableVoices", () => { + it("returns all MiniMax voice IDs", () => { + const minimax = new MiniMax("test-key"); + const voices = minimax.listAvailableVoices(); + expect(voices).toEqual(Object.values(MiniMaxVoiceEnum)); + expect(voices.length).toBeGreaterThan(0); + }); + }); + + describe("generate", () => { + it("calls the MiniMax TTS API with correct parameters", async 
() => { + const hexAudio = Buffer.from("mock audio data").toString("hex"); + mockFetch.mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + data: { audio: hexAudio, status: 2 }, + extra_info: { audio_length: 1500 }, + base_resp: { status_code: 0, status_msg: "success" }, + }), + }); + + const minimax = new MiniMax("test-key"); + const result = await minimax.generate("Hello world", DEFAULT_VOICE); + + expect(mockFetch).toHaveBeenCalledWith( + "https://api.minimax.io/v1/t2a_v2", + expect.objectContaining({ + method: "POST", + headers: expect.objectContaining({ + Authorization: "Bearer test-key", + "Content-Type": "application/json", + }), + }), + ); + + const callBody = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(callBody.voice_setting.voice_id).toBe(DEFAULT_VOICE); + expect(callBody.stream).toBe(false); + expect(callBody.model).toBe("speech-2.8-hd"); + + expect(result.audioLength).toBe(1.5); + expect(result.audio).toBeInstanceOf(ArrayBuffer); + }); + + it("uses custom base URL when provided", async () => { + const hexAudio = Buffer.from("mock audio").toString("hex"); + mockFetch.mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + data: { audio: hexAudio, status: 2 }, + extra_info: { audio_length: 1000 }, + base_resp: { status_code: 0, status_msg: "success" }, + }), + }); + + const minimax = new MiniMax("test-key", "https://custom.example.com"); + await minimax.generate("Hello", DEFAULT_VOICE); + + expect(mockFetch).toHaveBeenCalledWith( + "https://custom.example.com/v1/t2a_v2", + expect.anything(), + ); + }); + + it("throws on HTTP error response", async () => { + mockFetch.mockResolvedValue({ + ok: false, + status: 401, + statusText: "Unauthorized", + }); + + const minimax = new MiniMax("bad-key"); + await expect( + minimax.generate("Hello", DEFAULT_VOICE), + ).rejects.toThrow("MiniMax TTS API error: 401 Unauthorized"); + }); + + it("throws on non-zero status_code in response body", async () => { + 
mockFetch.mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + base_resp: { status_code: 1001, status_msg: "Invalid API key" }, + }), + }); + + const minimax = new MiniMax("bad-key"); + await expect( + minimax.generate("Hello", DEFAULT_VOICE), + ).rejects.toThrow("MiniMax TTS error: Invalid API key"); + }); + + it("decodes hex-encoded audio correctly", async () => { + const expectedBytes = [0x49, 0x44, 0x33]; // "ID3" MP3 header + const hexAudio = Buffer.from(expectedBytes).toString("hex"); + mockFetch.mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + data: { audio: hexAudio, status: 2 }, + extra_info: { audio_length: 500 }, + base_resp: { status_code: 0, status_msg: "success" }, + }), + }); + + const minimax = new MiniMax("test-key"); + const result = await minimax.generate("Hi", DEFAULT_VOICE); + + const resultBytes = Array.from(new Uint8Array(result.audio)); + expect(resultBytes).toEqual(expectedBytes); + }); + + it("converts audio_length from milliseconds to seconds", async () => { + const hexAudio = Buffer.from("audio").toString("hex"); + mockFetch.mockResolvedValue({ + ok: true, + json: () => + Promise.resolve({ + data: { audio: hexAudio, status: 2 }, + extra_info: { audio_length: 3200 }, + base_resp: { status_code: 0, status_msg: "success" }, + }), + }); + + const minimax = new MiniMax("test-key"); + const result = await minimax.generate("Hello", DEFAULT_VOICE); + expect(result.audioLength).toBe(3.2); + }); + }); + + describe("fromApiKey", () => { + it("creates instance with the provided API key", () => { + const minimax = MiniMax.fromApiKey("my-api-key"); + expect(minimax).toBeInstanceOf(MiniMax); + }); + }); +}); + +describe("MiniMax TTS E2E", () => { + it.skipIf(!process.env.MINIMAX_API_KEY)( + "synthesizes speech with real API", + async () => { + const minimax = MiniMax.fromApiKey(process.env.MINIMAX_API_KEY!); + const result = await minimax.generate( + "Hello, this is a test.", + MiniMaxVoiceEnum.English_Graceful_Lady, + 
); + expect(result.audio).toBeInstanceOf(ArrayBuffer); + expect(result.audio.byteLength).toBeGreaterThan(100); + expect(result.audioLength).toBeGreaterThan(0); + }, + 30000, + ); +}); diff --git a/src/short-creator/libraries/MiniMax.ts b/src/short-creator/libraries/MiniMax.ts new file mode 100644 index 00000000..83a93ed1 --- /dev/null +++ b/src/short-creator/libraries/MiniMax.ts @@ -0,0 +1,78 @@ +import { logger } from "../../config"; +import { MiniMaxVoiceEnum } from "../../types/shorts"; + +export type MiniMaxVoice = `${MiniMaxVoiceEnum}`; + +export class MiniMax { + constructor( + private apiKey: string, + private baseUrl: string = "https://api.minimax.io", + ) {} + + async generate( + text: string, + voice: MiniMaxVoice, + model: string = "speech-2.8-hd", + ): Promise<{ audio: ArrayBuffer; audioLength: number }> { + const response = await fetch(`${this.baseUrl}/v1/t2a_v2`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify({ + model, + text, + stream: false, + voice_setting: { + voice_id: voice, + speed: 1, + vol: 1, + pitch: 0, + }, + audio_setting: { + sample_rate: 32000, + bitrate: 128000, + format: "mp3", + channel: 1, + }, + }), + }); + + if (!response.ok) { + throw new Error( + `MiniMax TTS API error: ${response.status} ${response.statusText}`, + ); + } + + const result = await response.json(); + + if (result.base_resp?.status_code !== 0) { + throw new Error( + `MiniMax TTS error: ${result.base_resp?.status_msg ?? "Unknown error"}`, + ); + } + + const buf = Buffer.from(result.data.audio, "hex"); + const audio = buf.buffer.slice( + buf.byteOffset, + buf.byteOffset + buf.byteLength, + ) as ArrayBuffer; + const audioLength = (result.extra_info?.audio_length ?? 
0) / 1000; + + logger.debug( + { text, voice, audioLength }, + "Audio generated with MiniMax TTS", + ); + + return { audio, audioLength }; + } + + listAvailableVoices(): MiniMaxVoice[] { + return Object.values(MiniMaxVoiceEnum) as MiniMaxVoice[]; + } + + static fromApiKey(apiKey: string): MiniMax { + return new MiniMax(apiKey); + } +} diff --git a/src/types/shorts.ts b/src/types/shorts.ts index 5065e0eb..d76e6b6c 100644 --- a/src/types/shorts.ts +++ b/src/types/shorts.ts @@ -71,6 +71,15 @@ export enum VoiceEnum { bm_fable = "bm_fable", } +export enum MiniMaxVoiceEnum { + English_Graceful_Lady = "English_Graceful_Lady", + English_Insightful_Speaker = "English_Insightful_Speaker", + English_radiant_girl = "English_radiant_girl", + English_Persuasive_Man = "English_Persuasive_Man", + English_Lucky_Robot = "English_Lucky_Robot", + English_expressive_narrator = "English_expressive_narrator", +} + export enum OrientationEnum { landscape = "landscape", portrait = "portrait", @@ -105,9 +114,11 @@ export const renderConfig = z.object({ "Background color of the caption, a valid css color, default is blue", ), voice: z - .nativeEnum(VoiceEnum) + .union([z.nativeEnum(VoiceEnum), z.nativeEnum(MiniMaxVoiceEnum)]) .optional() - .describe("Voice to be used for the speech, default is af_heart"), + .describe( + "Voice to be used for the speech. Use VoiceEnum values for local Kokoro TTS (default: af_heart) or MiniMaxVoiceEnum values for MiniMax cloud TTS (requires MINIMAX_API_KEY).", + ), orientation: z .nativeEnum(OrientationEnum) .optional() @@ -119,7 +130,7 @@ export const renderConfig = z.object({ }); export type RenderConfig = z.infer<typeof renderConfig>; -export type Voices = `${VoiceEnum}`; +export type Voices = `${VoiceEnum}` | `${MiniMaxVoiceEnum}`; export type Video = { id: string;