Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ WHISPER_VERBOSE=true
PORT=3123
DEV=true # local development mode
DATA_DIR_PATH= # only for docker, otherwise leave empty
MINIMAX_API_KEY= # optional, enables MiniMax cloud TTS provider. Get your key at https://platform.minimax.io/
3 changes: 3 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ export class Config {
public whisperVersion: string = whisperVersion;
public whisperModel: whisperModels = defaultWhisperModel;
public kokoroModelPrecision: kokoroModelPrecision = "fp32";
public minimaxApiKey: string | null = null;

// docker-specific, performance-related settings to prevent memory issues
public concurrency?: number;
Expand Down Expand Up @@ -89,6 +90,8 @@ export class Config {
.KOKORO_MODEL_PRECISION as kokoroModelPrecision;
}

this.minimaxApiKey = process.env.MINIMAX_API_KEY || null;

this.concurrency = process.env.CONCURRENCY
? parseInt(process.env.CONCURRENCY)
: undefined;
Expand Down
9 changes: 9 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import path from "path";
import fs from "fs-extra";

import { Kokoro } from "./short-creator/libraries/Kokoro";
import { MiniMax } from "./short-creator/libraries/MiniMax";
import { Remotion } from "./short-creator/libraries/Remotion";
import { Whisper } from "./short-creator/libraries/Whisper";
import { FFMpeg } from "./short-creator/libraries/FFmpeg";
Expand Down Expand Up @@ -41,6 +42,13 @@ async function main() {
const ffmpeg = await FFMpeg.init();
const pexelsApi = new PexelsAPI(config.pexelsApiKey);

const minimax = config.minimaxApiKey
? MiniMax.fromApiKey(config.minimaxApiKey)
: undefined;
if (minimax) {
logger.info("MiniMax TTS provider enabled");
}

logger.debug("initializing the short creator");
const shortCreator = new ShortCreator(
config,
Expand All @@ -50,6 +58,7 @@ async function main() {
ffmpeg,
pexelsApi,
musicManager,
minimax,
);

if (!config.runningInDocker) {
Expand Down
33 changes: 27 additions & 6 deletions src/short-creator/ShortCreator.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { OrientationEnum } from "./../types/shorts";
import { OrientationEnum, MiniMaxVoiceEnum } from "./../types/shorts";
/* eslint-disable @remotion/deterministic-randomness */
import fs from "fs-extra";
import cuid from "cuid";
Expand All @@ -7,6 +7,7 @@ import https from "https";
import http from "http";

import { Kokoro } from "./libraries/Kokoro";
import { MiniMax } from "./libraries/MiniMax";
import { Remotion } from "./libraries/Remotion";
import { Whisper } from "./libraries/Whisper";
import { FFMpeg } from "./libraries/FFmpeg";
Expand Down Expand Up @@ -38,6 +39,7 @@ export class ShortCreator {
private ffmpeg: FFMpeg,
private pexelsApi: PexelsAPI,
private musicManager: MusicManager,
private minimax?: MiniMax,
) {}

public status(id: string): VideoStatus {
Expand Down Expand Up @@ -106,12 +108,29 @@ export class ShortCreator {
const orientation: OrientationEnum =
config.orientation || OrientationEnum.portrait;

const MINIMAX_VOICES = new Set(Object.values(MiniMaxVoiceEnum) as string[]);

let index = 0;
for (const scene of inputScenes) {
const audio = await this.kokoro.generate(
scene.text,
config.voice ?? "af_heart",
);
const isMinimaxVoice =
config.voice && MINIMAX_VOICES.has(config.voice);
let audio: { audio: ArrayBuffer; audioLength: number };
if (isMinimaxVoice) {
if (!this.minimax) {
throw new Error(
"MINIMAX_API_KEY environment variable is required to use MiniMax TTS voices",
);
}
audio = await this.minimax.generate(
scene.text,
config.voice as `${MiniMaxVoiceEnum}`,
);
} else {
audio = await this.kokoro.generate(
scene.text,
config.voice ?? "af_heart",
);
}
let { audioLength } = audio;
const { audio: audioStream } = audio;

Expand Down Expand Up @@ -292,6 +311,8 @@ export class ShortCreator {
}

/**
 * Lists every voice id that can be requested.
 *
 * Kokoro voices come first; MiniMax voices are appended only when a MiniMax
 * client was supplied to this instance.
 *
 * @returns Kokoro voice ids followed by MiniMax voice ids (if any).
 */
public ListAvailableVoices(): string[] {
  const kokoroVoices = this.kokoro.listAvailableVoices();
  // No early return here: returning the Kokoro list on its own would make the
  // MiniMax voices unreachable.
  const minimaxVoices = this.minimax?.listAvailableVoices() ?? [];
  return [...kokoroVoices, ...minimaxVoices];
}
}
176 changes: 176 additions & 0 deletions src/short-creator/libraries/MiniMax.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
// Sets the LOG_LEVEL environment variable to "debug" for this test file
// (presumably read by the project logger — confirm).
// NOTE(review): static `import` declarations are hoisted in ES modules, so this
// assignment may execute only after the imported modules have been evaluated —
// verify that it takes effect early enough.
process.env.LOG_LEVEL = "debug";

import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { MiniMax } from "./MiniMax";
import { MiniMaxVoiceEnum } from "../../types/shorts";

const DEFAULT_VOICE = MiniMaxVoiceEnum.English_Graceful_Lady;

/**
 * Builds the fetch-response stub used by the happy-path tests: `ok: true` and a
 * `json()` that resolves to a body carrying the hex audio, the reported length
 * in milliseconds and a zero status code.
 */
function successResponse(hexAudio: string, audioLengthMs: number) {
  return {
    ok: true,
    json: () =>
      Promise.resolve({
        data: { audio: hexAudio, status: 2 },
        extra_info: { audio_length: audioLengthMs },
        base_resp: { status_code: 0, status_msg: "success" },
      }),
  };
}

// Unit tests: the global `fetch` is replaced by a mock, so nothing leaves the process.
describe("MiniMax TTS", () => {
  let fetchStub: ReturnType<typeof vi.fn>;

  beforeEach(() => {
    // A fresh mock per test keeps recorded calls from leaking between cases.
    fetchStub = vi.fn();
    vi.stubGlobal("fetch", fetchStub);
  });

  afterEach(() => {
    vi.unstubAllGlobals();
  });

  describe("listAvailableVoices", () => {
    it("returns all MiniMax voice IDs", () => {
      const client = new MiniMax("test-key");
      const listed = client.listAvailableVoices();

      expect(listed).toEqual(Object.values(MiniMaxVoiceEnum));
      expect(listed.length).toBeGreaterThan(0);
    });
  });

  describe("generate", () => {
    it("calls the MiniMax TTS API with correct parameters", async () => {
      const encoded = Buffer.from("mock audio data").toString("hex");
      fetchStub.mockResolvedValue(successResponse(encoded, 1500));

      const client = new MiniMax("test-key");
      const outcome = await client.generate("Hello world", DEFAULT_VOICE);

      // Endpoint, verb and auth headers.
      const expectedHeaders = expect.objectContaining({
        Authorization: "Bearer test-key",
        "Content-Type": "application/json",
      });
      expect(fetchStub).toHaveBeenCalledWith(
        "https://api.minimax.io/v1/t2a_v2",
        expect.objectContaining({ method: "POST", headers: expectedHeaders }),
      );

      // JSON payload of the first (and only) request.
      const requestInit = fetchStub.mock.calls[0][1];
      const payload = JSON.parse(requestInit.body);
      expect(payload.voice_setting.voice_id).toBe(DEFAULT_VOICE);
      expect(payload.stream).toBe(false);
      expect(payload.model).toBe("speech-2.8-hd");

      // Returned value.
      expect(outcome.audioLength).toBe(1.5);
      expect(outcome.audio).toBeInstanceOf(ArrayBuffer);
    });

    it("uses custom base URL when provided", async () => {
      const encoded = Buffer.from("mock audio").toString("hex");
      fetchStub.mockResolvedValue(successResponse(encoded, 1000));

      const client = new MiniMax("test-key", "https://custom.example.com");
      await client.generate("Hello", DEFAULT_VOICE);

      expect(fetchStub).toHaveBeenCalledWith(
        "https://custom.example.com/v1/t2a_v2",
        expect.anything(),
      );
    });

    it("throws on HTTP error response", async () => {
      const unauthorized = {
        ok: false,
        status: 401,
        statusText: "Unauthorized",
      };
      fetchStub.mockResolvedValue(unauthorized);

      const client = new MiniMax("bad-key");
      const pending = client.generate("Hello", DEFAULT_VOICE);

      await expect(pending).rejects.toThrow(
        "MiniMax TTS API error: 401 Unauthorized",
      );
    });

    it("throws on non-zero status_code in response body", async () => {
      // HTTP-level success, but the body itself reports a failure.
      const rejectedBody = {
        base_resp: { status_code: 1001, status_msg: "Invalid API key" },
      };
      fetchStub.mockResolvedValue({
        ok: true,
        json: () => Promise.resolve(rejectedBody),
      });

      const client = new MiniMax("bad-key");
      const pending = client.generate("Hello", DEFAULT_VOICE);

      await expect(pending).rejects.toThrow(
        "MiniMax TTS error: Invalid API key",
      );
    });

    it("decodes hex-encoded audio correctly", async () => {
      const expectedBytes = [0x49, 0x44, 0x33]; // "ID3" MP3 header
      const encoded = Buffer.from(expectedBytes).toString("hex");
      fetchStub.mockResolvedValue(successResponse(encoded, 500));

      const client = new MiniMax("test-key");
      const { audio } = await client.generate("Hi", DEFAULT_VOICE);

      expect(Array.from(new Uint8Array(audio))).toEqual(expectedBytes);
    });

    it("converts audio_length from milliseconds to seconds", async () => {
      const encoded = Buffer.from("audio").toString("hex");
      fetchStub.mockResolvedValue(successResponse(encoded, 3200));

      const client = new MiniMax("test-key");
      const { audioLength } = await client.generate("Hello", DEFAULT_VOICE);

      expect(audioLength).toBe(3.2);
    });
  });

  describe("fromApiKey", () => {
    it("creates instance with the provided API key", () => {
      expect(MiniMax.fromApiKey("my-api-key")).toBeInstanceOf(MiniMax);
    });
  });
});

// Live check against the real service; skipped when MINIMAX_API_KEY is not set.
describe("MiniMax TTS E2E", () => {
  // Timeout passed to the test runner for the real network round trip.
  const REAL_API_TIMEOUT_MS = 30000;

  const synthesizeForReal = async () => {
    const client = MiniMax.fromApiKey(process.env.MINIMAX_API_KEY!);
    const { audio, audioLength } = await client.generate(
      "Hello, this is a test.",
      MiniMaxVoiceEnum.English_Graceful_Lady,
    );

    expect(audio).toBeInstanceOf(ArrayBuffer);
    expect(audio.byteLength).toBeGreaterThan(100);
    expect(audioLength).toBeGreaterThan(0);
  };

  it.skipIf(!process.env.MINIMAX_API_KEY)(
    "synthesizes speech with real API",
    synthesizeForReal,
    REAL_API_TIMEOUT_MS,
  );
});
78 changes: 78 additions & 0 deletions src/short-creator/libraries/MiniMax.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { logger } from "../../config";
import { MiniMaxVoiceEnum } from "../../types/shorts";

export type MiniMaxVoice = `${MiniMaxVoiceEnum}`;

/**
 * Minimal client for the MiniMax text-to-speech HTTP endpoint (`/v1/t2a_v2`).
 */
export class MiniMax {
  /**
   * @param apiKey  Sent as a `Bearer` token on every request.
   * @param baseUrl Origin of the API. Trailing slashes are ignored when the
   *                request URL is built.
   */
  constructor(
    private readonly apiKey: string,
    private readonly baseUrl: string = "https://api.minimax.io",
  ) {}

  /**
   * Synthesizes `text` with the given voice in a single non-streaming request.
   *
   * @param text  Text to speak.
   * @param voice MiniMax voice id.
   * @param model Model name placed in the request body.
   * @returns The decoded audio bytes (MP3 is requested) and the duration in
   *          seconds, derived from the `extra_info.audio_length` value in
   *          milliseconds (0 when the field is absent).
   * @throws Error when the HTTP status is not OK, when the body reports a
   *         non-zero `base_resp.status_code`, or when the body carries no
   *         usable hex-encoded audio.
   */
  async generate(
    text: string,
    voice: MiniMaxVoice,
    model: string = "speech-2.8-hd",
  ): Promise<{ audio: ArrayBuffer; audioLength: number }> {
    // Strip trailing slashes so "https://host/" does not yield "//v1/t2a_v2".
    const endpoint = `${this.baseUrl.replace(/\/+$/, "")}/v1/t2a_v2`;

    const response = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.apiKey}`,
      },
      body: JSON.stringify({
        model,
        text,
        stream: false,
        voice_setting: {
          voice_id: voice,
          speed: 1,
          vol: 1,
          pitch: 0,
        },
        audio_setting: {
          sample_rate: 32000,
          bitrate: 128000,
          format: "mp3",
          channel: 1,
        },
      }),
    });

    if (!response.ok) {
      throw new Error(
        `MiniMax TTS API error: ${response.status} ${response.statusText}`,
      );
    }

    const result = await response.json();

    // Failures can also be reported inside a 200 response via base_resp.
    if (result.base_resp?.status_code !== 0) {
      throw new Error(
        `MiniMax TTS error: ${result.base_resp?.status_msg ?? "Unknown error"}`,
      );
    }

    // Guard the payload explicitly: without this, a missing `data.audio`
    // surfaces as an opaque TypeError from Buffer.from / property access.
    const hexAudio: unknown = result.data?.audio;
    if (typeof hexAudio !== "string" || hexAudio.length === 0) {
      throw new Error("MiniMax TTS error: response contained no audio data");
    }

    const buf = Buffer.from(hexAudio, "hex");
    // Hex decoding stops silently at the first invalid character (or drops a
    // dangling nibble), so a length mismatch means the payload was malformed.
    if (buf.byteLength * 2 !== hexAudio.length) {
      throw new Error("MiniMax TTS error: audio payload is not valid hex");
    }

    // Copy out exactly this buffer's bytes; the backing ArrayBuffer of a Node
    // Buffer may be larger than the view.
    const audio = buf.buffer.slice(
      buf.byteOffset,
      buf.byteOffset + buf.byteLength,
    ) as ArrayBuffer;
    const audioLength = (result.extra_info?.audio_length ?? 0) / 1000;

    logger.debug(
      { text, voice, audioLength },
      "Audio generated with MiniMax TTS",
    );

    return { audio, audioLength };
  }

  /** Returns every value of `MiniMaxVoiceEnum`. */
  listAvailableVoices(): MiniMaxVoice[] {
    return Object.values(MiniMaxVoiceEnum) as MiniMaxVoice[];
  }

  /** Creates a client that uses the default base URL. */
  static fromApiKey(apiKey: string): MiniMax {
    return new MiniMax(apiKey);
  }
}
Loading