examples : update vad support in stream example #3160

Draft · wants to merge 4 commits into base: master
92 changes: 41 additions & 51 deletions examples/stream/README.md
@@ -1,51 +1,41 @@
# whisper.cpp/examples/stream

This is a naive example of performing real-time inference on audio from your microphone.
The `whisper-stream` tool samples the audio every half second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

```bash
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```

https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4

## Sliding window mode with VAD

Setting the `--step` argument to `0` enables the sliding window mode:

```bash
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
```

In this mode, the tool will transcribe only after some speech activity is detected. A very
basic VAD detector is used, but in theory a more sophisticated approach can be added. The
`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
a transcription block that is suitable for parsing.

## Building

The `whisper-stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
# Install SDL2
# On Debian-based Linux distributions:
sudo apt-get install libsdl2-dev

# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel

# On macOS:
brew install sdl2

cmake -B build -DWHISPER_SDL2=ON
cmake --build build --config Release

./build/bin/whisper-stream
```

## Web version

This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
# whisper.cpp/examples/stream

This is a naive example of performing real-time inference on audio from your microphone.
The `whisper-stream` tool samples the audio every half second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

```bash
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```

https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4

## VAD support

VAD support can be enabled by specifying the `--vad` argument and optionally a `--vad-model` path (by default
`models/for-tests-silero-v5.1.2-ggml.bin` will be used).
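
For example, a minimal invocation might look like this (a sketch: the model path is just the default above, and the VAD tuning flags listed in `--help`, such as `--vad-threshold`, are optional):

```bash
./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000 \
    --vad --vad-model ./models/for-tests-silero-v5.1.2-ggml.bin --vad-threshold 0.5
```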

## Building

The `whisper-stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
# Install SDL2
# On Debian-based Linux distributions:
sudo apt-get install libsdl2-dev

# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel

# On macOS:
brew install sdl2

cmake -B build -DWHISPER_SDL2=ON
cmake --build build --config Release

./build/bin/whisper-stream
```

## Web version

This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
185 changes: 95 additions & 90 deletions examples/stream/stream.cpp
@@ -25,9 +25,6 @@ struct whisper_params {
int32_t audio_ctx = 0;
int32_t beam_size = -1;

float vad_thold = 0.6f;
float freq_thold = 100.0f;

bool translate = false;
bool no_fallback = false;
bool print_special = false;
@@ -37,10 +34,21 @@ struct whisper_params {
bool save_audio = false; // save audio to wav file
bool use_gpu = true;
bool flash_attn = false;
bool no_prints = false;

std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;

// Voice Activity Detection (VAD) parameters
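// note: threshold is a speech probability in [0.0, 1.0]; *_ms values are milliseconds, *_s and samples_overlap are seconds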
bool vad = false;
std::string vad_model = "models/for-tests-silero-v5.1.2-ggml.bin";
float vad_threshold = 0.5f;
int vad_min_speech_duration_ms = 250;
int vad_min_silence_duration_ms = 100;
float vad_max_speech_duration_s = FLT_MAX;
int vad_speech_pad_ms = 30;
float vad_samples_overlap = 0.1f;
};

void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +69,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +80,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }

else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
// Voice Activity Detection (VAD)
else if ( arg == "--vad") { params.vad = true; }
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@@ -99,8 +114,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@@ -112,6 +125,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
// Voice Activity Detection (VAD) parameters
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
std::string("FLT_MAX").c_str() :
std::to_string(params.vad_max_speech_duration_s).c_str());
fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
fprintf(stderr, "\n");
}

@@ -122,20 +148,28 @@ int main(int argc, char ** argv) {
return 1;
}

params.keep_ms = std::min(params.keep_ms, params.step_ms);
if (params.no_prints) {
// suppress whisper/ggml log output so only the transcription results are printed
whisper_log_set([](enum ggml_log_level, const char*, void*) { }, NULL);
}

if (params.vad) {
// For VAD, ensure at least 500 ms of audio is kept as context
params.keep_ms = std::max(params.keep_ms, 500);
} else {
params.keep_ms = std::min(params.keep_ms, params.step_ms);
}

params.length_ms = std::max(params.length_ms, params.step_ms);


const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;

const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD

const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line

params.no_timestamps = !use_vad;
params.no_context |= use_vad;
const int n_new_line = std::max(1, params.length_ms / params.step_ms - 1); // number of steps to print new line
//params.no_timestamps = !params.vad;
//params.no_context |= params.vad;
params.max_tokens = 0;

// init audio
@@ -189,12 +223,7 @@ int main(int argc, char ** argv) {
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);

if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
} else {
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
}

fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
fprintf(stderr, "\n");
}

@@ -242,67 +271,44 @@ int main(int argc, char ** argv) {

// process new audio

if (!use_vad) {
while (true) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
audio.get(params.step_ms, pcmf32_new);

if ((int) pcmf32_new.size() > 2*n_samples_step) {
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
audio.clear();
continue;
}

if ((int) pcmf32_new.size() >= n_samples_step) {
audio.clear();
break;
}

std::this_thread::sleep_for(std::chrono::milliseconds(1));
while (true) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
audio.get(params.step_ms, pcmf32_new);

const int n_samples_new = pcmf32_new.size();

// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));

//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());

pcmf32.resize(n_samples_new + n_samples_take);
if ((int) pcmf32_new.size() > 2*n_samples_step) {
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
audio.clear();
continue;
}

for (int i = 0; i < n_samples_take; i++) {
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
if ((int) pcmf32_new.size() >= n_samples_step) {
audio.clear();
break;
}

memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}

pcmf32_old = pcmf32;
} else {
const auto t_now = std::chrono::high_resolution_clock::now();
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
const int n_samples_new = pcmf32_new.size();

if (t_diff < 2000) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));

continue;
}
//fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());

audio.get(2000, pcmf32_new);
pcmf32.resize(n_samples_new + n_samples_take);

if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
audio.get(params.length_ms, pcmf32);
} else {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
for (int i = 0; i < n_samples_take; i++) {
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
}

continue;
}
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));

t_last = t_now;
}
pcmf32_old = pcmf32;

// run the inference
{
Expand All @@ -313,7 +319,6 @@ int main(int argc, char ** argv) {
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.single_segment = !use_vad;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
@@ -330,30 +335,34 @@ int main(int argc, char ** argv) {
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();

wparams.vad = params.vad;
wparams.vad_model_path = params.vad_model.c_str();

wparams.vad_params.threshold = params.vad_threshold;
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
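// the settings above are forwarded to whisper_full, which applies the VAD model to the captured audio before decoding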

if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 6;
}

// print result
{
if (!use_vad) {
printf("\33[2K\r");

// print long empty line to clear the previous line
printf("%s", std::string(100, ' ').c_str());
const int n_segments = whisper_full_n_segments(ctx);
if (n_segments == 0) {
continue;
}
printf("\33[2K\r");

printf("\33[2K\r");
} else {
const int64_t t1 = (t_last - t_start).count()/1000000;
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
// print long empty line to clear the previous line
printf("%s", std::string(100, ' ').c_str());

printf("\n");
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
printf("\n");
}
printf("\33[2K\r");

const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);

@@ -389,15 +398,11 @@ int main(int argc, char ** argv) {
fout << std::endl;
}

if (use_vad) {
printf("\n");
printf("### Transcription %d END\n", n_iter);
}
}

++n_iter;

if (!use_vad && (n_iter % n_new_line) == 0) {
if ((n_iter % n_new_line) == 0) {
printf("\n");

// keep part of the audio for next iteration to try to mitigate word boundary issues