Skip to content

Commit

Permalink
Whisper integration for speaker generation
Browse files Browse the repository at this point in the history
Added Whisper-based transcription for speaker creation when `transcript` is None (#28).
  • Loading branch information
edwko committed Nov 30, 2024
1 parent ffd8179 commit c5a75f4
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 2 deletions.
21 changes: 20 additions & 1 deletion outetts/version/v1/interface.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from ...wav_tokenizer.audio_codec import AudioCodec
from .prompt_processor import PromptProcessor
from .model import HFModel, GGUFModel, EXL2Model, GenerationConfig
from ...whisper import transcribe
import torch
from .alignment import CTCForcedAlignment
import torchaudio
Expand Down Expand Up @@ -138,7 +139,25 @@ def get_audio(self, tokens):
torch.tensor([[output]], dtype=torch.int64).to(self.audio_codec.device)
)

def create_speaker(self, audio_path: str, transcript: str):
def create_speaker(
self,
audio_path: str,
transcript: str = None,
whisper_model: str = "turbo",
whisper_device = None
):

if transcript is None:
logger.info("Transcription not provided, transcribing audio with whisper.")
transcript = transcribe.transcribe_once(
audio_path=audio_path,
model=whisper_model,
device=whisper_device
)

if not transcript:
raise ValueError("Transcript text is empty")

ctc = CTCForcedAlignment(self.languages, self._device)
words = ctc.align(audio_path, transcript, self.language)
ctc.free()
Expand Down
Empty file added outetts/whisper/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions outetts/whisper/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import whisper
from loguru import logger

def transcribe_once(audio_path: str, model: str = "turbo", device=None):
    """Transcribe an audio file with a freshly loaded Whisper model.

    The model is loaded on every call, so this is intended for one-off
    transcriptions rather than repeated use.

    Args:
        audio_path: Path of the audio file passed to Whisper's ``transcribe``.
        model: Name of the Whisper model to load (defaults to ``"turbo"``).
        device: Device argument forwarded unchanged to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    logger.info(f"Loading model {model}")
    # Load the model the caller requested. The name used to be hard-coded to
    # "turbo", which silently ignored the ``model`` argument. A separate local
    # name also avoids rebinding the ``model`` parameter to the loaded object.
    whisper_model = whisper.load_model(model, device=device)
    logger.info(f"Transcribing {audio_path}")
    text = whisper_model.transcribe(audio_path)["text"]
    logger.success(f"Transcription: {text}")
    return text
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ requests
sounddevice
mecab-python3
unidic-lite
uroman
uroman
openai-whisper>=20240930

0 comments on commit c5a75f4

Please sign in to comment.