diff --git a/README.md b/README.md index e6291b7..2db447e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ A voice add-on for the [WebThings Gateway](https://github.com/WebThingsIO/gateway) which uses -[DeepSpeech](https://github.com/mozilla/DeepSpeech) as the speech-to-text (STT) +[:frog:STT](https://github.com/coqui-ai/stt) as the speech-to-text (STT) engine. The add-on uses a microphone attached to the gateway host. @@ -69,4 +69,4 @@ The add-on uses a microphone attached to the gateway host. ## Credits This add-on was originally created by André Natal (@andrenatal). DeepSpeech -support was initially added by Alexandre Lissy (@lissyx). +support was initially added by Alexandre Lissy (@lissyx), and upgraded to Coqui (:frog:STT) by Josh Meyer (@JRMeyer). diff --git a/lib/adapter.js b/lib/adapter.js index d62acf4..a0c7216 100644 --- a/lib/adapter.js +++ b/lib/adapter.js @@ -9,7 +9,7 @@ 'use strict'; const {Adapter, Event} = require('gateway-addon'); -const DeepSpeechInterface = require('./deep-speech-interface'); +const CoquiSttInterface = require('./coqui-stt-interface'); const levenshtein = require('js-levenshtein'); const manifest = require('../manifest.json'); const {normalizeDeviceName} = require('./command-utils'); @@ -29,14 +29,14 @@ class VoiceAdapter extends Adapter { addonManager.addAdapter(this); - this._dsInterface = new DeepSpeechInterface(this); + this._sttInterface = new CoquiSttInterface(this); this.startPairing(); }); } startPairing() { if (!this.devices['voice-controller']) { - this.handleDeviceAdded(new VoiceDevice(this, this._dsInterface)); + this.handleDeviceAdded(new VoiceDevice(this, this._sttInterface)); } } @@ -110,12 +110,12 @@ class VoiceAdapter extends Adapter { this._nameMap[deviceId] = normalizeDeviceName(device.title); this._deviceSavedTimeout = setTimeout(() => { - this._dsInterface.generateLocalLM(Object.values(this._nameMap)); + this._sttInterface.generateLocalLM(Object.values(this._nameMap)); }, 1000); } unload() { - 
this._dsInterface.stopMicrophone(); + this._sttInterface.stopMicrophone(); return Promise.resolve(); } } diff --git a/lib/deep-speech-interface.js b/lib/coqui-stt-interface.js similarity index 95% rename from lib/deep-speech-interface.js rename to lib/coqui-stt-interface.js index c7ded60..d50d8be 100644 --- a/lib/deep-speech-interface.js +++ b/lib/coqui-stt-interface.js @@ -1,12 +1,6 @@ 'use strict'; -let Ds; -try { - Ds = require('deepspeech'); -} catch (_) { - Ds = require('deepspeech-tflite'); -} - +const Stt = require('stt'); const {spawnSync} = require('child_process'); const fs = require('fs'); const { @@ -162,8 +156,8 @@ class DeepSpeechInterface { console.debug(`Loading model from ${this._modelsDir}`); } - this._model = new Ds.Model( - path.join(this._assetsDir, `deepspeech-model.tflite`) + this._model = new Stt.Model( + path.join(this._assetsDir, `model.tflite`) ); if (this._debug) { @@ -229,7 +223,7 @@ class DeepSpeechInterface { } resumeMicrophone() { - const dsStream = this._model.createStream(); + const sttStream = this._model.createStream(); const micStream = this._mic.getStream(); let silenceCount = 0; @@ -239,7 +233,7 @@ class DeepSpeechInterface { let runningTranscript = ''; const interimTimer = setInterval(() => { - const transcript = dsStream.intermediateDecode(); + const transcript = sttStream.intermediateDecode(); if (this._debug) { console.debug('interim:', transcript); @@ -254,7 +248,7 @@ class DeepSpeechInterface { }, 1000); const dataHandler = (data) => { - dsStream.feedAudioContent(data); + sttStream.feedAudioContent(data); }; micStream.once('pauseComplete', () => { @@ -270,7 +264,7 @@ class DeepSpeechInterface { micStream.removeListener('data', dataHandler); - const transcript = dsStream.finishStream().trim(); + const transcript = sttStream.finishStream().trim(); if (!transcript) { if (this._debug) { console.debug('Transcript was empty'); diff --git a/lib/device.js b/lib/device.js index 8407516..e6222d4 100644 --- a/lib/device.js +++ 
b/lib/device.js @@ -4,10 +4,10 @@ const {Device} = require('gateway-addon'); const VoiceProperty = require('./property'); class VoiceDevice extends Device { - constructor(adapter, dsInterface) { + constructor(adapter, sttInterface) { super(adapter, 'voice-controller'); - this._dsInterface = dsInterface; + this._sttInterface = sttInterface; this.name = 'Voice Controller'; this.description = 'Voice Controller'; this['@type'] = ['OnOffSwitch']; @@ -53,9 +53,9 @@ class VoiceDevice extends Device { toggle(value) { if (value) { - this._dsInterface.enable(); + this._sttInterface.enable(); } else { - this._dsInterface.disable(); + this._sttInterface.disable(); } } } diff --git a/package.json b/package.json index 0c1457c..905f6ec 100644 --- a/package.json +++ b/package.json @@ -17,8 +17,7 @@ "url": "https://github.com/WebThingsIO/voice-addon/issues" }, "dependencies": { - "deepspeech": "^0.9.0", - "deepspeech-tflite": "^0.9.0", + "stt": "^1.2.0", "js-levenshtein": "^1.1.6", "mic": "^2.1.2", "sound-player": "^1.0.13", @@ -37,7 +36,7 @@ "SHA256SUMS", "assets/LICENSE", "assets/alphabet.txt", - "assets/deepspeech-model.tflite", + "assets/model.tflite", "assets/error.wav", "assets/no-input.wav", "assets/success.wav", @@ -48,7 +47,7 @@ "index.js", "lib/adapter.js", "lib/command-utils.js", - "lib/deep-speech-interface.js", + "lib/coqui-stt-interface.js", "lib/device.js", "lib/matrix-microphone.js", "lib/property.js", diff --git a/package.sh b/package.sh index 7f6963a..2807a24 100755 --- a/package.sh +++ b/package.sh @@ -1,6 +1,7 @@ #!/bin/bash -e -_DS_VERSION="0.9.0" +_STT_VERSION="1.2.0" +_STT_MODEL_VERSION="1.0.0" # Setup environment for building inside Dockerized toolchain export NVM_DIR="${HOME}/.nvm" @@ -41,58 +42,41 @@ rm -rf "${here}/kenlm" pushd "${here}/bin" case "$ADDON_ARCH" in linux-x64) - _SCORER_TARBALL="native_client.amd64.cpu.linux.tar.xz" + _SCORER_TARBALL="native_client.tflite.Linux.tar.xz" ;; linux-arm) - _SCORER_TARBALL="native_client.rpi3.cpu.linux.tar.xz" + 
_SCORER_TARBALL="native_client.tflite.linux.armv7.tar.xz" ;; linux-arm64) - _SCORER_TARBALL="native_client.arm64.cpu.linux.tar.xz" + _SCORER_TARBALL="native_client.tflite.linux.aarch64.tar.xz" ;; darwin-x64) - _SCORER_TARBALL="native_client.amd64.cpu.osx.tar.xz" + _SCORER_TARBALL="native_client.tflite.macOS.tar.xz" ;; esac curl \ - -L "https://github.com/mozilla/DeepSpeech/releases/download/v${_DS_VERSION}/${_SCORER_TARBALL}" | \ + -L "https://github.com/coqui-ai/STT/releases/download/v${_STT_VERSION}/${_SCORER_TARBALL}" | \ tar xJ generate_scorer_package popd # download the DeepSpeech model pushd "${here}/assets" curl \ - -o "deepspeech-model.tflite" \ - -L "https://github.com/mozilla/DeepSpeech/releases/download/v${_DS_VERSION}/deepspeech-${_DS_VERSION}-models.tflite" + -o "model.tflite" \ + -L "https://github.com/coqui-ai/STT-models/releases/download/english%2Fcoqui%2Fv${_STT_MODEL_VERSION}-huge-vocab/model.tflite" popd -# remove one of the DS dependencies, based on architecture -KEEP_DEP="deepspeech" -REMOVE_DEP="deepspeech-tflite" -if [[ -n "$ADDON_ARCH" && $ADDON_ARCH =~ x64 ]]; then - KEEP_DEP="deepspeech-tflite" - REMOVE_DEP="deepspeech" -fi -python -c "import json, os; \ - from collections import OrderedDict; \ - fname = os.path.join(os.getcwd(), 'package.json'); \ - d = json.loads(open(fname).read(), object_pairs_hook=OrderedDict); \ - del d['dependencies']['${REMOVE_DEP}']; \ - f = open(fname, 'wt'); \ - json.dump(d, f, indent=2); \ - f.close() -" - npm install --production -# keep only the compiled DS binary that we need +# keep only the compiled STT binary that we need module_version=$(node -e 'console.log(`node-v${process.config.variables.node_module_version}`)') -find "node_modules/${KEEP_DEP}/lib/binding/v${_DS_VERSION}" \ +find "node_modules/stt/lib/binding/v${_STT_VERSION}" \ -mindepth 1 \ -maxdepth 1 \ \! 
-name "${ADDON_ARCH}" \ -exec rm -rf {} \; -find "node_modules/${KEEP_DEP}/lib/binding/v${_DS_VERSION}/${ADDON_ARCH}" \ +find "node_modules/stt/lib/binding/v${_STT_VERSION}/${ADDON_ARCH}" \ -mindepth 1 \ -maxdepth 1 \ -type d \