Merge pull request #6 from chengsokdara/feat/real-time-transcription

chengsokdara · web-flow · commit 64be111768d5 · 2023-03-11T11:52:52.000+07:00
Add Real-Time transcription support
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-## [0.0.13] - 2023-04-01
+## [0.2.0] - 2023-04-01
 
 ### Added
 
@@ -18,15 +18,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - able to run custom ffmpeg command
 - expose onTranscribing event
 
+## [0.1.0] - 2023-03-11
+
+### Added
+
+- streaming option for real-time transcription
+- timeSlice option to control onDataAvailable event
+- onDataAvailable option for getting recorded blob in interval based on timeSlice
+
+### Changed
+
+- recording in higher audio quality to help Whisper in transcription
+
+### Removed
+
+- customServer option, deprecated since 0.0.11
+
 ## [0.0.12] - 2023-03-09
 
 ### Changed
 
 - autoTranscribe default to true
 - update examples in README.md
 
-### Added
-
 ## [0.0.11] - 2023-03-08
 
 ### Added
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
-# useWhisper()
+# useWhisper
 
-React Hook for OpenAI Whisper API with speech recorder and silence removal built-in
+React Hook for OpenAI Whisper API with speech recorder, real-time transcription and silence removal built-in
 
 ---
 
@@ -22,8 +22,6 @@ _Try OpenAI API price calculator, token counter, and dataset manager (preview)_
 
 - ### Usage
 
-- ###### Provide your own OpenAI API key
-
 ```jsx
 import { useWhisper } from '@chengsokdara/use-whisper'
 
@@ -54,9 +52,7 @@ const App = () => {
 }
 ```
 
-_**NOTE:** by providing apiKey, it could be exposed in the browser devtool network tab_
-
-- ###### Custom REST API (if you want to keep your OpenAI API key secure)
+- ###### Custom Server (keep OpenAI API token secure)
 
 ```jsx
 import { useWhisper } from '@chengsokdara/use-whisper'
@@ -105,16 +101,19 @@ const App = () => {
 
 - ### Examples
 
-- ###### Remove silence before sending to Whisper to save cost
+- ###### Real-time streaming transcription
 
 ```jsx
 import { useWhisper } from '@chengsokdara/use-whisper'
 
 const App = () => {
   const { transcript } = useWhisper({
     apiKey: env.process.OPENAI_API_TOKEN, // YOUR_OPEN_AI_TOKEN
-    // use ffmpeg-wasp to remove silence from recorded speech
-    removeSilence: true,
+    streaming: true,
+    timeSlice: 1_000, // 1 second
+    whisperConfig: {
+      language: 'en',
+    },
   })
 
   return (
@@ -125,16 +124,16 @@ const App = () => {
 }
 ```
 
-- ###### Auto start recording on component mounted
+- ###### Remove silence before sending to Whisper to save cost
 
 ```jsx
 import { useWhisper } from '@chengsokdara/use-whisper'
 
 const App = () => {
   const { transcript } = useWhisper({
     apiKey: env.process.OPENAI_API_TOKEN, // YOUR_OPEN_AI_TOKEN
-    // will auto start recording speech upon component mounted
-    autoStart: true,
+    // use ffmpeg-wasm to remove silence from recorded speech
+    removeSilence: true,
   })
 
   return (
@@ -145,16 +144,16 @@ const App = () => {
 }
 ```
 
-- ###### Keep recording as long as the user is speaking
+- ###### Auto start recording on component mounted
 
 ```jsx
 import { useWhisper } from '@chengsokdara/use-whisper'
 
 const App = () => {
   const { transcript } = useWhisper({
     apiKey: env.process.OPENAI_API_TOKEN, // YOUR_OPEN_AI_TOKEN
-    nonStop: true, // keep recording as long as the user is speaking
-    stopTimeout: 5000, // auto stop after 5 seconds
+    // will auto start recording speech upon component mounted
+    autoStart: true,
   })
 
   return (
@@ -165,15 +164,16 @@ const App = () => {
 }
 ```
 
-- ###### Auto transcribe speech when recorder stopped
+- ###### Keep recording as long as the user is speaking
 
 ```jsx
 import { useWhisper } from '@chengsokdara/use-whisper'
 
 const App = () => {
   const { transcript } = useWhisper({
     apiKey: env.process.OPENAI_API_TOKEN, // YOUR_OPEN_AI_TOKEN
-    autoTranscribe: true, // will try to automatically transcribe speech
+    nonStop: true, // keep recording as long as the user is speaking
+    stopTimeout: 5000, // auto stop after 5 seconds
   })
 
   return (
@@ -192,6 +192,7 @@ import { useWhisper } from '@chengsokdara/use-whisper'
 const App = () => {
   const { transcript } = useWhisper({
     apiKey: env.process.OPENAI_API_TOKEN, // YOUR_OPEN_AI_TOKEN
+    autoTranscribe: true,
     whisperConfig: {
       prompt: 'previous conversation', // you can pass previous conversation for context
       response_format: 'text', // output text instead of json
@@ -222,16 +223,19 @@ _most of these dependecies are lazy loaded, so it is only imported when it is ne
 
 - ###### Config Object
 
-| Name           | Type                                               | Default Value | Description                                                                                                          |
-| -------------- | -------------------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------- |
-| apiKey         | string                                             | ''            | your OpenAI API token                                                                                                |
-| autoStart      | boolean                                            | false         | auto start speech recording on component mount                                                                       |
-| autoTranscribe | boolean                                            | true          | should auto transcribe after stop recording                                                                          |
-| nonStop        | boolean                                            | false         | if true, record will auto stop after stopTimeout. However if user keep on speaking, the recorder will keep recording |
-| removeSilence  | boolean                                            | false         | remove silence before sending file to OpenAI API                                                                     |
-| stopTimeout    | number                                             | 5,000 ms      | if nonStop is true, this become required. This control when the recorder auto stop                                   |
-| whisperConfig  | [WhisperApiConfig](#whisperapiconfig)              | undefined     | Whisper API transcription config                                                                                     |
-| onTranscribe   | (blob: Blob) => Promise<[Transcript](#transcript)> | undefined     | callback function to handle transcription on your own custom server                                                  |
+| Name            | Type                                               | Default Value | Description                                                                                                          |
+| --------------- | -------------------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------- |
+| apiKey          | string                                             | ''            | your OpenAI API token                                                                                                |
+| autoStart       | boolean                                            | false         | auto start speech recording on component mount                                                                       |
+| autoTranscribe  | boolean                                            | true          | should auto transcribe after stop recording                                                                          |
+| nonStop         | boolean                                            | false         | if true, record will auto stop after stopTimeout. However if user keep on speaking, the recorder will keep recording |
+| removeSilence   | boolean                                            | false         | remove silence before sending file to OpenAI API                                                                     |
+| stopTimeout     | number                                             | 5,000 ms      | if nonStop is true, this become required. This control when the recorder auto stop                                   |
+| streaming       | boolean                                            | false         | transcribe speech in real-time based on timeSlice                                                                    |
+| timeSlice       | number                                             | 2000 ms       | interval between each onDataAvailable event                                                                          |
+| whisperConfig   | [WhisperApiConfig](#whisperapiconfig)              | undefined     | Whisper API transcription config                                                                                     |
+| onDataAvailable | (blob: Blob) => void                               | undefined     | callback function for getting recorded blob in interval between timeSlice                                            |
+| onTranscribe    | (blob: Blob) => Promise<[Transcript](#transcript)> | undefined     | callback function to handle transcription on your own custom server                                                  |
 
 - ###### WhisperApiConfig
 
diff --git a/package.json b/package.json
@@ -32,7 +32,7 @@
     "prepublishOnly": "yarn run build-prod"
   },
   "dependencies": {
-    "@chengsokdara/react-hooks-async": "^0.0.1",
+    "@chengsokdara/react-hooks-async": "^0.0.2",
     "@ffmpeg/ffmpeg": "^0.11.6",
     "axios": "^1.3.4",
     "hark": "^1.2.3",
diff --git a/src/types.ts b/src/types.ts
@@ -2,12 +2,13 @@ export type UseWhisperConfig = {
   apiKey?: string
   autoStart?: boolean
   autoTranscribe?: boolean
-  /** @deprecated: use {@link UseWhisperConfig.onTranscribe} instead  */
-  customServer?: string
   nonStop?: boolean
   removeSilence?: boolean
   stopTimeout?: number
+  streaming?: boolean
+  timeSlice?: number
   whisperConfig?: WhisperApiConfig
+  onDataAvailable?: (blob: Blob) => void
   onTranscribe?: (blob: Blob) => Promise<UseWhisperTranscript>
 }
 
@@ -32,12 +33,6 @@ export type UseWhisperReturn = {
 
 export type UseWhisperHook = (config?: UseWhisperConfig) => UseWhisperReturn
 
-/** @deprecated along with {@link UseWhisperConfig.customServer} */
-export type CustomServerRequestBody = {
-  file: string | ArrayBuffer | null
-  model: 'whisper-1' | string
-}
-
 export type WhisperApiConfig = {
   model?: 'whisper-1' | string
   prompt?: string
diff --git a/src/useWhisper.ts b/src/useWhisper.ts
diff --git a/yarn.lock b/yarn.lock