
Commit 07616a5

add mode option between transcriptions or translations
1 parent ea93123 commit 07616a5

File tree

5 files changed, +35 -19 lines changed


CHANGELOG.md

+11
@@ -18,6 +18,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - able to run custom ffmpeg command
 - expose onTranscribing event
 
+## [0.1.1] - 2023-03-14
+
+### Added
+
+- add mode option for Whisper API, choose either transcriptions or translations
+  (currently only support translation to English)
+
+### Changed
+
+- default timeSlice from 2000ms to 1000ms
+
 ## [0.1.0] - 2023-03-11
 
 ### Added

README.md

+14 -13
@@ -232,19 +232,20 @@ _most of these dependecies are lazy loaded, so it is only imported when it is ne
 
 - ###### Config Object
 
-| Name | Type | Default Value | Description |
-| --------------- | -------------------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------- |
-| apiKey | string | '' | your OpenAI API token |
-| autoStart | boolean | false | auto start speech recording on component mount |
-| autoTranscribe | boolean | true | should auto transcribe after stop recording |
-| nonStop | boolean | false | if true, record will auto stop after stopTimeout. However if user keep on speaking, the recorder will keep recording |
-| removeSilence | boolean | false | remove silence before sending file to OpenAI API |
-| stopTimeout | number | 5,000 ms | if nonStop is true, this become required. This control when the recorder auto stop |
-| streaming | boolean | false | transcribe speech in real-time based on timeSlice |
-| timeSlice | number | 2000 ms | interval between each onDataAvailable event |
-| whisperConfig | [WhisperApiConfig](#whisperapiconfig) | undefined | Whisper API transcription config |
-| onDataAvailable | (blob: Blob) => void | undefined | callback function for getting recorded blob in interval between timeSlice |
-| onTranscribe | (blob: Blob) => Promise<[Transcript](#transcript)> | undefined | callback function to handle transcription on your own custom server |
+| Name | Type | Default Value | Description |
+| --------------- | -------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------- |
+| apiKey | string | '' | your OpenAI API token |
+| autoStart | boolean | false | auto start speech recording on component mount |
+| autoTranscribe | boolean | true | should auto transcribe after stop recording |
+| mode | string | transcriptions | control Whisper mode either transcriptions or translations, currently only support translation to English |
+| nonStop | boolean | false | if true, record will auto stop after stopTimeout. However if user keep on speaking, the recorder will keep recording |
+| removeSilence | boolean | false | remove silence before sending file to OpenAI API |
+| stopTimeout | number | 5,000 ms | if nonStop is true, this become required. This control when the recorder auto stop |
+| streaming | boolean | false | transcribe speech in real-time based on timeSlice |
+| timeSlice | number | 1000 ms | interval between each onDataAvailable event |
+| whisperConfig | [WhisperApiConfig](#whisperapiconfig) | undefined | Whisper API transcription config |
+| onDataAvailable | (blob: Blob) => void | undefined | callback function for getting recorded blob in interval between timeSlice |
+| onTranscribe | (blob: Blob) => Promise<[Transcript](#transcript)> | undefined | callback function to handle transcription on your own custom server |
 
 - ###### WhisperApiConfig
 
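To make the table above concrete, here is a minimal sketch of a config object exercising the options this commit adds or changes. Passing it to the hook, and the hook's import, are assumed from the rest of the README rather than shown in this diff; the API key is a placeholder.

```ts
// Sketch: a config covering the options added or changed in this commit.
// Every field is optional (see UseWhisperConfig in src/types.ts).
const config = {
  apiKey: '<OPENAI_API_KEY>', // placeholder, not a real token
  mode: 'translations',       // default is 'transcriptions'; 'translations' returns English text
  streaming: true,            // transcribe in real time based on timeSlice
  timeSlice: 1_000,           // the new default: one onDataAvailable chunk per second
  nonStop: true,              // keep recording while the user keeps speaking
  stopTimeout: 5_000,         // required when nonStop is true; auto-stop after 5 s of silence
}
```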

src/configs.ts

+1 -2
@@ -6,5 +6,4 @@ export const ffmpegCoreUrl =
 export const silenceRemoveCommand =
   'silenceremove=start_periods=1:stop_periods=-1:start_threshold=-30dB:stop_threshold=-30dB:start_silence=2:stop_silence=2'
 
-export const whisperApiEndpoint =
-  'https://api.openai.com/v1/audio/transcriptions'
+export const whisperApiEndpoint = 'https://api.openai.com/v1/audio/'
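Since the endpoint no longer names a specific operation, the full URL is now built by appending the mode at request time (see the useWhisper.ts hunk further down). A tiny sketch of the composition, using the values from this file:

```ts
// Sketch: how the trimmed base endpoint and a mode value combine into the request URL.
const whisperApiEndpoint = 'https://api.openai.com/v1/audio/'
const mode: 'transcriptions' | 'translations' = 'translations'

const url = whisperApiEndpoint + mode
console.log(url) // https://api.openai.com/v1/audio/translations
```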

src/types.ts

+1
@@ -2,6 +2,7 @@ export type UseWhisperConfig = {
   apiKey?: string
   autoStart?: boolean
   autoTranscribe?: boolean
+  mode?: 'transcriptions' | 'translations'
   nonStop?: boolean
   removeSilence?: boolean
   stopTimeout?: number
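Because the new field is a two-member string union rather than a plain string, an unsupported value is rejected at compile time. A short sketch; the relative import path refers to this repo's internal module and the 'subtitles' value is purely hypothetical:

```ts
import type { UseWhisperConfig } from './types'

// Sketch: the union type narrows mode to the two supported values.
const ok: UseWhisperConfig = { mode: 'translations' }

// @ts-expect-error 'subtitles' is not assignable to 'transcriptions' | 'translations'
const bad: UseWhisperConfig = { mode: 'subtitles' }
```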

src/useWhisper.ts

+8 -4
@@ -13,7 +13,7 @@ import {
   silenceRemoveCommand,
   whisperApiEndpoint,
 } from './configs'
-import type {
+import {
   UseWhisperConfig,
   UseWhisperHook,
   UseWhisperTimeout,

@@ -27,11 +27,12 @@ const defaultConfig: UseWhisperConfig = {
   apiKey: '',
   autoStart: false,
   autoTranscribe: true,
+  mode: 'transcriptions',
   nonStop: false,
   removeSilence: false,
   stopTimeout: defaultStopTimeout,
   streaming: false,
-  timeSlice: 2_000,
+  timeSlice: 1_000,
   onDataAvailable: undefined,
   onTranscribe: undefined,
 }

@@ -59,6 +60,7 @@ export const useWhisper: UseWhisperHook = (config) => {
     apiKey,
     autoStart,
     autoTranscribe,
+    mode,
     nonStop,
     removeSilence,
     stopTimeout,

@@ -462,7 +464,9 @@ export const useWhisper: UseWhisperHook = (config) => {
       const body = new FormData()
       body.append('file', file)
       body.append('model', 'whisper-1')
-      body.append('language', whisperConfig?.language ?? 'en')
+      if (mode === 'transcriptions') {
+        body.append('language', whisperConfig?.language ?? 'en')
+      }
       if (whisperConfig?.prompt) {
         body.append('prompt', whisperConfig.prompt)
       }

@@ -478,7 +482,7 @@ export const useWhisper: UseWhisperHook = (config) => {
         headers['Authorization'] = `Bearer ${apiKey}`
       }
       const { default: axios } = await import('axios')
-      const response = await axios.post(whisperApiEndpoint, body, {
+      const response = await axios.post(whisperApiEndpoint + mode, body, {
         headers,
       })
       return response.data.text
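End to end, the hook now posts to /v1/audio/transcriptions or /v1/audio/translations depending on mode, and only sends the language field for transcriptions. A hedged usage sketch follows; the package import path and the fields returned by the hook are assumptions taken from the project README, not part of this diff:

```tsx
import { useWhisper } from '@chengsokdara/use-whisper' // assumed package name

const App = () => {
  // With mode: 'translations' the recorded speech is translated to English,
  // so no language parameter is appended to the request body.
  const { transcript, startRecording, stopRecording } = useWhisper({
    apiKey: '<OPENAI_API_KEY>', // placeholder
    mode: 'translations',
  })

  return (
    <div>
      <button onClick={() => startRecording()}>Start</button>
      <button onClick={() => stopRecording()}>Stop</button>
      <p>{transcript.text}</p>
    </div>
  )
}

export default App
```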
