updated web agent

dusty-nv · dusty-nv · commit 8fe2c66af0e0 · 2024-03-13T14:11:08.000-04:00
diff --git a/packages/llm/local_llm/agents/voice_chat.py b/packages/llm/local_llm/agents/voice_chat.py
@@ -4,7 +4,7 @@
 
 from local_llm.plugins import (
     UserPrompt, ChatQuery, PrintStream, 
-    RivaASR, RivaTTS, RateLimit,
+    AutoASR, AutoTTS, RateLimit, ProcessProxy, 
     AudioOutputDevice, AudioOutputFile
 )
 
@@ -16,66 +16,68 @@ class VoiceChat(Agent):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        # ASR
-        self.asr = RivaASR(**kwargs)
-    
-        self.asr.add(PrintStream(partial=False, prefix='## ', color='blue'), RivaASR.OutputFinal)
-        self.asr.add(PrintStream(partial=False, prefix='>> ', color='magenta'), RivaASR.OutputPartial)
-        
-        self.asr.add(self.asr_partial, RivaASR.OutputPartial, threaded=False) # pause output when user is speaking
-        self.asr.add(self.asr_final, RivaASR.OutputFinal, threaded=False)     # clear queues on final ASR transcript
-        
-        self.asr_history = None  # store the partial ASR transcript
-        
         # LLM
-        self.llm = ChatQuery(**kwargs)
+        self.llm = ChatQuery(**kwargs) #ProcessProxy('ChatQuery', **kwargs) # 
+        self.llm.add(PrintStream(color='green'))
+        
+        # ASR
+        self.asr = AutoASR.from_pretrained(**kwargs)
     
-        self.llm.add(PrintStream(color='green', relay=True).add(self.on_eos))
-        self.asr.add(self.llm, RivaASR.OutputFinal)  # runs after asr_final() and any interruptions occur
+        if self.asr:
+            self.asr.add(PrintStream(partial=False, prefix='## ', color='blue'), AutoASR.OutputFinal)
+            self.asr.add(PrintStream(partial=False, prefix='>> ', color='magenta'), AutoASR.OutputPartial)
+            
+            self.asr.add(self.asr_partial, AutoASR.OutputPartial) # pause output when user is speaking
+            self.asr.add(self.asr_final, AutoASR.OutputFinal)     # clear queues on final ASR transcript
+            self.asr.add(self.llm, AutoASR.OutputFinal)  # runs after asr_final() and any interruptions occur
+            
+            self.asr_history = None  # store the partial ASR transcript
 
         # TTS
-        self.tts = RivaTTS(**kwargs)
-        self.tts_output = RateLimit(kwargs['sample_rate_hz'], chunk=9600) # slow down TTS to realtime and be able to pause it
-        
-        self.tts.add(self.tts_output)
-        self.llm.add(self.tts, ChatQuery.OutputWords)
+        self.tts = AutoTTS.from_pretrained(**kwargs)
         
-        # Audio Output
-        self.audio_output_device = kwargs.get('audio_output_device')
-        self.audio_output_file = kwargs.get('audio_output_file')
-        
-        if self.audio_output_device is not None:
-            self.audio_output_device = AudioOutputDevice(**kwargs)
-            self.tts_output.add(self.audio_output_device)
-        
-        if self.audio_output_file is not None:
-            self.audio_output_file = AudioOutputFile(**kwargs)
-            self.tts_output.add(self.audio_output_file)
+        if self.tts:
+            self.tts_output = RateLimit(kwargs['sample_rate_hz'], chunk=9600) # slow down TTS to realtime and be able to pause it
+            self.tts.add(self.tts_output)
+            self.llm.add(self.tts, ChatQuery.OutputWords)
+
+            self.audio_output_device = kwargs.get('audio_output_device')
+            self.audio_output_file = kwargs.get('audio_output_file')
+            
+            if self.audio_output_device is not None:
+                self.audio_output_device = AudioOutputDevice(**kwargs)
+                self.tts_output.add(self.audio_output_device)
+            
+            if self.audio_output_file is not None:
+                self.audio_output_file = AudioOutputFile(**kwargs)
+                self.tts_output.add(self.audio_output_file)
         
         # text prompts from web UI or CLI
         self.prompt = UserPrompt(interactive=True, **kwargs)
         self.prompt.add(self.llm)
         
-        self.pipeline = [self.prompt, self.asr]
+        # setup pipeline entry nodes: the text prompt, plus ASR when it is enabled
+        self.pipeline = [self.prompt]
+
+        if self.asr:
+            self.pipeline.append(self.asr)
             
     def asr_partial(self, text):
         self.asr_history = text
         if len(text.split(' ')) < 2:
             return
-        self.tts_output.pause(1.0)
+        if self.tts:
+            self.tts_output.pause(1.0)
 
     def asr_final(self, text):
         self.asr_history = None
+        self.on_interrupt()
         
-        self.llm.interrupt()
-        self.tts.interrupt()
-        
-        self.tts_output.interrupt(block=False) # might be paused/asleep
-
-    def on_eos(self, text):
-        if text.endswith('</s>'):
-            print_table(self.llm.model.stats)
-
+    def on_interrupt(self):  # interrupt the LLM and, when a TTS plugin is configured, the TTS and its RateLimit output
+        self.llm.interrupt(recursive=False)
+        if self.tts:
+            self.tts.interrupt(recursive=False)
+            self.tts_output.interrupt(block=False, recursive=False) # might be paused/asleep
  
 if __name__ == "__main__":
     parser = ArgParser(extras=ArgParser.Defaults+['asr', 'tts', 'audio_output'])
diff --git a/packages/llm/local_llm/agents/web_chat.py b/packages/llm/local_llm/agents/web_chat.py
@@ -25,11 +25,14 @@ def __init__(self, **kwargs):
         """
         super().__init__(**kwargs)
 
-        self.asr.add(self.on_asr_partial, RivaASR.OutputPartial)
-        #self.asr.add(self.on_asr_final, RivaASR.OutputFinal)
+        if self.asr:
+            self.asr.add(self.on_asr_partial, RivaASR.OutputPartial)
+            #self.asr.add(self.on_asr_final, RivaASR.OutputFinal)
         
         self.llm.add(self.on_llm_reply)
-        self.tts_output.add(self.on_tts_samples)
+        
+        if self.tts:
+            self.tts_output.add(self.on_tts_samples)
         
         self.server = WebServer(msg_callback=self.on_message, **kwargs)
         
@@ -40,13 +43,19 @@ def on_message(self, msg, msg_type=0, metadata='', **kwargs):
                 self.send_chat_history()
             if 'client_state' in msg:
                 if msg['client_state'] == 'connected':
+                    if self.tts:
+                        self.server.send_message({'tts_voices': self.tts.voices, 'tts_voice': self.tts.voice, 'tts_rate': self.tts.rate})
                     threading.Timer(1.0, lambda: self.send_chat_history()).start()
-            if 'tts_voice' in msg:
+            if 'tts_voice' in msg and self.tts:
                 self.tts.voice = msg['tts_voice']
+            if 'tts_rate' in msg and self.tts:
+                self.tts.rate = float(msg['tts_rate'])
         elif msg_type == WebServer.MESSAGE_TEXT:  # chat input
+            self.on_interrupt()
             self.prompt(msg.strip('"'))
         elif msg_type == WebServer.MESSAGE_AUDIO:  # web audio (mic)
-            self.asr(msg)
+            if self.asr:
+                self.asr(msg)
         elif msg_type == WebServer.MESSAGE_IMAGE:
             logging.info(f"recieved {metadata} image message {msg.size} -> {msg.filename}")
             self.llm.chat_history.reset()
@@ -79,7 +88,7 @@ def send_chat_history(self, history=None):
             
         history = history.to_list()
         
-        if self.asr_history:
+        if self.asr and self.asr_history:
             history.append({'role': 'user', 'text': self.asr_history})
             
         def web_text(text):
diff --git a/packages/llm/local_llm/local_llm.py b/packages/llm/local_llm/local_llm.py
@@ -51,7 +51,7 @@ def from_pretrained(model, api=None, **kwargs):
             model_name = os.path.basename(model)
             
         if not api:
-            api = default_model_api(model_path, quant)
+            api = default_model_api(model_path, kwargs.get('quant'))
         
         kwargs['name'] = model_name
         kwargs['api'] = api
diff --git a/packages/llm/local_llm/plugins/audio/auto_asr.py b/packages/llm/local_llm/plugins/audio/auto_asr.py
@@ -39,4 +39,14 @@ def from_pretrained(asr=None, **kwargs):
             return RivaASR(**kwargs)
         else:
             raise ValueError(f"ASR model type should be 'riva'")
-    
+    
+    def add_punctuation(self, text):
+        """
+        Make sure that the transcript ends in some kind of punctuation.
+        Empty or whitespace-only transcripts are returned unchanged.
+        """
+        x = text.strip()
+        # endswith() takes a tuple and is safe on '' (x[-1] would raise IndexError)
+        if x and not x.endswith(('.', ',', '?', '!', ':')):
+            return text + '.'
+        return text
diff --git a/packages/llm/local_llm/plugins/audio/auto_tts.py b/packages/llm/local_llm/plugins/audio/auto_tts.py
@@ -40,9 +40,9 @@ def from_pretrained(tts=None, **kwargs):
             return None
             
         if FastPitchTTS.is_model(tts):
-            return FastPitchTTS(model=tts, **kwargs)
+            return FastPitchTTS(**{**kwargs, 'model': tts})
         elif XTTS.is_model(tts):
-            return XTTS(model=tts, **kwargs)
+            return XTTS(**{**kwargs, 'model': tts})
         elif tts.lower() == 'riva':
             return RivaTTS(**kwargs)
         else:
@@ -92,7 +92,6 @@ def buffer_text(self, text):
         # see if input is needed to prevent a gap-out
         if 'time' in self.buffering:    
             timeout = self.needs_text_by - time.perf_counter() - 0.05  # TODO make this RTFX factor adjustable
-            
             if timeout > 0:
                 return None   # we can keep accumulating text
                 
@@ -111,7 +110,7 @@ def buffer_text(self, text):
                 return None
                
             # for commas, make sure there are at least a handful of proceeding words
-            if self.text_buffer[punc_pos] == ',' and len(self.text_buffer[:punc_pos].split(' ')) < 4:
+            if len(self.text_buffer[:punc_pos].split(' ')) < 4: #and self.text_buffer[punc_pos] == ',':
                 return None
                 
             # make sure that the character following the punctuation isn't alphanumeric
@@ -162,10 +161,14 @@ def filter_text(self, text, numbers_to_words=False):
             return None
             
         # text = text.strip()
-        text = text.replace('</s>', '')
+        for stop_token in StopTokens:
+            text = text.replace(stop_token, '')
+            
+        #text = text.replace('</s>', '')
         text = text.replace('\n', ' ')
-        #text = text.replace('  ', ' ')
-        
+        text = text.replace('...', ' ')        
+        text = self.filter_chars(text)
+
         if numbers_to_words:
             text = self.numbers_to_words(text)
             
@@ -174,6 +177,26 @@ def filter_text(self, text, numbers_to_words=False):
             
         return text
     
+    def filter_chars(self, text):
+        """
+        Replace every character that is neither alphanumeric nor in the
+        allowed punctuation set with a single space.
+
+        The string length is preserved (one space per replaced character),
+        so runs of filtered characters become runs of spaces.
+
+        Parameters:
+          text (str) -- the text to filter
+
+        Returns:
+          str -- the filtered text (equal to the input if nothing was replaced)
+        """
+        # characters besides alphanumerics that are kept as-is
+        allowed = '.,?!:;-\'" /'
+
+        # single pass over the text instead of rescanning after each replacement
+        return ''.join(char if char.isalnum() or char in allowed else ' ' for char in text)
+                
     def numbers_to_words(self, text):
         """
         Convert instances of numbers to words in the text.
diff --git a/packages/llm/local_llm/plugins/audio/riva_asr.py b/packages/llm/local_llm/plugins/audio/riva_asr.py
@@ -53,7 +53,7 @@ def __init__(self, riva_server='localhost:50051',
         self.sample_rate = sample_rate_hz
         self.confidence_threshold = asr_confidence
         self.silence_threshold = asr_silence
-        self.keep_alive_timeout = 99  # requests timeout after 1000 seconds
+        self.keep_alive_timeout = 5  # NOTE(review): comment previously read "requests timeout after 1000 seconds", which does not match this value - confirm units
         
         self.asr_service = riva.client.ASRService(self.auth)
         
@@ -104,7 +104,7 @@ def generate(self, audio_generator):
                         score = result.alternatives[0].confidence
                         if score >= self.confidence_threshold:
                             logging.debug(f"submitting ASR transcript (confidence={score:.3f}) -> '{transcript}'")
-                            self.output(transcript, AutoASR.OutputFinal)
+                            self.output(self.add_punctuation(transcript), AutoASR.OutputFinal)
                         else:
                             logging.warning(f"dropping ASR transcript (confidence={score:.3f} < {self.confidence_threshold:.3f}) -> '{transcript}'")
                     else:
diff --git a/packages/llm/local_llm/plugins/audio/riva_tts.py b/packages/llm/local_llm/plugins/audio/riva_tts.py
@@ -43,7 +43,7 @@ def __init__(self, riva_server='localhost:50051',
         self.pitch = voice_pitch
         self.volume = voice_volume
 
-        self.language = language
+        self.language = language_code
         self.sample_rate = sample_rate_hz
         
         # find out how to query these for non-English models
diff --git a/packages/llm/local_llm/plugins/process_proxy.py b/packages/llm/local_llm/plugins/process_proxy.py
@@ -44,8 +44,9 @@ def input(self, input):
         self.data_parent.send_bytes(input)
         
     def start(self):
-        self.control_parent.send('start')
-        self.assert_message(self.control_parent.recv(), 'started')
+        if not self.is_alive():
+            self.control_parent.send('start')
+            self.assert_message(self.control_parent.recv(), 'started')
         return super().start()
 
     def run(self):
diff --git a/packages/llm/local_llm/utils/args.py b/packages/llm/local_llm/utils/args.py
@@ -98,7 +98,7 @@ def __init__(self, extras=Defaults, **kwargs):
             self.add_argument("--language-code", default="en-US", help="Language code of the ASR/TTS to be used.")
 
         if 'tts' in extras:
-            self.add_argument("--tts", type=str, default="riva", help="name of path of the TTS model to use (e.g. 'riva', 'xtts', 'none', 'disabled')")
+            self.add_argument("--tts", type=str, default=None, help="name of path of the TTS model to use (e.g. 'riva', 'xtts', 'none', 'disabled')")
             self.add_argument("--tts-buffering", type=str, default="punctuation", help="buffering method for TTS ('none', 'punctuation', 'time', 'punctuation,time')")
             self.add_argument("--voice", type=str, default="English-US.Female-1", help="Voice model name to use for TTS")
             self.add_argument("--voice-rate", type=float, default=1.0, help="TTS SSML voice speaker rate (between 25-250%%)")
@@ -107,7 +107,7 @@ def __init__(self, extras=Defaults, **kwargs):
             #self.add_argument("--voice-min-words", type=int, default=4, help="the minimum number of words the TTS should wait to speak")
             
         if 'asr' in extras:
-            self.add_argument("--asr", type=str, default="riva", help="name or path of the ASR model to use (e.g. 'riva', 'none', 'disabled')")
+            self.add_argument("--asr", type=str, default=None, help="name or path of the ASR model to use (e.g. 'riva', 'none', 'disabled')")
             self.add_argument("--asr-confidence", type=float, default=-2.5, help="minimum ASR confidence (only applies to 'final' transcripts)")
             self.add_argument("--asr-silence", type=float, default=-1.0, help="audio with RMS equal to or below this amount will be considered silent (negative will disable silence detection)")
             self.add_argument("--asr-chunk", type=int, default=1600, help="the number of audio samples to buffer as input to ASR")
diff --git a/packages/llm/local_llm/web/server.py b/packages/llm/local_llm/web/server.py
@@ -111,8 +111,9 @@ def __init__(self, web_host='0.0.0.0', web_port=8050, ws_port=49000,
             self.ssl_context.load_cert_chain(certfile=self.ssl_cert, keyfile=self.ssl_key)
             
         # websocket
-        self.ws_port = ws_port
         self.websocket = None
+        self.ws_port = ws_port
+        self.kwargs['ws_port'] = ws_port
 
         self.ws_server = websocket_serve(self.on_websocket, host=self.host, port=self.ws_port, ssl_context=self.ssl_context, max_size=None)
         self.ws_thread = threading.Thread(target=lambda: self.ws_server.serve_forever(), daemon=True)
diff --git a/packages/llm/local_llm/web/templates/index.html b/packages/llm/local_llm/web/templates/index.html