CSDK-1843 improve poor performance of virtual audio devices

aconchillo · aconchillo · commit 0054dfaf755e · 2023-10-23T14:18:13.000-07:00
We now create one thread per device instead of creating a new thread every time
we call read_frames()/write_frames().

We also take into account the time spent between calls, since those calls are
usually made consecutively.
diff --git a/demos/audio/wav_audio_send.py b/demos/audio/wav_audio_send.py
@@ -96,10 +96,9 @@ def send_wav_file(self, file_name):
         while not self.__app_quit and sent_frames < total_frames:
             # Read 100ms worth of audio frames.
             frames = wav.readframes(1600)
-            frames_read = len(frames) / 2 # 16-bit linear PCM
-            if frames_read > 0:
+            if len(frames) > 0:
                 self.__mic_device.write_frames(frames)
-            sent_frames += frames_read
+                sent_frames += 1600
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/src/media/virtual_microphone_device.rs b/src/media/virtual_microphone_device.rs
@@ -10,7 +10,7 @@ use pyo3::types::PyBytes;
 /// devices are used to send audio to the meeting.
 ///
 /// The audio format used by virtual microphone devices is 16-bit linear PCM.
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 #[pyclass(name = "VirtualMicrophoneDevice", module = "daily")]
 pub struct PyVirtualMicrophoneDevice {
     device_name: String,
@@ -73,6 +73,12 @@ impl PyVirtualMicrophoneDevice {
     /// number of audio frames is not a multiple of 10ms worth of audio frames,
     /// silence will be added as padding.
     ///
+    /// To get low-latency, real-time performance it is important that the
+    /// time between consecutive calls to this function does not exceed the
+    /// duration of the audio frames provided. For example, if we write 10ms
+    /// of audio frames per call, then we shouldn't take longer than 10ms to
+    /// provide the next ones.
+    ///
     /// :param bytestring frames: A bytestring with the audio frames to write
     ///
     /// :return: The number of audio frames written
diff --git a/src/media/virtual_speaker_device.rs b/src/media/virtual_speaker_device.rs
@@ -10,7 +10,7 @@ use pyo3::types::PyBytes;
 /// used to receive audio from the meeting.
 ///
 /// The audio format used by virtual speaker devices is 16-bit linear PCM.
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 #[pyclass(name = "VirtualSpeakerDevice", module = "daily")]
 pub struct PyVirtualSpeakerDevice {
     device_name: String,
@@ -72,6 +72,11 @@ impl PyVirtualSpeakerDevice {
     /// should be able to read 160 audio frames (10ms), 320 (20ms), 480 (30ms),
     /// etc.
     ///
+    /// To get low-latency, real-time performance, the time between
+    /// consecutive calls to this function must not exceed the duration of
+    /// the requested audio frames. For example, if we request 10ms of audio
+    /// frames per call, we shouldn't take longer than 10ms to process them.
+    ///
     /// :param int num_frames: The number of audio frames to read
     ///
     /// :return: The read audio frames as a bytestring. If no audio frames could be read, it returns an empty bytestring