From 0d56ce228c669735d29e64f23203e2d2868e49e7 Mon Sep 17 00:00:00 2001
From: Deep-Unlearning
Date: Thu, 20 Feb 2025 14:32:09 +0100
Subject: [PATCH 1/3] Initial commit: Add task audio-text-to-text

---
 .../src/tasks/audio-text-to-text/about.md | 55 ++++++++++++++++++++++
 .../src/tasks/audio-text-to-text/data.ts  | 60 +++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 packages/tasks/src/tasks/audio-text-to-text/about.md
 create mode 100644 packages/tasks/src/tasks/audio-text-to-text/data.ts

diff --git a/packages/tasks/src/tasks/audio-text-to-text/about.md b/packages/tasks/src/tasks/audio-text-to-text/about.md
new file mode 100644
index 0000000000..066799bfb6
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-text-to-text/about.md
@@ -0,0 +1,55 @@
+## Different Types of Audio-Text-to-Text Models
+
+Audio-text-to-text models can be categorized into three main types:
+
+- **Base:**
+  Pre-trained models that extract rich audio features using techniques such as Wav2Vec, HuBERT, or Whisper. These models serve as the backbone for various downstream tasks. An example is [Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B), which can be further fine-tuned.
+
+- **Instruction:**
+  Base models fine-tuned on specialized audio instruction datasets to better handle task-specific queries and conversations. For instance, [Ichigo-llama3.1-s-instruct-v0.4](https://huggingface.co/homebrewltd/Ichigo-llama3.1-s-instruct-v0.4) has been optimized to follow detailed audio-related commands.
+
+### Use Cases
+
+- **Multimodal Audio Dialogue:**
+  These models can engage in real-time, multi-turn conversations by processing audio inputs and generating text responses. They are the backbone of advanced voice assistants and interactive dialogue systems.
+
+- **Speech Transcription and Analysis:**
+  Beyond converting spoken words to text, these models capture prosody, emotion, and speaker characteristics. This enriched transcription can be used for applications such as sentiment analysis and speaker profiling.
+
+- **Audio Question Answering:**
+  By directly processing audio inputs, the models can answer questions about the content of an audio clip, whether it’s a podcast excerpt or a recorded conversation.
+
+- **Audio Command Recognition and Automation:**
+  Voice-controlled applications, from smart home devices to computer interfaces, benefit from models that can understand and execute complex spoken commands.
+
+- **Voice-Based Computer Use:**
+  Models can control computing workflows by parsing spoken instructions, making interactions more natural and accessible.
+
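+### Inference
+
+Below is a minimal sketch of prompting an audio-text-to-text model with the 🤗 Transformers library, using Qwen2-Audio-7B as the example checkpoint. It assumes a recent `transformers` release with Qwen2-Audio support and `librosa` installed; `sample-audio.wav` is a placeholder path.
+
+```python
+import librosa
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+
+model_id = "Qwen/Qwen2-Audio-7B"
+processor = AutoProcessor.from_pretrained(model_id)
+model = Qwen2AudioForConditionalGeneration.from_pretrained(model_id)
+
+# Load the clip at the sampling rate the feature extractor expects (16 kHz).
+audio, _ = librosa.load("sample-audio.wav", sr=processor.feature_extractor.sampling_rate)
+
+# The base model marks the position of the clip in the prompt with special audio tokens.
+prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Describe what is being said in the audio:"
+inputs = processor(text=prompt, audios=audio, return_tensors="pt")
+
+generated_ids = model.generate(**inputs, max_new_tokens=256)
+# Drop the prompt tokens so that only the newly generated answer is decoded.
+generated_ids = generated_ids[:, inputs.input_ids.size(1) :]
+print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
+```
+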
+
+### Useful Resources
+
diff --git a/packages/tasks/src/tasks/audio-text-to-text/data.ts b/packages/tasks/src/tasks/audio-text-to-text/data.ts
new file mode 100644
index 0000000000..1651f196b7
--- /dev/null
+++ b/packages/tasks/src/tasks/audio-text-to-text/data.ts
@@ -0,0 +1,60 @@
+import type { TaskDataCustom } from "../index.js";
+
+const taskData: TaskDataCustom = {
+	datasets: [
+		{
+			description: "Instructions composed of audio and text.",
+			id: "homebrewltd/instruction-speech-encodec-v1.5",
+		},
+	],
+	demo: {
+		inputs: [
+			{
+				filename: "sample-audio.wav",
+				type: "audio",
+			},
+			{
+				label: "Text Prompt",
+				content: "Transcribe and describe what is being said in the audio.",
+				type: "text",
+			},
+		],
+		outputs: [
+			{
+				label: "Answer",
+				content: "The audio contains a person explaining a recipe for chocolate chip cookies. They describe mixing butter and sugar first, then adding eggs and vanilla extract, followed by the dry ingredients.",
+				type: "text",
+			},
+		],
+	},
+	metrics: [],
+	models: [
+		{
+			description: "Small yet powerful audio language model.",
+			id: "fixie-ai/ultravox-v0_5-llama-3_2-1b",
+		},
+		{
+			description: "Audio language model based on Llama 3.1 8B.",
+			id: "homebrewltd/Ichigo-llama3.1-s-instruct-v0.4",
+		},
+		{
+			description: "Strong audio language model.",
+			id: "Qwen/Qwen2-Audio-7B",
+		},
+	],
+	spaces: [
+		{
+			description: "Powerful audio-language model assistant.",
+			id: "Qwen/Qwen2-Audio-Instruct-Demo",
+		},
+		{
+			description: "Real-time audio-text-to-text model.",
+			id: "Steveeeeeeen/talk-to-ultravox-0.5",
+		},
+	],
+	summary:
+		"Audio-text-to-text models extend multimodal AI into the speech domain. Much like their visual counterparts, these models are designed to understand and generate text based on audio inputs. Recent research in spoken dialogue systems and speech large language models (LLMs) highlights how such models are evolving, leveraging both semantic and acoustic representations extracted from speech signals.",
+	widgetModels: [],
+};
+
+export default taskData;

From 717c0d0d32e0d4eb6b88193c3ec927d3860ddc19 Mon Sep 17 00:00:00 2001
From: Steven Zheng <58599908+Deep-unlearning@users.noreply.github.com>
Date: Tue, 25 Feb 2025 11:31:07 +0100
Subject: [PATCH 2/3] Update packages/tasks/src/tasks/audio-text-to-text/about.md

Co-authored-by: vb
---
 packages/tasks/src/tasks/audio-text-to-text/about.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/tasks/audio-text-to-text/about.md b/packages/tasks/src/tasks/audio-text-to-text/about.md
index 066799bfb6..104c8578bd 100644
--- a/packages/tasks/src/tasks/audio-text-to-text/about.md
+++ b/packages/tasks/src/tasks/audio-text-to-text/about.md
@@ -1,6 +1,6 @@
 ## Different Types of Audio-Text-to-Text Models
 
-Audio-text-to-text models can be categorized into three main types:
+Audio-text-to-text models can be categorized into two main types:
 
 - **Base:**
   Pre-trained models that extract rich audio features using techniques such as Wav2Vec, HuBERT, or Whisper. These models serve as the backbone for various downstream tasks. An example is [Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B), which can be further fine-tuned.
From 8d6bd6f7ef82e1f9b4e06eb5d6e5034c999785ff Mon Sep 17 00:00:00 2001
From: Deep-Unlearning
Date: Fri, 28 Feb 2025 14:44:49 +0100
Subject: [PATCH 3/3] added useful resources

---
 packages/tasks/src/tasks/audio-text-to-text/about.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/packages/tasks/src/tasks/audio-text-to-text/about.md b/packages/tasks/src/tasks/audio-text-to-text/about.md
index 104c8578bd..705de3720a 100644
--- a/packages/tasks/src/tasks/audio-text-to-text/about.md
+++ b/packages/tasks/src/tasks/audio-text-to-text/about.md
@@ -53,3 +53,14 @@
 
 ### Useful Resources
 
+Here are some useful resources:
+
+- [Ultravox, a fast multimodal large language model designed for real-time voice interactions.](https://github.com/fixie-ai/ultravox)
+
+- [Qwen2-Audio, an open-source large-scale audio-language model by Alibaba Cloud that supports voice chat and audio analysis in multiple languages.](https://github.com/QwenLM/Qwen2-Audio)
+
+- [WhisperSpeech, a compact, open-source speech tokenizer that improves multilingual performance with minimal impact on English capabilities.](https://github.com/janhq/WhisperSpeech)
+
+- [The Phi Cookbook, a guide to Microsoft's open-source Phi models: capable and cost-effective small language models.](https://github.com/microsoft/PhiCookBook)
+
+- [FastRTC, which turns any Python function into a real-time audio and video stream over WebRTC or WebSockets.](https://huggingface.co/fastrtc)
\ No newline at end of file