Commit 0df9604

tasks: add any-to-any (#1346)
Since these models are becoming more popular, I added the task page.

Co-authored-by: vb <[email protected]>
1 parent fe0e5e6 commit 0df9604

3 files changed: +126 -1 lines changed

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
## Use Cases

### Embodied Agents

Any-to-any models can help embodied agents, such as video game characters or physical robots, operate in multi-sensory environments. The model can take in an image or video of a scene, text prompts, and audio, and respond by generating text, taking actions, predicting the next frames, or producing speech commands.

### Real-time Accessibility Systems

Vision-language-based any-to-any models can be used to aid visually impaired people. A real-time, on-device any-to-any model can take a live video stream from wearable glasses and describe the scene in audio (e.g., "A person in a red coat is walking toward you") or provide real-time closed captions and environmental sound cues.

### Multimodal Content Creation

Any-to-any models can be used to generate multimodal content. For example, given a video and an outline, the model can generate narration, a refined version of the video, or a descriptive blog post. Moreover, these models can sync narration timing with visual transitions.

## Inference

You can run inference with any-to-any models using the transformers library. Below is an example for the Qwen2.5-Omni-7B model; inference code varies between models, so make sure to check the documentation of the model you're using.

```python
import soundfile as sf
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

conversation = [
    {
        "role": "system",
        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
        ],
    },
]

# Whether to use the video's audio track as an additional input
USE_AUDIO_IN_VIDEO = True

# Prepare the multimodal inputs for the model
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = inputs.to(model.device).to(model.dtype)

# Inference: generate the output text and audio
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)

# Save the generated speech as a 24 kHz WAV file
sf.write(
    "output.wav",
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=24000,
)
```
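
The script above prints the generated transcript and writes the synthesized speech to `output.wav` at a 24 kHz sampling rate. The same conversation format also covers other input modalities. Below is a minimal sketch (not taken from the model card) that asks the same model about a local image; the file name `bee.jpg` and the prompt are placeholders, and the calls are assumed to behave exactly as in the example above.

```python
import soundfile as sf
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# Image + text in, text + speech out. "bee.jpg" is a placeholder path.
conversation = [
    {
        "role": "system",
        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "bee.jpg"},
            {"type": "text", "text": "Describe the position of the bee in detail."},
        ],
    },
]

# No video input here, so the audio-in-video flag is disabled
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=False)
inputs = inputs.to(model.device).to(model.dtype)

# The model returns both generated token ids and a speech waveform
text_ids, audio = model.generate(**inputs, use_audio_in_video=False)

print(processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
sf.write("answer.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
```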
packages/tasks/src/tasks/any-to-any/data.ts

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "A dataset with multiple modality input and output pairs.",
			id: "PKU-Alignment/align-anything",
		},
	],
	demo: {
		inputs: [
			{
				filename: "any-to-any-input.jpg",
				type: "img",
			},
			{
				label: "Text Prompt",
				content: "Describe the position of the bee in detail.",
				type: "text",
			},
		],
		outputs: [
			{
				label: "Generated Text",
				content:
					"The place in the picture is Osaka Castle, located in Osaka, Japan. Osaka Castle is a historic castle that was originally built in the 16th century by Toyotomi Hideyoshi, a powerful warlord of the time. It is one of the most famous landmarks in Osaka and is known for its distinctive white walls and black roof tiles. The castle has been rebuilt several times over the centuries and is now a popular tourist attraction, offering visitors a glimpse into Japan's rich history and culture.",
				type: "text",
			},
			{
				filename: "any-to-any-output.wav",
				type: "audio",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "Strong model that can take in video, audio, image, text and output text and natural speech.",
			id: "Qwen/Qwen2.5-Omni-7B",
		},
		{
			description: "Robust model that can take in image and text and generate image and text.",
			id: "deepseek-ai/Janus-Pro-7B",
		},
		{
			description: "Any-to-any model with speech, video, audio, image and text understanding capabilities.",
			id: "openbmb/MiniCPM-o-2_6",
		},
		{
			description: "A model that can understand image and text and generate image and text.",
			id: "EPFL-VILAB/4M-21_XL",
		},
	],
	spaces: [
		{
			description: "An application to chat with an any-to-any (image & text) model.",
			id: "deepseek-ai/Janus-Pro-7B",
		},
	],
	summary:
		"Any-to-any models can understand two or more modalities and output two or more modalities.",
	widgetModels: [],
	youtubeId: "",
};

export default taskData;

packages/tasks/src/tasks/index.ts

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 import type { PipelineType } from "../pipelines.js";
 import { PIPELINE_DATA } from "../pipelines.js";
 
+import anyToAny from "./any-to-any/data.js";
 import audioClassification from "./audio-classification/data.js";
 import audioToAudio from "./audio-to-audio/data.js";
 import automaticSpeechRecognition from "./automatic-speech-recognition/data.js";
@@ -198,7 +199,7 @@ function getData(type: PipelineType, partialTaskData: TaskDataCustom = placehold
 // Tasks that call getData() without the second argument will
 // have a "placeholder" page.
 export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
-	"any-to-any": getData("any-to-any", placeholder),
+	"any-to-any": getData("any-to-any", anyToAny),
 	"audio-classification": getData("audio-classification", audioClassification),
 	"audio-to-audio": getData("audio-to-audio", audioToAudio),
 	"audio-text-to-text": getData("audio-text-to-text", placeholder),
