Skip to content

Commit 4da154b

Browse files
feat: audio speech to text partition (#4264)
## Summary

Enables partitioning of WAV audio files into document elements by transcribing with an optional speech-to-text (STT) agent, defaulting to Whisper. Closes #4029.

## Changes

- New `partition_audio()` and routing for `FileType.WAV` so `partition()` supports audio.
- Pluggable STT layer: `SpeechToTextAgent` interface and `SpeechToTextAgentWhisper` implementation.
- Optional extra `audio` in `pyproject.toml` (`openai-whisper`); `all-docs` includes `audio`.
- Config: `STT_AGENT` (and `STT_AGENT_MODULES_WHITELIST`) for choosing the STT implementation.

## Usage

`pip install "unstructured[audio]"`, then `partition("file.wav")` or `partition_audio("file.wav", language="en")`.
1 parent 6aeb74f commit 4da154b

File tree

21 files changed

+1015
-41
lines changed

21 files changed

+1015
-41
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.21.11
2+
3+
### Enhancements
4+
- **Add speech-to-text to multimodal pipeline**: Audio files (WAV, MP3, FLAC, M4A, OGG, OPUS, WEBM, and any format supported by ffmpeg) can now be partitioned into document elements via speech-to-text. Install the optional `audio` extra (`pip install "unstructured[audio]"`) to use the Whisper-based partitioner. Call `partition()` or `partition_audio()` with an audio file to get a transcript as `NarrativeText` elements, each carrying `segment_start_seconds` / `segment_end_seconds` metadata. **Known limitation**: segment timestamps are dropped when elements are merged by a chunking strategy; consume un-chunked elements directly if audio timeline alignment is required.
5+
16
## 0.21.10
27
- **Add Form Class**: Adds a new form class in elements.py to deal with forms
38

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,12 @@ xlsx = [
111111
"pandas>=2.0.0, <4.0.0",
112112
"xlrd>=2.0.1, <3.0.0",
113113
]
114+
# Speech-to-text for partition_audio (multimodal: audio -> elements)
115+
audio = [
116+
"openai-whisper>=20231117, <20270000",
117+
]
114118
all-docs = [
115-
"unstructured[csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
119+
"unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
116120
]
117121
# Feature extras
118122
chunking-tokens = [

test_unstructured/file_utils/test_filetype.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,14 @@ def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_ty
169169
("expected_value", "file_name", "mime_type"),
170170
[
171171
(FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
172+
(FileType.BMP, "img/bmp_24.bmp", "image/x-bmp"),
173+
(FileType.BMP, "img/bmp_24.bmp", "image/x-ms-bmp"),
172174
(FileType.CSV, "stanley-cups.csv", "text/csv"),
173175
(FileType.CSV, "stanley-cups.csv", "application/csv"),
174176
(FileType.CSV, "stanley-cups.csv", "application/x-csv"),
175177
(FileType.EML, "eml/fake-email.eml", "message/rfc822"),
176178
(FileType.HEIC, "img/DA-1p.heic", "image/heic"),
179+
(FileType.HEIC, "img/DA-1p.heic", "image/x-heic"),
177180
(FileType.HTML, "example-10k-1p.html", "text/html"),
178181
(FileType.JPG, "img/example.jpg", "image/jpeg"),
179182
(FileType.JSON, "spring-weather.html.json", "application/json"),

test_unstructured/file_utils/test_model.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def it_can_recognize_a_file_type_from_a_mime_type(
5656
):
5757
assert FileType.from_mime_type(mime_type) is file_type
5858

59-
@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None])
59+
@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "foo/bar", None])
6060
def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
6161
self, mime_type: str | None
6262
):
@@ -76,7 +76,7 @@ def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
7676
(FileType.PDF, "pdf"),
7777
(FileType.XLS, "xlsx"),
7878
(FileType.UNK, None),
79-
(FileType.WAV, None),
79+
(FileType.WAV, "audio"),
8080
(FileType.ZIP, None),
8181
],
8282
)
@@ -98,7 +98,7 @@ def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies
9898
(FileType.ODT, ("docx", "pypandoc")),
9999
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
100100
(FileType.UNK, ()),
101-
(FileType.WAV, ()),
101+
(FileType.WAV, ()), # STT agent deps validated at runtime
102102
(FileType.ZIP, ()),
103103
],
104104
)
@@ -119,7 +119,7 @@ def it_knows_which_importable_packages_its_partitioner_depends_on(
119119
(FileType.JPG, True),
120120
(FileType.PDF, True),
121121
(FileType.PPTX, True),
122-
(FileType.WAV, False),
122+
(FileType.WAV, True),
123123
(FileType.ZIP, False),
124124
(FileType.EMPTY, False),
125125
(FileType.UNK, False),
@@ -163,14 +163,13 @@ def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
163163
(FileType.JPG, "partition_image"),
164164
(FileType.PNG, "partition_image"),
165165
(FileType.TIFF, "partition_image"),
166+
(FileType.WAV, "partition_audio"),
166167
],
167168
)
168169
def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
169170
assert file_type.partitioner_function_name == expected_value
170171

171-
@pytest.mark.parametrize(
172-
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
173-
)
172+
@pytest.mark.parametrize("file_type", [FileType.ZIP, FileType.EMPTY, FileType.UNK])
174173
def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
175174
self, file_type: FileType
176175
):
@@ -189,16 +188,15 @@ def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_
189188
(FileType.JPG, "unstructured.partition.image"),
190189
(FileType.PNG, "unstructured.partition.image"),
191190
(FileType.TIFF, "unstructured.partition.image"),
191+
(FileType.WAV, "unstructured.partition.audio"),
192192
],
193193
)
194194
def it_knows_the_fully_qualified_name_of_its_partitioner_module(
195195
self, file_type: FileType, expected_value: str
196196
):
197197
assert file_type.partitioner_module_qname == expected_value
198198

199-
@pytest.mark.parametrize(
200-
"file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
201-
)
199+
@pytest.mark.parametrize("file_type", [FileType.ZIP, FileType.EMPTY, FileType.UNK])
202200
def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
203201
self, file_type: FileType
204202
):
@@ -217,6 +215,7 @@ def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_p
217215
(FileType.JPG, "image"),
218216
(FileType.PNG, "image"),
219217
(FileType.TIFF, "image"),
218+
(FileType.WAV, "audio"),
220219
(FileType.XLS, "xlsx"),
221220
(FileType.XLSX, "xlsx"),
222221
],

0 commit comments

Comments
 (0)