Unstructured-IO
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 5 additions & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 5 additions & 1 deletion
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
Lines changed: 3 additions & 0 deletions
diff --git a/test_unstructured/file_utils/test_model.py b/test_unstructured/file_utils/test_model.py
Lines changed: 9 additions & 10 deletions
@@ -1,3 +1,8 @@
+## 0.21.11
+
+### Enhancements
+- **Add speech-to-text to multimodal pipeline**: Audio files (WAV, MP3, FLAC, M4A, OGG, OPUS, WEBM, and any other format supported by ffmpeg) can now be partitioned into document elements via speech-to-text. Install the optional `audio` extra (`pip install "unstructured[audio]"`) to use the Whisper-based partitioner. Call `partition()` or `partition_audio()` with an audio file to get the transcript as `NarrativeText` elements, each carrying `segment_start_seconds` / `segment_end_seconds` metadata. **Known limitation**: segment timestamps are dropped when elements are merged by a chunking strategy; consume the un-chunked elements directly if audio timeline alignment is required.
+
 ## 0.21.10
 - **Add Form Class**: Adds a new form class in elements.py to deal with forms
 
 
@@ -111,8 +111,12 @@ xlsx = [
     "pandas>=2.0.0, <4.0.0",
     "xlrd>=2.0.1, <3.0.0",
 ]
+# Speech-to-text for partition_audio (multimodal: audio -> elements)
+audio = [
+    "openai-whisper>=20231117, <20270000",
+]
 all-docs = [
-    "unstructured[csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
+    "unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",
 ]
 # Feature extras
 chunking-tokens = [
 
@@ -169,11 +169,14 @@ def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_ty
     ("expected_value", "file_name", "mime_type"),
     [
         (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
+        (FileType.BMP, "img/bmp_24.bmp", "image/x-bmp"),
+        (FileType.BMP, "img/bmp_24.bmp", "image/x-ms-bmp"),
         (FileType.CSV, "stanley-cups.csv", "text/csv"),
         (FileType.CSV, "stanley-cups.csv", "application/csv"),
         (FileType.CSV, "stanley-cups.csv", "application/x-csv"),
         (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
         (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
+        (FileType.HEIC, "img/DA-1p.heic", "image/x-heic"),
         (FileType.HTML, "example-10k-1p.html", "text/html"),
         (FileType.JPG, "img/example.jpg", "image/jpeg"),
         (FileType.JSON, "spring-weather.html.json", "application/json"),
 
@@ -56,7 +56,7 @@ def it_can_recognize_a_file_type_from_a_mime_type(
     ):
         assert FileType.from_mime_type(mime_type) is file_type
 
-    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None])
+    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "foo/bar", None])
     def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
         self, mime_type: str | None
     ):
@@ -76,7 +76,7 @@ def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
             (FileType.PDF, "pdf"),
             (FileType.XLS, "xlsx"),
             (FileType.UNK, None),
-            (FileType.WAV, None),
+            (FileType.WAV, "audio"),
             (FileType.ZIP, None),
         ],
     )
@@ -98,7 +98,7 @@ def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies
             (FileType.ODT, ("docx", "pypandoc")),
             (FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
             (FileType.UNK, ()),
-            (FileType.WAV, ()),
+            (FileType.WAV, ()),  # STT agent deps validated at runtime
             (FileType.ZIP, ()),
         ],
     )
@@ -119,7 +119,7 @@ def it_knows_which_importable_packages_its_partitioner_depends_on(
             (FileType.JPG, True),
             (FileType.PDF, True),
             (FileType.PPTX, True),
-            (FileType.WAV, False),
+            (FileType.WAV, True),
             (FileType.ZIP, False),
             (FileType.EMPTY, False),
             (FileType.UNK, False),
@@ -163,14 +163,13 @@ def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
             (FileType.JPG, "partition_image"),
             (FileType.PNG, "partition_image"),
             (FileType.TIFF, "partition_image"),
+            (FileType.WAV, "partition_audio"),
         ],
     )
     def it_knows_its_partitioner_function_name(self, file_type: FileType, expected_value: str):
         assert file_type.partitioner_function_name == expected_value
 
-    @pytest.mark.parametrize(
-        "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
-    )
+    @pytest.mark.parametrize("file_type", [FileType.ZIP, FileType.EMPTY, FileType.UNK])
     def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_partitionable(
         self, file_type: FileType
     ):
@@ -189,16 +188,15 @@ def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_
             (FileType.JPG, "unstructured.partition.image"),
             (FileType.PNG, "unstructured.partition.image"),
             (FileType.TIFF, "unstructured.partition.image"),
+            (FileType.WAV, "unstructured.partition.audio"),
         ],
     )
     def it_knows_the_fully_qualified_name_of_its_partitioner_module(
         self, file_type: FileType, expected_value: str
     ):
         assert file_type.partitioner_module_qname == expected_value
 
-    @pytest.mark.parametrize(
-        "file_type", [FileType.WAV, FileType.ZIP, FileType.EMPTY, FileType.UNK]
-    )
+    @pytest.mark.parametrize("file_type", [FileType.ZIP, FileType.EMPTY, FileType.UNK])
     def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_partitionable(
         self, file_type: FileType
     ):
@@ -217,6 +215,7 @@ def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_p
             (FileType.JPG, "image"),
             (FileType.PNG, "image"),
             (FileType.TIFF, "image"),
+            (FileType.WAV, "audio"),
             (FileType.XLS, "xlsx"),
             (FileType.XLSX, "xlsx"),
         ],
Original file line number	Diff line number	Diff line change
`@@ -111,8 +111,12 @@ xlsx = [`
`111`	`111`	`"pandas>=2.0.0, <4.0.0",`
`112`	`112`	`"xlrd>=2.0.1, <3.0.0",`
`113`	`113`	`]`
	`114`	`+# Speech-to-text for partition_audio (multimodal: audio -> elements)`
	`115`	`+audio = [`
	`116`	`+ "openai-whisper>=20231117, <20270000",`
	`117`	`+]`
`114`	`118`	`all-docs = [`
`115`		`- "unstructured[csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",`
	`119`	`+ "unstructured[audio,csv,doc,docx,epub,image,md,odt,org,pdf,ppt,pptx,rtf,rst,tsv,xlsx]",`
`116`	`120`	`]`
`117`	`121`	`# Feature extras`
`118`	`122`	`chunking-tokens = [`