diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 789c1e5..d5df14f 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -496,7 +496,9 @@ def convert(
                         "youtube_transcript_languages", ("en",)
                     )
                     # Must be a single transcript.
-                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
+                    transcript = YouTubeTranscriptApi.get_transcript(
+                        video_id, languages=youtube_transcript_languages
+                    )  # type: ignore
                     transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                     # Alternative formatting:
                     # formatter = TextFormatter()
@@ -1076,6 +1078,160 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
         return response.choices[0].message.content
 
 
+class VideoConverter(WavConverter):
+    """
+    Converts videos to markdown via:
+    * extraction of metadata (if `exiftool` is installed)
+    * speech transcription (if `speech_recognition` AND `pydub` are installed).
+    * summary via a multimodal LLM if a transcription is available and an llm_client is configured
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """
+        Convert a video to markdown
+
+        Args:
+            local_path (str): The path to the video file
+            metadata_exclude: A list of metadata fields to exclude from the extracted exif metadata
+            metadata_title: The title of the metadata section
+            transcribe: Whether to transcribe the video
+            transcript_title: The title of the transcript section
+            transcript_error: The text emitted in place of the transcript when transcription fails
+            llm_summary: Whether to generate a summary via the provided multimodal LLM client
+            llm_summary_title: The title of the summary section
+            llm_client: The LLM client used for the summary (no summary is generated without it)
+            llm_model: The LLM model used for the summary (no summary is generated without it)
+            llm_prompt: An optional prompt that replaces the default summary prompt
+
+        Returns:
+            None if the path is not recognized as a video, otherwise the conversion result
+        """
+
+        mime_type = mimetypes.guess_type(local_path)[0]
+        if mime_type is None or not mime_type.startswith("video/"):
+            return None
+
+        md_content = ""
+
+        # Add metadata, let the user exclude metadata they don't want
+        metadata = self._get_metadata(local_path)
+        # Exclude these metadata fields by default (but allow the user to override)
+        # Maybe this should be moved to somewhere else
+        DEFAULTS_METADATA_EXCLUDE = [
+            "SourceFile",
+            "ExifToolVersion",
+            "Directory",
+            "FileModifyDate",
+            "FileAccessDate",
+            "FileInodeChangeDate",
+            "FilePermissions",
+        ]
+        metadata_exclude = kwargs.get("metadata_exclude", DEFAULTS_METADATA_EXCLUDE)
+        if metadata_exclude is None:
+            # An explicit None means that nothing is excluded
+            metadata_exclude = []
+        metadata_title = kwargs.get("metadata_title", "### Metadata:\n")
+        # Metadata extraction is optional (see the class docstring), so skip the
+        # whole section instead of failing when nothing was extracted
+        if metadata:
+            if metadata_title is not None:
+                md_content += metadata_title
+            for field in metadata:
+                if field not in metadata_exclude:
+                    md_content += f"{field}: {metadata[field]}\n"
+
+        # Transcribe
+        transcribe = kwargs.get("transcribe", True)
+        transcript = ""
+        if transcribe and IS_AUDIO_TRANSCRIPTION_CAPABLE:
+            # Export the audio track to a temporary wav file for the transcriber
+            handle, temp_path = tempfile.mkstemp(suffix=".wav")
+            os.close(handle)
+            try:
+                sound = pydub.AudioSegment.from_file(local_path)
+                with open(temp_path, "wb") as wav_file:
+                    sound.export(wav_file, format="wav")
+
+                transcript_title = kwargs.get(
+                    "transcript_title", "\n\n### Transcript:\n"
+                )
+                try:
+                    transcript = self._transcribe_audio(temp_path).strip()
+                    md_content += transcript_title + (
+                        "[No speech detected]" if transcript == "" else transcript
+                    )
+                except Exception:
+                    # Best effort: report the failure in the output and carry on
+                    transcript_error = kwargs.get(
+                        "transcript_error", "Error. Could not transcribe."
+                    )
+                    md_content += f"{transcript_title}{transcript_error}"
+
+            finally:
+                # Always remove the temporary wav file
+                os.unlink(temp_path)
+
+        # LLM analysis (Optional) / not all LLMs are fully capable of analyzing video files yet,
+        # But for now we can use the transcript to get a summary of its content
+        llm_summary = kwargs.get("llm_summary", True)
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_summary and llm_client is not None and llm_model is not None:
+            if not transcribe:
+                print("Error: LLM summary requires transcription to be enabled.")
+            elif transcript == "":
+                print("Warning: No transcript found. Skipping LLM summary.")
+            else:
+                llm_summary_title = kwargs.get(
+                    "llm_summary_title", "\n\n### Video Summary:\n"
+                )
+                md_content += (
+                    llm_summary_title
+                    + self._get_llm_video_summary_from_transcript(
+                        transcript,
+                        llm_client,
+                        llm_model,
+                        prompt=kwargs.get("llm_prompt"),
+                    )
+                )
+
+        # Return the result
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def _get_llm_video_summary_from_transcript(
+        self, transcript, client, model, prompt=None
+    ) -> str:
+        """
+        Helper function to get a summary of the video content from the transcript
+
+        Args:
+            transcript: the transcript of the video
+            client: the llm client
+            model: the llm model
+            prompt: the prompt to use (a default prompt is used when None or blank)
+        Returns: the summary (an empty string if the response carries no content)
+        """
+        if prompt is None or prompt.strip() == "":
+            prompt = "The following is video transcript, based on it, write a summary of the video content:\n"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "text", "text": transcript},
+                ],
+            }
+        ]
+
+        response = client.chat.completions.create(model=model, messages=messages)
+        # OpenAI-style responses may carry content=None; keep the declared str return
+        return response.choices[0].message.content or ""
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1282,6 +1438,7 @@ def __init__(
         self.register_page_converter(WavConverter())
         self.register_page_converter(Mp3Converter())
         self.register_page_converter(ImageConverter())
+        self.register_page_converter(VideoConverter())
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
diff --git a/tests/test_files/test.mp4 b/tests/test_files/test.mp4
new file mode 100644
index 0000000..678151c
Binary files /dev/null and b/tests/test_files/test.mp4 differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 4a981bd..98bbefe 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -130,6 +130,12 @@
     "5bda1dd6",
 ]
 
+VIDEO_TEST_EXIFTOOL = {
+    "Title": "Sample video test for MarkItDown",
+    "Comment": "This is a sample video created using FFmpeg, with the voice-over generated by the Parler-TTS model.",
+    "ImageSize": "1280x720",
+}
+
 
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@@ -246,6 +252,14 @@ def test_markitdown_exiftool() -> None:
         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
         assert target in result.text_content
 
+    # Test Video metadata
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "test.mp4"), transcribe=False, llm_summary=False
+    )
+    for key in VIDEO_TEST_EXIFTOOL:
+        target = f"{key}: {VIDEO_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+
 
 def test_markitdown_deprecation() -> None:
     try: