Skip to content

Commit

Permalink
add support for taking screenshots from pdf and txt documents using p…
Browse files Browse the repository at this point in the history
…ymupdf
  • Loading branch information
tykling committed Dec 17, 2024
1 parent 0b7c7dd commit 9835734
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 5 deletions.
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"opencv-python==4.10.0.84",
"pillow==11.0.0",
"python-magic==0.4.27",
"PyMuPDF==1.25.1",
]
name = "bma-client-lib"
description = "BornHack Media Archive Client Library"
Expand Down Expand Up @@ -79,3 +80,12 @@ convention = "google"
[tool.mypy]
mypy_path = "src"
strict = true

[tool.pytest.ini_options]
filterwarnings = [
"error",
# https://github.com/swig/swig/issues/2881
'ignore:builtin type SwigPyPacked has no __module__ attribute:DeprecationWarning',
'ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning',
'ignore:builtin type swigvarlink has no __module__ attribute:DeprecationWarning',
]
29 changes: 24 additions & 5 deletions src/bma_client_lib/bma_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import exifread
import httpx
import magic
import pymupdf
from PIL import Image, ImageOps

from .datastructures import (
Expand Down Expand Up @@ -159,13 +160,11 @@ def _handle_image_conversion_job(
logger.debug(f"Converting image size and AR took {time.time() - start} seconds")
logger.debug("Done.")

def _handle_thumbnail_source_job(
self, job: ThumbnailSourceJob, fileinfo: dict[str, str], screenshot_time_seconds: int = 60
) -> None:
def _handle_thumbnail_source_job(self, job: ThumbnailSourceJob, fileinfo: dict[str, str]) -> None:
"""Create a thumbnail source for this file."""
if fileinfo["filetype"] == "video":
# use opencv to get video screenshot
cv2_ss = self._get_video_screenshot(job=job, seconds=screenshot_time_seconds)
cv2_ss = self._get_video_screenshot(job=job)
cc = cv2.cvtColor(cv2_ss, cv2.COLOR_BGR2RGB)
job.images = [Image.fromarray(cc)]
# create an exif object with basic info
Expand All @@ -176,10 +175,22 @@ def _handle_thumbnail_source_job(
exif[0x131] = self.clientinfo["client_version"]
job.exif = exif
return
if fileinfo["filetype"] == "document":
# use pymupdf to take a screenshot of page 1 of pdf/txt
ss = self._get_document_screenshot(job=job)
job.images = [ss]
exif = Image.Exif()
exif[0x100] = job.images[0].width
exif[0x101] = job.images[0].height
exif[0x10E] = f"ThumbnailSource for BMA document file {job.basefile_uuid}"
exif[0x131] = self.clientinfo["client_version"]
job.exif = exif
return

# unsupported filetype
raise JobNotSupportedError(job=job)

def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int) -> Image.Image:
def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int = 60) -> Image.Image:
"""Get a screenshot a certain number of seconds into the video."""
path = self.path / job.source_url[1:]
cam = cv2.VideoCapture(path)
Expand All @@ -198,6 +209,14 @@ def _get_video_screenshot(self, job: ThumbnailSourceJob, seconds: int) -> Image.
cv2.destroyAllWindows()
return frame # type: ignore[no-any-return]

def _get_document_screenshot(self, job: ThumbnailSourceJob, page: int = 0) -> Image.Image:
    """Render one page of a pdf/txt document to a PIL image.

    Args:
        job: The thumbnail source job; ``job.source_url`` (minus its first
            character) is resolved relative to ``self.path`` to find the file.
        page: Zero-based index of the page to render. Defaults to the first page.

    Returns:
        The rendered page as an RGB ``PIL.Image.Image``.

    Raises:
        IndexError: If ``page`` is outside the document's page range.
    """
    path = self.path / job.source_url[1:]
    # Open the document as a context manager so the underlying file handle and
    # native resources are released even if rendering the page fails.
    with pymupdf.open(path) as doc:
        pix = doc[page].get_pixmap()
        # Mode "RGB" relies on get_pixmap() yielding 3-channel samples without
        # alpha, which is its documented default. The image is built inside the
        # ``with`` block so the pixel data is copied before the document closes.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

###############################################################################

def _write_and_upload_result(self, job: Job, filename: str) -> None:
Expand Down

0 comments on commit 9835734

Please sign in to comment.