Add pages method, closes #12. Improve x coordinate positioning, closes #14
neuml · Dec 13, 2024 · d8152a4 · d8152a4
1 parent 66327c3
commit d8152a4
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 56 deletions.
diff --git a/src/python/txtmarker/base.py b/src/python/txtmarker/base.py
@@ -3,15 +3,18 @@
 """
 
 # Highlight colors
-COLORS = [(0.914, 0.118, 0.388), # Red
-          (0.129, 0.588, 0.953), # Blue
-          (1.000, 0.757, 0.027), # Yellow
-          (0.298, 0.686, 0.314), # Green
-          (0.404, 0.227, 0.718), # Purple
-          (1.000, 0.596, 0.000), # Orange
-          (0.475, 0.333, 0.282)] # Bronze
-
-class Highlighter(object):
+COLORS = [
+    (0.914, 0.118, 0.388),  # Red
+    (0.129, 0.588, 0.953),  # Blue
+    (1.000, 0.757, 0.027),  # Yellow
+    (0.298, 0.686, 0.314),  # Green
+    (0.404, 0.227, 0.718),  # Purple
+    (1.000, 0.596, 0.000),  # Orange
+    (0.475, 0.333, 0.282),  # Bronze
+]
+
+
+class Highlighter:
     """
     Base class that finds text and adds annotations to files.
     """
@@ -40,3 +43,14 @@ def highlight(self, infile, outfile, highlights):
         Returns:
             annotation metadata - list of (title, rgb, page, x1, y1, x2, y2)
         """
+
+    def pages(self, infile):
+        """
+        Opens input file and returns an iterator of pages.
+
+        Args:
+            infile: path to read input file
+
+        Returns:
+            iterator of pages
+        """
diff --git a/src/python/txtmarker/factory.py b/src/python/txtmarker/factory.py
@@ -4,7 +4,8 @@
 
 from . import pdf
 
-class Factory(object):
+
+class Factory:
     """
     Creates document highlighters.
     """

diff --git a/src/python/txtmarker/pdf.py b/src/python/txtmarker/pdf.py
@@ -12,6 +12,7 @@
 
 from . import base
 
+
 class Highlighter(base.Highlighter):
     """
     Finds text and adds annotations to PDF files.
@@ -20,15 +21,8 @@ class Highlighter(base.Highlighter):
     def highlight(self, infile, outfile, highlights):
         annotations = []
 
-        for page, layout in enumerate(extract_pages(infile, laparams=LAParams(line_margin=1.0, char_margin=4.0))):
-            elements = []
-
-            # Extract elements
-            self.extract(elements, layout)
-
-            # Get formatted page text
-            text = self.text(elements)
-
+        # pylint: disable=R1702
+        for page, elements, text in self.pages(infile):
             for name, query in highlights:
                 result = self.search(query, text)
                 if result:
@@ -50,16 +44,37 @@ def highlight(self, infile, outfile, highlights):
                                 break
 
                         # Create annotation for each column
-                        annotations.append((name, base.COLORS[index], page) + self.layout(elements[start:eindex]))
-                        annotations.append((name, base.COLORS[index], page) + self.layout(elements[eindex:end+1]))
+                        annotations.append(
+                            (name, base.COLORS[index], page)
+                            + self.layout(elements[start:eindex])
+                        )
+                        annotations.append(
+                            (name, base.COLORS[index], page)
+                            + self.layout(elements[eindex : end + 1])
+                        )
                     else:
                         # Single column annotation
-                        annotations.append((name, base.COLORS[index], page) + self.layout(elements[start:end+1]))
+                        annotations.append(
+                            (name, base.COLORS[index], page)
+                            + self.layout(elements[start : end + 1])
+                        )
 
         self.annotate(annotations, infile, outfile)
 
         return annotations
 
+    def pages(self, infile):
+        for page, layout in enumerate(
+            extract_pages(infile, laparams=LAParams(line_margin=1.0, char_margin=4.0))
+        ):
+            elements = []
+
+            # Extract elements
+            self.extract(elements, layout)
+
+            # Get formatted page text
+            yield page, elements, self.text(elements)
+
     def extract(self, elements, layout):
         """
         Extracts text lines and associated coordinates.
@@ -76,7 +91,14 @@ def extract(self, elements, layout):
                 text = obj.get_text()
 
                 # Clean common ligatures and unicode chars
-                pairs = [("ﬀ", "ff"), ("ﬃ", "ffi"), ("ﬁ", "fi"), ("ﬂ", "fl"), ("\u2010", "-"), ("\u2013", "-")]
+                pairs = [
+                    ("ﬀ", "ff"),
+                    ("ﬃ", "ffi"),
+                    ("ﬁ", "fi"),
+                    ("ﬂ", "fl"),
+                    ("\u2010", "-"),
+                    ("\u2013", "-"),
+                ]
 
                 for find, replace in pairs:
                     text = text.replace(find, replace)
@@ -141,7 +163,7 @@ def search(self, query, text):
         if self.chunks > 0:
             # Chunk into subqueries, require at least 50 chars per chunk
             n = max(int(len(query) / self.chunks), 50)
-            subqueries = [query[x:x+n] for x in range(0, len(query), n)]
+            subqueries = [query[x : x + n] for x in range(0, len(query), n)]
 
             # Ensure last chunk is n chars or bigger
             if len(subqueries) > 1 and len(subqueries[-1]) < n:
@@ -185,11 +207,11 @@ def layout(self, elements):
             (left, bottom, right, top) coordinates
         """
 
-        left = min([element[0][0] for element in elements])
-        bottom = min([element[0][1] for element in elements])
+        left = min(element[0][0] for element in elements)
+        bottom = min(element[0][1] for element in elements)
 
-        right = max([element[0][2] for element in elements])
-        top = max([element[0][3] for element in elements])
+        right = max(element[0][2] for element in elements)
+        top = max(element[0][3] for element in elements)
 
         return (left, bottom, right, top)
 
@@ -210,35 +232,75 @@ def annotate(self, annotations, infile, outfile):
 
         for title, rgb, page, x1, y1, x2, y2 in annotations:
             # Highlight text
-            annotator.add_annotation("square", Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
-                                     Appearance(fill=rgb + (0.3,), stroke_color=rgb + (0.3, ), stroke_width=0))
+            annotator.add_annotation(
+                "square",
+                Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
+                Appearance(
+                    fill=rgb + (0.3,), stroke_color=rgb + (0.3,), stroke_width=0
+                ),
+            )
 
             if title:
-                # Determine if title text should be in left or right margin
-                if x1 < 250:
-                    x1, x2 = max(5, x1 - 35), x1
-                else:
-                    x1, x2 = x2, x2 + 35
+                # Calculate x position coordinates
+                bounds, fontsize = annotator.get_page_bounding_box(page), 8
+                x1, x2, xmid = self.xposition(bounds, title, fontsize, x1, x2)
 
                 # Calculate center of highlight annotation and offset
                 center = y1 + ((y2 - y1) / 2)
                 offset = min(max(5, len(title)), 20)
 
-                # Set position of text annotation. Handle column layout conflicts.
-                y1, y2 = self.position(ranges, page, x1 >= 250, center, offset)
+                # Calculate y position coordinates. Handle column layout conflicts.
+                y1, y2 = self.yposition(ranges, page, x1 >= xmid, center, offset)
 
                 # Add title annotation next to highlight
-                annotator.add_annotation("text", Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
-                                         Appearance(fill=rgb + (1,), font_size=7, stroke_width=1, content=title))
+                annotator.add_annotation(
+                    "text",
+                    Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
+                    Appearance(
+                        fill=rgb + (1,),
+                        font_size=fontsize,
+                        stroke_width=1,
+                        content=title,
+                    ),
+                )
 
                 # Register range
-                ranges.append((page, 0 if x1 < 250 else 1, y1, y2))
+                ranges.append((page, 0 if x1 < xmid else 1, y1, y2))
 
         annotator.write(outfile)
 
-    def position(self, ranges, page, column, center, offset):
+    def xposition(self, bounds, title, fontsize, x1, x2):
+        """
+        Calculates the x coordinates for a text element.
+
+        Args:
+            bounds: page bounds (x1, y1, x2, y2) box
+            title: title annotation text
+            fontsize: font text size
+            x1: box annotation x start
+            x2: box annotation x end
+
+        Returns:
+            text x1, text x2, text x midpoint
+        """
+
+        # Text annotation position parameters
+        _, _, xmax, _ = bounds
+        xmid = xmax / 2
+        xspacer, xmargin, xoffset = 5.0, 2.5, min(len(title) * fontsize, 75)
+
+        # Determine if title text should be in left or right margin
+        if x1 < xmid:
+            x1, x2 = max(xmargin, x1 - xoffset), x1 - xspacer
+        else:
+            x1, x2 = x2 + xspacer, min(xmax - xspacer, x2 + xoffset)
+
+        return x1, x2, xmid
+
+    def yposition(self, ranges, page, column, center, offset):
         """
-        Searches for the closest open range to use for an annotation element.
+        Calculates the y coordinates for a text element. Searches for the closest
+        open range to use for an annotation element.
 
         Args:
             ranges: list of existing annotation ranges
@@ -264,7 +326,9 @@ def position(self, ranges, page, column, center, offset):
                 y1, y2 = y1 - offset, y2 - offset
             else:
                 # Try with positive offset
-                conflicts = self.conflicts(ranges, page, column, y1 + offset, y2 + offset)
+                conflicts = self.conflicts(
+                    ranges, page, column, y1 + offset, y2 + offset
+                )
                 if not conflicts:
                     y1, y2 = y1 + offset, y2 + offset
                 else:
@@ -308,4 +372,6 @@ def overlaps(self, start1, end1, start2, end2):
             number of overlapping coordinates
         """
 
-        return len(set(range(int(start1), int(end1))) & set(range(int(start2), int(end2))))
+        return len(
+            set(range(int(start1), int(end1))) & set(range(int(start2), int(end2)))
+        )
diff --git a/test/python/testfactory.py b/test/python/testfactory.py
@@ -7,6 +7,7 @@
 # pylint: disable=E0401
 from txtmarker.factory import Factory
 
+
 class TestFactory(unittest.TestCase):
     """
     Factory tests

diff --git a/test/python/testpdf.py b/test/python/testpdf.py
@@ -9,6 +9,7 @@
 # pylint: disable=E0401
 from txtmarker.factory import Factory
 
+
 class TestPDF(unittest.TestCase):
     """
     PDF tests
@@ -36,13 +37,19 @@ def testHighlights(self):
 
         highlights = [
             ("Basic", "Hashing is a key part"),
-            ("Multi-line", "Hashes are used to secure. Hashes can be deterministic or non-deterministic. Hashes can be significantly " +
-             "different with small changes to data or very similar."),
+            (
+                "Multi-line",
+                "Hashes are used to secure. Hashes can be deterministic or non-deterministic. Hashes can be significantly "
+                + "different with small changes to data or very similar.",
+            ),
             ("Regex", "This article.*Python"),
             ("Regex Multi-line", "The above(.|\n)+is deterministic"),
-            (None, "Python provides the built-in .hash()")]
+            (None, "Python provides the built-in .hash()"),
+        ]
 
-        annotations = highlighter.highlight(self.path("hash.pdf"), self.path("out.pdf"), highlights)
+        annotations = highlighter.highlight(
+            self.path("hash.pdf"), self.path("out.pdf"), highlights
+        )
 
         # Check annotations created
         self.assertEqual(len(annotations), 5)
@@ -60,7 +67,9 @@ def testOverlaps(self):
         # Create duplicate highlights to test overlapping range
         highlights = [("Overlaps", "This article will explore various methods")] * 4
 
-        annotations = highlighter.highlight(self.path("embeddings.pdf"), self.path("out.pdf"), highlights)
+        annotations = highlighter.highlight(
+            self.path("embeddings.pdf"), self.path("out.pdf"), highlights
+        )
 
         # Check annotations created
         self.assertEqual(len(annotations), 4)
@@ -72,11 +81,18 @@ def testFormatter(self):
 
         highlighter = Factory.create("pdf", lambda x: re.sub(r"[^A-Za-z0-9]", "", x), 4)
 
-        highlights = [("End newline", "txtai builds an AI-powered index over sections of text\n"),
-                      ("Long line", "NeuML has years of relevant experience in building data strategies for both small and large organizations. " +
-                       "With the right data, valuable insights can be gained by capitalizing on modern advances in machine learning. ")]
-
-        annotations = highlighter.highlight(self.path("neuml.pdf"), self.path("out.pdf"), highlights)
+        highlights = [
+            ("End newline", "txtai builds an AI-powered index over sections of text\n"),
+            (
+                "Long line",
+                "NeuML has years of relevant experience in building data strategies for both small and large organizations. "
+                + "With the right data, valuable insights can be gained by capitalizing on modern advances in machine learning. ",
+            ),
+        ]
+
+        annotations = highlighter.highlight(
+            self.path("neuml.pdf"), self.path("out.pdf"), highlights
+        )
 
         # Check annotations created
         self.assertEqual(len(annotations), 2)
@@ -88,9 +104,13 @@ def testColumns(self):
 
         highlighter = Factory.create("pdf")
 
-        highlights = [("Multi-column", "enable machine-learning(.|\n)+specific domains")]
+        highlights = [
+            ("Multi-column", "enable machine-learning(.|\n)+specific domains")
+        ]
 
-        annotations = highlighter.highlight(self.path("neuml.pdf"), self.path("out2.pdf"), highlights)
+        annotations = highlighter.highlight(
+            self.path("neuml.pdf"), self.path("out2.pdf"), highlights
+        )
 
         # Check annotations created
         self.assertEqual(len(annotations), 2)