From d8152a4c3c862dca505de074af1ffa24ae43e12a Mon Sep 17 00:00:00 2001 From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com> Date: Fri, 13 Dec 2024 05:47:40 -0500 Subject: [PATCH] Add pages method, closes #12. Improve x coordinate positioning, closes #14 --- src/python/txtmarker/base.py | 32 +++++--- src/python/txtmarker/factory.py | 3 +- src/python/txtmarker/pdf.py | 134 ++++++++++++++++++++++++-------- test/python/testfactory.py | 1 + test/python/testpdf.py | 44 ++++++++--- 5 files changed, 158 insertions(+), 56 deletions(-) diff --git a/src/python/txtmarker/base.py b/src/python/txtmarker/base.py index d425763..a1aa8f1 100644 --- a/src/python/txtmarker/base.py +++ b/src/python/txtmarker/base.py @@ -3,15 +3,18 @@ """ # Highlight colors -COLORS = [(0.914, 0.118, 0.388), # Red - (0.129, 0.588, 0.953), # Blue - (1.000, 0.757, 0.027), # Yellow - (0.298, 0.686, 0.314), # Green - (0.404, 0.227, 0.718), # Purple - (1.000, 0.596, 0.000), # Orange - (0.475, 0.333, 0.282)] # Bronze - -class Highlighter(object): +COLORS = [ + (0.914, 0.118, 0.388), # Red + (0.129, 0.588, 0.953), # Blue + (1.000, 0.757, 0.027), # Yellow + (0.298, 0.686, 0.314), # Green + (0.404, 0.227, 0.718), # Purple + (1.000, 0.596, 0.000), # Orange + (0.475, 0.333, 0.282), # Bronze +] + + +class Highlighter: """ Base class that finds text and adds annotations to files. """ @@ -40,3 +43,14 @@ def highlight(self, infile, outfile, highlights): Returns: annotation metadata - list of (title, rgb, page, x1, y1, x2, y2) """ + + def pages(self, infile): + """ + Opens input file and returns an iterator of pages. + + Args: + infile: path to read input file + + Returns: + iterator of pages + """ diff --git a/src/python/txtmarker/factory.py b/src/python/txtmarker/factory.py index d89c23e..b04be65 100644 --- a/src/python/txtmarker/factory.py +++ b/src/python/txtmarker/factory.py @@ -4,7 +4,8 @@ from . import pdf -class Factory(object): + +class Factory: """ Creates document highlighters. 
""" diff --git a/src/python/txtmarker/pdf.py b/src/python/txtmarker/pdf.py index ea11757..fa49975 100644 --- a/src/python/txtmarker/pdf.py +++ b/src/python/txtmarker/pdf.py @@ -12,6 +12,7 @@ from . import base + class Highlighter(base.Highlighter): """ Finds text and adds annotations to PDF files. @@ -20,15 +21,8 @@ class Highlighter(base.Highlighter): def highlight(self, infile, outfile, highlights): annotations = [] - for page, layout in enumerate(extract_pages(infile, laparams=LAParams(line_margin=1.0, char_margin=4.0))): - elements = [] - - # Extract elements - self.extract(elements, layout) - - # Get formatted page text - text = self.text(elements) - + # pylint: disable=R1702 + for page, elements, text in self.pages(infile): for name, query in highlights: result = self.search(query, text) if result: @@ -50,16 +44,37 @@ def highlight(self, infile, outfile, highlights): break # Create annotation for each column - annotations.append((name, base.COLORS[index], page) + self.layout(elements[start:eindex])) - annotations.append((name, base.COLORS[index], page) + self.layout(elements[eindex:end+1])) + annotations.append( + (name, base.COLORS[index], page) + + self.layout(elements[start:eindex]) + ) + annotations.append( + (name, base.COLORS[index], page) + + self.layout(elements[eindex : end + 1]) + ) else: # Single column annotation - annotations.append((name, base.COLORS[index], page) + self.layout(elements[start:end+1])) + annotations.append( + (name, base.COLORS[index], page) + + self.layout(elements[start : end + 1]) + ) self.annotate(annotations, infile, outfile) return annotations + def pages(self, infile): + for page, layout in enumerate( + extract_pages(infile, laparams=LAParams(line_margin=1.0, char_margin=4.0)) + ): + elements = [] + + # Extract elements + self.extract(elements, layout) + + # Get formatted page text + yield page, elements, self.text(elements) + def extract(self, elements, layout): """ Extracts text lines and associated coordinates. 
@@ -76,7 +91,14 @@ def extract(self, elements, layout): text = obj.get_text() # Clean common ligatures and unicode chars - pairs = [("ﬀ", "ff"), ("ﬃ", "ffi"), ("ﬁ", "fi"), ("ﬂ", "fl"), ("\u2010", "-"), ("\u2013", "-")] + pairs = [ + ("ﬀ", "ff"), + ("ﬃ", "ffi"), + ("ﬁ", "fi"), + ("ﬂ", "fl"), + ("\u2010", "-"), + ("\u2013", "-"), + ] for find, replace in pairs: text = text.replace(find, replace) @@ -141,7 +163,7 @@ def search(self, query, text): if self.chunks > 0: # Chunk into subqueries, require at least 50 chars per chunk n = max(int(len(query) / self.chunks), 50) - subqueries = [query[x:x+n] for x in range(0, len(query), n)] + subqueries = [query[x : x + n] for x in range(0, len(query), n)] # Ensure last chunk is n chars or bigger if len(subqueries) > 1 and len(subqueries[-1]) < n: @@ -185,11 +207,11 @@ def layout(self, elements): (left, bottom, right, top) coordinates """ - left = min([element[0][0] for element in elements]) - bottom = min([element[0][1] for element in elements]) + left = min(element[0][0] for element in elements) + bottom = min(element[0][1] for element in elements) - right = max([element[0][2] for element in elements]) - top = max([element[0][3] for element in elements]) + right = max(element[0][2] for element in elements) + top = max(element[0][3] for element in elements) return (left, bottom, right, top) @@ -210,35 +232,75 @@ def annotate(self, annotations, infile, outfile): for title, rgb, page, x1, y1, x2, y2 in annotations: # Highlight text - annotator.add_annotation("square", Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page), - Appearance(fill=rgb + (0.3,), stroke_color=rgb + (0.3, ), stroke_width=0)) + annotator.add_annotation( + "square", + Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page), + Appearance( + fill=rgb + (0.3,), stroke_color=rgb + (0.3,), stroke_width=0 + ), + ) if title: - # Determine if title text should be in left or right margin - if x1 < 250: - x1, x2 = max(5, x1 - 35), x1 - else: - x1, x2 = x2, x2 + 35 + # 
Calculate x position coordinates + bounds, fontsize = annotator.get_page_bounding_box(page), 8 + x1, x2, xmid = self.xposition(bounds, title, fontsize, x1, x2) # Calculate center of highlight annotation and offset center = y1 + ((y2 - y1) / 2) offset = min(max(5, len(title)), 20) - # Set position of text annotation. Handle column layout conflicts. - y1, y2 = self.position(ranges, page, x1 >= 250, center, offset) + # Calculate y position coordinates. Handle column layout conflicts. + y1, y2 = self.yposition(ranges, page, x1 >= xmid, center, offset) # Add title annotation next to highlight - annotator.add_annotation("text", Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page), - Appearance(fill=rgb + (1,), font_size=7, stroke_width=1, content=title)) + annotator.add_annotation( + "text", + Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page), + Appearance( + fill=rgb + (1,), + font_size=fontsize, + stroke_width=1, + content=title, + ), + ) # Register range - ranges.append((page, 0 if x1 < 250 else 1, y1, y2)) + ranges.append((page, 0 if x1 < xmid else 1, y1, y2)) annotator.write(outfile) - def position(self, ranges, page, column, center, offset): + def xposition(self, bounds, title, fontsize, x1, x2): + """ + Calculates the x coordinates for a text element. 
+ Args: + bounds: page bounds (x1, y1, x2, y2) box + title: title annotation text + fontsize: font text size + x1: box annotation x start + x2: box annotation x end + + Returns: + text x1, text x2, text x midpoint + """ + + # Text annotation position parameters + _, _, xmax, _ = bounds + xmid = xmax / 2 + xspacer, xmargin, xoffset = 5.0, 2.5, min(len(title) * fontsize, 75) + + # Determine if title text should be in left or right margin + if x1 < xmid: + x1, x2 = max(xmargin, x1 - xoffset), x1 - xspacer + else: + x1, x2 = x2 + xspacer, min(xmax - xspacer, x2 + xoffset) + + return x1, x2, xmid + + def yposition(self, ranges, page, column, center, offset): """ - Searches for the closest open range to use for an annotation element. + Calculates the y coordinates for a text element. Searches for the closest + open range to use for an annotation element. Args: ranges: list of existing annotation ranges @@ -264,7 +326,9 @@ def position(self, ranges, page, column, center, offset): y1, y2 = y1 - offset, y2 - offset else: # Try with positive offset - conflicts = self.conflicts(ranges, page, column, y1 + offset, y2 + offset) + conflicts = self.conflicts( + ranges, page, column, y1 + offset, y2 + offset + ) if not conflicts: y1, y2 = y1 + offset, y2 + offset else: @@ -308,4 +372,6 @@ def overlaps(self, start1, end1, start2, end2): number of overlapping coordinates """ - return len(set(range(int(start1), int(end1))) & set(range(int(start2), int(end2)))) + return len( + set(range(int(start1), int(end1))) & set(range(int(start2), int(end2))) + ) diff --git a/test/python/testfactory.py b/test/python/testfactory.py index 254e31c..01d3906 100644 --- a/test/python/testfactory.py +++ b/test/python/testfactory.py @@ -7,6 +7,7 @@ # pylint: disable=E0401 from txtmarker.factory import Factory + class TestFactory(unittest.TestCase): """ Factory tests diff --git a/test/python/testpdf.py b/test/python/testpdf.py index 4532357..9d8e02f 100644 --- a/test/python/testpdf.py +++ 
b/test/python/testpdf.py @@ -9,6 +9,7 @@ # pylint: disable=E0401 from txtmarker.factory import Factory + class TestPDF(unittest.TestCase): """ PDF tests @@ -36,13 +37,19 @@ def testHighlights(self): highlights = [ ("Basic", "Hashing is a key part"), - ("Multi-line", "Hashes are used to secure. Hashes can be deterministic or non-deterministic. Hashes can be significantly " + - "different with small changes to data or very similar."), + ( + "Multi-line", + "Hashes are used to secure. Hashes can be deterministic or non-deterministic. Hashes can be significantly " + + "different with small changes to data or very similar.", + ), ("Regex", "This article.*Python"), ("Regex Multi-line", "The above(.|\n)+is deterministic"), - (None, "Python provides the built-in .hash()")] + (None, "Python provides the built-in .hash()"), + ] - annotations = highlighter.highlight(self.path("hash.pdf"), self.path("out.pdf"), highlights) + annotations = highlighter.highlight( + self.path("hash.pdf"), self.path("out.pdf"), highlights + ) # Check annotations created self.assertEqual(len(annotations), 5) @@ -60,7 +67,9 @@ def testOverlaps(self): # Create duplicate highlights to test overlapping range highlights = [("Overlaps", "This article will explore various methods")] * 4 - annotations = highlighter.highlight(self.path("embeddings.pdf"), self.path("out.pdf"), highlights) + annotations = highlighter.highlight( + self.path("embeddings.pdf"), self.path("out.pdf"), highlights + ) # Check annotations created self.assertEqual(len(annotations), 4) @@ -72,11 +81,18 @@ def testFormatter(self): highlighter = Factory.create("pdf", lambda x: re.sub(r"[^A-Za-z0-9]", "", x), 4) - highlights = [("End newline", "txtai builds an AI-powered index over sections of text\n"), - ("Long line", "NeuML has years of relevant experience in building data strategies for both small and large organizations. " + - "With the right data, valuable insights can be gained by capitalizing on modern advances in machine learning. 
")] - - annotations = highlighter.highlight(self.path("neuml.pdf"), self.path("out.pdf"), highlights) + highlights = [ + ("End newline", "txtai builds an AI-powered index over sections of text\n"), + ( + "Long line", + "NeuML has years of relevant experience in building data strategies for both small and large organizations. " + + "With the right data, valuable insights can be gained by capitalizing on modern advances in machine learning. ", + ), + ] + + annotations = highlighter.highlight( + self.path("neuml.pdf"), self.path("out.pdf"), highlights + ) # Check annotations created self.assertEqual(len(annotations), 2) @@ -88,9 +104,13 @@ def testColumns(self): highlighter = Factory.create("pdf") - highlights = [("Multi-column", "enable machine-learning(.|\n)+specific domains")] + highlights = [ + ("Multi-column", "enable machine-learning(.|\n)+specific domains") + ] - annotations = highlighter.highlight(self.path("neuml.pdf"), self.path("out2.pdf"), highlights) + annotations = highlighter.highlight( + self.path("neuml.pdf"), self.path("out2.pdf"), highlights + ) # Check annotations created self.assertEqual(len(annotations), 2)