Skip to content

Commit

Permalink
Add pages method, closes #12. Improve x coordinate positioning, closes
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Dec 13, 2024
1 parent 66327c3 commit d8152a4
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 56 deletions.
32 changes: 23 additions & 9 deletions src/python/txtmarker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@
"""

# Highlight colors
COLORS = [(0.914, 0.118, 0.388), # Red
(0.129, 0.588, 0.953), # Blue
(1.000, 0.757, 0.027), # Yellow
(0.298, 0.686, 0.314), # Green
(0.404, 0.227, 0.718), # Purple
(1.000, 0.596, 0.000), # Orange
(0.475, 0.333, 0.282)] # Bronze

class Highlighter(object):
COLORS = [
(0.914, 0.118, 0.388), # Red
(0.129, 0.588, 0.953), # Blue
(1.000, 0.757, 0.027), # Yellow
(0.298, 0.686, 0.314), # Green
(0.404, 0.227, 0.718), # Purple
(1.000, 0.596, 0.000), # Orange
(0.475, 0.333, 0.282), # Bronze
]


class Highlighter:
"""
Base class that finds text and adds annotations to files.
"""
Expand Down Expand Up @@ -40,3 +43,14 @@ def highlight(self, infile, outfile, highlights):
Returns:
annotation metadata - list of (title, rgb, page, x1, y1, x2, y2)
"""

def pages(self, infile):
"""
Opens input file and returns an iterator of pages.
The body shown here consists only of this docstring, so calling it directly
returns None.
NOTE(review): presumably subclasses are meant to override this method - confirm.
Args:
infile: path to read input file
Returns:
iterator of pages
"""
3 changes: 2 additions & 1 deletion src/python/txtmarker/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

from . import pdf

class Factory(object):

class Factory:
"""
Creates document highlighters.
"""
Expand Down
134 changes: 100 additions & 34 deletions src/python/txtmarker/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from . import base


class Highlighter(base.Highlighter):
"""
Finds text and adds annotations to PDF files.
Expand All @@ -20,15 +21,8 @@ class Highlighter(base.Highlighter):
def highlight(self, infile, outfile, highlights):
annotations = []

for page, layout in enumerate(extract_pages(infile, laparams=LAParams(line_margin=1.0, char_margin=4.0))):
elements = []

# Extract elements
self.extract(elements, layout)

# Get formatted page text
text = self.text(elements)

# pylint: disable=R1702
for page, elements, text in self.pages(infile):
for name, query in highlights:
result = self.search(query, text)
if result:
Expand All @@ -50,16 +44,37 @@ def highlight(self, infile, outfile, highlights):
break

# Create annotation for each column
annotations.append((name, base.COLORS[index], page) + self.layout(elements[start:eindex]))
annotations.append((name, base.COLORS[index], page) + self.layout(elements[eindex:end+1]))
annotations.append(
(name, base.COLORS[index], page)
+ self.layout(elements[start:eindex])
)
annotations.append(
(name, base.COLORS[index], page)
+ self.layout(elements[eindex : end + 1])
)
else:
# Single column annotation
annotations.append((name, base.COLORS[index], page) + self.layout(elements[start:end+1]))
annotations.append(
(name, base.COLORS[index], page)
+ self.layout(elements[start : end + 1])
)

self.annotate(annotations, infile, outfile)

return annotations

def pages(self, infile):
"""
Reads a PDF file page by page and yields the extracted content of each page.
Args:
infile: path to read input file
Returns:
generator of (page, elements, text) tuples, where page is the zero-based
page index, elements is the list filled by extract() and text is the
result of text(elements)
"""
# Pages are produced lazily by extract_pages, one layout per page
for page, layout in enumerate(
extract_pages(infile, laparams=LAParams(line_margin=1.0, char_margin=4.0))
):
# Fresh list per page, filled in place by extract()
elements = []

# Extract elements
self.extract(elements, layout)

# Get formatted page text
yield page, elements, self.text(elements)

def extract(self, elements, layout):
"""
Extracts text lines and associated coordinates.
Expand All @@ -76,7 +91,14 @@ def extract(self, elements, layout):
text = obj.get_text()

# Clean common ligatures and unicode chars
pairs = [("ff", "ff"), ("ffi", "ffi"), ("fi", "fi"), ("fl", "fl"), ("\u2010", "-"), ("\u2013", "-")]
pairs = [
("ff", "ff"),
("ffi", "ffi"),
("fi", "fi"),
("fl", "fl"),
("\u2010", "-"),
("\u2013", "-"),
]

for find, replace in pairs:
text = text.replace(find, replace)
Expand Down Expand Up @@ -141,7 +163,7 @@ def search(self, query, text):
if self.chunks > 0:
# Chunk into subqueries, require at least 50 chars per chunk
n = max(int(len(query) / self.chunks), 50)
subqueries = [query[x:x+n] for x in range(0, len(query), n)]
subqueries = [query[x : x + n] for x in range(0, len(query), n)]

# Ensure last chunk is n chars or bigger
if len(subqueries) > 1 and len(subqueries[-1]) < n:
Expand Down Expand Up @@ -185,11 +207,11 @@ def layout(self, elements):
(left, bottom, right, top) coordinates
"""

left = min([element[0][0] for element in elements])
bottom = min([element[0][1] for element in elements])
left = min(element[0][0] for element in elements)
bottom = min(element[0][1] for element in elements)

right = max([element[0][2] for element in elements])
top = max([element[0][3] for element in elements])
right = max(element[0][2] for element in elements)
top = max(element[0][3] for element in elements)

return (left, bottom, right, top)

Expand All @@ -210,35 +232,75 @@ def annotate(self, annotations, infile, outfile):

for title, rgb, page, x1, y1, x2, y2 in annotations:
# Highlight text
annotator.add_annotation("square", Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
Appearance(fill=rgb + (0.3,), stroke_color=rgb + (0.3, ), stroke_width=0))
annotator.add_annotation(
"square",
Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
Appearance(
fill=rgb + (0.3,), stroke_color=rgb + (0.3,), stroke_width=0
),
)

if title:
# Determine if title text should be in left or right margin
if x1 < 250:
x1, x2 = max(5, x1 - 35), x1
else:
x1, x2 = x2, x2 + 35
# Calculate x position coordinates
bounds, fontsize = annotator.get_page_bounding_box(page), 8
x1, x2, xmid = self.xposition(bounds, title, fontsize, x1, x2)

# Calculate center of highlight annotation and offset
center = y1 + ((y2 - y1) / 2)
offset = min(max(5, len(title)), 20)

# Set position of text annotation. Handle column layout conflicts.
y1, y2 = self.position(ranges, page, x1 >= 250, center, offset)
# Calculate y position coordinates. Handle column layout conflicts.
y1, y2 = self.yposition(ranges, page, x1 >= xmid, center, offset)

# Add title annotation next to highlight
annotator.add_annotation("text", Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
Appearance(fill=rgb + (1,), font_size=7, stroke_width=1, content=title))
annotator.add_annotation(
"text",
Location(x1=x1, y1=y1, x2=x2, y2=y2, page=page),
Appearance(
fill=rgb + (1,),
font_size=fontsize,
stroke_width=1,
content=title,
),
)

# Register range
ranges.append((page, 0 if x1 < 250 else 1, y1, y2))
ranges.append((page, 0 if x1 < xmid else 1, y1, y2))

annotator.write(outfile)

def position(self, ranges, page, column, center, offset):
def xposition(self, bounds, title, fontsize, x1, x2):
"""
Calculates the x coordinates for a text element.
The text is placed to the left of the box when the box starts left of the
page midpoint, otherwise to the right of the box.
Args:
bounds: page bounds (x1, y1, x2, y2) box
title: title annotation text
fontsize: font text size
x1: box annotation x start
x2: box annotation x end
Returns:
text x1, text x2, text x midpoint
"""

# Text annotation position parameters
# Only the right bound is read; the midpoint is half of it
_, _, xmax, _ = bounds
xmid = xmax / 2
# xoffset is len(title) * fontsize, capped at 75
xspacer, xmargin, xoffset = 5.0, 2.5, min(len(title) * fontsize, 75)

# Determine if title text should be in left or right margin
if x1 < xmid:
# Left side: start no further left than xmargin, end xspacer before the box
# NOTE(review): when the box starts within xmargin + xspacer of 0 the
# returned x2 is smaller than the returned x1 - confirm this is acceptable
x1, x2 = max(xmargin, x1 - xoffset), x1 - xspacer
else:
# Right side: start xspacer after the box, end no further right than xmax - xspacer
# NOTE(review): when the box ends within 2 * xspacer of xmax the
# returned x2 is smaller than the returned x1 - confirm this is acceptable
x1, x2 = x2 + xspacer, min(xmax - xspacer, x2 + xoffset)

return x1, x2, xmid

def yposition(self, ranges, page, column, center, offset):
"""
Searches for the closest open range to use for an annotation element.
Calculates the y coordinates for a text element. Searches for the closest
open range to use for an annotation element.
Args:
ranges: list of existing annotation ranges
Expand All @@ -264,7 +326,9 @@ def position(self, ranges, page, column, center, offset):
y1, y2 = y1 - offset, y2 - offset
else:
# Try with positive offset
conflicts = self.conflicts(ranges, page, column, y1 + offset, y2 + offset)
conflicts = self.conflicts(
ranges, page, column, y1 + offset, y2 + offset
)
if not conflicts:
y1, y2 = y1 + offset, y2 + offset
else:
Expand Down Expand Up @@ -308,4 +372,6 @@ def overlaps(self, start1, end1, start2, end2):
number of overlapping coordinates
"""

return len(set(range(int(start1), int(end1))) & set(range(int(start2), int(end2))))
return len(
set(range(int(start1), int(end1))) & set(range(int(start2), int(end2)))
)
1 change: 1 addition & 0 deletions test/python/testfactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# pylint: disable=E0401
from txtmarker.factory import Factory


class TestFactory(unittest.TestCase):
"""
Factory tests
Expand Down
44 changes: 32 additions & 12 deletions test/python/testpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# pylint: disable=E0401
from txtmarker.factory import Factory


class TestPDF(unittest.TestCase):
"""
PDF tests
Expand Down Expand Up @@ -36,13 +37,19 @@ def testHighlights(self):

highlights = [
("Basic", "Hashing is a key part"),
("Multi-line", "Hashes are used to secure. Hashes can be deterministic or non-deterministic. Hashes can be significantly " +
"different with small changes to data or very similar."),
(
"Multi-line",
"Hashes are used to secure. Hashes can be deterministic or non-deterministic. Hashes can be significantly "
+ "different with small changes to data or very similar.",
),
("Regex", "This article.*Python"),
("Regex Multi-line", "The above(.|\n)+is deterministic"),
(None, "Python provides the built-in .hash()")]
(None, "Python provides the built-in .hash()"),
]

annotations = highlighter.highlight(self.path("hash.pdf"), self.path("out.pdf"), highlights)
annotations = highlighter.highlight(
self.path("hash.pdf"), self.path("out.pdf"), highlights
)

# Check annotations created
self.assertEqual(len(annotations), 5)
Expand All @@ -60,7 +67,9 @@ def testOverlaps(self):
# Create duplicate highlights to test overlapping range
highlights = [("Overlaps", "This article will explore various methods")] * 4

annotations = highlighter.highlight(self.path("embeddings.pdf"), self.path("out.pdf"), highlights)
annotations = highlighter.highlight(
self.path("embeddings.pdf"), self.path("out.pdf"), highlights
)

# Check annotations created
self.assertEqual(len(annotations), 4)
Expand All @@ -72,11 +81,18 @@ def testFormatter(self):

highlighter = Factory.create("pdf", lambda x: re.sub(r"[^A-Za-z0-9]", "", x), 4)

highlights = [("End newline", "txtai builds an AI-powered index over sections of text\n"),
("Long line", "NeuML has years of relevant experience in building data strategies for both small and large organizations. " +
"With the right data, valuable insights can be gained by capitalizing on modern advances in machine learning. ")]

annotations = highlighter.highlight(self.path("neuml.pdf"), self.path("out.pdf"), highlights)
highlights = [
("End newline", "txtai builds an AI-powered index over sections of text\n"),
(
"Long line",
"NeuML has years of relevant experience in building data strategies for both small and large organizations. "
+ "With the right data, valuable insights can be gained by capitalizing on modern advances in machine learning. ",
),
]

annotations = highlighter.highlight(
self.path("neuml.pdf"), self.path("out.pdf"), highlights
)

# Check annotations created
self.assertEqual(len(annotations), 2)
Expand All @@ -88,9 +104,13 @@ def testColumns(self):

highlighter = Factory.create("pdf")

highlights = [("Multi-column", "enable machine-learning(.|\n)+specific domains")]
highlights = [
("Multi-column", "enable machine-learning(.|\n)+specific domains")
]

annotations = highlighter.highlight(self.path("neuml.pdf"), self.path("out2.pdf"), highlights)
annotations = highlighter.highlight(
self.path("neuml.pdf"), self.path("out2.pdf"), highlights
)

# Check annotations created
self.assertEqual(len(annotations), 2)
Expand Down

0 comments on commit d8152a4

Please sign in to comment.