Skip to content

Commit 8928445

Browse files
committed
Add functionality to replace text
1 parent 5c23e10 commit 8928445

File tree

9 files changed

+163
-14
lines changed

9 files changed

+163
-14
lines changed

camelot/cli.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@ def set_config(self, key, value):
6666
help="Characters that should be stripped from a string before"
6767
" assigning it to a cell.",
6868
)
69+
@click.option(
70+
"-replace",
71+
"--replace_text",
72+
help="Characters that should be replaced from a string before"
73+
" assigning it to a cell.",
74+
)
6975
@click.option(
7076
"-M",
7177
"--margins",

camelot/io.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ def read_pdf(
5555
strip_text : str, optional (default: '')
5656
Characters that should be stripped from a string before
5757
assigning it to a cell.
58+
replace_text : dict, optional (default: {})
59+
Characters that should be replaced from a string before
60+
assigning it to a cell.
5861
row_tol^ : int, optional (default: 2)
5962
Tolerance parameter used to combine text vertically,
6063
to generate rows.

camelot/parsers/lattice.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ class Lattice(BaseParser):
6363
strip_text : str, optional (default: '')
6464
Characters that should be stripped from a string before
6565
assigning it to a cell.
66+
replace_text : dict, optional (default: {})
67+
Characters that should be replaced from a string before
68+
assigning it to a cell.
6669
line_tol : int, optional (default: 2)
6770
Tolerance parameter used to merge close vertical and horizontal
6871
lines.
@@ -99,6 +102,7 @@ def __init__(
99102
split_text=False,
100103
flag_size=False,
101104
strip_text="",
105+
replace_text={},
102106
line_tol=2,
103107
joint_tol=2,
104108
threshold_blocksize=15,
@@ -117,6 +121,7 @@ def __init__(
117121
self.split_text = split_text
118122
self.flag_size = flag_size
119123
self.strip_text = strip_text
124+
self.replace_text = replace_text
120125
self.line_tol = line_tol
121126
self.joint_tol = joint_tol
122127
self.threshold_blocksize = threshold_blocksize
@@ -360,6 +365,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
360365
split_text=self.split_text,
361366
flag_size=self.flag_size,
362367
strip_text=self.strip_text,
368+
replace_text=self.replace_text,
363369
)
364370
if indices[0][:2] != (-1, -1):
365371
pos_errors.append(error)

camelot/parsers/stream.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ class Stream(BaseParser):
4545
strip_text : str, optional (default: '')
4646
Characters that should be stripped from a string before
4747
assigning it to a cell.
48+
replace_text : dict, optional (default: {})
49+
Characters that should be replaced from a string before
50+
assigning it to a cell.
4851
edge_tol : int, optional (default: 50)
4952
Tolerance parameter for extending textedges vertically.
5053
row_tol : int, optional (default: 2)
@@ -64,6 +67,7 @@ def __init__(
6467
split_text=False,
6568
flag_size=False,
6669
strip_text="",
70+
replace_text={},
6771
edge_tol=50,
6872
row_tol=2,
6973
column_tol=0,
@@ -76,6 +80,7 @@ def __init__(
7680
self.split_text = split_text
7781
self.flag_size = flag_size
7882
self.strip_text = strip_text
83+
self.replace_text = replace_text
7984
self.edge_tol = edge_tol
8085
self.row_tol = row_tol
8186
self.column_tol = column_tol
@@ -414,6 +419,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
414419
split_text=self.split_text,
415420
flag_size=self.flag_size,
416421
strip_text=self.strip_text,
422+
replace_text=self.replace_text,
417423
)
418424
if indices[:2] != (-1, -1):
419425
pos_errors.append(error)

camelot/utils.py

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -505,12 +505,33 @@ def text_strip(text, strip=""):
505505
return stripped
506506

507507

508+
def text_replace(text, replace=None):
    """Replaces every occurrence of each key of `replace` found in `text`
    with the corresponding value.

    Replacements are applied one after another in the dict's iteration
    order, so the output of an earlier replacement can be rewritten by a
    later one.

    Parameters
    ----------
    text : str
        Text to process and modify.
    replace : dict, optional (default: None)
        key value pairs, where keys are swapped for the values in `text`.
        ``None`` or an empty dict leaves `text` unchanged.

    Returns
    -------
    text : str
        `text` with all replacements applied.

    """
    # An identity check against a fresh ``{}`` literal is never true, and a
    # mutable default would be shared across calls; a truthiness test covers
    # both ``None`` and an empty mapping.
    if not replace:
        return text

    for key, value in replace.items():
        text = text.replace(key, value)

    return text
527+
528+
508529
# TODO: combine the following functions into a TextProcessor class which
509530
# applies corresponding transformations sequentially
510531
# (inspired from sklearn.pipeline.Pipeline)
511532

512533

513-
def flag_font_size(textline, direction, strip_text=""):
534+
def flag_font_size(textline, direction, strip_text="", replace_text={}):
514535
"""Flags super/subscripts in text by enclosing them with <s></s>.
515536
May give false positives.
516537
@@ -523,7 +544,9 @@ def flag_font_size(textline, direction, strip_text=""):
523544
strip_text : str, optional (default: '')
524545
Characters that should be stripped from a string before
525546
assigning it to a cell.
526-
547+
replace_text : dict, optional (default: {})
548+
Characters that should be replaced from a string before
549+
assigning it to a cell.
527550
Returns
528551
-------
529552
fstring : string
@@ -559,10 +582,14 @@ def flag_font_size(textline, direction, strip_text=""):
559582
fstring = "".join(flist)
560583
else:
561584
fstring = "".join([t.get_text() for t in textline])
585+
586+
fstring = text_replace(fstring, replace_text)
562587
return text_strip(fstring, strip_text)
563588

564589

565-
def split_textline(table, textline, direction, flag_size=False, strip_text=""):
590+
def split_textline(
591+
table, textline, direction, flag_size=False, strip_text="", replace_text={}
592+
):
566593
"""Splits PDFMiner LTTextLine into substrings if it spans across
567594
multiple rows/columns.
568595
@@ -580,7 +607,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
580607
strip_text : str, optional (default: '')
581608
Characters that should be stripped from a string before
582609
assigning it to a cell.
583-
610+
replace_text : dict, optional (default: {})
611+
Characters that should be replaced from a string before
612+
assigning it to a cell.
584613
Returns
585614
-------
586615
grouped_chars : list
@@ -668,20 +697,28 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
668697
key[0],
669698
key[1],
670699
flag_font_size(
671-
[t[2] for t in chars], direction, strip_text=strip_text
700+
[t[2] for t in chars],
701+
direction,
702+
strip_text=strip_text,
703+
replace_text=replace_text,
672704
),
673705
)
674706
)
675707
else:
676-
gchars = [t[2].get_text() for t in chars]
677-
grouped_chars.append(
678-
(key[0], key[1], text_strip("".join(gchars), strip_text))
679-
)
708+
gchars = "".join([t[2].get_text() for t in chars])
709+
gchars = text_replace(gchars, replace_text)
710+
grouped_chars.append((key[0], key[1], text_strip(gchars, strip_text)))
680711
return grouped_chars
681712

682713

683714
def get_table_index(
684-
table, t, direction, split_text=False, flag_size=False, strip_text=""
715+
table,
716+
t,
717+
direction,
718+
split_text=False,
719+
flag_size=False,
720+
strip_text="",
721+
replace_text={},
685722
):
686723
"""Gets indices of the table cell where given text object lies by
687724
comparing their y and x-coordinates.
@@ -703,7 +740,9 @@ def get_table_index(
703740
strip_text : str, optional (default: '')
704741
Characters that should be stripped from a string before
705742
assigning it to a cell.
706-
743+
replace_text : dict, optional (default: {})
744+
Characters that should be replaced from a string before
745+
assigning it to a cell.
707746
Returns
708747
-------
709748
indices : list
@@ -761,7 +800,12 @@ def get_table_index(
761800
if split_text:
762801
return (
763802
split_textline(
764-
table, t, direction, flag_size=flag_size, strip_text=strip_text
803+
table,
804+
t,
805+
direction,
806+
flag_size=flag_size,
807+
strip_text=strip_text,
808+
replace_text=replace_text,
765809
),
766810
error,
767811
)
@@ -772,13 +816,20 @@ def get_table_index(
772816
(
773817
r_idx,
774818
c_idx,
775-
flag_font_size(t._objs, direction, strip_text=strip_text),
819+
flag_font_size(
820+
t._objs,
821+
direction,
822+
strip_text=strip_text,
823+
replace_text=replace_text,
824+
),
776825
)
777826
],
778827
error,
779828
)
780829
else:
781-
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
830+
text = t.get_text()
831+
text = text_replace(text, replace_text)
832+
return [(r_idx, c_idx, text_strip(text, strip_text))], error
782833

783834

784835
def compute_accuracy(error_weights):

tests/.DS_Store

6 KB
Binary file not shown.

tests/data.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2306,6 +2306,32 @@
23062306
["ChâteauLéoube2016", "10€"],
23072307
]
23082308

2309+
data_stream_replace_text = [
2310+
["VinsauVerre", ""],
2311+
["LesBlancs", "12.5CL"],
2312+
["A.O.PCôtesduRhône", ""],
2313+
["DomainedelaGuicharde«Autourdelachapelle»3316", "8$"],
2314+
["A.O.PVacqueyras", ""],
2315+
["DomainedeMontvac«Melodine»3316", "10$"],
2316+
["A.O.PChâteauneufduPape", ""],
2317+
["DomainedeBeaurenard3317", "13$"],
2318+
["A.O.PCôteauxduLanguedoc", ""],
2319+
["VillaTempora«Untempspourelle»3314", "9$"],
2320+
["A.O.PCôtesdeProvence", ""],
2321+
["ChâteauGrandBoise3317", "9$"],
2322+
["LesRosés", "125CL"],
2323+
["A.O.PCôtesduRhône", ""],
2324+
["DomainedelaFlorane«AfleurdePampre»3316", "8$"],
2325+
["FamilleCoulon(DomaineBeaurenard)Biotifulfox3317", "8$"],
2326+
["A.O.PVacqueyras", ""],
2327+
["DomainedeMontvac3317", "9$"],
2328+
["A.O.PLanguedoc", ""],
2329+
["DomainedeJoncas«Nébla»3315", "8$"],
2330+
["VillaTempora«L’arroseurarrosé»3315", "9$"],
2331+
["A.O.PCôtesdeProvence", ""],
2332+
["ChâteauGrandBoise«SainteVictoire»3317", "9$"],
2333+
["ChâteauLéoube3316", "10$"],
2334+
]
23092335
data_stream_edge_tol = [
23102336
["Key figures", ""],
23112337
["", "2016"],
@@ -2368,6 +2394,32 @@
23682394
["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"],
23692395
]
23702396

2397+
data_lattice_text_replace = [
2398+
[
2399+
"Cycle \nName",
2400+
"KI \n(1/km)",
2401+
"Distance \n(mi)",
2402+
"Percent Fuel Savings",
2403+
"",
2404+
"",
2405+
"",
2406+
],
2407+
[
2408+
"",
2409+
"",
2410+
"",
2411+
"Improved \nSpeed",
2412+
"Decreased \nAccel",
2413+
"Eliminate \nStops",
2414+
"Decreased \nIdle",
2415+
],
2416+
["2012_2", "3,30", "1,3", "5,9%", "9,5%", "29,2%", "17,4%"],
2417+
["2145_1", "0,68", "11,2", "2,4%", "0,1%", "9,5%", "2,7%"],
2418+
["4234_1", "0,59", "58,7", "8,5%", "1,3%", "8,5%", "3,3%"],
2419+
["2032_2", "0,17", "57,8", "21,7%", "0,3%", "2,7%", "1,2%"],
2420+
["4171_1", "0,07", "173,9", "58,1%", "1,6%", "2,1%", "0,5%"],
2421+
]
2422+
23712423
data_lattice_table_rotated = [
23722424
[
23732425
"State",

tests/test_lattice.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,17 @@ def test_lattice(testdir):
2020
assert_frame_equal(df, tables[0].df)
2121

2222

23+
@skip_on_windows
24+
def test_lattice_text_replace(testdir):
25+
df = pd.DataFrame(data_lattice_text_replace)
26+
27+
filename = os.path.join(
28+
testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
29+
)
30+
tables = camelot.read_pdf(filename, pages="2", replace_text={".": ","})
31+
assert_frame_equal(df, tables[0].df)
32+
33+
2334
@skip_on_windows
2435
def test_lattice_table_rotated(testdir):
2536
df = pd.DataFrame(data_lattice_table_rotated)

tests/test_stream.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,20 @@ def test_stream_strip_text(testdir):
9999
assert_frame_equal(df, tables[0].df)
100100

101101

102+
def test_stream_replace_text(testdir):
103+
df = pd.DataFrame(data_stream_replace_text)
104+
105+
filename = os.path.join(testdir, "detect_vertical_false.pdf")
106+
tables = camelot.read_pdf(
107+
filename,
108+
flavor="stream",
109+
strip_text=" ,\n",
110+
replace_text={"€": "$", "20": "33"},
111+
)
112+
113+
assert_frame_equal(df, tables[0].df)
114+
115+
102116
def test_stream_edge_tol(testdir):
103117
df = pd.DataFrame(data_stream_edge_tol)
104118

0 commit comments

Comments
 (0)