Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit 3d3efd4

Browse files
sverma25bosd
authored and committed
Added multi parameter for page level parameters
1 parent 3068cac commit 3d3efd4

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

camelot/handlers.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class PDFHandler:
3636
3737
"""
3838

39-
def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
39+
def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None, multi=[]):
4040
if is_url(filepath):
4141
filepath = download_url(filepath)
4242
self.filepath: Union[StrByteType, Path] = filepath
@@ -51,6 +51,7 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5151
if sys.version_info[0] < 3:
5252
self.password = self.password.encode("ascii")
5353
self.pages = self._get_pages(pages)
54+
self.multi = multi
5455

5556
def _get_pages(self, pages):
5657
"""Converts pages string to list of ints.
@@ -188,6 +189,15 @@ def parse(
188189
with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
189190
jobs = []
190191
for p in self.pages:
192+
p_no = p
193+
194+
page_kwargs = kwargs
195+
page_parser = parser
196+
197+
if p_no in self.multi:
198+
page_kwargs.update(self.multi[p_no])
199+
page_parser = Lattice(**page_kwargs) if flavor == 'lattice' else Stream(**page_kwargs)
200+
191201
j = pool.apply_async(
192202
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
193203
)
@@ -198,6 +208,15 @@ def parse(
198208
tables.extend(t)
199209
else:
200210
for p in self.pages:
211+
p_no = p
212+
213+
page_kwargs = kwargs
214+
page_parser = parser
215+
216+
if p_no in self.multi:
217+
page_kwargs.update(self.multi[p_no])
218+
page_parser = Lattice(**page_kwargs) if flavor == 'lattice' else Stream(**page_kwargs)
219+
201220
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
202221
tables.extend(t)
203222

@@ -224,7 +243,7 @@ def _parse_page(
224243
-------
225244
tables : camelot.core.TableList
226245
List of tables found in PDF.
227-
246+
228247
"""
229248
self._save_page(self.filepath, page, tempdir)
230249
page_path = os.path.join(tempdir, f"page-{page}.pdf")

camelot/io.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def read_pdf(
1717
suppress_stdout=False,
1818
parallel=False,
1919
layout_kwargs=None,
20+
multi = {},
2021
**kwargs
2122
):
2223
"""Read PDF and return extracted tables.
@@ -43,6 +44,9 @@ def read_pdf(
4344
layout_kwargs : dict, optional (default: {})
4445
A dict of `pdfminer.layout.LAParams
4546
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
47+
multi : dict, optional (default: {})
48+
A dict of page-specific parameters. Each key is a page (str) and each value is a dict of the parameters defined for that page.
49+
Parameters defined in `multi` overwrite the matching `kwargs` for that page.
4650
table_areas : list, optional (default: None)
4751
List of table area strings of the form x1,y1,x2,y2
4852
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -120,7 +124,7 @@ def read_pdf(
120124
warnings.simplefilter("ignore")
121125

122126
validate_input(kwargs, flavor=flavor)
123-
p = PDFHandler(filepath, pages=pages, password=password)
127+
p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
124128
kwargs = remove_extra(kwargs, flavor=flavor)
125129
tables = p.parse(
126130
flavor=flavor,

0 commit comments

Comments
 (0)