Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit 70af652

Browse files
cscanlin-kwhbosd
authored and committed
add support for file_bytes argument with managed_file_context()
1 parent 567520b commit 70af652

File tree

11 files changed

+214
-86
lines changed

11 files changed

+214
-86
lines changed

=0.6

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Defaulting to user installation because normal site-packages is not writeable
2+
Collecting jeepney
3+
Downloading jeepney-0.8.0-py3-none-any.whl.metadata (1.3 kB)
4+
Using cached jeepney-0.8.0-py3-none-any.whl (48 kB)
5+
Installing collected packages: jeepney
6+
Successfully installed jeepney-0.8.0

=1.13.3

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Defaulting to user installation because normal site-packages is not writeable
2+
Requirement already satisfied: numpy in /home/bosd/.local/lib/python3.11/site-packages (1.26.4)

=12.0.0

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
Defaulting to user installation because normal site-packages is not writeable
2+
Collecting rich
3+
Downloading rich-13.7.1-py3-none-any.whl.metadata (18 kB)
4+
Collecting markdown-it-py>=2.2.0 (from rich)
5+
Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
6+
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/bosd/.local/lib/python3.11/site-packages (from rich) (2.17.2)
7+
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)
8+
Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)
9+
Downloading rich-13.7.1-py3-none-any.whl (240 kB)
10+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 240.7/240.7 kB 1.2 MB/s eta 0:00:00
11+
Using cached markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
12+
Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)
13+
Installing collected packages: mdurl, markdown-it-py, rich
14+
Successfully installed markdown-it-py-3.0.0 mdurl-0.1.2 rich-13.7.1

=2.7.1

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Defaulting to user installation because normal site-packages is not writeable
2+
Collecting pygments
3+
Using cached pygments-2.17.2-py3-none-any.whl.metadata (2.6 kB)
4+
Using cached pygments-2.17.2-py3-none-any.whl (1.2 MB)
5+
Installing collected packages: pygments
6+
Successfully installed pygments-2.17.2

=20200726

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Defaulting to user installation because normal site-packages is not writeable
2+
Requirement already satisfied: pdfminer.six in /home/bosd/.local/lib/python3.11/site-packages (20231228)
3+
Requirement already satisfied: charset-normalizer>=2.0.0 in /home/bosd/.local/lib/python3.11/site-packages (from pdfminer.six) (3.3.2)
4+
Requirement already satisfied: cryptography>=36.0.0 in /home/bosd/.local/lib/python3.11/site-packages (from pdfminer.six) (42.0.5)
5+
Requirement already satisfied: cffi>=1.12 in /home/bosd/.local/lib/python3.11/site-packages (from cryptography>=36.0.0->pdfminer.six) (1.16.0)
6+
Requirement already satisfied: pycparser in /home/bosd/.local/lib/python3.11/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six) (2.21)

=5.1.1

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Defaulting to user installation because normal site-packages is not writeable
2+
Collecting decorator
3+
Downloading decorator-5.1.1-py3-none-any.whl.metadata (4.0 kB)
4+
Using cached decorator-5.1.1-py3-none-any.whl (9.1 kB)
5+
Installing collected packages: decorator
6+
Successfully installed decorator-5.1.1

camelot/handlers.py

+101-59
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
1+
from contextlib import contextmanager
2+
import io
13
import os
24
import sys
35
from pathlib import Path
4-
from typing import Union
6+
from typing import Union, Any, IO, TypeVar
57

68
from pypdf import PdfReader
79
from pypdf import PdfWriter
8-
from pypdf._utils import StrByteType
910

1011
from .core import TableList
1112
from .parsers import Lattice
1213
from .parsers import Stream
1314
from .utils import TemporaryDirectory
14-
from .utils import download_url
15+
from .utils import InvalidArguments
16+
from .utils import get_url_bytes
1517
from .utils import get_page_layout
1618
from .utils import get_rotation
1719
from .utils import get_text_objects
1820
from .utils import is_url
1921

22+
# Accepted types for a ``filepath`` argument: a path or URL string, an open
# file-like object, a ``pathlib.Path``, or ``None`` (when ``file_bytes`` is
# supplied instead).
# This is a plain type alias. ``TypeVar`` takes the variable's *name* (a str)
# as its first argument and is meant for generics, so wrapping the Union in
# ``TypeVar(...)`` does not declare an alias.
FilePathType = Union[str, IO[Any], Path, None]
2023

2124
class PDFHandler:
2225
"""Handles all operations like temp directory creation, splitting
@@ -25,21 +28,35 @@ class PDFHandler:
2528
2629
Parameters
2730
----------
28-
filepath : str
29-
Filepath or URL of the PDF file.
31+
filepath : str | pathlib.Path, optional (default: None)
32+
Filepath or URL of the PDF file. Required if file_bytes is not given
3033
pages : str, optional (default: '1')
3134
Comma-separated page numbers.
3235
Example: '1,3,4' or '1,4-end' or 'all'.
3336
password : str, optional (default: None)
3437
Password for decryption.
38+
file_bytes : io.IOBase, optional (default: None)
39+
A file-like stream. Required if filepath is not given
3540
3641
"""
3742

38-
def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
43+
def __init__(self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None):
3944
if is_url(filepath):
40-
filepath = download_url(filepath)
41-
self.filepath: Union[StrByteType, Path] = filepath
42-
45+
file_bytes = get_url_bytes(filepath)
46+
47+
if not filepath and not file_bytes:
48+
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
49+
if not filepath:
50+
# filepath must either be passed, or taken from the name attribute
51+
try:
52+
filepath = getattr(file_bytes, 'name')
53+
except AttributeError:
54+
msg = ('Either pass a `filepath`, or give the '
55+
'`file_bytes` argument a name attribute')
56+
raise InvalidArguments(msg)
57+
self.file_bytes = file_bytes # ok to be None
58+
59+
self.filepath = filepath
4360
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
4461
raise NotImplementedError("File format not supported")
4562

@@ -51,13 +68,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5168
self.password = self.password.encode("ascii")
5269
self.pages = self._get_pages(pages)
5370

71+
@contextmanager
72+
def managed_file_context(self):
73+
"""Reads from either the `filepath` or `file_bytes`
74+
attribute of this instance, to return a file-like object.
75+
Closes any open file handles on exit or error.
76+
77+
Returns
78+
-------
79+
file_bytes : io.IOBase
80+
A readable, seekable, file-like object
81+
"""
82+
if self.file_bytes:
83+
# if we can't seek, write to a BytesIO object that can,
84+
# then seek to the beginning before yielding
85+
if not hasattr(self.file_bytes, 'seek'):
86+
self.file_bytes = io.BytesIO(self.file_bytes.read())
87+
self.file_bytes.seek(0)
88+
yield self.file_bytes
89+
else:
90+
with open(self.filepath, "rb") as file_bytes:
91+
yield file_bytes
92+
5493
def _get_pages(self, pages):
5594
"""Converts pages string to list of ints.
5695
5796
Parameters
5897
----------
59-
filepath : str
60-
Filepath or URL of the PDF file.
98+
managed_file_context : io.IOBase
99+
A readable, seekable, file-like object
61100
pages : str, optional (default: '1')
62101
Comma-separated page numbers.
63102
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -73,74 +112,77 @@ def _get_pages(self, pages):
73112
if pages == "1":
74113
page_numbers.append({"start": 1, "end": 1})
75114
else:
76-
infile = PdfReader(self.filepath, strict=False)
77-
78-
if infile.is_encrypted:
79-
infile.decrypt(self.password)
80-
81-
if pages == "all":
82-
page_numbers.append({"start": 1, "end": len(infile.pages)})
83-
else:
84-
for r in pages.split(","):
85-
if "-" in r:
86-
a, b = r.split("-")
87-
if b == "end":
88-
b = len(infile.pages)
89-
page_numbers.append({"start": int(a), "end": int(b)})
90-
else:
91-
page_numbers.append({"start": int(r), "end": int(r)})
115+
with self.managed_file_context() as f:
116+
infile = PdfReader(f, strict=False)
117+
118+
if infile.is_encrypted:
119+
infile.decrypt(self.password)
120+
121+
if pages == "all":
122+
page_numbers.append({"start": 1, "end": len(infile.pages)})
123+
else:
124+
for r in pages.split(","):
125+
if "-" in r:
126+
a, b = r.split("-")
127+
if b == "end":
128+
b = len(infile.pages)
129+
page_numbers.append({"start": int(a), "end": int(b)})
130+
else:
131+
page_numbers.append({"start": int(r), "end": int(r)})
92132

93133
result = []
94134
for p in page_numbers:
95135
result.extend(range(p["start"], p["end"] + 1))
96136
return sorted(set(result))
97137

98-
def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
138+
def _save_page(self, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is written to ``<temp>/page-<page>.pdf``. If the text on the
    page is detected as rotated, the file is rewritten at the same path
    with the page rotated back by 90 degrees.

    Parameters
    ----------
    page : int
        Page number (1-based).
    temp : str
        Tmp directory.

    """
    fpath = os.path.join(temp, f"page-{page}.pdf")

    # Write the single page while the source stream is still open; the
    # reader may fetch page content from it lazily (assumption about pypdf).
    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        with open(fpath, "wb") as f:
            outfile.write(f)

    layout, _ = get_page_layout(fpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation == "":
        return

    # Build the renamed path from the file name only. Replacing "page" in
    # the whole path would also rewrite a temp directory whose name happens
    # to contain "page", making the rename target a non-existent directory.
    fpath_new = os.path.join(temp, f"p-{page}_rotated.pdf")
    os.rename(fpath, fpath_new)
    # `with` guarantees the renamed file is closed even if rewriting fails.
    with open(fpath_new, "rb") as instream:
        infile = PdfReader(instream, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        p = infile.pages[0]
        if rotation == "anticlockwise":
            p.rotate(90)
        elif rotation == "clockwise":
            p.rotate(-90)
        outfile.add_page(p)
        with open(fpath, "wb") as f:
            outfile.write(f)
144186

145187
def parse(
146188
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
@@ -155,7 +197,7 @@ def parse(
155197
Lattice is used by default.
156198
suppress_stdout : str (default: False)
157199
Suppress logs and warnings.
158-
layout_kwargs : dict, optional (default: {})
200+
layout_kwargs : dict, optional (default: None)
159201
A dict of `pdfminer.layout.LAParams
160202
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
161203
kwargs : dict
@@ -173,7 +215,7 @@ def parse(
173215
tables = []
174216
with TemporaryDirectory() as tempdir:
175217
for p in self.pages:
176-
self._save_page(self.filepath, p, tempdir)
218+
self._save_page(p, tempdir)
177219
pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
178220
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
179221
for p in pages:

camelot/io.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,23 @@
11
import warnings
22
from pathlib import Path
3-
from typing import Union
43

5-
from pypdf._utils import StrByteType
4+
from .handlers import PDFHandler, FilePathType
65

7-
from .handlers import PDFHandler
8-
from .utils import remove_extra
9-
from .utils import validate_input
6+
from .utils import (
7+
InvalidArguments,
8+
validate_input,
9+
remove_extra,
10+
)
1011

1112

1213
def read_pdf(
13-
filepath: Union[StrByteType, Path],
14+
filepath: FilePathType = None,
1415
pages="1",
1516
password=None,
1617
flavor="lattice",
1718
suppress_stdout=False,
1819
layout_kwargs=None,
20+
file_bytes=None,
1921
**kwargs
2022
):
2123
"""Read PDF and return extracted tables.
@@ -25,8 +27,8 @@ def read_pdf(
2527
2628
Parameters
2729
----------
28-
filepath : str, Path, IO
29-
Filepath or URL of the PDF file.
30+
filepath : str | pathlib.Path, optional (default: None)
31+
Filepath or URL of the PDF file. Required if file_bytes is not given
3032
pages : str, optional (default: '1')
3133
Comma-separated page numbers.
3234
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -37,6 +39,8 @@ def read_pdf(
3739
Lattice is used by default.
3840
suppress_stdout : bool, optional (default: True)
3941
Print all logs and warnings.
42+
file_bytes : io.IOBase, optional (default: None)
43+
A file-like stream. Required if filepath is not given
4044
layout_kwargs : dict, optional (default: {})
4145
A dict of `pdfminer.layout.LAParams
4246
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -112,12 +116,15 @@ def read_pdf(
112116
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
113117
)
114118

119+
if not filepath and not file_bytes:
120+
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
121+
115122
with warnings.catch_warnings():
116123
if suppress_stdout:
117124
warnings.simplefilter("ignore")
118125

119126
validate_input(kwargs, flavor=flavor)
120-
p = PDFHandler(filepath, pages=pages, password=password)
127+
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
121128
kwargs = remove_extra(kwargs, flavor=flavor)
122129
tables = p.parse(
123130
flavor=flavor,

0 commit comments

Comments
 (0)