1
+ from contextlib import contextmanager
2
+ import io
1
3
import os
2
4
import sys
3
5
from pathlib import Path
4
- from typing import Union
6
+ from typing import Union , Any , IO , TypeVar
5
7
6
8
from pypdf import PdfReader
7
9
from pypdf import PdfWriter
8
- from pypdf ._utils import StrByteType
9
10
10
11
from .core import TableList
11
12
from .parsers import Lattice
12
13
from .parsers import Stream
13
14
from .utils import TemporaryDirectory
14
- from .utils import download_url
15
+ from .utils import InvalidArguments
16
+ from .utils import get_url_bytes
15
17
from .utils import get_page_layout
16
18
from .utils import get_rotation
17
19
from .utils import get_text_objects
18
20
from .utils import is_url
19
21
22
# Accepted inputs for ``filepath``: a path/URL string, an open file-like
# object, a ``pathlib.Path``, or ``None`` (when ``file_bytes`` is supplied).
# This is a plain type alias: ``TypeVar`` expects the variable's *name* as its
# first argument, so wrapping the Union in ``TypeVar(...)`` does not create
# the intended alias.
FilePathType = Union[str, IO[Any], Path, None]
20
23
21
24
class PDFHandler :
22
25
"""Handles all operations like temp directory creation, splitting
@@ -25,21 +28,35 @@ class PDFHandler:
25
28
26
29
Parameters
27
30
----------
28
- filepath : str
29
- Filepath or URL of the PDF file.
31
+ filepath : str | pathlib.Path, optional (default: None)
32
+ Filepath or URL of the PDF file. Required if file_bytes is not given
30
33
pages : str, optional (default: '1')
31
34
Comma-separated page numbers.
32
35
Example: '1,3,4' or '1,4-end' or 'all'.
33
36
password : str, optional (default: None)
34
37
Password for decryption.
38
+ file_bytes : io.IOBase, optional (default: None)
39
+ A file-like stream. Required if filepath is not given
35
40
36
41
"""
37
42
38
- def __init__ (self , filepath : Union [ StrByteType , Path ] , pages = "1" , password = None ):
43
+ def __init__ (self , filepath : FilePathType = None , pages = "1" , password = None , file_bytes = None ):
39
44
if is_url (filepath ):
40
- filepath = download_url (filepath )
41
- self .filepath : Union [StrByteType , Path ] = filepath
42
-
45
+ file_bytes = get_url_bytes (filepath )
46
+
47
+ if not filepath and not file_bytes :
48
+ raise InvalidArguments ('Either `filepath` or `file_bytes` is required' )
49
+ if not filepath :
50
+ # filepath must either be passed, or taken from the name attribute
51
+ try :
52
+ filepath = getattr (file_bytes , 'name' )
53
+ except AttributeError :
54
+ msg = ('Either pass a `filepath`, or give the '
55
+ '`file_bytes` argument a name attribute' )
56
+ raise InvalidArguments (msg )
57
+ self .file_bytes = file_bytes # ok to be None
58
+
59
+ self .filepath = filepath
43
60
if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
44
61
raise NotImplementedError ("File format not supported" )
45
62
@@ -51,13 +68,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
51
68
self .password = self .password .encode ("ascii" )
52
69
self .pages = self ._get_pages (pages )
53
70
71
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this PDF.

    Reads from the ``file_bytes`` attribute when it is set (truthy),
    otherwise opens the ``filepath`` attribute in binary mode.

    A file opened here from ``filepath`` is closed on exit or error.
    A stream supplied through ``file_bytes`` is rewound to the start but
    is NOT closed, so it can be reused by later calls.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0.

    """
    if not self.file_bytes:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes
        return

    stream = self.file_bytes
    # ``io.IOBase`` defines ``seek`` even on streams that cannot seek, so
    # the presence of the attribute proves nothing; ask ``seekable()``
    # when the object offers it and only fall back to the attribute test
    # for duck-typed objects without it.
    probe = getattr(stream, "seekable", None)
    can_seek = probe() if callable(probe) else hasattr(stream, "seek")
    if not can_seek:
        # Buffer the whole stream once so every later call can rewind it.
        stream = self.file_bytes = io.BytesIO(stream.read())
    stream.seek(0)
    yield stream
92
+
54
93
def _get_pages (self , pages ):
55
94
"""Converts pages string to list of ints.
56
95
57
96
Parameters
58
97
----------
59
- filepath : str
60
- Filepath or URL of the PDF file.
98
+ managed_file_context : io.IOBase
99
+ A readable, seekable, file-like object
61
100
pages : str, optional (default: '1')
62
101
Comma-separated page numbers.
63
102
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -73,74 +112,77 @@ def _get_pages(self, pages):
73
112
if pages == "1" :
74
113
page_numbers .append ({"start" : 1 , "end" : 1 })
75
114
else :
76
- infile = PdfReader (self .filepath , strict = False )
77
-
78
- if infile .is_encrypted :
79
- infile .decrypt (self .password )
80
-
81
- if pages == "all" :
82
- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
83
- else :
84
- for r in pages .split ("," ):
85
- if "-" in r :
86
- a , b = r .split ("-" )
87
- if b == "end" :
88
- b = len (infile .pages )
89
- page_numbers .append ({"start" : int (a ), "end" : int (b )})
90
- else :
91
- page_numbers .append ({"start" : int (r ), "end" : int (r )})
115
+ with self .managed_file_context () as f :
116
+ infile = PdfReader (f , strict = False )
117
+
118
+ if infile .is_encrypted :
119
+ infile .decrypt (self .password )
120
+
121
+ if pages == "all" :
122
+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
123
+ else :
124
+ for r in pages .split ("," ):
125
+ if "-" in r :
126
+ a , b = r .split ("-" )
127
+ if b == "end" :
128
+ b = len (infile .pages )
129
+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
130
+ else :
131
+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
92
132
93
133
result = []
94
134
for p in page_numbers :
95
135
result .extend (range (p ["start" ], p ["end" ] + 1 ))
96
136
return sorted (set (result ))
97
137
98
def _save_page(self, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The source PDF is read through ``self.managed_file_context()``; the
    selected page is written to ``<temp>/page-<page>.pdf``. If the text
    layout indicates the page is rotated, the page is rewritten to the
    same path with the rotation corrected (the unrotated copy is kept
    under a ``p-<page>_rotated`` name).

    Parameters
    ----------
    page : int
        Page number (1-based).
    temp : str
        Tmp directory.

    """
    fpath = os.path.join(temp, f"page-{page}.pdf")
    froot, fext = os.path.splitext(fpath)

    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        # Write while the source stream is still open: the writer pulls
        # page data from the reader's underlying stream.
        with open(fpath, "wb") as f:
            outfile.write(f)

    layout, _ = get_page_layout(fpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation == "":
        return

    fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
    os.rename(fpath, fpath_new)
    # ``with`` guarantees the handle is released even if reading,
    # decrypting or writing the rotated page raises.
    with open(fpath_new, "rb") as instream:
        rotated = PdfReader(instream, strict=False)
        if rotated.is_encrypted:
            rotated.decrypt(self.password)
        outfile = PdfWriter()
        p = rotated.pages[0]
        if rotation == "anticlockwise":
            p.rotate(90)
        elif rotation == "clockwise":
            p.rotate(-90)
        outfile.add_page(p)
        with open(fpath, "wb") as f:
            outfile.write(f)
144
186
145
187
def parse (
146
188
self , flavor = "lattice" , suppress_stdout = False , layout_kwargs = None , ** kwargs
@@ -155,7 +197,7 @@ def parse(
155
197
Lattice is used by default.
156
198
suppress_stdout : str (default: False)
157
199
Suppress logs and warnings.
158
- layout_kwargs : dict, optional (default: {} )
200
+ layout_kwargs : dict, optional (default: None )
159
201
A dict of `pdfminer.layout.LAParams
160
202
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
161
203
kwargs : dict
@@ -173,7 +215,7 @@ def parse(
173
215
tables = []
174
216
with TemporaryDirectory () as tempdir :
175
217
for p in self .pages :
176
- self ._save_page (self . filepath , p , tempdir )
218
+ self ._save_page (p , tempdir )
177
219
pages = [os .path .join (tempdir , f"page-{ p } .pdf" ) for p in self .pages ]
178
220
parser = Lattice (** kwargs ) if flavor == "lattice" else Stream (** kwargs )
179
221
for p in pages :
0 commit comments