-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDFCollection.py
221 lines (186 loc) · 7.08 KB
/
PDFCollection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
from PyPDF2 import PdfWriter, PdfReader
from io import BytesIO
from PDFFile import PDFFile
from add_page_numbers import add_page_number
from open_file import open_file
from PDFSortKey import PDFSortKey
class PDFCollection:
    """Ordered collection of PDF files with optional per-file bookmark titles.

    The collection keeps the files in output order, remembers which files
    failed to open, and can merge everything into a single PDF with page
    numbers and outline entries (see ``build_pdf``).

    Project types (``PDFFile``, ``PDFSortKey``) are written as string
    annotations so that defining the class does not evaluate them.
    """

    # Placeholder inside a classification's bookmark template; ``sort``
    # replaces it with the first captured group of the match.
    captured_keyword = "_"

    def __init__(self):
        self.files = []  # PDFFile objects, in output order
        self.bookmarks = {}  # {PDFFile: bookmark_title: str}
        self.num_files = 0  # maintained by add_file/remove_file/clear_files
        self.failed_files = set()  # files whose opened_successfully is False

    def __len__(self) -> int:
        """Return the number of files currently tracked."""
        return self.num_files

    def __contains__(self, pdf: "PDFFile") -> bool:
        """Return True if *pdf* is one of the collection's files."""
        return pdf in self.files

    def __iter__(self):
        """Iterate over the files in their current order."""
        return iter(self.files)

    def failed_open(self, pdf: "PDFFile"):
        """Classify *pdf* by its ``opened_successfully`` flag.

        Returns:
            ``None`` when the flag is ``None``; *pdf* itself when the flag
            is ``False`` (the file is also recorded in ``failed_files``);
            ``False`` otherwise.

        The failed file is returned instead of being removed here so that
        callers can drop it from ``self.files`` after they have finished
        iterating over that list.
        """
        if pdf.opened_successfully is None:
            return None
        if pdf.opened_successfully is False:
            self.failed_files.add(pdf)
            return pdf
        return False

    def add_bookmark(self, pdf: "PDFFile", title: str) -> None:
        """Set the bookmark title for *pdf*; an empty *title* is ignored."""
        if title:
            self.bookmarks[pdf] = title

    def remove_bookmark(self, pdf: "PDFFile") -> None:
        """Forget the bookmark of *pdf*, if it has one."""
        self.bookmarks.pop(pdf, None)

    def add_file(self, pdf: "PDFFile") -> None:
        """Append *pdf* unless it is already in the collection."""
        if pdf not in self:
            self.files.append(pdf)
            self.num_files += 1

    def remove_file(self, pdf: "PDFFile") -> None:
        """Remove *pdf* and its bookmark; no-op if it is not present."""
        if pdf in self:
            self.files.remove(pdf)
            self.num_files -= 1
            self.remove_bookmark(pdf)

    def remove_by_path(self, path: str) -> None:
        """Remove every file whose ``path`` equals *path*."""
        # Collect the matches first: removing from self.files while looping
        # over it makes the iterator skip the element after each removal.
        matches = [pdf for pdf in self.files if pdf.path == path]
        for pdf in matches:
            self.remove_file(pdf)

    def clear_files(self) -> None:
        """Remove all files and bookmarks (``failed_files`` is kept)."""
        self.files.clear()
        self.num_files = 0
        self.bookmarks.clear()

    def get_file_by_path(self, path: str):
        """Return the first file whose ``path`` equals *path*, else None."""
        for pdf in self.files:
            if pdf.path == path:
                return pdf
        return None

    def sort(self, sort_key: "PDFSortKey") -> list:
        """Reorder the files according to the classifications in *sort_key*.

        Each classification is tried in order against every file via
        ``classification.applies_to(pdf)``. A file is placed at its first
        matching classification; files that match none keep their relative
        order after all matched files. When the classification's bookmark
        text is non-empty, it becomes the file's bookmark, with
        ``captured_keyword`` replaced by the first captured group.

        Files for which ``failed_open`` reports a failure are removed from
        the collection.

        Returns:
            The list of files that matched no classification.
        """
        sorted_files = []
        files_to_remove = []
        for classification in sort_key:
            for pdf in self.files:
                match = classification.applies_to(pdf)
                # PDFFile when the open failed, False/None when it did not.
                pdf_if_failed = self.failed_open(pdf)
                if pdf_if_failed is False or pdf_if_failed is None:
                    if match and pdf not in sorted_files:
                        sorted_files.append(pdf)
                        bookmark = classification.bookmark.get().strip()
                        if bookmark:
                            groups = match.groups()
                            # A group that did not participate in the match
                            # is None and cannot be passed to str.replace.
                            if groups and groups[0] is not None:
                                bookmark = bookmark.replace(
                                    PDFCollection.captured_keyword, groups[0]
                                )
                            self.bookmarks[pdf] = bookmark
                elif isinstance(pdf_if_failed, PDFFile):
                    # Removal is deferred: self.files is being iterated.
                    files_to_remove.append(pdf_if_failed)
        for pdf in files_to_remove:
            self.remove_file(pdf)
        # A file sorted in an earlier pass may have been removed just above;
        # do not re-insert it, or num_files would no longer match the list.
        sorted_files = [pdf for pdf in sorted_files if pdf in self.files]
        not_matched = [pdf for pdf in self.files if pdf not in sorted_files]
        self.files = sorted_files + not_matched
        return not_matched

    def get_tkinter_table_data(self) -> list:
        """Return one row per file: ``pdf.values`` plus its bookmark title.

        Files without a bookmark get an empty string in the last column.
        """
        return [
            pdf.values + (self.bookmarks.get(pdf, ""),) for pdf in self.files
        ]

    def move_file_up(self, index) -> None:
        """Swap the file at *index* with its predecessor.

        Does nothing when *index* is the first position or out of range.
        """
        if 0 < index < len(self.files):
            self.files[index], self.files[index - 1] = (
                self.files[index - 1],
                self.files[index],
            )

    def move_file_down(self, index) -> None:
        """Swap the file at *index* with its successor.

        Does nothing when *index* is the last position or out of range
        (negative indices included, so -1 no longer swaps last and first).
        """
        if 0 <= index < len(self.files) - 1:
            self.files[index], self.files[index + 1] = (
                self.files[index + 1],
                self.files[index],
            )

    def clear_readers(self) -> None:
        """Reset the ``_reader`` attribute of every file to None."""
        for pdf in self.files:
            pdf._reader = None

    def build_pdf(
        self,
        output_path: str,
        page_numbers=True,
        y_padding=20,
        font_size=12,
    ):
        """Merge all files into one PDF at *output_path*, yielding progress.

        This is a generator: it must be iterated to completion for the
        output to be written and opened. It yields progress values up to 70
        while files are merged, then 95, then 100 after the file is written.

        Args:
            output_path: Destination passed to ``PdfWriter.write``.
            page_numbers: When true, stamp page numbers via
                ``add_page_number``.
            y_padding: Forwarded to ``add_page_number``.
            font_size: Forwarded to ``add_page_number``.

        Raises:
            ValueError: If adding page numbers fails on every attempt.

        Files for which ``failed_open`` reports a failure are skipped and
        removed from the collection before the output is written.
        """
        writer = PdfWriter()
        current_page = 0
        bookmarks = []  # [(page_number, title),]
        files_to_remove = []
        for i, pdf in enumerate(self):
            pdf.text  # will open file if not already opened
            pdf_or_failed = self.failed_open(pdf)
            if isinstance(pdf_or_failed, PDFFile):
                # Removal is deferred: self.files is being iterated.
                files_to_remove.append(pdf_or_failed)
                continue
            yield (i + 1) / len(self.files) * 70
            # Remember where this file starts so the bookmark points at its
            # first page regardless of what pdf.num_pages reports.
            first_page = current_page
            for page in pdf.reader.pages:
                writer.add_page(page)
                current_page += 1
            title = self.bookmarks.get(pdf, "")
            if title.strip():
                bookmarks.append((first_page, title))
        if page_numbers:
            packet = BytesIO()
            writer.write(packet)
            max_num_tries = 3
            # for some reason this takes several tries to work sometimes
            for attempt in range(1, max_num_tries + 1):
                # Rewind before every attempt; a failed attempt may have
                # left the buffer position somewhere in the middle.
                packet.seek(0)
                try:
                    packet = add_page_number(
                        packet, y_padding=y_padding, font_size=font_size
                    )
                except Exception as err:
                    if attempt == max_num_tries:
                        raise ValueError(
                            "Something went wrong while adding page numbers. Please try again."
                        ) from err
                    print("Failed to add page numbers to pages. Retrying...")
                else:
                    # Stop after the first success so the pages are not
                    # stamped again on the remaining attempts.
                    break
            # Rebuild the writer from the numbered pages.
            reader = PdfReader(packet)
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)
        yield 95
        for page_number, title in bookmarks:
            writer.add_outline_item(title, page_number, parent=None)
        for file in files_to_remove:
            self.remove_file(file)
        writer.write(output_path)
        yield 100
        open_file(output_path)

    def to_dict(self) -> dict:
        """Serialize the collection to a plain dict.

        ``"files"`` holds each file's ``to_dict()`` result; ``"bookmarks"``
        maps file paths to bookmark titles.
        """
        return {
            "files": [pdf.to_dict() for pdf in self],
            "bookmarks": {
                pdf.path: title for pdf, title in self.bookmarks.items()
            },
        }

    @classmethod
    def from_dict(cls, data: dict):
        """Build a collection from the structure produced by ``to_dict``.

        Bookmarks whose path does not correspond to any loaded file are
        skipped rather than being stored under a ``None`` key.
        """
        pdf_collection = cls()
        for pdf_data in data["files"]:
            pdf_collection.add_file(PDFFile.from_dict(pdf_data))
        for path, title in data.get("bookmarks", {}).items():
            pdf = pdf_collection.get_file_by_path(path)
            if pdf is not None:
                pdf_collection.add_bookmark(pdf, title)
        return pdf_collection