# epub_processor.py
import os
import re
from pathlib import Path
class EpubProcessor:
    """Convert the HTML files of an unpacked EPUB into plain-text files.

    ``base_path`` must contain a ``files_order.txt`` file listing, one per
    line, the files to process (paths relative to ``base_path``).  Processing
    writes a ``<stem>.txt`` file for every accepted input, deletes the
    consumed and rejected inputs, and rewrites ``files_order.txt`` so that it
    lists only the generated text files.
    """

    # Stored lower-case on purpose: they are matched against lower-cased text.
    COPYRIGHT_KEYWORDS = (
        "copyright",
        "all rights reserved",
        "isbn",
        "library of congress",
    )
    # A file whose tags make up more than this share of its characters is
    # treated as markup-only.
    MAX_MARKUP_RATIO = 0.9
    # Files with fewer non-empty lines, or a shorter average line, are treated
    # as an index or footnote page.
    MIN_LINE_COUNT = 5
    MIN_AVERAGE_LINE_LENGTH = 40

    # Ordered (pattern, replacement) substitutions; the order matters because
    # closing tags are turned into newlines before the catch-all tag removal.
    _CLEANUP_RULES = tuple(
        (re.compile(pattern, flags), replacement)
        for pattern, replacement, flags in (
            (r"\s+", " ", 0),  # collapse every whitespace run to one space
            (r"<div[^>]*>", "", 0),
            (r"</div>", "\n", 0),
            (r"<p[^>]*>", "", 0),
            (r"</p>", "\n", 0),
            (r"<h1[^>]*>", "", 0),
            (r"</h1>", "\n", 0),
            (r"<a[^>]*>", "", 0),
            (r"</a>", "\n", 0),
            (r"<span[^>]*>", "", 0),
            (r"<link[^>]*/>", "", 0),
            (r"</span>", "\n", 0),
            (r"<!DOCTYPE[^>]*>", "", 0),
            # NOTE(review): the original substitution here replaced a space
            # with a space (a no-op); presumably it was meant to decode the
            # non-breaking-space entity - confirm against the upstream file.
            (r"&nbsp;", " ", 0),
            (r"<style.*?>.*?</style>", "", re.DOTALL),
            (r"<!--.*?-->", "", re.DOTALL),
            (r"<.*?>", "", 0),  # any tag that is still left
        )
    )

    def __init__(self, base_path, character_limit=350000):
        """Remember the working directory and load the processing order.

        Args:
            base_path: Directory holding the files and ``files_order.txt``.
            character_limit: Stored on the instance; not used by this class.

        Raises:
            OSError: If ``files_order.txt`` cannot be opened.
        """
        self.base_path = Path(base_path)
        self.order_file = self.base_path / "files_order.txt"
        self.order = self.read_order_file()
        self.character_limit = character_limit

    def read_order_file(self):
        """Return the file names listed in ``files_order.txt``.

        Surrounding whitespace is stripped and blank lines are dropped: an
        empty name would resolve to ``base_path`` itself.
        """
        with open(self.order_file, "r", encoding="utf-8") as file:
            stripped = (line.strip() for line in file)
            return [name for name in stripped if name]

    def clean_html_content(self, text):
        """Strip markup from ``text`` and return its non-empty lines.

        Closing ``div``, ``p``, ``h1``, ``a`` and ``span`` tags become line
        breaks; style blocks, comments and every remaining tag are removed.
        """
        for pattern, replacement in self._CLEANUP_RULES:
            text = pattern.sub(replacement, text)
        return "\n".join(line for line in text.split("\n") if line.strip())

    def _is_mostly_markup(self, content):
        """Tell whether tags exceed ``MAX_MARKUP_RATIO`` of ``content``."""
        if not content:
            # Avoid dividing by zero; an empty file is rejected later on by
            # the line-count check instead.
            return False
        markup_length = sum(len(tag) for tag in re.findall(r"<.*?>", content))
        return markup_length / len(content) > self.MAX_MARKUP_RATIO

    def _is_too_sparse(self, cleaned_content):
        """Tell whether the text has too few lines or too short lines."""
        lines = [line for line in cleaned_content.split("\n") if line.strip()]
        if len(lines) < self.MIN_LINE_COUNT:
            return True
        average = sum(map(len, lines)) / len(lines)
        return average < self.MIN_AVERAGE_LINE_LENGTH

    @staticmethod
    def _remove_file(path, description):
        """Delete ``path``, reporting (not raising) any OS-level failure."""
        try:
            path.unlink()
        except OSError as e:
            print(f"Error deleting {description} {path}: {e}")

    def process_files(self):
        """Convert every listed file and return the generated file names.

        For each entry of ``self.order`` the file is skipped when it is
        missing, is mainly markup, mentions a copyright keyword, or is too
        sparse.  Accepted files are written as ``<stem>.txt`` in
        ``base_path`` and the source file is deleted.  Skipped files that
        exist are deleted afterwards, and ``files_order.txt`` is rewritten
        with the generated names.

        Returns:
            The names (not paths) of the text files written, in order.
        """
        processed_files = []
        skipped_files = []

        for file_name in self.order:
            full_file_path = self.base_path / file_name
            if not full_file_path.is_file():
                # Nothing on disk, so there is nothing to delete later.
                print(f"File {file_name} not found, skipping.")
                continue

            with open(
                full_file_path, "r", encoding="utf-8", errors="replace"
            ) as file:
                file_content = file.read()

            if self._is_mostly_markup(file_content):
                print(f"File {file_name} is mainly HTML, skipping.")
                skipped_files.append(full_file_path)
                continue

            cleaned_content = self.clean_html_content(file_content)
            lowered_content = cleaned_content.lower()
            if any(word in lowered_content for word in self.COPYRIGHT_KEYWORDS):
                print(f"File {file_name} detected as a copyright page, skipping.")
                skipped_files.append(full_file_path)
                continue

            if self._is_too_sparse(cleaned_content):
                print(f"File {file_name} seems to be an index or footnote, skipping.")
                skipped_files.append(full_file_path)
                continue

            output_file = self.base_path / f"{Path(file_name).stem}.txt"
            with open(output_file, "w", encoding="utf-8") as txt_file:
                txt_file.write(cleaned_content)
            processed_files.append(output_file.name)

            # Remove the original file after processing, unless the output
            # was written to that very path (input already named *.txt).
            if output_file != full_file_path:
                self._remove_file(full_file_path, "file")

        # Remove skipped files
        for file_path in skipped_files:
            self._remove_file(file_path, "skipped file")

        # Update files_order.txt with processed files only
        with open(self.order_file, "w", encoding="utf-8") as order_file:
            order_file.writelines(f"{name}\n" for name in processed_files)

        return processed_files
def process_epub(book_path, character_limit=350000):
    """Run :class:`EpubProcessor` over the directory ``book_path``.

    Args:
        book_path: Directory containing the files and ``files_order.txt``.
        character_limit: Forwarded to ``EpubProcessor``; the default mirrors
            the default of ``EpubProcessor.__init__``.

    Returns:
        The list returned by ``EpubProcessor.process_files()``, or an empty
        list when ``book_path`` is not a directory (nothing is processed and
        nothing is printed in that case).
    """
    book_path = Path(book_path)
    if not book_path.is_dir():
        return []

    print(f"----Processing files in {book_path}----")
    processor = EpubProcessor(book_path, character_limit)
    return processor.process_files()