Skip to content

Commit a3ad5c5

Browse files
committed
Automatically redirect to articles with same checksum
1 parent fe0b5fa commit a3ad5c5

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

src/zimscraperlib/zim/creator.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
- can be used to store a filepath and content read from it (not stored) """
2020

2121
import datetime
22+
import hashlib
2223
import pathlib
24+
import re
2325
import weakref
2426
from typing import Any, Callable, Dict, Optional, Tuple, Union
2527

@@ -104,6 +106,10 @@ def __init__(
104106

105107
self.workaround_nocancel = workaround_nocancel
106108

109+
self.autodedup_filters = []
110+
111+
self.dedup_items = dict()
112+
107113
def start(self):
108114
super().__enter__()
109115

@@ -119,6 +125,35 @@ def update_metadata(self, **kwargs):
119125
for name, value in kwargs.items():
120126
self.add_metadata(name, value)
121127

128+
def add_autodedup_filter(self, filter_regex: str):
    """Register a regular expression used to select paths for deduplication.

    The expression is compiled once here and appended to
    ``self.autodedup_filters``, so later lookups reuse the compiled pattern.
    """
    compiled_filter = re.compile(filter_regex)
    self.autodedup_filters.append(compiled_filter)
130+
131+
def check_for_duplicate(
    self,
    path: str,
    fpath: Optional[pathlib.Path] = None,
    content: Optional[bytes] = None,
) -> Optional[str]:
    """Return the path of an already-recorded entry with identical content.

    Only paths matching at least one pattern in ``self.autodedup_filters``
    take part in deduplication; any other path returns ``None`` without
    being hashed or recorded.

    For a matching path, the SHA-256 digest of ``content`` (when given) or of
    the file at ``fpath`` is looked up in ``self.dedup_items``. On a hit the
    previously recorded path is returned; otherwise ``path`` is recorded
    under that digest and ``None`` is returned.

    Raises:
        ValueError: if ``path`` matches a filter and both ``fpath`` and
            ``content`` are ``None``.
    """
    # Paths not selected by any filter are never hashed nor recorded.
    if not any(pattern.match(path) for pattern in self.autodedup_filters):
        return None

    if content is not None:
        # Compare against None rather than truthiness: empty content (b"")
        # is valid and must not fall through to the file branch.
        digest = hashlib.sha256(content).digest()
    elif fpath is not None:
        hasher = hashlib.sha256()
        with open(fpath, "rb") as fh:
            # Read in 64 KiB chunks so large files are never fully in memory.
            for chunk in iter(lambda: fh.read(65536), b""):
                hasher.update(chunk)
        digest = hasher.digest()
    else:
        raise ValueError("One of fpath or content is required")

    if digest in self.dedup_items:
        return self.dedup_items[digest]
    # First occurrence of this content: remember where it lives.
    self.dedup_items[digest] = path
    return None
156+
122157
def add_item_for(
123158
self,
124159
path: str,
@@ -151,6 +186,18 @@ def add_item_for(
151186
if fpath is None and content is None:
152187
raise ValueError("One of fpath or content is required")
153188

189+
duplicate_path = self.check_for_duplicate(
190+
path=path, fpath=fpath, content=content
191+
)
192+
if duplicate_path:
193+
self.add_redirect(
194+
path=path,
195+
target_path=duplicate_path,
196+
title=title,
197+
is_front=is_front,
198+
)
199+
return path
200+
154201
mimetype = mimetype_for(
155202
path=path, content=content, fpath=fpath, mimetype=mimetype
156203
)

tests/zim/test_zim_creator.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,41 @@ def test_noindexlanguage(tmp_path):
126126
assert not reader.has_fulltext_index
127127

128128

129+
def test_duplicatefiles(tmp_path, png_image, html_file):
    """Identical content under a dedup-filtered path is stored once, then redirected."""
    fpath = tmp_path / "test.zim"

    with open(png_image, "rb") as handle:
        png_data = handle.read()

    # (path, title, source kwargs), in insertion order: the first matching
    # item is stored as-is and later identical ones become redirects.
    additions = [
        # does not match the filter => added as-is
        ("other_folder1/yahoo0.png", "Image1", {"fpath": png_image}),
        # first item matching the filter => added as-is
        ("images/yahoo1.png", "Image1", {"fpath": png_image}),
        # second item matching the filter => replaced by a redirect
        ("images/yahoo2.png", "Image2", {"fpath": png_image}),
        # does not match the filter => added as-is
        ("other_folder2/yahoo3.png", "Image1", {"fpath": png_image}),
        # matches the filter, given as content instead of fpath
        # => replaced by a redirect
        ("images/yahoo4.png", "Image3", {"content": png_data}),
    ]

    with Creator(fpath, "welcome", "") as creator:
        creator.add_autodedup_filter(r"^images/.*$")
        for item_path, item_title, source in additions:
            creator.add_item_for(item_path, item_title, **source)

    reader = Archive(fpath)
    # every checked path must resolve; only the deduplicated ones are redirects
    expectations = [
        ("images/yahoo1.png", False),
        ("images/yahoo2.png", True),
        ("images/yahoo4.png", True),
        ("other_folder1/yahoo0.png", False),
        ("other_folder2/yahoo3.png", False),
    ]
    for entry_path, should_redirect in expectations:
        assert reader.get_item(entry_path)
        entry = reader.get_entry_by_path(entry_path)
        assert bool(entry.is_redirect) is should_redirect
162+
163+
129164
def test_add_item_for(tmp_path):
130165
fpath = tmp_path / "test.zim"
131166
# test without mimetype

0 commit comments

Comments
 (0)