-
-
Notifications
You must be signed in to change notification settings - Fork 19
Add utility to deduplicate ZIM items and replace them with redirects at ZIM creation time #261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import pathlib | ||
import re | ||
from typing import Any | ||
|
||
import xxhash | ||
from libzim.writer import Hint # pyright: ignore[reportMissingModuleSource] | ||
|
||
from zimscraperlib.zim.creator import Creator | ||
|
||
CONTENT_BUFFER_READ_SIZE = 1048576 # 1M | ||
|
||
|
||
class Deduplicator: | ||
"""Automatically deduplicate potential ZIM items before adding them to the ZIM | ||
|
||
This class automatically computes the digest of every item added to the ZIM, and | ||
either add the entry (if item is not yet inside the ZIM) or an alias (if item with | ||
same digest has already been added inside the ZIM). | ||
|
||
This class must be configured with filters to specifiy which items paths to | ||
consider. It is of course possible to consider all paths (i.e. all items) with a | ||
wide regex or to operate on a subset (e.g. all images) with more precise filters. | ||
Item is considered for deduplication if any filter matches. It is recommended to | ||
properly configure these filters to save time / memory by automatically ignoring | ||
items which are known to always be different and / or be too numerous. | ||
|
||
Only the digest and path of items matching the filters are computed and stored. | ||
|
||
The xxh32 algorithm (https://github.com/Cyan4973/xxHash) which is known to be good | ||
at avoiding collision with minimal memory and CPU footprint is used, so the sheer | ||
memory consumption will come from the paths we have to keep. This hashing algorithm | ||
is not meant for security purpose since one might infer original content from | ||
hashes, but this is not our use case. | ||
""" | ||
|
||
def __init__(self, creator: Creator): | ||
self.creator = creator | ||
self.filters: list[re.Pattern[str]] = [] | ||
self.added_items: dict[bytes, str] = {} | ||
|
||
def add_item_for( | ||
self, | ||
path: str, | ||
title: str | None = None, | ||
*, | ||
fpath: pathlib.Path | None = None, | ||
content: bytes | str | None = None, | ||
**kwargs: Any, | ||
): | ||
"""Add an item at given path or an alias""" | ||
existing_item = None | ||
if any(_filter.match(path) is not None for _filter in self.filters): | ||
if content: | ||
digest = xxhash.xxh32( | ||
content.encode() if isinstance(content, str) else content | ||
).digest() | ||
else: | ||
if not fpath: | ||
raise Exception("Either content or fpath are mandatory") | ||
xxh32 = xxhash.xxh32() | ||
with open(fpath, "rb") as f: | ||
while True: | ||
data = f.read(CONTENT_BUFFER_READ_SIZE) # read content in chunk | ||
if not data: | ||
break | ||
xxh32.update(data) | ||
digest = xxh32.digest() | ||
|
||
if existing_item := self.added_items.get(digest): | ||
self.creator.add_alias( | ||
path, | ||
targetPath=existing_item, | ||
title=title or path, | ||
hints={Hint.FRONT_ARTICLE: True} if kwargs.get("is_front") else {}, | ||
) | ||
return | ||
else: | ||
self.added_items[digest] = path | ||
|
||
self.creator.add_item_for(path, title, fpath=fpath, content=content, **kwargs) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import pathlib | ||
import re | ||
from typing import Any | ||
|
||
import pytest | ||
|
||
from zimscraperlib.zim import Archive, Creator | ||
from zimscraperlib.zim.dedup import Deduplicator | ||
|
||
|
||
def test_deduplicator( | ||
tmp_path: pathlib.Path, | ||
png_image: pathlib.Path, | ||
html_file: pathlib.Path, | ||
html_str: str, | ||
html_str_cn: str, | ||
): | ||
main_path = "welcome" | ||
|
||
png_data = png_image.read_bytes() | ||
|
||
def add_items(creator_or_deduplicator: Any): | ||
creator_or_deduplicator.add_item_for( | ||
"welcome1", "wel1", content=html_str, is_front=True | ||
) | ||
creator_or_deduplicator.add_item_for( | ||
"welcome2", "wel2", content=html_str, is_front=True | ||
) | ||
creator_or_deduplicator.add_item_for( | ||
"dedup/welcome3", "wel3", content=html_str, is_front=True | ||
) | ||
creator_or_deduplicator.add_item_for( | ||
"dedup/welcome4", "wel4", content=html_str, is_front=True | ||
) | ||
creator_or_deduplicator.add_item_for( | ||
"prefix/dedup/welcome5", "wel5", content=html_str, is_front=True | ||
) | ||
creator_or_deduplicator.add_item_for("image1", None, fpath=png_image) | ||
creator_or_deduplicator.add_item_for("image2", None, content=png_data) | ||
creator_or_deduplicator.add_item_for("dedup/image3", None, fpath=png_image) | ||
creator_or_deduplicator.add_item_for("dedup/image4", None, content=png_data) | ||
creator_or_deduplicator.add_item_for("dedup/html", None, fpath=html_file) | ||
creator_or_deduplicator.add_item_for("dedup/html_cn", None, content=html_str_cn) | ||
creator_or_deduplicator.add_item_for( | ||
"prefix/dedup/image5", None, content=png_data | ||
) | ||
|
||
fpath_without_dedup = tmp_path / "zim_without_dedup.zim" | ||
with Creator(fpath_without_dedup, main_path).config_dev_metadata() as creator: | ||
add_items(creator) | ||
|
||
assert fpath_without_dedup.exists() | ||
|
||
fpath_with_dedup = tmp_path / "zim_with_dedup.zim" | ||
with Creator(fpath_with_dedup, main_path).config_dev_metadata() as creator: | ||
deduplicator = Deduplicator(creator) | ||
deduplicator.filters.append(re.compile("^foo/.*$")) | ||
deduplicator.filters.append(re.compile("^dedup/.*$")) | ||
deduplicator.filters.append(re.compile("^bar/.*$")) | ||
add_items(deduplicator) | ||
|
||
# added_items contains only original items, not the duplicates | ||
assert set(deduplicator.added_items.values()) == { | ||
"dedup/welcome3", | ||
"dedup/image3", | ||
"dedup/html_cn", | ||
} | ||
|
||
assert fpath_with_dedup.exists() | ||
|
||
# check that deduplication has a consequence on ZIM size | ||
assert ( | ||
fpath_without_dedup.lstat().st_size - fpath_with_dedup.lstat().st_size | ||
) > 3000 # 3291 as of libzim 9.3 | ||
|
||
for zim_path in [fpath_with_dedup, fpath_without_dedup]: | ||
reader = Archive(zim_path) | ||
|
||
assert reader.all_entry_count == 24 | ||
|
||
for html_path in [ | ||
"welcome1", | ||
"welcome2", | ||
"dedup/welcome3", | ||
"dedup/welcome4", | ||
"prefix/dedup/welcome5", | ||
"dedup/html", | ||
]: | ||
assert bytes(reader.get_item(html_path).content).decode() == html_str | ||
assert bytes(reader.get_item("dedup/html_cn").content).decode() == html_str_cn | ||
|
||
for img_path in [ | ||
"image1", | ||
"image2", | ||
"dedup/image3", | ||
"dedup/image4", | ||
"prefix/dedup/image5", | ||
]: | ||
assert bytes(reader.get_item(img_path).content) == png_data | ||
|
||
|
||
def test_missing_content(tmp_path: pathlib.Path): | ||
with Creator(tmp_path / "test.zin", "foo").config_dev_metadata() as creator: | ||
deduplicator = Deduplicator(creator) | ||
deduplicator.filters.append(re.compile(".*")) | ||
with pytest.raises(Exception, match="Either content or fpath are mandatory"): | ||
deduplicator.add_item_for("welcome", None) |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.