1919 - can be used to store a filepath and content read from it (not stored) """
2020
2121import datetime
22+ import hashlib
2223import pathlib
24+ import re
2325import weakref
2426from typing import Any , Callable , Dict , Optional , Tuple , Union
2527
@@ -104,6 +106,10 @@ def __init__(
104106
105107 self .workaround_nocancel = workaround_nocancel
106108
109+ self .autodedup_filters = []
110+
111+ self .dedup_items = dict ()
112+
107113 def start (self ):
108114 super ().__enter__ ()
109115
@@ -119,6 +125,35 @@ def update_metadata(self, **kwargs):
119125 for name , value in kwargs .items ():
120126 self .add_metadata (name , value )
121127
def add_autodedup_filter(self, filter_regex: str):
    """Register a regular expression selecting paths eligible for deduplication.

    The expression is compiled once here and stored in
    ``self.autodedup_filters``; ``check_for_duplicate`` applies each stored
    pattern to entry paths with ``match`` (anchored at the start of the path).

    Args:
        filter_regex: regular expression source, as accepted by ``re.compile``.
    """
    compiled_filter = re.compile(filter_regex)
    self.autodedup_filters.append(compiled_filter)
130+
def check_for_duplicate(
    self,
    path: str,
    fpath: Optional[pathlib.Path] = None,
    content: Optional[bytes] = None,
) -> Optional[str]:
    """Return the path of a previously seen entry with identical payload, if any.

    Only paths matched by at least one pattern in ``self.autodedup_filters``
    (using ``match``, i.e. anchored at the start of ``path``) are considered.
    For those, the SHA-256 digest of the payload is looked up in
    ``self.dedup_items``. On a miss, the digest is recorded against ``path``
    so that later identical payloads resolve to it.

    Args:
        path: entry path, tested against the registered dedup filters.
        fpath: file whose bytes are hashed when ``content`` is not provided.
        content: in-memory payload; takes precedence over ``fpath``.

    Returns:
        The path first registered for this payload, or ``None`` when ``path``
        matches no filter or the payload has not been seen before.
    """
    # Paths outside every filter are never hashed nor recorded.
    if not any(pattern.match(path) for pattern in self.autodedup_filters):
        return None

    hasher = hashlib.sha256()
    if content is not None:
        # Compare against None rather than truthiness: empty content (b"")
        # is a valid payload and must not fall through to opening fpath.
        hasher.update(content)
    else:
        with open(fpath, "rb") as fh:
            # Read in 64 KiB chunks so large files are never fully loaded.
            for chunk in iter(lambda: fh.read(65536), b""):
                hasher.update(chunk)
    digest = hasher.digest()

    known_path = self.dedup_items.get(digest)
    if known_path is not None:
        return known_path

    # First occurrence of this payload: remember which path owns it.
    self.dedup_items[digest] = path
    return None
156+
122157 def add_item_for (
123158 self ,
124159 path : str ,
@@ -151,6 +186,18 @@ def add_item_for(
151186 if fpath is None and content is None :
152187 raise ValueError ("One of fpath or content is required" )
153188
189+ duplicate_path = self .check_for_duplicate (
190+ path = path , fpath = fpath , content = content
191+ )
192+ if duplicate_path :
193+ self .add_redirect (
194+ path = path ,
195+ target_path = duplicate_path ,
196+ title = title ,
197+ is_front = is_front ,
198+ )
199+ return path
200+
154201 mimetype = mimetype_for (
155202 path = path , content = content , fpath = fpath , mimetype = mimetype
156203 )
0 commit comments