13
13
from licensedcode .tokenize import query_lines
14
14
from samecode .halohash import BitAverageHaloHash
15
15
16
+ from matchcode_toolkit .stemming import get_stem_code
17
+
16
18
# A collection of directory fingerprints that we want to avoid
17
19
IGNORED_DIRECTORY_FINGERPRINTS = [
18
20
# This is both the directory content and directory structure fingerprint for
@@ -224,28 +226,81 @@ def get_file_fingerprint_hashes(
224
226
content = f .read ()
225
227
226
228
return create_file_fingerprints (
227
- content ,
229
+ content = content ,
230
+ ngram_length = ngram_length ,
231
+ window_length = window_length ,
232
+ include_ngrams = include_ngrams ,
233
+ )
234
+
235
+
236
def get_stemmed_file_fingerprint_hashes(
    location,
    ngram_length=5,
    window_length=16,
    include_ngrams=False,
    **kwargs,
):
    """
    Return a mapping of stemmed code fingerprint hashes for the file at `location`

    The stemmed code of the file is obtained with `get_stem_code` and passed to
    `create_file_fingerprints` as `stemmed_content`, together with
    `ngram_length`, `window_length` and `include_ngrams`.

    For non-empty stemmed content the returned mapping uses the keys
    `stemmed_halo1` and `stemmed_snippets` (not `halo1` and `snippets`):

    - `stemmed_halo1` is the file fingerprint computed from the stemmed content.
    - `stemmed_snippets` is the list of snippet fingerprints.

    NOTE(review): `create_file_fingerprints` picks the key names from the
    truthiness of `stemmed_content`, so empty stemmed content appears to yield
    the unstemmed `halo1`/`snippets` keys instead. Confirm this is intended.

    Return an empty mapping if `location` is not a file or is not a text file.

    Extra keyword arguments in `kwargs` are accepted and ignored.
    """
    from commoncode import filetype
    from typecode.contenttype import get_type

    # Check that `location` is a regular file first so that content type
    # detection is never attempted on something that is not a file.
    if not filetype.is_file(location):
        return {}

    # Do not process `location` if it's not a text file
    if not get_type(location).is_text:
        return {}

    stemmed_content = get_stem_code(location=location)

    return create_file_fingerprints(
        stemmed_content=stemmed_content,
        ngram_length=ngram_length,
        window_length=window_length,
        include_ngrams=include_ngrams,
    )
232
270
233
271
234
272
def create_file_fingerprints (
235
- content , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
273
+ content = None ,
274
+ stemmed_content = None ,
275
+ ngram_length = 5 ,
276
+ window_length = SNIPPET_WINDOW_LENGTH ,
277
+ include_ngrams = False ,
236
278
):
237
279
"""
238
- Return a mapping of halo1 and snippet hashes from content string
280
+ Return a mapping of halo1 and snippet hashes from ` content` or `stemmed_content`, not both.
239
281
"""
240
282
from licensedcode .tokenize import ngrams
241
283
from licensedcode .tokenize import select_ngrams
242
284
285
+ if content and stemmed_content :
286
+ raise Exception (
287
+ "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
288
+ )
289
+
290
+ if stemmed_content :
291
+ halo1_key = "stemmed_halo1"
292
+ snippets_key = "stemmed_snippets"
293
+ else :
294
+ halo1_key = "halo1"
295
+ snippets_key = "snippets"
296
+
243
297
fingerprints = {
244
- "halo1" : "" ,
245
- "snippets" : [],
298
+ halo1_key : "" ,
299
+ snippets_key : [],
246
300
}
247
301
248
302
# tokenize content into words
303
+ content = content or stemmed_content
249
304
words = list (tokenizer (content ))
250
305
251
306
# Create a file fingerprint from the number of elements in the content hash
@@ -259,7 +314,7 @@ def create_file_fingerprints(
259
314
content_fingerprint = content_hash .hexdigest ().decode ("utf-8" )
260
315
ngs_count_hex_str = "%08x" % ngs_count
261
316
file_fingerprint = ngs_count_hex_str + content_fingerprint
262
- fingerprints ["halo1" ] = file_fingerprint
317
+ fingerprints [halo1_key ] = file_fingerprint
263
318
264
319
# Select windows from the content to compute snippet fingerprints
265
320
windows = ngrams (words , window_length )
@@ -279,7 +334,7 @@ def create_file_fingerprints(
279
334
s ["ngrams" ] = list (window )
280
335
snippets .append (s )
281
336
if snippets :
282
- fingerprints ["snippets" ] = snippets
337
+ fingerprints [snippets_key ] = snippets
283
338
284
339
return fingerprints
285
340
0 commit comments