@@ -226,14 +226,14 @@ def get_file_fingerprint_hashes(
226
226
content = f .read ()
227
227
228
228
return create_file_fingerprints (
229
- content ,
229
+ content = content ,
230
230
ngram_length = ngram_length ,
231
231
window_length = window_length ,
232
232
include_ngrams = include_ngrams ,
233
233
)
234
234
235
235
236
- def get_stem_file_fingerprint_hashes (
236
+ def get_stemmed_file_fingerprint_hashes (
237
237
location ,
238
238
ngram_length = 5 ,
239
239
window_length = 16 ,
@@ -262,28 +262,39 @@ def get_stem_file_fingerprint_hashes(
262
262
stemmed_content = get_stem_code (location = location )
263
263
264
264
return create_file_fingerprints (
265
- stemmed_content ,
265
+ stemmed_content = stemmed_content ,
266
266
ngram_length = ngram_length ,
267
267
window_length = window_length ,
268
268
include_ngrams = include_ngrams ,
269
269
)
270
270
271
271
272
272
def create_file_fingerprints (
273
- content , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
273
+ content = None , stemmed_content = None , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
274
274
):
275
275
"""
276
- Return a mapping of halo1 and snippet hashes from content string
276
+ Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
277
277
"""
278
278
from licensedcode .tokenize import ngrams
279
279
from licensedcode .tokenize import select_ngrams
280
280
281
+ if content and stemmed_content :
282
+ raise Exception ("create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both." )
283
+
284
+ if stemmed_content :
285
+ halo1_key = "stemmed_halo1"
286
+ snippets_key = "stemmed_snippets"
287
+ else :
288
+ halo1_key = "halo1"
289
+ snippets_key = "snippets"
290
+
281
291
fingerprints = {
282
- "halo1" : "" ,
283
- "snippets" : [],
292
+ halo1_key : "" ,
293
+ snippets_key : [],
284
294
}
285
295
286
296
# tokenize content into words
297
+ content = content or stemmed_content
287
298
words = list (tokenizer (content ))
288
299
289
300
# Create a file fingerprint from the number of elements in the content hash
@@ -297,7 +308,7 @@ def create_file_fingerprints(
297
308
content_fingerprint = content_hash .hexdigest ().decode ("utf-8" )
298
309
ngs_count_hex_str = "%08x" % ngs_count
299
310
file_fingerprint = ngs_count_hex_str + content_fingerprint
300
- fingerprints ["halo1" ] = file_fingerprint
311
+ fingerprints [halo1_key ] = file_fingerprint
301
312
302
313
# Select windows from the content to compute snippet fingerprints
303
314
windows = ngrams (words , window_length )
@@ -317,7 +328,7 @@ def create_file_fingerprints(
317
328
s ["ngrams" ] = list (window )
318
329
snippets .append (s )
319
330
if snippets :
320
- fingerprints ["snippets" ] = snippets
331
+ fingerprints [snippets_key ] = snippets
321
332
322
333
return fingerprints
323
334
0 commit comments