Skip to content

Commit 2f250da

Browse files
committed
Update key field values for stemmed fingerprints
Signed-off-by: Jono Yang <[email protected]>
1 parent fc37d16 commit 2f250da

File tree

2 files changed, with 21 additions and 13 deletions.

src/matchcode_toolkit/fingerprinting.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -226,14 +226,14 @@ def get_file_fingerprint_hashes(
226226
content = f.read()
227227

228228
return create_file_fingerprints(
229-
content,
229+
content=content,
230230
ngram_length=ngram_length,
231231
window_length=window_length,
232232
include_ngrams=include_ngrams,
233233
)
234234

235235

236-
def get_stem_file_fingerprint_hashes(
236+
def get_stemmed_file_fingerprint_hashes(
237237
location,
238238
ngram_length=5,
239239
window_length=16,
@@ -262,28 +262,39 @@ def get_stem_file_fingerprint_hashes(
262262
stemmed_content = get_stem_code(location=location)
263263

264264
return create_file_fingerprints(
265-
stemmed_content,
265+
stemmed_content=stemmed_content,
266266
ngram_length=ngram_length,
267267
window_length=window_length,
268268
include_ngrams=include_ngrams,
269269
)
270270

271271

272272
def create_file_fingerprints(
273-
content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
273+
content=None, stemmed_content=None, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
274274
):
275275
"""
276-
Return a mapping of halo1 and snippet hashes from content string
276+
Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
277277
"""
278278
from licensedcode.tokenize import ngrams
279279
from licensedcode.tokenize import select_ngrams
280280

281+
if content and stemmed_content:
282+
raise Exception("create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both.")
283+
284+
if stemmed_content:
285+
halo1_key = "stemmed_halo1"
286+
snippets_key = "stemmed_snippets"
287+
else:
288+
halo1_key = "halo1"
289+
snippets_key = "snippets"
290+
281291
fingerprints = {
282-
"halo1": "",
283-
"snippets": [],
292+
halo1_key: "",
293+
snippets_key: [],
284294
}
285295

286296
# tokenize content into words
297+
content = content or stemmed_content
287298
words = list(tokenizer(content))
288299

289300
# Create a file fingerprint from the number of elements in the content hash
@@ -297,7 +308,7 @@ def create_file_fingerprints(
297308
content_fingerprint = content_hash.hexdigest().decode("utf-8")
298309
ngs_count_hex_str = "%08x" % ngs_count
299310
file_fingerprint = ngs_count_hex_str + content_fingerprint
300-
fingerprints["halo1"] = file_fingerprint
311+
fingerprints[halo1_key] = file_fingerprint
301312

302313
# Select windows from the content to compute snippet fingerprints
303314
windows = ngrams(words, window_length)
@@ -317,7 +328,7 @@ def create_file_fingerprints(
317328
s["ngrams"] = list(window)
318329
snippets.append(s)
319330
if snippets:
320-
fingerprints["snippets"] = snippets
331+
fingerprints[snippets_key] = snippets
321332

322333
return fingerprints
323334

src/matchcode_toolkit/pipelines/fingerprint_codebase.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,4 @@ def fingerprint_codebase(self):
4545
"""
4646
matchcode.fingerprint_codebase_directories(self.project)
4747
matchcode.fingerprint_codebase_resources(self.project)
48-
49-
def fingerprint_stem_codebase_resources(self):
50-
"""Compute stem code fingerprint for resources"""
51-
matchcode.fingerprint_stem_codebase_resources(self.project)
48+
matchcode.fingerprint_stemmed_codebase_resources(self.project)

0 commit comments

Comments
 (0)