Skip to content

Commit 5abe3f4

Browse files
authored
Merge pull request #21 from aboutcode-org/create-stemcode-fingerprint
Add pipeline step to compute stem fingerprint
2 parents adc5daf + ae010c1 commit 5abe3f4

File tree

2 files changed

+63
-7
lines changed

2 files changed

+63
-7
lines changed

src/matchcode_toolkit/fingerprinting.py

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from licensedcode.tokenize import query_lines
1414
from samecode.halohash import BitAverageHaloHash
1515

16+
from matchcode_toolkit.stemming import get_stem_code
17+
1618
# A collection of directory fingerprints that we want to avoid
1719
IGNORED_DIRECTORY_FINGERPRINTS = [
1820
# This is both the directory content and directory structure fingerprint for
@@ -224,28 +226,81 @@ def get_file_fingerprint_hashes(
224226
content = f.read()
225227

226228
return create_file_fingerprints(
227-
content,
229+
content=content,
230+
ngram_length=ngram_length,
231+
window_length=window_length,
232+
include_ngrams=include_ngrams,
233+
)
234+
235+
236+
def get_stemmed_file_fingerprint_hashes(
    location,
    ngram_length=5,
    window_length=16,
    include_ngrams=False,
    **kwargs,
):
    """
    Return a mapping of fingerprint hashes computed from the stemmed code of
    the file at `location`.

    The code at `location` is first stemmed with `get_stem_code`. The stemmed
    content is then handed to `create_file_fingerprints` as `stemmed_content`,
    along with `ngram_length`, `window_length` and `include_ngrams`, and the
    mapping it returns is returned unchanged. When the stemmed content is
    non-empty, that mapping is keyed with `stemmed_halo1` and
    `stemmed_snippets`.

    Return an empty mapping if `location` is not a file or is not a text file.

    Any extra keyword arguments are accepted and ignored.
    """
    from commoncode import filetype
    from typecode.contenttype import get_type

    # The content type is detected up front, before the regular-file check.
    detected_type = get_type(location)
    if not filetype.is_file(location) or not detected_type.is_text:
        # Only text files are fingerprinted.
        return {}

    fingerprint_options = dict(
        ngram_length=ngram_length,
        window_length=window_length,
        include_ngrams=include_ngrams,
    )
    return create_file_fingerprints(
        stemmed_content=get_stem_code(location=location),
        **fingerprint_options,
    )
232270

233271

234272
def create_file_fingerprints(
235-
content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
273+
content=None,
274+
stemmed_content=None,
275+
ngram_length=5,
276+
window_length=SNIPPET_WINDOW_LENGTH,
277+
include_ngrams=False,
236278
):
237279
"""
238-
Return a mapping of halo1 and snippet hashes from content string
280+
Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
239281
"""
240282
from licensedcode.tokenize import ngrams
241283
from licensedcode.tokenize import select_ngrams
242284

285+
if content and stemmed_content:
286+
raise Exception(
287+
"create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
288+
)
289+
290+
if stemmed_content:
291+
halo1_key = "stemmed_halo1"
292+
snippets_key = "stemmed_snippets"
293+
else:
294+
halo1_key = "halo1"
295+
snippets_key = "snippets"
296+
243297
fingerprints = {
244-
"halo1": "",
245-
"snippets": [],
298+
halo1_key: "",
299+
snippets_key: [],
246300
}
247301

248302
# tokenize content into words
303+
content = content or stemmed_content
249304
words = list(tokenizer(content))
250305

251306
# Create a file fingerprint from the number of elements in the content hash
@@ -259,7 +314,7 @@ def create_file_fingerprints(
259314
content_fingerprint = content_hash.hexdigest().decode("utf-8")
260315
ngs_count_hex_str = "%08x" % ngs_count
261316
file_fingerprint = ngs_count_hex_str + content_fingerprint
262-
fingerprints["halo1"] = file_fingerprint
317+
fingerprints[halo1_key] = file_fingerprint
263318

264319
# Select windows from the content to compute snippet fingerprints
265320
windows = ngrams(words, window_length)
@@ -279,7 +334,7 @@ def create_file_fingerprints(
279334
s["ngrams"] = list(window)
280335
snippets.append(s)
281336
if snippets:
282-
fingerprints["snippets"] = snippets
337+
fingerprints[snippets_key] = snippets
283338

284339
return fingerprints
285340

src/matchcode_toolkit/pipelines/fingerprint_codebase.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,4 @@ def fingerprint_codebase(self):
4242
"""
4343
matchcode.fingerprint_codebase_directories(self.project)
4444
matchcode.fingerprint_codebase_resources(self.project)
45+
matchcode.fingerprint_stemmed_codebase_resources(self.project)

0 commit comments

Comments
 (0)